// Copyright Jernej Krempuš 2012
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

module pfft.neon_float;

import pfft.fft_impl;
import gcc.builtins;
import core.simd;

version(GNU) { } else
{
    static assert(0, "This compiler is not supported.");
}

// Thin wrapper around a four-float NEON register that maps the basic
// arithmetic operators onto the corresponding GCC NEON builtins.
struct NeonVec
{
    float4 v;

    this(float4 _v){ v = _v; }

    NeonVec opBinary(string s)(NeonVec other) if(s == "+")
    {
        return NeonVec(__builtin_neon_vaddv4sf(v, other.v, 3));
    }
    NeonVec opBinary(string s)(NeonVec other) if(s == "-")
    {
        return NeonVec(__builtin_neon_vsubv4sf(v, other.v, 3));
    }
    NeonVec opBinary(string s)(NeonVec other) if(s == "*")
    {
        return NeonVec(__builtin_neon_vmulv4sf(v, other.v, 3));
    }

    /*NeonVec muladd(NeonVec a, NeonVec b)
    {
        return NeonVec(__builtin_neon_vmlav4sf(v, a.v, b.v, 3));
    }
    NeonVec mulsub(NeonVec a, NeonVec b)
    {
        return NeonVec(__builtin_neon_vmlsv4sf(v, a.v, b.v, 3));
    }*/
}

struct Vector
{
    alias NeonVec vec;
    alias float T;

    enum vec_size = 4;
    enum log2_bitreverse_chunk_size = 2;

    // Broadcasts the scalar a to all four lanes.
    static vec scalar_to_vector(T a)
    {
        return vec(a);
    }

    // Loads N complex numbers stored as interleaved (re, im) pairs at arr
    // and splits them into a vector of real parts and a vector of
    // imaginary parts.
    static void complex_array_to_real_imag_vec(int N)(T * arr, ref vec rr, ref vec ri)
    {
        static if(N == 4)
        {
            deinterleave((cast(vec*)arr)[0], (cast(vec*)arr)[1], rr, ri);
        }
        else static if(N == 2)
        {
            asm nothrow @nogc
            {
                "vldmia %2, {%e0-%f0} \n"
                "vmov %q1, %q0 \n"
                "vuzp.32 %q0, %q1 \n"
                "vuzp.32 %e0, %f0 \n"
                "vuzp.32 %e1, %f1 \n"
                : "=w" rr, "=w" ri
                : "r" arr ;
            }
        }
    }

    // Transposes a 2x2 matrix whose elements are chunks of
    // elements_per_vector floats taken from a0 and a1.
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector == 4)
        {
            float4[2] tmp;
            __builtin_neon_vtrnv4sf(&tmp[0], a0.v, a1.v);
            r0.v = tmp[0];
            r1.v = tmp[1];
        }
        else static if(elements_per_vector == 2)
        {
            asm nothrow @nogc
            {
                "vswp %f0, %e1 \n"
                : "+w" a0.v, "+w" a1.v ;
            }
            r0 = a0;
            r1 = a1;
        }
    }

    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        float4[2] tmp;
        __builtin_neon_vzipv4sf(&tmp[0], a0.v, a1.v);
        r0.v = tmp[0];
        r1.v = tmp[1];
    }

    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        float4[2] tmp;
        __builtin_neon_vuzpv4sf(&tmp[0], a0.v, a1.v);
        r0.v = tmp[0];
        r1.v = tmp[1];
    }

    private static float4 * v(float * a)
    {
        return cast(float4*)a;
    }

    // Transposes the 4x4 block of floats held in a0..a3, which is the same
    // as bit-reversing the indices of its 16 elements.
    private static void _bit_reverse(ref float4 a0, ref float4 a1,
        ref float4 a2, ref float4 a3)
    {
        asm nothrow @nogc
        {
            "vtrn.32 %q0, %q2 \n"
            "vtrn.32 %q1, %q3 \n"
            "vswp %f0, %e1 \n"
            "vswp %f2, %e3 \n"
            : "+w" a0, "+w" a1, "+w" a2, "+w" a3;
        }
    }

    // Bit-reverses the two 4x4 blocks starting at p0 and p1 (rows m floats
    // apart) and writes each reversed block to the other block's location.
    static void bit_reverse_swap(T * p0, T * p1, int m)
    {
        float4
            a0 = *v(p0 + 0 * m),
            a1 = *v(p0 + 1 * m),
            a2 = *v(p0 + 2 * m),
            a3 = *v(p0 + 3 * m);
        _bit_reverse(a0, a1, a2, a3);

        float4
            b0 = *v(p1 + 0 * m),
            b1 = *v(p1 + 1 * m),
            b2 = *v(p1 + 2 * m),
            b3 = *v(p1 + 3 * m);
        *v(p1 + 0 * m) = a0;
        *v(p1 + 1 * m) = a1;
        *v(p1 + 2 * m) = a2;
        *v(p1 + 3 * m) = a3;

        _bit_reverse(b0, b1, b2, b3);
        *v(p0 + 0 * m) = b0;
        *v(p0 + 1 * m) = b1;
        *v(p0 + 2 * m) = b2;
        *v(p0 + 3 * m) = b3;
    }

    // Bit-reverses a single 4x4 block in place.
    static void bit_reverse(T * p, int m)
    {
        _bit_reverse(*v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m));
    }
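
    // A minimal sanity-check sketch, assuming a 32-bit ARM target built
    // with GDC (the only configuration where the NEON builtins above
    // exist): interleave followed by deinterleave should round-trip.
    version(ARM) unittest
    {
        float4 x = [0.0f, 1.0f, 2.0f, 3.0f];
        float4 y = [4.0f, 5.0f, 6.0f, 7.0f];
        vec i0, i1, d0, d1;
        interleave(vec(x), vec(y), i0, i1);
        deinterleave(i0, i1, d0, d1);
        foreach(j; 0 .. 4)
        {
            // Zipping and then unzipping must give back the original lanes.
            assert(d0.v.array[j] == x.array[j]);
            assert(d1.v.array[j] == y.array[j]);
        }
    }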
}

// Tuning constants consumed by the generic FFT implementation in
// pfft.fft_impl.
struct Options
{
    enum log2_bitreverse_large_chunk_size = 5;
    enum large_limit = 14;
    enum log2_optimal_n = 9;
    enum passes_per_recursive_call = 5;
    enum log2_recursive_passes_chunk_size = 5;
}
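
// A minimal usage sketch of the NeonVec operators, assuming a GDC
// ARM/NEON build where the builtins are available; it only exercises
// what this module itself defines.
version(ARM) unittest
{
    float4 xs = [1.0f, 2.0f, 3.0f, 4.0f];
    float4 ys = [5.0f, 6.0f, 7.0f, 8.0f];
    auto a = NeonVec(xs);
    auto b = NeonVec(ys);

    auto sum  = a + b; // __builtin_neon_vaddv4sf
    auto diff = a - b; // __builtin_neon_vsubv4sf
    auto prod = a * b; // __builtin_neon_vmulv4sf

    assert(sum.v.array[0]  == 6.0f);
    assert(diff.v.array[1] == -4.0f);
    assert(prod.v.array[3] == 32.0f);
}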