//          Copyright Jernej Krempuš 2012
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

module pfft.avx_float;

import core.simd;

import pfft.fft_impl;

import pfft.ldc_compat;
import pfft.dmd32_compat;

@target("avx2"): // else no AVX instruction is generated
//version = SSE_AVX; // only backend supported in this fork

version(DigitalMars)
{

}
else version(LDC)
{
}
else version(GNU)
{
    import gcc.builtins;

    /// Packs four 2-bit selectors into one 8-bit immediate; `a0` lands in
    /// the lowest two bits and `a3` in the highest two.
    enum shuf_mask(int a3, int a2, int a1, int a0) =
        (a3 << 6) | (a2 << 4) | (a1 << 2) | a0;

    /// Forwards to `vinsertf128` with position immediate 0.
    float8 insert128_0(float8 a, float4 b)
    {
        return __builtin_ia32_vinsertf128_ps256(a, b, 0);
    }

    /// Forwards to `vinsertf128` with position immediate 1.
    float8 insert128_1(float8 a, float4 b)
    {
        return __builtin_ia32_vinsertf128_ps256(a, b, 1);
    }

    /// `vperm2f128` of `a` and `b` with immediate 0x20.
    float8 interleave128_lo(float8 a, float8 b)
    {
        enum imm = shuf_mask!(0, 2, 0, 0); // 32 0010 0000
        return __builtin_ia32_vperm2f128_ps256(a, b, imm);
    }

    /// `vperm2f128` of `a` and `b` with immediate 0x31.
    float8 interleave128_hi(float8 a, float8 b)
    {
        enum imm = shuf_mask!(0, 3, 0, 1);
        return __builtin_ia32_vperm2f128_ps256(a, b, imm);
    }

    /// `vperm2f128` of `v` with itself and immediate 0x01.
    float8 reverse128(float8 v)
    {
        enum imm = shuf_mask!(0, 0, 0, 1);
        return __builtin_ia32_vperm2f128_ps256(v, v, imm);
    }

    // Short names for the 256-bit builtins used by Vector below.
    alias unpcklps = __builtin_ia32_unpcklps256;
    alias unpckhps = __builtin_ia32_unpckhps256;
    alias loadups = __builtin_ia32_loadups256;
    alias storeups = __builtin_ia32_storeups256;

    /// `vshufps` on 256-bit operands; the template arguments are the four
    /// selectors handed to `shuf_mask`, highest first.
    auto shufps(sel...)(float8 a, float8 b)
    {
        return __builtin_ia32_shufps256(a, b, shuf_mask!sel);
    }

}

/// Vector policy handed to `FFT!(Vector, Options)`: eight floats per vector.
struct Vector
{
nothrow:
@nogc:
    alias vec = float8;
    alias T = float;

    enum vec_size = 8;

    enum log2_bitreverse_chunk_size = 3;

    /// Reinterprets a scalar pointer as a pointer to `float4`.
    static auto v(T* p)
    {
        return cast(float4*) p;
    }

    /// Reinterprets a scalar pointer as a pointer to `float8`.
    static auto v8(T* p)
    {
        return cast(float8*) p;
    }

    /// Writes `interleave128_lo(a0, a1)` to `r0` and
    /// `interleave128_hi(a0, a1)` to `r1`. The inputs are taken by value, so
    /// callers may pass the same variables as inputs and outputs.
    static void _deinterleave2(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = interleave128_lo(a0, a1);
        r1 = interleave128_hi(a0, a1);
    }

    // the three functions below do not do exactly what the names imply, but that's
    // ok (fft works correctly when using them)

    /// Fills `rr` and `ri` from the scalars at `arr`; `n` selects one of three
    /// layouts (8, 4 or 2), any other value is a compile-time error.
    static void complex_array_to_real_imag_vec(int n)(T* arr, ref vec rr, ref vec ri)
    {
        static if (n == 8)
        {
            // Two full vectors are read and split with deinterleave.
            deinterleave(v8(arr)[0], v8(arr)[1], rr, ri);
        }
        else static if (n == 4)
        {
            // One vector is read; each output duplicates selected elements.
            vec packed = *v8(arr);
            rr = shufps!(2, 2, 0, 0)(packed, packed);
            ri = shufps!(3, 3, 1, 1)(packed, packed);
        }
        else static if (n == 2)
        {
            // Scalars are passed where float4 is expected, relying on the
            // implicit scalar-to-vector conversion.
            rr = insert128_0(rr, arr[0]);
            rr = insert128_1(rr, arr[2]);
            ri = insert128_0(ri, arr[1]);
            ri = insert128_1(ri, arr[3]);
        }
        else
            static assert(0);
    }

    /// unpcklps/unpckhps of the inputs followed by the 128-bit half exchange
    /// of `_deinterleave2`.
    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        vec lo = unpcklps(a0, a1);
        vec hi = unpckhps(a0, a1);
        _deinterleave2(lo, hi, r0, r1);
    }

    /// 128-bit half exchange of the inputs followed by two shufps selections.
    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        vec lo = interleave128_lo(a0, a1);
        vec hi = interleave128_hi(a0, a1);
        r0 = shufps!(2, 0, 2, 0)(lo, hi);
        r1 = shufps!(3, 1, 3, 1)(lo, hi);
    }

    /// Shuffles `a0`/`a1` into `r0`/`r1`; `elements_per_vector` must be
    /// 8, 4 or 2, anything else is a compile-time error.
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if (elements_per_vector == 8)
        {
            vec s0 = shufps!(2, 0, 2, 0)(a0, a1);
            vec s1 = shufps!(3, 1, 3, 1)(a0, a1);
            r0 = shufps!(3, 1, 2, 0)(s0, s0);
            r1 = shufps!(3, 1, 2, 0)(s1, s1);
        }
        else static if (elements_per_vector == 4)
        {
            r0 = shufps!(1, 0, 1, 0)(a0, a1);
            r1 = shufps!(3, 2, 3, 2)(a0, a1);
        }
        else static if (elements_per_vector == 2)
        {
            // Same pair of 128-bit half selections as _deinterleave2.
            _deinterleave2(a0, a1, r0, r1);
        }
        else
            static assert(0);
    }

    /// Two rounds of shufps over four vectors, updated in place.
    private static void br16_two(ref vec a0, ref vec a1, ref vec a2, ref vec a3)
    {
        // First round pairs (a0, a2) and (a1, a3).
        vec lo02 = shufps!(1, 0, 1, 0)(a0, a2);
        vec lo13 = shufps!(1, 0, 1, 0)(a1, a3);
        vec hi02 = shufps!(3, 2, 3, 2)(a0, a2);
        vec hi13 = shufps!(3, 2, 3, 2)(a1, a3);

        // Second round combines the intermediates back into the arguments.
        a0 = shufps!(2, 0, 2, 0)(lo02, lo13);
        a1 = shufps!(2, 0, 2, 0)(hi02, hi13);
        a2 = shufps!(3, 1, 3, 1)(lo02, lo13);
        a3 = shufps!(3, 1, 3, 1)(hi02, hi13);
    }

    /// In-place permutation of eight vectors (64 floats) used by the
    /// bit-reversal routines below.
    private static void br64(
        ref vec a0, ref vec a1, ref vec a2, ref vec a3,
        ref vec a4, ref vec a5, ref vec a6, ref vec a7)
    {
        // reverse the outer four bits
        br16_two(a0, a2, a4, a6);
        br16_two(a1, a3, a5, a7);

        // reverse the inner two bits
        _deinterleave2(a0, a1, a0, a1);
        _deinterleave2(a2, a3, a2, a3);
        _deinterleave2(a4, a5, a4, a5);
        _deinterleave2(a6, a7, a6, a7);
    }

    /// Builds a type sequence of `n` copies of `T` followed by `R`.
    template RepeatType(T, int n, R...)
    {
        static if (n != 0)
            alias RepeatType = RepeatType!(T, n - 1, T, R);
        else
            alias RepeatType = R;
    }

    /// Loads eight vectors from `p0` and eight from `p1` (rows `m` scalars
    /// apart), permutes each group with br64 and stores each group at the
    /// other pointer.
    static void bit_reverse_swap(T* p0, T* p1, size_t m)
    {
        RepeatType!(vec, 8) lhs, rhs;

        foreach (i, unused; lhs)
            lhs[i] = *v8(p0 + i * m);

        br64(lhs);

        // The p1 rows are read before the permuted p0 rows overwrite them.
        foreach (i, unused; lhs)
            rhs[i] = *v8(p1 + i * m);

        foreach (i, unused; lhs)
            *v8(p1 + i * m) = lhs[i];

        br64(rhs);

        foreach (i, unused; lhs)
            *v8(p0 + i * m) = rhs[i];
    }

    /// Loads eight vectors from `p0` (rows `m` scalars apart), permutes them
    /// with br64 and stores them back to the same rows.
    static void bit_reverse(T* p0, size_t m)
    {
        RepeatType!(vec, 8) rows;

        foreach (i, unused; rows)
            rows[i] = *v8(p0 + i * m);

        br64(rows);

        foreach (i, unused; rows)
            *v8(p0 + i * m) = rows[i];
    }

    /// Returns `a` as a `vec` through the implicit scalar-to-vector conversion.
    static vec scalar_to_vector(T a)
    {
        return a;
    }

    /// Loads one vector from `p` through `loadups`.
    static vec unaligned_load(T* p)
    {
        return loadups(p);
    }

    /// Stores `v` at `p` through `storeups`.
    static void unaligned_store(T* p, vec v)
    {
        storeups(p, v);
    }

    /// Applies shufps!(0, 1, 2, 3) to `v` with itself, then reverse128.
    static vec reverse(vec v)
    {
        return reverse128(shufps!(0, 1, 2, 3)(v, v));
    }

    version(LDC)
    {
        /// shufps written as a shufflevector: per 128-bit half, two elements
        /// come from `a` (m0, m1) and two from `b` (m2, m3).
        static auto shufps(int m3, int m2, int m1, int m0)(vec a, vec b)
        {
            return shufflevector!(float8,
                m0,     m1,     m2 + 8,  m3 + 8,
                m0 + 4, m1 + 4, m2 + 12, m3 + 12)(a, b);
        }

        /// Alternates elements 0, 1 (and 4, 5) of `a` with those of `b`.
        static vec unpcklps(vec a, vec b)
        {
            return shufflevector!(float8,
                0, 8, 1, 9,
                4, 12, 5, 13)(a, b);
        }

        /// Alternates elements 2, 3 (and 6, 7) of `a` with those of `b`.
        static vec unpckhps(vec a, vec b)
        {
            return shufflevector!(float8,
                2, 10, 3, 11,
                6, 14, 7, 15)(a, b);
        }

        /// Elements 0..3 of `a` followed by elements 0..3 of `b`.
        static vec interleave128_lo(vec a, vec b)
        {
            return shufflevector!(float8, 0, 1, 2, 3, 8, 9, 10, 11)(a, b);
        }

        /// Elements 4..7 of `a` followed by elements 4..7 of `b`.
        static vec interleave128_hi(vec a, vec b)
        {
            return shufflevector!(float8, 4, 5, 6, 7, 12, 13, 14, 15)(a, b);
        }

        /// Swaps the two four-element halves of `a`.
        static vec reverse128(vec a)
        {
            return shufflevector!(float8, 4, 5, 6, 7, 0, 1, 2, 3)(a, a);
        }

        /// Loads a `vec` from `p` via `loadUnaligned`.
        static vec loadups(T* p)
        {
            return loadUnaligned!vec(cast(float*) p);
        }

        /// Stores `v` at `p` via `storeUnaligned`.
        static void storeups(T* p, vec v)
        {
            storeUnaligned!vec(v, cast(float*) p);
        }

        /// Returns `a` with elements 0..3 replaced by `b`, going through a
        /// temporary array.
        static float8 insert128_0(float8 a, float4 b)
        {
            float[8] lanes = a.array;
            float[4] patch = b.array;
            lanes[0 .. 4] = patch[0 .. 4];
            return loadups(lanes.ptr);
        }

        /// Returns `a` with elements 4..7 replaced by `b`, going through a
        /// temporary array.
        static float8 insert128_1(float8 a, float4 b)
        {
            float[8] lanes = a.array;
            float[4] patch = b.array;
            lanes[4 .. 8] = patch[0 .. 4];
            return loadups(lanes.ptr);
        }
    }
}

/// Compile-time constants passed alongside Vector to `FFT!(Vector, Options)`.
struct Options
{
    enum log2_bitreverse_large_chunk_size = 5;
    enum large_limit = 14;
    enum log2_optimal_n = 8;
    enum passes_per_recursive_call = 4;
    enum log2_recursive_passes_chunk_size = 4;
    enum prefered_alignment = 4 * (1 << 10);
}

version(SSE_AVX)
{
    import pfft.fft_impl;
    enum implementation = 0;
    alias FFTs = TypeTuple!(FFT!(Vector, Options));
    mixin Instantiate!();
}