// Copyright Jernej Krempuš 2012
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

module pfft.avx_double;

import core.simd;

import pfft.fft_impl;

import pfft.ldc_compat;
import pfft.dmd32_compat;

// TODO: this was never tested

@target("avx2"):
//version = SSE_AVX; // only version supported by this fork

version(DigitalMars)
{
}
else version(LDC)
{
}
else version(GNU)
{
    import gcc.builtins;

    // Builds an immediate operand for the x86 shuffle intrinsics from four
    // 2-bit field selectors.
    template shuf_mask(int a3, int a2, int a1, int a0)
    {
        enum shuf_mask = a0 | (a1<<2) | (a2<<4) | (a3<<6);
    }

    // Concatenates the low 128-bit lanes of a and b: [a.lo, b.lo].
    double4 interleave128_lo_d(double4 a, double4 b)
    {
        return __builtin_ia32_vperm2f128_pd256(a, b, shuf_mask!(0,2,0,0));
    }

    // Concatenates the high 128-bit lanes of a and b: [a.hi, b.hi].
    double4 interleave128_hi_d(double4 a, double4 b)
    {
        return __builtin_ia32_vperm2f128_pd256(a, b, shuf_mask!(0,3,0,1));
    }

    alias __builtin_ia32_unpcklpd256 unpcklpd;
    alias __builtin_ia32_unpckhpd256 unpckhpd;
    alias __builtin_ia32_loadupd256 loadupd;
    alias __builtin_ia32_storeupd256 storeupd;
}
else
{
    // The original wrote static assert("Unsupported compiler"), which always
    // passes because a non-empty string literal converts to true.
    static assert(0, "Unsupported compiler");
}

struct Vector
{
nothrow:
@nogc:
    alias double4 vec;
    alias double T;

    enum vec_size = 4;
    enum log2_bitreverse_chunk_size = 2;

    static auto v(T* p){ return cast(vec*) p; }

    // Splits n consecutive complex numbers at arr into a vector of real
    // parts (rr) and a vector of imaginary parts (ri). For n == 2 each
    // part is duplicated to fill the four-element vector.
    static void complex_array_to_real_imag_vec(int n)(T* arr, ref vec rr, ref vec ri)
    {
        static if(n == 4)
            deinterleave(v(arr)[0], v(arr)[1], rr, ri);
        else static if(n == 2)
        {
            vec a = *v(arr);
            rr = unpcklpd(a, a);
            ri = unpckhpd(a, a);
        }
        else
            static assert(0);
    }

    // Transposes 2x2 blocks whose elements are either single doubles within
    // each 128-bit lane (elements_per_vector == 4) or whole 128-bit pairs of
    // doubles (elements_per_vector == 2).
    static void transpose(int elements_per_vector)(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector == 4)
        {
            r0 = unpcklpd(a0, a1);
            r1 = unpckhpd(a0, a1);
        }
        else static if(elements_per_vector == 2)
        {
            r0 = interleave128_lo_d(a0, a1);
            r1 = interleave128_hi_d(a0, a1);
        }
        else
            static assert(0);
    }

    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        vec b0, b1;
        b0 = unpcklpd(a0, a1);
        b1 = unpckhpd(a0, a1);
        transpose!2(b0, b1, r0, r1);
    }

    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        vec b0, b1;

        transpose!2(a0, a1, b0, b1);
        r0 = unpcklpd(b0, b1);
        r1 = unpckhpd(b0, b1);
    }

    // Swaps the 4x4 blocks of doubles at p0 and p1 (rows m doubles apart),
    // applying a bit-reversed transpose out[i][j] = in[rev(j)][rev(i)],
    // rev = (0, 2, 1, 3), to each block on the way.
    static void bit_reverse_swap(double* p0, double* p1, size_t m)
    {
        vec a0, a1, a2, a3, b0, b1, b2, b3;

        a0 = *v(p0 + 0 * m);
        a1 = *v(p0 + 1 * m);
        a2 = *v(p0 + 2 * m);
        a3 = *v(p0 + 3 * m);

        b0 = unpcklpd(a0, a2);
        b2 = unpckhpd(a0, a2);
        b1 = unpcklpd(a1, a3);
        b3 = unpckhpd(a1, a3);

        a0 = interleave128_lo_d(b0, b1);
        a1 = interleave128_hi_d(b0, b1);
        a2 = interleave128_lo_d(b2, b3);
        a3 = interleave128_hi_d(b2, b3);

        b0 = *v(p1 + 0 * m);
        b1 = *v(p1 + 1 * m);
        b2 = *v(p1 + 2 * m);
        b3 = *v(p1 + 3 * m);

        *v(p1 + 0 * m) = a0;
        *v(p1 + 1 * m) = a1;
        *v(p1 + 2 * m) = a2;
        *v(p1 + 3 * m) = a3;

        a0 = unpcklpd(b0, b2);
        a2 = unpckhpd(b0, b2);
        a1 = unpcklpd(b1, b3);
        a3 = unpckhpd(b1, b3);

        b0 = interleave128_lo_d(a0, a1);
        b1 = interleave128_hi_d(a0, a1);
        b2 = interleave128_lo_d(a2, a3);
        b3 = interleave128_hi_d(a2, a3);

        *v(p0 + 0 * m) = b0;
        *v(p0 + 1 * m) = b1;
        *v(p0 + 2 * m) = b2;
        *v(p0 + 3 * m) = b3;
    }
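    // Illustration only, not part of the original pfft source: a scalar
    // reference for the data movement performed by bit_reverse below and by
    // each half of bit_reverse_swap above. The four vectors at p, p + m,
    // p + 2*m and p + 3*m form a 4x4 block of doubles, and the shuffles
    // implement out[i][j] = in[rev(j)][rev(i)], where rev is the 2-bit
    // bit-reversal (0, 2, 1, 3). Assumes m >= 4 so the rows do not overlap.
    static void bit_reverse_scalar_ref(T* p, size_t m)
    {
        static immutable size_t[4] rev = [0, 2, 1, 3];
        foreach (i; 0 .. 4)
            foreach (j; 0 .. 4)
            {
                size_t x = i * m + j;
                size_t y = rev[j] * m + rev[i];
                if (x < y) // the permutation is an involution: swap each pair once
                {
                    T t = p[x];
                    p[x] = p[y];
                    p[y] = t;
                }
            }
    }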
    static void bit_reverse(double* p, size_t m)
    {
        vec a0, a1, a2, a3, b0, b1, b2, b3;

        a0 = *v(p + 0 * m);
        a1 = *v(p + 1 * m);
        a2 = *v(p + 2 * m);
        a3 = *v(p + 3 * m);

        b0 = unpcklpd(a0, a2);
        b2 = unpckhpd(a0, a2);
        b1 = unpcklpd(a1, a3);
        b3 = unpckhpd(a1, a3);

        *v(p + 0 * m) = interleave128_lo_d(b0, b1);
        *v(p + 1 * m) = interleave128_hi_d(b0, b1);
        *v(p + 2 * m) = interleave128_lo_d(b2, b3);
        *v(p + 3 * m) = interleave128_hi_d(b2, b3);
    }

    static vec scalar_to_vector(T a)
    {
        return a;
    }

    static vec unaligned_load(T* p)
    {
        return loadupd(p);
    }

    static void unaligned_store(T* p, vec v)
    {
        storeupd(p, v);
    }
/*
    static vec reverse(vec v)
    {
        v = __builtin_ia32_shufpd256(v, v, 0x5);
        v = __builtin_ia32_vperm2f128_pd256(v, v, shuf_mask!(0,0,0,1));
        return v;
    }
*/

    version(LDC)
    {
        static vec unpcklpd(vec a, vec b)
        {
            return shufflevector!(double4, 0, 4, 2, 6)(a, b);
        }

        static vec unpckhpd(vec a, vec b)
        {
            return shufflevector!(double4, 1, 5, 3, 7)(a, b);
        }

        static vec interleave128_lo_d(vec a, vec b)
        {
            return shufflevector!(double4, 0, 1, 4, 5)(a, b);
        }

        static vec interleave128_hi_d(vec a, vec b)
        {
            return shufflevector!(double4, 2, 3, 6, 7)(a, b);
        }

        static vec loadupd(T* p)
        {
            return loadUnaligned!vec(cast(double*)p);
        }

        static void storeupd(T* p, vec v)
        {
            storeUnaligned!vec(v, cast(double*)p);
        }
    }
}

struct Options
{
    enum log2_bitreverse_large_chunk_size = 5;
    enum large_limit = 14;
    enum log2_optimal_n = 8;
    enum passes_per_recursive_call = 4;
    enum log2_recursive_passes_chunk_size = 4;
    enum prefered_alignment = 4 * (1 << 10);
    enum { fast_init };
}

version(SSE_AVX)
{
    import pfft.fft_impl;
    enum implementation = 0;
    alias TypeTuple!(FFT!(Vector, Options)) FFTs;
    mixin Instantiate!();
}
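// Hedged sketch, not from the original source: a round-trip check for the
// interleave/deinterleave pair, prompted by the "never tested" TODO above.
// Guarded with version(none) because it needs an AVX-capable target and a
// unittest build; enable manually to run it.
version(none) unittest
{
    // [0, 1, 2, ...] viewed as interleaved complex data: deinterleave should
    // separate even (real) and odd (imaginary) elements, and interleave
    // should reassemble the original order.
    double[8] a = [0, 1, 2, 3, 4, 5, 6, 7];

    double4 a0 = Vector.unaligned_load(a.ptr);
    double4 a1 = Vector.unaligned_load(a.ptr + 4);

    double4 re, im;
    Vector.deinterleave(a0, a1, re, im);   // re = [0,2,4,6], im = [1,3,5,7]

    double4 i0, i1;
    Vector.interleave(re, im, i0, i1);

    double[8] b;
    Vector.unaligned_store(b.ptr, i0);
    Vector.unaligned_store(b.ptr + 4, i1);
    assert(b == a);
}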