1 //          Copyright Jernej Krempuš 2012
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 module pfft.avx_float;
7 
8 import core.simd;
9 
10 import pfft.fft_impl;
11 
12 import pfft.ldc_compat;
13 import pfft.dmd32_compat;
14 
15 @target("avx2"): // else no AVX instruction is generated
16 //version = SSE_AVX; // only backend supported in this fork
17 
// Compiler-specific SIMD primitives at module scope. Only the GDC branch
// defines anything here; the DigitalMars and LDC branches are empty in this
// file.
version(DigitalMars)
{
}
else version(LDC)
{
}
else version(GNU)
{
    import gcc.builtins;

    // Packs four 2-bit selectors into a single 8-bit immediate: a0 ends up in
    // the two lowest bits and a3 in the two highest.
    enum shuf_mask(int a3, int a2, int a1, int a0) =
        (a3 << 6) | (a2 << 4) | (a1 << 2) | a0;

    // Returns a with its lower 128-bit half replaced by b.
    float8 insert128_0(float8 a, float4 b)
    {
        enum lower_half = 0;
        return __builtin_ia32_vinsertf128_ps256(a, b, lower_half);
    }

    // Returns a with its upper 128-bit half replaced by b.
    float8 insert128_1(float8 a, float4 b)
    {
        enum upper_half = 1;
        return __builtin_ia32_vinsertf128_ps256(a, b, upper_half);
    }

    // Result = [lower half of a, lower half of b].
    float8 interleave128_lo(float8 a, float8 b)
    {
        // Immediate is 32 (binary 0010 0000).
        enum imm = shuf_mask!(0, 2, 0, 0);
        return __builtin_ia32_vperm2f128_ps256(a, b, imm);
    }

    // Result = [upper half of a, upper half of b].
    float8 interleave128_hi(float8 a, float8 b)
    {
        // Immediate is 49 (binary 0011 0001).
        enum imm = shuf_mask!(0, 3, 0, 1);
        return __builtin_ia32_vperm2f128_ps256(a, b, imm);
    }

    // Swaps the two 128-bit halves of v.
    float8 reverse128(float8 v)
    {
        // Immediate is 1 (binary 0000 0001).
        enum imm = shuf_mask!(0, 0, 0, 1);
        return __builtin_ia32_vperm2f128_ps256(v, v, imm);
    }

    // Short names for the 256-bit GCC builtins used by Vector.
    alias unpcklps = __builtin_ia32_unpcklps256;
    alias unpckhps = __builtin_ia32_unpckhps256;
    alias loadups  = __builtin_ia32_loadups256;
    alias storeups = __builtin_ia32_storeups256;

    // 256-bit shufps. The four template arguments are the selectors
    // (highest first) that shuf_mask packs into the instruction immediate.
    auto shufps(param...)(float8 a, float8 b)
    {
        enum imm = shuf_mask!param;
        return __builtin_ia32_shufps256(a, b, imm);
    }
}
70 
// Vector policy for single-precision floats processed eight at a time
// (float8). It bundles the element/vector types, a few compile-time
// constants, and the shuffle/transposition/bit-reversal primitives built on
// 256-bit shuffles. It is passed as a template argument to FFT at the end of
// this module.
//
// The low-level helpers used here (shufps, unpcklps, unpckhps,
// interleave128_lo/hi, reverse128, insert128_0/1, loadups, storeups) are
// module-level functions when compiling with GDC and static members declared
// in the version(LDC) section at the bottom of this struct when compiling
// with LDC. This file defines none of them for DigitalMars.
struct Vector 
{
nothrow:
@nogc:
    // SIMD vector type and its scalar element type.
    alias float8 vec;
    alias float T;
    
    // Number of T elements in one vec.
    enum vec_size = 8;
  
    // NOTE(review): presumably log2 of the number of rows handled by
    // bit_reverse / bit_reverse_swap (they process 8 = 2^3 vectors) -
    // confirm against pfft.fft_impl.
    enum log2_bitreverse_chunk_size = 3;
 
    // Pointer reinterpretation helpers: view p as a pointer to 4-float or
    // 8-float vectors. No data is copied.
    static auto v(T* p){ return cast(float4*) p; }
    static auto v8(T* p){ return cast(float8*) p; }
    
    // Regroups the 128-bit halves of two vectors:
    //   r0 = [lower half of a0, lower half of a1]
    //   r1 = [upper half of a0, upper half of a1]
    // a0 and a1 are taken by value, so it is safe for r0/r1 to refer to the
    // same variables the caller passed as a0/a1.
    static void _deinterleave2(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        r0 = interleave128_lo(a0, a1);
        r1 = interleave128_hi(a0, a1);
    }
   
    // the three functions below do not do exactly what the names imply, but that's
    // ok (fft works correctly when using them)
 
    // Loads interleaved data from arr and splits it into rr (taken from the
    // even positions of arr) and ri (taken from the odd positions). The
    // layout depends on n:
    //   n == 8: reads 16 floats; rr = arr[0,2,...,14], ri = arr[1,3,...,15].
    //   n == 4: reads 8 floats; every even/odd element appears twice in a
    //           row: rr = [x0,x0,x2,x2,x4,x4,x6,x6],
    //           ri = [x1,x1,x3,x3,x5,x5,x7,x7].
    //   n == 2: reads 4 floats; the lower half of rr is built from arr[0],
    //           the upper half from arr[2]; ri uses arr[1] and arr[3].
    // Any other n is rejected at compile time.
    // NOTE(review): the n == 8 and n == 4 paths dereference arr through a
    // float8 pointer, which presumably requires 32-byte alignment - confirm.
    static void complex_array_to_real_imag_vec(int n)(T* arr, ref vec rr, ref vec ri)
    {
        static if(n == 8)
        {
            deinterleave(v8(arr)[0], v8(arr)[1], rr, ri); 
        }
        else static if (n == 4)
        {
            vec a = *v8(arr);
            rr = shufps!(2, 2, 0, 0)(a, a);
            ri = shufps!(3, 3, 1, 1)(a, a);
        }
        else static if(n == 2)
        {
            // arr[k] is a scalar float passed where a float4 is expected.
            // NOTE(review): this relies on the implicit scalar-to-vector
            // conversion, presumably a broadcast to all four elements -
            // confirm. Both halves of rr and ri are overwritten, so their
            // incoming values do not affect the result.
            rr = insert128_0(rr, arr[0]);
            rr = insert128_1(rr, arr[2]);
            ri = insert128_0(ri,  arr[1]);
            ri = insert128_1(ri,  arr[3]);
        }
        else
            static assert(0);
    }
   
    // Interleaves the elements of a0 and a1:
    //   r0 = [a0[0],a1[0],a0[1],a1[1],a0[2],a1[2],a0[3],a1[3]]
    //   r1 = [a0[4],a1[4],a0[5],a1[5],a0[6],a1[6],a0[7],a1[7]]
    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        // unpcklps/unpckhps interleave within each 128-bit half; the
        // temporary keeps the original a0 alive for the second unpack.
        vec a0_tmp = unpcklps(a0, a1);
        a1 =         unpckhps(a0, a1);
        // Regroup the halves so that the element order is sequential.
        _deinterleave2(a0_tmp, a1, r0, r1);
    }
    
    // Inverse of interleave: treating a0 followed by a1 as 16 consecutive
    // elements, r0 receives the even-indexed ones and r1 the odd-indexed
    // ones, both in ascending order.
    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        // First regroup the 128-bit halves (writing back into the local
        // copies a0/a1), then pick even/odd elements inside each half.
        _deinterleave2(a0, a1, a0, a1); 
        r0 = shufps!(2,0,2,0)(a0, a1);
        r1 = shufps!(3,1,3,1)(a0, a1);
    }

    // Exchanges groups of elements between a0 and a1. The shufps-based
    // variants act on each 128-bit half separately (indices below are
    // relative to a half):
    //   elements_per_vector == 8:
    //       r0 = [a0[0],a1[0],a0[2],a1[2]], r1 = [a0[1],a1[1],a0[3],a1[3]]
    //   elements_per_vector == 4:
    //       r0 = [a0[0],a0[1],a1[0],a1[1]], r1 = [a0[2],a0[3],a1[2],a1[3]]
    //   elements_per_vector == 2 (whole halves):
    //       r0 = [lower half of a0, lower half of a1],
    //       r1 = [upper half of a0, upper half of a1]
    // Any other value is rejected at compile time.
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector == 8)
        {
            r0 = shufps!(2,0,2,0)(a0, a1);
            r1 = shufps!(3,1,3,1)(a0, a1);
            // Swap the two middle elements of each half to obtain the
            // a0/a1 alternating order described above.
            r0 = shufps!(3,1,2,0)(r0, r0);
            r1 = shufps!(3,1,2,0)(r1, r1);
        }
        else static if(elements_per_vector == 4)
        {
            r0 = shufps!(1,0,1,0)(a0, a1);
            r1 = shufps!(3,2,3,2)(a0, a1);
        }
        else static if(elements_per_vector == 2)
        {
            r0 = interleave128_lo(a0, a1);
            r1 = interleave128_hi(a0, a1);
        }
        else
            static assert(0);
    }

    // In-place permutation of four vectors, applied to each 128-bit half
    // independently (indices relative to a half, right-hand sides are the
    // input values):
    //   a0 = [a0[0],a2[0],a1[0],a3[0]]
    //   a1 = [a0[2],a2[2],a1[2],a3[2]]
    //   a2 = [a0[1],a2[1],a1[1],a3[1]]
    //   a3 = [a0[3],a2[3],a1[3],a3[3]]
    // i.e. the 4x4 block in each half is transposed with both its row and
    // column indices bit-reversed.
    private static void br16_two(ref vec a0, ref vec a1, ref vec a2, ref vec a3)
    {
        // Stage 1: gather element pairs {0,1} and {2,3} of each input.
        vec b0 = shufps!(1, 0, 1, 0)(a0, a2);
        vec b1 = shufps!(1, 0, 1, 0)(a1, a3);
        vec b2 = shufps!(3, 2, 3, 2)(a0, a2);
        vec b3 = shufps!(3, 2, 3, 2)(a1, a3);

        // Stage 2: pick the even / odd elements of the gathered pairs.
        a0 = shufps!(2, 0, 2, 0)(b0, b1);
        a1 = shufps!(2, 0, 2, 0)(b2, b3);
        a2 = shufps!(3, 1, 3, 1)(b0, b1);
        a3 = shufps!(3, 1, 3, 1)(b2, b3);
    }

    // In-place permutation of eight vectors (64 floats), built from two
    // br16_two calls on the even- and odd-numbered vectors followed by a
    // regrouping of 128-bit halves between neighbouring vectors.
    // NOTE(review): presumably the 8x8 bit-reversal step used by
    // bit_reverse / bit_reverse_swap - confirm.
    private static void br64(
        ref vec a0, ref vec a1, ref vec a2, ref vec a3,
        ref vec a4, ref vec a5, ref vec a6, ref vec a7)
    {
        // reverse the outer four bits 
        br16_two(a0, a2, a4, a6);
        br16_two(a1, a3, a5, a7);
        
        // reverse the inner two bits
        _deinterleave2(a0, a1, a0, a1); 
        _deinterleave2(a2, a3, a2, a3); 
        _deinterleave2(a4, a5, a4, a5); 
        _deinterleave2(a6, a7, a6, a7); 
    }
    
    // Builds a type tuple consisting of n copies of T followed by R.
    // Used below to declare eight vec variables at once.
    template RepeatType(T, int n, R...)
    {
        static if(n == 0)
            alias R RepeatType;
        else
            alias RepeatType!(T, n - 1, T, R) RepeatType;
    }
        
    // Reads eight vectors from p0 and eight from p1 (row i starts at offset
    // i * m floats), applies br64 to each set, and writes the permuted p0
    // data to p1 and the permuted p1 data to p0.
    // NOTE(review): rows are accessed through float8 pointers, which
    // presumably requires every p + i * m to be 32-byte aligned - confirm.
    static void bit_reverse_swap(T* p0, T* p1, size_t m)
    {
        // a and b are each a tuple of eight vec variables; the foreach loops
        // over them are unrolled at compile time.
        RepeatType!(vec, 8) a, b;    

        foreach(i, _; a)
            a[i] = *v8(p0 + i * m);

        br64(a);

        // p1 is loaded in full before anything is stored to it.
        foreach(i, _; a)
            b[i] = *v8(p1 + i * m);

        foreach(i, _; a)
            *v8(p1 + i * m) = a[i];

        br64(b);

        foreach(i, _; a)
            *v8(p0 + i * m) = b[i];
    }

    // In-place variant of bit_reverse_swap: loads eight vectors from p0
    // (row i at offset i * m floats), applies br64 and stores them back to
    // the same locations.
    static void bit_reverse(T* p0, size_t m)
    {
        RepeatType!(vec, 8) a;    

        foreach(i, _; a)
            a[i] = *v8(p0 + i * m);

        br64(a);

        foreach(i, _; a)
            *v8(p0 + i * m) = a[i];
    }

    // Converts the scalar a to a vec through the implicit scalar-to-vector
    // conversion. NOTE(review): presumably every element equals a - confirm.
    static vec scalar_to_vector(T a)
    {
        return a;
    }

    // Loads eight floats from p via loadups (the unaligned load primitive).
    static vec unaligned_load(T* p)
    {
        return loadups(p);
    }

    // Stores the eight floats of v to p via storeups (the unaligned store
    // primitive).
    static void unaligned_store(T* p, vec v)
    {
        storeups(p, v);
    }

    // Returns v with the order of all eight elements reversed.
    static vec reverse(vec v)
    {
        // Reverse the four elements inside each 128-bit half, then swap the
        // two halves.
        v = shufps!(0, 1, 2, 3)(v, v);
        return reverse128(v);
    }

    // LDC implementations of the shuffle/load/store primitives, expressed
    // with shufflevector / loadUnaligned / storeUnaligned instead of GCC
    // builtins. In the shufflevector index lists, 0..7 select elements of
    // the first operand and 8..15 elements of the second.
    version(LDC)
    {
        // 256-bit shufps: in each 128-bit half, result elements 0 and 1 are
        // a[m0] and a[m1], elements 2 and 3 are b[m2] and b[m3] (indices
        // relative to the half). Template arguments are given highest first.
        static shufps(int m3, int m2, int m1, int m0)(vec a, vec b)
        {
            return shufflevector!(float8, m0, m1, m2+8, m3+8, 
                                          m0+4, m1+4, m2+12, m3+12)(a, b);
        }

        // Interleaves the two low elements of each half of a and b:
        // [a0,b0,a1,b1 | a4,b4,a5,b5].
        static vec unpcklps(vec a, vec b)
        { 
            return shufflevector!(float8, 0, 8, 1, 9, 
                                          4, 12, 5, 13)(a, b);
        }

        // Interleaves the two high elements of each half of a and b:
        // [a2,b2,a3,b3 | a6,b6,a7,b7].
        static vec unpckhps(vec a, vec b)
        { 
            return shufflevector!(float8, 2, 10, 3, 11, 
                                          6, 14, 7, 15)(a, b);
        }

        // Result = [lower half of a, lower half of b].
        static vec interleave128_lo(vec a, vec b)
        {
            return shufflevector!(float8, 0, 1, 2, 3, 8, 9, 10, 11)(a, b);
        }

        // Result = [upper half of a, upper half of b].
        static vec interleave128_hi(vec a, vec b)
        {
            return shufflevector!(float8, 4, 5, 6, 7, 12, 13, 14, 15)(a, b);
        }

        // Swaps the two 128-bit halves of a.
        static vec reverse128(vec a)
        {
            return shufflevector!(float8, 4, 5, 6, 7, 0, 1, 2, 3)(a, a);
        }

        // Unaligned load of eight floats starting at p.
        static vec loadups(T* p)
        {
            return loadUnaligned!vec(cast(float*)p);
        }

        // Unaligned store of v to the eight floats starting at p.
        static void storeups(T* p, vec v)
        {
            storeUnaligned!vec(v, cast(float*)p);
        }

        // Returns a with its lower four elements replaced by b. Implemented
        // by copying a into a temporary array, overwriting elements 0..4
        // and reloading the array with loadups.
        static float8 insert128_0(float8 a, float4 b)
        {
            float[8] res = a.array;
            float[4] toInsert = b.array;
            res[0..4] = toInsert[0..4];
            return loadups(res.ptr);
        }

        // Returns a with its upper four elements replaced by b; same
        // copy / overwrite / reload approach as insert128_0.
        static float8 insert128_1(float8 a, float4 b)
        {
            float[8] res = a.array;
            float[4] toInsert = b.array;
            res[4..8] = toInsert[0..4];
            return loadups(res.ptr);
        }
    }
}
308 
// Compile-time tuning constants for this backend, passed to FFT together
// with Vector. All members are int manifest constants.
// NOTE(review): the meaning of each constant is defined by pfft.fft_impl,
// which is not visible from this file - confirm there before changing values.
struct Options
{
    enum : int
    {
        log2_bitreverse_large_chunk_size = 5,
        large_limit                      = 14,
        log2_optimal_n                   = 8,
        passes_per_recursive_call        = 4,
        log2_recursive_passes_chunk_size = 4,
        prefered_alignment               = 4 * 1024, // 4096
    }
}
318 
// Instantiation section: compiled only when the SSE_AVX version identifier
// is set (its "version = SSE_AVX;" line near the top of this file is
// commented out).
version(SSE_AVX)
{
    import pfft.fft_impl;

    // NOTE(review): presumably an index identifying this backend to the
    // Instantiate mixin - confirm against pfft.fft_impl.
    enum int implementation = 0;

    // The FFT instantiation(s) built from this module's Vector and Options.
    alias FFTs = TypeTuple!(FFT!(Vector, Options));

    mixin Instantiate!();
}