1 //          Copyright Jernej Krempuš 2012
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 module pfft.avx_double;
7 
8 import core.simd;
9 
10 import pfft.fft_impl;
11 
12 import pfft.ldc_compat;
13 import pfft.dmd32_compat;
14 
// TODO: this was never tested
16 
17 @target("avx2"):
18 //version = SSE_AVX; // only version supported by this fork
19 
version(DigitalMars)
{
}
else version(LDC)
{
}
else version(GNU)
{
    import gcc.builtins;

    /// Builds a SHUFPD/VPERM2F128-style immediate from four 2-bit fields:
    /// a0 | (a1<<2) | (a2<<4) | (a3<<6).
    template shuf_mask(int a3, int a2, int a1, int a0)
    { 
        enum shuf_mask = a0 | (a1<<2) | (a2<<4) | (a3<<6); 
    }

    /// Concatenates the low 128-bit halves of `a` and `b`
    /// (mask 0x20: select lane 0 of a, lane 0 of b).
    double4 interleave128_lo_d(double4 a, double4 b)
    {
        return __builtin_ia32_vperm2f128_pd256(a, b, shuf_mask!(0,2,0,0));
    }

    /// Concatenates the high 128-bit halves of `a` and `b`
    /// (mask 0x31: select lane 1 of a, lane 1 of b).
    double4 interleave128_hi_d(double4 a, double4 b)
    {
        return __builtin_ia32_vperm2f128_pd256(a, b, shuf_mask!(0,3,0,1));
    }

    alias __builtin_ia32_unpcklpd256 unpcklpd;
    alias __builtin_ia32_unpckhpd256 unpckhpd;
    alias __builtin_ia32_loadupd256 loadupd;
    alias __builtin_ia32_storeupd256 storeupd;
}
else
{
    // BUG FIX: the original `static assert("Unsupported compiler");` always
    // passed, because a non-empty string literal converts to `true` when used
    // as the assert condition. Use a false condition plus a message so
    // compilation genuinely fails on unsupported compilers.
    static assert(0, "Unsupported compiler");
}
54 
/// SIMD backend for double-precision AVX: one vector holds four doubles
/// (a 256-bit register split into two 128-bit lanes). The FFT core in
/// pfft.fft_impl is instantiated with this struct as its `Vector` policy.
struct Vector 
{
nothrow:
@nogc:
    alias double4 vec;
    alias double T;
    
    // Number of scalars per SIMD vector.
    enum vec_size = 4;
    // log2 of the square chunk side used by the bit-reversal routines below.
    enum log2_bitreverse_chunk_size = 2;
    
    // Reinterpret a scalar pointer as a vector pointer.
    // NOTE(review): callers are assumed to pass suitably aligned pointers
    // for the aligned loads/stores below — confirm against fft_impl.
    static auto v(T* p){ return cast(vec*) p; }

    /// Loads `n` complex numbers stored interleaved (re, im, re, im, ...)
    /// at `arr` and separates them: `rr` receives real parts, `ri`
    /// imaginary parts. Only n == 4 and n == 2 are supported.
    static void complex_array_to_real_imag_vec(int n)(T* arr, ref vec rr, ref vec ri)
    {
        static if (n == 4)
            deinterleave(v(arr)[0], v(arr)[1], rr, ri);
        else static if(n == 2)
        {
            // One vector holds two complex values; unpck low/high duplicates
            // each real (resp. imaginary) part within its 128-bit lane.
            vec a = *v(arr);
            rr = unpcklpd(a, a);
            ri = unpckhpd(a, a);
        }
        else
            static assert(0);
    }
      
    /// Treats (a0, a1) as a 2x2 matrix of sub-blocks and transposes it.
    /// elements_per_vector == 4: element-granularity unpack within each
    /// 128-bit lane; == 2: exchanges whole 128-bit lanes between vectors.
    static void transpose(int elements_per_vector)(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector == 4)
        {
            r0 = unpcklpd(a0, a1);
            r1 = unpckhpd(a0, a1);
        }
        else static if(elements_per_vector == 2)
        {
            r0 = interleave128_lo_d(a0, a1);
            r1 = interleave128_hi_d(a0, a1);
        }
        else
            static assert(0);
    }

    /// Full element interleave:
    ///   r0 = (a0[0], a1[0], a0[1], a1[1]),
    ///   r1 = (a0[2], a1[2], a0[3], a1[3]).
    /// Achieved as per-lane unpack followed by a lane swap.
    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        vec b0, b1;
        b0 = unpcklpd(a0, a1);
        b1 = unpckhpd(a0, a1);
        transpose!2(b0, b1, r0, r1);
    }
    
    /// Inverse of interleave():
    ///   r0 = (a0[0], a0[2], a1[0], a1[2])  — even-indexed elements,
    ///   r1 = (a0[1], a0[3], a1[1], a1[3])  — odd-indexed elements.
    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        vec b0, b1;

        transpose!2(a0, a1, b0, b1);
        r0 = unpcklpd(b0, b1);
        r1 = unpckhpd(b0, b1);
    }


    /// Transposes the 4x4 double block at p0 (row stride `m` scalars) and
    /// the 4x4 block at p1, then writes each transposed block to the OTHER
    /// pointer — a swap step of the out-of-place bit-reversal pass.
    static void bit_reverse_swap(double * p0, double * p1, size_t m)
    {
        vec a0, a1, a2, a3, b0, b1, b2, b3;

        // Load the 4x4 block at p0.
        a0 = *v(p0 + 0 * m);
        a1 = *v(p0 + 1 * m);
        a2 = *v(p0 + 2 * m);
        a3 = *v(p0 + 3 * m);

        // Transpose it: per-lane unpack, then 128-bit lane interleave.
        b0 = unpcklpd(a0, a2);
        b2 = unpckhpd(a0, a2);
        b1 = unpcklpd(a1, a3);
        b3 = unpckhpd(a1, a3);

        a0 = interleave128_lo_d(b0, b1);
        a1 = interleave128_hi_d(b0, b1);
        a2 = interleave128_lo_d(b2, b3);
        a3 = interleave128_hi_d(b2, b3);

        // Load the 4x4 block at p1 before overwriting it below.
        b0 = *v(p1 + 0 * m);
        b1 = *v(p1 + 1 * m);
        b2 = *v(p1 + 2 * m);
        b3 = *v(p1 + 3 * m);

        // p1 receives the transpose of p0's block.
        *v(p1 + 0 * m) = a0;
        *v(p1 + 1 * m) = a1;
        *v(p1 + 2 * m) = a2;
        *v(p1 + 3 * m) = a3;

        // Transpose p1's old block the same way.
        a0 = unpcklpd(b0, b2);
        a2 = unpckhpd(b0, b2);
        a1 = unpcklpd(b1, b3);
        a3 = unpckhpd(b1, b3);

        b0 = interleave128_lo_d(a0, a1);
        b1 = interleave128_hi_d(a0, a1);
        b2 = interleave128_lo_d(a2, a3);
        b3 = interleave128_hi_d(a2, a3);

        // p0 receives the transpose of p1's block.
        *v(p0 + 0 * m) = b0;
        *v(p0 + 1 * m) = b1;
        *v(p0 + 2 * m) = b2;
        *v(p0 + 3 * m) = b3;
    }

    /// In-place transpose of a single 4x4 double block at `p`
    /// (row stride `m` scalars) — the diagonal case of bit reversal.
    static void bit_reverse(double * p, size_t m)
    {
        vec a0, a1, a2, a3, b0, b1, b2, b3;

        a0 = *v(p + 0 * m);
        a1 = *v(p + 1 * m);
        a2 = *v(p + 2 * m);
        a3 = *v(p + 3 * m);

        b0 = unpcklpd(a0, a2);
        b2 = unpckhpd(a0, a2);
        b1 = unpcklpd(a1, a3);
        b3 = unpckhpd(a1, a3);

        *v(p + 0 * m) = interleave128_lo_d(b0, b1);
        *v(p + 1 * m) = interleave128_hi_d(b0, b1);
        *v(p + 2 * m) = interleave128_lo_d(b2, b3);
        *v(p + 3 * m) = interleave128_hi_d(b2, b3);
    }

    /// Broadcasts a scalar to all four elements.
    /// Relies on D's implicit scalar-to-vector broadcast conversion.
    static vec scalar_to_vector(T a)
    {
        return a;
    }
    
    /// Load four doubles from a possibly unaligned address.
    static vec unaligned_load(T* p)
    {
        return loadupd(p);
    }

    /// Store four doubles to a possibly unaligned address.
    static void unaligned_store(T* p, vec v)
    {
        storeupd(p, v);
    }
/*
    static vec reverse(vec v)
    {
        v = __builtin_ia32_shufpd256(v, v, 0x5);
        v = __builtin_ia32_vperm2f128_pd256(v, v, shuf_mask!(0,0,0,1));
        return v;
    }
*/

    // LDC lacks the GCC builtins used above, so provide equivalents via
    // LDC's shufflevector / unaligned load-store intrinsics. Shuffle
    // indices select from the 8-element concatenation (a, b).
    version(LDC)
    {    
        // Per-lane unpack of low elements: (a0, b0, a2, b2).
        static vec unpcklpd(vec a, vec b)
        { 
            return shufflevector!(double4, 0, 4, 2, 6)(a, b);
        }

        // Per-lane unpack of high elements: (a1, b1, a3, b3).
        static vec unpckhpd(vec a, vec b)
        { 
            return shufflevector!(double4, 1, 5, 3, 7)(a, b);
        }

        // Low 128-bit lanes of a and b: (a0, a1, b0, b1).
        static vec interleave128_lo_d(vec a, vec b)
        {
            return shufflevector!(double4, 0, 1, 4, 5)(a, b);
        }

        // High 128-bit lanes of a and b: (a2, a3, b2, b3).
        static vec interleave128_hi_d(vec a, vec b)
        {
            return shufflevector!(double4, 2, 3, 6, 7)(a, b);
        }

        static vec loadupd(T* p)
        {
            return loadUnaligned!vec(cast(double*)p);
        }

        static void storeupd(T* p, vec v)
        {
            storeUnaligned!vec(v, cast(double*)p);
        }
    }

}
237 
/// Tuning parameters consumed by the generic FFT core in pfft.fft_impl.
/// NOTE(review): the exact meaning of each knob is defined by fft_impl,
/// which is not visible here — descriptions below are inferred from the
/// names and should be confirmed against that module.
struct Options
{
    // log2 of the chunk size used when bit-reversing large buffers.
    enum log2_bitreverse_large_chunk_size = 5;
    // Presumably the log2 transform size above which the "large" (cache
    // -blocked) code path is taken — TODO confirm.
    enum large_limit = 14;
    // log2 of the transform size the kernels are tuned for.
    enum log2_optimal_n = 8;
    // Butterfly passes fused per recursive call.
    enum passes_per_recursive_call = 4;
    // log2 of the chunk size processed per recursive pass.
    enum log2_recursive_passes_chunk_size = 4;
    // Preferred buffer alignment in bytes (4 KiB, i.e. page-sized).
    enum prefered_alignment = 4 * (1 << 10);
    // Marker member: presence of `fast_init` selects cheaper table setup
    // — TODO confirm against fft_impl.
    enum { fast_init };
}
248 
// When built as the combined SSE+AVX configuration (the only configuration
// this fork supports — see the commented-out `version = SSE_AVX` near the
// top), instantiate the FFT core with this module's Vector/Options policies.
// NOTE(review): pfft.fft_impl is already imported at module scope (top of
// file); the re-import here is redundant but harmless.
version(SSE_AVX)
{
    import pfft.fft_impl;
    // Implementation index 0 within the multi-ISA dispatch table.
    enum implementation = 0;
    alias TypeTuple!(FFT!(Vector, Options)) FFTs;
    mixin Instantiate!();
}