1 //          Copyright Jernej Krempuš 2012
2 // Distributed under the Boost Software License, Version 1.0.
3 //    (See accompanying file LICENSE_1_0.txt or copy at
4 //          http://www.boost.org/LICENSE_1_0.txt)
5 
6 module pfft.neon_float;
7 
8 import pfft.fft_impl;
9 import gcc.builtins;
10 import core.simd;
11 
12 version(GNU) { } else
13 {
14     static assert(0, "This compiler is not supported.");
15 }
16 
/// Thin value wrapper around a four-lane float NEON vector.
struct NeonVec
{
    float4 v;

    this(float4 _v){ v = _v; }

    /// Element-wise arithmetic via the GDC NEON builtins.  A single
    /// template handles "+", "-" and "*"; the operator is selected at
    /// compile time, so each instantiation contains exactly one builtin
    /// call.  The trailing `3` argument matches the original builtin
    /// signatures (mode flag expected by gcc.builtins — TODO confirm
    /// its exact meaning against the GDC NEON builtin docs).
    NeonVec opBinary(string s)(NeonVec other)
        if(s == "+" || s == "-" || s == "*")
    {
        static if(s == "+")
            return NeonVec(__builtin_neon_vaddv4sf(v, other.v, 3));
        else static if(s == "-")
            return NeonVec(__builtin_neon_vsubv4sf(v, other.v, 3));
        else
            return NeonVec(__builtin_neon_vmulv4sf(v, other.v, 3));
    }

    /*NeonVec muladd(NeonVec a, NeonVec b)
    {
        return NeonVec(__builtin_neon_vmlav4sf(v, a.v, b.v, 3));
    }
    NeonVec mulsub(NeonVec a, NeonVec b)
    {
        return NeonVec(__builtin_neon_vmlsv4sf(v, a.v, b.v, 3));
    }*/
}
45 
/// Platform-description struct consumed by pfft.fft_impl: defines the
/// vector type, the scalar type and the SIMD shuffle primitives for the
/// ARM NEON single-precision backend.
struct Vector 
{
    alias NeonVec vec;
    alias float T;
    
    enum vec_size = 4;                      // four floats per NEON q-register
    enum log2_bitreverse_chunk_size = 2;   
 
    /// Broadcast a scalar into all four lanes.
    static vec scalar_to_vector(T a)
    {
        return vec(a);
    }
    
    /// Deinterleave N complex floats at `arr` (re,im pairs) into a
    /// vector of reals `rr` and a vector of imaginaries `ri`.
    /// FIX: the original chained a run-time `else if(N==2)` after a
    /// `static if`; N is a template parameter, so the dispatch belongs
    /// at compile time (`else static if`).
    static void complex_array_to_real_imag_vec(int N)(T * arr, ref vec rr, ref vec ri)
    {
        static if(N == 4)
        {
            deinterleave((cast(vec*)arr)[0], (cast(vec*)arr)[1], rr, ri);
        }
        else static if(N == 2)
        {
            // Load two q-registers worth of data and unzip even/odd
            // 32-bit lanes (vuzp) so rr gets the reals, ri the imags.
            asm nothrow @nogc
            {
                "vldmia  %2, {%e0-%f0} \n"
                "vmov %q1, %q0 \n"
                "vuzp.32 %q0, %q1 \n"
                "vuzp.32 %e0, %f0 \n"
                "vuzp.32 %e1, %f1 \n"
                : "=w" rr, "=w" ri
                : "r" arr ;
            }
        }
        // other N: no instantiation exists in this backend — TODO confirm
    }
    
    /// Transpose a 2x2 (elements_per_vector == 2) or lane-pair
    /// (elements_per_vector == 4, via vtrn) arrangement of two vectors.
    /// FIX: `elements_per_vector` is a template parameter; use
    /// `static if` so only the selected branch is compiled into each
    /// instantiation (the original run-time `if` compiled both).
    static void transpose(int elements_per_vector)(
        vec a0, vec a1, ref vec r0, ref vec r1)
    {
        static if(elements_per_vector == 4)
        {
            float4[2] tmp;
            __builtin_neon_vtrnv4sf(&tmp[0], a0.v, a1.v);
            r0.v = tmp[0];
            r1.v = tmp[1];
        }
        else static if(elements_per_vector == 2)
        {
            // Swap the high d-register of a0 with the low d-register of a1.
            asm nothrow @nogc
            {
                "vswp %f0, %e1 \n"
                :"+w" a0.v, "+w" a1.v ;
            }
            r0 = a0;
            r1 = a1;
        }
    }
    
    /// Zip lanes of a0/a1 into r0/r1 (complex interleave).
    static void interleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        float4[2] tmp;
        __builtin_neon_vzipv4sf(&tmp[0], a0.v, a1.v);
        r0.v = tmp[0];
        r1.v = tmp[1];
    }
    
    /// Unzip lanes of a0/a1 into r0/r1 (complex deinterleave).
    static void deinterleave(vec a0, vec a1, ref vec r0, ref vec r1)
    {
        float4[2] tmp;
        __builtin_neon_vuzpv4sf(&tmp[0], a0.v, a1.v);
        r0.v = tmp[0];
        r1.v = tmp[1];
    }
    
    /// Reinterpret a float pointer as a float4 pointer (no alignment
    /// check — callers must pass 16-byte-aligned data; TODO confirm).
    private static float4 * v(float * a)
    {
        return cast(float4*)a;
    }
    
    /// In-register 4x4 bit-reverse permutation of four float4 values
    /// using two transposes (vtrn) and two d-register swaps (vswp).
    /// FIX: return type was omitted (inferred); made explicit `void`.
    private static void _bit_reverse(ref float4 a0, ref float4 a1, 
                               ref float4 a2, ref float4 a3)
    {
        asm nothrow @nogc
        {
            "vtrn.32 %q0, %q2 \n"
            "vtrn.32 %q1, %q3 \n"
            "vswp %f0, %e1 \n"
            "vswp %f2, %e3 \n"
            : "+w" a0, "+w" a1, "+w" a2, "+w" a3;
        }
    }
    
    /// Bit-reverse two 4x4 chunks (rows m floats apart at p0 and p1)
    /// and swap them: p1 receives reversed p0, p0 receives reversed p1.
    static void bit_reverse_swap(T * p0, T * p1, int m)
    {                
        float4  
        a0 = *v(p0 + 0 * m), 
        a1 = *v(p0 + 1 * m), 
        a2 = *v(p0 + 2 * m), 
        a3 = *v(p0 + 3 * m);
        _bit_reverse(a0, a1, a2, a3);
        
        float4  
        b0 = *v(p1 + 0 * m), 
        b1 = *v(p1 + 1 * m), 
        b2 = *v(p1 + 2 * m), 
        b3 = *v(p1 + 3 * m);
        *v(p1 + 0 * m) = a0;
        *v(p1 + 1 * m) = a1;
        *v(p1 + 2 * m) = a2;
        *v(p1 + 3 * m) = a3;
        
        _bit_reverse(b0, b1, b2, b3);
        *v(p0 + 0 * m) = b0;
        *v(p0 + 1 * m) = b1;
        *v(p0 + 2 * m) = b2;
        *v(p0 + 3 * m) = b3;
    }

    /// Bit-reverse a single 4x4 chunk in place (rows m floats apart).
    static void bit_reverse(T * p, int m)
    {
        _bit_reverse(*v(p + 0 * m), *v(p + 1 * m), *v(p + 2 * m), *v(p + 3 * m));
    }
}
168 
/// Compile-time tuning parameters consumed by pfft.fft_impl.
/// Values presumably chosen empirically for ARM cores — TODO confirm
/// against the generic backend before changing any of them.
struct Options
{
    // Transforms of size > 2^large_limit take the "large" code path.
    enum large_limit = 14;

    // log2 of the transform size the kernels are tuned for.
    enum log2_optimal_n = 9;

    // Chunking of the recursive large-transform passes.
    enum passes_per_recursive_call = 5;
    enum log2_recursive_passes_chunk_size = 5;

    // Chunk size used by the large bit-reversal routine.
    enum log2_bitreverse_large_chunk_size = 5;
}
177