Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
NE10_rfft_float32.neonintrinsic.c
1/*
2 * Copyright 2014-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/* license of Kiss FFT */
29/*
30Copyright (c) 2003-2010, Mark Borgerding
31
32All rights reserved.
33
34Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35
36 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39
40THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41*/
42
43/*
44 * NE10 Library : dsp/NE10_rfft_float32.neonintrinsic.c
45 */
46
47#include <arm_neon.h>
48
49#include "NE10_types.h"
50#include "NE10_macros.h"
51#include "NE10_fft.h"
52#include "NE10_dsp.h"
53#include "NE10_fft.neonintrinsic.h"
54
55NE10_INLINE void ne10_radix8x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
56 const ne10_fft_cpx_float32_t *Fin,
57 const ne10_int32_t fstride,
58 const ne10_int32_t mstride,
59 const ne10_int32_t nfft)
60{
61 ne10_int32_t f_count;
62
63 NE10_DECLARE_8(float32x4_t,q_in);
64 NE10_DECLARE_8(float32x4_t,q_out);
65
66 const float32x4_t *Fin_neon = (float32x4_t*) Fin; // 8 x fstride
67 float32x4_t *Fout_neon = (float32x4_t*) Fout; // fstride x 8
68
69 for (f_count = fstride; f_count > 0; f_count --)
70 {
71 // from Fin_neon load 8 float32x4_t into q_in0 ~ q_in7, by step = fstride
72 NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);
73
74 // print q_in0 ~ q_in7
75 // NE10_PRINT_Qx8_VECTOR(q_in);
76
77 // do r2c fft, size = 8
78 NE10_RADIX8x4_R2C_NEON_KERNEL(q_out,q_in);
79
80 // print q_out0 ~ q_out7
81 // NE10_PRINT_Qx8_VECTOR(q_out);
82
83 // store q_out0 ~ q_out7 to Fout_neon, by step = 1
84 NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,1);
85
86 Fin_neon = Fin_neon - fstride * 8 + 1;
87 Fout_neon += 8; // next column
88 }
89}
90
91NE10_INLINE void ne10_radix8x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
92 const ne10_fft_cpx_float32_t *Fin,
93 const ne10_int32_t fstride,
94 const ne10_int32_t mstride,
95 const ne10_int32_t nfft)
96{
97 ne10_int32_t f_count;
98
99 NE10_DECLARE_8(float32x4_t,q_in);
100 NE10_DECLARE_8(float32x4_t,q_out);
101
102 const ne10_float32_t one_by_N = 0.25 / nfft;
103 const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);
104
105 const float32x4_t *Fin_neon = (float32x4_t*) Fin;
106 float32x4_t *Fout_neon = (float32x4_t*) Fout;
107
108 for (f_count = fstride; f_count > 0; f_count --)
109 {
110 // from Fin_neon load 8 float32x4_t into q_in0 ~ q_in7, by step = 1
111 NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,1);
112
113 // NE10_PRINT_Qx8_VECTOR(q_in);
114
115 NE10_RADIX8x4_C2R_NEON_KERNEL(q_out,q_in);
116
117 // NE10_PRINT_Qx8_VECTOR(q_out);
118
119#ifdef NE10_DSP_RFFT_SCALING
120 q_out0 = vmulq_f32(q_out0,one_by_N_neon);
121 q_out1 = vmulq_f32(q_out1,one_by_N_neon);
122 q_out2 = vmulq_f32(q_out2,one_by_N_neon);
123 q_out3 = vmulq_f32(q_out3,one_by_N_neon);
124 q_out4 = vmulq_f32(q_out4,one_by_N_neon);
125 q_out5 = vmulq_f32(q_out5,one_by_N_neon);
126 q_out6 = vmulq_f32(q_out6,one_by_N_neon);
127 q_out7 = vmulq_f32(q_out7,one_by_N_neon);
128#endif
129
130 // store
131 NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);
132
133 Fout_neon ++;
134 }
135}
136
137NE10_INLINE void ne10_radix4x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
138 const ne10_fft_cpx_float32_t *Fin,
139 const ne10_int32_t fstride,
140 const ne10_int32_t mstride,
141 const ne10_int32_t nfft)
142{
143 ne10_int32_t f_count;
144
145 const float32x4_t *Fin_neon = (float32x4_t*) Fin;
146 float32x4_t *Fout_neon = (float32x4_t*) Fout;
147
148 for (f_count = 0; f_count < fstride; f_count ++)
149 {
150 NE10_DECLARE_4(float32x4_t,q_in);
151 NE10_DECLARE_4(float32x4_t,q_out);
152
153 // load
154 NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);
155
156 NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in)
157
158 // store
159 NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,1);
160
161 Fin_neon = Fin_neon - 4*fstride + 1;
162 Fout_neon += 4;
163 }
164}
165
166NE10_INLINE void ne10_radix4x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
167 const ne10_fft_cpx_float32_t *Fin,
168 const ne10_int32_t fstride,
169 const ne10_int32_t mstride,
170 const ne10_int32_t nfft)
171{
172 ne10_int32_t f_count;
173
174 const float32x4_t *Fin_neon = (float32x4_t*) Fin;
175 float32x4_t *Fout_neon = (float32x4_t*) Fout;
176
177 const ne10_float32_t one_by_N = 0.25 / nfft;
178 const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);
179
180 for (f_count = 0; f_count < fstride; f_count ++)
181 {
182 NE10_DECLARE_4(float32x4_t,q_in);
183 NE10_DECLARE_4(float32x4_t,q_out);
184
185 // load
186 NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,1);
187
188 // NE10_PRINT_Qx4_VECTOR(q_in);
189
190 NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in)
191
192 // NE10_PRINT_Qx4_VECTOR(q_out);
193
194#ifdef NE10_DSP_RFFT_SCALING
195 q_out0 = vmulq_f32(q_out0,one_by_N_neon);
196 q_out1 = vmulq_f32(q_out1,one_by_N_neon);
197 q_out2 = vmulq_f32(q_out2,one_by_N_neon);
198 q_out3 = vmulq_f32(q_out3,one_by_N_neon);
199#endif
200
201 // store
202 NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);
203
204 Fout_neon ++;
205 }
206}
207
208NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
209 const float32x4_t *Fin_neon,
210 const ne10_int32_t out_step,
211 const ne10_int32_t in_step,
212 const ne10_fft_cpx_float32_t *twiddles)
213{
214 NE10_DECLARE_4(float32x4_t,q_in);
215 NE10_DECLARE_4(float32x4_t,q_out);
216
217 // load
218 NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);
219
220 NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in);
221
222 // store
223 vst1q_f32( (ne10_float32_t*) (Fout_neon ), q_out0);
224 vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) - 1), q_out1);
225 vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) ), q_out2);
226 vst1q_f32( (ne10_float32_t*) (Fout_neon + 2 * (out_step << 1) - 1), q_out3);
227}
228
229NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
230 const float32x4_t *Fin_neon,
231 const ne10_int32_t out_step,
232 const ne10_int32_t in_step,
233 const ne10_fft_cpx_float32_t *twiddles)
234{
235 NE10_DECLARE_4(float32x4_t,q_in);
236 NE10_DECLARE_4(float32x4_t,q_out);
237
238 // load
239 q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon ) );
240 q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) - 1) );
241 q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) ) );
242 q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2 * (out_step << 1) - 1) );
243
244 // NE10_PRINT_Qx4_VECTOR(q_in);
245
246 NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in);
247
248 // NE10_PRINT_Qx4_VECTOR(q_out);
249
250 // store
251 NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
252}
253
254NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
255 const float32x4_t *Fin_neon,
256 const ne10_int32_t out_step,
257 const ne10_int32_t in_step,
258 const ne10_fft_cpx_float32_t *twiddles)
259{
260 ne10_int32_t m_count;
261 ne10_int32_t loop_count = (out_step>>1) -1;
262 float32x4_t *Fout_b = Fout_neon + (((out_step<<1)-1)<<1) - 2; // reversed
263
264 NE10_DECLARE_3(float32x4x2_t,q2_tw);
265 NE10_DECLARE_4(float32x4x2_t,q2_in);
266 NE10_DECLARE_4(float32x4x2_t,q2_out);
267
268 for (m_count = loop_count; m_count > 0; m_count -- )
269 {
270#ifndef NE10_INLINE_ASM_OPT
271 // load
272 q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step ) );
273 q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step + 1) );
274
275 q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step ) );
276 q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step + 1) );
277
278 q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step ) );
279 q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step + 1) );
280
281 q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step ) );
282 q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step + 1) );
283
284 q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
285 q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
286
287 q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
288 q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
289
290 q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
291 q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
292
293 // R2C TW KERNEL
294 NE10_RADIX4x4_R2C_TW_MUL_NEON (q2_out, q2_in, q2_tw);
295#else // NE10_INLINE_ASM_OPT
296#ifndef __aarch64__
297#error Currently, inline assembly optimizations are only available on AArch64.
298#else // __aarch64__
299 const ne10_float32_t *ptr_inr = ((const ne10_float32_t *) Fin_neon);
300 const ne10_float32_t *ptr_ini = ((const ne10_float32_t *) Fin_neon) + 4;
301 asm volatile (
302 "ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
303 "ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
304 "ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
305 "ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
306 "ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
307 "ld1 {v0.1d, v1.1d, v2.1d}, [%[ptr_tw]] \n\t"
308
309 "fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t" // RR
310 "fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t" // RI
311 "fmls %[q2_out1r].4s, v11.4s, v0.4s[1] \n\t" // RR - II
312 "fmla %[q2_out1i].4s, v11.4s, v0.4s[0] \n\t" // RI + IR
313
314 "fmul %[q2_out2r].4s, v12.4s, v1.4s[0] \n\t" // RR
315 "fmul %[q2_out2i].4s, v12.4s, v1.4s[1] \n\t" // RI
316 "fmls %[q2_out2r].4s, v13.4s, v1.4s[1] \n\t" // RR - II
317 "fmla %[q2_out2i].4s, v13.4s, v1.4s[0] \n\t" // RI + IR
318
319 "fmul %[q2_out3r].4s, v14.4s, v2.4s[0] \n\t" // RR
320 "fmul %[q2_out3i].4s, v14.4s, v2.4s[1] \n\t" // RI
321 "fmls %[q2_out3r].4s, v15.4s, v2.4s[1] \n\t" // RR - II
322 "fmla %[q2_out3i].4s, v15.4s, v2.4s[0] \n\t" // RI + IR
323 : [q2_out0r]"+w"(q2_out0.val[0]),
324 [q2_out0i]"+w"(q2_out0.val[1]),
325 [q2_out1r]"+w"(q2_out1.val[0]),
326 [q2_out1i]"+w"(q2_out1.val[1]),
327 [q2_out2r]"+w"(q2_out2.val[0]),
328 [q2_out2i]"+w"(q2_out2.val[1]),
329 [q2_out3r]"+w"(q2_out3.val[0]),
330 [q2_out3i]"+w"(q2_out3.val[1]),
331 [ptr_inr]"+r"(ptr_inr),
332 [ptr_ini]"+r"(ptr_ini)
333 : [offset_in]"r"(in_step * 16),
334 [ptr_tw]"r"(twiddles)
335 : "memory", "v0", "v1", "v2",
336 "v10", "v11", "v12", "v13", "v14", "v15"
337 );
338#endif // __aarch64__
339#endif // NE10_INLINE_ASM_OPT
340
341 NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1 (q2_in, q2_out);
342 NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2 (q2_out, q2_in);
343
344 // store
345 vst1q_f32( (ne10_float32_t*) ( Fout_neon ), q2_out0.val[0] );
346 vst1q_f32( (ne10_float32_t*) ( Fout_neon + 1), q2_out0.val[1] );
347
348 vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) ), q2_out1.val[0] );
349 vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) + 1), q2_out1.val[1] );
350
351 vst1q_f32( (ne10_float32_t*) ( Fout_b ), q2_out2.val[0] );
352 vst1q_f32( (ne10_float32_t*) ( Fout_b + 1), q2_out2.val[1] );
353
354 vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1) ), q2_out3.val[0] );
355 vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1) + 1), q2_out3.val[1] );
356
357 // update pointers
358 Fin_neon += 2;
359 Fout_neon += 2;
360 Fout_b -= 2;
361 twiddles += 3;
362 }
363}
364
365NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
366 const float32x4_t *Fin_neon,
367 const ne10_int32_t out_step,
368 const ne10_int32_t in_step,
369 const ne10_fft_cpx_float32_t *twiddles)
370{
371 ne10_int32_t m_count;
372 ne10_int32_t loop_count = (out_step>>1) -1;
373 const float32x4_t *Fin_b = Fin_neon + (((out_step<<1)-1)<<1) - 2; // reversed
374
375 NE10_DECLARE_3(float32x4x2_t,q2_tw);
376 NE10_DECLARE_4(float32x4x2_t,q2_in);
377 NE10_DECLARE_4(float32x4x2_t,q2_out);
378
379 for (m_count = loop_count; m_count > 0; m_count -- )
380 {
381 // load
382 q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon ) );
383 q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + 1) );
384
385 q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) ) );
386 q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) + 1) );
387
388 q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b ) );
389 q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b + 1) );
390
391 q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1) ) );
392 q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1) + 1) );
393
394 q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
395 q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
396
397 q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
398 q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
399
400 q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
401 q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
402
403 // NE10_PRINT_Q2x4_VECTOR(q2_in);
404
405 // R2C TW KERNEL
406 NE10_RADIX4x4_C2R_TW_NEON_KERNEL(q2_out,q2_in,q2_tw);
407
408 // NE10_PRINT_Q2x4_VECTOR(q2_out);
409
410 // store
411 vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step ), q2_out0.val[0] );
412 vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step + 1), q2_out0.val[1] );
413
414 vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step ), q2_out1.val[0] );
415 vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step + 1), q2_out1.val[1] );
416
417 vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step ), q2_out2.val[0] );
418 vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step + 1), q2_out2.val[1] );
419
420 vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step ), q2_out3.val[0] );
421 vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step + 1), q2_out3.val[1] );
422
423 // update pointers
424 Fin_neon += 2;
425 Fout_neon += 2;
426 Fin_b -= 2;
427 twiddles += 3;
428 }
429}
430