49#include "NE10_types.h"
50#include "NE10_macros.h"
53#include "NE10_fft.neonintrinsic.h"
/*
 * Radix-8 real-to-complex stage operating on float32x4_t vectors.
 * NOTE(review): the line carrying the function name and the Fout/Fin
 * parameters is not part of this excerpt; the name is presumably
 * ne10_radix8x4_r2c_neon -- confirm against the full file.
 *
 * fstride : number of loop iterations, also used as the load stride.
 * mstride : not referenced in the visible lines.
 * nfft    : not referenced in the visible lines.
 */
57 const ne10_int32_t fstride,
58 const ne10_int32_t mstride,
59 const ne10_int32_t nfft)
/* Eight input and eight output vector variables (q_in0..7 / q_out0..7 presumably). */
63 NE10_DECLARE_8(float32x4_t,q_in);
64 NE10_DECLARE_8(float32x4_t,q_out);
/* View the input and output buffers as arrays of 4-lane float vectors. */
66 const float32x4_t *Fin_neon = (float32x4_t*) Fin;
67 float32x4_t *Fout_neon = (float32x4_t*) Fout;
/* One butterfly per iteration, fstride iterations in total. */
69 for (f_count = fstride; f_count > 0; f_count --)
/* Load with stride fstride. */
72 NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);
/* Compute q_out from q_in. */
78 NE10_RADIX8x4_R2C_NEON_KERNEL(q_out,q_in);
/* Store with stride 1. */
84 NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,1);
/*
 * Step the input pointer back by 8*fstride vectors and forward by one.
 * Presumably the LOAD macro advanced Fin_neon by 8*fstride, so the net
 * effect is moving to the next input column -- TODO confirm in the macro.
 */
86 Fin_neon = Fin_neon - fstride * 8 + 1;
/*
 * Radix-8 complex-to-real stage operating on float32x4_t vectors.
 * NOTE(review): the line carrying the function name and the Fout/Fin
 * parameters is not part of this excerpt; the name is presumably
 * ne10_radix8x4_c2r_neon -- confirm against the full file.
 *
 * fstride : number of loop iterations, also used as the store stride.
 * mstride : not referenced in the visible lines.
 * nfft    : divisor of the optional output scaling factor.
 */
93 const ne10_int32_t fstride,
94 const ne10_int32_t mstride,
95 const ne10_int32_t nfft)
/* Eight input and eight output vector variables. */
99 NE10_DECLARE_8(float32x4_t,q_in);
100 NE10_DECLARE_8(float32x4_t,q_out);
/* Scaling factor 0.25 / nfft, broadcast to all four lanes. */
102 const ne10_float32_t one_by_N = 0.25 / nfft;
103 const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);
/* View the input and output buffers as arrays of 4-lane float vectors. */
105 const float32x4_t *Fin_neon = (float32x4_t*) Fin;
106 float32x4_t *Fout_neon = (float32x4_t*) Fout;
/* One butterfly per iteration, fstride iterations in total. */
108 for (f_count = fstride; f_count > 0; f_count --)
/* Load with stride 1; the R2C-named load macro is reused here. */
111 NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,1);
/* Compute q_out from q_in with the complex-to-real kernel. */
115 NE10_RADIX8x4_C2R_NEON_KERNEL(q_out,q_in);
/*
 * Only when NE10_DSP_RFFT_SCALING is defined: multiply every output
 * vector by the scaling factor. The matching #endif is not visible in
 * this excerpt.
 */
119#ifdef NE10_DSP_RFFT_SCALING
120 q_out0 = vmulq_f32(q_out0,one_by_N_neon);
121 q_out1 = vmulq_f32(q_out1,one_by_N_neon);
122 q_out2 = vmulq_f32(q_out2,one_by_N_neon);
123 q_out3 = vmulq_f32(q_out3,one_by_N_neon);
124 q_out4 = vmulq_f32(q_out4,one_by_N_neon);
125 q_out5 = vmulq_f32(q_out5,one_by_N_neon);
126 q_out6 = vmulq_f32(q_out6,one_by_N_neon);
127 q_out7 = vmulq_f32(q_out7,one_by_N_neon);
/* Store with stride fstride; the R2C-named store macro is reused here. */
131 NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);
/*
 * Radix-4 real-to-complex stage operating on float32x4_t vectors.
 * NOTE(review): the line carrying the function name and the Fout/Fin
 * parameters is not part of this excerpt; the name is presumably
 * ne10_radix4x4_r2c_neon -- confirm against the full file.
 *
 * fstride : number of loop iterations, also used as the load stride.
 * mstride : not referenced in the visible lines.
 * nfft    : not referenced in the visible lines.
 */
139 const ne10_int32_t fstride,
140 const ne10_int32_t mstride,
141 const ne10_int32_t nfft)
143 ne10_int32_t f_count;
/* View the input and output buffers as arrays of 4-lane float vectors. */
145 const float32x4_t *Fin_neon = (float32x4_t*) Fin;
146 float32x4_t *Fout_neon = (float32x4_t*) Fout;
/* One butterfly per iteration, fstride iterations in total. */
148 for (f_count = 0; f_count < fstride; f_count ++)
/* Four input and four output vector variables, declared per iteration. */
150 NE10_DECLARE_4(float32x4_t,q_in);
151 NE10_DECLARE_4(float32x4_t,q_out);
/* Load with stride fstride. */
154 NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);
/* Compute q_out from q_in. The invocation has no trailing semicolon. */
156 NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in)
/* Store with stride 1. */
159 NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,1);
/*
 * Step the input pointer back by 4*fstride vectors and forward by one.
 * Presumably the LOAD macro advanced Fin_neon by 4*fstride -- TODO
 * confirm in the macro definition.
 */
161 Fin_neon = Fin_neon - 4*fstride + 1;
/*
 * Radix-4 complex-to-real stage operating on float32x4_t vectors.
 * NOTE(review): the line carrying the function name and the Fout/Fin
 * parameters is not part of this excerpt; the name is presumably
 * ne10_radix4x4_c2r_neon -- confirm against the full file.
 *
 * fstride : number of loop iterations, also used as the store stride.
 * mstride : not referenced in the visible lines.
 * nfft    : divisor of the optional output scaling factor.
 */
168 const ne10_int32_t fstride,
169 const ne10_int32_t mstride,
170 const ne10_int32_t nfft)
172 ne10_int32_t f_count;
/* View the input and output buffers as arrays of 4-lane float vectors. */
174 const float32x4_t *Fin_neon = (float32x4_t*) Fin;
175 float32x4_t *Fout_neon = (float32x4_t*) Fout;
/* Scaling factor 0.25 / nfft, broadcast to all four lanes. */
177 const ne10_float32_t one_by_N = 0.25 / nfft;
178 const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);
/* One butterfly per iteration, fstride iterations in total. */
180 for (f_count = 0; f_count < fstride; f_count ++)
/* Four input and four output vector variables, declared per iteration. */
182 NE10_DECLARE_4(float32x4_t,q_in);
183 NE10_DECLARE_4(float32x4_t,q_out);
/* Load with stride 1; the R2C-named load macro is reused here. */
186 NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,1);
/* Complex-to-real kernel. The invocation has no trailing semicolon. */
190 NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in)
/*
 * Only when NE10_DSP_RFFT_SCALING is defined: multiply every output
 * vector by the scaling factor. The matching #endif is not visible in
 * this excerpt.
 */
194#ifdef NE10_DSP_RFFT_SCALING
195 q_out0 = vmulq_f32(q_out0,one_by_N_neon);
196 q_out1 = vmulq_f32(q_out1,one_by_N_neon);
197 q_out2 = vmulq_f32(q_out2,one_by_N_neon);
198 q_out3 = vmulq_f32(q_out3,one_by_N_neon);
/* Store with stride fstride; the R2C-named store macro is reused here. */
202 NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);
/*
 * First butterfly of a radix-4 real-to-complex stage with twiddles.
 * Loads four vectors, runs the R2C radix-4 kernel on them and writes the
 * four results to explicitly computed output positions.
 *
 * Fout_neon : output buffer, addressed in units of float32x4_t.
 * Fin_neon  : input buffer, addressed in units of float32x4_t.
 * out_step  : scales the output offsets used by the stores below.
 * in_step   : stride passed to the load macro.
 * NOTE(review): the end of the parameter list is not part of this
 * excerpt; any further parameter is not used in the visible lines.
 */
208NE10_INLINE
void ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
209 const float32x4_t *Fin_neon,
210 const ne10_int32_t out_step,
211 const ne10_int32_t in_step,
214 NE10_DECLARE_4(float32x4_t,q_in);
215 NE10_DECLARE_4(float32x4_t,q_out);
/* Load with stride in_step. */
218 NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);
220 NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in);
/*
 * Output offsets, in vectors: 0, 2*out_step - 1, 2*out_step and
 * 4*out_step - 1 for q_out0..q_out3 respectively.
 */
223 vst1q_f32( (ne10_float32_t*) (Fout_neon ), q_out0);
224 vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) - 1), q_out1);
225 vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) ), q_out2);
226 vst1q_f32( (ne10_float32_t*) (Fout_neon + 2 * (out_step << 1) - 1), q_out3);
/*
 * First butterfly of a radix-4 complex-to-real stage with twiddles.
 * Reads four vectors from explicitly computed input positions, runs the
 * C2R radix-4 kernel and stores the results with a regular stride.
 *
 * Fout_neon : output buffer, addressed in units of float32x4_t.
 * Fin_neon  : input buffer, addressed in units of float32x4_t.
 * out_step  : scales the INPUT offsets used by the loads below.
 * in_step   : stride passed to the store macro.
 * NOTE(review): the end of the parameter list is not part of this
 * excerpt; any further parameter is not used in the visible lines.
 */
229NE10_INLINE
void ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
230 const float32x4_t *Fin_neon,
231 const ne10_int32_t out_step,
232 const ne10_int32_t in_step,
235 NE10_DECLARE_4(float32x4_t,q_in);
236 NE10_DECLARE_4(float32x4_t,q_out);
/*
 * Input offsets, in vectors: 0, 2*out_step - 1, 2*out_step and
 * 4*out_step - 1 for q_in0..q_in3 respectively.
 */
239 q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon ) );
240 q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) - 1) );
241 q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) ) );
242 q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2 * (out_step << 1) - 1) );
246 NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in);
/* Store with stride in_step; the R2C-named store macro is reused here. */
251 NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
/*
 * Remaining butterflies of a radix-4 real-to-complex stage with twiddles.
 * Each iteration reads four complex inputs (a real and an imaginary
 * vector each), multiplies inputs 1..3 by twiddles[0..2], runs the two
 * kernel steps and stores four complex outputs.
 *
 * Fout_neon : output buffer, addressed in units of float32x4_t.
 * Fin_neon  : input buffer, addressed in units of float32x4_t.
 * out_step  : determines the iteration count and the output offsets.
 * in_step   : distance in vectors between consecutive inputs.
 * NOTE(review): `twiddles` is used below but its declaration is not part
 * of this excerpt; presumably it is the last parameter. Braces, the
 * #else/#endif lines, the asm statement opener and any per-iteration
 * pointer updates are likewise not visible here.
 */
254NE10_INLINE
void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
255 const float32x4_t *Fin_neon,
256 const ne10_int32_t out_step,
257 const ne10_int32_t in_step,
260 ne10_int32_t m_count;
/* Iteration count: out_step / 2 - 1. */
261 ne10_int32_t loop_count = (out_step>>1) -1;
/* Second output cursor; the expression simplifies to Fout_neon + 4*out_step - 4. */
262 float32x4_t *Fout_b = Fout_neon + (((out_step<<1)-1)<<1) - 2;
/* Three twiddle pairs and four input / output pairs of vectors. */
264 NE10_DECLARE_3(float32x4x2_t,q2_tw);
265 NE10_DECLARE_4(float32x4x2_t,q2_in);
266 NE10_DECLARE_4(float32x4x2_t,q2_out);
268 for (m_count = loop_count; m_count > 0; m_count -- )
/* Intrinsics path, used when NE10_INLINE_ASM_OPT is not defined. */
270#ifndef NE10_INLINE_ASM_OPT
/* Input k: val[0] from Fin_neon + k*in_step, val[1] from the next vector. */
272 q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step ) );
273 q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step + 1) );
275 q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step ) );
276 q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step + 1) );
278 q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step ) );
279 q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step + 1) );
281 q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step ) );
282 q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step + 1) );
/* Broadcast the real and imaginary part of each twiddle to all lanes. */
284 q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
285 q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
287 q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
288 q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
290 q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
291 q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
/* Twiddle multiplication: q2_out is produced from q2_in and q2_tw. */
294 NE10_RADIX4x4_R2C_TW_MUL_NEON (q2_out, q2_in, q2_tw);
/*
 * Inline-assembly path. The preprocessor condition guarding the #error
 * is not visible in this excerpt; presumably it tests for AArch64.
 */
297#error Currently, inline assembly optimizations are only available on AArch64.
/* Float pointers to the first input: real vector, and the vector 4 floats later. */
299 const ne10_float32_t *ptr_inr = ((
const ne10_float32_t *) Fin_neon);
300 const ne10_float32_t *ptr_ini = ((
const ne10_float32_t *) Fin_neon) + 4;
/*
 * Loads: input 0 goes straight into q2_out0 (no twiddle applied);
 * inputs 1..3 go to v10/v11, v12/v13, v14/v15, with ptr_inr
 * post-incremented by offset_in after each of the first three loads.
 * v0..v2 each receive one 64-bit element from ptr_tw.
 */
302 "ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
303 "ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
304 "ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
305 "ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
306 "ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
307 "ld1 {v0.1d, v1.1d, v2.1d}, [%[ptr_tw]] \n\t"
/*
 * Complex multiply per input k = 1..3, with tw lane 0 and lane 1:
 *   out_r = in_r * tw[0] - in_i * tw[1]
 *   out_i = in_r * tw[1] + in_i * tw[0]
 */
309 "fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t"
310 "fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t"
311 "fmls %[q2_out1r].4s, v11.4s, v0.4s[1] \n\t"
312 "fmla %[q2_out1i].4s, v11.4s, v0.4s[0] \n\t"
314 "fmul %[q2_out2r].4s, v12.4s, v1.4s[0] \n\t"
315 "fmul %[q2_out2i].4s, v12.4s, v1.4s[1] \n\t"
316 "fmls %[q2_out2r].4s, v13.4s, v1.4s[1] \n\t"
317 "fmla %[q2_out2i].4s, v13.4s, v1.4s[0] \n\t"
319 "fmul %[q2_out3r].4s, v14.4s, v2.4s[0] \n\t"
320 "fmul %[q2_out3i].4s, v14.4s, v2.4s[1] \n\t"
321 "fmls %[q2_out3r].4s, v15.4s, v2.4s[1] \n\t"
322 "fmla %[q2_out3i].4s, v15.4s, v2.4s[0] \n\t"
/* Read-write operands: the eight result vectors and both input pointers. */
323 : [q2_out0r]
"+w"(q2_out0.val[0]),
324 [q2_out0i]
"+w"(q2_out0.val[1]),
325 [q2_out1r]
"+w"(q2_out1.val[0]),
326 [q2_out1i]
"+w"(q2_out1.val[1]),
327 [q2_out2r]
"+w"(q2_out2.val[0]),
328 [q2_out2i]
"+w"(q2_out2.val[1]),
329 [q2_out3r]
"+w"(q2_out3.val[0]),
330 [q2_out3i]
"+w"(q2_out3.val[1]),
331 [ptr_inr]
"+r"(ptr_inr),
332 [ptr_ini]
"+r"(ptr_ini)
/* Inputs: in_step * 16 as the post-increment amount, and the twiddle pointer. */
333 : [offset_in]
"r"(in_step * 16),
334 [ptr_tw]
"r"(twiddles)
/* Clobbers: memory plus every scratch vector register named in the template. */
335 :
"memory",
"v0",
"v1",
"v2",
336 "v10",
"v11",
"v12",
"v13",
"v14",
"v15"
/* Two kernel steps: q2_out -> q2_in (S1), then q2_in -> q2_out (S2). */
341 NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1 (q2_in, q2_out);
342 NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2 (q2_out, q2_in);
/*
 * Stores, each as a val[0] / val[1] pair in adjacent vectors:
 * out0 at Fout_neon, out1 at Fout_neon + 2*out_step,
 * out2 at Fout_b,    out3 at Fout_b - 2*out_step.
 */
345 vst1q_f32( (ne10_float32_t*) ( Fout_neon ), q2_out0.val[0] );
346 vst1q_f32( (ne10_float32_t*) ( Fout_neon + 1), q2_out0.val[1] );
348 vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) ), q2_out1.val[0] );
349 vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) + 1), q2_out1.val[1] );
351 vst1q_f32( (ne10_float32_t*) ( Fout_b ), q2_out2.val[0] );
352 vst1q_f32( (ne10_float32_t*) ( Fout_b + 1), q2_out2.val[1] );
354 vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1) ), q2_out3.val[0] );
355 vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1) + 1), q2_out3.val[1] );
/*
 * Remaining butterflies of a radix-4 complex-to-real stage with twiddles.
 * Each iteration reads four complex inputs from explicitly computed
 * positions, runs the C2R twiddle kernel with twiddles[0..2] and stores
 * four complex outputs at a regular stride.
 *
 * Fout_neon : output buffer, addressed in units of float32x4_t.
 * Fin_neon  : input buffer, addressed in units of float32x4_t.
 * out_step  : determines the iteration count and the INPUT offsets.
 * in_step   : distance in vectors between consecutive outputs.
 * NOTE(review): `twiddles` is used below but its declaration is not part
 * of this excerpt; presumably it is the last parameter. Braces and any
 * per-iteration pointer updates are likewise not visible here.
 */
365NE10_INLINE
void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
366 const float32x4_t *Fin_neon,
367 const ne10_int32_t out_step,
368 const ne10_int32_t in_step,
371 ne10_int32_t m_count;
/* Iteration count: out_step / 2 - 1. */
372 ne10_int32_t loop_count = (out_step>>1) -1;
/* Second input cursor; the expression simplifies to Fin_neon + 4*out_step - 4. */
373 const float32x4_t *Fin_b = Fin_neon + (((out_step<<1)-1)<<1) - 2;
/* Three twiddle pairs and four input / output pairs of vectors. */
375 NE10_DECLARE_3(float32x4x2_t,q2_tw);
376 NE10_DECLARE_4(float32x4x2_t,q2_in);
377 NE10_DECLARE_4(float32x4x2_t,q2_out);
379 for (m_count = loop_count; m_count > 0; m_count -- )
/*
 * Loads, each as a val[0] / val[1] pair from adjacent vectors:
 * in0 from Fin_neon, in1 from Fin_neon + 2*out_step,
 * in2 from Fin_b,    in3 from Fin_b - 2*out_step.
 */
382 q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon ) );
383 q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + 1) );
385 q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) ) );
386 q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) + 1) );
388 q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b ) );
389 q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b + 1) );
391 q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1) ) );
392 q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1) + 1) );
/* Broadcast the real and imaginary part of each twiddle to all lanes. */
394 q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
395 q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);
397 q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
398 q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);
400 q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
401 q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);
/* Kernel: q2_out is produced from q2_in and q2_tw. */
406 NE10_RADIX4x4_C2R_TW_NEON_KERNEL(q2_out,q2_in,q2_tw);
/* Output k: val[0] to Fout_neon + k*in_step, val[1] to the next vector. */
411 vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step ), q2_out0.val[0] );
412 vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step + 1), q2_out0.val[1] );
414 vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step ), q2_out1.val[0] );
415 vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step + 1), q2_out1.val[1] );
417 vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step ), q2_out2.val[0] );
418 vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step + 1), q2_out2.val[1] );
420 vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step ), q2_out3.val[0] );
421 vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step + 1), q2_out3.val[1] );