Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
Loading...
Searching...
No Matches
NE10_fft.c
1/*
2 * Copyright 2014-15 ARM Limited and Contributors.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of ARM Limited nor the
13 * names of its contributors may be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28/* license of Kiss FFT */
29/*
30Copyright (c) 2003-2010, Mark Borgerding
31
32All rights reserved.
33
34Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35
36 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38 * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39
40THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41*/
42
43/*
44 * NE10 Library : dsp/NE10_fft.c
45 */
46
47#include "NE10_types.h"
48#include "NE10_macros.h"
49#include "NE10_fft.h"
50
51/* factors buffer:
52 * 0: stage number
53 * 1: stride for the first stage
54 * 2*stage number+2: algorithm flag to imply whether the generic algorithm works.
55 * others: factors
56 *
57 * Only the leading 42 int32 is used to store factors.
58 * The left can be used as algorithm flags, or status flags.
59 * Even the leading bits of stage number can be reused.
60 * */
61ne10_int32_t ne10_factor (ne10_int32_t n,
62 ne10_int32_t * facbuf,
63 ne10_int32_t ne10_factor_flags)
64{
65 // This is a walk around. We need to "return" some flags.
66 // Otherwise, we need to modify signature of ne10_factor.
67 assert (NE10_MAXFACTORS >= 32);
68
69 if ((facbuf == NULL)
70 || (n <= 0))
71 {
72 return NE10_ERR;
73 }
74
75 ne10_int32_t p;
76 ne10_int32_t i = 1;
77 ne10_int32_t stage_num = 0;
78 ne10_int32_t stride_max = n;
79
80 // Default algorithm flag is NE10_FFT_ALG_24
81 ne10_int32_t alg_flag = NE10_FFT_ALG_24;
82
83 // Factor out powers of 4, 2, 5, 3, and other.
84 do
85 {
86 // If NE10_FACTOR_FLAGS has enable NE10_FACTOR_EIGHT.
87 // Try to combine one radix-4 and one radix-2 stages
88 // into one radix-8 stage.
89 if ((ne10_factor_flags & NE10_FACTOR_EIGHT)
90 && ((n==8) || (n==40) || (n==24)))
91 {
92 switch (n)
93 {
94 case 8:
95 p = 8;
96 break;
97 case 24:
98 p = 3;
99 alg_flag = NE10_FFT_ALG_ANY;
100 break;
101 default: // n == 40
102 p = 5;
103 alg_flag = NE10_FFT_ALG_ANY;
104 break;
105 }
106 }
107 else if ((n % 4) == 0)
108 {
109 p = 4;
110 }
111 else if ((n % 2) == 0)
112 {
113 p = 2;
114 }
115 else if ((n % 5) == 0)
116 {
117 p = 5;
118 alg_flag = NE10_FFT_ALG_ANY;
119 }
120 else if ((n % 3) == 0)
121 {
122 p = 3;
123 alg_flag = NE10_FFT_ALG_ANY;
124 }
125 else // stop factoring
126 {
127 p = n;
128 alg_flag = NE10_FFT_ALG_ANY;
129 }
130
131 n /= p;
132 facbuf[2 * i] = p;
133 facbuf[2 * i + 1] = n;
134 i++;
135 stage_num++;
136 }
137 while (n > 1);
138 facbuf[0] = stage_num;
139 facbuf[1] = stride_max / p;
140
141 if (stage_num > 21)
142 {
143 // Since nfft is ne10_int32_t, stage_num can never be greater than 21,
144 // because 3^21 > 2^32
145 return NE10_ERR;
146 }
147
148 facbuf[2 * i] = alg_flag;
149 return NE10_OK;
150}
151
152// Twiddles matrix [radix-1][mstride]
153// First column (k == 0) is ignored because phase == 1, and
154// twiddle = (1.0, 0.0).
155void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * twiddles,
156 const ne10_int32_t mstride,
157 const ne10_int32_t fstride,
158 const ne10_int32_t radix,
159 const ne10_int32_t nfft)
160{
161 ne10_int32_t j, k;
162 ne10_float32_t phase;
163 const ne10_float64_t pi = NE10_PI;
164
165 for (j = 0; j < mstride; j++)
166 {
167 for (k = 1; k < radix; k++) // phase = 1 when k = 0
168 {
169 phase = -2 * pi * fstride * k * j / nfft;
170 twiddles[mstride * (k - 1) + j].r = (ne10_float32_t) cos (phase);
171 twiddles[mstride * (k - 1) + j].i = (ne10_float32_t) sin (phase);
172 } // radix
173 } // mstride
174}
175
176// Transposed twiddles matrix [mstride][radix-1]
177// First row (k == 0) is ignored because phase == 1, and
178// twiddle = (1.0, 0.0).
179// Transposed twiddle tables are used in RFFT to avoid memory access by a large
180// stride.
181void ne10_fft_generate_twiddles_line_transposed_float32 (
182 ne10_fft_cpx_float32_t* twiddles,
183 const ne10_int32_t mstride,
184 const ne10_int32_t fstride,
185 const ne10_int32_t radix,
186 const ne10_int32_t nfft)
187{
188 ne10_int32_t j, k;
189 ne10_float32_t phase;
190 const ne10_float64_t pi = NE10_PI;
191
192 for (j = 0; j < mstride; j++)
193 {
194 for (k = 1; k < radix; k++) // phase = 1 when k = 0
195 {
196 phase = -2 * pi * fstride * k * j / nfft;
197 twiddles[(radix - 1) * j + k - 1].r = (ne10_float32_t) cos (phase);
198 twiddles[(radix - 1) * j + k - 1].i = (ne10_float32_t) sin (phase);
199 } // radix
200 } // mstride
201}
202
203// Twiddles matrix [mstride][radix-1]
204// First column (k == 0)is ignored because phase == 1, and
205// twiddle = (1.0, 0.0).
206static void ne10_fft_generate_twiddles_line_int32 (ne10_fft_cpx_int32_t * twiddles,
207 const ne10_int32_t mstride,
208 const ne10_int32_t fstride,
209 const ne10_int32_t radix,
210 const ne10_int32_t nfft)
211{
212 ne10_int32_t j, k;
213 ne10_float32_t phase;
214 const ne10_float64_t pi = NE10_PI;
215
216 for (j = 0; j < mstride; j++)
217 {
218 for (k = 1; k < radix; k++) // phase = 1 when k = 0
219 {
220 phase = -2 * pi * fstride * k * j / nfft;
221
222 ne10_fft_cpx_int32_t *tw = &twiddles[mstride * (k - 1) + j];
223
224 tw->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos(phase));
225 tw->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin(phase));
226 } // radix
227 } // mstride
228}
229
230ne10_fft_cpx_int32_t* ne10_fft_generate_twiddles_int32 (ne10_fft_cpx_int32_t * twiddles,
231 const ne10_int32_t * factors,
232 const ne10_int32_t nfft )
233{
234 ne10_int32_t stage_count = factors[0];
235 ne10_int32_t fstride = factors[1];
236 ne10_int32_t mstride;
237 ne10_int32_t cur_radix; // current radix
238
239 // for first stage
240 cur_radix = factors[2 * stage_count];
241 if (cur_radix % 2) // current radix is not 4 or 2
242 {
243 twiddles += 1;
244 ne10_fft_generate_twiddles_line_int32 (twiddles, 1, fstride, cur_radix, nfft);
245 twiddles += cur_radix - 1;
246 }
247 stage_count--;
248
249 // for other stage
250 for (; stage_count > 0; stage_count--)
251 {
252 cur_radix = factors[2 * stage_count];
253 fstride /= cur_radix;
254 mstride = factors[2 * stage_count + 1];
255 ne10_fft_generate_twiddles_line_int32 (twiddles, mstride, fstride, cur_radix, nfft);
256 twiddles += mstride * (cur_radix - 1);
257 } // stage_count
258
259 return twiddles;
260}
261
262typedef void (*line_generator_float32)(ne10_fft_cpx_float32_t*,
263 const ne10_int32_t,
264 const ne10_int32_t,
265 const ne10_int32_t,
266 const ne10_int32_t);
267
268ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_impl_float32 (
269 line_generator_float32 generator,
270 ne10_fft_cpx_float32_t * twiddles,
271 const ne10_int32_t * factors,
272 const ne10_int32_t nfft)
273{
274 ne10_int32_t stage_count = factors[0];
275 ne10_int32_t fstride = factors[1];
276 ne10_int32_t mstride;
277 ne10_int32_t cur_radix; // current radix
278
279 // for first stage
280 cur_radix = factors[2 * stage_count];
281 if (cur_radix % 2) // current radix is not 4 or 2
282 {
283 twiddles[0].r = 1.0;
284 twiddles[0].i = 0.0;
285 twiddles += 1;
286 generator (twiddles, 1, fstride, cur_radix, nfft);
287 twiddles += cur_radix - 1;
288 }
289 stage_count --;
290
291 // for other stage
292 for (; stage_count > 0; stage_count --)
293 {
294 cur_radix = factors[2 * stage_count];
295 fstride /= cur_radix;
296 mstride = factors[2 * stage_count + 1];
297 generator (twiddles, mstride, fstride, cur_radix, nfft);
298 twiddles += mstride * (cur_radix - 1);
299 } // stage_count
300
301 return twiddles;
302}
303
304ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles,
305 const ne10_int32_t * factors,
306 const ne10_int32_t nfft )
307{
308 line_generator_float32 generator = ne10_fft_generate_twiddles_line_float32;
309 twiddles = ne10_fft_generate_twiddles_impl_float32(generator,
310 twiddles, factors, nfft);
311 return twiddles;
312}
313
314ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_transposed_float32 (
315 ne10_fft_cpx_float32_t * twiddles,
316 const ne10_int32_t * factors,
317 const ne10_int32_t nfft)
318{
319 line_generator_float32 generator =
320 ne10_fft_generate_twiddles_line_transposed_float32;
321 twiddles = ne10_fft_generate_twiddles_impl_float32(generator,
322 twiddles, factors, nfft);
323 return twiddles;
324}
325
338{
339 // For input shorter than 16, fall back to c version.
340 // We would not get much improvement from NEON for these cases.
341 if (nfft < 16)
342 {
343 return ne10_fft_alloc_c2c_float32_c (nfft);
344 }
345
346 ne10_fft_cfg_float32_t st = NULL;
347 ne10_uint32_t memneeded = sizeof (ne10_fft_state_float32_t)
348 + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors*/
349 + sizeof (ne10_fft_cpx_float32_t) * nfft /* twiddle*/
350 + sizeof (ne10_fft_cpx_float32_t) * nfft /* buffer*/
351 + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/
352
353 st = (ne10_fft_cfg_float32_t) NE10_MALLOC (memneeded);
354
355 // Only backward FFT is scaled by default.
356 st->is_forward_scaled = 0;
357 st->is_backward_scaled = 1;
358
359 // Bad allocation.
360 if (st == NULL)
361 {
362 return st;
363 }
364
365 uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_float32_t);
366 NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
367 st->factors = (ne10_int32_t*) address;
368 st->twiddles = (ne10_fft_cpx_float32_t*) (st->factors + (NE10_MAXFACTORS * 2));
369 st->buffer = st->twiddles + nfft;
370
371 // st->last_twiddles is default NULL.
372 // Calling fft_c or fft_neon is decided by this pointers.
373 st->last_twiddles = NULL;
374
375 st->nfft = nfft;
376 if (nfft % NE10_FFT_PARA_LEVEL == 0)
377 {
378 // Size of FFT satisfies requirement of NEON optimization.
379 st->nfft /= NE10_FFT_PARA_LEVEL;
380 st->last_twiddles = st->twiddles + nfft / NE10_FFT_PARA_LEVEL;
381 }
382
383 ne10_int32_t result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
384
385 // Can not factor.
386 if (result == NE10_ERR)
387 {
388 NE10_FREE (st);
389 return st;
390 }
391
392 // Check if radix-8 can be enabled
393 ne10_int32_t stage_count = st->factors[0];
394 ne10_int32_t algorithm_flag = st->factors[2 * (stage_count + 1)];
395
396 // Enable radix-8.
397 if (algorithm_flag == NE10_FFT_ALG_ANY)
398 {
399 result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_EIGHT);
400 if (result == NE10_ERR)
401 {
402 NE10_FREE (st);
403 return st;
404 }
405 ne10_fft_generate_twiddles_float32 (st->twiddles, st->factors, st->nfft);
406 }
407 else
408 {
409 st->last_twiddles = NULL;
410 st->nfft = nfft;
411 result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
412 ne10_fft_generate_twiddles_float32 (st->twiddles, st->factors, st->nfft);
413 return st;
414 }
415
416 // Generate super twiddles for the last stage.
417 if (nfft % NE10_FFT_PARA_LEVEL == 0)
418 {
419 // Size of FFT satisfies requirement of NEON optimization.
420 ne10_fft_generate_twiddles_line_float32 (st->last_twiddles,
421 st->nfft,
422 1,
423 NE10_FFT_PARA_LEVEL,
424 nfft);
425 }
426 return st;
427}
428
436{
437 // For input shorter than 16, fall back to c version.
438 // We would not get much improvement from NEON for these cases.
439 if (nfft < 16)
440 {
441 return ne10_fft_alloc_c2c_int32_c (nfft);
442 }
443
444 ne10_fft_cfg_int32_t st = NULL;
445 ne10_uint32_t memneeded = sizeof (ne10_fft_state_int32_t)
446 + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors*/
447 + sizeof (ne10_fft_cpx_int32_t) * nfft /* twiddle*/
448 + sizeof (ne10_fft_cpx_int32_t) * nfft /* buffer*/
449 + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/
450
451 st = (ne10_fft_cfg_int32_t) NE10_MALLOC (memneeded);
452
453 // Bad allocation.
454 if (st == NULL)
455 {
456 return st;
457 }
458
459 uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_int32_t);
460 NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
461 st->factors = (ne10_int32_t*) address;
462 st->twiddles = (ne10_fft_cpx_int32_t*) (st->factors + (NE10_MAXFACTORS * 2));
463 st->buffer = st->twiddles + nfft;
464
465 // st->last_twiddles is default NULL.
466 // Calling fft_c or fft_neon is decided by this pointers.
467 st->last_twiddles = NULL;
468
469 st->nfft = nfft;
470 if (nfft % NE10_FFT_PARA_LEVEL == 0)
471 {
472 // Size of FFT satisfies requirement of NEON optimization.
473 st->nfft /= NE10_FFT_PARA_LEVEL;
474 st->last_twiddles = st->twiddles + nfft / NE10_FFT_PARA_LEVEL;
475 }
476
477 ne10_int32_t result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
478
479 // Can not factor.
480 if (result == NE10_ERR)
481 {
482 NE10_FREE (st);
483 return st;
484 }
485
486 // Check if radix-8 can be enabled
487 ne10_int32_t stage_count = st->factors[0];
488 ne10_int32_t algorithm_flag = st->factors[2 * (stage_count + 1)];
489
490 // Enable radix-8.
491 if (algorithm_flag == NE10_FFT_ALG_ANY)
492 {
493 result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_EIGHT);
494 if (result == NE10_ERR)
495 {
496 NE10_FREE (st);
497 return st;
498 }
499 ne10_fft_generate_twiddles_int32 (st->twiddles, st->factors, st->nfft);
500 }
501 else
502 {
503 st->last_twiddles = NULL;
504 st->nfft = nfft;
505 result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
506 ne10_fft_generate_twiddles_int32 (st->twiddles, st->factors, st->nfft);
507 return st;
508 }
509
510 // Generate super twiddles for the last stage.
511 if (nfft % NE10_FFT_PARA_LEVEL == 0)
512 {
513 // Size of FFT satisfies requirement of NEON optimization.
514 ne10_fft_generate_twiddles_line_int32 (st->last_twiddles,
515 st->nfft,
516 1,
517 NE10_FFT_PARA_LEVEL,
518 nfft);
519 }
520 return st;
521}
522
530void ne10_fft_destroy_c2c_float32 (ne10_fft_cfg_float32_t cfg)
531{
532 free(cfg);
533}
534
535void ne10_fft_destroy_c2c_int32 (ne10_fft_cfg_int32_t cfg)
536{
537 free (cfg);
538}
539
540void ne10_fft_destroy_c2c_int16 (ne10_fft_cfg_int16_t cfg)
541{
542 free (cfg);
543}
544
//end of C2C_FFT_IFFT_DESTROY group
548
//end of C2C_FFT_IFFT group
552
565void ne10_fft_destroy_r2c_float32 (ne10_fft_r2c_cfg_float32_t cfg)
566{
567 free(cfg);
568}
569
570void ne10_fft_destroy_r2c_int32 (ne10_fft_r2c_cfg_int32_t cfg)
571{
572 free (cfg);
573}
574
575void ne10_fft_destroy_r2c_int16 (ne10_fft_r2c_cfg_int16_t cfg)
576{
577 free (cfg);
578}
579
//end of R2C_FFT_IFFT_DESTROY group
583
//end of R2C_FFT_IFFT group
ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_neon(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft.
Definition NE10_fft.c:337
ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_neon(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft.
Definition NE10_fft.c:435
ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_c(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft.
ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_c(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft.
structure for the 32 bits fixed point FFT function.
Definition NE10_types.h:329
structure for the floating point FFT state
Definition NE10_types.h:241
ne10_int32_t is_forward_scaled
@biref Flag to control scaling behaviour in forward floating point complex FFT.
Definition NE10_types.h:255
ne10_int32_t is_backward_scaled
@biref Flag to control scaling behaviour in backward floating point complex FFT.
Definition NE10_types.h:264