Line data Source code
1 : /*====================================================================================
2 : EVS Codec 3GPP TS26.452 Aug 12, 2021. Version 16.3.0
3 : ====================================================================================*/
4 :
5 : #include <stdint.h>
6 : #include "options.h" /* Compilation switches */
7 : #include "prot_fx.h" /* Function prototypes */
8 : #include "rom_com.h" /* Static table prototypes */
9 : #include "stl.h"
10 : #include "stdint.h"
11 : #include "wmc_auto.h"
12 :
13 : /*---------------------------------------------------------------------*
14 : * Local constants
15 : *---------------------------------------------------------------------*/
16 :
17 : #define N_MAX_FFT 1024
18 : #define N_MAX_DIV2 ( N_MAX_FFT >> 1 )
19 : #define N_MAX_DIV4 ( N_MAX_DIV2 >> 1 )
20 : #define INV_SQR2_FX 23170
21 : #define N_MAX_SAS 256
22 :
23 : /*------------------------------------------------------------------
24 : *
25 : * This is an implementation of decimation-in-time FFT algorithm for
26 : * real sequences. The techniques used here can be found in several
27 : * books, e.g., i) Proakis and Manolakis, "Digital Signal Processing",
28 : * 2nd Edition, Chapter 9, and ii) W.H. Press et. al., "Numerical
29 : * Recipes in C", 2nd Edition, Chapter 12.
30 : *
31 : * Input - There are two inputs to this function:
32 : *
33 : * 1) An integer pointer to the input data array
34 : * 2) An integer value which should be set as +1 for FFT
35 : * and some other value, e.g., -1 for ifFT
36 : *
37 : * Output - There is no return value.
38 : * The input data are replaced with transformed data. if the
39 : * input is a real time domain sequence, it is replaced with
40 : * the complex FFT for positive frequencies. The FFT value
41 : * for DC and the foldover frequency are combined to form the
42 : * first complex number in the array. The remaining complex
43 : * numbers correspond to increasing frequencies. if the input
44 : * is a complex frequency domain sequence arranged as above,
45 : * it is replaced with the corresponding time domain sequence.
46 : *
47 : * Notes:
48 : *
49 : * 1) This function is designed to be a part of a noise supp-
50 : * ression algorithm that requires 128-point FFT of real
51 : * sequences. This is achieved here through a 64-point
52 : * complex FFT. Consequently, the FFT size information is
53 : * not transmitted explicitly. However, some flexibility
54 : * is provided in the function to change the size of the
55 : * FFT by specifying the size information through "define"
56 : * statements.
57 : *
58 : * 2) The values of the complex sinusoids used in the FFT
59 : * algorithm are computed once (i.e., the first time the
60 : * r_fft function is called) and stored in a table. To
61 : * further speed up the algorithm, these values can be
62 : * precomputed and stored in a ROM table in actual DSP
63 : * based implementations.
64 : *
65 : * 3) In the c_fft function, the FFT values are divided by
66 : * 2 after each stage of computation thus dividing the
67 : * final FFT values by 64. No multiplying factor is used
68 : * for the ifFT. This is somewhat different from the usual
69 : * definition of FFT where the factor 1/N, i.e., 1/64, is
70 : * used for the ifFT and not the FFT. No factor is used in
71 : * the r_fft function.
72 : *
73 : * 4) Much of the code for the FFT and ifFT parts in r_fft
74 : * and c_fft functions are similar and can be combined.
75 : * They are, however, kept separate here to speed up the
76 : * execution.
77 : *------------------------------------------------------------------------*/
78 : /*------------------------------------------------------------------------*
79 : * c_fft_fx:
80 : *
81 : * Computes the complex part of the split-radix FFT
82 : *------------------------------------------------------------------------*/
83 :
84 69200 : static void c_fft_fx(
85 : const Word16 *phs_tbl, /* i : Table of phases */
86 : Word16 SIZE, /* i : Size of the FFT */
87 : Word16 NUM_STAGE, /* i : Number of stages */
88 : const Word16 *in_ptr, /* i : coefficients in the order re[0], re[n/2], re[1], im[1], ..., re[n/2-1], im[n/2-1] Qx*/
89 : Word16 *out_ptr, /* o : coefficients in the order re[0], re[n/2], re[1], im[1], ..., re[n/2-1], im[n/2-1] Q(x - 1)*/
90 : /* in_ptr & out_ptr must not overlap! */
91 : const Word16 isign ) /* i : 1=fft, otherwise it is ifft*/
92 : {
93 : Word16 i, j, k, ii, jj, kk, ji, kj;
94 : Word32 L_tmp1, L_tmp2;
95 : Word16 tmp1, tmp2, tmp3, tmp4;
96 : const Word16 *table_ptr;
97 : const Word16 *input_ptr1, *input_ptr2, *input_ptr3, *input_ptr4;
98 69200 : Word16 shift = 0;
99 69200 : move16();
100 : /* Setup Reorder Variables */
101 69200 : table_ptr = NULL;
102 69200 : table_ptr = FFT_REORDER_1024;
103 69200 : SWITCH( SIZE )
104 : {
105 68 : case 1024:
106 68 : shift = 0;
107 68 : move16();
108 68 : BREAK;
109 420 : case 512:
110 420 : shift = 1;
111 420 : move16();
112 420 : BREAK;
113 6268 : case 256:
114 6268 : shift = 2;
115 6268 : move16();
116 6268 : BREAK;
117 186 : case 128:
118 186 : shift = 3;
119 186 : move16();
120 186 : BREAK;
121 62258 : case 64:
122 62258 : shift = 4;
123 62258 : move16();
124 62258 : BREAK;
125 : }
126 : /* The FFT part */
127 69200 : IF( isign != 0 )
128 : {
129 : /* Unrolled 1st/2nd Stage
130 : * 1) to take advantage of Table Values (0 & +/- 16384)
131 : * 2) to perform reordering of Input Values
132 : */
133 500598 : FOR( k = 0; k < SIZE; k += 8 )
134 : {
135 : /*
136 : * This loop use:
137 : * 4 Word16 (tmp1...tmp4)
138 : * 2 Word32 (L_tmp1 & L_tmp2)
139 : * 4 Pointers (table_ptr, input_ptr1, input_ptr2, input_ptr3)
140 : *
141 : * The addition of 'in_ptr' + and index value from 'reorder_ptr'
142 : * is counted as a move16()
143 : */
144 :
145 462888 : input_ptr1 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift ); // Qx
146 :
147 462888 : L_tmp1 = L_mult( *input_ptr1++, 16384 );
148 462888 : L_tmp2 = L_mult( *input_ptr1, 16384 );
149 :
150 462888 : input_ptr1 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
151 :
152 462888 : tmp1 = msu_r_sat( L_tmp1, *input_ptr1, 16384 );
153 462888 : tmp3 = mac_r_sat( L_tmp1, *input_ptr1++, 16384 );
154 462888 : input_ptr2 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
155 462888 : input_ptr3 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
156 :
157 462888 : L_tmp1 = L_mult( *input_ptr2++, 16384 );
158 462888 : tmp2 = mac_r_sat( L_tmp1, *input_ptr3, 16384 );
159 462888 : tmp4 = msu_r_sat( L_tmp1, *input_ptr3++, 16384 );
160 462888 : L_tmp1 = L_mult( tmp3, 16384 );
161 462888 : out_ptr[k] = mac_r_sat( L_tmp1, tmp2, 16384 );
162 462888 : move16();
163 462888 : out_ptr[k + 4] = msu_r_sat( L_tmp1, tmp2, 16384 );
164 462888 : move16();
165 :
166 462888 : tmp2 = mac_r_sat( L_tmp2, *input_ptr1, 16384 );
167 462888 : tmp3 = msu_r_sat( L_tmp2, *input_ptr1, 16384 );
168 462888 : L_tmp2 = L_mult( *input_ptr2, 16384 );
169 :
170 462888 : L_tmp1 = L_mult( tmp1, 16384 );
171 462888 : tmp1 = msu_r_sat( L_tmp2, *input_ptr3, 16384 );
172 462888 : out_ptr[k + 2] = mac_r_sat( L_tmp1, tmp1, 16384 );
173 462888 : move16();
174 462888 : out_ptr[k + 6] = msu_r_sat( L_tmp1, tmp1, 16384 );
175 462888 : move16();
176 462888 : L_tmp1 = L_mult( tmp2, 16384 );
177 462888 : tmp2 = mac_r_sat( L_tmp2, *input_ptr3, 16384 );
178 462888 : out_ptr[k + 1] = mac_r_sat( L_tmp1, tmp2, 16384 );
179 462888 : move16();
180 462888 : out_ptr[k + 5] = msu_r_sat( L_tmp1, tmp2, 16384 );
181 462888 : move16();
182 462888 : L_tmp1 = L_mult( tmp3, 16384 );
183 462888 : out_ptr[k + 3] = msu_r_sat( L_tmp1, tmp4, 16384 );
184 462888 : move16();
185 462888 : out_ptr[k + 7] = mac_r_sat( L_tmp1, tmp4, 16384 );
186 462888 : move16();
187 : }
188 :
189 : /* Remaining Stages */
190 163977 : FOR( i = 2; i < NUM_STAGE; i++ )
191 : {
192 : /* i is stage counter */
193 126267 : jj = shl( 2, i ); /* FFT size */
194 126267 : kk = shl( jj, 1 ); /* 2 * FFT size */
195 126267 : ii = shr( SIZE, i );
196 126267 : ji = 0;
197 126267 : move16(); /* ji is phase table index */
198 :
199 1826979 : FOR( j = 0; j < jj; j += 2 )
200 : {
201 : /* j is sample counter */
202 5356824 : FOR( k = j; k < SIZE; k += kk )
203 : {
204 : /* k is butterfly top */
205 3656112 : kj = add( k, jj ); /* kj is butterfly bottom */
206 :
207 : /* Butterfly computations */
208 3656112 : L_tmp1 = L_msu( L_mult( *( out_ptr + kj ), phs_tbl[ji] ),
209 3656112 : *( out_ptr + kj + 1 ), phs_tbl[ji + 1] );
210 3656112 : L_tmp2 = L_mac( L_mult( *( out_ptr + kj + 1 ), phs_tbl[ji] ),
211 3656112 : *( out_ptr + kj ), phs_tbl[ji + 1] );
212 :
213 3656112 : out_ptr[kj] = mac_r( L_negate( L_tmp1 ), out_ptr[k], 16384 );
214 3656112 : move16();
215 3656112 : out_ptr[kj + 1] = mac_r( L_negate( L_tmp2 ), out_ptr[k + 1], 16384 );
216 3656112 : move16();
217 3656112 : out_ptr[k] = mac_r( L_tmp1, out_ptr[k], 16384 );
218 3656112 : move16();
219 3656112 : out_ptr[k + 1] = mac_r( L_tmp2, out_ptr[k + 1], 16384 );
220 3656112 : move16();
221 : }
222 1700712 : ji = add( ji, ii );
223 : }
224 : }
225 : }
226 : ELSE /* The ifFT part */
227 : {
228 : /* Unrolled 1st/2nd Stage
229 : * 1) to take advantage of Table Values (0 & +/- 16384)
230 : * 2) to perform reordering of Input Values
231 : */
232 305802 : FOR( k = 0; k < SIZE; k += 8 )
233 : {
234 : /*
235 : * This loop use:
236 : * 4 Word16 (tmp1...tmp4)
237 : * 2 Word32 (L_tmp1 & L_tmp2)
238 : * 5 Pointers (reorder_ptr, input_ptr1...input_ptr4)
239 : *
240 : * The addition of 'in_ptr' + and index value from 'reorder_ptr'
241 : * is counted as a move16()
242 : */
243 274312 : input_ptr1 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
244 274312 : input_ptr2 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
245 :
246 274312 : input_ptr3 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
247 274312 : input_ptr4 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
248 :
249 :
250 274312 : tmp3 = sub( *input_ptr1, *input_ptr2 );
251 274312 : tmp4 = add( *input_ptr1++, *input_ptr2++ );
252 :
253 274312 : tmp2 = sub( input_ptr3[0], input_ptr4[0] );
254 274312 : tmp1 = sub( input_ptr3[1], input_ptr4[1] );
255 :
256 274312 : out_ptr[k + 2] = sub( tmp3, tmp1 );
257 274312 : move16();
258 274312 : out_ptr[k + 6] = add( tmp3, tmp1 );
259 274312 : move16();
260 :
261 274312 : tmp1 = sub( *input_ptr1, *input_ptr2 );
262 274312 : out_ptr[k + 3] = add( tmp1, tmp2 );
263 274312 : move16();
264 274312 : out_ptr[k + 7] = sub( tmp1, tmp2 );
265 274312 : move16();
266 :
267 274312 : tmp1 = add( input_ptr3[0], input_ptr4[0] );
268 274312 : tmp3 = add( input_ptr3[1], input_ptr4[1] );
269 :
270 274312 : out_ptr[k] = add( tmp4, tmp1 );
271 274312 : move16();
272 274312 : out_ptr[k + 4] = sub( tmp4, tmp1 );
273 274312 : move16();
274 :
275 274312 : tmp4 = add( *input_ptr1, *input_ptr2 );
276 274312 : out_ptr[k + 1] = add( tmp4, tmp3 );
277 274312 : move16();
278 274312 : out_ptr[k + 5] = sub( tmp4, tmp3 );
279 274312 : move16();
280 : }
281 :
282 31490 : table_ptr = phs_tbl + SIZE; /* access part of table that is scaled by 2 */
283 :
284 : /* Remaining Stages */
285 127077 : FOR( i = 2; i < NUM_STAGE; i++ )
286 : {
287 : /* i is stage counter */
288 95587 : jj = shl( 2, i ); /* FFT size */
289 95587 : kk = shl( jj, 1 ); /* 2 * FFT size */
290 95587 : ii = shr( SIZE, i );
291 95587 : ji = 0;
292 95587 : move16(); /* ji is phase table index */
293 :
294 1066875 : FOR( j = 0; j < jj; j += 2 )
295 : {
296 : /* j is sample counter */
297 : /* This can be computed by successive add_fxitions of ii to ji, starting from 0
298 : hence line-count it as a one-line add (still need to increment op count!!) */
299 :
300 2777544 : FOR( k = j; k < SIZE; k += kk )
301 : {
302 : /* k is butterfly top */
303 1806256 : kj = add( k, jj ); /* kj is butterfly bottom */
304 :
305 : /* Butterfly computations */
306 1806256 : tmp1 = mac_r_sat( L_mult_sat( out_ptr[kj], table_ptr[ji] ), out_ptr[kj + 1], table_ptr[ji + 1] );
307 :
308 1806256 : tmp2 = msu_r_sat( L_mult_sat( out_ptr[kj + 1], table_ptr[ji] ), out_ptr[kj], table_ptr[ji + 1] );
309 :
310 1806256 : out_ptr[kj] = sub_sat( out_ptr[k], tmp1 );
311 1806256 : move16();
312 1806256 : out_ptr[kj + 1] = sub_sat( out_ptr[k + 1], tmp2 );
313 1806256 : move16();
314 1806256 : out_ptr[k] = add_sat( out_ptr[k], tmp1 );
315 1806256 : move16();
316 1806256 : out_ptr[k + 1] = add_sat( out_ptr[k + 1], tmp2 );
317 1806256 : move16();
318 : }
319 971288 : ji = add( ji, ii );
320 : }
321 : }
322 : }
323 69200 : }
324 :
325 : /*--------------------------------------------------------------------------------*
326 : * r_fft_fx:
327 : *
328 : * Perform FFT fixed-point for real-valued sequences of length 32, 64 or 128
329 : *--------------------------------------------------------------------------------*/
330 69200 : void r_fft_fx_lc(
331 : const Word16 *phs_tbl, /* i : Table of phase */
332 : const Word16 SIZE, /* i : Size of the FFT */
333 : const Word16 SIZE2, /* i : Size / 2 */
334 : const Word16 NUM_STAGE, /* i : Number of stage */
335 : const Word16 *in_ptr, /* i : coefficients in the order re[0], re[1], ... re[n/2], im[n/2-1], im[n/2-2], ..., im[1] Qx*/
336 : Word16 *out_ptr, /* o : coefficients in the order re[0], re[1], ... re[n/2], im[n/2-1], im[n/2-2], ..., im[1] Q(x - 1)*/
337 : const Word16 isign /* i : 1=fft, otherwize it's ifft */
338 : )
339 : {
340 : Word16 tmp2_real, tmp2_imag;
341 : Word32 Ltmp1_real, Ltmp1_imag;
342 : Word16 i;
343 : Word32 Ltmp1;
344 : const Word16 *phstbl_ptrDn;
345 : Word16 *ptrDn;
346 : Word16 temp[1024]; /* Accommodates real input FFT size up to 1024. */
347 :
348 : /* Setup Pointers */
349 69200 : phstbl_ptrDn = &phs_tbl[SIZE - 1];
350 :
351 : /* The FFT part */
352 69200 : IF( isign != 0 )
353 : {
354 : Word16 *ptRealUp, *ptRealDn, *ptImaUp, *ptImaDn;
355 :
356 : /* Perform the complex FFT */
357 37710 : c_fft_fx( phs_tbl, SIZE, NUM_STAGE, in_ptr, temp, isign );
358 :
359 : /* First, handle the DC and foldover frequencies */
360 37710 : out_ptr[SIZE2] = sub( temp[0], temp[1] );
361 37710 : move16();
362 37710 : out_ptr[0] = sub( add( temp[0], temp[1] ), shr( NUM_STAGE, 1 ) );
363 37710 : move16(); /* DC have a small offset */
364 :
365 37710 : ptrDn = &temp[SIZE - 1];
366 :
367 37710 : ptImaDn = &out_ptr[SIZE - 1];
368 37710 : ptRealUp = &out_ptr[1];
369 37710 : ptImaUp = &out_ptr[SIZE2 + 1];
370 37710 : ptRealDn = &out_ptr[SIZE2 - 1];
371 :
372 : /* Now, handle the remaining positive frequencies */
373 963486 : FOR( i = 2; i <= SIZE2; i += 2 )
374 : {
375 925776 : Ltmp1_imag = L_mult( temp[i + 1], 16384 );
376 925776 : Ltmp1_imag = L_msu_sat( Ltmp1_imag, *ptrDn, 16384 );
377 925776 : tmp2_real = add_sat( temp[i + 1], *ptrDn-- );
378 :
379 925776 : Ltmp1_real = L_mult( temp[i], 16384 );
380 925776 : Ltmp1_real = L_mac_sat( Ltmp1_real, *ptrDn, 16384 );
381 925776 : tmp2_imag = sub( *ptrDn--, temp[i] );
382 :
383 :
384 925776 : *ptRealUp++ = msu_r_sat( L_mac_sat( Ltmp1_real, tmp2_real, phs_tbl[i] ), tmp2_imag, phs_tbl[i + 1] );
385 925776 : move16();
386 925776 : *ptImaDn-- = mac_r_sat( L_mac_sat( Ltmp1_imag, tmp2_imag, phs_tbl[i] ), tmp2_real, phs_tbl[i + 1] );
387 925776 : move16();
388 925776 : Ltmp1 = L_mac_sat( L_negate( Ltmp1_imag ), tmp2_real, *phstbl_ptrDn );
389 925776 : Ltmp1_real = L_mac_sat( Ltmp1_real, tmp2_imag, *phstbl_ptrDn-- );
390 925776 : *ptImaUp++ = msu_r_sat( Ltmp1, tmp2_imag, *phstbl_ptrDn );
391 925776 : move16();
392 925776 : *ptRealDn-- = mac_r_sat( Ltmp1_real, tmp2_real, *phstbl_ptrDn-- );
393 925776 : move16();
394 : }
395 : }
396 : ELSE /* The ifFT part */
397 : {
398 : const Word16 *ptRealUp, *ptRealDn, *ptImaUp, *ptImaDn;
399 :
400 : /* First, handle the DC and foldover frequencies */
401 31490 : Ltmp1 = L_mult( in_ptr[0], 16384 );
402 31490 : temp[0] = mac_r( Ltmp1, in_ptr[SIZE2], 16384 );
403 31490 : move16();
404 31490 : temp[1] = msu_r( Ltmp1, in_ptr[SIZE2], 16384 );
405 31490 : move16();
406 :
407 31490 : ptrDn = &temp[SIZE - 1];
408 :
409 : /* Here we cast to Word16 * from a const Word16 *. */
410 : /* This is ok because we use these pointers for */
411 : /* reading only. This is just to avoid declaring a */
412 : /* bunch of 4 other pointer with const Word16 *. */
413 31490 : ptImaDn = &in_ptr[SIZE - 1]; // Qx
414 31490 : ptRealUp = &in_ptr[1]; // Qx
415 31490 : ptImaUp = &in_ptr[SIZE2 + 1]; // Qx
416 31490 : ptRealDn = &in_ptr[SIZE2 - 1]; // Qx
417 :
418 : /* Now, handle the remaining positive frequencies */
419 580114 : FOR( i = 2; i <= SIZE2; i += 2 )
420 : {
421 548624 : Ltmp1_imag = L_mult( *ptImaDn, 16384 );
422 548624 : Ltmp1_imag = L_msu_sat( Ltmp1_imag, *ptImaUp, 16384 );
423 548624 : tmp2_real = add_sat( *ptImaDn--, *ptImaUp++ );
424 548624 : Ltmp1_real = L_mult( *ptRealUp, 16384 );
425 548624 : Ltmp1_real = L_mac_sat( Ltmp1_real, *ptRealDn, 16384 );
426 548624 : tmp2_imag = sub_sat( *ptRealUp++, *ptRealDn-- );
427 548624 : temp[i] = mac_r_sat( L_msu_sat( Ltmp1_real, tmp2_real, phs_tbl[i] ), tmp2_imag, phs_tbl[i + 1] );
428 548624 : move16();
429 548624 : temp[i + 1] = mac_r_sat( L_mac_sat( Ltmp1_imag, tmp2_imag, phs_tbl[i] ), tmp2_real, phs_tbl[i + 1] );
430 548624 : move16();
431 548624 : Ltmp1 = L_mac_sat( L_negate( Ltmp1_imag ), tmp2_real, *phstbl_ptrDn );
432 548624 : Ltmp1_real = L_msu_sat( Ltmp1_real, tmp2_imag, *phstbl_ptrDn-- );
433 548624 : *ptrDn-- = msu_r_sat( Ltmp1, tmp2_imag, *phstbl_ptrDn );
434 548624 : move16();
435 548624 : *ptrDn-- = msu_r_sat( Ltmp1_real, tmp2_real, *phstbl_ptrDn-- );
436 548624 : move16();
437 : }
438 :
439 : /* Perform the complex ifFT */
440 31490 : c_fft_fx( phs_tbl, SIZE, NUM_STAGE, temp, out_ptr, isign );
441 : }
442 69200 : }
443 :
444 :
445 : /*---------------------------------------------------------------------*
446 : * fft_rel()
447 : *
448 : * Computes the split-radix FFT in place for the real-valued
449 : * signal x of length n. The algorithm has been ported from
450 : * the Fortran code of [1].
451 : *
452 : * The function needs sine and cosine tables t_sin and t_cos,
453 : * and the constant N_MAX_FFT. The table entries are defined as
454 : * sin(2*pi*i) and cos(2*pi*i) for i = 0, 1, ..., N_MAX_FFT-1. The
455 : * implementation assumes that any entry will not be needed
456 : * outside the tables. Therefore, N_MAX_FFT and n must be properly
457 : * set. The function has been tested with the values n = 16,
458 : * 32, 64, 128, 256, and N_MAX_FFT = 1280.
459 : *
460 : * References
461 : * [1] H.V. Sorensen, D.L. Jones, M.T. Heideman, C.S. Burrus,
462 : * "Real-valued fast Fourier transform algorithm," IEEE
463 : * Trans. on Signal Processing, Vol.35, No.6, pp 849-863,
464 : * 1987.
465 : *
466 : * OUTPUT
467 : * x[0:n-1] Transform coeffients in the order re[0], re[1],
468 : * ..., re[n/2], im[n/2-1], ..., im[1].
469 : *---------------------------------------------------------------------*/
470 :
471 0 : void fft_rel(
472 : float x[], /* i/o: input/output vector */
473 : const int16_t n, /* i : vector length */
474 : const int16_t m /* i : log2 of vector length */
475 : )
476 : {
477 : int16_t i, j, k, n1, n2, n4;
478 : int16_t step;
479 : float xt, t1, t2;
480 : float *x0, *x1, *x2;
481 : float *xi2, *xi3, *xi4, *xi1;
482 : const float *s, *c;
483 : const int16_t *idx;
484 :
485 : /* !!!! NMAX = 256 is hardcoded here (similar optimizations should be done for NMAX > 256) !!! */
486 :
487 : float *x2even, *x2odd;
488 : float temp[512];
489 :
490 0 : if ( n == 128 || n == 256 || n == 512 )
491 : {
492 0 : idx = fft256_read_indexes;
493 :
494 : /* Combined Digit reverse counter & Length two butterflies */
495 0 : if ( n == 128 )
496 : {
497 0 : x2 = temp;
498 0 : for ( i = 0; i < 64; i++ )
499 : {
500 0 : j = *idx++;
501 0 : k = *idx++;
502 :
503 0 : *x2++ = x[j >> 1] + x[k >> 1];
504 0 : *x2++ = x[j >> 1] - x[k >> 1];
505 : }
506 : }
507 0 : else if ( n == 256 )
508 : {
509 0 : x2 = temp;
510 0 : for ( i = 0; i < 128; i++ )
511 : {
512 0 : j = *idx++;
513 0 : k = *idx++;
514 :
515 0 : *x2++ = x[j] + x[k];
516 0 : *x2++ = x[j] - x[k];
517 : }
518 : }
519 0 : else if ( n == 512 )
520 : {
521 0 : x2even = temp;
522 0 : x2odd = temp + 256;
523 :
524 0 : for ( i = 0; i < 128; i++ )
525 : {
526 0 : j = 2 * *idx++;
527 0 : k = 2 * *idx++;
528 :
529 0 : *x2even++ = x[j] + x[k];
530 0 : *x2even++ = x[j] - x[k];
531 0 : *x2odd++ = x[++j] + x[++k];
532 0 : *x2odd++ = x[j] - x[k];
533 : }
534 : }
535 :
536 : /*-----------------------------------------------------------------*
537 : * 1st Stage Loop has been Unrolled because n4 is '1' and that
538 : * allows the elimination of the 'for_ (j = 1; j < n4; j++)' loop
539 : * and the associated pointers initialization.
540 : * Also, it allows to Put the Data from 'temp' back into 'x' due
541 : * to the previous Combined Digit Reverse and Length two butterflies
542 : *-----------------------------------------------------------------*/
543 :
544 : /*for_ (k = 2; k < 3; k++)*/
545 : {
546 0 : x0 = temp;
547 0 : x1 = x0 + 2;
548 0 : x2 = x;
549 :
550 0 : for ( i = 0; i < n; i += 4 )
551 : {
552 0 : *x2++ = *x0++ + *x1; /* x[i] = xt + x[i+n2]; */
553 0 : *x2++ = *x0;
554 0 : *x2++ = *--x0 - *x1++; /* x[i+n2] = xt - x[i+n2]; */
555 0 : *x2++ = -*x1; /* x[i+n2+n4] = -x[i+n2+n4]; */
556 :
557 0 : x0 += 4;
558 0 : x1 += 3; /* x1 has already advanced */
559 : }
560 : }
561 : }
562 : else
563 : {
564 : /*-----------------------------------------------------------------*
565 : * Digit reverse counter
566 : *-----------------------------------------------------------------*/
567 :
568 0 : j = 0;
569 0 : x0 = &x[0];
570 0 : for ( i = 0; i < n - 1; i++ )
571 : {
572 0 : if ( i < j )
573 : {
574 0 : xt = x[j];
575 0 : x[j] = *x0;
576 0 : *x0 = xt;
577 : }
578 0 : x0++;
579 0 : k = n / 2;
580 0 : while ( k <= j )
581 : {
582 0 : j -= k;
583 0 : k = k >> 1;
584 : }
585 0 : j += k;
586 : }
587 :
588 : /*-----------------------------------------------------------------*
589 : * Length two butterflies
590 : *-----------------------------------------------------------------*/
591 :
592 0 : x0 = &x[0];
593 0 : x1 = &x[1];
594 0 : for ( i = 0; i < n / 2; i++ )
595 : {
596 0 : *x1 = *x0 - *x1;
597 0 : *x0 = *x0 * 2 - *x1;
598 :
599 0 : x0++;
600 0 : x0++;
601 0 : x1++;
602 0 : x1++;
603 : }
604 :
605 : /*-----------------------------------------------------------------*
606 : * 1st Stage Loop has been Unrolled because n4 is '1' and that
607 : * allows the elimination of the 'for_ (j = 1; j < n4; j++)' loop
608 : * and the associated pointers initialization.
609 : *-----------------------------------------------------------------*/
610 :
611 : /* for_ (k = 2; k < 3; k++) */
612 : {
613 0 : x0 = x;
614 0 : x1 = x0 + 2;
615 :
616 0 : for ( i = 0; i < n; i += 4 )
617 : {
618 0 : *x1 = *x0 - *x1; /* x[i+n2] = xt - x[i+n2]; */
619 0 : *x0 = *x0 * 2 - *x1++; /* x[i] = xt + x[i+n2]; */
620 0 : *x1 = -*x1; /* x[i+n2+n4] = -x[i+n2+n4]; */
621 :
622 0 : x0 += 4;
623 0 : x1 += 3; /* x1 has already advanced */
624 : }
625 : }
626 : }
627 :
628 : /*-----------------------------------------------------------------*
629 : * Other butterflies
630 : *
631 : * The implementation described in [1] has been changed by using
632 : * table lookup for evaluating sine and cosine functions. The
633 : * variable ind and its increment step are needed to access table
634 : * entries. Note that this implementation assumes n4 to be so
635 : * small that ind will never exceed the table. Thus the input
636 : * argument n and the constant N_MAX_FFT must be set properly.
637 : *-----------------------------------------------------------------*/
638 :
639 0 : n4 = 1;
640 0 : n2 = 2;
641 0 : n1 = 4;
642 :
643 0 : step = N_MAX_DIV4;
644 :
645 0 : for ( k = 3; k <= m; k++ )
646 : {
647 0 : step >>= 1;
648 0 : n4 <<= 1;
649 0 : n2 <<= 1;
650 0 : n1 <<= 1;
651 :
652 0 : x0 = x;
653 0 : x1 = x0 + n2;
654 0 : x2 = x1 + n4;
655 :
656 0 : for ( i = 0; i < n; i += n1 )
657 : {
658 0 : *x1 = *x0 - *x1; /* x[i+n2] = xt - x[i+n2]; */
659 0 : *x0 = *x0 * 2 - *x1; /* x[i] = xt + x[i+n2]; */
660 0 : *x2 = -*x2; /* x[i+n2+n4] = -x[i+n2+n4]; */
661 :
662 0 : s = sincos_t_ext;
663 0 : c = s + N_MAX_FFT / 4; /* 1024/4 = 256, 256/4=64 */
664 0 : xi1 = x0;
665 0 : xi3 = xi1 + n2;
666 0 : xi2 = xi3;
667 0 : x0 += n1;
668 0 : xi4 = x0;
669 :
670 0 : for ( j = 1; j < n4; j++ )
671 : {
672 0 : xi3++;
673 0 : xi1++;
674 0 : xi4--;
675 0 : xi2--;
676 0 : c += step;
677 0 : s += step; /* autoincrement by ar0 */
678 :
679 0 : t1 = *xi3 * *c + *xi4 * *s; /* t1 = *xi3**(pt_c+ind) + *xi4**(pt_s+ind); */
680 0 : t2 = *xi3 * *s - *xi4 * *c; /* t2 = *xi3**(pt_s+ind) - *xi4**(pt_c+ind); */
681 :
682 0 : *xi4 = *xi2 - t2;
683 0 : *xi2 = *xi1 - t1;
684 0 : *xi1 = *xi1 * 2 - *xi2;
685 0 : *xi3 = -2 * t2 - *xi4;
686 : }
687 :
688 0 : x1 += n1;
689 0 : x2 += n1;
690 : }
691 : }
692 :
693 0 : return;
694 : }
695 :
696 2118622 : void fft_rel_16_32fx(
697 : Word16 x[], /* i/o: input/output vector Qx */
698 : Word16 *q_x, /* extra scaling added on speech buffer*/
699 : Word16 i_subfr,
700 : const Word16 n, /* i : vector length */
701 : const Word16 m /* i : log2 of vector length */
702 : )
703 : {
704 : Word16 i, j, k, n1, n2, n4;
705 : Word16 step;
706 : Word32 xt, t1, t2;
707 : Word32 *x0, *x1, *x2;
708 : const Word16 *s, *c;
709 : Word32 *xi2, *xi3, *xi4, *xi1;
710 :
711 : Word32 fft_bff32[L_FFT];
712 2118622 : Copy_Scale_sig_16_32_no_sat( x, fft_bff32, L_FFT, 0 ); // copying x to fft_bff32 without scaling
713 :
714 : /*-----------------------------------------------------------------*
715 : * Digit reverse counter
716 : *-----------------------------------------------------------------*/
717 :
718 2118622 : j = 0;
719 2118622 : move16();
720 2118622 : x0 = &fft_bff32[0]; // Qx
721 542367232 : FOR( i = 0; i < n - 1; i++ )
722 : {
723 540248610 : IF( LT_16( i, j ) )
724 : {
725 254234640 : xt = fft_bff32[j]; // Qx
726 254234640 : move32();
727 254234640 : fft_bff32[j] = *x0; // Qx
728 254234640 : move32();
729 254234640 : *x0 = xt; // Qx
730 254234640 : move32();
731 : }
732 540248610 : x0++;
733 540248610 : k = shr( n, 1 );
734 1063548244 : WHILE( ( k <= j ) )
735 : {
736 523299634 : j = sub( j, k );
737 523299634 : k = shr( k, 1 );
738 : }
739 540248610 : j = add( j, k );
740 : }
741 :
742 : /*-----------------------------------------------------------------*
743 : * Length two butterflies
744 : *-----------------------------------------------------------------*/
745 :
746 2118622 : x0 = &fft_bff32[0];
747 2118622 : x1 = &fft_bff32[1];
748 273302238 : FOR( i = 0; i < ( n >> 1 ); i++ )
749 : {
750 271183616 : xt = *x0;
751 271183616 : move32();
752 271183616 : *x0 = L_add( xt, *x1 );
753 271183616 : move32();
754 271183616 : *x1 = L_sub( xt, *x1 );
755 271183616 : move32();
756 271183616 : x0++;
757 271183616 : x0++;
758 271183616 : x1++;
759 271183616 : x1++;
760 : }
761 :
762 : /*-----------------------------------------------------------------*
763 : * Other butterflies
764 : *
765 : * The implementation described in [1] has been changed by using
766 : * table lookup for evaluating sine and cosine functions. The
767 : * variable ind and its increment step are needed to access table
768 : * entries. Note that this implementation assumes n4 to be so
769 : * small that ind will never exceed the table. Thus the input
770 : * argument n and the constant N_MAX_SAS must be set properly.
771 : *-----------------------------------------------------------------*/
772 :
773 2118622 : n2 = 1;
774 2118622 : move16();
775 : /* step = N_MAX_SAS/4; */
776 16948976 : FOR( k = 2; k <= m; k++ )
777 : {
778 14830354 : n4 = n2;
779 14830354 : move16();
780 14830354 : n2 = shl( n4, 1 );
781 14830354 : n1 = shl( n2, 1 );
782 :
783 14830354 : step = idiv1616( N_MAX_SAS, n1 );
784 :
785 14830354 : x0 = fft_bff32;
786 14830354 : x1 = fft_bff32 + n2;
787 14830354 : x2 = fft_bff32 + add( n2, n4 );
788 283895348 : FOR( i = 0; i < n; i += n1 )
789 : {
790 269064994 : xt = *x0;
791 269064994 : move32(); /* xt = x[i]; */
792 269064994 : *x0 = L_add( xt, *x1 );
793 269064994 : move32(); /* x[i] = xt + x[i+n2]; */
794 269064994 : *x1 = L_sub( xt, *x1 );
795 269064994 : move32(); /* x[i+n2] = xt - x[i+n2]; */
796 269064994 : *x2 = L_negate( *x2 );
797 269064994 : move32(); /* x[i+n2+n4] = -x[i+n2+n4]; */
798 :
799 :
800 269064994 : s = sincos_t_fx + step; // Q15
801 269064994 : c = s + 64; // Q15
802 269064994 : xi1 = fft_bff32 + add( i, 1 );
803 269064994 : xi3 = xi1 + n2;
804 269064994 : xi2 = xi3 - 2;
805 269064994 : xi4 = xi1 + sub( n1, 2 );
806 :
807 949142656 : FOR( j = 1; j < n4; j++ )
808 : {
809 680077662 : t1 = L_add( Mpy_32_16_1( *xi3, *c ), Mpy_32_16_1( *xi4, *s ) ); /* t1 = *xi3**(pt_c+ind) + *xi4**(pt_s+ind); Qx */
810 680077662 : t2 = L_sub( Mpy_32_16_1( *xi3, *s ), Mpy_32_16_1( *xi4, *c ) ); /* t2 = *xi3**(pt_s+ind) - *xi4**(pt_c+ind); Qx */
811 680077662 : *xi4 = L_sub( *xi2, t2 );
812 680077662 : move32();
813 680077662 : *xi3 = L_negate( L_add( *xi2, t2 ) );
814 680077662 : move32();
815 680077662 : *xi2 = L_sub( *xi1, t1 );
816 680077662 : move32();
817 680077662 : *xi1 = L_add( *xi1, t1 );
818 680077662 : move32();
819 :
820 680077662 : xi4--;
821 680077662 : xi2--;
822 680077662 : xi3++;
823 680077662 : xi1++;
824 680077662 : c += step;
825 680077662 : s += step; /* autoincrement by ar0 */
826 : }
827 :
828 269064994 : x0 += n1;
829 269064994 : x1 += n1;
830 269064994 : x2 += n1;
831 : }
832 : /* step = shr(step, 1); */
833 : }
834 2118622 : Word16 norm = L_norm_arr( fft_bff32, L_FFT );
835 2118622 : IF( i_subfr == 0 )
836 : {
837 1059311 : Copy_Scale_sig32_16( fft_bff32, x, L_FFT, norm );
838 1059311 : *q_x = sub( norm, 16 );
839 1059311 : move16();
840 : }
841 : ELSE
842 : {
843 1059311 : IF( LT_16( sub( norm, 16 ), *q_x ) )
844 : {
845 211536 : scale_sig( x - L_FFT, L_FFT, sub( sub( norm, 16 ), *q_x ) );
846 211536 : Copy_Scale_sig32_16( fft_bff32, x, L_FFT, norm );
847 211536 : *q_x = sub( norm, 16 );
848 211536 : move16();
849 : }
850 : ELSE
851 : {
852 847775 : Copy_Scale_sig32_16( fft_bff32, x, L_FFT, add( 16, *q_x ) );
853 : }
854 : }
855 :
856 2118622 : return;
857 : }
858 :
859 206912 : void fft_rel_fx(
860 : Word16 x[], /* i/o: input/output vector Qx */
861 : const Word16 n, /* i : vector length */
862 : const Word16 m /* i : log2 of vector length */
863 : )
864 : {
865 : Word16 i, j, k, n1, n2, n4;
866 : Word16 step;
867 : Word16 xt, t1, t2;
868 : Word16 *x0, *x1, *x2;
869 : const Word16 *s, *c;
870 : Word16 *xi2, *xi3, *xi4, *xi1;
871 : #ifdef BASOP_NOGLOB_DECLARE_LOCAL
872 206912 : Flag Overflow = 0;
873 206912 : move32();
874 : #endif
875 :
876 :
877 : /*-----------------------------------------------------------------*
878 : * Digit reverse counter
879 : *-----------------------------------------------------------------*/
880 :
881 206912 : j = 0;
882 206912 : move16();
883 206912 : x0 = &x[0]; // Qx
884 206912 : move16();
885 46000916 : FOR( i = 0; i < n - 1; i++ )
886 : {
887 45794004 : IF( LT_16( i, j ) )
888 : {
889 21538733 : xt = x[j]; // Qx
890 21538733 : move16();
891 21538733 : x[j] = *x0; // Qx
892 21538733 : move16();
893 21538733 : *x0 = xt; // Qx
894 21538733 : move16();
895 : }
896 45794004 : x0++;
897 45794004 : k = shr( n, 1 );
898 90098630 : WHILE( ( k <= j ) )
899 : {
900 44304626 : j = sub( j, k );
901 44304626 : k = shr( k, 1 );
902 : }
903 45794004 : j = add( j, k );
904 : }
905 :
906 : /*-----------------------------------------------------------------*
907 : * Length two butterflies
908 : *-----------------------------------------------------------------*/
909 :
910 206912 : x0 = &x[0];
911 206912 : move16();
912 206912 : x1 = &x[1];
913 206912 : move16();
914 23207370 : FOR( i = 0; i < ( n >> 1 ); i++ )
915 : {
916 23000458 : xt = *x0;
917 23000458 : move16();
918 23000458 : *x0 = add_o( xt, *x1, &Overflow );
919 23000458 : move16();
920 23000458 : *x1 = sub_o( xt, *x1, &Overflow );
921 23000458 : move16();
922 23000458 : x0++;
923 23000458 : x0++;
924 23000458 : x1++;
925 23000458 : x1++;
926 : }
927 :
928 : /*-----------------------------------------------------------------*
929 : * Other butterflies
930 : *
931 : * The implementation described in [1] has been changed by using
932 : * table lookup for evaluating sine and cosine functions. The
933 : * variable ind and its increment step are needed to access table
934 : * entries. Note that this implementation assumes n4 to be so
935 : * small that ind will never exceed the table. Thus the input
936 : * argument n and the constant N_MAX_SAS must be set properly.
937 : *-----------------------------------------------------------------*/
938 :
939 206912 : n2 = 1;
940 206912 : move16();
941 : /* step = N_MAX_SAS/4; */
942 1489378 : FOR( k = 2; k <= m; k++ )
943 : {
944 1282466 : n4 = n2;
945 1282466 : move16();
946 1282466 : n2 = shl( n4, 1 );
947 1282466 : n1 = shl( n2, 1 );
948 :
949 1282466 : step = idiv1616( N_MAX_SAS, n1 );
950 :
951 1282466 : x0 = x;
952 1282466 : x1 = x + n2;
953 1282466 : x2 = x + add( n2, n4 );
954 24076012 : FOR( i = 0; i < n; i += n1 )
955 : {
956 22793546 : xt = *x0;
957 22793546 : move16(); /* xt = x[i]; */
958 22793546 : *x0 = add_o( xt, *x1, &Overflow );
959 22793546 : move16(); /* x[i] = xt + x[i+n2]; */
960 22793546 : *x1 = sub_o( xt, *x1, &Overflow );
961 22793546 : move16(); /* x[i+n2] = xt - x[i+n2]; */
962 22793546 : *x2 = negate( *x2 );
963 22793546 : move16(); /* x[i+n2+n4] = -x[i+n2+n4]; */
964 :
965 :
966 22793546 : s = sincos_t_fx + step; // Q15
967 22793546 : c = s + 64; // Q15
968 22793546 : xi1 = x + add( i, 1 );
969 22793546 : xi3 = xi1 + n2;
970 22793546 : xi2 = xi3 - 2;
971 22793546 : xi4 = xi1 + sub( n1, 2 );
972 :
973 80335685 : FOR( j = 1; j < n4; j++ )
974 : {
975 57542139 : t1 = add_o( mult_r( *xi3, *c ), mult_r( *xi4, *s ), &Overflow ); /* t1 = *xi3**(pt_c+ind) + *xi4**(pt_s+ind); Qx */
976 57542139 : t2 = sub_o( mult_r( *xi3, *s ), mult_r( *xi4, *c ), &Overflow ); /* t2 = *xi3**(pt_s+ind) - *xi4**(pt_c+ind); Qx */
977 57542139 : *xi4 = sub_o( *xi2, t2, &Overflow );
978 57542139 : move16();
979 57542139 : *xi3 = negate( add_o( *xi2, t2, &Overflow ) );
980 57542139 : move16();
981 57542139 : *xi2 = sub_o( *xi1, t1, &Overflow );
982 57542139 : move16();
983 57542139 : *xi1 = add_o( *xi1, t1, &Overflow );
984 57542139 : move16();
985 :
986 57542139 : xi4--;
987 57542139 : xi2--;
988 57542139 : xi3++;
989 57542139 : xi1++;
990 57542139 : c += step;
991 57542139 : s += step; /* autoincrement by ar0 */
992 : }
993 :
994 22793546 : x0 += n1;
995 22793546 : x1 += n1;
996 22793546 : x2 += n1;
997 : }
998 : /* step = shr(step, 1); */
999 : }
1000 :
1001 206912 : return;
1002 : }
1003 :
1004 44321 : void fft_rel_fx32(
1005 : Word32 x[], /* i/o: input/output vector Qx */
1006 : const Word16 n, /* i : vector length */
1007 : const Word16 m /* i : log2 of vector length */
1008 : )
1009 : {
1010 : Word16 i, j, k, n1, n2, n4;
1011 : Word16 step;
1012 : Word32 xt, t1, t2;
1013 : Word32 *x0, *x1, *x2;
1014 : Word32 *xi2, *xi3, *xi4, *xi1;
1015 : const Word16 *s, *c;
1016 : const Word16 *idx;
1017 :
1018 : /* !!!! NMAX = 256 is hardcoded here (similar optimizations should be done for NMAX > 256) !!! */
1019 :
1020 : Word32 *x2even, *x2odd;
1021 : Word32 temp[512];
1022 :
1023 44321 : test();
1024 44321 : test();
1025 44321 : IF( EQ_16( n, 128 ) || EQ_16( n, 256 ) || EQ_16( n, 512 ) )
1026 : {
1027 44321 : idx = fft256_read_indexes;
1028 :
1029 : /* Combined Digit reverse counter & Length two butterflies */
1030 44321 : IF( EQ_16( n, 128 ) )
1031 : {
1032 0 : x2 = temp;
1033 0 : FOR( i = 0; i < 64; i++ )
1034 : {
1035 0 : j = *idx++;
1036 0 : move16();
1037 0 : k = *idx++;
1038 0 : move16();
1039 :
1040 0 : *x2++ = L_add( x[( j >> 1 )], x[( k >> 1 )] ); // Qx
1041 0 : move16();
1042 0 : *x2++ = L_sub( x[( j >> 1 )], x[( k >> 1 )] ); // Qx
1043 0 : move16();
1044 : }
1045 : }
1046 44321 : ELSE IF( EQ_16( n, 256 ) )
1047 : {
1048 488 : x2 = temp;
1049 62952 : FOR( i = 0; i < 128; i++ )
1050 : {
1051 62464 : j = *idx++;
1052 62464 : move16();
1053 62464 : k = *idx++;
1054 62464 : move16();
1055 :
1056 62464 : *x2++ = L_add( x[j], x[k] ); // Qx
1057 62464 : move16();
1058 62464 : *x2++ = L_sub( x[j], x[k] ); // Qx
1059 62464 : move16();
1060 : }
1061 : }
1062 43833 : ELSE IF( EQ_16( n, 512 ) )
1063 : {
1064 43833 : x2even = temp;
1065 43833 : x2odd = temp + 256;
1066 :
1067 5654457 : FOR( i = 0; i < 128; i++ )
1068 : {
1069 5610624 : j = shl( *idx, 1 );
1070 5610624 : idx++;
1071 5610624 : k = shl( *idx, 1 );
1072 5610624 : idx++;
1073 :
1074 5610624 : *x2even++ = L_add( x[j], x[k] ); // Qx
1075 5610624 : move16();
1076 5610624 : *x2even++ = L_sub( x[j], x[k] ); // Qx
1077 5610624 : move16();
1078 5610624 : j = add( j, 1 );
1079 5610624 : k = add( k, 1 );
1080 5610624 : *x2odd++ = L_add( x[j], x[k] ); // Qx
1081 5610624 : move16();
1082 5610624 : *x2odd++ = L_sub( x[j], x[k] ); // Qx
1083 5610624 : move16();
1084 : }
1085 : }
1086 :
1087 : /*-----------------------------------------------------------------*
1088 : * 1st Stage Loop has been Unrolled because n4 is '1' and that
1089 : * allows the elimination of the 'for_ (j = 1; j < n4; j++)' loop
1090 : * and the associated pointers initialization.
1091 : * Also, it allows to Put the Data from 'temp' back into 'x' due
1092 : * to the previous Combined Digit Reverse and Length two butterflies
1093 : *-----------------------------------------------------------------*/
1094 :
1095 : /*for_ (k = 2; k < 3; k++)*/
1096 : {
1097 44321 : x0 = temp;
1098 44321 : x1 = x0 + 2;
1099 44321 : x2 = x; // Qx
1100 :
1101 5686177 : FOR( i = 0; i < n; i += 4 )
1102 : {
1103 5641856 : *x2++ = L_add( *x0++, *x1 ); /* x[i] = xt + x[i+n2]; */
1104 5641856 : move16();
1105 5641856 : *x2++ = *x0;
1106 5641856 : move16();
1107 5641856 : x0--;
1108 5641856 : *x2++ = L_sub( *x0, *x1 ); /* x[i+n2] = xt - x[i+n2]; */
1109 5641856 : move16();
1110 5641856 : x1++;
1111 5641856 : *x2++ = L_negate( *x1 ); /* x[i+n2+n4] = -x[i+n2+n4]; */
1112 5641856 : move16();
1113 :
1114 5641856 : x0 += 4;
1115 5641856 : x1 += 3; /* x1 has already advanced */
1116 : }
1117 : }
1118 : }
1119 : ELSE
1120 : {
1121 : /*-----------------------------------------------------------------*
1122 : * Digit reverse counter
1123 : *-----------------------------------------------------------------*/
1124 :
1125 0 : j = 0;
1126 0 : move16();
1127 0 : x0 = &x[0]; // Qx
1128 0 : FOR( i = 0; i < ( n - 1 ); i++ )
1129 : {
1130 0 : IF( LT_16( i, j ) )
1131 : {
1132 0 : xt = x[j]; // Qx
1133 0 : move32();
1134 0 : x[j] = *x0;
1135 0 : move32();
1136 0 : *x0 = xt;
1137 0 : move32();
1138 : }
1139 0 : x0++;
1140 0 : k = shr( n, 1 );
1141 0 : WHILE( ( k <= j ) )
1142 : {
1143 0 : j = sub( j, k );
1144 0 : k = shr( k, 1 );
1145 : }
1146 0 : j = add( j, k );
1147 : }
1148 :
1149 : /*-----------------------------------------------------------------*
1150 : * Length two butterflies
1151 : *-----------------------------------------------------------------*/
1152 :
1153 0 : x0 = &x[0]; // Qx
1154 0 : x1 = &x[1]; // Qx
1155 0 : FOR( i = 0; i < ( n >> 1 ); i++ )
1156 : {
1157 0 : *x1 = L_sub( *x0, *x1 ); // Qx
1158 0 : move32();
1159 0 : *x0 = L_sub( L_shl( *x0, 1 ), *x1 ); //*x0 * 2 - *x1 (Qx)
1160 0 : move32();
1161 :
1162 0 : x0++;
1163 0 : x0++;
1164 0 : x1++;
1165 0 : x1++;
1166 : }
1167 :
1168 : /*-----------------------------------------------------------------*
1169 : * 1st Stage Loop has been Unrolled because n4 is '1' and that
1170 : * allows the elimination of the 'for_ (j = 1; j < n4; j++)' loop
1171 : * and the associated pointers initialization.
1172 : *-----------------------------------------------------------------*/
1173 :
1174 : /* for_ (k = 2; k < 3; k++) */
1175 : {
1176 0 : x0 = x; // Qx
1177 0 : x1 = x0 + 2;
1178 :
1179 0 : FOR( i = 0; i < n; i += 4 )
1180 : {
1181 0 : *x1 = L_sub( *x0, *x1 ); /* x[i+n2] = xt - x[i+n2]; Qx*/
1182 0 : move32();
1183 0 : *x0 = L_sub( L_shl( *x0, 1 ), *x1++ ); /* x[i] = xt + x[i+n2]; */ /**x0 * 2 - *x1 (Qx)*/
1184 0 : move32();
1185 0 : *x1 = L_negate( *x1 ); /* x[i+n2+n4] = -x[i+n2+n4]; Qx*/
1186 0 : move32();
1187 :
1188 0 : x0 += 4;
1189 0 : x1 += 3; /* x1 has already advanced */
1190 : }
1191 : }
1192 : }
1193 :
1194 : /*-----------------------------------------------------------------*
1195 : * Other butterflies
1196 : *
1197 : * The implementation described in [1] has been changed by using
1198 : * table lookup for evaluating sine and cosine functions. The
1199 : * variable ind and its increment step are needed to access table
1200 : * entries. Note that this implementation assumes n4 to be so
1201 : * small that ind will never exceed the table. Thus the input
1202 : * argument n and the constant N_MAX_FFT must be set properly.
1203 : *-----------------------------------------------------------------*/
1204 :
1205 44321 : n4 = 1;
1206 44321 : move16();
1207 44321 : n2 = 2;
1208 44321 : move16();
1209 44321 : n1 = 4;
1210 44321 : move16();
1211 :
1212 44321 : step = N_MAX_DIV4;
1213 44321 : move16();
1214 :
1215 354080 : FOR( k = 3; k <= m; k++ )
1216 : {
1217 309759 : step = shr( step, 1 );
1218 309759 : n4 = shl( n4, 1 );
1219 309759 : n2 = shl( n2, 1 );
1220 309759 : n1 = shl( n1, 1 );
1221 :
1222 309759 : x0 = x;
1223 309759 : x1 = x0 + n2;
1224 309759 : x2 = x1 + n4;
1225 :
1226 5907294 : FOR( i = 0; i < n; i += n1 )
1227 : {
1228 5597535 : *x1 = L_sub( *x0, *x1 ); /* x[i+n2] = xt - x[i+n2]; */
1229 5597535 : move32();
1230 5597535 : *x0 = L_sub( L_shl( *x0, 1 ), *x1 ); /* x[i] = xt + x[i+n2]; */ /**x0 * 2 - *x1 (Qx)*/
1231 5597535 : move32();
1232 5597535 : *x2 = L_negate( *x2 ); /* x[i+n2+n4] = -x[i+n2+n4]; Qx */
1233 5597535 : move32();
1234 :
1235 5597535 : s = sincos_t_ext_fx; // Q15
1236 5597535 : c = s + N_MAX_FFT / 4; /* 1024/4 = 256, 256/4=64 Q15*/
1237 5597535 : xi1 = x0;
1238 5597535 : xi3 = xi1 + n2;
1239 5597535 : xi2 = xi3;
1240 5597535 : x0 += n1;
1241 5597535 : xi4 = x0;
1242 :
1243 39461760 : FOR( j = 1; j < n4; j++ )
1244 : {
1245 33864225 : xi3++;
1246 33864225 : xi1++;
1247 33864225 : xi4--;
1248 33864225 : xi2--;
1249 33864225 : c += step;
1250 33864225 : s += step; /* autoincrement by ar0 */
1251 :
1252 33864225 : t1 = L_add( Mpy_32_16_1( *xi3, *c ), Mpy_32_16_1( *xi4, *s ) ); /* t1 = *xi3**(pt_c+ind) + *xi4**(pt_s+ind); Qx*/
1253 33864225 : t2 = L_sub( Mpy_32_16_1( *xi3, *s ), Mpy_32_16_1( *xi4, *c ) ); /* t2 = *xi3**(pt_s+ind) - *xi4**(pt_c+ind); Qx*/
1254 :
1255 33864225 : *xi4 = L_sub( *xi2, t2 );
1256 33864225 : move32();
1257 33864225 : *xi2 = L_sub( *xi1, t1 );
1258 33864225 : move32();
1259 33864225 : *xi1 = L_sub( L_shl( *xi1, 1 ), *xi2 ); // Qx
1260 33864225 : move32();
1261 33864225 : *xi3 = L_negate( L_add( L_shl( t2, 1 ), *xi4 ) ); // Qx
1262 33864225 : move32();
1263 : }
1264 :
1265 5597535 : x1 += n1;
1266 5597535 : x2 += n1;
1267 : }
1268 : }
1269 :
1270 44321 : return;
1271 : }
|