Line data Source code
1 : /*====================================================================================
2 : EVS Codec 3GPP TS26.452 Aug 12, 2021. Version 16.3.0
3 : ====================================================================================*/
4 :
5 : #include <stdint.h>
6 : #include "options.h" /* Compilation switches */
7 : #include "prot_fx.h" /* Function prototypes */
8 : #include "rom_com.h" /* Static table prototypes */
9 : #include "stl.h"
10 : #include "stdint.h"
11 : #include "wmc_auto.h"
12 :
13 : /*---------------------------------------------------------------------*
14 : * Local constants
15 : *---------------------------------------------------------------------*/
16 :
17 : #define N_MAX_FFT 1024
18 : #define N_MAX_DIV2 ( N_MAX_FFT >> 1 )
19 : #define N_MAX_DIV4 ( N_MAX_DIV2 >> 1 )
20 : #define INV_SQR2_FX 23170
21 : #define N_MAX_SAS 256
22 :
23 : /*------------------------------------------------------------------
24 : *
25 : * This is an implementation of decimation-in-time FFT algorithm for
26 : * real sequences. The techniques used here can be found in several
27 : * books, e.g., i) Proakis and Manolakis, "Digital Signal Processing",
28 : * 2nd Edition, Chapter 9, and ii) W.H. Press et. al., "Numerical
29 : * Recipes in C", 2nd Edition, Chapter 12.
30 : *
31 : * Input - There are two inputs to this function:
32 : *
33 : * 1) An integer pointer to the input data array
34 : * 2) An integer value which should be set as +1 for FFT
35 : * and some other value, e.g., -1 for ifFT
36 : *
37 : * Output - There is no return value.
38 : * The input data are replaced with transformed data. if the
39 : * input is a real time domain sequence, it is replaced with
40 : * the complex FFT for positive frequencies. The FFT value
41 : * for DC and the foldover frequency are combined to form the
42 : * first complex number in the array. The remaining complex
43 : * numbers correspond to increasing frequencies. if the input
44 : * is a complex frequency domain sequence arranged as above,
45 : * it is replaced with the corresponding time domain sequence.
46 : *
47 : * Notes:
48 : *
49 : * 1) This function is designed to be a part of a noise supp-
50 : * ression algorithm that requires 128-point FFT of real
51 : * sequences. This is achieved here through a 64-point
52 : * complex FFT. Consequently, the FFT size information is
53 : * not transmitted explicitly. However, some flexibility
54 : * is provided in the function to change the size of the
55 : * FFT by specifying the size information through "define"
56 : * statements.
57 : *
58 : * 2) The values of the complex sinusoids used in the FFT
59 : * algorithm are computed once (i.e., the first time the
60 : * r_fft function is called) and stored in a table. To
61 : * further speed up the algorithm, these values can be
62 : * precomputed and stored in a ROM table in actual DSP
63 : * based implementations.
64 : *
65 : * 3) In the c_fft function, the FFT values are divided by
66 : * 2 after each stage of computation thus dividing the
67 : * final FFT values by 64. No multiplying factor is used
68 : * for the ifFT. This is somewhat different from the usual
69 : * definition of FFT where the factor 1/N, i.e., 1/64, is
70 : * used for the ifFT and not the FFT. No factor is used in
71 : * the r_fft function.
72 : *
73 : * 4) Much of the code for the FFT and ifFT parts in r_fft
74 : * and c_fft functions are similar and can be combined.
75 : * They are, however, kept separate here to speed up the
76 : * execution.
77 : *------------------------------------------------------------------------*/
78 : /*------------------------------------------------------------------------*
79 : * c_fft_fx:
80 : *
81 : * Computes the complex part of the split-radix FFT
82 : *------------------------------------------------------------------------*/
83 :
84 88063 : static void c_fft_fx(
85 : const Word16 *phs_tbl, /* i : Table of phases */
86 : Word16 SIZE, /* i : Size of the FFT */
87 : Word16 NUM_STAGE, /* i : Number of stages */
88 : const Word16 *in_ptr, /* i : coefficients in the order re[0], re[n/2], re[1], im[1], ..., re[n/2-1], im[n/2-1] Qx*/
89 : Word16 *out_ptr, /* o : coefficients in the order re[0], re[n/2], re[1], im[1], ..., re[n/2-1], im[n/2-1] Q(x - 1)*/
90 : /* in_ptr & out_ptr must not overlap! */
91 : const Word16 isign ) /* i : 1=fft, otherwise it is ifft*/
92 : {
93 : Word16 i, j, k, ii, jj, kk, ji, kj;
94 : Word32 L_tmp1, L_tmp2;
95 : Word16 tmp1, tmp2, tmp3, tmp4;
96 : const Word16 *table_ptr;
97 : const Word16 *input_ptr1, *input_ptr2, *input_ptr3, *input_ptr4;
98 88063 : Word16 shift = 0;
99 88063 : move16();
100 : /* Setup Reorder Variables */
101 88063 : table_ptr = NULL;
102 88063 : table_ptr = FFT_REORDER_1024;
103 88063 : SWITCH( SIZE )
104 : {
105 105 : case 1024:
106 105 : shift = 0;
107 105 : move16();
108 105 : BREAK;
109 686 : case 512:
110 686 : shift = 1;
111 686 : move16();
112 686 : BREAK;
113 6302 : case 256:
114 6302 : shift = 2;
115 6302 : move16();
116 6302 : BREAK;
117 420 : case 128:
118 420 : shift = 3;
119 420 : move16();
120 420 : BREAK;
121 80550 : case 64:
122 80550 : shift = 4;
123 80550 : move16();
124 80550 : BREAK;
125 : }
126 : /* The FFT part */
127 88063 : IF( isign != 0 )
128 : {
129 : /* Unrolled 1st/2nd Stage
130 : * 1) to take advantage of Table Values (0 & +/- 16384)
131 : * 2) to perform reordering of Input Values
132 : */
133 597810 : FOR( k = 0; k < SIZE; k += 8 )
134 : {
135 : /*
136 : * This loop use:
137 : * 4 Word16 (tmp1...tmp4)
138 : * 2 Word32 (L_tmp1 & L_tmp2)
139 : * 4 Pointers (table_ptr, input_ptr1, input_ptr2, input_ptr3)
140 : *
141 : * The addition of 'in_ptr' + and index value from 'reorder_ptr'
142 : * is counted as a move16()
143 : */
144 :
145 550552 : input_ptr1 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift ); // Qx
146 :
147 550552 : L_tmp1 = L_mult( *input_ptr1++, 16384 );
148 550552 : L_tmp2 = L_mult( *input_ptr1, 16384 );
149 :
150 550552 : input_ptr1 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
151 :
152 550552 : tmp1 = msu_r_sat( L_tmp1, *input_ptr1, 16384 );
153 550552 : tmp3 = mac_r_sat( L_tmp1, *input_ptr1++, 16384 );
154 550552 : input_ptr2 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
155 550552 : input_ptr3 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
156 :
157 550552 : L_tmp1 = L_mult( *input_ptr2++, 16384 );
158 550552 : tmp2 = mac_r_sat( L_tmp1, *input_ptr3, 16384 );
159 550552 : tmp4 = msu_r_sat( L_tmp1, *input_ptr3++, 16384 );
160 550552 : L_tmp1 = L_mult( tmp3, 16384 );
161 550552 : out_ptr[k] = mac_r_sat( L_tmp1, tmp2, 16384 );
162 550552 : move16();
163 550552 : out_ptr[k + 4] = msu_r_sat( L_tmp1, tmp2, 16384 );
164 550552 : move16();
165 :
166 550552 : tmp2 = mac_r_sat( L_tmp2, *input_ptr1, 16384 );
167 550552 : tmp3 = msu_r_sat( L_tmp2, *input_ptr1, 16384 );
168 550552 : L_tmp2 = L_mult( *input_ptr2, 16384 );
169 :
170 550552 : L_tmp1 = L_mult( tmp1, 16384 );
171 550552 : tmp1 = msu_r_sat( L_tmp2, *input_ptr3, 16384 );
172 550552 : out_ptr[k + 2] = mac_r_sat( L_tmp1, tmp1, 16384 );
173 550552 : move16();
174 550552 : out_ptr[k + 6] = msu_r_sat( L_tmp1, tmp1, 16384 );
175 550552 : move16();
176 550552 : L_tmp1 = L_mult( tmp2, 16384 );
177 550552 : tmp2 = mac_r_sat( L_tmp2, *input_ptr3, 16384 );
178 550552 : out_ptr[k + 1] = mac_r_sat( L_tmp1, tmp2, 16384 );
179 550552 : move16();
180 550552 : out_ptr[k + 5] = msu_r_sat( L_tmp1, tmp2, 16384 );
181 550552 : move16();
182 550552 : L_tmp1 = L_mult( tmp3, 16384 );
183 550552 : out_ptr[k + 3] = msu_r_sat( L_tmp1, tmp4, 16384 );
184 550552 : move16();
185 550552 : out_ptr[k + 7] = mac_r_sat( L_tmp1, tmp4, 16384 );
186 550552 : move16();
187 : }
188 :
189 : /* Remaining Stages */
190 202890 : FOR( i = 2; i < NUM_STAGE; i++ )
191 : {
192 : /* i is stage counter */
193 155632 : jj = shl( 2, i ); /* FFT size */
194 155632 : kk = shl( jj, 1 ); /* 2 * FFT size */
195 155632 : ii = shr( SIZE, i );
196 155632 : ji = 0;
197 155632 : move16(); /* ji is phase table index */
198 :
199 2168808 : FOR( j = 0; j < jj; j += 2 )
200 : {
201 : /* j is sample counter */
202 6269448 : FOR( k = j; k < SIZE; k += kk )
203 : {
204 : /* k is butterfly top */
205 4256272 : kj = add( k, jj ); /* kj is butterfly bottom */
206 :
207 : /* Butterfly computations */
208 4256272 : L_tmp1 = L_msu( L_mult( *( out_ptr + kj ), phs_tbl[ji] ),
209 4256272 : *( out_ptr + kj + 1 ), phs_tbl[ji + 1] );
210 4256272 : L_tmp2 = L_mac( L_mult( *( out_ptr + kj + 1 ), phs_tbl[ji] ),
211 4256272 : *( out_ptr + kj ), phs_tbl[ji + 1] );
212 :
213 4256272 : out_ptr[kj] = mac_r( L_negate( L_tmp1 ), out_ptr[k], 16384 );
214 4256272 : move16();
215 4256272 : out_ptr[kj + 1] = mac_r( L_negate( L_tmp2 ), out_ptr[k + 1], 16384 );
216 4256272 : move16();
217 4256272 : out_ptr[k] = mac_r( L_tmp1, out_ptr[k], 16384 );
218 4256272 : move16();
219 4256272 : out_ptr[k + 1] = mac_r( L_tmp2, out_ptr[k + 1], 16384 );
220 4256272 : move16();
221 : }
222 2013176 : ji = add( ji, ii );
223 : }
224 : }
225 : }
226 : ELSE /* The ifFT part */
227 : {
228 : /* Unrolled 1st/2nd Stage
229 : * 1) to take advantage of Table Values (0 & +/- 16384)
230 : * 2) to perform reordering of Input Values
231 : */
232 400381 : FOR( k = 0; k < SIZE; k += 8 )
233 : {
234 : /*
235 : * This loop use:
236 : * 4 Word16 (tmp1...tmp4)
237 : * 2 Word32 (L_tmp1 & L_tmp2)
238 : * 5 Pointers (reorder_ptr, input_ptr1...input_ptr4)
239 : *
240 : * The addition of 'in_ptr' + and index value from 'reorder_ptr'
241 : * is counted as a move16()
242 : */
243 359576 : input_ptr1 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
244 359576 : input_ptr2 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
245 :
246 359576 : input_ptr3 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
247 359576 : input_ptr4 = in_ptr + ( const Word16 )( (uintptr_t) ( *table_ptr++ ) >> (uintptr_t) shift );
248 :
249 :
250 359576 : tmp3 = sub( *input_ptr1, *input_ptr2 );
251 359576 : tmp4 = add( *input_ptr1++, *input_ptr2++ );
252 :
253 359576 : tmp2 = sub( input_ptr3[0], input_ptr4[0] );
254 359576 : tmp1 = sub( input_ptr3[1], input_ptr4[1] );
255 :
256 359576 : out_ptr[k + 2] = sub( tmp3, tmp1 );
257 359576 : move16();
258 359576 : out_ptr[k + 6] = add( tmp3, tmp1 );
259 359576 : move16();
260 :
261 359576 : tmp1 = sub( *input_ptr1, *input_ptr2 );
262 359576 : out_ptr[k + 3] = add( tmp1, tmp2 );
263 359576 : move16();
264 359576 : out_ptr[k + 7] = sub( tmp1, tmp2 );
265 359576 : move16();
266 :
267 359576 : tmp1 = add( input_ptr3[0], input_ptr4[0] );
268 359576 : tmp3 = add( input_ptr3[1], input_ptr4[1] );
269 :
270 359576 : out_ptr[k] = add( tmp4, tmp1 );
271 359576 : move16();
272 359576 : out_ptr[k + 4] = sub( tmp4, tmp1 );
273 359576 : move16();
274 :
275 359576 : tmp4 = add( *input_ptr1, *input_ptr2 );
276 359576 : out_ptr[k + 1] = add( tmp4, tmp3 );
277 359576 : move16();
278 359576 : out_ptr[k + 5] = sub( tmp4, tmp3 );
279 359576 : move16();
280 : }
281 :
282 40805 : table_ptr = phs_tbl + SIZE; /* access part of table that is scaled by 2 */
283 :
284 : /* Remaining Stages */
285 164864 : FOR( i = 2; i < NUM_STAGE; i++ )
286 : {
287 : /* i is stage counter */
288 124059 : jj = shl( 2, i ); /* FFT size */
289 124059 : kk = shl( jj, 1 ); /* 2 * FFT size */
290 124059 : ii = shr( SIZE, i );
291 124059 : ji = 0;
292 124059 : move16(); /* ji is phase table index */
293 :
294 1399143 : FOR( j = 0; j < jj; j += 2 )
295 : {
296 : /* j is sample counter */
297 : /* This can be computed by successive add_fxitions of ii to ji, starting from 0
298 : hence line-count it as a one-line add (still need to increment op count!!) */
299 :
300 3670620 : FOR( k = j; k < SIZE; k += kk )
301 : {
302 : /* k is butterfly top */
303 2395536 : kj = add( k, jj ); /* kj is butterfly bottom */
304 :
305 : /* Butterfly computations */
306 2395536 : tmp1 = mac_r_sat( L_mult_sat( out_ptr[kj], table_ptr[ji] ), out_ptr[kj + 1], table_ptr[ji + 1] );
307 :
308 2395536 : tmp2 = msu_r_sat( L_mult_sat( out_ptr[kj + 1], table_ptr[ji] ), out_ptr[kj], table_ptr[ji + 1] );
309 :
310 2395536 : out_ptr[kj] = sub_sat( out_ptr[k], tmp1 );
311 2395536 : move16();
312 2395536 : out_ptr[kj + 1] = sub_sat( out_ptr[k + 1], tmp2 );
313 2395536 : move16();
314 2395536 : out_ptr[k] = add_sat( out_ptr[k], tmp1 );
315 2395536 : move16();
316 2395536 : out_ptr[k + 1] = add_sat( out_ptr[k + 1], tmp2 );
317 2395536 : move16();
318 : }
319 1275084 : ji = add( ji, ii );
320 : }
321 : }
322 : }
323 88063 : }
324 :
325 : /*--------------------------------------------------------------------------------*
326 : * r_fft_fx:
327 : *
328 : * Perform FFT fixed-point for real-valued sequences of length 32, 64 or 128
329 : *--------------------------------------------------------------------------------*/
330 88063 : void r_fft_fx_lc(
331 : const Word16 *phs_tbl, /* i : Table of phase */
332 : const Word16 SIZE, /* i : Size of the FFT */
333 : const Word16 SIZE2, /* i : Size / 2 */
334 : const Word16 NUM_STAGE, /* i : Number of stage */
335 : const Word16 *in_ptr, /* i : coefficients in the order re[0], re[1], ... re[n/2], im[n/2-1], im[n/2-2], ..., im[1] Qx*/
336 : Word16 *out_ptr, /* o : coefficients in the order re[0], re[1], ... re[n/2], im[n/2-1], im[n/2-2], ..., im[1] Q(x - 1)*/
337 : const Word16 isign /* i : 1=fft, otherwize it's ifft */
338 : )
339 : {
340 : Word16 tmp2_real, tmp2_imag;
341 : Word32 Ltmp1_real, Ltmp1_imag;
342 : Word16 i;
343 : Word32 Ltmp1;
344 : const Word16 *phstbl_ptrDn;
345 : Word16 *ptrDn;
346 : Word16 temp[1024]; /* Accommodates real input FFT size up to 1024. */
347 :
348 : /* Setup Pointers */
349 88063 : phstbl_ptrDn = &phs_tbl[SIZE - 1];
350 :
351 : /* The FFT part */
352 88063 : IF( isign != 0 )
353 : {
354 : Word16 *ptRealUp, *ptRealDn, *ptImaUp, *ptImaDn;
355 :
356 : /* Perform the complex FFT */
357 47258 : c_fft_fx( phs_tbl, SIZE, NUM_STAGE, in_ptr, temp, isign );
358 :
359 : /* First, handle the DC and foldover frequencies */
360 47258 : out_ptr[SIZE2] = sub( temp[0], temp[1] );
361 47258 : move16();
362 47258 : out_ptr[0] = sub( add( temp[0], temp[1] ), shr( NUM_STAGE, 1 ) );
363 47258 : move16(); /* DC have a small offset */
364 :
365 47258 : ptrDn = &temp[SIZE - 1];
366 :
367 47258 : ptImaDn = &out_ptr[SIZE - 1];
368 47258 : ptRealUp = &out_ptr[1];
369 47258 : ptImaUp = &out_ptr[SIZE2 + 1];
370 47258 : ptRealDn = &out_ptr[SIZE2 - 1];
371 :
372 : /* Now, handle the remaining positive frequencies */
373 1148362 : FOR( i = 2; i <= SIZE2; i += 2 )
374 : {
375 1101104 : Ltmp1_imag = L_mult( temp[i + 1], 16384 );
376 1101104 : Ltmp1_imag = L_msu_sat( Ltmp1_imag, *ptrDn, 16384 );
377 1101104 : tmp2_real = add_sat( temp[i + 1], *ptrDn-- );
378 :
379 1101104 : Ltmp1_real = L_mult( temp[i], 16384 );
380 1101104 : Ltmp1_real = L_mac_sat( Ltmp1_real, *ptrDn, 16384 );
381 1101104 : tmp2_imag = sub( *ptrDn--, temp[i] );
382 :
383 :
384 1101104 : *ptRealUp++ = msu_r_sat( L_mac_sat( Ltmp1_real, tmp2_real, phs_tbl[i] ), tmp2_imag, phs_tbl[i + 1] );
385 1101104 : move16();
386 1101104 : *ptImaDn-- = mac_r_sat( L_mac_sat( Ltmp1_imag, tmp2_imag, phs_tbl[i] ), tmp2_real, phs_tbl[i + 1] );
387 1101104 : move16();
388 1101104 : Ltmp1 = L_mac_sat( L_negate( Ltmp1_imag ), tmp2_real, *phstbl_ptrDn );
389 1101104 : Ltmp1_real = L_mac_sat( Ltmp1_real, tmp2_imag, *phstbl_ptrDn-- );
390 1101104 : *ptImaUp++ = msu_r_sat( Ltmp1, tmp2_imag, *phstbl_ptrDn );
391 1101104 : move16();
392 1101104 : *ptRealDn-- = mac_r_sat( Ltmp1_real, tmp2_real, *phstbl_ptrDn-- );
393 1101104 : move16();
394 : }
395 : }
396 : ELSE /* The ifFT part */
397 : {
398 : const Word16 *ptRealUp, *ptRealDn, *ptImaUp, *ptImaDn;
399 :
400 : /* First, handle the DC and foldover frequencies */
401 40805 : Ltmp1 = L_mult( in_ptr[0], 16384 );
402 40805 : temp[0] = mac_r( Ltmp1, in_ptr[SIZE2], 16384 );
403 40805 : move16();
404 40805 : temp[1] = msu_r( Ltmp1, in_ptr[SIZE2], 16384 );
405 40805 : move16();
406 :
407 40805 : ptrDn = &temp[SIZE - 1];
408 :
409 : /* Here we cast to Word16 * from a const Word16 *. */
410 : /* This is ok because we use these pointers for */
411 : /* reading only. This is just to avoid declaring a */
412 : /* bunch of 4 other pointer with const Word16 *. */
413 40805 : ptImaDn = &in_ptr[SIZE - 1]; // Qx
414 40805 : ptRealUp = &in_ptr[1]; // Qx
415 40805 : ptImaUp = &in_ptr[SIZE2 + 1]; // Qx
416 40805 : ptRealDn = &in_ptr[SIZE2 - 1]; // Qx
417 :
418 : /* Now, handle the remaining positive frequencies */
419 759957 : FOR( i = 2; i <= SIZE2; i += 2 )
420 : {
421 719152 : Ltmp1_imag = L_mult( *ptImaDn, 16384 );
422 719152 : Ltmp1_imag = L_msu_sat( Ltmp1_imag, *ptImaUp, 16384 );
423 719152 : tmp2_real = add_sat( *ptImaDn--, *ptImaUp++ );
424 719152 : Ltmp1_real = L_mult( *ptRealUp, 16384 );
425 719152 : Ltmp1_real = L_mac_sat( Ltmp1_real, *ptRealDn, 16384 );
426 719152 : tmp2_imag = sub_sat( *ptRealUp++, *ptRealDn-- );
427 719152 : temp[i] = mac_r_sat( L_msu_sat( Ltmp1_real, tmp2_real, phs_tbl[i] ), tmp2_imag, phs_tbl[i + 1] );
428 719152 : move16();
429 719152 : temp[i + 1] = mac_r_sat( L_mac_sat( Ltmp1_imag, tmp2_imag, phs_tbl[i] ), tmp2_real, phs_tbl[i + 1] );
430 719152 : move16();
431 719152 : Ltmp1 = L_mac_sat( L_negate( Ltmp1_imag ), tmp2_real, *phstbl_ptrDn );
432 719152 : Ltmp1_real = L_msu_sat( Ltmp1_real, tmp2_imag, *phstbl_ptrDn-- );
433 719152 : *ptrDn-- = msu_r_sat( Ltmp1, tmp2_imag, *phstbl_ptrDn );
434 719152 : move16();
435 719152 : *ptrDn-- = msu_r_sat( Ltmp1_real, tmp2_real, *phstbl_ptrDn-- );
436 719152 : move16();
437 : }
438 :
439 : /* Perform the complex ifFT */
440 40805 : c_fft_fx( phs_tbl, SIZE, NUM_STAGE, temp, out_ptr, isign );
441 : }
442 88063 : }
443 :
444 :
445 : /*---------------------------------------------------------------------*
446 : * fft_rel()
447 : *
448 : * Computes the split-radix FFT in place for the real-valued
449 : * signal x of length n. The algorithm has been ported from
450 : * the Fortran code of [1].
451 : *
452 : * The function needs sine and cosine tables t_sin and t_cos,
453 : * and the constant N_MAX_FFT. The table entries are defined as
454 : * sin(2*pi*i) and cos(2*pi*i) for i = 0, 1, ..., N_MAX_FFT-1. The
455 : * implementation assumes that any entry will not be needed
456 : * outside the tables. Therefore, N_MAX_FFT and n must be properly
457 : * set. The function has been tested with the values n = 16,
458 : * 32, 64, 128, 256, and N_MAX_FFT = 1280.
459 : *
460 : * References
461 : * [1] H.V. Sorensen, D.L. Jones, M.T. Heideman, C.S. Burrus,
462 : * "Real-valued fast Fourier transform algorithm," IEEE
463 : * Trans. on Signal Processing, Vol.35, No.6, pp 849-863,
464 : * 1987.
465 : *
466 : * OUTPUT
467 : * x[0:n-1] Transform coeffients in the order re[0], re[1],
468 : * ..., re[n/2], im[n/2-1], ..., im[1].
469 : *---------------------------------------------------------------------*/
470 :
471 2393694 : void fft_rel_16_32fx(
472 : Word16 x[], /* i/o: input/output vector Qx */
473 : Word16 *q_x, /* extra scaling added on speech buffer*/
474 : Word16 i_subfr,
475 : const Word16 n, /* i : vector length */
476 : const Word16 m /* i : log2 of vector length */
477 : )
478 : {
479 : Word16 i, j, k, n1, n2, n4;
480 : Word16 step;
481 : Word32 xt, t1, t2;
482 : Word32 *x0, *x1, *x2;
483 : const Word16 *s, *c;
484 : Word32 *xi2, *xi3, *xi4, *xi1;
485 :
486 : Word32 fft_bff32[L_FFT];
487 2393694 : Copy_Scale_sig_16_32_no_sat( x, fft_bff32, L_FFT, 0 ); // copying x to fft_bff32 without scaling
488 :
489 : /*-----------------------------------------------------------------*
490 : * Digit reverse counter
491 : *-----------------------------------------------------------------*/
492 :
493 2393694 : j = 0;
494 2393694 : move16();
495 2393694 : x0 = &fft_bff32[0]; // Qx
496 612785664 : FOR( i = 0; i < n - 1; i++ )
497 : {
498 610391970 : IF( LT_16( i, j ) )
499 : {
500 287243280 : xt = fft_bff32[j]; // Qx
501 287243280 : move32();
502 287243280 : fft_bff32[j] = *x0; // Qx
503 287243280 : move32();
504 287243280 : *x0 = xt; // Qx
505 287243280 : move32();
506 : }
507 610391970 : x0++;
508 610391970 : k = shr( n, 1 );
509 1201634388 : WHILE( ( k <= j ) )
510 : {
511 591242418 : j = sub( j, k );
512 591242418 : k = shr( k, 1 );
513 : }
514 610391970 : j = add( j, k );
515 : }
516 :
517 : /*-----------------------------------------------------------------*
518 : * Length two butterflies
519 : *-----------------------------------------------------------------*/
520 :
521 2393694 : x0 = &fft_bff32[0];
522 2393694 : x1 = &fft_bff32[1];
523 308786526 : FOR( i = 0; i < ( n >> 1 ); i++ )
524 : {
525 306392832 : xt = *x0;
526 306392832 : move32();
527 306392832 : *x0 = L_add( xt, *x1 );
528 306392832 : move32();
529 306392832 : *x1 = L_sub( xt, *x1 );
530 306392832 : move32();
531 306392832 : x0++;
532 306392832 : x0++;
533 306392832 : x1++;
534 306392832 : x1++;
535 : }
536 :
537 : /*-----------------------------------------------------------------*
538 : * Other butterflies
539 : *
540 : * The implementation described in [1] has been changed by using
541 : * table lookup for evaluating sine and cosine functions. The
542 : * variable ind and its increment step are needed to access table
543 : * entries. Note that this implementation assumes n4 to be so
544 : * small that ind will never exceed the table. Thus the input
545 : * argument n and the constant N_MAX_SAS must be set properly.
546 : *-----------------------------------------------------------------*/
547 :
548 2393694 : n2 = 1;
549 2393694 : move16();
550 : /* step = N_MAX_SAS/4; */
551 19149552 : FOR( k = 2; k <= m; k++ )
552 : {
553 16755858 : n4 = n2;
554 16755858 : move16();
555 16755858 : n2 = shl( n4, 1 );
556 16755858 : n1 = shl( n2, 1 );
557 :
558 16755858 : step = idiv1616( N_MAX_SAS, n1 );
559 :
560 16755858 : x0 = fft_bff32;
561 16755858 : x1 = fft_bff32 + n2;
562 16755858 : x2 = fft_bff32 + add( n2, n4 );
563 320754996 : FOR( i = 0; i < n; i += n1 )
564 : {
565 303999138 : xt = *x0;
566 303999138 : move32(); /* xt = x[i]; */
567 303999138 : *x0 = L_add( xt, *x1 );
568 303999138 : move32(); /* x[i] = xt + x[i+n2]; */
569 303999138 : *x1 = L_sub( xt, *x1 );
570 303999138 : move32(); /* x[i+n2] = xt - x[i+n2]; */
571 303999138 : *x2 = L_negate( *x2 );
572 303999138 : move32(); /* x[i+n2+n4] = -x[i+n2+n4]; */
573 :
574 :
575 303999138 : s = sincos_t_fx + step; // Q15
576 303999138 : c = s + 64; // Q15
577 303999138 : xi1 = fft_bff32 + add( i, 1 );
578 303999138 : xi3 = xi1 + n2;
579 303999138 : xi2 = xi3 - 2;
580 303999138 : xi4 = xi1 + sub( n1, 2 );
581 :
582 1072374912 : FOR( j = 1; j < n4; j++ )
583 : {
584 768375774 : t1 = L_add( Mpy_32_16_1( *xi3, *c ), Mpy_32_16_1( *xi4, *s ) ); /* t1 = *xi3**(pt_c+ind) + *xi4**(pt_s+ind); Qx */
585 768375774 : t2 = L_sub( Mpy_32_16_1( *xi3, *s ), Mpy_32_16_1( *xi4, *c ) ); /* t2 = *xi3**(pt_s+ind) - *xi4**(pt_c+ind); Qx */
586 768375774 : *xi4 = L_sub( *xi2, t2 );
587 768375774 : move32();
588 768375774 : *xi3 = L_negate( L_add( *xi2, t2 ) );
589 768375774 : move32();
590 768375774 : *xi2 = L_sub( *xi1, t1 );
591 768375774 : move32();
592 768375774 : *xi1 = L_add( *xi1, t1 );
593 768375774 : move32();
594 :
595 768375774 : xi4--;
596 768375774 : xi2--;
597 768375774 : xi3++;
598 768375774 : xi1++;
599 768375774 : c += step;
600 768375774 : s += step; /* autoincrement by ar0 */
601 : }
602 :
603 303999138 : x0 += n1;
604 303999138 : x1 += n1;
605 303999138 : x2 += n1;
606 : }
607 : /* step = shr(step, 1); */
608 : }
609 2393694 : Word16 norm = L_norm_arr( fft_bff32, L_FFT );
610 2393694 : IF( i_subfr == 0 )
611 : {
612 1196847 : Copy_Scale_sig32_16( fft_bff32, x, L_FFT, norm );
613 1196847 : *q_x = sub( norm, 16 );
614 1196847 : move16();
615 : }
616 : ELSE
617 : {
618 1196847 : IF( LT_16( sub( norm, 16 ), *q_x ) )
619 : {
620 240144 : scale_sig( x - L_FFT, L_FFT, sub( sub( norm, 16 ), *q_x ) );
621 240144 : Copy_Scale_sig32_16( fft_bff32, x, L_FFT, norm );
622 240144 : *q_x = sub( norm, 16 );
623 240144 : move16();
624 : }
625 : ELSE
626 : {
627 956703 : Copy_Scale_sig32_16( fft_bff32, x, L_FFT, add( 16, *q_x ) );
628 : }
629 : }
630 :
631 2393694 : return;
632 : }
633 :
634 268148 : void fft_rel_fx(
635 : Word16 x[], /* i/o: input/output vector Qx */
636 : const Word16 n, /* i : vector length */
637 : const Word16 m /* i : log2 of vector length */
638 : )
639 : {
640 : Word16 i, j, k, n1, n2, n4;
641 : Word16 step;
642 : Word16 xt, t1, t2;
643 : Word16 *x0, *x1, *x2;
644 : const Word16 *s, *c;
645 : Word16 *xi2, *xi3, *xi4, *xi1;
646 : #ifdef BASOP_NOGLOB_DECLARE_LOCAL
647 268148 : Flag Overflow = 0;
648 268148 : move32();
649 : #endif
650 :
651 :
652 : /*-----------------------------------------------------------------*
653 : * Digit reverse counter
654 : *-----------------------------------------------------------------*/
655 :
656 268148 : j = 0;
657 268148 : move16();
658 268148 : x0 = &x[0]; // Qx
659 268148 : move16();
660 60958376 : FOR( i = 0; i < n - 1; i++ )
661 : {
662 60690228 : IF( LT_16( i, j ) )
663 : {
664 28547546 : xt = x[j]; // Qx
665 28547546 : move16();
666 28547546 : x[j] = *x0; // Qx
667 28547546 : move16();
668 28547546 : *x0 = xt; // Qx
669 28547546 : move16();
670 : }
671 60690228 : x0++;
672 60690228 : k = shr( n, 1 );
673 119418308 : WHILE( ( k <= j ) )
674 : {
675 58728080 : j = sub( j, k );
676 58728080 : k = shr( k, 1 );
677 : }
678 60690228 : j = add( j, k );
679 : }
680 :
681 : /*-----------------------------------------------------------------*
682 : * Length two butterflies
683 : *-----------------------------------------------------------------*/
684 :
685 268148 : x0 = &x[0];
686 268148 : move16();
687 268148 : x1 = &x[1];
688 268148 : move16();
689 30747336 : FOR( i = 0; i < ( n >> 1 ); i++ )
690 : {
691 30479188 : xt = *x0;
692 30479188 : move16();
693 30479188 : *x0 = add_o( xt, *x1, &Overflow );
694 30479188 : move16();
695 30479188 : *x1 = sub_o( xt, *x1, &Overflow );
696 30479188 : move16();
697 30479188 : x0++;
698 30479188 : x0++;
699 30479188 : x1++;
700 30479188 : x1++;
701 : }
702 :
703 : /*-----------------------------------------------------------------*
704 : * Other butterflies
705 : *
706 : * The implementation described in [1] has been changed by using
707 : * table lookup for evaluating sine and cosine functions. The
708 : * variable ind and its increment step are needed to access table
709 : * entries. Note that this implementation assumes n4 to be so
710 : * small that ind will never exceed the table. Thus the input
711 : * argument n and the constant N_MAX_SAS must be set properly.
712 : *-----------------------------------------------------------------*/
713 :
714 268148 : n2 = 1;
715 268148 : move16();
716 : /* step = N_MAX_SAS/4; */
717 1962148 : FOR( k = 2; k <= m; k++ )
718 : {
719 1694000 : n4 = n2;
720 1694000 : move16();
721 1694000 : n2 = shl( n4, 1 );
722 1694000 : n1 = shl( n2, 1 );
723 :
724 1694000 : step = idiv1616( N_MAX_SAS, n1 );
725 :
726 1694000 : x0 = x;
727 1694000 : x1 = x + n2;
728 1694000 : x2 = x + add( n2, n4 );
729 31905040 : FOR( i = 0; i < n; i += n1 )
730 : {
731 30211040 : xt = *x0;
732 30211040 : move16(); /* xt = x[i]; */
733 30211040 : *x0 = add_o( xt, *x1, &Overflow );
734 30211040 : move16(); /* x[i] = xt + x[i+n2]; */
735 30211040 : *x1 = sub_o( xt, *x1, &Overflow );
736 30211040 : move16(); /* x[i+n2] = xt - x[i+n2]; */
737 30211040 : *x2 = negate( *x2 );
738 30211040 : move16(); /* x[i+n2+n4] = -x[i+n2+n4]; */
739 :
740 :
741 30211040 : s = sincos_t_fx + step; // Q15
742 30211040 : c = s + 64; // Q15
743 30211040 : xi1 = x + add( i, 1 );
744 30211040 : xi3 = xi1 + n2;
745 30211040 : xi2 = xi3 - 2;
746 30211040 : xi4 = xi1 + sub( n1, 2 );
747 :
748 106494122 : FOR( j = 1; j < n4; j++ )
749 : {
750 76283082 : t1 = add_o( mult_r( *xi3, *c ), mult_r( *xi4, *s ), &Overflow ); /* t1 = *xi3**(pt_c+ind) + *xi4**(pt_s+ind); Qx */
751 76283082 : t2 = sub_o( mult_r( *xi3, *s ), mult_r( *xi4, *c ), &Overflow ); /* t2 = *xi3**(pt_s+ind) - *xi4**(pt_c+ind); Qx */
752 76283082 : *xi4 = sub_o( *xi2, t2, &Overflow );
753 76283082 : move16();
754 76283082 : *xi3 = negate( add_o( *xi2, t2, &Overflow ) );
755 76283082 : move16();
756 76283082 : *xi2 = sub_o( *xi1, t1, &Overflow );
757 76283082 : move16();
758 76283082 : *xi1 = add_o( *xi1, t1, &Overflow );
759 76283082 : move16();
760 :
761 76283082 : xi4--;
762 76283082 : xi2--;
763 76283082 : xi3++;
764 76283082 : xi1++;
765 76283082 : c += step;
766 76283082 : s += step; /* autoincrement by ar0 */
767 : }
768 :
769 30211040 : x0 += n1;
770 30211040 : x1 += n1;
771 30211040 : x2 += n1;
772 : }
773 : /* step = shr(step, 1); */
774 : }
775 :
776 268148 : return;
777 : }
778 :
779 60557 : void fft_rel_fx32(
780 : Word32 x[], /* i/o: input/output vector Qx */
781 : const Word16 n, /* i : vector length */
782 : const Word16 m /* i : log2 of vector length */
783 : )
784 : {
785 : Word16 i, j, k, n1, n2, n4;
786 : Word16 step;
787 : Word32 xt, t1, t2;
788 : Word32 *x0, *x1, *x2;
789 : Word32 *xi2, *xi3, *xi4, *xi1;
790 : const Word16 *s, *c;
791 : const Word16 *idx;
792 :
793 : /* !!!! NMAX = 256 is hardcoded here (similar optimizations should be done for NMAX > 256) !!! */
794 :
795 : Word32 *x2even, *x2odd;
796 : Word32 temp[512];
797 :
798 60557 : test();
799 60557 : test();
800 60557 : IF( EQ_16( n, 128 ) || EQ_16( n, 256 ) || EQ_16( n, 512 ) )
801 : {
802 60557 : idx = fft256_read_indexes;
803 :
804 : /* Combined Digit reverse counter & Length two butterflies */
805 60557 : IF( EQ_16( n, 128 ) )
806 : {
807 0 : x2 = temp;
808 0 : FOR( i = 0; i < 64; i++ )
809 : {
810 0 : j = *idx++;
811 0 : move16();
812 0 : k = *idx++;
813 0 : move16();
814 :
815 0 : *x2++ = L_add( x[( j >> 1 )], x[( k >> 1 )] ); // Qx
816 0 : move16();
817 0 : *x2++ = L_sub( x[( j >> 1 )], x[( k >> 1 )] ); // Qx
818 0 : move16();
819 : }
820 : }
821 60557 : ELSE IF( EQ_16( n, 256 ) )
822 : {
823 488 : x2 = temp;
824 62952 : FOR( i = 0; i < 128; i++ )
825 : {
826 62464 : j = *idx++;
827 62464 : move16();
828 62464 : k = *idx++;
829 62464 : move16();
830 :
831 62464 : *x2++ = L_add( x[j], x[k] ); // Qx
832 62464 : move16();
833 62464 : *x2++ = L_sub( x[j], x[k] ); // Qx
834 62464 : move16();
835 : }
836 : }
837 60069 : ELSE IF( EQ_16( n, 512 ) )
838 : {
839 60069 : x2even = temp;
840 60069 : x2odd = temp + 256;
841 :
842 7748901 : FOR( i = 0; i < 128; i++ )
843 : {
844 7688832 : j = shl( *idx, 1 );
845 7688832 : idx++;
846 7688832 : k = shl( *idx, 1 );
847 7688832 : idx++;
848 :
849 7688832 : *x2even++ = L_add( x[j], x[k] ); // Qx
850 7688832 : move16();
851 7688832 : *x2even++ = L_sub( x[j], x[k] ); // Qx
852 7688832 : move16();
853 7688832 : j = add( j, 1 );
854 7688832 : k = add( k, 1 );
855 7688832 : *x2odd++ = L_add( x[j], x[k] ); // Qx
856 7688832 : move16();
857 7688832 : *x2odd++ = L_sub( x[j], x[k] ); // Qx
858 7688832 : move16();
859 : }
860 : }
861 :
862 : /*-----------------------------------------------------------------*
863 : * 1st Stage Loop has been Unrolled because n4 is '1' and that
864 : * allows the elimination of the 'for_ (j = 1; j < n4; j++)' loop
865 : * and the associated pointers initialization.
866 : * Also, it allows to Put the Data from 'temp' back into 'x' due
867 : * to the previous Combined Digit Reverse and Length two butterflies
868 : *-----------------------------------------------------------------*/
869 :
870 : /*for_ (k = 2; k < 3; k++)*/
871 : {
872 60557 : x0 = temp;
873 60557 : x1 = x0 + 2;
874 60557 : x2 = x; // Qx
875 :
876 7780621 : FOR( i = 0; i < n; i += 4 )
877 : {
878 7720064 : *x2++ = L_add( *x0++, *x1 ); /* x[i] = xt + x[i+n2]; */
879 7720064 : move16();
880 7720064 : *x2++ = *x0;
881 7720064 : move16();
882 7720064 : x0--;
883 7720064 : *x2++ = L_sub( *x0, *x1 ); /* x[i+n2] = xt - x[i+n2]; */
884 7720064 : move16();
885 7720064 : x1++;
886 7720064 : *x2++ = L_negate( *x1 ); /* x[i+n2+n4] = -x[i+n2+n4]; */
887 7720064 : move16();
888 :
889 7720064 : x0 += 4;
890 7720064 : x1 += 3; /* x1 has already advanced */
891 : }
892 : }
893 : }
894 : ELSE
895 : {
896 : /*-----------------------------------------------------------------*
897 : * Digit reverse counter
898 : *-----------------------------------------------------------------*/
899 :
900 0 : j = 0;
901 0 : move16();
902 0 : x0 = &x[0]; // Qx
903 0 : FOR( i = 0; i < ( n - 1 ); i++ )
904 : {
905 0 : IF( LT_16( i, j ) )
906 : {
907 0 : xt = x[j]; // Qx
908 0 : move32();
909 0 : x[j] = *x0;
910 0 : move32();
911 0 : *x0 = xt;
912 0 : move32();
913 : }
914 0 : x0++;
915 0 : k = shr( n, 1 );
916 0 : WHILE( ( k <= j ) )
917 : {
918 0 : j = sub( j, k );
919 0 : k = shr( k, 1 );
920 : }
921 0 : j = add( j, k );
922 : }
923 :
924 : /*-----------------------------------------------------------------*
925 : * Length two butterflies
926 : *-----------------------------------------------------------------*/
927 :
928 0 : x0 = &x[0]; // Qx
929 0 : x1 = &x[1]; // Qx
930 0 : FOR( i = 0; i < ( n >> 1 ); i++ )
931 : {
932 0 : *x1 = L_sub( *x0, *x1 ); // Qx
933 0 : move32();
934 0 : *x0 = L_sub( L_shl( *x0, 1 ), *x1 ); //*x0 * 2 - *x1 (Qx)
935 0 : move32();
936 :
937 0 : x0++;
938 0 : x0++;
939 0 : x1++;
940 0 : x1++;
941 : }
942 :
943 : /*-----------------------------------------------------------------*
944 : * 1st Stage Loop has been Unrolled because n4 is '1' and that
945 : * allows the elimination of the 'for_ (j = 1; j < n4; j++)' loop
946 : * and the associated pointers initialization.
947 : *-----------------------------------------------------------------*/
948 :
949 : /* for_ (k = 2; k < 3; k++) */
950 : {
951 0 : x0 = x; // Qx
952 0 : x1 = x0 + 2;
953 :
954 0 : FOR( i = 0; i < n; i += 4 )
955 : {
956 0 : *x1 = L_sub( *x0, *x1 ); /* x[i+n2] = xt - x[i+n2]; Qx*/
957 0 : move32();
958 0 : *x0 = L_sub( L_shl( *x0, 1 ), *x1++ ); /* x[i] = xt + x[i+n2]; */ /**x0 * 2 - *x1 (Qx)*/
959 0 : move32();
960 0 : *x1 = L_negate( *x1 ); /* x[i+n2+n4] = -x[i+n2+n4]; Qx*/
961 0 : move32();
962 :
963 0 : x0 += 4;
964 0 : x1 += 3; /* x1 has already advanced */
965 : }
966 : }
967 : }
968 :
969 : /*-----------------------------------------------------------------*
970 : * Other butterflies
971 : *
972 : * The implementation described in [1] has been changed by using
973 : * table lookup for evaluating sine and cosine functions. The
974 : * variable ind and its increment step are needed to access table
975 : * entries. Note that this implementation assumes n4 to be so
976 : * small that ind will never exceed the table. Thus the input
977 : * argument n and the constant N_MAX_FFT must be set properly.
978 : *-----------------------------------------------------------------*/
979 :
980 60557 : n4 = 1;
981 60557 : move16();
982 60557 : n2 = 2;
983 60557 : move16();
984 60557 : n1 = 4;
985 60557 : move16();
986 :
987 60557 : step = N_MAX_DIV4;
988 60557 : move16();
989 :
990 483968 : FOR( k = 3; k <= m; k++ )
991 : {
992 423411 : step = shr( step, 1 );
993 423411 : n4 = shl( n4, 1 );
994 423411 : n2 = shl( n2, 1 );
995 423411 : n1 = shl( n1, 1 );
996 :
997 423411 : x0 = x;
998 423411 : x1 = x0 + n2;
999 423411 : x2 = x1 + n4;
1000 :
1001 8082918 : FOR( i = 0; i < n; i += n1 )
1002 : {
1003 7659507 : *x1 = L_sub( *x0, *x1 ); /* x[i+n2] = xt - x[i+n2]; */
1004 7659507 : move32();
1005 7659507 : *x0 = L_sub( L_shl( *x0, 1 ), *x1 ); /* x[i] = xt + x[i+n2]; */ /**x0 * 2 - *x1 (Qx)*/
1006 7659507 : move32();
1007 7659507 : *x2 = L_negate( *x2 ); /* x[i+n2+n4] = -x[i+n2+n4]; Qx */
1008 7659507 : move32();
1009 :
1010 7659507 : s = sincos_t_ext_fx; // Q15
1011 7659507 : c = s + N_MAX_FFT / 4; /* 1024/4 = 256, 256/4=64 Q15*/
1012 7659507 : xi1 = x0;
1013 7659507 : xi3 = xi1 + n2;
1014 7659507 : xi2 = xi3;
1015 7659507 : x0 += n1;
1016 7659507 : xi4 = x0;
1017 :
1018 54009216 : FOR( j = 1; j < n4; j++ )
1019 : {
1020 46349709 : xi3++;
1021 46349709 : xi1++;
1022 46349709 : xi4--;
1023 46349709 : xi2--;
1024 46349709 : c += step;
1025 46349709 : s += step; /* autoincrement by ar0 */
1026 :
1027 46349709 : t1 = L_add( Mpy_32_16_1( *xi3, *c ), Mpy_32_16_1( *xi4, *s ) ); /* t1 = *xi3**(pt_c+ind) + *xi4**(pt_s+ind); Qx*/
1028 46349709 : t2 = L_sub( Mpy_32_16_1( *xi3, *s ), Mpy_32_16_1( *xi4, *c ) ); /* t2 = *xi3**(pt_s+ind) - *xi4**(pt_c+ind); Qx*/
1029 :
1030 46349709 : *xi4 = L_sub( *xi2, t2 );
1031 46349709 : move32();
1032 46349709 : *xi2 = L_sub( *xi1, t1 );
1033 46349709 : move32();
1034 46349709 : *xi1 = L_sub( L_shl( *xi1, 1 ), *xi2 ); // Qx
1035 46349709 : move32();
1036 46349709 : *xi3 = L_negate( L_add( L_shl( t2, 1 ), *xi4 ) ); // Qx
1037 46349709 : move32();
1038 : }
1039 :
1040 7659507 : x1 += n1;
1041 7659507 : x2 += n1;
1042 : }
1043 : }
1044 :
1045 60557 : return;
1046 : }
|