Line data Source code
1 : /*====================================================================================
2 : EVS Codec 3GPP TS26.452 Aug 12, 2021. Version 16.3.0
3 : ====================================================================================*/
4 :
5 : #include <assert.h>
6 : #include "prot_fx.h"
7 : #include "basop_util.h"
8 : #include "rom_basop_util.h"
9 : #include "rom_com.h"
10 : #include "options.h"
11 : #include "stl.h"
12 : /************************************************************************/
13 : /* FFT */
14 : /************************************************************************/
15 : #define SCALEFACTOR16 ( 5 )
16 : #define SCALEFACTOR20 ( 5 )
17 :
18 :
19 : void fft16_with_cmplx_data( cmplx *pInp, Word16 bsacle );
20 :
21 : /**
22 : * \brief Profiling / Precision results
23 : *
24 : * Profiling / Precision of complex valued FFTs: BASOP_cfft()
25 : *
26 : * WOPS BASOP Precision BASOP
27 : * FFT5 87 16.96
28 : * FFT8 108 17.04
29 : * FFT10 194 16.70
30 : * FFT15 354 16.97
31 : * FFT16 288 16.62
32 : * FFT20 368 16.06
33 : * FFT30 828 16.80
 *  FFT32            752         15.45  (cplx mult with 3 mults and 3 adds)
 *  FFT32            824         16.07  (cplx mult with 4 mults and 2 adds)
36 : * FFT64 ( 8x 8) 3.129 15.16
37 : * FFT80 (10x 8) 4.385 15.55
38 : * FFT100 (20x 5) 6.518 15.65
39 : * FFT120 (15x 8) 7.029 15.38
40 : * FFT128 (16x 8) 6.777 15.28
41 : * FFT160 (20x 8) 9.033 14.95
42 : * FFT240 (30x 8) 14.961 15.49
 *  FFT256 (32x 8)   14.905      14.61  (cplx mult with 3 mults and 3 adds)
 *  FFT256 (32x 8)   15.265      15.04  (cplx mult with 4 mults and 2 adds)
45 : * FFT320 (20x16) 21.517 15.21
46 : *
47 : *
48 : * Profiling / Precision of real valued FFTs / iFFTs: BASOP_rfft()
49 : *
50 : * WOPS BASOP Precision BASOP
51 : * rFFT40 955 15.68
52 : * rFFT64 1635 16.17
53 : *
54 : * irFFT40 1116 15.36
55 : * irFFT64 1759 15.18
56 : *
57 : */
58 :
59 :
60 : #define Mpy_32_xx Mpy_32_16_1
61 :
62 : #define FFTC( x ) WORD322WORD16( (Word32) x )
63 :
64 : #define C31 ( FFTC( 0x91261468 ) ) /* FL2WORD32( -0.86602540) -sqrt(3)/2 */
65 :
66 : #define C51 ( FFTC( 0x79bc3854 ) ) /* FL2WORD32( 0.95105652) */
67 : #define C52 ( FFTC( 0x9d839db0 ) ) /* FL2WORD32(-1.53884180/2) */
68 : #define C53 ( FFTC( 0xd18053ce ) ) /* FL2WORD32(-0.36327126) */
69 : #define C54 ( FFTC( 0x478dde64 ) ) /* FL2WORD32( 0.55901699) */
70 : #define C55 ( FFTC( 0xb0000001 ) ) /* FL2WORD32(-1.25/2) */
71 :
72 : #define C81 ( FFTC( 0x5a82799a ) ) /* FL2WORD32( 7.071067811865475e-1) */
73 : #define C82 ( FFTC( 0xa57d8666 ) ) /* FL2WORD32(-7.071067811865475e-1) */
74 :
75 : #define C161 ( FFTC( 0x5a82799a ) ) /* FL2WORD32( 7.071067811865475e-1) INV_SQRT2 */
76 : #define C162 ( FFTC( 0xa57d8666 ) ) /* FL2WORD32(-7.071067811865475e-1) -INV_SQRT2 */
77 :
78 : #define C163 ( FFTC( 0x7641af3d ) ) /* FL2WORD32( 9.238795325112867e-1) COS_PI_DIV8 */
79 : #define C164 ( FFTC( 0x89be50c3 ) ) /* FL2WORD32(-9.238795325112867e-1) -COS_PI_DIV8 */
80 :
81 : #define C165 ( FFTC( 0x30fbc54d ) ) /* FL2WORD32( 3.826834323650898e-1) COS_3PI_DIV8 */
82 : #define C166 ( FFTC( 0xcf043ab3 ) ) /* FL2WORD32(-3.826834323650898e-1) -COS_3PI_DIV8 */
83 :
84 :
85 : #define cplxMpy4_8_0( re, im, a, b, c, d ) \
86 : re = L_shr( L_sub( Mpy_32_xx( a, c ), Mpy_32_xx( b, d ) ), 1 ); \
87 : im = L_shr( L_add( Mpy_32_xx( a, d ), Mpy_32_xx( b, c ) ), 1 );
88 :
89 : #define cplxMpy4_8_1( re, im, a, b ) \
90 : re = L_shr( a, 1 ); \
91 : im = L_shr( b, 1 );
92 :
93 :
/**
 * \brief Function performs a complex 5-point FFT
 *        The FFT is performed inplace. The result of the FFT
 *        is scaled by SCALEFACTOR5 bits.
 *
 * WOPS with 32x16 bit multiplications: 88 cycles
 *
 * \param [i/o] inp  pointer to 5 complex values; input at Qx,
 *                   output scaled down by SCALEFACTOR5 bits
 *
 * \return void
 */
static void fft5_with_cmplx_data( cmplx *inp /*Qx*/ )
{
    cmplx x0, x1, x2, x3, x4; /* pre-scaled inputs */
    cmplx y1, y2, y3, y4;     /* butterfly intermediates */
    cmplx t;

    /* Scale inputs down to create headroom for the additions below. */
    x0 = CL_shr( inp[0], SCALEFACTOR5 ); // Qx - 4
    x1 = CL_shr( inp[1], SCALEFACTOR5 ); // Qx - 4
    x2 = CL_shr( inp[2], SCALEFACTOR5 ); // Qx - 4
    x3 = CL_shr( inp[3], SCALEFACTOR5 ); // Qx - 4
    x4 = CL_shr( inp[4], SCALEFACTOR5 ); // Qx - 4

    /* Radix-5 pre-additions. */
    y1 = CL_add( x1, x4 );
    y4 = CL_sub( x1, x4 );
    y3 = CL_add( x2, x3 );
    y2 = CL_sub( x2, x3 );
    t = CL_scale_t( CL_sub( y1, y3 ), C54 );
    y1 = CL_add( y1, y3 );
    inp[0] = CL_add( x0, y1 ); /* DC output */

    /* Bit shift left because of the constant C55 which was scaled with the factor 0.5 because of the representation of
       the values as fracts */
    y1 = CL_add( inp[0], ( CL_shl( CL_scale_t( y1, C55 ), 1 ) ) );
    y3 = CL_sub( y1, t );
    y1 = CL_add( y1, t );

    t = CL_scale_t( CL_add( y4, y2 ), C51 );
    /* Bit shift left because of the constant C52 which was scaled with the factor 0.5 because of the representation of
       the values as fracts */
    y4 = CL_add( t, CL_shl( CL_scale_t( y4, C52 ), 1 ) );
    y2 = CL_add( t, CL_scale_t( y2, C53 ) );


    /* combination: post-additions with +/-j rotations form outputs 1..4 */
    inp[1] = CL_msu_j( y1, y2 );
    inp[4] = CL_mac_j( y1, y2 );

    inp[2] = CL_mac_j( y3, y4 );
    inp[3] = CL_msu_j( y3, y4 );

#ifdef WMOPS
    multiCounter[currCounter].CL_move += 5;
#endif
}
151 :
/**
 * \brief Function performs a complex 8-point FFT
 *        The FFT is performed inplace. The result of the FFT
 *        is scaled by SCALEFACTOR8 bits.
 *
 * WOPS with 32x16 bit multiplications: 108 cycles
 *
 * \param [i/o] inp  pointer to 8 complex values; input at Qx,
 *                   output scaled down by SCALEFACTOR8 bits
 *
 * \return void
 */
static void fft8_with_cmplx_data( cmplx *inp /*Qx*/ )
{
    cmplx x0, x1, x2, x3, x4, x5, x6, x7; /* pre-scaled inputs */
    cmplx s0, s1, s2, s3, s4, s5, s6, s7; /* post-multiplication intermediates */
    cmplx t0, t1, t2, t3, t4, t5, t6, t7; /* pre-addition intermediates */

    /* Pre-additions: scale inputs down first to create headroom. */
    x0 = CL_shr( inp[0], SCALEFACTOR8 ); // Qx - 4
    x1 = CL_shr( inp[1], SCALEFACTOR8 );
    x2 = CL_shr( inp[2], SCALEFACTOR8 );
    x3 = CL_shr( inp[3], SCALEFACTOR8 );
    x4 = CL_shr( inp[4], SCALEFACTOR8 );
    x5 = CL_shr( inp[5], SCALEFACTOR8 );
    x6 = CL_shr( inp[6], SCALEFACTOR8 );
    x7 = CL_shr( inp[7], SCALEFACTOR8 );

    /* loops are unrolled */
    {
        t0 = CL_add( x0, x4 );
        t1 = CL_sub( x0, x4 );

        t2 = CL_add( x1, x5 );
        t3 = CL_sub( x1, x5 );

        t4 = CL_add( x2, x6 );
        t5 = CL_sub( x2, x6 );

        t6 = CL_add( x3, x7 );
        t7 = CL_sub( x3, x7 );
    }

    /* Pre-additions and core multiplications */

    s0 = CL_add( t0, t4 );
    s2 = CL_sub( t0, t4 );

    s4 = CL_mac_j( t1, t5 );
    s5 = CL_msu_j( t1, t5 );

    s1 = CL_add( t2, t6 );
    s3 = CL_sub( t2, t6 );
    s3 = CL_mul_j( s3 );

    t0 = CL_add( t3, t7 );
    t1 = CL_sub( t3, t7 );

    /* C81/C82 are +/-1/sqrt(2) (see constant definitions above). */
    s6 = CL_scale_t( CL_msu_j( t1, t0 ), C81 );
    s7 = CL_dscale_t( CL_swap_real_imag( CL_msu_j( t0, t1 ) ), C81, C82 );

    /* Post-additions */

    inp[0] = CL_add( s0, s1 );
    inp[4] = CL_sub( s0, s1 );

    inp[2] = CL_sub( s2, s3 );
    inp[6] = CL_add( s2, s3 );

    inp[3] = CL_add( s4, s7 );
    inp[7] = CL_sub( s4, s7 );

    inp[1] = CL_add( s5, s6 );
    inp[5] = CL_sub( s5, s6 );
#ifdef WMOPS
    multiCounter[currCounter].CL_move += 8;
#endif
}
231 :
232 :
/**
 * \brief Function performs a complex 10-point FFT
 *        The FFT is performed inplace. The result of the FFT
 *        is scaled by SCALEFACTOR10 bits.
 *
 *        Implemented as two 5-point FFT kernels over permuted inputs,
 *        followed by five radix-2 combine stages (10 = 2 x 5 split).
 *
 * WOPS with 32x16 bit multiplications: 196 cycles
 *
 * \param [i/o] inp_data  pointer to 10 complex values; input at Qx,
 *                        output scaled down by SCALEFACTOR10 bits
 *
 * \return void
 */

static void fft10_with_cmplx_data( cmplx *inp_data /*Qx*/ )
{
    cmplx r1, r2, r3, r4;
    cmplx x0, x1, x2, x3, x4, t;
    cmplx y[10]; /* intermediate results of the two FFT5 kernels */

    /* First FFT5 kernel, even input indices. FOR i=0 */
    {
        x0 = CL_shr( inp_data[0], SCALEFACTOR10 ); // Qx - 5
        x1 = CL_shr( inp_data[2], SCALEFACTOR10 );
        x2 = CL_shr( inp_data[4], SCALEFACTOR10 );
        x3 = CL_shr( inp_data[6], SCALEFACTOR10 );
        x4 = CL_shr( inp_data[8], SCALEFACTOR10 );

        r1 = CL_add( x3, x2 );
        r4 = CL_sub( x3, x2 );
        r3 = CL_add( x1, x4 );
        r2 = CL_sub( x1, x4 );
        t = CL_scale_t( CL_sub( r1, r3 ), C54 );
        r1 = CL_add( r1, r3 );
        y[0] = CL_add( x0, r1 );
        /* Shift left compensates the 0.5 scaling of the stored fract constants C55/C52. */
        r1 = CL_add( y[0], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
        r3 = CL_sub( r1, t );
        r1 = CL_add( r1, t );
        t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
        r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
        r2 = CL_add( t, CL_scale_t( r2, C53 ) );


        y[2] = CL_msu_j( r1, r2 );
        y[8] = CL_mac_j( r1, r2 );
        y[4] = CL_mac_j( r3, r4 );
        y[6] = CL_msu_j( r3, r4 );
    }
    /* Second FFT5 kernel, odd input indices. FOR i=1 */
    {
        x0 = CL_shr( inp_data[5], SCALEFACTOR10 ); // Qx - 5
        x1 = CL_shr( inp_data[1], SCALEFACTOR10 );
        x2 = CL_shr( inp_data[3], SCALEFACTOR10 );
        x3 = CL_shr( inp_data[7], SCALEFACTOR10 );
        x4 = CL_shr( inp_data[9], SCALEFACTOR10 );

        r1 = CL_add( x1, x4 );
        r4 = CL_sub( x1, x4 );
        r3 = CL_add( x3, x2 );
        r2 = CL_sub( x3, x2 );
        t = CL_scale_t( CL_sub( r1, r3 ), C54 );
        r1 = CL_add( r1, r3 );
        y[1] = CL_add( x0, r1 );
        r1 = CL_add( y[1], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
        r3 = CL_sub( r1, t );
        r1 = CL_add( r1, t );
        t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
        r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
        r2 = CL_add( t, CL_scale_t( r2, C53 ) );


        y[3] = CL_msu_j( r1, r2 );
        y[9] = CL_mac_j( r1, r2 );
        y[5] = CL_mac_j( r3, r4 );
        y[7] = CL_msu_j( r3, r4 );
    }

    /* Radix-2 combines; output placement follows the 2x5 output permutation. */
    /* FOR i=0 */
    {
        inp_data[0] = CL_add( y[0], y[1] );
        inp_data[5] = CL_sub( y[0], y[1] );
    }
    /* FOR i=2 */
    {
        inp_data[2] = CL_add( y[2], y[3] );
        inp_data[7] = CL_sub( y[2], y[3] );
    }
    /* FOR i=4 */
    {
        inp_data[4] = CL_add( y[4], y[5] );
        inp_data[9] = CL_sub( y[4], y[5] );
    }
    /* FOR i=6 */
    {
        inp_data[6] = CL_add( y[6], y[7] );
        inp_data[1] = CL_sub( y[6], y[7] );
    }
    /* FOR i=8 */
    {
        inp_data[8] = CL_add( y[8], y[9] );
        inp_data[3] = CL_sub( y[8], y[9] );
    }

#ifdef WMOPS
    multiCounter[currCounter].CL_move += 10;
#endif
}
340 :
341 :
/**
 * \brief Function performs a complex 15-point FFT
 *        The FFT is performed inplace. The result of the FFT
 *        is scaled by SCALEFACTOR15 bits.
 *
 *        Implemented as three 5-point FFT kernels over permuted inputs
 *        (c0..c4, c5..c9, c10..c14), followed by five 3-point combine
 *        stages (15 = 3 x 5 split).
 *
 * WOPS with 32x16 bit multiplications: 354 cycles
 *
 * \param [i/o] inp_data  pointer to 15 complex values; input at Qx,
 *                        output scaled down by SCALEFACTOR15 bits
 *
 * \return void
 */

static void fft15_with_cmplx_data( cmplx *inp_data /*Qx*/ )
{
    cmplx c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14;  /* permuted, pre-scaled inputs */
    cmplx c_z0, c_z1, c_z2, c_z3, c_z4, c_z5, c_z6, c_z7, c_z8, c_z9, c_z10, c_z11, c_z12, c_z13, c_z14; /* FFT5 outputs */
    cmplx c_y1, c_y2, c_y3, c_y4;
    cmplx c_t;

    /* Load inputs in the 3x5 input permutation and scale down for headroom. */
    c0 = CL_shr( inp_data[0], SCALEFACTOR15 ); // Qx - 5
    c1 = CL_shr( inp_data[3], SCALEFACTOR15 );
    c2 = CL_shr( inp_data[6], SCALEFACTOR15 );
    c3 = CL_shr( inp_data[9], SCALEFACTOR15 );
    c4 = CL_shr( inp_data[12], SCALEFACTOR15 );
    c5 = CL_shr( inp_data[5], SCALEFACTOR15 );
    c6 = CL_shr( inp_data[8], SCALEFACTOR15 );
    c7 = CL_shr( inp_data[11], SCALEFACTOR15 );
    c8 = CL_shr( inp_data[14], SCALEFACTOR15 );
    c9 = CL_shr( inp_data[2], SCALEFACTOR15 );
    c10 = CL_shr( inp_data[10], SCALEFACTOR15 );
    c11 = CL_shr( inp_data[13], SCALEFACTOR15 );
    c12 = CL_shr( inp_data[1], SCALEFACTOR15 );
    c13 = CL_shr( inp_data[4], SCALEFACTOR15 );
    c14 = CL_shr( inp_data[7], SCALEFACTOR15 );

    /* 1. FFT5 stage */
    c_y1 = CL_add( c1, c4 );
    c_y4 = CL_sub( c1, c4 );
    c_y3 = CL_add( c2, c3 );
    c_y2 = CL_sub( c2, c3 );
    c_t = CL_scale_t( CL_sub( c_y1, c_y3 ), C54 );
    c_y1 = CL_add( c_y1, c_y3 );
    c_z0 = CL_add( c0, c_y1 );
    /* Shift left compensates the 0.5 scaling of the stored fract constants C55/C52. */
    c_y1 = CL_add( c_z0, ( CL_shl( CL_scale_t( c_y1, C55 ), 1 ) ) );
    c_y3 = CL_sub( c_y1, c_t );
    c_y1 = CL_add( c_y1, c_t );
    c_t = CL_scale_t( CL_add( c_y4, c_y2 ), C51 );
    c_y4 = CL_add( c_t, CL_shl( CL_scale_t( c_y4, C52 ), 1 ) );
    c_y2 = CL_add( c_t, CL_scale_t( c_y2, C53 ) );

    /* combination */
    c_z1 = CL_msu_j( c_y1, c_y2 );
    c_z2 = CL_mac_j( c_y3, c_y4 );
    c_z3 = CL_msu_j( c_y3, c_y4 );
    c_z4 = CL_mac_j( c_y1, c_y2 );


    /* 2. FFT5 stage */
    c_y1 = CL_add( c6, c9 );
    c_y4 = CL_sub( c6, c9 );
    c_y3 = CL_add( c7, c8 );
    c_y2 = CL_sub( c7, c8 );
    c_t = CL_scale_t( CL_sub( c_y1, c_y3 ), C54 );
    c_y1 = CL_add( c_y1, c_y3 );
    c_z5 = CL_add( c5, c_y1 );
    c_y1 = CL_add( c_z5, ( CL_shl( CL_scale_t( c_y1, C55 ), 1 ) ) );
    c_y3 = CL_sub( c_y1, c_t );
    c_y1 = CL_add( c_y1, c_t );
    c_t = CL_scale_t( CL_add( c_y4, c_y2 ), C51 );
    c_y4 = CL_add( c_t, CL_shl( CL_scale_t( c_y4, C52 ), 1 ) );
    c_y2 = CL_add( c_t, CL_scale_t( c_y2, C53 ) );
    /* combination */
    c_z6 = CL_msu_j( c_y1, c_y2 );
    c_z7 = CL_mac_j( c_y3, c_y4 );
    c_z8 = CL_msu_j( c_y3, c_y4 );
    c_z9 = CL_mac_j( c_y1, c_y2 );


    /* 3. FFT5 stage */

    c_y1 = CL_add( c11, c14 );
    c_y4 = CL_sub( c11, c14 );
    c_y3 = CL_add( c12, c13 );
    c_y2 = CL_sub( c12, c13 );
    c_t = CL_scale_t( CL_sub( c_y1, c_y3 ), C54 );
    c_y1 = CL_add( c_y1, c_y3 );
    c_z10 = CL_add( c10, c_y1 );
    c_y1 = CL_add( c_z10, ( CL_shl( CL_scale_t( c_y1, C55 ), 1 ) ) );
    c_y3 = CL_sub( c_y1, c_t );
    c_y1 = CL_add( c_y1, c_t );
    c_t = CL_scale_t( CL_add( c_y4, c_y2 ), C51 );
    c_y4 = CL_add( c_t, CL_shl( CL_scale_t( c_y4, C52 ), 1 ) );
    c_y2 = CL_add( c_t, CL_scale_t( c_y2, C53 ) );
    /* combination */
    c_z11 = CL_msu_j( c_y1, c_y2 );
    c_z12 = CL_mac_j( c_y3, c_y4 );
    c_z13 = CL_msu_j( c_y3, c_y4 );
    c_z14 = CL_mac_j( c_y1, c_y2 );


    /* Five FFT3 combine stages; C31 is -sqrt(3)/2 (see constant definitions). */
    /* 1. FFT3 stage */
    c_y1 = CL_add( c_z5, c_z10 );
    c_y2 = CL_scale_t( CL_sub( c_z5, c_z10 ), C31 );
    inp_data[0] = CL_add( c_z0, c_y1 );
    c_y1 = CL_sub( c_z0, CL_shr( c_y1, 1 ) );
    inp_data[10] = CL_mac_j( c_y1, c_y2 );
    inp_data[5] = CL_msu_j( c_y1, c_y2 );

    /* 2. FFT3 stage */
    c_y1 = CL_add( c_z6, c_z11 );
    c_y2 = CL_scale_t( CL_sub( c_z6, c_z11 ), C31 );
    inp_data[6] = CL_add( c_z1, c_y1 );
    c_y1 = CL_sub( c_z1, CL_shr( c_y1, 1 ) );
    inp_data[1] = CL_mac_j( c_y1, c_y2 );
    inp_data[11] = CL_msu_j( c_y1, c_y2 );

    /* 3. FFT3 stage */
    c_y1 = CL_add( c_z7, c_z12 );
    c_y2 = CL_scale_t( CL_sub( c_z7, c_z12 ), C31 );
    inp_data[12] = CL_add( c_z2, c_y1 );
    c_y1 = CL_sub( c_z2, CL_shr( c_y1, 1 ) );
    inp_data[7] = CL_mac_j( c_y1, c_y2 );
    inp_data[2] = CL_msu_j( c_y1, c_y2 );


    /* 4. FFT3 stage */
    c_y1 = CL_add( c_z8, c_z13 );
    c_y2 = CL_scale_t( CL_sub( c_z8, c_z13 ), C31 );
    inp_data[3] = CL_add( c_z3, c_y1 );
    c_y1 = CL_sub( c_z3, CL_shr( c_y1, 1 ) );
    inp_data[13] = CL_mac_j( c_y1, c_y2 );
    inp_data[8] = CL_msu_j( c_y1, c_y2 );


    /* 5. FFT3 stage */
    c_y1 = CL_add( c_z9, c_z14 );
    c_y2 = CL_scale_t( CL_sub( c_z9, c_z14 ), C31 );
    inp_data[9] = CL_add( c_z4, c_y1 );
    c_y1 = CL_sub( c_z4, CL_shr( c_y1, 1 ) );
    inp_data[4] = CL_mac_j( c_y1, c_y2 );
    inp_data[14] = CL_msu_j( c_y1, c_y2 );

#ifdef WMOPS
    multiCounter[currCounter].CL_move += 15;
#endif
}
490 :
491 :
492 : /**
493 : * \brief Function performs a complex 16-point FFT
494 : * The FFT is performed inplace. The result of the FFT
495 : * is scaled by SCALEFACTOR16 bits.
496 : *
497 : * WOPS with 32x16 bit multiplications (scale on ): 288 cycles
498 : * WOPS with 32x16 bit multiplications (scale off): 256 cycles
499 : *
500 : * \param [i/o] re real input / output Qx
501 : * \param [i/o] im imag input / output Qx
502 : * \param [i ] s stride real and imag input / output
503 : *
504 : * \return void
505 : */
506 0 : void fft16( Word32 *re, Word32 *im, Word16 s, Word16 bScale )
507 : {
508 : Word16 i;
509 0 : if ( s == 2 )
510 : {
511 0 : fft16_with_cmplx_data( (cmplx *) re, bScale );
512 : }
513 : else
514 : {
515 : cmplx inp_data[16];
516 0 : FOR( i = 0; i < 16; i++ )
517 : {
518 0 : inp_data[i] = CL_form( re[s * i], im[s * i] );
519 0 : move64();
520 : }
521 0 : fft16_with_cmplx_data( inp_data, bScale );
522 0 : FOR( i = 0; i < 16; i++ )
523 : {
524 0 : re[s * i] = CL_Extract_real( inp_data[i] );
525 0 : move32();
526 0 : im[s * i] = CL_Extract_imag( inp_data[i] );
527 0 : move32();
528 : }
529 : }
530 0 : }
531 :
/**
 * \brief Complex 16-point FFT performed in place on interleaved cmplx data.
 *
 *        Structure: first radix-4 pass over the four interleaved columns,
 *        twiddle-factor rotations (constants C161..C166), then a second
 *        radix-4 pass producing the outputs. When bScale is non-zero the
 *        inputs are scaled down by SCALEFACTOR16 bits in the first pass;
 *        otherwise the data is used unscaled.
 *
 * \param [i/o] input   pointer to 16 complex values (Qx)
 * \param [i  ] bScale  non-zero: pre-scale inputs by SCALEFACTOR16 bits
 *
 * \return void
 */
void fft16_with_cmplx_data( cmplx *input /*Qx*/, Word16 bScale )
{
    cmplx x0, x1, x2, x3, temp;
    cmplx t0, t2, t4, t6, t7;
    cmplx y[16]; /* results of the first radix-4 pass */

    IF( bScale )
    {
        /* First radix-4 pass with input scaling for headroom. */
        {
            x0 = CL_shr( input[0], SCALEFACTOR16 ); // Qx - 5
            x1 = CL_shr( input[4], SCALEFACTOR16 );
            x2 = CL_shr( input[8], SCALEFACTOR16 );
            x3 = CL_shr( input[12], SCALEFACTOR16 );
            t0 = CL_add( x0, x2 );
            t2 = CL_sub( x0, x2 );
            t4 = CL_add( x1, x3 );
            t6 = CL_sub( x1, x3 );
            t6 = CL_mul_j( t6 );
            y[0] = CL_add( t0, t4 );
            y[1] = CL_sub( t2, t6 );
            y[2] = CL_sub( t0, t4 );
            y[3] = CL_add( t2, t6 );


            x0 = CL_shr( input[1], SCALEFACTOR16 ); // Qx - 5
            x1 = CL_shr( input[5], SCALEFACTOR16 );
            x2 = CL_shr( input[9], SCALEFACTOR16 );
            x3 = CL_shr( input[13], SCALEFACTOR16 );
            t0 = CL_add( x0, x2 );
            t2 = CL_sub( x0, x2 );
            t4 = CL_add( x1, x3 );
            t6 = CL_sub( x1, x3 );
            t6 = CL_mul_j( t6 );
            y[4] = CL_add( t0, t4 );
            y[5] = CL_sub( t2, t6 );
            y[6] = CL_sub( t0, t4 );
            y[7] = CL_add( t2, t6 );


            x0 = CL_shr( input[2], SCALEFACTOR16 ); // Qx - 5
            x1 = CL_shr( input[6], SCALEFACTOR16 );
            x2 = CL_shr( input[10], SCALEFACTOR16 );
            x3 = CL_shr( input[14], SCALEFACTOR16 );
            t0 = CL_add( x0, x2 );
            t2 = CL_sub( x0, x2 );
            t4 = CL_add( x1, x3 );
            t6 = CL_sub( x1, x3 );
            t6 = CL_mul_j( t6 );
            y[8] = CL_add( t0, t4 );
            y[9] = CL_sub( t2, t6 );
            /* j*(t4 - t0): same value as -j*(t0 - t4) in the unscaled branch below */
            y[10] = CL_sub( t4, t0 );
            y[10] = CL_mul_j( y[10] );
            y[11] = CL_add( t2, t6 );


            x0 = CL_shr( input[3], SCALEFACTOR16 ); // Qx - 5
            x1 = CL_shr( input[7], SCALEFACTOR16 );
            x2 = CL_shr( input[11], SCALEFACTOR16 );
            x3 = CL_shr( input[15], SCALEFACTOR16 );
            t0 = CL_add( x0, x2 );
            t2 = CL_sub( x0, x2 );
            t4 = CL_add( x1, x3 );
            t6 = CL_sub( x1, x3 );
            t6 = CL_mul_j( t6 );
            y[12] = CL_add( t0, t4 );
            y[13] = CL_sub( t2, t6 );
            y[14] = CL_sub( t0, t4 );
            y[15] = CL_add( t2, t6 );
        }
    }
    ELSE
    {
        /* First radix-4 pass without input scaling. i=0 */
        {
            t0 = CL_add( input[0], input[8] );
            t2 = CL_sub( input[0], input[8] );
            t4 = CL_add( input[4], input[12] );
            t7 = CL_sub( input[4], input[12] );

            y[0] = CL_add( t0, t4 );
            y[1] = CL_msu_j( t2, t7 );
            y[2] = CL_sub( t0, t4 );
            y[3] = CL_mac_j( t2, t7 );
        }
        /* i=1 */
        {
            t0 = CL_add( input[1], input[9] );
            t2 = CL_sub( input[1], input[9] );
            t4 = CL_add( input[5], input[13] );
            t7 = CL_sub( input[5], input[13] );

            y[4] = CL_add( t0, t4 );
            y[5] = CL_msu_j( t2, t7 );
            y[6] = CL_sub( t0, t4 );
            y[7] = CL_mac_j( t2, t7 );
        }
        /* i=2 */
        {
            t0 = CL_add( input[2], input[10] );
            t2 = CL_sub( input[2], input[10] );
            t4 = CL_add( input[6], input[14] );
            t7 = CL_sub( input[6], input[14] );

            y[8] = CL_add( t0, t4 );
            y[9] = CL_msu_j( t2, t7 );
            temp = CL_sub( t0, t4 );
            y[10] = CL_negate( CL_mul_j( temp ) );
            y[11] = CL_mac_j( t2, t7 );
        }
        /* i=3 */
        {
            t0 = CL_add( input[3], input[11] );
            t2 = CL_sub( input[3], input[11] );
            t4 = CL_add( input[7], input[15] );
            t7 = CL_sub( input[7], input[15] );

            y[12] = CL_add( t0, t4 );
            y[13] = CL_msu_j( t2, t7 );
            y[14] = CL_sub( t0, t4 );
            y[15] = CL_mac_j( t2, t7 );
        }
    }

    /* Twiddle-factor rotations between the two passes.
       C161/C162 are +/-1/sqrt(2); C163..C166 are +/-cos(pi/8), +/-cos(3*pi/8)
       (see constant definitions above). */
    x0 = CL_scale_t( y[11], C162 );
    y[11] = CL_mac_j( x0, x0 );

    x0 = CL_scale_t( y[14], C162 );
    y[14] = CL_mac_j( x0, x0 );

    x0 = CL_scale_t( y[6], C161 );
    y[6] = CL_msu_j( x0, x0 );

    x0 = CL_scale_t( y[9], C161 );
    y[9] = CL_msu_j( x0, x0 );

    y[5] = CL_mac_j( CL_scale_t( y[5], C163 ), CL_scale_t( y[5], C166 ) );
    y[7] = CL_mac_j( CL_scale_t( y[7], C165 ), CL_scale_t( y[7], C164 ) );
    y[13] = CL_mac_j( CL_scale_t( y[13], C165 ), CL_scale_t( y[13], C164 ) );
    y[15] = CL_mac_j( CL_scale_t( y[15], C164 ), CL_scale_t( y[15], C165 ) );


    /* Second radix-4 pass, writes the final outputs back to input[]. i=0 */
    {
        t0 = CL_add( y[0], y[8] );
        t2 = CL_sub( y[0], y[8] );
        t4 = CL_add( y[4], y[12] );
        t7 = CL_sub( y[4], y[12] );

        input[0] = CL_add( t0, t4 );
        input[4] = CL_msu_j( t2, t7 );
        input[8] = CL_sub( t0, t4 );
        input[12] = CL_mac_j( t2, t7 );
    }
    /* i=1 */
    {
        t0 = CL_add( y[1], y[9] );
        t2 = CL_sub( y[1], y[9] );
        t4 = CL_add( y[5], y[13] );
        t7 = CL_sub( y[5], y[13] );

        input[1] = CL_add( t0, t4 );
        input[5] = CL_msu_j( t2, t7 );
        input[9] = CL_sub( t0, t4 );
        input[13] = CL_mac_j( t2, t7 );
    }
    /* i=2 */
    {
        t0 = CL_add( y[2], y[10] );
        t2 = CL_sub( y[2], y[10] );
        t4 = CL_add( y[6], y[14] );
        t7 = CL_sub( y[6], y[14] );

        input[2] = CL_add( t0, t4 );
        input[6] = CL_msu_j( t2, t7 );
        input[10] = CL_sub( t0, t4 );
        input[14] = CL_mac_j( t2, t7 );
    }
    /* i=3 */
    {
        t0 = CL_add( y[3], y[11] );
        t2 = CL_sub( y[3], y[11] );
        t4 = CL_add( y[7], y[15] );
        t7 = CL_sub( y[7], y[15] );

        input[3] = CL_add( t0, t4 );
        input[7] = CL_msu_j( t2, t7 );
        input[11] = CL_sub( t0, t4 );
        input[15] = CL_mac_j( t2, t7 );
    }
#ifdef WMOPS
    multiCounter[currCounter].CL_move += 16;
#endif
}
724 :
725 :
/**
 * \brief Function performs a complex 20-point FFT
 *        The FFT is performed inplace. The result of the FFT
 *        is scaled by SCALEFACTOR20 bits.
 *
 *        Implemented as four 5-point FFT kernels over permuted inputs,
 *        followed by five radix-4 combine stages (20 = 4 x 5 split).
 *        The pointers y0..y4 place the FFT5 outputs into y[] in the
 *        order consumed by the combine stages.
 *
 * WOPS with 32x16 bit multiplications: 432 cycles
 *
 * \param [i/o] inp_data  pointer to 20 complex values; input at Qx,
 *                        output scaled down by SCALEFACTOR20 bits
 *
 * \return void
 */
static void fft20_with_cmplx_data( cmplx *inp_data /*Qx*/ )
{
    cmplx r1, r2, r3, r4;
    cmplx x0, x1, x2, x3, x4;
    cmplx t, t0, t1, t2, t3;
    cmplx y[20];                 /* intermediate results of the four FFT5 kernels */
    cmplx *y0, *y1, *y2, *y3, *y4;

    /* Output slots of the FFT5 kernels inside y[] (digit-reversed placement). */
    y0 = y;
    y1 = &y[4];
    y2 = &y[16];
    y3 = &y[8];
    y4 = &y[12];

    /* 1. FFT5 kernel */
    {
        x0 = CL_shr( inp_data[0], SCALEFACTOR20 ); // Qx - 5
        x1 = CL_shr( inp_data[16], SCALEFACTOR20 );
        x2 = CL_shr( inp_data[12], SCALEFACTOR20 );
        x3 = CL_shr( inp_data[8], SCALEFACTOR20 );
        x4 = CL_shr( inp_data[4], SCALEFACTOR20 );

        r4 = CL_sub( x1, x4 );
        r2 = CL_sub( x2, x3 );
        r1 = CL_add( x1, x4 );
        r3 = CL_add( x2, x3 );
        t = CL_scale_t( CL_sub( r1, r3 ), C54 );
        r1 = CL_add( r1, r3 );
        y0[0] = CL_add( x0, r1 );
        /* Shift left compensates the 0.5 scaling of the stored fract constants C55/C52. */
        r1 = CL_add( y0[0], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
        r3 = CL_sub( r1, t );
        r1 = CL_add( r1, t );
        t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
        r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
        r2 = CL_add( t, CL_scale_t( r2, C53 ) );


        y1[0] = CL_msu_j( r1, r2 );
        y2[0] = CL_mac_j( r1, r2 );
        y3[0] = CL_mac_j( r3, r4 );
        y4[0] = CL_msu_j( r3, r4 );
    }
    /* 2. FFT5 kernel */
    {
        x0 = CL_shr( inp_data[5], SCALEFACTOR20 ); // Qx - 5
        x1 = CL_shr( inp_data[1], SCALEFACTOR20 );
        x2 = CL_shr( inp_data[17], SCALEFACTOR20 );
        x3 = CL_shr( inp_data[13], SCALEFACTOR20 );
        x4 = CL_shr( inp_data[9], SCALEFACTOR20 );

        r4 = CL_sub( x1, x4 );
        r2 = CL_sub( x2, x3 );
        r1 = CL_add( x1, x4 );
        r3 = CL_add( x2, x3 );
        t = CL_scale_t( CL_sub( r1, r3 ), C54 );
        r1 = CL_add( r1, r3 );
        y0[1] = CL_add( x0, r1 );
        r1 = CL_add( y0[1], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
        r3 = CL_sub( r1, t );
        r1 = CL_add( r1, t );
        t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
        r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
        r2 = CL_add( t, CL_scale_t( r2, C53 ) );


        y1[1] = CL_msu_j( r1, r2 );
        y2[1] = CL_mac_j( r1, r2 );
        y3[1] = CL_mac_j( r3, r4 );
        y4[1] = CL_msu_j( r3, r4 );
    }
    /* 3. FFT5 kernel */
    {
        x0 = CL_shr( inp_data[10], SCALEFACTOR20 ); // Qx - 5
        x1 = CL_shr( inp_data[6], SCALEFACTOR20 );
        x2 = CL_shr( inp_data[2], SCALEFACTOR20 );
        x3 = CL_shr( inp_data[18], SCALEFACTOR20 );
        x4 = CL_shr( inp_data[14], SCALEFACTOR20 );

        r4 = CL_sub( x1, x4 );
        r2 = CL_sub( x2, x3 );
        r1 = CL_add( x1, x4 );
        r3 = CL_add( x2, x3 );
        t = CL_scale_t( CL_sub( r1, r3 ), C54 );
        r1 = CL_add( r1, r3 );
        y0[2] = CL_add( x0, r1 );
        r1 = CL_add( y0[2], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
        r3 = CL_sub( r1, t );
        r1 = CL_add( r1, t );
        t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
        r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
        r2 = CL_add( t, CL_scale_t( r2, C53 ) );


        y1[2] = CL_msu_j( r1, r2 );
        y2[2] = CL_mac_j( r1, r2 );
        y3[2] = CL_mac_j( r3, r4 );
        y4[2] = CL_msu_j( r3, r4 );
    }
    /* 4. FFT5 kernel */
    {
        x0 = CL_shr( inp_data[15], SCALEFACTOR20 ); // Qx - 5
        x1 = CL_shr( inp_data[11], SCALEFACTOR20 );
        x2 = CL_shr( inp_data[7], SCALEFACTOR20 );
        x3 = CL_shr( inp_data[3], SCALEFACTOR20 );
        x4 = CL_shr( inp_data[19], SCALEFACTOR20 );

        r4 = CL_sub( x1, x4 );
        r2 = CL_sub( x2, x3 );
        r1 = CL_add( x1, x4 );
        r3 = CL_add( x2, x3 );
        t = CL_scale_t( CL_sub( r1, r3 ), C54 );
        r1 = CL_add( r1, r3 );
        y0[3] = CL_add( x0, r1 );
        r1 = CL_add( y0[3], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
        r3 = CL_sub( r1, t );
        r1 = CL_add( r1, t );
        t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
        r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
        r2 = CL_add( t, CL_scale_t( r2, C53 ) );


        y1[3] = CL_msu_j( r1, r2 );
        y2[3] = CL_mac_j( r1, r2 );
        y3[3] = CL_mac_j( r3, r4 );
        y4[3] = CL_msu_j( r3, r4 );
    }

    /* Five radix-4 combine stages; outputs are written back permuted. */
    {
        cmplx *ptr_y = y;
        {
            cmplx Cy0, Cy1, Cy2, Cy3;

            Cy0 = *ptr_y++;
            Cy1 = *ptr_y++;
            Cy2 = *ptr_y++;
            Cy3 = *ptr_y++;

            /* Pre-additions */
            t0 = CL_add( Cy0, Cy2 );
            t1 = CL_sub( Cy0, Cy2 );
            t2 = CL_add( Cy1, Cy3 );
            t3 = CL_sub( Cy1, Cy3 );


            inp_data[0] = CL_add( t0, t2 );
            inp_data[5] = CL_msu_j( t1, t3 );
            inp_data[10] = CL_sub( t0, t2 );
            inp_data[15] = CL_mac_j( t1, t3 );
        }

        {
            cmplx Cy0, Cy1, Cy2, Cy3;

            Cy0 = *ptr_y++;
            Cy1 = *ptr_y++;
            Cy2 = *ptr_y++;
            Cy3 = *ptr_y++;

            /* Pre-additions */
            t0 = CL_add( Cy0, Cy2 );
            t1 = CL_sub( Cy0, Cy2 );
            t2 = CL_add( Cy1, Cy3 );
            t3 = CL_sub( Cy1, Cy3 );


            inp_data[4] = CL_add( t0, t2 );
            inp_data[9] = CL_msu_j( t1, t3 );
            inp_data[14] = CL_sub( t0, t2 );
            inp_data[19] = CL_mac_j( t1, t3 );
        }

        {
            cmplx Cy0, Cy1, Cy2, Cy3;

            Cy0 = *ptr_y++;
            Cy1 = *ptr_y++;
            Cy2 = *ptr_y++;
            Cy3 = *ptr_y++;

            /* Pre-additions */
            t0 = CL_add( Cy0, Cy2 );
            t1 = CL_sub( Cy0, Cy2 );
            t2 = CL_add( Cy1, Cy3 );
            t3 = CL_sub( Cy1, Cy3 );


            inp_data[8] = CL_add( t0, t2 );
            inp_data[13] = CL_msu_j( t1, t3 );
            inp_data[18] = CL_sub( t0, t2 );
            inp_data[3] = CL_mac_j( t1, t3 );
        }

        {
            cmplx Cy0, Cy1, Cy2, Cy3;

            Cy0 = *ptr_y++;
            Cy1 = *ptr_y++;
            Cy2 = *ptr_y++;
            Cy3 = *ptr_y++;

            /* Pre-additions */
            t0 = CL_add( Cy0, Cy2 );
            t1 = CL_sub( Cy0, Cy2 );
            t2 = CL_add( Cy1, Cy3 );
            t3 = CL_sub( Cy1, Cy3 );

            inp_data[12] = CL_add( t0, t2 );
            inp_data[17] = CL_msu_j( t1, t3 );
            inp_data[2] = CL_sub( t0, t2 );
            inp_data[7] = CL_mac_j( t1, t3 );
        }

        {
            cmplx Cy0, Cy1, Cy2, Cy3;

            Cy0 = *ptr_y++;
            Cy1 = *ptr_y++;
            Cy2 = *ptr_y++;
            Cy3 = *ptr_y++;

            /* Pre-additions */
            t0 = CL_add( Cy0, Cy2 );
            t1 = CL_sub( Cy0, Cy2 );
            t2 = CL_add( Cy1, Cy3 );
            t3 = CL_sub( Cy1, Cy3 );


            inp_data[16] = CL_add( t0, t2 );
            inp_data[1] = CL_msu_j( t1, t3 );
            inp_data[6] = CL_sub( t0, t2 );
            inp_data[11] = CL_mac_j( t1, t3 );
        }
    }
#ifdef WMOPS
    multiCounter[currCounter].CL_move += 20;
#endif
}
972 :
973 :
974 : /**
975 : * \brief Function performs a complex 30-point FFT
976 : * The FFT is performed inplace. The result of the FFT
977 : * is scaled by SCALEFACTOR30 bits.
978 : *
979 : * WOPS with 32x16 bit multiplications: 828 cycles
980 : *
 * \param [i/o] inp pointer to 30 complex-valued input / output samples (in-place)
984 : *
985 : * \return void
986 : */
987 :
988 19853168 : static void fft30_with_cmplx_data( cmplx *inp /*Qx*/ )
989 : {
990 19853168 : cmplx *l = &inp[0];
991 19853168 : cmplx *h = &inp[15];
992 :
993 : cmplx z[30], y[15], x[15], rs1, rs2, rs3, rs4, t;
994 :
995 : /* 1. FFT15 stage */
996 :
997 19853168 : x[0] = CL_shr( inp[0], SCALEFACTOR30_1 ); // Qx - 5
998 19853168 : x[1] = CL_shr( inp[18], SCALEFACTOR30_1 );
999 19853168 : x[2] = CL_shr( inp[6], SCALEFACTOR30_1 );
1000 19853168 : x[3] = CL_shr( inp[24], SCALEFACTOR30_1 );
1001 19853168 : x[4] = CL_shr( inp[12], SCALEFACTOR30_1 );
1002 :
1003 19853168 : x[5] = CL_shr( inp[20], SCALEFACTOR30_1 ); // Qx - 5
1004 19853168 : x[6] = CL_shr( inp[8], SCALEFACTOR30_1 );
1005 19853168 : x[7] = CL_shr( inp[26], SCALEFACTOR30_1 );
1006 19853168 : x[8] = CL_shr( inp[14], SCALEFACTOR30_1 );
1007 19853168 : x[9] = CL_shr( inp[2], SCALEFACTOR30_1 );
1008 :
1009 19853168 : x[10] = CL_shr( inp[10], SCALEFACTOR30_1 ); // Qx - 5
1010 19853168 : x[11] = CL_shr( inp[28], SCALEFACTOR30_1 );
1011 19853168 : x[12] = CL_shr( inp[16], SCALEFACTOR30_1 );
1012 19853168 : x[13] = CL_shr( inp[4], SCALEFACTOR30_1 );
1013 19853168 : x[14] = CL_shr( inp[22], SCALEFACTOR30_1 );
1014 :
1015 :
1016 : /* 1. FFT5 stage */
1017 19853168 : rs1 = CL_add( x[1], x[4] );
1018 19853168 : rs4 = CL_sub( x[1], x[4] );
1019 19853168 : rs3 = CL_add( x[2], x[3] );
1020 19853168 : rs2 = CL_sub( x[2], x[3] );
1021 19853168 : t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
1022 19853168 : rs1 = CL_add( rs1, rs3 );
1023 19853168 : y[0] = CL_add( x[0], rs1 );
1024 19853168 : rs1 = CL_add( y[0], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
1025 19853168 : rs3 = CL_sub( rs1, t );
1026 19853168 : rs1 = CL_add( rs1, t );
1027 19853168 : t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
1028 19853168 : rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
1029 19853168 : rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
1030 :
1031 : /* combination */
1032 19853168 : y[1] = CL_msu_j( rs1, rs2 );
1033 19853168 : y[4] = CL_mac_j( rs1, rs2 );
1034 19853168 : y[2] = CL_mac_j( rs3, rs4 );
1035 19853168 : y[3] = CL_msu_j( rs3, rs4 );
1036 :
1037 :
1038 : /* 2. FFT5 stage */
1039 19853168 : rs1 = CL_add( x[6], x[9] );
1040 19853168 : rs4 = CL_sub( x[6], x[9] );
1041 19853168 : rs3 = CL_add( x[7], x[8] );
1042 19853168 : rs2 = CL_sub( x[7], x[8] );
1043 19853168 : t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
1044 19853168 : rs1 = CL_add( rs1, rs3 );
1045 19853168 : y[5] = CL_add( x[5], rs1 );
1046 19853168 : rs1 = CL_add( y[5], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
1047 19853168 : rs3 = CL_sub( rs1, t );
1048 19853168 : rs1 = CL_add( rs1, t );
1049 19853168 : t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
1050 19853168 : rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
1051 19853168 : rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
1052 :
1053 : /* combination */
1054 19853168 : y[6] = CL_msu_j( rs1, rs2 );
1055 19853168 : y[9] = CL_mac_j( rs1, rs2 );
1056 19853168 : y[7] = CL_mac_j( rs3, rs4 );
1057 19853168 : y[8] = CL_msu_j( rs3, rs4 );
1058 :
1059 :
1060 : /* 3. FFT5 stage */
1061 19853168 : rs1 = CL_add( x[11], x[14] );
1062 19853168 : rs4 = CL_sub( x[11], x[14] );
1063 19853168 : rs3 = CL_add( x[12], x[13] );
1064 19853168 : rs2 = CL_sub( x[12], x[13] );
1065 19853168 : t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
1066 19853168 : rs1 = CL_add( rs1, rs3 );
1067 19853168 : y[10] = CL_add( x[10], rs1 );
1068 19853168 : rs1 = CL_add( y[10], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
1069 19853168 : rs3 = CL_sub( rs1, t );
1070 19853168 : rs1 = CL_add( rs1, t );
1071 19853168 : t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
1072 19853168 : rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
1073 19853168 : rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
1074 :
1075 : /* combination */
1076 19853168 : y[11] = CL_msu_j( rs1, rs2 );
1077 19853168 : y[14] = CL_mac_j( rs1, rs2 );
1078 19853168 : y[12] = CL_mac_j( rs3, rs4 );
1079 19853168 : y[13] = CL_msu_j( rs3, rs4 );
1080 : /*for (i=10; i<15; i++)
1081 : {
1082 : printf("%d,\t %d,\t",y[i].re, y[i].im);
1083 : }
1084 : printf("\n\n");*/
1085 :
1086 :
1087 : /* 1. FFT3 stage */
1088 : /* real part */
1089 19853168 : rs1 = CL_add( y[5], y[10] );
1090 19853168 : rs2 = CL_scale_t( CL_sub( y[5], y[10] ), C31 );
1091 19853168 : z[0] = CL_add( y[0], rs1 );
1092 19853168 : rs1 = CL_sub( y[0], CL_shr( rs1, 1 ) );
1093 :
1094 19853168 : z[10] = CL_mac_j( rs1, rs2 );
1095 19853168 : z[5] = CL_msu_j( rs1, rs2 );
1096 :
1097 : /* 2. FFT3 stage */
1098 19853168 : rs1 = CL_add( y[6], y[11] );
1099 19853168 : rs2 = CL_scale_t( CL_sub( y[6], y[11] ), C31 );
1100 19853168 : z[6] = CL_add( y[1], rs1 );
1101 19853168 : rs1 = CL_sub( y[1], CL_shr( rs1, 1 ) );
1102 :
1103 19853168 : z[1] = CL_mac_j( rs1, rs2 );
1104 19853168 : z[11] = CL_msu_j( rs1, rs2 );
1105 :
1106 :
1107 : /* 3. FFT3 stage */
1108 19853168 : rs1 = CL_add( y[7], y[12] );
1109 19853168 : rs2 = CL_scale_t( CL_sub( y[7], y[12] ), C31 );
1110 19853168 : z[12] = CL_add( y[2], rs1 );
1111 19853168 : rs1 = CL_sub( y[2], CL_shr( rs1, 1 ) );
1112 :
1113 19853168 : z[7] = CL_mac_j( rs1, rs2 );
1114 19853168 : z[2] = CL_msu_j( rs1, rs2 );
1115 :
1116 :
1117 : /* 4. FFT3 stage */
1118 19853168 : rs1 = CL_add( y[8], y[13] );
1119 19853168 : rs2 = CL_scale_t( CL_sub( y[8], y[13] ), C31 );
1120 19853168 : z[3] = CL_add( y[3], rs1 );
1121 19853168 : rs1 = CL_sub( y[3], CL_shr( rs1, 1 ) );
1122 :
1123 19853168 : z[13] = CL_mac_j( rs1, rs2 );
1124 19853168 : z[8] = CL_msu_j( rs1, rs2 );
1125 :
1126 :
1127 : /* 5. FFT3 stage */
1128 19853168 : rs1 = CL_add( y[9], y[14] );
1129 19853168 : rs2 = CL_scale_t( CL_sub( y[9], y[14] ), C31 );
1130 19853168 : z[9] = CL_add( y[4], rs1 );
1131 19853168 : rs1 = CL_sub( y[4], CL_shr( rs1, 1 ) );
1132 :
1133 19853168 : z[4] = CL_mac_j( rs1, rs2 );
1134 19853168 : z[14] = CL_msu_j( rs1, rs2 );
1135 :
1136 : /*for (i=0; i<15; i++)
1137 : printf("%d,\t %d,\t",z[i].re, z[i].im);
1138 : printf("\n\n");*/
1139 :
1140 :
1141 : /* 2. FFT15 stage */
1142 :
1143 19853168 : x[0] = CL_shr( inp[15], SCALEFACTOR30_1 ); // Qx - 5
1144 19853168 : x[1] = CL_shr( inp[3], SCALEFACTOR30_1 );
1145 19853168 : x[2] = CL_shr( inp[21], SCALEFACTOR30_1 );
1146 19853168 : x[3] = CL_shr( inp[9], SCALEFACTOR30_1 );
1147 19853168 : x[4] = CL_shr( inp[27], SCALEFACTOR30_1 );
1148 :
1149 19853168 : x[5] = CL_shr( inp[5], SCALEFACTOR30_1 ); // Qx - 5
1150 19853168 : x[6] = CL_shr( inp[23], SCALEFACTOR30_1 );
1151 19853168 : x[7] = CL_shr( inp[11], SCALEFACTOR30_1 );
1152 19853168 : x[8] = CL_shr( inp[29], SCALEFACTOR30_1 );
1153 19853168 : x[9] = CL_shr( inp[17], SCALEFACTOR30_1 );
1154 :
1155 19853168 : x[10] = CL_shr( inp[25], SCALEFACTOR30_1 ); // Qx - 5
1156 19853168 : x[11] = CL_shr( inp[13], SCALEFACTOR30_1 );
1157 19853168 : x[12] = CL_shr( inp[1], SCALEFACTOR30_1 );
1158 19853168 : x[13] = CL_shr( inp[19], SCALEFACTOR30_1 );
1159 19853168 : x[14] = CL_shr( inp[7], SCALEFACTOR30_1 );
1160 :
1161 : /* 1. FFT5 stage */
1162 19853168 : rs1 = CL_add( x[1], x[4] );
1163 19853168 : rs4 = CL_sub( x[1], x[4] );
1164 19853168 : rs3 = CL_add( x[2], x[3] );
1165 19853168 : rs2 = CL_sub( x[2], x[3] );
1166 19853168 : t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
1167 19853168 : rs1 = CL_add( rs1, rs3 );
1168 19853168 : y[0] = CL_add( x[0], rs1 );
1169 19853168 : rs1 = CL_add( y[0], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
1170 19853168 : rs3 = CL_sub( rs1, t );
1171 19853168 : rs1 = CL_add( rs1, t );
1172 19853168 : t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
1173 19853168 : rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
1174 19853168 : rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
1175 :
1176 : /* combination */
1177 19853168 : y[1] = CL_msu_j( rs1, rs2 );
1178 19853168 : y[4] = CL_mac_j( rs1, rs2 );
1179 19853168 : y[2] = CL_mac_j( rs3, rs4 );
1180 19853168 : y[3] = CL_msu_j( rs3, rs4 );
1181 :
1182 :
1183 : /* 2. FFT5 stage */
1184 19853168 : rs1 = CL_add( x[6], x[9] );
1185 19853168 : rs4 = CL_sub( x[6], x[9] );
1186 19853168 : rs3 = CL_add( x[7], x[8] );
1187 19853168 : rs2 = CL_sub( x[7], x[8] );
1188 19853168 : t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
1189 19853168 : rs1 = CL_add( rs1, rs3 );
1190 19853168 : y[5] = CL_add( x[5], rs1 );
1191 19853168 : rs1 = CL_add( y[5], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
1192 19853168 : rs3 = CL_sub( rs1, t );
1193 19853168 : rs1 = CL_add( rs1, t );
1194 19853168 : t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
1195 19853168 : rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
1196 19853168 : rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
1197 :
1198 : /* combination */
1199 19853168 : y[6] = CL_msu_j( rs1, rs2 );
1200 19853168 : y[9] = CL_mac_j( rs1, rs2 );
1201 19853168 : y[7] = CL_mac_j( rs3, rs4 );
1202 19853168 : y[8] = CL_msu_j( rs3, rs4 );
1203 :
1204 :
1205 : /* 3. FFT5 stage */
1206 19853168 : rs1 = CL_add( x[11], x[14] );
1207 19853168 : rs4 = CL_sub( x[11], x[14] );
1208 19853168 : rs3 = CL_add( x[12], x[13] );
1209 19853168 : rs2 = CL_sub( x[12], x[13] );
1210 19853168 : t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
1211 19853168 : rs1 = CL_add( rs1, rs3 );
1212 19853168 : y[10] = CL_add( x[10], rs1 );
1213 19853168 : rs1 = CL_add( y[10], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
1214 19853168 : rs3 = CL_sub( rs1, t );
1215 19853168 : rs1 = CL_add( rs1, t );
1216 19853168 : t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
1217 19853168 : rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
1218 19853168 : rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
1219 :
1220 : /* combination */
1221 19853168 : y[11] = CL_msu_j( rs1, rs2 );
1222 19853168 : y[14] = CL_mac_j( rs1, rs2 );
1223 19853168 : y[12] = CL_mac_j( rs3, rs4 );
1224 19853168 : y[13] = CL_msu_j( rs3, rs4 );
1225 : /*for (i=10; i<15; i++)
1226 : {
1227 : printf("%d,\t %d,\t",y[i].re, y[i].im);
1228 : }
1229 : printf("\n\n");*/
1230 :
1231 :
1232 : /* 1. FFT3 stage */
1233 : /* real part */
1234 19853168 : rs1 = CL_add( y[5], y[10] );
1235 19853168 : rs2 = CL_scale_t( CL_sub( y[5], y[10] ), C31 );
1236 19853168 : z[15] = CL_add( y[0], rs1 );
1237 19853168 : rs1 = CL_sub( y[0], CL_shr( rs1, 1 ) );
1238 :
1239 19853168 : z[25] = CL_mac_j( rs1, rs2 );
1240 19853168 : z[20] = CL_msu_j( rs1, rs2 );
1241 :
1242 : /* 2. FFT3 stage */
1243 19853168 : rs1 = CL_add( y[6], y[11] );
1244 19853168 : rs2 = CL_scale_t( CL_sub( y[6], y[11] ), C31 );
1245 19853168 : z[21] = CL_add( y[1], rs1 );
1246 19853168 : rs1 = CL_sub( y[1], CL_shr( rs1, 1 ) );
1247 :
1248 19853168 : z[16] = CL_mac_j( rs1, rs2 );
1249 19853168 : z[26] = CL_msu_j( rs1, rs2 );
1250 :
1251 :
1252 : /* 3. FFT3 stage */
1253 19853168 : rs1 = CL_add( y[7], y[12] );
1254 19853168 : rs2 = CL_scale_t( CL_sub( y[7], y[12] ), C31 );
1255 19853168 : z[27] = CL_add( y[2], rs1 );
1256 19853168 : rs1 = CL_sub( y[2], CL_shr( rs1, 1 ) );
1257 :
1258 19853168 : z[22] = CL_mac_j( rs1, rs2 );
1259 19853168 : z[17] = CL_msu_j( rs1, rs2 );
1260 :
1261 :
1262 : /* 4. FFT3 stage */
1263 19853168 : rs1 = CL_add( y[8], y[13] );
1264 19853168 : rs2 = CL_scale_t( CL_sub( y[8], y[13] ), C31 );
1265 19853168 : z[18] = CL_add( y[3], rs1 );
1266 19853168 : rs1 = CL_sub( y[3], CL_shr( rs1, 1 ) );
1267 :
1268 19853168 : z[28] = CL_mac_j( rs1, rs2 );
1269 19853168 : z[23] = CL_msu_j( rs1, rs2 );
1270 :
1271 :
1272 : /* 5. FFT3 stage */
1273 19853168 : rs1 = CL_add( y[9], y[14] );
1274 19853168 : rs2 = CL_scale_t( CL_sub( y[9], y[14] ), C31 );
1275 19853168 : z[24] = CL_add( y[4], rs1 );
1276 19853168 : rs1 = CL_sub( y[4], CL_shr( rs1, 1 ) );
1277 :
1278 19853168 : z[19] = CL_mac_j( rs1, rs2 );
1279 19853168 : z[29] = CL_msu_j( rs1, rs2 );
1280 :
1281 : /*for (i=0; i<30; i++)
1282 : printf("%d,\t %d,\t",z[i].re, z[i].im);
1283 : printf("\n\n");*/
1284 :
1285 :
1286 : /* 1. FFT2 stage */
1287 19853168 : rs1 = CL_shr( z[0], SCALEFACTOR30_2 );
1288 19853168 : rs2 = CL_shr( z[15], SCALEFACTOR30_2 );
1289 19853168 : *l = CL_add( rs1, rs2 );
1290 19853168 : *h = CL_sub( rs1, rs2 );
1291 19853168 : l += 1;
1292 19853168 : h += 1;
1293 :
1294 : /* 2. FFT2 stage */
1295 19853168 : rs1 = CL_shr( z[8], SCALEFACTOR30_2 );
1296 19853168 : rs2 = CL_shr( z[23], SCALEFACTOR30_2 );
1297 19853168 : *h = CL_add( rs1, rs2 );
1298 19853168 : *l = CL_sub( rs1, rs2 );
1299 19853168 : l += 1;
1300 19853168 : h += 1;
1301 :
1302 :
1303 : /* 3. FFT2 stage */
1304 19853168 : rs1 = CL_shr( z[1], SCALEFACTOR30_2 );
1305 19853168 : rs2 = CL_shr( z[16], SCALEFACTOR30_2 );
1306 19853168 : *l = CL_add( rs1, rs2 );
1307 19853168 : *h = CL_sub( rs1, rs2 );
1308 19853168 : l += 1;
1309 19853168 : h += 1;
1310 :
1311 :
1312 : /* 4. FFT2 stage */
1313 19853168 : rs1 = CL_shr( z[9], SCALEFACTOR30_2 );
1314 19853168 : rs2 = CL_shr( z[24], SCALEFACTOR30_2 );
1315 19853168 : *h = CL_add( rs1, rs2 );
1316 19853168 : *l = CL_sub( rs1, rs2 );
1317 19853168 : l += 1;
1318 19853168 : h += 1;
1319 :
1320 : /* 5. FFT2 stage */
1321 19853168 : rs1 = CL_shr( z[2], SCALEFACTOR30_2 );
1322 19853168 : rs2 = CL_shr( z[17], SCALEFACTOR30_2 );
1323 19853168 : *l = CL_add( rs1, rs2 );
1324 19853168 : *h = CL_sub( rs1, rs2 );
1325 19853168 : l += 1;
1326 19853168 : h += 1;
1327 :
1328 : /* 6. FFT2 stage */
1329 19853168 : rs1 = CL_shr( z[10], SCALEFACTOR30_2 );
1330 19853168 : rs2 = CL_shr( z[25], SCALEFACTOR30_2 );
1331 19853168 : *h = CL_add( rs1, rs2 );
1332 19853168 : *l = CL_sub( rs1, rs2 );
1333 19853168 : l += 1;
1334 19853168 : h += 1;
1335 :
1336 : /* 7. FFT2 stage */
1337 19853168 : rs1 = CL_shr( z[3], SCALEFACTOR30_2 );
1338 19853168 : rs2 = CL_shr( z[18], SCALEFACTOR30_2 );
1339 19853168 : *l = CL_add( rs1, rs2 );
1340 19853168 : *h = CL_sub( rs1, rs2 );
1341 19853168 : l += 1;
1342 19853168 : h += 1;
1343 :
1344 : /* 8. FFT2 stage */
1345 19853168 : rs1 = CL_shr( z[11], SCALEFACTOR30_2 );
1346 19853168 : rs2 = CL_shr( z[26], SCALEFACTOR30_2 );
1347 19853168 : *h = CL_add( rs1, rs2 );
1348 19853168 : *l = CL_sub( rs1, rs2 );
1349 19853168 : l += 1;
1350 19853168 : h += 1;
1351 :
1352 : /* 9. FFT2 stage */
1353 19853168 : rs1 = CL_shr( z[4], SCALEFACTOR30_2 );
1354 19853168 : rs2 = CL_shr( z[19], SCALEFACTOR30_2 );
1355 19853168 : *l = CL_add( rs1, rs2 );
1356 19853168 : *h = CL_sub( rs1, rs2 );
1357 19853168 : l += 1;
1358 19853168 : h += 1;
1359 :
1360 : /* 10. FFT2 stage */
1361 19853168 : rs1 = CL_shr( z[12], SCALEFACTOR30_2 );
1362 19853168 : rs2 = CL_shr( z[27], SCALEFACTOR30_2 );
1363 19853168 : *h = CL_add( rs1, rs2 );
1364 19853168 : *l = CL_sub( rs1, rs2 );
1365 19853168 : l += 1;
1366 19853168 : h += 1;
1367 :
1368 : /* 11. FFT2 stage */
1369 19853168 : rs1 = CL_shr( z[5], SCALEFACTOR30_2 );
1370 19853168 : rs2 = CL_shr( z[20], SCALEFACTOR30_2 );
1371 19853168 : *l = CL_add( rs1, rs2 );
1372 19853168 : *h = CL_sub( rs1, rs2 );
1373 19853168 : l += 1;
1374 19853168 : h += 1;
1375 :
1376 : /* 12. FFT2 stage */
1377 19853168 : rs1 = CL_shr( z[13], SCALEFACTOR30_2 );
1378 19853168 : rs2 = CL_shr( z[28], SCALEFACTOR30_2 );
1379 19853168 : *h = CL_add( rs1, rs2 );
1380 19853168 : *l = CL_sub( rs1, rs2 );
1381 19853168 : l += 1;
1382 19853168 : h += 1;
1383 :
1384 : /* 13. FFT2 stage */
1385 19853168 : rs1 = CL_shr( z[6], SCALEFACTOR30_2 );
1386 19853168 : rs2 = CL_shr( z[21], SCALEFACTOR30_2 );
1387 19853168 : *l = CL_add( rs1, rs2 );
1388 19853168 : *h = CL_sub( rs1, rs2 );
1389 19853168 : l += 1;
1390 19853168 : h += 1;
1391 :
1392 : /* 14. FFT2 stage */
1393 19853168 : rs1 = CL_shr( z[14], SCALEFACTOR30_2 );
1394 19853168 : rs2 = CL_shr( z[29], SCALEFACTOR30_2 );
1395 19853168 : *h = CL_add( rs1, rs2 );
1396 19853168 : *l = CL_sub( rs1, rs2 );
1397 19853168 : l += 1;
1398 19853168 : h += 1;
1399 :
1400 : /* 15. FFT2 stage */
1401 19853168 : rs1 = CL_shr( z[7], SCALEFACTOR30_2 );
1402 19853168 : rs2 = CL_shr( z[22], SCALEFACTOR30_2 );
1403 19853168 : *l = CL_add( rs1, rs2 );
1404 19853168 : *h = CL_sub( rs1, rs2 );
1405 19853168 : l += 1;
1406 19853168 : h += 1;
1407 :
1408 : #ifdef WMOPS
1409 : multiCounter[currCounter].CL_move += 30;
1410 : #endif
1411 19853168 : }
1412 :
1413 : /**
1414 : * \brief Function performs a complex 32-point FFT
1415 : * The FFT is performed inplace. The result of the FFT
1416 : * is scaled by SCALEFACTOR32 bits.
1417 : *
1418 : * WOPS with 32x16 bit multiplications: 752 cycles
1419 : *
 * \param [i/o] inp pointer to 32 complex-valued input / output samples (in-place)
1423 : *
1424 : * \return void
1425 : */
1426 :
1427 :
1428 1919872 : static void fft32_with_cmplx_data( cmplx *inp /*Qx*/ )
1429 : {
1430 : cmplx x[32], y[32], t[32], s[32], temp, temp1;
1431 1919872 : const cmplx_s *pRotVector_32 = (const cmplx_s *) RotVector_32;
1432 :
1433 : /* 1. FFT8 stage */
1434 :
1435 1919872 : x[0] = CL_shr( inp[0], SCALEFACTOR32_1 ); // Qx - 5
1436 1919872 : x[1] = CL_shr( inp[4], SCALEFACTOR32_1 );
1437 1919872 : x[2] = CL_shr( inp[8], SCALEFACTOR32_1 );
1438 1919872 : x[3] = CL_shr( inp[12], SCALEFACTOR32_1 );
1439 1919872 : x[4] = CL_shr( inp[16], SCALEFACTOR32_1 );
1440 1919872 : x[5] = CL_shr( inp[20], SCALEFACTOR32_1 );
1441 1919872 : x[6] = CL_shr( inp[24], SCALEFACTOR32_1 );
1442 1919872 : x[7] = CL_shr( inp[28], SCALEFACTOR32_1 );
1443 :
1444 :
1445 1919872 : t[0] = CL_add( x[0], x[4] );
1446 1919872 : t[1] = CL_sub( x[0], x[4] );
1447 1919872 : t[2] = CL_add( x[1], x[5] );
1448 1919872 : t[3] = CL_sub( x[1], x[5] );
1449 1919872 : t[4] = CL_add( x[2], x[6] );
1450 1919872 : t[5] = CL_sub( x[2], x[6] );
1451 1919872 : t[6] = CL_add( x[3], x[7] );
1452 1919872 : t[7] = CL_sub( x[3], x[7] );
1453 :
1454 : /* Pre-additions and core multiplications */
1455 :
1456 1919872 : s[0] = CL_add( t[0], t[4] );
1457 1919872 : s[2] = CL_sub( t[0], t[4] );
1458 1919872 : s[4] = CL_mac_j( t[1], t[5] );
1459 1919872 : s[5] = CL_msu_j( t[1], t[5] );
1460 1919872 : s[1] = CL_add( t[2], t[6] );
1461 1919872 : s[3] = CL_sub( t[2], t[6] );
1462 1919872 : s[3] = CL_mul_j( s[3] );
1463 :
1464 1919872 : temp = CL_add( t[3], t[7] );
1465 1919872 : temp1 = CL_sub( t[3], t[7] );
1466 1919872 : s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
1467 1919872 : s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
1468 :
1469 :
1470 1919872 : y[0] = CL_add( s[0], s[1] );
1471 1919872 : y[4] = CL_sub( s[0], s[1] );
1472 1919872 : y[2] = CL_sub( s[2], s[3] );
1473 1919872 : y[6] = CL_add( s[2], s[3] );
1474 1919872 : y[3] = CL_add( s[4], s[7] );
1475 1919872 : y[7] = CL_sub( s[4], s[7] );
1476 1919872 : y[1] = CL_add( s[5], s[6] );
1477 1919872 : y[5] = CL_sub( s[5], s[6] );
1478 :
1479 : /* 2. FFT8 stage */
1480 :
1481 1919872 : x[0] = CL_shr( inp[1], SCALEFACTOR32_1 ); // Qx - 5
1482 1919872 : x[1] = CL_shr( inp[5], SCALEFACTOR32_1 );
1483 1919872 : x[2] = CL_shr( inp[9], SCALEFACTOR32_1 );
1484 1919872 : x[3] = CL_shr( inp[13], SCALEFACTOR32_1 );
1485 1919872 : x[4] = CL_shr( inp[17], SCALEFACTOR32_1 );
1486 1919872 : x[5] = CL_shr( inp[21], SCALEFACTOR32_1 );
1487 1919872 : x[6] = CL_shr( inp[25], SCALEFACTOR32_1 );
1488 1919872 : x[7] = CL_shr( inp[29], SCALEFACTOR32_1 );
1489 :
1490 :
1491 1919872 : t[0] = CL_add( x[0], x[4] );
1492 1919872 : t[1] = CL_sub( x[0], x[4] );
1493 1919872 : t[2] = CL_add( x[1], x[5] );
1494 1919872 : t[3] = CL_sub( x[1], x[5] );
1495 1919872 : t[4] = CL_add( x[2], x[6] );
1496 1919872 : t[5] = CL_sub( x[2], x[6] );
1497 1919872 : t[6] = CL_add( x[3], x[7] );
1498 1919872 : t[7] = CL_sub( x[3], x[7] );
1499 :
1500 : /* Pre-additions and core multiplications */
1501 :
1502 1919872 : s[0] = CL_add( t[0], t[4] );
1503 1919872 : s[2] = CL_sub( t[0], t[4] );
1504 1919872 : s[4] = CL_mac_j( t[1], t[5] );
1505 1919872 : s[5] = CL_msu_j( t[1], t[5] );
1506 1919872 : s[1] = CL_add( t[2], t[6] );
1507 1919872 : s[3] = CL_sub( t[2], t[6] );
1508 1919872 : s[3] = CL_mul_j( s[3] );
1509 :
1510 1919872 : temp = CL_add( t[3], t[7] );
1511 1919872 : temp1 = CL_sub( t[3], t[7] );
1512 1919872 : s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
1513 1919872 : s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
1514 :
1515 : /* Post-additions */
1516 :
1517 1919872 : y[8] = CL_add( s[0], s[1] );
1518 1919872 : y[12] = CL_sub( s[0], s[1] );
1519 1919872 : y[10] = CL_sub( s[2], s[3] );
1520 1919872 : y[14] = CL_add( s[2], s[3] );
1521 1919872 : y[11] = CL_add( s[4], s[7] );
1522 1919872 : y[15] = CL_sub( s[4], s[7] );
1523 1919872 : y[9] = CL_add( s[5], s[6] );
1524 1919872 : y[13] = CL_sub( s[5], s[6] );
1525 :
1526 : /* 3. FFT8 stage */
1527 :
1528 1919872 : x[0] = CL_shr( inp[2], SCALEFACTOR32_1 ); // Qx - 5
1529 1919872 : x[1] = CL_shr( inp[6], SCALEFACTOR32_1 );
1530 1919872 : x[2] = CL_shr( inp[10], SCALEFACTOR32_1 );
1531 1919872 : x[3] = CL_shr( inp[14], SCALEFACTOR32_1 );
1532 1919872 : x[4] = CL_shr( inp[18], SCALEFACTOR32_1 );
1533 1919872 : x[5] = CL_shr( inp[22], SCALEFACTOR32_1 );
1534 1919872 : x[6] = CL_shr( inp[26], SCALEFACTOR32_1 );
1535 1919872 : x[7] = CL_shr( inp[30], SCALEFACTOR32_1 );
1536 :
1537 :
1538 1919872 : t[0] = CL_add( x[0], x[4] );
1539 1919872 : t[1] = CL_sub( x[0], x[4] );
1540 1919872 : t[2] = CL_add( x[1], x[5] );
1541 1919872 : t[3] = CL_sub( x[1], x[5] );
1542 1919872 : t[4] = CL_add( x[2], x[6] );
1543 1919872 : t[5] = CL_sub( x[2], x[6] );
1544 1919872 : t[6] = CL_add( x[3], x[7] );
1545 1919872 : t[7] = CL_sub( x[3], x[7] );
1546 :
1547 : /* Pre-additions and core multiplications */
1548 :
1549 1919872 : s[0] = CL_add( t[0], t[4] );
1550 1919872 : s[2] = CL_sub( t[0], t[4] );
1551 1919872 : s[4] = CL_mac_j( t[1], t[5] );
1552 1919872 : s[5] = CL_msu_j( t[1], t[5] );
1553 1919872 : s[1] = CL_add( t[2], t[6] );
1554 1919872 : s[3] = CL_sub( t[2], t[6] );
1555 1919872 : s[3] = CL_mul_j( s[3] );
1556 :
1557 1919872 : temp = CL_add( t[3], t[7] );
1558 1919872 : temp1 = CL_sub( t[3], t[7] );
1559 1919872 : s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
1560 1919872 : s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
1561 :
1562 : /* Post-additions */
1563 :
1564 1919872 : y[16] = CL_add( s[0], s[1] );
1565 1919872 : y[20] = CL_sub( s[0], s[1] );
1566 1919872 : y[18] = CL_sub( s[2], s[3] );
1567 1919872 : y[22] = CL_add( s[2], s[3] );
1568 1919872 : y[19] = CL_add( s[4], s[7] );
1569 1919872 : y[23] = CL_sub( s[4], s[7] );
1570 1919872 : y[17] = CL_add( s[5], s[6] );
1571 1919872 : y[21] = CL_sub( s[5], s[6] );
1572 :
1573 : /* 4. FFT8 stage */
1574 :
1575 1919872 : x[0] = CL_shr( inp[3], SCALEFACTOR32_1 ); // Qx - 5
1576 1919872 : x[1] = CL_shr( inp[7], SCALEFACTOR32_1 );
1577 1919872 : x[2] = CL_shr( inp[11], SCALEFACTOR32_1 );
1578 1919872 : x[3] = CL_shr( inp[15], SCALEFACTOR32_1 );
1579 1919872 : x[4] = CL_shr( inp[19], SCALEFACTOR32_1 );
1580 1919872 : x[5] = CL_shr( inp[23], SCALEFACTOR32_1 );
1581 1919872 : x[6] = CL_shr( inp[27], SCALEFACTOR32_1 );
1582 1919872 : x[7] = CL_shr( inp[31], SCALEFACTOR32_1 );
1583 :
1584 :
1585 1919872 : t[0] = CL_add( x[0], x[4] );
1586 1919872 : t[1] = CL_sub( x[0], x[4] );
1587 1919872 : t[2] = CL_add( x[1], x[5] );
1588 1919872 : t[3] = CL_sub( x[1], x[5] );
1589 1919872 : t[4] = CL_add( x[2], x[6] );
1590 1919872 : t[5] = CL_sub( x[2], x[6] );
1591 1919872 : t[6] = CL_add( x[3], x[7] );
1592 1919872 : t[7] = CL_sub( x[3], x[7] );
1593 :
1594 :
1595 : /* Pre-additions and core multiplications */
1596 :
1597 1919872 : s[0] = CL_add( t[0], t[4] );
1598 1919872 : s[2] = CL_sub( t[0], t[4] );
1599 1919872 : s[4] = CL_mac_j( t[1], t[5] );
1600 1919872 : s[5] = CL_msu_j( t[1], t[5] );
1601 1919872 : s[1] = CL_add( t[2], t[6] );
1602 1919872 : s[3] = CL_sub( t[2], t[6] );
1603 1919872 : s[3] = CL_mul_j( s[3] );
1604 :
1605 1919872 : temp = CL_add( t[3], t[7] );
1606 1919872 : temp1 = CL_sub( t[3], t[7] );
1607 1919872 : s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
1608 1919872 : s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
1609 :
1610 : /* Post-additions */
1611 :
1612 1919872 : y[24] = CL_add( s[0], s[1] );
1613 1919872 : y[28] = CL_sub( s[0], s[1] );
1614 1919872 : y[26] = CL_sub( s[2], s[3] );
1615 1919872 : y[30] = CL_add( s[2], s[3] );
1616 1919872 : y[27] = CL_add( s[4], s[7] );
1617 1919872 : y[31] = CL_sub( s[4], s[7] );
1618 1919872 : y[25] = CL_add( s[5], s[6] );
1619 1919872 : y[29] = CL_sub( s[5], s[6] );
1620 :
1621 :
1622 : /* apply twiddle factors */
1623 1919872 : y[0] = CL_shr( y[0], SCALEFACTOR32_2 );
1624 1919872 : y[1] = CL_shr( y[1], SCALEFACTOR32_2 );
1625 1919872 : y[2] = CL_shr( y[2], SCALEFACTOR32_2 );
1626 1919872 : y[3] = CL_shr( y[3], SCALEFACTOR32_2 );
1627 1919872 : y[4] = CL_shr( y[4], SCALEFACTOR32_2 );
1628 1919872 : y[5] = CL_shr( y[5], SCALEFACTOR32_2 );
1629 1919872 : y[6] = CL_shr( y[6], SCALEFACTOR32_2 );
1630 1919872 : y[7] = CL_shr( y[7], SCALEFACTOR32_2 );
1631 1919872 : y[8] = CL_shr( y[8], SCALEFACTOR32_2 );
1632 1919872 : y[16] = CL_shr( y[16], SCALEFACTOR32_2 );
1633 1919872 : y[24] = CL_shr( y[24], SCALEFACTOR32_2 );
1634 1919872 : y[20] = CL_shr( y[20], SCALEFACTOR32_2 );
1635 :
1636 :
1637 1919872 : y[9] = CL_mult_32x16( ( CL_shr( y[9], 1 ) ), pRotVector_32[0] );
1638 1919872 : y[10] = CL_mult_32x16( ( CL_shr( y[10], 1 ) ), pRotVector_32[1] );
1639 1919872 : y[11] = CL_mult_32x16( ( CL_shr( y[11], 1 ) ), pRotVector_32[2] );
1640 1919872 : y[12] = CL_mult_32x16( ( CL_shr( y[12], 1 ) ), pRotVector_32[3] );
1641 1919872 : y[13] = CL_mult_32x16( ( CL_shr( y[13], 1 ) ), pRotVector_32[4] );
1642 1919872 : y[14] = CL_mult_32x16( ( CL_shr( y[14], 1 ) ), pRotVector_32[5] );
1643 1919872 : y[15] = CL_mult_32x16( ( CL_shr( y[15], 1 ) ), pRotVector_32[6] );
1644 1919872 : y[17] = CL_mult_32x16( ( CL_shr( y[17], 1 ) ), pRotVector_32[7] );
1645 1919872 : y[18] = CL_mult_32x16( ( CL_shr( y[18], 1 ) ), pRotVector_32[8] );
1646 1919872 : y[19] = CL_mult_32x16( ( CL_shr( y[19], 1 ) ), pRotVector_32[9] );
1647 1919872 : y[21] = CL_mult_32x16( ( CL_shr( y[21], 1 ) ), pRotVector_32[10] );
1648 1919872 : y[22] = CL_mult_32x16( ( CL_shr( y[22], 1 ) ), pRotVector_32[11] );
1649 1919872 : y[23] = CL_mult_32x16( ( CL_shr( y[23], 1 ) ), pRotVector_32[12] );
1650 1919872 : y[25] = CL_mult_32x16( ( CL_shr( y[25], 1 ) ), pRotVector_32[13] );
1651 1919872 : y[26] = CL_mult_32x16( ( CL_shr( y[26], 1 ) ), pRotVector_32[14] );
1652 1919872 : y[27] = CL_mult_32x16( ( CL_shr( y[27], 1 ) ), pRotVector_32[15] );
1653 1919872 : y[28] = CL_mult_32x16( ( CL_shr( y[28], 1 ) ), pRotVector_32[16] );
1654 1919872 : y[29] = CL_mult_32x16( ( CL_shr( y[29], 1 ) ), pRotVector_32[17] );
1655 1919872 : y[30] = CL_mult_32x16( ( CL_shr( y[30], 1 ) ), pRotVector_32[18] );
1656 1919872 : y[31] = CL_mult_32x16( ( CL_shr( y[31], 1 ) ), pRotVector_32[19] );
1657 :
1658 : /* 1. FFT4 stage */
1659 :
1660 : /* Pre-additions */
1661 1919872 : t[0] = CL_add( y[0], y[16] );
1662 1919872 : t[1] = CL_sub( y[0], y[16] );
1663 1919872 : t[2] = CL_add( y[8], y[24] );
1664 1919872 : t[3] = CL_mul_j( CL_sub( y[8], y[24] ) );
1665 :
1666 : /* Post-additions */
1667 1919872 : inp[0] = CL_add( t[0], t[2] );
1668 1919872 : inp[8] = CL_sub( t[1], t[3] );
1669 1919872 : inp[16] = CL_sub( t[0], t[2] );
1670 1919872 : inp[24] = CL_add( t[1], t[3] );
1671 :
1672 : /* 2. FFT4 stage */
1673 :
1674 : /* Pre-additions */
1675 1919872 : t[0] = CL_add( y[1], y[17] );
1676 1919872 : t[1] = CL_sub( y[1], y[17] );
1677 1919872 : t[2] = CL_add( y[9], y[25] );
1678 1919872 : t[3] = CL_mul_j( CL_sub( y[9], y[25] ) );
1679 :
1680 : /* Post-additions */
1681 1919872 : inp[1] = CL_add( t[0], t[2] );
1682 1919872 : inp[9] = CL_sub( t[1], t[3] );
1683 1919872 : inp[17] = CL_sub( t[0], t[2] );
1684 1919872 : inp[25] = CL_add( t[1], t[3] );
1685 :
1686 :
1687 : /* 3. FFT4 stage */
1688 :
1689 : /* Pre-additions */
1690 1919872 : t[0] = CL_add( y[2], y[18] );
1691 1919872 : t[1] = CL_sub( y[2], y[18] );
1692 1919872 : t[2] = CL_add( y[10], y[26] );
1693 1919872 : t[3] = CL_mul_j( CL_sub( y[10], y[26] ) );
1694 :
1695 : /* Post-additions */
1696 1919872 : inp[2] = CL_add( t[0], t[2] );
1697 1919872 : inp[10] = CL_sub( t[1], t[3] );
1698 1919872 : inp[18] = CL_sub( t[0], t[2] );
1699 1919872 : inp[26] = CL_add( t[1], t[3] );
1700 :
1701 :
1702 : /* 4. FFT4 stage */
1703 :
1704 : /* Pre-additions */
1705 1919872 : t[0] = CL_add( y[3], y[19] );
1706 1919872 : t[1] = CL_sub( y[3], y[19] );
1707 1919872 : t[2] = CL_add( y[11], y[27] );
1708 1919872 : t[3] = CL_mul_j( CL_sub( y[11], y[27] ) );
1709 :
1710 :
1711 : /* Post-additions */
1712 1919872 : inp[3] = CL_add( t[0], t[2] );
1713 1919872 : inp[11] = CL_sub( t[1], t[3] );
1714 1919872 : inp[19] = CL_sub( t[0], t[2] );
1715 1919872 : inp[27] = CL_add( t[1], t[3] );
1716 :
1717 :
1718 : /* 5. FFT4 stage */
1719 :
1720 : /* Pre-additions */
1721 1919872 : t[0] = CL_msu_j( y[4], y[20] );
1722 1919872 : t[1] = CL_mac_j( y[4], y[20] );
1723 1919872 : t[2] = CL_add( y[12], y[28] );
1724 1919872 : t[3] = CL_mul_j( CL_sub( y[12], y[28] ) );
1725 :
1726 :
1727 : /* Post-additions */
1728 1919872 : inp[4] = CL_add( t[0], t[2] );
1729 1919872 : inp[12] = CL_sub( t[1], t[3] );
1730 1919872 : inp[20] = CL_sub( t[0], t[2] );
1731 1919872 : inp[28] = CL_add( t[1], t[3] );
1732 :
1733 :
1734 : /* 6. FFT4 stage */
1735 :
1736 : /* Pre-additions */
1737 1919872 : t[0] = CL_add( y[5], y[21] );
1738 1919872 : t[1] = CL_sub( y[5], y[21] );
1739 1919872 : t[2] = CL_add( y[13], y[29] );
1740 1919872 : t[3] = CL_mul_j( CL_sub( y[13], y[29] ) );
1741 :
1742 :
1743 : /* Post-additions */
1744 1919872 : inp[5] = CL_add( t[0], t[2] );
1745 1919872 : inp[13] = CL_sub( t[1], t[3] );
1746 1919872 : inp[21] = CL_sub( t[0], t[2] );
1747 1919872 : inp[29] = CL_add( t[1], t[3] );
1748 :
1749 :
1750 : /* 7. FFT4 stage */
1751 :
1752 : /* Pre-additions */
1753 1919872 : t[0] = CL_add( y[6], y[22] );
1754 1919872 : t[1] = CL_sub( y[6], y[22] );
1755 1919872 : t[2] = CL_add( y[14], y[30] );
1756 1919872 : t[3] = CL_mul_j( CL_sub( y[14], y[30] ) );
1757 :
1758 :
1759 : /* Post-additions */
1760 1919872 : inp[6] = CL_add( t[0], t[2] );
1761 1919872 : inp[14] = CL_sub( t[1], t[3] );
1762 1919872 : inp[22] = CL_sub( t[0], t[2] );
1763 1919872 : inp[30] = CL_add( t[1], t[3] );
1764 :
1765 :
1766 : /* 8. FFT4 stage */
1767 :
1768 : /* Pre-additions */
1769 1919872 : t[0] = CL_add( y[7], y[23] );
1770 1919872 : t[1] = CL_sub( y[7], y[23] );
1771 1919872 : t[2] = CL_add( y[15], y[31] );
1772 1919872 : t[3] = CL_mul_j( CL_sub( y[15], y[31] ) );
1773 :
1774 :
1775 : /* Post-additions */
1776 1919872 : inp[7] = CL_add( t[0], t[2] );
1777 1919872 : inp[15] = CL_sub( t[1], t[3] );
1778 1919872 : inp[23] = CL_sub( t[0], t[2] );
1779 1919872 : inp[31] = CL_add( t[1], t[3] );
1780 :
1781 : #ifdef WMOPS
1782 : multiCounter[currCounter].CL_move += 32;
1783 : #endif
1784 1919872 : }
1785 :
1786 :
1787 : /**
1788 : * \brief Combined FFT
1789 : *
1790 : * \param [i/o] re real part
1791 : * \param [i/o] im imag part
1792 : * \param [i ] W rotation factor
1793 : * \param [i ] len length of fft
1794 : * \param [i ] dim1 length of fft1
1795 : * \param [i ] dim2 length of fft2
1796 : * \param [i ] sx stride real and imag part
1797 : * \param [i ] sc stride phase rotation coefficients
1798 : * \param [tmp] x 32-bit workbuffer of length=2*len
1799 : * \param [i ] Woff offset for addressing the rotation vector table
1800 : *
1801 : * \return void
1802 : */
1803 :
/* Mixed-radix FFT of length len = dim1 * dim2: transposes the input, runs dim2
 * FFTs of length dim1, then combines with twiddle factors from W while running
 * dim1 FFTs of length dim2, writing the result back to pComplexBuf. */
static void fftN2(
    cmplx *__restrict pComplexBuf,
    const Word16 *__restrict W,
    Word16 len,
    Word16 dim1,
    Word16 dim2,
    Word16 sc,
    Word32 *x,
    Word16 Woff )
{
    Word16 i, j;
    /* Reinterpret the caller-supplied Word32 scratch buffer as interleaved
       complex values (re/im pairs). */
    cmplx *x_cmplx = (cmplx *) x;

    assert( len == ( dim1 * dim2 ) );
    /* NOTE(review): these asserts also admit dim1 == 3 and dim2 in {4, 12},
       but the SWITCH statements below have no matching case (and no default)
       for those values, so such inputs would be silently left half-processed.
       All callers in this file pass only the handled combinations — confirm
       before adding new call sites. */
    assert( ( dim1 == 3 ) || ( dim1 == 5 ) || ( dim1 == 8 ) || ( dim1 == 10 ) || ( dim1 == 15 ) || ( dim1 == 16 ) || ( dim1 == 20 ) || ( dim1 == 30 ) || ( dim1 == 32 ) );
    assert( ( dim2 == 4 ) || ( dim2 == 8 ) || ( dim2 == 10 ) || ( dim2 == 12 ) || ( dim2 == 16 ) || ( dim2 == 20 ) );

    /* Transpose: gather the dim2-strided input samples into dim1-contiguous
       rows of the work buffer so each length-dim1 sub-FFT reads contiguous data. */
    FOR( i = 0; i < dim2; i++ )
    {
        FOR( j = 0; j < dim1; j++ )
        {
            x_cmplx[i * dim1 + j] = pComplexBuf[i + j * dim2];
#ifdef WMOPS
            multiCounter[currCounter].CL_move++;
#endif
        }
    }

    /* Stage 1: dim2 independent FFTs of length dim1, in place in the work buffer. */
    SWITCH( dim1 )
    {
        case 5:
            FOR( i = 0; i < dim2; i++ )
            {
                fft5_with_cmplx_data( &x_cmplx[i * dim1] );
            }
            BREAK;
        case 8:
            FOR( i = 0; i < dim2; i++ )
            {
                fft8_with_cmplx_data( &x_cmplx[i * dim1] );
            }
            BREAK;
        case 10:
            FOR( i = 0; i < dim2; i++ )
            {
                fft10_with_cmplx_data( &x_cmplx[i * dim1] );
            }
            BREAK;

        case 15:
            FOR( i = 0; i < dim2; i++ )
            {
                fft15_with_cmplx_data( &x_cmplx[i * dim1] );
            }
            BREAK;
        case 16:
            FOR( i = 0; i < dim2; i++ )
            {
                fft16_with_cmplx_data( &x_cmplx[i * dim1], 1 );
            }
            BREAK;
        case 20:
            FOR( i = 0; i < dim2; i++ )
            {
                fft20_with_cmplx_data( &x_cmplx[i * dim1] );
            }
            BREAK;
        case 30:
            FOR( i = 0; i < dim2; i++ )
            {
                fft30_with_cmplx_data( &x_cmplx[i * dim1] );
            }
            BREAK;
        case 32:
            FOR( i = 0; i < dim2; i++ )
            {
                fft32_with_cmplx_data( &x_cmplx[i * dim1] );
            }
            BREAK;
    }

    /* Stage 2: dim1 interleaved FFTs of length dim2, each input (except column
       i == 0, whose twiddles are all 1) pre-rotated by the twiddle factors in W.
       Results are written back to pComplexBuf in output order. */
    SWITCH( dim2 )
    {
        case 8:
        {
            /* Radix-8 butterflies written out inline; CL_shr pre-shifts create
               headroom so the fixed-point additions cannot overflow. */
            cmplx y0, y1, y2, y3, y4, y5, y6, y7;
            cmplx t0, t1, t2, t3, t4, t5, t6, t7;
            cmplx s0, s1, s2, s3, s4, s5, s6, s7;

            /* Column i == 0: twiddle factors are all 1, so no multiplies needed. */
            i = 0;
            move16();
            {
                y0 = CL_shr( x_cmplx[i + 0 * dim1], 1 );
                y1 = CL_shr( x_cmplx[i + 1 * dim1], 1 );
                y2 = CL_shr( x_cmplx[i + 2 * dim1], 1 );
                y3 = CL_shr( x_cmplx[i + 3 * dim1], 1 );
                y4 = CL_shr( x_cmplx[i + 4 * dim1], 1 );
                y5 = CL_shr( x_cmplx[i + 5 * dim1], 1 );
                y6 = CL_shr( x_cmplx[i + 6 * dim1], 1 );
                y7 = CL_shr( x_cmplx[i + 7 * dim1], 1 );

                /* Pre-additions; t3/t7 keep full scale for the C81/C82 rotation below. */
                t0 = CL_shr( CL_add( y0, y4 ), SCALEFACTORN2 - 1 );
                t1 = CL_shr( CL_sub( y0, y4 ), SCALEFACTORN2 - 1 );
                t2 = CL_shr( CL_add( y1, y5 ), SCALEFACTORN2 - 1 );
                t3 = CL_sub( y1, y5 );
                t4 = CL_shr( CL_add( y2, y6 ), SCALEFACTORN2 - 1 );
                t5 = CL_shr( CL_sub( y2, y6 ), SCALEFACTORN2 - 1 );
                t6 = CL_shr( CL_add( y3, y7 ), SCALEFACTORN2 - 1 );
                t7 = CL_sub( y3, y7 );


                /* Post-additions including the +/-j and C81/C82 (cos/sin pi/4) rotations. */
                s0 = CL_add( t0, t4 );
                s2 = CL_sub( t0, t4 );
                s4 = CL_mac_j( t1, t5 );
                s5 = CL_msu_j( t1, t5 );
                s1 = CL_add( t2, t6 );
                s3 = CL_mul_j( CL_sub( t2, t6 ) );
                t0 = CL_shr( CL_add( t3, t7 ), SCALEFACTORN2 - 1 );
                t1 = CL_shr( CL_sub( t3, t7 ), SCALEFACTORN2 - 1 );
                s6 = CL_scale_t( CL_msu_j( t1, t0 ), C81 );
                s7 = CL_dscale_t( CL_swap_real_imag( CL_msu_j( t0, t1 ) ), C81, C82 );

                pComplexBuf[i + 0 * dim1] = CL_add( s0, s1 );
                pComplexBuf[i + 1 * dim1] = CL_add( s5, s6 );
                pComplexBuf[i + 2 * dim1] = CL_sub( s2, s3 );
                pComplexBuf[i + 3 * dim1] = CL_add( s4, s7 );
                pComplexBuf[i + 4 * dim1] = CL_sub( s0, s1 );
                pComplexBuf[i + 5 * dim1] = CL_sub( s5, s6 );
                pComplexBuf[i + 6 * dim1] = CL_add( s2, s3 );
                pComplexBuf[i + 7 * dim1] = CL_sub( s4, s7 );
            }


            /* Remaining columns: identical butterfly, but each input y1..y7 is
               first rotated by its twiddle factor W[sc*i + sc*k*dim1 - Woff]. */
            FOR( i = 1; i < dim1; i++ )
            {
                y0 = CL_shr( x_cmplx[i + 0 * dim1], 1 );
                y1 = CL_shr( CL_mult_32x16( x_cmplx[i + 1 * dim1], *(const cmplx_s *) &W[sc * i + sc * 1 * dim1 - Woff] ), 1 );
                y2 = CL_shr( CL_mult_32x16( x_cmplx[i + 2 * dim1], *(const cmplx_s *) &W[sc * i + sc * 2 * dim1 - Woff] ), 1 );
                y3 = CL_shr( CL_mult_32x16( x_cmplx[i + 3 * dim1], *(const cmplx_s *) &W[sc * i + sc * 3 * dim1 - Woff] ), 1 );
                y4 = CL_shr( CL_mult_32x16( x_cmplx[i + 4 * dim1], *(const cmplx_s *) &W[sc * i + sc * 4 * dim1 - Woff] ), 1 );
                y5 = CL_shr( CL_mult_32x16( x_cmplx[i + 5 * dim1], *(const cmplx_s *) &W[sc * i + sc * 5 * dim1 - Woff] ), 1 );
                y6 = CL_shr( CL_mult_32x16( x_cmplx[i + 6 * dim1], *(const cmplx_s *) &W[sc * i + sc * 6 * dim1 - Woff] ), 1 );
                y7 = CL_shr( CL_mult_32x16( x_cmplx[i + 7 * dim1], *(const cmplx_s *) &W[sc * i + sc * 7 * dim1 - Woff] ), 1 );

                t0 = CL_shr( CL_add( y0, y4 ), SCALEFACTORN2 - 1 );
                t1 = CL_shr( CL_sub( y0, y4 ), SCALEFACTORN2 - 1 );
                t2 = CL_shr( CL_add( y1, y5 ), SCALEFACTORN2 - 1 );
                t3 = CL_sub( y1, y5 );
                t4 = CL_shr( CL_add( y2, y6 ), SCALEFACTORN2 - 1 );
                t5 = CL_shr( CL_sub( y2, y6 ), SCALEFACTORN2 - 1 );
                t6 = CL_shr( CL_add( y3, y7 ), SCALEFACTORN2 - 1 );
                t7 = CL_sub( y3, y7 );


                s0 = CL_add( t0, t4 );
                s2 = CL_sub( t0, t4 );
                s4 = CL_mac_j( t1, t5 );
                s5 = CL_msu_j( t1, t5 );
                s1 = CL_add( t2, t6 );
                s3 = CL_mul_j( CL_sub( t2, t6 ) );
                t0 = CL_shr( CL_add( t3, t7 ), SCALEFACTORN2 - 1 );
                t1 = CL_shr( CL_sub( t3, t7 ), SCALEFACTORN2 - 1 );
                s6 = CL_scale_t( CL_msu_j( t1, t0 ), C81 );
                s7 = CL_dscale_t( CL_swap_real_imag( CL_msu_j( t0, t1 ) ), C81, C82 );

                pComplexBuf[i + 0 * dim1] = CL_add( s0, s1 );
                pComplexBuf[i + 1 * dim1] = CL_add( s5, s6 );
                pComplexBuf[i + 2 * dim1] = CL_sub( s2, s3 );
                pComplexBuf[i + 3 * dim1] = CL_add( s4, s7 );
                pComplexBuf[i + 4 * dim1] = CL_sub( s0, s1 );
                pComplexBuf[i + 5 * dim1] = CL_sub( s5, s6 );
                pComplexBuf[i + 6 * dim1] = CL_add( s2, s3 );
                pComplexBuf[i + 7 * dim1] = CL_sub( s4, s7 );
            }

            BREAK;
        }

        case 10:
        {
            /* Length-10 second stage via the fft10 kernel on a gathered column. */
            cmplx y[20];
            {
                /* Column i == 0: unit twiddles, plain copy in/out. */
                FOR( j = 0; j < dim2; j++ )
                {
                    y[j] = CL_move( x_cmplx[j * dim1] );
                }
                fft10_with_cmplx_data( &y[0] );
                FOR( j = 0; j < dim2; j++ )
                {
                    pComplexBuf[j * dim1] = y[j];
                }
                /* Remaining columns: rotate elements 1..dim2-1 by their twiddles. */
                FOR( i = 1; i < dim1; i++ )
                {
                    y[0] = CL_move( x_cmplx[i] );
                    FOR( j = 1; j < dim2; j++ )
                    {
                        y[j] = CL_mult_32x16( x_cmplx[i + j * dim1], *(const cmplx_s *) &W[sc * i + sc * j * dim1 - Woff] );
                    }
                    fft10_with_cmplx_data( &y[0] );
                    FOR( j = 0; j < dim2; j++ )
                    {
                        pComplexBuf[i + j * dim1] = y[j];
                    }
                }
            }
            BREAK;
        }
        case 16:
        {
            /* Length-16 second stage; inputs are pre-shifted by SCALEFACTOR16
               since fft16_with_cmplx_data is called here with bscale == 0. */
            cmplx y[20];

            /* Column i == 0: unit twiddles. */
            FOR( j = 0; j < dim2; j++ )
            {
                y[j] = CL_shr( x_cmplx[0 + j * dim1], SCALEFACTOR16 );
            }
            fft16_with_cmplx_data( &y[0], 0 );

            FOR( j = 0; j < dim2; j++ )
            {
                pComplexBuf[j * dim1] = y[j];
            }
            FOR( i = 1; i < dim1; i++ )
            {
                /* Even elements use the base twiddle table, odd elements the
                   second half starting at offset len (loop unrolled by 2). */
                y[0] = CL_shr( x_cmplx[i + ( 0 + 0 ) * dim1], SCALEFACTOR16 );
                y[1] = CL_shr( CL_mult_32x16( x_cmplx[i + dim1], *(const cmplx_s *) &W[len + sc * i + 0 * dim1 - Woff] ), SCALEFACTOR16 );

                FOR( j = 2; j < dim2; j = j + 2 )
                {
                    y[( j + 0 )] = CL_shr( CL_mult_32x16( x_cmplx[i + ( j + 0 ) * dim1], *(const cmplx_s *) &W[sc * i + j * dim1 - Woff] ), SCALEFACTOR16 );
                    y[( j + 1 )] = CL_shr( CL_mult_32x16( x_cmplx[i + ( j + 1 ) * dim1], *(const cmplx_s *) &W[len + sc * i + j * dim1 - Woff] ), SCALEFACTOR16 );
                }
                fft16_with_cmplx_data( &y[0], 0 );
                FOR( j = 0; j < dim2; j++ )
                {
                    pComplexBuf[i + j * dim1] = y[j];
                }
            }
        }
        BREAK;

        case 20:

            assert( dim1 == 20 || dim1 == 30 ); /* cplxMpy4_10_0 contains shift values hardcoded FOR 20x10 */
            IF( EQ_16( dim1, 20 ) )
            {
                /* 20x20 (len 400): fft20 kernel, no extra rescaling needed. */
                cmplx y[20];
                FOR( j = 0; j < dim2; j++ )
                {
                    y[j] = CL_move( x_cmplx[j * dim1] );
                }
                fft20_with_cmplx_data( &y[0] );
                FOR( j = 0; j < dim2; j++ )
                {
                    pComplexBuf[j * dim1] = y[j];
                }
                FOR( i = 1; i < dim1; i++ )
                {
                    /* Unrolled by 2: even twiddles from W[..], odd from W[len + ..]. */
                    y[0] = CL_move( x_cmplx[i] );
                    y[1] = CL_mult_32x16( x_cmplx[i + dim1], *(const cmplx_s *) &W[len + sc * i + 0 * dim1 - Woff] );
                    FOR( j = 2; j < dim2; j = j + 2 )
                    {

                        y[j + 0] = CL_mult_32x16( x_cmplx[i + ( j + 0 ) * dim1], *(const cmplx_s *) &W[sc * i + j * dim1 - Woff] );
                        y[j + 1] = CL_mult_32x16( x_cmplx[i + ( j + 1 ) * dim1], *(const cmplx_s *) &W[len + sc * i + j * dim1 - Woff] );
                    }
                    fft20_with_cmplx_data( &y[0] );
                    FOR( j = 0; j < dim2; j++ )
                    {
                        pComplexBuf[i + j * dim1] = y[j];
                    }
                }
            }
            ELSE
            {
                /* 30x20 (len 600): compensate the scaling difference between the
                   stage-1 fft30 (SCALEFACTOR30) and the fft20 kernel used here
                   by shifting left by (SCALEFACTOR30 - SCALEFACTOR20). */
                cmplx y[20];
                FOR( j = 0; j < dim2; j++ )
                {
                    y[j] = CL_shl( x_cmplx[j * dim1], ( SCALEFACTOR30 - SCALEFACTOR20 ) );
                }
                fft20_with_cmplx_data( &y[0] );
                FOR( j = 0; j < dim2; j++ )
                {
                    pComplexBuf[j * dim1] = y[j];
                }
                FOR( i = 1; i < dim1; i++ )
                {
                    y[0] = CL_shl( x_cmplx[i], ( SCALEFACTOR30 - SCALEFACTOR20 ) );
                    y[1] = CL_shl( CL_mult_32x16( x_cmplx[i + dim1], *(const cmplx_s *) &W[len + sc * i + 0 * dim1 - Woff] ), ( SCALEFACTOR30 - SCALEFACTOR20 ) );
                    FOR( j = 2; j < dim2; j = j + 2 )
                    {

                        y[j + 0] = CL_shl( CL_mult_32x16( x_cmplx[i + ( j + 0 ) * dim1], *(const cmplx_s *) &W[sc * i + j * dim1 - Woff] ), ( SCALEFACTOR30 - SCALEFACTOR20 ) );
                        y[j + 1] = CL_shl( CL_mult_32x16( x_cmplx[i + ( j + 1 ) * dim1], *(const cmplx_s *) &W[len + sc * i + j * dim1 - Woff] ), ( SCALEFACTOR30 - SCALEFACTOR20 ) );
                    }
                    fft20_with_cmplx_data( &y[0] );
                    FOR( j = 0; j < dim2; j++ )
                    {
                        pComplexBuf[i + j * dim1] = y[j];
                    }
                }
            }
            BREAK;
    }
#ifdef WMOPS
    multiCounter[currCounter].CL_move += len;
#endif
}
2111 :
2112 :
2113 : /**
2114 : * \brief Complex valued FFT
2115 : *
2116 : * \param [i/o] re real part
2117 : * \param [i/o] im imag part
2118 : * \param [i ] sizeOfFft length of fft
2119 : * \param [i ] s stride real and imag part
2120 : * \param [i ] scale scalefactor
2121 : *
2122 : * \return void
2123 : */
/* Dispatches to a length-specific complex FFT kernel (short lengths inline,
 * longer lengths via the mixed-radix fftN2 with the matching twiddle table),
 * and adds the kernel's fixed scaling headroom to *scale. */
void BASOP_cfft( cmplx *pComplexBuf, Word16 sizeOfFft, Word16 *scale, Word32 x[2 * BASOP_CFFT_MAX_LENGTH] )
{
    Word16 s;
    s = 0;
    move16();
    SWITCH( sizeOfFft )
    {
        /* Direct single-kernel lengths. */
        case 5:
            fft5_with_cmplx_data( pComplexBuf );
            s = add( *scale, SCALEFACTOR5 );
            BREAK;

        case 8:
            fft8_with_cmplx_data( pComplexBuf );
            s = add( *scale, SCALEFACTOR8 );
            BREAK;

        case 10:
            fft10_with_cmplx_data( pComplexBuf );
            s = add( *scale, SCALEFACTOR10 );
            BREAK;

        case 16:
            fft16_with_cmplx_data( pComplexBuf, 1 );
            s = add( *scale, SCALEFACTOR16 );
            BREAK;

        case 20:
            fft20_with_cmplx_data( pComplexBuf );
            s = add( *scale, SCALEFACTOR20 );
            BREAK;

        case 30:
            fft30_with_cmplx_data( pComplexBuf );
            s = add( *scale, SCALEFACTOR30 );
            BREAK;

        case 32:
            fft32_with_cmplx_data( pComplexBuf );
            s = add( *scale, SCALEFACTOR32 );
            BREAK;

        /* Composite lengths: fftN2( buf, twiddle table, len, dim1, dim2,
           coefficient stride, scratch, table offset ). */
        case 40:
        {
            fftN2( pComplexBuf, RotVector_320, 40, 5, 8, 8, x, 40 );
            s = add( *scale, SCALEFACTOR40 );
            BREAK;
        }

        case 64:
        {
            fftN2( pComplexBuf, RotVector_256, 64, 8, 8, 8, x, 64 );
            s = add( *scale, SCALEFACTOR64 );
            BREAK;
        }

        case 80:
        {
            fftN2( pComplexBuf, RotVector_320, 80, 10, 8, 4, x, 40 );
            s = add( *scale, SCALEFACTOR80 );
            BREAK;
        }
        case 100:
        {
            fftN2( pComplexBuf, RotVector_400, 100, 10, 10, 4, x, 40 );
            s = add( *scale, SCALEFACTOR100 );
            BREAK;
        }
        case 120:
        {
            fftN2( pComplexBuf, RotVector_480, 120, 15, 8, 4, x, 60 );
            s = add( *scale, SCALEFACTOR120 );
            BREAK;
        }

        case 128:
        {
            fftN2( pComplexBuf, RotVector_256, 128, 16, 8, 4, x, 64 );
            s = add( *scale, SCALEFACTOR128 );
            BREAK;
        }

        case 160:
        {
            fftN2( pComplexBuf, RotVector_320, 160, 20, 8, 2, x, 40 );
            s = add( *scale, SCALEFACTOR160 );
            BREAK;
        }

        case 200:
        {
            fftN2( pComplexBuf, RotVector_400, 200, 20, 10, 2, x, 40 );
            s = add( *scale, SCALEFACTOR200 );
            BREAK;
        }

        case 240:
        {
            fftN2( pComplexBuf, RotVector_480, 240, 30, 8, 2, x, 60 );
            s = add( *scale, SCALEFACTOR240 );
            BREAK;
        }

        case 256:
        {
            fftN2( pComplexBuf, RotVector_256, 256, 32, 8, 2, x, 64 );
            s = add( *scale, SCALEFACTOR256 );
            BREAK;
        }

        case 320:
        {
            fftN2( pComplexBuf, RotVector_320, 320, 20, 16, 2, x, 40 );
            s = add( *scale, SCALEFACTOR320 );
            BREAK;
        }

        case 400:
        {
            fftN2( pComplexBuf, RotVector_400, 400, 20, 20, 2, x, 40 );
            s = add( *scale, SCALEFACTOR400 );
            BREAK;
        }

        case 480:
        {
            fftN2( pComplexBuf, RotVector_480, 480, 30, 16, 2, x, 60 );
            s = add( *scale, SCALEFACTOR480 );
            BREAK;
        }
        case 600:
        {
            fftN2( pComplexBuf, RotVector_600, 600, 30, 20, 2, x, 60 );
            s = add( *scale, SCALEFACTOR600 );
            BREAK;
        }
        default:
            /* Unsupported FFT length (debug builds only; s stays uninitialized
               use-wise because execution never continues past the assert). */
            assert( 0 );
    }
    /* Report the accumulated downscaling applied by the chosen kernel. */
    *scale = s;
    move16();
}
2266 :
2267 :
/* Forward-rFFT twiddle step: loads the mirrored bin pair x[2i] / x[sizeOfFft-2i]
 * (each pre-shifted right by 2 bits for headroom), forms their sums/differences,
 * and rotates the difference pair by the factor (w1, w2) into t3/t4; t1/t2 hold
 * the symmetric parts. Relies on i, sizeOfFft and Mpy_32_16_1 from the caller's
 * scope; written as a brace block, so call sites omit the trailing semicolon. */
#define RFFT_TWIDDLE1( x, t1, t2, t3, t4, w1, w2, xb0, xb1, xt0, xt1 ) \
    {                                                                  \
        xb0 = L_shr( x[2 * i + 0], 2 );                                \
        xb1 = L_shr( x[2 * i + 1], 2 );                                \
        xt0 = L_shr( x[sizeOfFft - 2 * i + 0], 2 );                    \
        xt1 = L_shr( x[sizeOfFft - 2 * i + 1], 2 );                    \
        t1 = L_sub( xb0, xt0 );                                        \
        t2 = L_add( xb1, xt1 );                                        \
        t3 = L_sub( Mpy_32_16_1( t1, w1 ), Mpy_32_16_1( t2, w2 ) );    \
        t4 = L_add( Mpy_32_16_1( t1, w2 ), Mpy_32_16_1( t2, w1 ) );    \
        t1 = L_add( xb0, xt0 );                                        \
        t2 = L_sub( xb1, xt1 );                                        \
    }

/* Inverse-rFFT variant of RFFT_TWIDDLE1: identical loads and sum/difference
 * terms, but the rotation uses the opposite sign on the cross products
 * (conjugated twiddle), matching the backward transform direction. */
#define RFFT_TWIDDLE2( x, t1, t2, t3, t4, w1, w2, xb0, xb1, xt0, xt1 ) \
    {                                                                  \
        xb0 = L_shr( x[2 * i + 0], 2 );                                \
        xb1 = L_shr( x[2 * i + 1], 2 );                                \
        xt0 = L_shr( x[sizeOfFft - 2 * i + 0], 2 );                    \
        xt1 = L_shr( x[sizeOfFft - 2 * i + 1], 2 );                    \
        t1 = L_sub( xb0, xt0 );                                        \
        t2 = L_add( xb1, xt1 );                                        \
        t3 = L_add( Mpy_32_16_1( t1, w1 ), Mpy_32_16_1( t2, w2 ) );    \
        t4 = L_sub( Mpy_32_16_1( t2, w1 ), Mpy_32_16_1( t1, w2 ) );    \
        t1 = L_add( xb0, xt0 );                                        \
        t2 = L_sub( xb1, xt1 );                                        \
    }
2295 :
2296 : /**
2297 : * \brief Real valued FFT
2298 : *
2299 : * forward rFFT (isign == -1):
2300 : * The input vector contains sizeOfFft real valued time samples. The output vector contains sizeOfFft/2 complex valued
2301 : * spectral values. The spectral values resides interleaved in the output vector. x[1] contains re[sizeOfFft], because
2302 : * x[1] is zero by default. This allows use of sizeOfFft length buffer instead of sizeOfFft+1.
2303 : *
2304 : * inverse rFFT (isign == +1):
2305 : * The input vector contains sizeOfFft complex valued spectral values. The output vector contains sizeOfFft real valued
2306 : * time samples. The spectral values resides interleaved in the input vector. x[1] contains re[sizeOfFft].
2307 : * (see also forward rFFT)
2308 : *
2309 : * \param [i/o] x real input / real and imag output interleaved
2310 : * \param [i ] sizeOfFft length of fft
2311 : * \param [i ] scale scalefactor
2312 : * \param [i ] isign forward (-1) / backward (+1)
2313 : *
2314 : * \return void
2315 : */
/* Real-valued FFT built on the half-length complex FFT: forward (isign == -1)
 * runs BASOP_cfft first and then untangles the spectrum of the real signal;
 * inverse (isign == +1) pre-twiddles, runs BASOP_cfft, then applies a
 * length-dependent normalization. See the block comment above for the packed
 * x[1] == re[sizeOfFft] spectrum layout. */
void BASOP_rfft( Word32 *x, Word16 sizeOfFft, Word16 *scale, Word16 isign )
{
    Word16 i, s = 0, sizeOfFft2, sizeOfFft4, sizeOfFft8, wstride; /* clear s to calm down compiler */
    Word32 t1, t2, t3, t4, xb0, xb1, xt0, xt1;
    const PWord16 *w1;
    Word16 c1;
    Word16 c2;
    Word32 workBuffer[2 * BASOP_CFFT_MAX_LENGTH];


    /* N/2, N/4, N/8 split points for the twiddle loops below. */
    sizeOfFft2 = shr( sizeOfFft, 1 );
    sizeOfFft4 = shr( sizeOfFft, 2 );
    sizeOfFft8 = shr( sizeOfFft, 3 );

    /* Fetch the sine table (w1) and its stride for the half-length transform. */
    BASOP_getTables( NULL, &w1, &wstride, sizeOfFft2 );

    SWITCH( isign )
    {
        case -1:
            /* Forward: complex FFT of length N/2 on the interleaved real input. */
            BASOP_cfft( (cmplx *) x, sizeOfFft2, scale, workBuffer );

            /* DC and Nyquist bins (both real); Nyquist is packed into x[1]. */
            xb0 = L_shr( x[0], 1 );
            xb1 = L_shr( x[1], 1 );
            x[0] = L_add( xb0, xb1 );
            move32();
            x[1] = L_sub( xb0, xb1 );
            move32();

            /* First octant: table read as (im, re) of the twiddle. */
            FOR( i = 1; i < sizeOfFft8; i++ )
            {
                RFFT_TWIDDLE1( x, t1, t2, t3, t4, w1[i * wstride].v.im, w1[i * wstride].v.re, xb0, xb1, xt0, xt1 )
                x[2 * i] = L_sub( t1, t3 );
                move32();
                x[2 * i + 1] = L_sub( t2, t4 );
                move32();
                x[sizeOfFft - 2 * i] = L_add( t1, t3 );
                move32();
                x[sizeOfFft - 2 * i + 1] = L_negate( L_add( t2, t4 ) );
                move32();
            }

            /* Second octant: same butterfly, twiddle mirrored about pi/4
               (table indexed from the top with re/im swapped). */
            FOR( i = sizeOfFft8; i < sizeOfFft4; i++ )
            {
                RFFT_TWIDDLE1( x, t1, t2, t3, t4, w1[( sizeOfFft4 - i ) * wstride].v.re, w1[( sizeOfFft4 - i ) * wstride].v.im, xb0, xb1, xt0, xt1 )
                x[2 * i] = L_sub( t1, t3 );
                move32();
                x[2 * i + 1] = L_sub( t2, t4 );
                move32();
                x[sizeOfFft - 2 * i] = L_add( t1, t3 );
                move32();
                x[sizeOfFft - 2 * i + 1] = L_negate( L_add( t2, t4 ) );
                move32();
            }

            /* Center bin (i == N/4): self-conjugate, only scaled and negated. */
            x[sizeOfFft - 2 * i] = L_shr( x[2 * i + 0], 1 );
            move32();
            x[sizeOfFft - 2 * i + 1] = L_negate( L_shr( x[2 * i + 1], 1 ) );
            move32();

            /* Account for the 1-bit headroom shift applied above. */
            *scale = add( *scale, 1 );
            move16();
            BREAK;

        case +1:
            /* Inverse: untangle DC/Nyquist first (x[1] holds re[N]). */
            xb0 = L_shr( x[0], 2 );
            xb1 = L_shr( x[1], 2 );
            x[0] = L_add( xb0, xb1 );
            move32();
            x[1] = L_sub( xb1, xb0 );
            move32();

            /* First octant pre-twiddle (conjugated rotation via RFFT_TWIDDLE2). */
            FOR( i = 1; i < sizeOfFft8; i++ )
            {
                RFFT_TWIDDLE2( x, t1, t2, t3, t4, w1[i * wstride].v.im, w1[i * wstride].v.re, xb0, xb1, xt0, xt1 )

                x[2 * i] = L_sub( t1, t3 );
                move32();
                x[2 * i + 1] = L_sub( t4, t2 );
                move32();
                x[sizeOfFft - 2 * i] = L_add( t1, t3 );
                move32();
                x[sizeOfFft - 2 * i + 1] = L_add( t2, t4 );
                move32();
            }

            /* Second octant: twiddle mirrored about pi/4, re/im swapped. */
            FOR( i = sizeOfFft8; i < sizeOfFft4; i++ )
            {
                RFFT_TWIDDLE2( x, t1, t2, t3, t4, w1[( sizeOfFft4 - i ) * wstride].v.re, w1[( sizeOfFft4 - i ) * wstride].v.im, xb0, xb1, xt0, xt1 )

                x[2 * i] = L_sub( t1, t3 );
                move32();
                x[2 * i + 1] = L_sub( t4, t2 );
                move32();
                x[sizeOfFft - 2 * i] = L_add( t1, t3 );
                move32();
                x[sizeOfFft - 2 * i + 1] = L_add( t2, t4 );
                move32();
            }

            /* Center bin (i == N/4): scaled copy only. */
            x[sizeOfFft - 2 * i] = L_shr( x[2 * i + 0], 1 );
            move32();
            x[sizeOfFft - 2 * i + 1] = L_shr( x[2 * i + 1], 1 );
            move32();

            /* Half-length complex FFT completes the inverse transform. */
            BASOP_cfft( (cmplx *) x, sizeOfFft2, scale, workBuffer );

            /* Normalization: non-power-of-two lengths scale by the fractional
               constants 0.8 / -0.8 (c1/c2); power-of-two lengths only need the
               imaginary parts conjugated. */
            SWITCH( sizeOfFft )
            {
                case 40:
                case 80:
                case 320:
                case 640:
                    c1 = FFTC( 0x66666680 );
                    move16();
                    c2 = FFTC( 0x99999980 );
                    move16();
                    FOR( i = 0; i < sizeOfFft2; i++ )
                    {
                        x[2 * i] = Mpy_32_xx( x[2 * i], c1 );
                        move32();
                        x[2 * i + 1] = Mpy_32_xx( x[2 * i + 1], c2 );
                        move32();
                    }
                    BREAK;

                case 64:
                case 256:
                case 512:
                    FOR( i = 0; i < sizeOfFft2; i++ )
                    {
                        x[2 * i + 1] = L_negate( x[2 * i + 1] );
                        move32();
                    }
                    BREAK;

                default:
                    assert( 0 );
            }

            /* NOTE(review): this final scale selection only supports sizeOfFft
               64, 512 and 640, although the normalization switch above accepts
               40, 80, 256 and 320 as well — those lengths would hit assert(0)
               here. The execution counters suggest only 512 and 640 are used on
               the inverse path; confirm before adding other lengths. */
            SWITCH( sizeOfFft )
            {
                case 64:
                    s = add( *scale, 2 - 6 );
                    BREAK;

                case 512:
                    s = add( *scale, 2 - 9 );
                    BREAK;

                case 640:
                    s = add( *scale, 2 - 9 );
                    BREAK;

                default:
                    assert( 0 );
            }
            *scale = s;
            move16();
            BREAK;
    }
}
|