LCOV - code coverage report
Current view: top level - lib_com - fft_evs.c (source / functions) Hit Total Coverage
Test: Coverage on main enc/dec/rend @ 3b2f07138c61dcf997bbf4165d0882f794b2995f Lines: 1384 1422 97.3 %
Date: 2025-05-03 01:55:50 Functions: 11 12 91.7 %

          Line data    Source code
       1             : /*====================================================================================
       2             :     EVS Codec 3GPP TS26.452 Aug 12, 2021. Version 16.3.0
       3             :   ====================================================================================*/
       4             : 
       5             : #include <assert.h>
       6             : #include "prot_fx.h"
       7             : #include "basop_util.h"
       8             : #include "rom_basop_util.h"
       9             : #include "rom_com.h"
      10             : #include "options.h"
      11             : #include "stl.h"
      12             : /************************************************************************/
      13             : /* FFT                                                                  */
      14             : /************************************************************************/
      15             : #define SCALEFACTOR16 ( 5 ) /* right shift applied to every input of the 16-point FFT when scaling is requested */
      16             : #define SCALEFACTOR20 ( 5 )
      17             : /* Forward declaration: fft16() calls this routine before its definition. */
      18             : /* A non-zero bScale selects the variant that pre-shifts the inputs by SCALEFACTOR16. */
      19             : void fft16_with_cmplx_data( cmplx *pInp, Word16 bScale );
      20             : 
      21             : /**
      22             :  * \brief Profiling / Precision results
      23             :  *
      24             :  *        Profiling / Precision of complex valued FFTs: BASOP_cfft()
      25             :  *
      26             :  *                       WOPS BASOP  Precision BASOP
      27             :  *        FFT5                   87     16.96
      28             :  *        FFT8                  108     17.04
      29             :  *        FFT10                 194     16.70
      30             :  *        FFT15                 354     16.97
      31             :  *        FFT16                 288     16.62
      32             :  *        FFT20                 368     16.06
      33             :  *        FFT30                 828     16.80
      34             :  *        FFT32                 752     15.45   (cplx mult with 3 mult and 3 add)
      35             :  *        FFT32                 824     16.07   (cplx mult with 4 mult and 2 add)
      36             :  *        FFT64  ( 8x 8)      3.129     15.16
      37             :  *        FFT80  (10x 8)      4.385     15.55
      38             :  *        FFT100 (20x 5)      6.518     15.65
      39             :  *        FFT120 (15x 8)      7.029     15.38
      40             :  *        FFT128 (16x 8)      6.777     15.28
      41             :  *        FFT160 (20x 8)      9.033     14.95
      42             :  *        FFT240 (30x 8)     14.961     15.49
      43             :  *        FFT256 (32x 8)     14.905     14.61   (cplx mult with 3 mult and 3 add)
      44             :  *        FFT256 (32x 8)     15.265     15.04   (cplx mult with 4 mult and 2 add)
      45             :  *        FFT320 (20x16)     21.517     15.21
      46             :  *
      47             :  *
      48             :  *        Profiling / Precision of real valued FFTs / iFFTs: BASOP_rfft()
      49             :  *
      50             :  *                       WOPS BASOP  Precision BASOP
      51             :  *        rFFT40                955     15.68
      52             :  *        rFFT64               1635     16.17
      53             :  *
      54             :  *        irFFT40              1116     15.36
      55             :  *        irFFT64              1759     15.18
      56             :  *
      57             :  */
      58             : 
      59             : 
      60             : #define Mpy_32_xx Mpy_32_16_1 /* multiply primitive used by the cplxMpy4_8_0 helper macro */
      61             : /* FFTC: casts a 32-bit constant to Word32 and passes it to WORD322WORD16; the argument is parenthesised so that an expression is cast as a whole. */
      62             : #define FFTC( x ) WORD322WORD16( (Word32) ( x ) )
      63             : 
      64             : #define C31 ( FFTC( 0x91261468 ) ) /* FL2WORD32( -0.86602540) -sqrt(3)/2 */
      65             : /* C52 and C55 are stored halved; the code using them shifts the product left by one bit. */
      66             : #define C51 ( FFTC( 0x79bc3854 ) ) /* FL2WORD32( 0.95105652)   */
      67             : #define C52 ( FFTC( 0x9d839db0 ) ) /* FL2WORD32(-1.53884180/2) */
      68             : #define C53 ( FFTC( 0xd18053ce ) ) /* FL2WORD32(-0.36327126)   */
      69             : #define C54 ( FFTC( 0x478dde64 ) ) /* FL2WORD32( 0.55901699)   */
      70             : #define C55 ( FFTC( 0xb0000001 ) ) /* FL2WORD32(-1.25/2)       */
      71             : 
      72             : #define C81 ( FFTC( 0x5a82799a ) ) /* FL2WORD32( 7.071067811865475e-1) */
      73             : #define C82 ( FFTC( 0xa57d8666 ) ) /* FL2WORD32(-7.071067811865475e-1) */
      74             : 
      75             : #define C161 ( FFTC( 0x5a82799a ) ) /* FL2WORD32( 7.071067811865475e-1)  INV_SQRT2    */
      76             : #define C162 ( FFTC( 0xa57d8666 ) ) /* FL2WORD32(-7.071067811865475e-1) -INV_SQRT2    */
      77             : 
      78             : #define C163 ( FFTC( 0x7641af3d ) ) /* FL2WORD32( 9.238795325112867e-1)  COS_PI_DIV8  */
      79             : #define C164 ( FFTC( 0x89be50c3 ) ) /* FL2WORD32(-9.238795325112867e-1) -COS_PI_DIV8  */
      80             : 
      81             : #define C165 ( FFTC( 0x30fbc54d ) ) /* FL2WORD32( 3.826834323650898e-1)  COS_3PI_DIV8 */
      82             : #define C166 ( FFTC( 0xcf043ab3 ) ) /* FL2WORD32(-3.826834323650898e-1) -COS_3PI_DIV8 */
      83             : /* Complex multiply helpers. Each expands to two plain statements (no do/while wrapper), so neither may be used as the unbraced body of an if/else or loop. */
      84             : /* cplxMpy4_8_0: re = ( a*c - b*d ) >> 1 and im = ( a*d + b*c ) >> 1, with the products formed by Mpy_32_xx. */
      85             : #define cplxMpy4_8_0( re, im, a, b, c, d )                          \
      86             :     re = L_shr( L_sub( Mpy_32_xx( a, c ), Mpy_32_xx( b, d ) ), 1 ); \
      87             :     im = L_shr( L_add( Mpy_32_xx( a, d ), Mpy_32_xx( b, c ) ), 1 );
      88             : /* cplxMpy4_8_1: no multiplication, only the halving: re = a >> 1 and im = b >> 1. */
      89             : #define cplxMpy4_8_1( re, im, a, b ) \
      90             :     re = L_shr( a, 1 );              \
      91             :     im = L_shr( b, 1 );
      92             : 
      93             : 
      94             : /**
      95             :  * \brief    Function performs a complex 5-point FFT
      96             :  *           The FFT is performed in place. Every input value is shifted
      97             :  *           right by SCALEFACTOR5 bits before the butterflies are computed.
      98             :  *
      99             :  *           WOPS with 32x16 bit multiplications:  88 cycles
     100             :  *
     101             :  * \param    [i/o] inp   complex input / output buffer, elements 0..4;
     102             :  *                       read in Qx and overwritten with the result
     103             :  *                       (the function takes no separate re / im / stride)
     104             :  *
     105             :  * \return   void
     106             :  */
     107       98160 : static void fft5_with_cmplx_data( cmplx *inp /*Qx*/ )
     108             : {
     109             :     cmplx x0, x1, x2, x3, x4;
     110             :     cmplx y1, y2, y3, y4;
     111             :     cmplx t;
     112             : 
     113       98160 :     x0 = CL_shr( inp[0], SCALEFACTOR5 ); // Qx - 4
     114       98160 :     x1 = CL_shr( inp[1], SCALEFACTOR5 ); // Qx - 4
     115       98160 :     x2 = CL_shr( inp[2], SCALEFACTOR5 ); // Qx - 4
     116       98160 :     x3 = CL_shr( inp[3], SCALEFACTOR5 ); // Qx - 4
     117       98160 :     x4 = CL_shr( inp[4], SCALEFACTOR5 ); // Qx - 4
     118             : 
     119       98160 :     y1 = CL_add( x1, x4 );
     120       98160 :     y4 = CL_sub( x1, x4 );
     121       98160 :     y3 = CL_add( x2, x3 );
     122       98160 :     y2 = CL_sub( x2, x3 );
     123       98160 :     t = CL_scale_t( CL_sub( y1, y3 ), C54 );
     124       98160 :     y1 = CL_add( y1, y3 );
     125       98160 :     inp[0] = CL_add( x0, y1 );
     126             : 
     127             :     /* C55 is stored halved ( -1.25/2 ) so that it fits the fractional representation;
     128             :     the left shift by one bit restores the intended factor */
     129       98160 :     y1 = CL_add( inp[0], ( CL_shl( CL_scale_t( y1, C55 ), 1 ) ) );
     130       98160 :     y3 = CL_sub( y1, t );
     131       98160 :     y1 = CL_add( y1, t );
     132             : 
     133       98160 :     t = CL_scale_t( CL_add( y4, y2 ), C51 );
     134             :     /* C52 is stored halved ( -1.53884180/2 ) for the same reason, hence the left shift by one bit;
     135             :     the C53 product is used without a shift */
     136       98160 :     y4 = CL_add( t, CL_shl( CL_scale_t( y4, C52 ), 1 ) );
     137       98160 :     y2 = CL_add( t, CL_scale_t( y2, C53 ) );
     138             : 
     139             : 
     140             :     /* combination */
     141       98160 :     inp[1] = CL_msu_j( y1, y2 );
     142       98160 :     inp[4] = CL_mac_j( y1, y2 );
     143             : 
     144       98160 :     inp[2] = CL_mac_j( y3, y4 );
     145       98160 :     inp[3] = CL_msu_j( y3, y4 );
     146             : 
     147             : #ifdef WMOPS
     148             :     multiCounter[currCounter].CL_move += 5; /* instrumentation only; presumably books the 5 result stores - TODO confirm */
     149             : #endif
     150       98160 : }
     151             : 
     152             : /**
     153             :  * \brief    Function performs a complex 8-point FFT
     154             :  *           The FFT is performed in place. Every input value is shifted
     155             :  *           right by SCALEFACTOR8 bits before the butterflies are computed.
     156             :  *
     157             :  *           WOPS with 32x16 bit multiplications: 108 cycles
     158             :  *
     159             :  * \param    [i/o] inp   complex input / output buffer, elements 0..7;
     160             :  *                       read in Qx and overwritten with the result
     161             :  *                       (the function takes no separate re / im / stride)
     162             :  *
     163             :  * \return   void
     164             :  */
     165     2368776 : static void fft8_with_cmplx_data( cmplx *inp /*Qx*/ )
     166             : {
     167             :     cmplx x0, x1, x2, x3, x4, x5, x6, x7;
     168             :     cmplx s0, s1, s2, s3, s4, s5, s6, s7;
     169             :     cmplx t0, t1, t2, t3, t4, t5, t6, t7;
     170             : 
     171             :     /* Input scaling, followed by the pre-additions */
     172     2368776 :     x0 = CL_shr( inp[0], SCALEFACTOR8 ); // Qx - 4
     173     2368776 :     x1 = CL_shr( inp[1], SCALEFACTOR8 );
     174     2368776 :     x2 = CL_shr( inp[2], SCALEFACTOR8 );
     175     2368776 :     x3 = CL_shr( inp[3], SCALEFACTOR8 );
     176     2368776 :     x4 = CL_shr( inp[4], SCALEFACTOR8 );
     177     2368776 :     x5 = CL_shr( inp[5], SCALEFACTOR8 );
     178     2368776 :     x6 = CL_shr( inp[6], SCALEFACTOR8 );
     179     2368776 :     x7 = CL_shr( inp[7], SCALEFACTOR8 );
     180             : 
     181             :     /* loops are unrolled: sum and difference of the input pairs ( k, k + 4 ) */
     182             :     {
     183     2368776 :         t0 = CL_add( x0, x4 );
     184     2368776 :         t1 = CL_sub( x0, x4 );
     185             : 
     186     2368776 :         t2 = CL_add( x1, x5 );
     187     2368776 :         t3 = CL_sub( x1, x5 );
     188             : 
     189     2368776 :         t4 = CL_add( x2, x6 );
     190     2368776 :         t5 = CL_sub( x2, x6 );
     191             : 
     192     2368776 :         t6 = CL_add( x3, x7 );
     193     2368776 :         t7 = CL_sub( x3, x7 );
     194             :     }
     195             : 
     196             :     /* Pre-additions and core multiplications */
     197             : 
     198     2368776 :     s0 = CL_add( t0, t4 );
     199     2368776 :     s2 = CL_sub( t0, t4 );
     200             : 
     201     2368776 :     s4 = CL_mac_j( t1, t5 );
     202     2368776 :     s5 = CL_msu_j( t1, t5 );
     203             : 
     204     2368776 :     s1 = CL_add( t2, t6 );
     205     2368776 :     s3 = CL_sub( t2, t6 );
     206     2368776 :     s3 = CL_mul_j( s3 );
     207             : 
     208     2368776 :     t0 = CL_add( t3, t7 ); /* t0 and t1 are reused as scratch from here on */
     209     2368776 :     t1 = CL_sub( t3, t7 );
     210             : 
     211     2368776 :     s6 = CL_scale_t( CL_msu_j( t1, t0 ), C81 );                            /* C81 = 1/sqrt(2) */
     212     2368776 :     s7 = CL_dscale_t( CL_swap_real_imag( CL_msu_j( t0, t1 ) ), C81, C82 ); /* C82 = -1/sqrt(2) */
     213             : 
     214             :     /* Post-additions */
     215             : 
     216     2368776 :     inp[0] = CL_add( s0, s1 );
     217     2368776 :     inp[4] = CL_sub( s0, s1 );
     218             : 
     219     2368776 :     inp[2] = CL_sub( s2, s3 );
     220     2368776 :     inp[6] = CL_add( s2, s3 );
     221             : 
     222     2368776 :     inp[3] = CL_add( s4, s7 );
     223     2368776 :     inp[7] = CL_sub( s4, s7 );
     224             : 
     225     2368776 :     inp[1] = CL_add( s5, s6 );
     226     2368776 :     inp[5] = CL_sub( s5, s6 );
     227             : #ifdef WMOPS
     228             :     multiCounter[currCounter].CL_move += 8; /* instrumentation only; presumably books the 8 result stores - TODO confirm */
     229             : #endif
     230     2368776 : }
     231             : 
     232             : 
     233             : /**
     234             :  * \brief    Function performs a complex 10-point FFT
     235             :  *           The FFT is performed in place. Every input value is shifted
     236             :  *           right by SCALEFACTOR10 bits before the butterflies are computed.
     237             :  *
     238             :  *           WOPS with 32x16 bit multiplications:  196 cycles
     239             :  *
     240             :  * \param    [i/o] inp_data  complex input / output buffer, elements 0..9;
     241             :  *                           read in Qx and overwritten with the result
     242             :  *                           (the function takes no separate re / im / stride)
     243             :  *
     244             :  * \return   void
     245             :  */
     246             : 
     247     1294944 : static void fft10_with_cmplx_data( cmplx *inp_data /*Qx*/ )
     248             : {
     249             :     cmplx r1, r2, r3, r4;
     250             :     cmplx x0, x1, x2, x3, x4, t;
     251             :     cmplx y[10];
     252             : 
     253             :     /* FOR i=0: 5-point stage on inputs 0, 2, 4, 6, 8; results go to y[0], y[2], y[4], y[6], y[8] */
     254             :     {
     255     1294944 :         x0 = CL_shr( inp_data[0], SCALEFACTOR10 ); // Qx - 5
     256     1294944 :         x1 = CL_shr( inp_data[2], SCALEFACTOR10 );
     257     1294944 :         x2 = CL_shr( inp_data[4], SCALEFACTOR10 );
     258     1294944 :         x3 = CL_shr( inp_data[6], SCALEFACTOR10 );
     259     1294944 :         x4 = CL_shr( inp_data[8], SCALEFACTOR10 );
     260             : 
     261     1294944 :         r1 = CL_add( x3, x2 );
     262     1294944 :         r4 = CL_sub( x3, x2 );
     263     1294944 :         r3 = CL_add( x1, x4 );
     264     1294944 :         r2 = CL_sub( x1, x4 );
     265     1294944 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     266     1294944 :         r1 = CL_add( r1, r3 );
     267     1294944 :         y[0] = CL_add( x0, r1 );
     268     1294944 :         r1 = CL_add( y[0], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) ); /* C55 is stored halved, hence the shift */
     269     1294944 :         r3 = CL_sub( r1, t );
     270     1294944 :         r1 = CL_add( r1, t );
     271     1294944 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     272     1294944 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) ); /* C52 is stored halved, hence the shift */
     273     1294944 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     274             : 
     275             : 
     276     1294944 :         y[2] = CL_msu_j( r1, r2 );
     277     1294944 :         y[8] = CL_mac_j( r1, r2 );
     278     1294944 :         y[4] = CL_mac_j( r3, r4 );
     279     1294944 :         y[6] = CL_msu_j( r3, r4 );
     280             :     }
     281             :     /* FOR i=1: 5-point stage on inputs 5, 1, 3, 7, 9; results go to y[1], y[3], y[5], y[7], y[9] */
     282             :     {
     283     1294944 :         x0 = CL_shr( inp_data[5], SCALEFACTOR10 ); // Qx - 5
     284     1294944 :         x1 = CL_shr( inp_data[1], SCALEFACTOR10 );
     285     1294944 :         x2 = CL_shr( inp_data[3], SCALEFACTOR10 );
     286     1294944 :         x3 = CL_shr( inp_data[7], SCALEFACTOR10 );
     287     1294944 :         x4 = CL_shr( inp_data[9], SCALEFACTOR10 );
     288             : 
     289     1294944 :         r1 = CL_add( x1, x4 );
     290     1294944 :         r4 = CL_sub( x1, x4 );
     291     1294944 :         r3 = CL_add( x3, x2 );
     292     1294944 :         r2 = CL_sub( x3, x2 );
     293     1294944 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     294     1294944 :         r1 = CL_add( r1, r3 );
     295     1294944 :         y[1] = CL_add( x0, r1 );
     296     1294944 :         r1 = CL_add( y[1], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     297     1294944 :         r3 = CL_sub( r1, t );
     298     1294944 :         r1 = CL_add( r1, t );
     299     1294944 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     300     1294944 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     301     1294944 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     302             : 
     303             : 
     304     1294944 :         y[3] = CL_msu_j( r1, r2 );
     305     1294944 :         y[9] = CL_mac_j( r1, r2 );
     306     1294944 :         y[5] = CL_mac_j( r3, r4 );
     307     1294944 :         y[7] = CL_msu_j( r3, r4 );
     308             :     }
     309             : 
     310             :     /* FOR i=0 (final stage: sum and difference of each pair y[i], y[i + 1]) */
     311             :     {
     312     1294944 :         inp_data[0] = CL_add( y[0], y[1] );
     313     1294944 :         inp_data[5] = CL_sub( y[0], y[1] );
     314             :     }
     315             :     /* FOR i=2 */
     316             :     {
     317     1294944 :         inp_data[2] = CL_add( y[2], y[3] );
     318     1294944 :         inp_data[7] = CL_sub( y[2], y[3] );
     319             :     }
     320             :     /* FOR i=4 */
     321             :     {
     322     1294944 :         inp_data[4] = CL_add( y[4], y[5] );
     323     1294944 :         inp_data[9] = CL_sub( y[4], y[5] );
     324             :     }
     325             :     /* FOR i=6 */
     326             :     {
     327     1294944 :         inp_data[6] = CL_add( y[6], y[7] );
     328     1294944 :         inp_data[1] = CL_sub( y[6], y[7] );
     329             :     }
     330             :     /* FOR i=8 */
     331             :     {
     332     1294944 :         inp_data[8] = CL_add( y[8], y[9] );
     333     1294944 :         inp_data[3] = CL_sub( y[8], y[9] );
     334             :     }
     335             : 
     336             : #ifdef WMOPS
     337             :     multiCounter[currCounter].CL_move += 10; /* instrumentation only; presumably books the 10 result stores - TODO confirm */
     338             : #endif
     339     1294944 : }
     340             : 
     341             : 
     342             : /**
     343             :  * \brief    Function performs a complex 15-point FFT
     344             :  *           The FFT is performed in place. Every input value is shifted
     345             :  *           right by SCALEFACTOR15 bits before the butterflies are computed.
     346             :  *
     347             :  *           WOPS with 32x16 bit multiplications:  354 cycles
     348             :  *
     349             :  * \param    [i/o] inp_data  complex input / output buffer, elements 0..14;
     350             :  *                           read in Qx and overwritten with the result
     351             :  *                           (the function takes no separate re / im / stride)
     352             :  *
     353             :  * \return   void
     354             :  */
     355             : 
     356      532032 : static void fft15_with_cmplx_data( cmplx *inp_data /*Qx*/ )
     357             : {
     358             :     cmplx c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14;
     359             :     cmplx c_z0, c_z1, c_z2, c_z3, c_z4, c_z5, c_z6, c_z7, c_z8, c_z9, c_z10, c_z11, c_z12, c_z13, c_z14;
     360             :     cmplx c_y1, c_y2, c_y3, c_y4;
     361             :     cmplx c_t;
     362             : 
     363      532032 :     c0 = CL_shr( inp_data[0], SCALEFACTOR15 ); // Qx - 5
     364      532032 :     c1 = CL_shr( inp_data[3], SCALEFACTOR15 );
     365      532032 :     c2 = CL_shr( inp_data[6], SCALEFACTOR15 );
     366      532032 :     c3 = CL_shr( inp_data[9], SCALEFACTOR15 );
     367      532032 :     c4 = CL_shr( inp_data[12], SCALEFACTOR15 );
     368      532032 :     c5 = CL_shr( inp_data[5], SCALEFACTOR15 );
     369      532032 :     c6 = CL_shr( inp_data[8], SCALEFACTOR15 );
     370      532032 :     c7 = CL_shr( inp_data[11], SCALEFACTOR15 );
     371      532032 :     c8 = CL_shr( inp_data[14], SCALEFACTOR15 );
     372      532032 :     c9 = CL_shr( inp_data[2], SCALEFACTOR15 );
     373      532032 :     c10 = CL_shr( inp_data[10], SCALEFACTOR15 );
     374      532032 :     c11 = CL_shr( inp_data[13], SCALEFACTOR15 );
     375      532032 :     c12 = CL_shr( inp_data[1], SCALEFACTOR15 );
     376      532032 :     c13 = CL_shr( inp_data[4], SCALEFACTOR15 );
     377      532032 :     c14 = CL_shr( inp_data[7], SCALEFACTOR15 );
     378             : 
     379             :     /* 1. FFT5 stage: c0..c4 -> c_z0..c_z4 (C52 and C55 are stored halved, hence the CL_shl by 1) */
     380      532032 :     c_y1 = CL_add( c1, c4 );
     381      532032 :     c_y4 = CL_sub( c1, c4 );
     382      532032 :     c_y3 = CL_add( c2, c3 );
     383      532032 :     c_y2 = CL_sub( c2, c3 );
     384      532032 :     c_t = CL_scale_t( CL_sub( c_y1, c_y3 ), C54 );
     385      532032 :     c_y1 = CL_add( c_y1, c_y3 );
     386      532032 :     c_z0 = CL_add( c0, c_y1 );
     387      532032 :     c_y1 = CL_add( c_z0, ( CL_shl( CL_scale_t( c_y1, C55 ), 1 ) ) );
     388      532032 :     c_y3 = CL_sub( c_y1, c_t );
     389      532032 :     c_y1 = CL_add( c_y1, c_t );
     390      532032 :     c_t = CL_scale_t( CL_add( c_y4, c_y2 ), C51 );
     391      532032 :     c_y4 = CL_add( c_t, CL_shl( CL_scale_t( c_y4, C52 ), 1 ) );
     392      532032 :     c_y2 = CL_add( c_t, CL_scale_t( c_y2, C53 ) );
     393             : 
     394             :     /* combination */
     395      532032 :     c_z1 = CL_msu_j( c_y1, c_y2 );
     396      532032 :     c_z2 = CL_mac_j( c_y3, c_y4 );
     397      532032 :     c_z3 = CL_msu_j( c_y3, c_y4 );
     398      532032 :     c_z4 = CL_mac_j( c_y1, c_y2 );
     399             : 
     400             : 
     401             :     /* 2. FFT5 stage: c5..c9 -> c_z5..c_z9 */
     402      532032 :     c_y1 = CL_add( c6, c9 );
     403      532032 :     c_y4 = CL_sub( c6, c9 );
     404      532032 :     c_y3 = CL_add( c7, c8 );
     405      532032 :     c_y2 = CL_sub( c7, c8 );
     406      532032 :     c_t = CL_scale_t( CL_sub( c_y1, c_y3 ), C54 );
     407      532032 :     c_y1 = CL_add( c_y1, c_y3 );
     408      532032 :     c_z5 = CL_add( c5, c_y1 );
     409      532032 :     c_y1 = CL_add( c_z5, ( CL_shl( CL_scale_t( c_y1, C55 ), 1 ) ) );
     410      532032 :     c_y3 = CL_sub( c_y1, c_t );
     411      532032 :     c_y1 = CL_add( c_y1, c_t );
     412      532032 :     c_t = CL_scale_t( CL_add( c_y4, c_y2 ), C51 );
     413      532032 :     c_y4 = CL_add( c_t, CL_shl( CL_scale_t( c_y4, C52 ), 1 ) );
     414      532032 :     c_y2 = CL_add( c_t, CL_scale_t( c_y2, C53 ) );
     415             :     /* combination */
     416      532032 :     c_z6 = CL_msu_j( c_y1, c_y2 );
     417      532032 :     c_z7 = CL_mac_j( c_y3, c_y4 );
     418      532032 :     c_z8 = CL_msu_j( c_y3, c_y4 );
     419      532032 :     c_z9 = CL_mac_j( c_y1, c_y2 );
     420             : 
     421             : 
     422             :     /* 3. FFT5 stage: c10..c14 -> c_z10..c_z14 */
     423             : 
     424      532032 :     c_y1 = CL_add( c11, c14 );
     425      532032 :     c_y4 = CL_sub( c11, c14 );
     426      532032 :     c_y3 = CL_add( c12, c13 );
     427      532032 :     c_y2 = CL_sub( c12, c13 );
     428      532032 :     c_t = CL_scale_t( CL_sub( c_y1, c_y3 ), C54 );
     429      532032 :     c_y1 = CL_add( c_y1, c_y3 );
     430      532032 :     c_z10 = CL_add( c10, c_y1 );
     431      532032 :     c_y1 = CL_add( c_z10, ( CL_shl( CL_scale_t( c_y1, C55 ), 1 ) ) );
     432      532032 :     c_y3 = CL_sub( c_y1, c_t );
     433      532032 :     c_y1 = CL_add( c_y1, c_t );
     434      532032 :     c_t = CL_scale_t( CL_add( c_y4, c_y2 ), C51 );
     435      532032 :     c_y4 = CL_add( c_t, CL_shl( CL_scale_t( c_y4, C52 ), 1 ) );
     436      532032 :     c_y2 = CL_add( c_t, CL_scale_t( c_y2, C53 ) );
     437             :     /* combination */
     438      532032 :     c_z11 = CL_msu_j( c_y1, c_y2 );
     439      532032 :     c_z12 = CL_mac_j( c_y3, c_y4 );
     440      532032 :     c_z13 = CL_msu_j( c_y3, c_y4 );
     441      532032 :     c_z14 = CL_mac_j( c_y1, c_y2 );
     442             : 
     443             : 
     444             :     /* 1. FFT3 stage: combines c_z0, c_z5, c_z10 (C31 = -sqrt(3)/2; the sum is halved with CL_shr by 1) */
     445      532032 :     c_y1 = CL_add( c_z5, c_z10 );
     446      532032 :     c_y2 = CL_scale_t( CL_sub( c_z5, c_z10 ), C31 );
     447      532032 :     inp_data[0] = CL_add( c_z0, c_y1 );
     448      532032 :     c_y1 = CL_sub( c_z0, CL_shr( c_y1, 1 ) );
     449      532032 :     inp_data[10] = CL_mac_j( c_y1, c_y2 );
     450      532032 :     inp_data[5] = CL_msu_j( c_y1, c_y2 );
     451             : 
     452             :     /* 2. FFT3 stage: combines c_z1, c_z6, c_z11 */
     453      532032 :     c_y1 = CL_add( c_z6, c_z11 );
     454      532032 :     c_y2 = CL_scale_t( CL_sub( c_z6, c_z11 ), C31 );
     455      532032 :     inp_data[6] = CL_add( c_z1, c_y1 );
     456      532032 :     c_y1 = CL_sub( c_z1, CL_shr( c_y1, 1 ) );
     457      532032 :     inp_data[1] = CL_mac_j( c_y1, c_y2 );
     458      532032 :     inp_data[11] = CL_msu_j( c_y1, c_y2 );
     459             : 
     460             :     /* 3. FFT3 stage: combines c_z2, c_z7, c_z12 */
     461      532032 :     c_y1 = CL_add( c_z7, c_z12 );
     462      532032 :     c_y2 = CL_scale_t( CL_sub( c_z7, c_z12 ), C31 );
     463      532032 :     inp_data[12] = CL_add( c_z2, c_y1 );
     464      532032 :     c_y1 = CL_sub( c_z2, CL_shr( c_y1, 1 ) );
     465      532032 :     inp_data[7] = CL_mac_j( c_y1, c_y2 );
     466      532032 :     inp_data[2] = CL_msu_j( c_y1, c_y2 );
     467             : 
     468             : 
     469             :     /* 4. FFT3 stage: combines c_z3, c_z8, c_z13 */
     470      532032 :     c_y1 = CL_add( c_z8, c_z13 );
     471      532032 :     c_y2 = CL_scale_t( CL_sub( c_z8, c_z13 ), C31 );
     472      532032 :     inp_data[3] = CL_add( c_z3, c_y1 );
     473      532032 :     c_y1 = CL_sub( c_z3, CL_shr( c_y1, 1 ) );
     474      532032 :     inp_data[13] = CL_mac_j( c_y1, c_y2 );
     475      532032 :     inp_data[8] = CL_msu_j( c_y1, c_y2 );
     476             : 
     477             : 
     478             :     /* 5. FFT3 stage: combines c_z4, c_z9, c_z14 */
     479      532032 :     c_y1 = CL_add( c_z9, c_z14 );
     480      532032 :     c_y2 = CL_scale_t( CL_sub( c_z9, c_z14 ), C31 );
     481      532032 :     inp_data[9] = CL_add( c_z4, c_y1 );
     482      532032 :     c_y1 = CL_sub( c_z4, CL_shr( c_y1, 1 ) );
     483      532032 :     inp_data[4] = CL_mac_j( c_y1, c_y2 );
     484      532032 :     inp_data[14] = CL_msu_j( c_y1, c_y2 );
     485             : 
     486             : #ifdef WMOPS
     487             :     multiCounter[currCounter].CL_move += 15; /* instrumentation only; presumably books the 15 result stores - TODO confirm */
     488             : #endif
     489      532032 : }
     490             : 
     491             : 
     492             : /**
     493             :  * \brief    Function performs a complex 16-point FFT
     494             :  *           The FFT is performed in place by fft16_with_cmplx_data(), which
     495             :  *           shifts the inputs right by SCALEFACTOR16 bits when bScale is non-zero.
     496             :  *
     497             :  *           WOPS with 32x16 bit multiplications (scale on ):  288 cycles
     498             :  *           WOPS with 32x16 bit multiplications (scale off):  256 cycles
     499             :  *
     500             :  * \param    [i/o] re    real input / output Qx
     501             :  * \param    [i/o] im    imag input / output Qx (not accessed when s == 2)
     502             :  * \param    [i  ] s     stride real and imag input / output
     503             :  * \param    [i  ] bScale  passed through to fft16_with_cmplx_data() unchanged
     504             :  * \return   void
     505             :  */
     506           0 : void fft16( Word32 *re, Word32 *im, Word16 s, Word16 bScale )
     507             : {
     508             :     Word16 i;
     509           0 :     if ( s == 2 )
     510             :     { /* stride 2: re is reinterpreted as a cmplx array and transformed directly; presumably re/im are interleaved with im == re + 1 - TODO confirm with callers */
     511           0 :         fft16_with_cmplx_data( (cmplx *) re, bScale );
     512             :     }
     513             :     else
     514             :     { /* any other stride: gather the 16 strided samples into a local cmplx buffer, transform it, then scatter the results back */
     515             :         cmplx inp_data[16];
     516           0 :         FOR( i = 0; i < 16; i++ )
     517             :         {
     518           0 :             inp_data[i] = CL_form( re[s * i], im[s * i] );
     519           0 :             move64();
     520             :         }
     521           0 :         fft16_with_cmplx_data( inp_data, bScale );
     522           0 :         FOR( i = 0; i < 16; i++ )
     523             :         {
     524           0 :             re[s * i] = CL_Extract_real( inp_data[i] );
     525           0 :             move32();
     526           0 :             im[s * i] = CL_Extract_imag( inp_data[i] );
     527           0 :             move32();
     528             :         }
     529             :     }
     530           0 : }
     531             : 
     532    53078086 : void fft16_with_cmplx_data( cmplx *input /*Qx*/, Word16 bScale )
     533             : {
     534             :     cmplx x0, x1, x2, x3, temp;
     535             :     cmplx t0, t2, t4, t6, t7;
     536             :     cmplx y[16];
     537             : 
     538    53078086 :     IF( bScale )
     539             :     {
     540             :         {
     541     2115592 :             x0 = CL_shr( input[0], SCALEFACTOR16 ); // Qx - 5
     542     2115592 :             x1 = CL_shr( input[4], SCALEFACTOR16 );
     543     2115592 :             x2 = CL_shr( input[8], SCALEFACTOR16 );
     544     2115592 :             x3 = CL_shr( input[12], SCALEFACTOR16 );
     545     2115592 :             t0 = CL_add( x0, x2 );
     546     2115592 :             t2 = CL_sub( x0, x2 );
     547     2115592 :             t4 = CL_add( x1, x3 );
     548     2115592 :             t6 = CL_sub( x1, x3 );
     549     2115592 :             t6 = CL_mul_j( t6 );
     550     2115592 :             y[0] = CL_add( t0, t4 );
     551     2115592 :             y[1] = CL_sub( t2, t6 );
     552     2115592 :             y[2] = CL_sub( t0, t4 );
     553     2115592 :             y[3] = CL_add( t2, t6 );
     554             : 
     555             : 
     556     2115592 :             x0 = CL_shr( input[1], SCALEFACTOR16 ); // Qx - 5
     557     2115592 :             x1 = CL_shr( input[5], SCALEFACTOR16 );
     558     2115592 :             x2 = CL_shr( input[9], SCALEFACTOR16 );
     559     2115592 :             x3 = CL_shr( input[13], SCALEFACTOR16 );
     560     2115592 :             t0 = CL_add( x0, x2 );
     561     2115592 :             t2 = CL_sub( x0, x2 );
     562     2115592 :             t4 = CL_add( x1, x3 );
     563     2115592 :             t6 = CL_sub( x1, x3 );
     564     2115592 :             t6 = CL_mul_j( t6 );
     565     2115592 :             y[4] = CL_add( t0, t4 );
     566     2115592 :             y[5] = CL_sub( t2, t6 );
     567     2115592 :             y[6] = CL_sub( t0, t4 );
     568     2115592 :             y[7] = CL_add( t2, t6 );
     569             : 
     570             : 
     571     2115592 :             x0 = CL_shr( input[2], SCALEFACTOR16 ); // Qx - 5
     572     2115592 :             x1 = CL_shr( input[6], SCALEFACTOR16 );
     573     2115592 :             x2 = CL_shr( input[10], SCALEFACTOR16 );
     574     2115592 :             x3 = CL_shr( input[14], SCALEFACTOR16 );
     575     2115592 :             t0 = CL_add( x0, x2 );
     576     2115592 :             t2 = CL_sub( x0, x2 );
     577     2115592 :             t4 = CL_add( x1, x3 );
     578     2115592 :             t6 = CL_sub( x1, x3 );
     579     2115592 :             t6 = CL_mul_j( t6 );
     580     2115592 :             y[8] = CL_add( t0, t4 );
     581     2115592 :             y[9] = CL_sub( t2, t6 );
     582     2115592 :             y[10] = CL_sub( t4, t0 );
     583     2115592 :             y[10] = CL_mul_j( y[10] );
     584     2115592 :             y[11] = CL_add( t2, t6 );
     585             : 
     586             : 
     587     2115592 :             x0 = CL_shr( input[3], SCALEFACTOR16 ); // Qx - 5
     588     2115592 :             x1 = CL_shr( input[7], SCALEFACTOR16 );
     589     2115592 :             x2 = CL_shr( input[11], SCALEFACTOR16 );
     590     2115592 :             x3 = CL_shr( input[15], SCALEFACTOR16 );
     591     2115592 :             t0 = CL_add( x0, x2 );
     592     2115592 :             t2 = CL_sub( x0, x2 );
     593     2115592 :             t4 = CL_add( x1, x3 );
     594     2115592 :             t6 = CL_sub( x1, x3 );
     595     2115592 :             t6 = CL_mul_j( t6 );
     596     2115592 :             y[12] = CL_add( t0, t4 );
     597     2115592 :             y[13] = CL_sub( t2, t6 );
     598     2115592 :             y[14] = CL_sub( t0, t4 );
     599     2115592 :             y[15] = CL_add( t2, t6 );
     600             :         }
     601             :     }
     602             :     else
     603             :     {
     604             :         {
     605    50962494 :             t0 = CL_add( input[0], input[8] );
     606    50962494 :             t2 = CL_sub( input[0], input[8] );
     607    50962494 :             t4 = CL_add( input[4], input[12] );
     608    50962494 :             t7 = CL_sub( input[4], input[12] );
     609             : 
     610    50962494 :             y[0] = CL_add( t0, t4 );
     611    50962494 :             y[1] = CL_msu_j( t2, t7 );
     612    50962494 :             y[2] = CL_sub( t0, t4 );
     613    50962494 :             y[3] = CL_mac_j( t2, t7 );
     614             :         }
     615             :         /* i=1 */
     616             :         {
     617    50962494 :             t0 = CL_add( input[1], input[9] );
     618    50962494 :             t2 = CL_sub( input[1], input[9] );
     619    50962494 :             t4 = CL_add( input[5], input[13] );
     620    50962494 :             t7 = CL_sub( input[5], input[13] );
     621             : 
     622    50962494 :             y[4] = CL_add( t0, t4 );
     623    50962494 :             y[5] = CL_msu_j( t2, t7 );
     624    50962494 :             y[6] = CL_sub( t0, t4 );
     625    50962494 :             y[7] = CL_mac_j( t2, t7 );
     626             :         }
     627             :         /* i=2 */
     628             :         {
     629    50962494 :             t0 = CL_add( input[2], input[10] );
     630    50962494 :             t2 = CL_sub( input[2], input[10] );
     631    50962494 :             t4 = CL_add( input[6], input[14] );
     632    50962494 :             t7 = CL_sub( input[6], input[14] );
     633             : 
     634    50962494 :             y[8] = CL_add( t0, t4 );
     635    50962494 :             y[9] = CL_msu_j( t2, t7 );
     636    50962494 :             temp = CL_sub( t0, t4 );
     637    50962494 :             y[10] = CL_negate( CL_mul_j( temp ) );
     638    50962494 :             y[11] = CL_mac_j( t2, t7 );
     639             :         }
     640             :         /* i=3 */
     641             :         {
     642    50962494 :             t0 = CL_add( input[3], input[11] );
     643    50962494 :             t2 = CL_sub( input[3], input[11] );
     644    50962494 :             t4 = CL_add( input[7], input[15] );
     645    50962494 :             t7 = CL_sub( input[7], input[15] );
     646             : 
     647    50962494 :             y[12] = CL_add( t0, t4 );
     648    50962494 :             y[13] = CL_msu_j( t2, t7 );
     649    50962494 :             y[14] = CL_sub( t0, t4 );
     650    50962494 :             y[15] = CL_mac_j( t2, t7 );
     651             :         }
     652             :     }
     653             : 
     654    53078086 :     x0 = CL_scale_t( y[11], C162 );
     655    53078086 :     y[11] = CL_mac_j( x0, x0 );
     656             : 
     657    53078086 :     x0 = CL_scale_t( y[14], C162 );
     658    53078086 :     y[14] = CL_mac_j( x0, x0 );
     659             : 
     660    53078086 :     x0 = CL_scale_t( y[6], C161 );
     661    53078086 :     y[6] = CL_msu_j( x0, x0 );
     662             : 
     663    53078086 :     x0 = CL_scale_t( y[9], C161 );
     664    53078086 :     y[9] = CL_msu_j( x0, x0 );
     665             : 
     666    53078086 :     y[5] = CL_mac_j( CL_scale_t( y[5], C163 ), CL_scale_t( y[5], C166 ) );
     667    53078086 :     y[7] = CL_mac_j( CL_scale_t( y[7], C165 ), CL_scale_t( y[7], C164 ) );
     668    53078086 :     y[13] = CL_mac_j( CL_scale_t( y[13], C165 ), CL_scale_t( y[13], C164 ) );
     669    53078086 :     y[15] = CL_mac_j( CL_scale_t( y[15], C164 ), CL_scale_t( y[15], C165 ) );
     670             : 
     671             : 
     672             :     /* i=0 */
     673             :     {
     674    53078086 :         t0 = CL_add( y[0], y[8] );
     675    53078086 :         t2 = CL_sub( y[0], y[8] );
     676    53078086 :         t4 = CL_add( y[4], y[12] );
     677    53078086 :         t7 = CL_sub( y[4], y[12] );
     678             : 
     679    53078086 :         input[0] = CL_add( t0, t4 );
     680    53078086 :         input[4] = CL_msu_j( t2, t7 );
     681    53078086 :         input[8] = CL_sub( t0, t4 );
     682    53078086 :         input[12] = CL_mac_j( t2, t7 );
     683             :     }
     684             :     /* i=1 */
     685             :     {
     686    53078086 :         t0 = CL_add( y[1], y[9] );
     687    53078086 :         t2 = CL_sub( y[1], y[9] );
     688    53078086 :         t4 = CL_add( y[5], y[13] );
     689    53078086 :         t7 = CL_sub( y[5], y[13] );
     690             : 
     691    53078086 :         input[1] = CL_add( t0, t4 );
     692    53078086 :         input[5] = CL_msu_j( t2, t7 );
     693    53078086 :         input[9] = CL_sub( t0, t4 );
     694    53078086 :         input[13] = CL_mac_j( t2, t7 );
     695             :     }
     696             :     /* i=2 */
     697             :     {
     698    53078086 :         t0 = CL_add( y[2], y[10] );
     699    53078086 :         t2 = CL_sub( y[2], y[10] );
     700    53078086 :         t4 = CL_add( y[6], y[14] );
     701    53078086 :         t7 = CL_sub( y[6], y[14] );
     702             : 
     703    53078086 :         input[2] = CL_add( t0, t4 );
     704    53078086 :         input[6] = CL_msu_j( t2, t7 );
     705    53078086 :         input[10] = CL_sub( t0, t4 );
     706    53078086 :         input[14] = CL_mac_j( t2, t7 );
     707             :     }
     708             :     /* i=3 */
     709             :     {
     710    53078086 :         t0 = CL_add( y[3], y[11] );
     711    53078086 :         t2 = CL_sub( y[3], y[11] );
     712    53078086 :         t4 = CL_add( y[7], y[15] );
     713    53078086 :         t7 = CL_sub( y[7], y[15] );
     714             : 
     715    53078086 :         input[3] = CL_add( t0, t4 );
     716    53078086 :         input[7] = CL_msu_j( t2, t7 );
     717    53078086 :         input[11] = CL_sub( t0, t4 );
     718    53078086 :         input[15] = CL_mac_j( t2, t7 );
     719             :     }
     720             : #ifdef WMOPS
     721             :     multiCounter[currCounter].CL_move += 16;
     722             : #endif
     723    53078086 : }
     724             : 
     725             : 
     726             : /**
     727             :  * \brief    Function performs a complex 20-point FFT
     728             :  *           The FFT is performed in place. The result of the FFT
     729             :  *           is scaled by SCALEFACTOR20 bits (every input is shifted right by SCALEFACTOR20 before use).
     730             :  *
     731             :  *           WOPS with 32x16 bit multiplications:  432 cycles
     732             :  *
     733             :  * \param    [i/o] inp_data  complex input / output buffer; elements
     734             :  *                           inp_data[0] .. inp_data[19] are read and then
     735             :  *                           overwritten with the result (input annotated as Qx)
     736             :  *
     737             :  * \return   void
     738             :  */
     739    16342708 : static void fft20_with_cmplx_data( cmplx *inp_data /*Qx*/ )
     740             : {
     741             :     cmplx r1, r2, r3, r4;          /* stage-1 intermediates */
     742             :     cmplx x0, x1, x2, x3, x4;      /* pre-shifted inputs of one stage-1 block */
     743             :     cmplx t, t0, t1, t2, t3;       /* t: stage-1 temporary; t0..t3: stage-2 sums / differences */
     744             :     cmplx y[20];                   /* stage-1 results, consumed by stage 2 */
     745             :     cmplx *y0, *y1, *y2, *y3, *y4; /* row pointers into y, four entries per row */
     746             :     /* y is used as five rows of four entries; note the row order in memory is y0, y1, y3, y4, y2. */
     747    16342708 :     y0 = y;
     748    16342708 :     y1 = &y[4];
     749    16342708 :     y2 = &y[16];
     750    16342708 :     y3 = &y[8];
     751    16342708 :     y4 = &y[12];
     752             :     /* Stage 1: four 5-input blocks (same operation sequence as the "FFT5 stage" sections of fft30_with_cmplx_data). Block k = 0..3 reads inp_data[( 5 * k + 16 * i ) % 20], i = 0..4, and writes y0[k] .. y4[k]. */
     753             :     { /* block k = 0 */
     754    16342708 :         x0 = CL_shr( inp_data[0], SCALEFACTOR20 ); // Qx - 5
     755    16342708 :         x1 = CL_shr( inp_data[16], SCALEFACTOR20 );
     756    16342708 :         x2 = CL_shr( inp_data[12], SCALEFACTOR20 );
     757    16342708 :         x3 = CL_shr( inp_data[8], SCALEFACTOR20 );
     758    16342708 :         x4 = CL_shr( inp_data[4], SCALEFACTOR20 );
     759             :         /* sums and differences of the input pairs (x1, x4) and (x2, x3), then scaling by the constants C51 .. C55 */
     760    16342708 :         r4 = CL_sub( x1, x4 );
     761    16342708 :         r2 = CL_sub( x2, x3 );
     762    16342708 :         r1 = CL_add( x1, x4 );
     763    16342708 :         r3 = CL_add( x2, x3 );
     764    16342708 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     765    16342708 :         r1 = CL_add( r1, r3 );
     766    16342708 :         y0[0] = CL_add( x0, r1 );
     767    16342708 :         r1 = CL_add( y0[0], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     768    16342708 :         r3 = CL_sub( r1, t );
     769    16342708 :         r1 = CL_add( r1, t );
     770    16342708 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     771    16342708 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     772    16342708 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     773             :         /* combination; NOTE(review): CL_msu_j( a, b ) / CL_mac_j( a, b ) presumably yield a - j*b / a + j*b - they are defined elsewhere, confirm */
     774             : 
     775    16342708 :         y1[0] = CL_msu_j( r1, r2 );
     776    16342708 :         y2[0] = CL_mac_j( r1, r2 );
     777    16342708 :         y3[0] = CL_mac_j( r3, r4 );
     778    16342708 :         y4[0] = CL_msu_j( r3, r4 );
     779             :     }
     780             :     { /* block k = 1: same operations as block 0 on inputs 5, 1, 17, 13, 9 */
     781    16342708 :         x0 = CL_shr( inp_data[5], SCALEFACTOR20 ); // Qx - 5
     782    16342708 :         x1 = CL_shr( inp_data[1], SCALEFACTOR20 );
     783    16342708 :         x2 = CL_shr( inp_data[17], SCALEFACTOR20 );
     784    16342708 :         x3 = CL_shr( inp_data[13], SCALEFACTOR20 );
     785    16342708 :         x4 = CL_shr( inp_data[9], SCALEFACTOR20 );
     786             : 
     787    16342708 :         r4 = CL_sub( x1, x4 );
     788    16342708 :         r2 = CL_sub( x2, x3 );
     789    16342708 :         r1 = CL_add( x1, x4 );
     790    16342708 :         r3 = CL_add( x2, x3 );
     791    16342708 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     792    16342708 :         r1 = CL_add( r1, r3 );
     793    16342708 :         y0[1] = CL_add( x0, r1 );
     794    16342708 :         r1 = CL_add( y0[1], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     795    16342708 :         r3 = CL_sub( r1, t );
     796    16342708 :         r1 = CL_add( r1, t );
     797    16342708 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     798    16342708 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     799    16342708 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     800             : 
     801             : 
     802    16342708 :         y1[1] = CL_msu_j( r1, r2 );
     803    16342708 :         y2[1] = CL_mac_j( r1, r2 );
     804    16342708 :         y3[1] = CL_mac_j( r3, r4 );
     805    16342708 :         y4[1] = CL_msu_j( r3, r4 );
     806             :     }
     807             :     { /* block k = 2: same operations as block 0 on inputs 10, 6, 2, 18, 14 */
     808    16342708 :         x0 = CL_shr( inp_data[10], SCALEFACTOR20 ); // Qx - 5
     809    16342708 :         x1 = CL_shr( inp_data[6], SCALEFACTOR20 );
     810    16342708 :         x2 = CL_shr( inp_data[2], SCALEFACTOR20 );
     811    16342708 :         x3 = CL_shr( inp_data[18], SCALEFACTOR20 );
     812    16342708 :         x4 = CL_shr( inp_data[14], SCALEFACTOR20 );
     813             : 
     814    16342708 :         r4 = CL_sub( x1, x4 );
     815    16342708 :         r2 = CL_sub( x2, x3 );
     816    16342708 :         r1 = CL_add( x1, x4 );
     817    16342708 :         r3 = CL_add( x2, x3 );
     818    16342708 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     819    16342708 :         r1 = CL_add( r1, r3 );
     820    16342708 :         y0[2] = CL_add( x0, r1 );
     821    16342708 :         r1 = CL_add( y0[2], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     822    16342708 :         r3 = CL_sub( r1, t );
     823    16342708 :         r1 = CL_add( r1, t );
     824    16342708 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     825    16342708 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     826    16342708 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     827             : 
     828             : 
     829    16342708 :         y1[2] = CL_msu_j( r1, r2 );
     830    16342708 :         y2[2] = CL_mac_j( r1, r2 );
     831    16342708 :         y3[2] = CL_mac_j( r3, r4 );
     832    16342708 :         y4[2] = CL_msu_j( r3, r4 );
     833             :     }
     834             :     { /* block k = 3: same operations as block 0 on inputs 15, 11, 7, 3, 19 */
     835    16342708 :         x0 = CL_shr( inp_data[15], SCALEFACTOR20 ); // Qx - 5
     836    16342708 :         x1 = CL_shr( inp_data[11], SCALEFACTOR20 );
     837    16342708 :         x2 = CL_shr( inp_data[7], SCALEFACTOR20 );
     838    16342708 :         x3 = CL_shr( inp_data[3], SCALEFACTOR20 );
     839    16342708 :         x4 = CL_shr( inp_data[19], SCALEFACTOR20 );
     840             : 
     841    16342708 :         r4 = CL_sub( x1, x4 );
     842    16342708 :         r2 = CL_sub( x2, x3 );
     843    16342708 :         r1 = CL_add( x1, x4 );
     844    16342708 :         r3 = CL_add( x2, x3 );
     845    16342708 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     846    16342708 :         r1 = CL_add( r1, r3 );
     847    16342708 :         y0[3] = CL_add( x0, r1 );
     848    16342708 :         r1 = CL_add( y0[3], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     849    16342708 :         r3 = CL_sub( r1, t );
     850    16342708 :         r1 = CL_add( r1, t );
     851    16342708 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     852    16342708 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     853    16342708 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     854             : 
     855             : 
     856    16342708 :         y1[3] = CL_msu_j( r1, r2 );
     857    16342708 :         y2[3] = CL_mac_j( r1, r2 );
     858    16342708 :         y3[3] = CL_mac_j( r3, r4 );
     859    16342708 :         y4[3] = CL_msu_j( r3, r4 );
     860             :     }
     861             :     /* Stage 2: five 4-input blocks. Group m = 0..4 takes y[4 * m] .. y[4 * m + 3] in memory order and writes inp_data[( 4 * m + 5 * k ) % 20], k = 0..3. */
     862             :     {
     863    16342708 :         cmplx *ptr_y = y; /* walks y sequentially, four values per group */
     864             :         { /* group m = 0: y[0..3] -> inp_data[0, 5, 10, 15] */
     865             :             cmplx Cy0, Cy1, Cy2, Cy3;
     866             : 
     867    16342708 :             Cy0 = *ptr_y++;
     868    16342708 :             Cy1 = *ptr_y++;
     869    16342708 :             Cy2 = *ptr_y++;
     870    16342708 :             Cy3 = *ptr_y++;
     871             : 
     872             :             /*  Pre-additions */
     873    16342708 :             t0 = CL_add( Cy0, Cy2 );
     874    16342708 :             t1 = CL_sub( Cy0, Cy2 );
     875    16342708 :             t2 = CL_add( Cy1, Cy3 );
     876    16342708 :             t3 = CL_sub( Cy1, Cy3 );
     877             :             /* outputs: t0 + t2, t1 combined with t3 via CL_msu_j, t0 - t2, t1 combined with t3 via CL_mac_j */
     878             : 
     879    16342708 :             inp_data[0] = CL_add( t0, t2 );
     880    16342708 :             inp_data[5] = CL_msu_j( t1, t3 );
     881    16342708 :             inp_data[10] = CL_sub( t0, t2 );
     882    16342708 :             inp_data[15] = CL_mac_j( t1, t3 );
     883             :         }
     884             : 
     885             :         { /* group m = 1: y[4..7] -> inp_data[4, 9, 14, 19] */
     886             :             cmplx Cy0, Cy1, Cy2, Cy3;
     887             : 
     888    16342708 :             Cy0 = *ptr_y++;
     889    16342708 :             Cy1 = *ptr_y++;
     890    16342708 :             Cy2 = *ptr_y++;
     891    16342708 :             Cy3 = *ptr_y++;
     892             : 
     893             :             /*  Pre-additions */
     894    16342708 :             t0 = CL_add( Cy0, Cy2 );
     895    16342708 :             t1 = CL_sub( Cy0, Cy2 );
     896    16342708 :             t2 = CL_add( Cy1, Cy3 );
     897    16342708 :             t3 = CL_sub( Cy1, Cy3 );
     898             : 
     899             : 
     900    16342708 :             inp_data[4] = CL_add( t0, t2 );
     901    16342708 :             inp_data[9] = CL_msu_j( t1, t3 );
     902    16342708 :             inp_data[14] = CL_sub( t0, t2 );
     903    16342708 :             inp_data[19] = CL_mac_j( t1, t3 );
     904             :         }
     905             : 
     906             :         { /* group m = 2: y[8..11] -> inp_data[8, 13, 18, 3] */
     907             :             cmplx Cy0, Cy1, Cy2, Cy3;
     908             : 
     909    16342708 :             Cy0 = *ptr_y++;
     910    16342708 :             Cy1 = *ptr_y++;
     911    16342708 :             Cy2 = *ptr_y++;
     912    16342708 :             Cy3 = *ptr_y++;
     913             : 
     914             :             /*  Pre-additions */
     915    16342708 :             t0 = CL_add( Cy0, Cy2 );
     916    16342708 :             t1 = CL_sub( Cy0, Cy2 );
     917    16342708 :             t2 = CL_add( Cy1, Cy3 );
     918    16342708 :             t3 = CL_sub( Cy1, Cy3 );
     919             : 
     920             : 
     921    16342708 :             inp_data[8] = CL_add( t0, t2 );
     922    16342708 :             inp_data[13] = CL_msu_j( t1, t3 );
     923    16342708 :             inp_data[18] = CL_sub( t0, t2 );
     924    16342708 :             inp_data[3] = CL_mac_j( t1, t3 );
     925             :         }
     926             : 
     927             :         { /* group m = 3: y[12..15] -> inp_data[12, 17, 2, 7] */
     928             :             cmplx Cy0, Cy1, Cy2, Cy3;
     929             : 
     930    16342708 :             Cy0 = *ptr_y++;
     931    16342708 :             Cy1 = *ptr_y++;
     932    16342708 :             Cy2 = *ptr_y++;
     933    16342708 :             Cy3 = *ptr_y++;
     934             : 
     935             :             /*  Pre-additions */
     936    16342708 :             t0 = CL_add( Cy0, Cy2 );
     937    16342708 :             t1 = CL_sub( Cy0, Cy2 );
     938    16342708 :             t2 = CL_add( Cy1, Cy3 );
     939    16342708 :             t3 = CL_sub( Cy1, Cy3 );
     940             : 
     941    16342708 :             inp_data[12] = CL_add( t0, t2 );
     942    16342708 :             inp_data[17] = CL_msu_j( t1, t3 );
     943    16342708 :             inp_data[2] = CL_sub( t0, t2 );
     944    16342708 :             inp_data[7] = CL_mac_j( t1, t3 );
     945             :         }
     946             : 
     947             :         { /* group m = 4: y[16..19] -> inp_data[16, 1, 6, 11] */
     948             :             cmplx Cy0, Cy1, Cy2, Cy3;
     949             : 
     950    16342708 :             Cy0 = *ptr_y++;
     951    16342708 :             Cy1 = *ptr_y++;
     952    16342708 :             Cy2 = *ptr_y++;
     953    16342708 :             Cy3 = *ptr_y++;
     954             : 
     955             :             /*  Pre-additions */
     956    16342708 :             t0 = CL_add( Cy0, Cy2 );
     957    16342708 :             t1 = CL_sub( Cy0, Cy2 );
     958    16342708 :             t2 = CL_add( Cy1, Cy3 );
     959    16342708 :             t3 = CL_sub( Cy1, Cy3 );
     960             : 
     961             : 
     962    16342708 :             inp_data[16] = CL_add( t0, t2 );
     963    16342708 :             inp_data[1] = CL_msu_j( t1, t3 );
     964    16342708 :             inp_data[6] = CL_sub( t0, t2 );
     965    16342708 :             inp_data[11] = CL_mac_j( t1, t3 );
     966             :         }
     967             :     }
     968             : #ifdef WMOPS
     969             :     multiCounter[currCounter].CL_move += 20; /* only compiled with WMOPS defined: adds 20 to the CL_move counter of the current counter set */
     970             : #endif
     971    16342708 : }
     972             : 
     973             : 
     974             : /**
     975             :  * \brief    Function performs a complex 30-point FFT
     976             :  *           The FFT is performed inplace. The result of the FFT
     977             :  *           is scaled by SCALEFACTOR30 bits.
     978             :  *
     979             :  *           WOPS with 32x16 bit multiplications:  828 cycles
     980             :  *
     981             :  * \param    [i/o] inp   complex input / output buffer (annotated Qx); the
     982             :  *                       function takes a single cmplx pointer (no separate re /
     983             :  *                       im / stride arguments) and reads inp[0] .. inp[29]
     984             :  *
     985             :  * \return   void
     986             :  */
     987             : 
     988    19853168 : static void fft30_with_cmplx_data( cmplx *inp /*Qx*/ )
     989             : {
     990    19853168 :     cmplx *l = &inp[0];
     991    19853168 :     cmplx *h = &inp[15];
     992             : 
     993             :     cmplx z[30], y[15], x[15], rs1, rs2, rs3, rs4, t;
     994             : 
     995             :     /* 1. FFT15 stage */
     996             : 
     997    19853168 :     x[0] = CL_shr( inp[0], SCALEFACTOR30_1 ); // Qx - 5
     998    19853168 :     x[1] = CL_shr( inp[18], SCALEFACTOR30_1 );
     999    19853168 :     x[2] = CL_shr( inp[6], SCALEFACTOR30_1 );
    1000    19853168 :     x[3] = CL_shr( inp[24], SCALEFACTOR30_1 );
    1001    19853168 :     x[4] = CL_shr( inp[12], SCALEFACTOR30_1 );
    1002             : 
    1003    19853168 :     x[5] = CL_shr( inp[20], SCALEFACTOR30_1 ); // Qx - 5
    1004    19853168 :     x[6] = CL_shr( inp[8], SCALEFACTOR30_1 );
    1005    19853168 :     x[7] = CL_shr( inp[26], SCALEFACTOR30_1 );
    1006    19853168 :     x[8] = CL_shr( inp[14], SCALEFACTOR30_1 );
    1007    19853168 :     x[9] = CL_shr( inp[2], SCALEFACTOR30_1 );
    1008             : 
    1009    19853168 :     x[10] = CL_shr( inp[10], SCALEFACTOR30_1 ); // Qx - 5
    1010    19853168 :     x[11] = CL_shr( inp[28], SCALEFACTOR30_1 );
    1011    19853168 :     x[12] = CL_shr( inp[16], SCALEFACTOR30_1 );
    1012    19853168 :     x[13] = CL_shr( inp[4], SCALEFACTOR30_1 );
    1013    19853168 :     x[14] = CL_shr( inp[22], SCALEFACTOR30_1 );
    1014             : 
    1015             : 
    1016             :     /* 1. FFT5 stage */
    1017    19853168 :     rs1 = CL_add( x[1], x[4] );
    1018    19853168 :     rs4 = CL_sub( x[1], x[4] );
    1019    19853168 :     rs3 = CL_add( x[2], x[3] );
    1020    19853168 :     rs2 = CL_sub( x[2], x[3] );
    1021    19853168 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1022    19853168 :     rs1 = CL_add( rs1, rs3 );
    1023    19853168 :     y[0] = CL_add( x[0], rs1 );
    1024    19853168 :     rs1 = CL_add( y[0], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1025    19853168 :     rs3 = CL_sub( rs1, t );
    1026    19853168 :     rs1 = CL_add( rs1, t );
    1027    19853168 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1028    19853168 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1029    19853168 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1030             : 
    1031             :     /* combination */
    1032    19853168 :     y[1] = CL_msu_j( rs1, rs2 );
    1033    19853168 :     y[4] = CL_mac_j( rs1, rs2 );
    1034    19853168 :     y[2] = CL_mac_j( rs3, rs4 );
    1035    19853168 :     y[3] = CL_msu_j( rs3, rs4 );
    1036             : 
    1037             : 
    1038             :     /* 2. FFT5 stage */
    1039    19853168 :     rs1 = CL_add( x[6], x[9] );
    1040    19853168 :     rs4 = CL_sub( x[6], x[9] );
    1041    19853168 :     rs3 = CL_add( x[7], x[8] );
    1042    19853168 :     rs2 = CL_sub( x[7], x[8] );
    1043    19853168 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1044    19853168 :     rs1 = CL_add( rs1, rs3 );
    1045    19853168 :     y[5] = CL_add( x[5], rs1 );
    1046    19853168 :     rs1 = CL_add( y[5], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1047    19853168 :     rs3 = CL_sub( rs1, t );
    1048    19853168 :     rs1 = CL_add( rs1, t );
    1049    19853168 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1050    19853168 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1051    19853168 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1052             : 
    1053             :     /* combination */
    1054    19853168 :     y[6] = CL_msu_j( rs1, rs2 );
    1055    19853168 :     y[9] = CL_mac_j( rs1, rs2 );
    1056    19853168 :     y[7] = CL_mac_j( rs3, rs4 );
    1057    19853168 :     y[8] = CL_msu_j( rs3, rs4 );
    1058             : 
    1059             : 
    1060             :     /* 3. FFT5 stage */
    1061    19853168 :     rs1 = CL_add( x[11], x[14] );
    1062    19853168 :     rs4 = CL_sub( x[11], x[14] );
    1063    19853168 :     rs3 = CL_add( x[12], x[13] );
    1064    19853168 :     rs2 = CL_sub( x[12], x[13] );
    1065    19853168 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1066    19853168 :     rs1 = CL_add( rs1, rs3 );
    1067    19853168 :     y[10] = CL_add( x[10], rs1 );
    1068    19853168 :     rs1 = CL_add( y[10], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1069    19853168 :     rs3 = CL_sub( rs1, t );
    1070    19853168 :     rs1 = CL_add( rs1, t );
    1071    19853168 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1072    19853168 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1073    19853168 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1074             : 
    1075             :     /* combination */
    1076    19853168 :     y[11] = CL_msu_j( rs1, rs2 );
    1077    19853168 :     y[14] = CL_mac_j( rs1, rs2 );
    1078    19853168 :     y[12] = CL_mac_j( rs3, rs4 );
    1079    19853168 :     y[13] = CL_msu_j( rs3, rs4 );
    1080             :     /*for (i=10; i<15; i++)
    1081             :     {
    1082             :     printf("%d,\t %d,\t",y[i].re, y[i].im);
    1083             :     }
    1084             :     printf("\n\n");*/
    1085             : 
    1086             : 
    1087             :     /* 1. FFT3 stage */
    1088             :     /* real part */
    1089    19853168 :     rs1 = CL_add( y[5], y[10] );
    1090    19853168 :     rs2 = CL_scale_t( CL_sub( y[5], y[10] ), C31 );
    1091    19853168 :     z[0] = CL_add( y[0], rs1 );
    1092    19853168 :     rs1 = CL_sub( y[0], CL_shr( rs1, 1 ) );
    1093             : 
    1094    19853168 :     z[10] = CL_mac_j( rs1, rs2 );
    1095    19853168 :     z[5] = CL_msu_j( rs1, rs2 );
    1096             : 
    1097             :     /* 2. FFT3 stage */
    1098    19853168 :     rs1 = CL_add( y[6], y[11] );
    1099    19853168 :     rs2 = CL_scale_t( CL_sub( y[6], y[11] ), C31 );
    1100    19853168 :     z[6] = CL_add( y[1], rs1 );
    1101    19853168 :     rs1 = CL_sub( y[1], CL_shr( rs1, 1 ) );
    1102             : 
    1103    19853168 :     z[1] = CL_mac_j( rs1, rs2 );
    1104    19853168 :     z[11] = CL_msu_j( rs1, rs2 );
    1105             : 
    1106             : 
    1107             :     /* 3. FFT3 stage */
    1108    19853168 :     rs1 = CL_add( y[7], y[12] );
    1109    19853168 :     rs2 = CL_scale_t( CL_sub( y[7], y[12] ), C31 );
    1110    19853168 :     z[12] = CL_add( y[2], rs1 );
    1111    19853168 :     rs1 = CL_sub( y[2], CL_shr( rs1, 1 ) );
    1112             : 
    1113    19853168 :     z[7] = CL_mac_j( rs1, rs2 );
    1114    19853168 :     z[2] = CL_msu_j( rs1, rs2 );
    1115             : 
    1116             : 
    1117             :     /* 4. FFT3 stage */
    1118    19853168 :     rs1 = CL_add( y[8], y[13] );
    1119    19853168 :     rs2 = CL_scale_t( CL_sub( y[8], y[13] ), C31 );
    1120    19853168 :     z[3] = CL_add( y[3], rs1 );
    1121    19853168 :     rs1 = CL_sub( y[3], CL_shr( rs1, 1 ) );
    1122             : 
    1123    19853168 :     z[13] = CL_mac_j( rs1, rs2 );
    1124    19853168 :     z[8] = CL_msu_j( rs1, rs2 );
    1125             : 
    1126             : 
    1127             :     /* 5. FFT3 stage */
    1128    19853168 :     rs1 = CL_add( y[9], y[14] );
    1129    19853168 :     rs2 = CL_scale_t( CL_sub( y[9], y[14] ), C31 );
    1130    19853168 :     z[9] = CL_add( y[4], rs1 );
    1131    19853168 :     rs1 = CL_sub( y[4], CL_shr( rs1, 1 ) );
    1132             : 
    1133    19853168 :     z[4] = CL_mac_j( rs1, rs2 );
    1134    19853168 :     z[14] = CL_msu_j( rs1, rs2 );
    1135             : 
    1136             :     /*for (i=0; i<15; i++)
    1137             :     printf("%d,\t %d,\t",z[i].re, z[i].im);
    1138             :     printf("\n\n");*/
    1139             : 
    1140             : 
    1141             :     /* 2. FFT15 stage */
    1142             : 
    1143    19853168 :     x[0] = CL_shr( inp[15], SCALEFACTOR30_1 ); // Qx - 5
    1144    19853168 :     x[1] = CL_shr( inp[3], SCALEFACTOR30_1 );
    1145    19853168 :     x[2] = CL_shr( inp[21], SCALEFACTOR30_1 );
    1146    19853168 :     x[3] = CL_shr( inp[9], SCALEFACTOR30_1 );
    1147    19853168 :     x[4] = CL_shr( inp[27], SCALEFACTOR30_1 );
    1148             : 
    1149    19853168 :     x[5] = CL_shr( inp[5], SCALEFACTOR30_1 ); // Qx - 5
    1150    19853168 :     x[6] = CL_shr( inp[23], SCALEFACTOR30_1 );
    1151    19853168 :     x[7] = CL_shr( inp[11], SCALEFACTOR30_1 );
    1152    19853168 :     x[8] = CL_shr( inp[29], SCALEFACTOR30_1 );
    1153    19853168 :     x[9] = CL_shr( inp[17], SCALEFACTOR30_1 );
    1154             : 
    1155    19853168 :     x[10] = CL_shr( inp[25], SCALEFACTOR30_1 ); // Qx - 5
    1156    19853168 :     x[11] = CL_shr( inp[13], SCALEFACTOR30_1 );
    1157    19853168 :     x[12] = CL_shr( inp[1], SCALEFACTOR30_1 );
    1158    19853168 :     x[13] = CL_shr( inp[19], SCALEFACTOR30_1 );
    1159    19853168 :     x[14] = CL_shr( inp[7], SCALEFACTOR30_1 );
    1160             : 
    1161             :     /* 1. FFT5 stage */
    1162    19853168 :     rs1 = CL_add( x[1], x[4] );
    1163    19853168 :     rs4 = CL_sub( x[1], x[4] );
    1164    19853168 :     rs3 = CL_add( x[2], x[3] );
    1165    19853168 :     rs2 = CL_sub( x[2], x[3] );
    1166    19853168 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1167    19853168 :     rs1 = CL_add( rs1, rs3 );
    1168    19853168 :     y[0] = CL_add( x[0], rs1 );
    1169    19853168 :     rs1 = CL_add( y[0], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1170    19853168 :     rs3 = CL_sub( rs1, t );
    1171    19853168 :     rs1 = CL_add( rs1, t );
    1172    19853168 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1173    19853168 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1174    19853168 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1175             : 
    1176             :     /* combination */
    1177    19853168 :     y[1] = CL_msu_j( rs1, rs2 );
    1178    19853168 :     y[4] = CL_mac_j( rs1, rs2 );
    1179    19853168 :     y[2] = CL_mac_j( rs3, rs4 );
    1180    19853168 :     y[3] = CL_msu_j( rs3, rs4 );
    1181             : 
    1182             : 
    1183             :     /* 2. FFT5 stage */
    1184    19853168 :     rs1 = CL_add( x[6], x[9] );
    1185    19853168 :     rs4 = CL_sub( x[6], x[9] );
    1186    19853168 :     rs3 = CL_add( x[7], x[8] );
    1187    19853168 :     rs2 = CL_sub( x[7], x[8] );
    1188    19853168 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1189    19853168 :     rs1 = CL_add( rs1, rs3 );
    1190    19853168 :     y[5] = CL_add( x[5], rs1 );
    1191    19853168 :     rs1 = CL_add( y[5], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1192    19853168 :     rs3 = CL_sub( rs1, t );
    1193    19853168 :     rs1 = CL_add( rs1, t );
    1194    19853168 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1195    19853168 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1196    19853168 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1197             : 
    1198             :     /* combination */
    1199    19853168 :     y[6] = CL_msu_j( rs1, rs2 );
    1200    19853168 :     y[9] = CL_mac_j( rs1, rs2 );
    1201    19853168 :     y[7] = CL_mac_j( rs3, rs4 );
    1202    19853168 :     y[8] = CL_msu_j( rs3, rs4 );
    1203             : 
    1204             : 
    1205             :     /* 3. FFT5 stage */
    1206    19853168 :     rs1 = CL_add( x[11], x[14] );
    1207    19853168 :     rs4 = CL_sub( x[11], x[14] );
    1208    19853168 :     rs3 = CL_add( x[12], x[13] );
    1209    19853168 :     rs2 = CL_sub( x[12], x[13] );
    1210    19853168 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1211    19853168 :     rs1 = CL_add( rs1, rs3 );
    1212    19853168 :     y[10] = CL_add( x[10], rs1 );
    1213    19853168 :     rs1 = CL_add( y[10], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1214    19853168 :     rs3 = CL_sub( rs1, t );
    1215    19853168 :     rs1 = CL_add( rs1, t );
    1216    19853168 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1217    19853168 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1218    19853168 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1219             : 
    1220             :     /* combination */
    1221    19853168 :     y[11] = CL_msu_j( rs1, rs2 );
    1222    19853168 :     y[14] = CL_mac_j( rs1, rs2 );
    1223    19853168 :     y[12] = CL_mac_j( rs3, rs4 );
    1224    19853168 :     y[13] = CL_msu_j( rs3, rs4 );
    1225             :     /*for (i=10; i<15; i++)
    1226             :     {
    1227             :     printf("%d,\t %d,\t",y[i].re, y[i].im);
    1228             :     }
    1229             :     printf("\n\n");*/
    1230             : 
    1231             : 
    1232             :     /* 1. FFT3 stage */
    1233             :     /* real part */
    1234    19853168 :     rs1 = CL_add( y[5], y[10] );
    1235    19853168 :     rs2 = CL_scale_t( CL_sub( y[5], y[10] ), C31 );
    1236    19853168 :     z[15] = CL_add( y[0], rs1 );
    1237    19853168 :     rs1 = CL_sub( y[0], CL_shr( rs1, 1 ) );
    1238             : 
    1239    19853168 :     z[25] = CL_mac_j( rs1, rs2 );
    1240    19853168 :     z[20] = CL_msu_j( rs1, rs2 );
    1241             : 
    1242             :     /* 2. FFT3 stage */
    1243    19853168 :     rs1 = CL_add( y[6], y[11] );
    1244    19853168 :     rs2 = CL_scale_t( CL_sub( y[6], y[11] ), C31 );
    1245    19853168 :     z[21] = CL_add( y[1], rs1 );
    1246    19853168 :     rs1 = CL_sub( y[1], CL_shr( rs1, 1 ) );
    1247             : 
    1248    19853168 :     z[16] = CL_mac_j( rs1, rs2 );
    1249    19853168 :     z[26] = CL_msu_j( rs1, rs2 );
    1250             : 
    1251             : 
    1252             :     /* 3. FFT3 stage */
    1253    19853168 :     rs1 = CL_add( y[7], y[12] );
    1254    19853168 :     rs2 = CL_scale_t( CL_sub( y[7], y[12] ), C31 );
    1255    19853168 :     z[27] = CL_add( y[2], rs1 );
    1256    19853168 :     rs1 = CL_sub( y[2], CL_shr( rs1, 1 ) );
    1257             : 
    1258    19853168 :     z[22] = CL_mac_j( rs1, rs2 );
    1259    19853168 :     z[17] = CL_msu_j( rs1, rs2 );
    1260             : 
    1261             : 
    1262             :     /* 4. FFT3 stage */
    1263    19853168 :     rs1 = CL_add( y[8], y[13] );
    1264    19853168 :     rs2 = CL_scale_t( CL_sub( y[8], y[13] ), C31 );
    1265    19853168 :     z[18] = CL_add( y[3], rs1 );
    1266    19853168 :     rs1 = CL_sub( y[3], CL_shr( rs1, 1 ) );
    1267             : 
    1268    19853168 :     z[28] = CL_mac_j( rs1, rs2 );
    1269    19853168 :     z[23] = CL_msu_j( rs1, rs2 );
    1270             : 
    1271             : 
    1272             :     /* 5. FFT3 stage */
    1273    19853168 :     rs1 = CL_add( y[9], y[14] );
    1274    19853168 :     rs2 = CL_scale_t( CL_sub( y[9], y[14] ), C31 );
    1275    19853168 :     z[24] = CL_add( y[4], rs1 );
    1276    19853168 :     rs1 = CL_sub( y[4], CL_shr( rs1, 1 ) );
    1277             : 
    1278    19853168 :     z[19] = CL_mac_j( rs1, rs2 );
    1279    19853168 :     z[29] = CL_msu_j( rs1, rs2 );
    1280             : 
    1281             :     /*for (i=0; i<30; i++)
    1282             :     printf("%d,\t %d,\t",z[i].re, z[i].im);
    1283             :     printf("\n\n");*/
    1284             : 
    1285             : 
    1286             :     /* 1. FFT2 stage */
    1287    19853168 :     rs1 = CL_shr( z[0], SCALEFACTOR30_2 );
    1288    19853168 :     rs2 = CL_shr( z[15], SCALEFACTOR30_2 );
    1289    19853168 :     *l = CL_add( rs1, rs2 );
    1290    19853168 :     *h = CL_sub( rs1, rs2 );
    1291    19853168 :     l += 1;
    1292    19853168 :     h += 1;
    1293             : 
    1294             :     /* 2. FFT2 stage */
    1295    19853168 :     rs1 = CL_shr( z[8], SCALEFACTOR30_2 );
    1296    19853168 :     rs2 = CL_shr( z[23], SCALEFACTOR30_2 );
    1297    19853168 :     *h = CL_add( rs1, rs2 );
    1298    19853168 :     *l = CL_sub( rs1, rs2 );
    1299    19853168 :     l += 1;
    1300    19853168 :     h += 1;
    1301             : 
    1302             : 
    1303             :     /* 3. FFT2 stage */
    1304    19853168 :     rs1 = CL_shr( z[1], SCALEFACTOR30_2 );
    1305    19853168 :     rs2 = CL_shr( z[16], SCALEFACTOR30_2 );
    1306    19853168 :     *l = CL_add( rs1, rs2 );
    1307    19853168 :     *h = CL_sub( rs1, rs2 );
    1308    19853168 :     l += 1;
    1309    19853168 :     h += 1;
    1310             : 
    1311             : 
    1312             :     /* 4. FFT2 stage */
    1313    19853168 :     rs1 = CL_shr( z[9], SCALEFACTOR30_2 );
    1314    19853168 :     rs2 = CL_shr( z[24], SCALEFACTOR30_2 );
    1315    19853168 :     *h = CL_add( rs1, rs2 );
    1316    19853168 :     *l = CL_sub( rs1, rs2 );
    1317    19853168 :     l += 1;
    1318    19853168 :     h += 1;
    1319             : 
    1320             :     /* 5. FFT2 stage */
    1321    19853168 :     rs1 = CL_shr( z[2], SCALEFACTOR30_2 );
    1322    19853168 :     rs2 = CL_shr( z[17], SCALEFACTOR30_2 );
    1323    19853168 :     *l = CL_add( rs1, rs2 );
    1324    19853168 :     *h = CL_sub( rs1, rs2 );
    1325    19853168 :     l += 1;
    1326    19853168 :     h += 1;
    1327             : 
    1328             :     /* 6. FFT2 stage */
    1329    19853168 :     rs1 = CL_shr( z[10], SCALEFACTOR30_2 );
    1330    19853168 :     rs2 = CL_shr( z[25], SCALEFACTOR30_2 );
    1331    19853168 :     *h = CL_add( rs1, rs2 );
    1332    19853168 :     *l = CL_sub( rs1, rs2 );
    1333    19853168 :     l += 1;
    1334    19853168 :     h += 1;
    1335             : 
    1336             :     /* 7. FFT2 stage */
    1337    19853168 :     rs1 = CL_shr( z[3], SCALEFACTOR30_2 );
    1338    19853168 :     rs2 = CL_shr( z[18], SCALEFACTOR30_2 );
    1339    19853168 :     *l = CL_add( rs1, rs2 );
    1340    19853168 :     *h = CL_sub( rs1, rs2 );
    1341    19853168 :     l += 1;
    1342    19853168 :     h += 1;
    1343             : 
    1344             :     /* 8. FFT2 stage */
    1345    19853168 :     rs1 = CL_shr( z[11], SCALEFACTOR30_2 );
    1346    19853168 :     rs2 = CL_shr( z[26], SCALEFACTOR30_2 );
    1347    19853168 :     *h = CL_add( rs1, rs2 );
    1348    19853168 :     *l = CL_sub( rs1, rs2 );
    1349    19853168 :     l += 1;
    1350    19853168 :     h += 1;
    1351             : 
    1352             :     /* 9. FFT2 stage */
    1353    19853168 :     rs1 = CL_shr( z[4], SCALEFACTOR30_2 );
    1354    19853168 :     rs2 = CL_shr( z[19], SCALEFACTOR30_2 );
    1355    19853168 :     *l = CL_add( rs1, rs2 );
    1356    19853168 :     *h = CL_sub( rs1, rs2 );
    1357    19853168 :     l += 1;
    1358    19853168 :     h += 1;
    1359             : 
    1360             :     /* 10. FFT2 stage */
    1361    19853168 :     rs1 = CL_shr( z[12], SCALEFACTOR30_2 );
    1362    19853168 :     rs2 = CL_shr( z[27], SCALEFACTOR30_2 );
    1363    19853168 :     *h = CL_add( rs1, rs2 );
    1364    19853168 :     *l = CL_sub( rs1, rs2 );
    1365    19853168 :     l += 1;
    1366    19853168 :     h += 1;
    1367             : 
    1368             :     /* 11. FFT2 stage */
    1369    19853168 :     rs1 = CL_shr( z[5], SCALEFACTOR30_2 );
    1370    19853168 :     rs2 = CL_shr( z[20], SCALEFACTOR30_2 );
    1371    19853168 :     *l = CL_add( rs1, rs2 );
    1372    19853168 :     *h = CL_sub( rs1, rs2 );
    1373    19853168 :     l += 1;
    1374    19853168 :     h += 1;
    1375             : 
    1376             :     /* 12. FFT2 stage */
    1377    19853168 :     rs1 = CL_shr( z[13], SCALEFACTOR30_2 );
    1378    19853168 :     rs2 = CL_shr( z[28], SCALEFACTOR30_2 );
    1379    19853168 :     *h = CL_add( rs1, rs2 );
    1380    19853168 :     *l = CL_sub( rs1, rs2 );
    1381    19853168 :     l += 1;
    1382    19853168 :     h += 1;
    1383             : 
    1384             :     /* 13. FFT2 stage */
    1385    19853168 :     rs1 = CL_shr( z[6], SCALEFACTOR30_2 );
    1386    19853168 :     rs2 = CL_shr( z[21], SCALEFACTOR30_2 );
    1387    19853168 :     *l = CL_add( rs1, rs2 );
    1388    19853168 :     *h = CL_sub( rs1, rs2 );
    1389    19853168 :     l += 1;
    1390    19853168 :     h += 1;
    1391             : 
    1392             :     /* 14. FFT2 stage */
    1393    19853168 :     rs1 = CL_shr( z[14], SCALEFACTOR30_2 );
    1394    19853168 :     rs2 = CL_shr( z[29], SCALEFACTOR30_2 );
    1395    19853168 :     *h = CL_add( rs1, rs2 );
    1396    19853168 :     *l = CL_sub( rs1, rs2 );
    1397    19853168 :     l += 1;
    1398    19853168 :     h += 1;
    1399             : 
    1400             :     /* 15. FFT2 stage */
    1401    19853168 :     rs1 = CL_shr( z[7], SCALEFACTOR30_2 );
    1402    19853168 :     rs2 = CL_shr( z[22], SCALEFACTOR30_2 );
    1403    19853168 :     *l = CL_add( rs1, rs2 );
    1404    19853168 :     *h = CL_sub( rs1, rs2 );
    1405    19853168 :     l += 1;
    1406    19853168 :     h += 1;
    1407             : 
    1408             : #ifdef WMOPS
    1409             :     multiCounter[currCounter].CL_move += 30;
    1410             : #endif
    1411    19853168 : }
    1412             : 
    1413             : /**
    1414             :  * \brief    Function performs a complex 32-point FFT
    1415             :  *           The FFT is performed in place on inp[0..31]. It is built from
    1416             :  *           four 8-point FFTs, a twiddle stage (RotVector_32) and eight 4-point FFTs.
    1417             :  *           The data is right-shifted by SCALEFACTOR32_1 on input and by
    1418             :  *           SCALEFACTOR32_2 (or by 1 before a twiddle multiply) after the FFT8 stages.
    1419             :  *
    1420             :  *           WOPS with 32x16 bit multiplications:  752 cycles
    1421             :  *
    1422             :  * \param    [i/o] inp   complex input / output buffer of 32 values (Qx on input)
    1423             :  *
    1424             :  * \return   void
    1425             :  */
    1426             : 
    1427             : 
    1428     1919872 : static void fft32_with_cmplx_data( cmplx *inp /*Qx*/ )
    1429             : {
    1430             :     cmplx x[32], y[32], t[32], s[32], temp, temp1; // only x[0..7], t[0..7] and s[0..7] are accessed below; y uses all 32 entries
    1431     1919872 :     const cmplx_s *pRotVector_32 = (const cmplx_s *) RotVector_32;
    1432             : 
    1433             :     /* 1. FFT8 stage: inputs inp[0], inp[4], ..., inp[28] -> y[0..7] */
    1434             : 
    1435     1919872 :     x[0] = CL_shr( inp[0], SCALEFACTOR32_1 ); // Qx - 5
    1436     1919872 :     x[1] = CL_shr( inp[4], SCALEFACTOR32_1 );
    1437     1919872 :     x[2] = CL_shr( inp[8], SCALEFACTOR32_1 );
    1438     1919872 :     x[3] = CL_shr( inp[12], SCALEFACTOR32_1 );
    1439     1919872 :     x[4] = CL_shr( inp[16], SCALEFACTOR32_1 );
    1440     1919872 :     x[5] = CL_shr( inp[20], SCALEFACTOR32_1 );
    1441     1919872 :     x[6] = CL_shr( inp[24], SCALEFACTOR32_1 );
    1442     1919872 :     x[7] = CL_shr( inp[28], SCALEFACTOR32_1 );
    1443             : 
    1444             : 
    1445     1919872 :     t[0] = CL_add( x[0], x[4] );
    1446     1919872 :     t[1] = CL_sub( x[0], x[4] );
    1447     1919872 :     t[2] = CL_add( x[1], x[5] );
    1448     1919872 :     t[3] = CL_sub( x[1], x[5] );
    1449     1919872 :     t[4] = CL_add( x[2], x[6] );
    1450     1919872 :     t[5] = CL_sub( x[2], x[6] );
    1451     1919872 :     t[6] = CL_add( x[3], x[7] );
    1452     1919872 :     t[7] = CL_sub( x[3], x[7] );
    1453             : 
    1454             :     /* Pre-additions and core multiplications */
    1455             : 
    1456     1919872 :     s[0] = CL_add( t[0], t[4] );
    1457     1919872 :     s[2] = CL_sub( t[0], t[4] );
    1458     1919872 :     s[4] = CL_mac_j( t[1], t[5] );
    1459     1919872 :     s[5] = CL_msu_j( t[1], t[5] );
    1460     1919872 :     s[1] = CL_add( t[2], t[6] );
    1461     1919872 :     s[3] = CL_sub( t[2], t[6] );
    1462     1919872 :     s[3] = CL_mul_j( s[3] );
    1463             : 
    1464     1919872 :     temp = CL_add( t[3], t[7] );
    1465     1919872 :     temp1 = CL_sub( t[3], t[7] );
    1466     1919872 :     s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
    1467     1919872 :     s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
    1468             : 
    1469             :     /* Post-additions */
    1470     1919872 :     y[0] = CL_add( s[0], s[1] );
    1471     1919872 :     y[4] = CL_sub( s[0], s[1] );
    1472     1919872 :     y[2] = CL_sub( s[2], s[3] );
    1473     1919872 :     y[6] = CL_add( s[2], s[3] );
    1474     1919872 :     y[3] = CL_add( s[4], s[7] );
    1475     1919872 :     y[7] = CL_sub( s[4], s[7] );
    1476     1919872 :     y[1] = CL_add( s[5], s[6] );
    1477     1919872 :     y[5] = CL_sub( s[5], s[6] );
    1478             : 
    1479             :     /* 2. FFT8 stage: inputs inp[1], inp[5], ..., inp[29] -> y[8..15] */
    1480             : 
    1481     1919872 :     x[0] = CL_shr( inp[1], SCALEFACTOR32_1 ); // Qx - 5
    1482     1919872 :     x[1] = CL_shr( inp[5], SCALEFACTOR32_1 );
    1483     1919872 :     x[2] = CL_shr( inp[9], SCALEFACTOR32_1 );
    1484     1919872 :     x[3] = CL_shr( inp[13], SCALEFACTOR32_1 );
    1485     1919872 :     x[4] = CL_shr( inp[17], SCALEFACTOR32_1 );
    1486     1919872 :     x[5] = CL_shr( inp[21], SCALEFACTOR32_1 );
    1487     1919872 :     x[6] = CL_shr( inp[25], SCALEFACTOR32_1 );
    1488     1919872 :     x[7] = CL_shr( inp[29], SCALEFACTOR32_1 );
    1489             : 
    1490             : 
    1491     1919872 :     t[0] = CL_add( x[0], x[4] );
    1492     1919872 :     t[1] = CL_sub( x[0], x[4] );
    1493     1919872 :     t[2] = CL_add( x[1], x[5] );
    1494     1919872 :     t[3] = CL_sub( x[1], x[5] );
    1495     1919872 :     t[4] = CL_add( x[2], x[6] );
    1496     1919872 :     t[5] = CL_sub( x[2], x[6] );
    1497     1919872 :     t[6] = CL_add( x[3], x[7] );
    1498     1919872 :     t[7] = CL_sub( x[3], x[7] );
    1499             : 
    1500             :     /* Pre-additions and core multiplications */
    1501             : 
    1502     1919872 :     s[0] = CL_add( t[0], t[4] );
    1503     1919872 :     s[2] = CL_sub( t[0], t[4] );
    1504     1919872 :     s[4] = CL_mac_j( t[1], t[5] );
    1505     1919872 :     s[5] = CL_msu_j( t[1], t[5] );
    1506     1919872 :     s[1] = CL_add( t[2], t[6] );
    1507     1919872 :     s[3] = CL_sub( t[2], t[6] );
    1508     1919872 :     s[3] = CL_mul_j( s[3] );
    1509             : 
    1510     1919872 :     temp = CL_add( t[3], t[7] );
    1511     1919872 :     temp1 = CL_sub( t[3], t[7] );
    1512     1919872 :     s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
    1513     1919872 :     s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
    1514             : 
    1515             :     /* Post-additions */
    1516             : 
    1517     1919872 :     y[8] = CL_add( s[0], s[1] );
    1518     1919872 :     y[12] = CL_sub( s[0], s[1] );
    1519     1919872 :     y[10] = CL_sub( s[2], s[3] );
    1520     1919872 :     y[14] = CL_add( s[2], s[3] );
    1521     1919872 :     y[11] = CL_add( s[4], s[7] );
    1522     1919872 :     y[15] = CL_sub( s[4], s[7] );
    1523     1919872 :     y[9] = CL_add( s[5], s[6] );
    1524     1919872 :     y[13] = CL_sub( s[5], s[6] );
    1525             : 
    1526             :     /* 3. FFT8 stage: inputs inp[2], inp[6], ..., inp[30] -> y[16..23] */
    1527             : 
    1528     1919872 :     x[0] = CL_shr( inp[2], SCALEFACTOR32_1 ); // Qx - 5
    1529     1919872 :     x[1] = CL_shr( inp[6], SCALEFACTOR32_1 );
    1530     1919872 :     x[2] = CL_shr( inp[10], SCALEFACTOR32_1 );
    1531     1919872 :     x[3] = CL_shr( inp[14], SCALEFACTOR32_1 );
    1532     1919872 :     x[4] = CL_shr( inp[18], SCALEFACTOR32_1 );
    1533     1919872 :     x[5] = CL_shr( inp[22], SCALEFACTOR32_1 );
    1534     1919872 :     x[6] = CL_shr( inp[26], SCALEFACTOR32_1 );
    1535     1919872 :     x[7] = CL_shr( inp[30], SCALEFACTOR32_1 );
    1536             : 
    1537             : 
    1538     1919872 :     t[0] = CL_add( x[0], x[4] );
    1539     1919872 :     t[1] = CL_sub( x[0], x[4] );
    1540     1919872 :     t[2] = CL_add( x[1], x[5] );
    1541     1919872 :     t[3] = CL_sub( x[1], x[5] );
    1542     1919872 :     t[4] = CL_add( x[2], x[6] );
    1543     1919872 :     t[5] = CL_sub( x[2], x[6] );
    1544     1919872 :     t[6] = CL_add( x[3], x[7] );
    1545     1919872 :     t[7] = CL_sub( x[3], x[7] );
    1546             : 
    1547             :     /* Pre-additions and core multiplications */
    1548             : 
    1549     1919872 :     s[0] = CL_add( t[0], t[4] );
    1550     1919872 :     s[2] = CL_sub( t[0], t[4] );
    1551     1919872 :     s[4] = CL_mac_j( t[1], t[5] );
    1552     1919872 :     s[5] = CL_msu_j( t[1], t[5] );
    1553     1919872 :     s[1] = CL_add( t[2], t[6] );
    1554     1919872 :     s[3] = CL_sub( t[2], t[6] );
    1555     1919872 :     s[3] = CL_mul_j( s[3] );
    1556             : 
    1557     1919872 :     temp = CL_add( t[3], t[7] );
    1558     1919872 :     temp1 = CL_sub( t[3], t[7] );
    1559     1919872 :     s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
    1560     1919872 :     s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
    1561             : 
    1562             :     /* Post-additions */
    1563             : 
    1564     1919872 :     y[16] = CL_add( s[0], s[1] );
    1565     1919872 :     y[20] = CL_sub( s[0], s[1] );
    1566     1919872 :     y[18] = CL_sub( s[2], s[3] );
    1567     1919872 :     y[22] = CL_add( s[2], s[3] );
    1568     1919872 :     y[19] = CL_add( s[4], s[7] );
    1569     1919872 :     y[23] = CL_sub( s[4], s[7] );
    1570     1919872 :     y[17] = CL_add( s[5], s[6] );
    1571     1919872 :     y[21] = CL_sub( s[5], s[6] );
    1572             : 
    1573             :     /* 4. FFT8 stage: inputs inp[3], inp[7], ..., inp[31] -> y[24..31] */
    1574             : 
    1575     1919872 :     x[0] = CL_shr( inp[3], SCALEFACTOR32_1 ); // Qx - 5
    1576     1919872 :     x[1] = CL_shr( inp[7], SCALEFACTOR32_1 );
    1577     1919872 :     x[2] = CL_shr( inp[11], SCALEFACTOR32_1 );
    1578     1919872 :     x[3] = CL_shr( inp[15], SCALEFACTOR32_1 );
    1579     1919872 :     x[4] = CL_shr( inp[19], SCALEFACTOR32_1 );
    1580     1919872 :     x[5] = CL_shr( inp[23], SCALEFACTOR32_1 );
    1581     1919872 :     x[6] = CL_shr( inp[27], SCALEFACTOR32_1 );
    1582     1919872 :     x[7] = CL_shr( inp[31], SCALEFACTOR32_1 );
    1583             : 
    1584             : 
    1585     1919872 :     t[0] = CL_add( x[0], x[4] );
    1586     1919872 :     t[1] = CL_sub( x[0], x[4] );
    1587     1919872 :     t[2] = CL_add( x[1], x[5] );
    1588     1919872 :     t[3] = CL_sub( x[1], x[5] );
    1589     1919872 :     t[4] = CL_add( x[2], x[6] );
    1590     1919872 :     t[5] = CL_sub( x[2], x[6] );
    1591     1919872 :     t[6] = CL_add( x[3], x[7] );
    1592     1919872 :     t[7] = CL_sub( x[3], x[7] );
    1593             : 
    1594             : 
    1595             :     /* Pre-additions and core multiplications */
    1596             : 
    1597     1919872 :     s[0] = CL_add( t[0], t[4] );
    1598     1919872 :     s[2] = CL_sub( t[0], t[4] );
    1599     1919872 :     s[4] = CL_mac_j( t[1], t[5] );
    1600     1919872 :     s[5] = CL_msu_j( t[1], t[5] );
    1601     1919872 :     s[1] = CL_add( t[2], t[6] );
    1602     1919872 :     s[3] = CL_sub( t[2], t[6] );
    1603     1919872 :     s[3] = CL_mul_j( s[3] );
    1604             : 
    1605     1919872 :     temp = CL_add( t[3], t[7] );
    1606     1919872 :     temp1 = CL_sub( t[3], t[7] );
    1607     1919872 :     s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
    1608     1919872 :     s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
    1609             : 
    1610             :     /* Post-additions */
    1611             : 
    1612     1919872 :     y[24] = CL_add( s[0], s[1] );
    1613     1919872 :     y[28] = CL_sub( s[0], s[1] );
    1614     1919872 :     y[26] = CL_sub( s[2], s[3] );
    1615     1919872 :     y[30] = CL_add( s[2], s[3] );
    1616     1919872 :     y[27] = CL_add( s[4], s[7] );
    1617     1919872 :     y[31] = CL_sub( s[4], s[7] );
    1618     1919872 :     y[25] = CL_add( s[5], s[6] );
    1619     1919872 :     y[29] = CL_sub( s[5], s[6] );
    1620             : 
    1621             : 
    1622             :     /* apply twiddle factors: y[0..8], y[16], y[24] and y[20] are only right-shifted by SCALEFACTOR32_2; every other entry is right-shifted by 1 and multiplied with one entry of RotVector_32 */
    1623     1919872 :     y[0] = CL_shr( y[0], SCALEFACTOR32_2 );
    1624     1919872 :     y[1] = CL_shr( y[1], SCALEFACTOR32_2 );
    1625     1919872 :     y[2] = CL_shr( y[2], SCALEFACTOR32_2 );
    1626     1919872 :     y[3] = CL_shr( y[3], SCALEFACTOR32_2 );
    1627     1919872 :     y[4] = CL_shr( y[4], SCALEFACTOR32_2 );
    1628     1919872 :     y[5] = CL_shr( y[5], SCALEFACTOR32_2 );
    1629     1919872 :     y[6] = CL_shr( y[6], SCALEFACTOR32_2 );
    1630     1919872 :     y[7] = CL_shr( y[7], SCALEFACTOR32_2 );
    1631     1919872 :     y[8] = CL_shr( y[8], SCALEFACTOR32_2 );
    1632     1919872 :     y[16] = CL_shr( y[16], SCALEFACTOR32_2 );
    1633     1919872 :     y[24] = CL_shr( y[24], SCALEFACTOR32_2 );
    1634     1919872 :     y[20] = CL_shr( y[20], SCALEFACTOR32_2 );
    1635             : 
    1636             : 
    1637     1919872 :     y[9] = CL_mult_32x16( ( CL_shr( y[9], 1 ) ), pRotVector_32[0] );
    1638     1919872 :     y[10] = CL_mult_32x16( ( CL_shr( y[10], 1 ) ), pRotVector_32[1] );
    1639     1919872 :     y[11] = CL_mult_32x16( ( CL_shr( y[11], 1 ) ), pRotVector_32[2] );
    1640     1919872 :     y[12] = CL_mult_32x16( ( CL_shr( y[12], 1 ) ), pRotVector_32[3] );
    1641     1919872 :     y[13] = CL_mult_32x16( ( CL_shr( y[13], 1 ) ), pRotVector_32[4] );
    1642     1919872 :     y[14] = CL_mult_32x16( ( CL_shr( y[14], 1 ) ), pRotVector_32[5] );
    1643     1919872 :     y[15] = CL_mult_32x16( ( CL_shr( y[15], 1 ) ), pRotVector_32[6] );
    1644     1919872 :     y[17] = CL_mult_32x16( ( CL_shr( y[17], 1 ) ), pRotVector_32[7] );
    1645     1919872 :     y[18] = CL_mult_32x16( ( CL_shr( y[18], 1 ) ), pRotVector_32[8] );
    1646     1919872 :     y[19] = CL_mult_32x16( ( CL_shr( y[19], 1 ) ), pRotVector_32[9] );
    1647     1919872 :     y[21] = CL_mult_32x16( ( CL_shr( y[21], 1 ) ), pRotVector_32[10] );
    1648     1919872 :     y[22] = CL_mult_32x16( ( CL_shr( y[22], 1 ) ), pRotVector_32[11] );
    1649     1919872 :     y[23] = CL_mult_32x16( ( CL_shr( y[23], 1 ) ), pRotVector_32[12] );
    1650     1919872 :     y[25] = CL_mult_32x16( ( CL_shr( y[25], 1 ) ), pRotVector_32[13] );
    1651     1919872 :     y[26] = CL_mult_32x16( ( CL_shr( y[26], 1 ) ), pRotVector_32[14] );
    1652     1919872 :     y[27] = CL_mult_32x16( ( CL_shr( y[27], 1 ) ), pRotVector_32[15] );
    1653     1919872 :     y[28] = CL_mult_32x16( ( CL_shr( y[28], 1 ) ), pRotVector_32[16] );
    1654     1919872 :     y[29] = CL_mult_32x16( ( CL_shr( y[29], 1 ) ), pRotVector_32[17] );
    1655     1919872 :     y[30] = CL_mult_32x16( ( CL_shr( y[30], 1 ) ), pRotVector_32[18] );
    1656     1919872 :     y[31] = CL_mult_32x16( ( CL_shr( y[31], 1 ) ), pRotVector_32[19] );
    1657             : 
    1658             :     /* 1. FFT4 stage: combines y[0], y[8], y[16], y[24] and writes inp[0], inp[8], inp[16], inp[24]; the following stages do the same for index offsets 1..7 */
    1659             : 
    1660             :     /*  Pre-additions */
    1661     1919872 :     t[0] = CL_add( y[0], y[16] );
    1662     1919872 :     t[1] = CL_sub( y[0], y[16] );
    1663     1919872 :     t[2] = CL_add( y[8], y[24] );
    1664     1919872 :     t[3] = CL_mul_j( CL_sub( y[8], y[24] ) );
    1665             : 
    1666             :     /*  Post-additions */
    1667     1919872 :     inp[0] = CL_add( t[0], t[2] );
    1668     1919872 :     inp[8] = CL_sub( t[1], t[3] );
    1669     1919872 :     inp[16] = CL_sub( t[0], t[2] );
    1670     1919872 :     inp[24] = CL_add( t[1], t[3] );
    1671             : 
    1672             :     /* 2. FFT4 stage */
    1673             : 
    1674             :     /*  Pre-additions */
    1675     1919872 :     t[0] = CL_add( y[1], y[17] );
    1676     1919872 :     t[1] = CL_sub( y[1], y[17] );
    1677     1919872 :     t[2] = CL_add( y[9], y[25] );
    1678     1919872 :     t[3] = CL_mul_j( CL_sub( y[9], y[25] ) );
    1679             : 
    1680             :     /*  Post-additions */
    1681     1919872 :     inp[1] = CL_add( t[0], t[2] );
    1682     1919872 :     inp[9] = CL_sub( t[1], t[3] );
    1683     1919872 :     inp[17] = CL_sub( t[0], t[2] );
    1684     1919872 :     inp[25] = CL_add( t[1], t[3] );
    1685             : 
    1686             : 
    1687             :     /* 3. FFT4 stage */
    1688             : 
    1689             :     /*  Pre-additions */
    1690     1919872 :     t[0] = CL_add( y[2], y[18] );
    1691     1919872 :     t[1] = CL_sub( y[2], y[18] );
    1692     1919872 :     t[2] = CL_add( y[10], y[26] );
    1693     1919872 :     t[3] = CL_mul_j( CL_sub( y[10], y[26] ) );
    1694             : 
    1695             :     /*  Post-additions */
    1696     1919872 :     inp[2] = CL_add( t[0], t[2] );
    1697     1919872 :     inp[10] = CL_sub( t[1], t[3] );
    1698     1919872 :     inp[18] = CL_sub( t[0], t[2] );
    1699     1919872 :     inp[26] = CL_add( t[1], t[3] );
    1700             : 
    1701             : 
    1702             :     /* 4. FFT4 stage */
    1703             : 
    1704             :     /*  Pre-additions */
    1705     1919872 :     t[0] = CL_add( y[3], y[19] );
    1706     1919872 :     t[1] = CL_sub( y[3], y[19] );
    1707     1919872 :     t[2] = CL_add( y[11], y[27] );
    1708     1919872 :     t[3] = CL_mul_j( CL_sub( y[11], y[27] ) );
    1709             : 
    1710             : 
    1711             :     /*  Post-additions */
    1712     1919872 :     inp[3] = CL_add( t[0], t[2] );
    1713     1919872 :     inp[11] = CL_sub( t[1], t[3] );
    1714     1919872 :     inp[19] = CL_sub( t[0], t[2] );
    1715     1919872 :     inp[27] = CL_add( t[1], t[3] );
    1716             : 
    1717             : 
    1718             :     /* 5. FFT4 stage */
    1719             : 
    1720             :     /*  Pre-additions: CL_msu_j / CL_mac_j are used here where the other stages use CL_add / CL_sub; presumably this applies the rotation of y[20], which was only shifted in the twiddle step -- TODO confirm */
    1721     1919872 :     t[0] = CL_msu_j( y[4], y[20] );
    1722     1919872 :     t[1] = CL_mac_j( y[4], y[20] );
    1723     1919872 :     t[2] = CL_add( y[12], y[28] );
    1724     1919872 :     t[3] = CL_mul_j( CL_sub( y[12], y[28] ) );
    1725             : 
    1726             : 
    1727             :     /*  Post-additions */
    1728     1919872 :     inp[4] = CL_add( t[0], t[2] );
    1729     1919872 :     inp[12] = CL_sub( t[1], t[3] );
    1730     1919872 :     inp[20] = CL_sub( t[0], t[2] );
    1731     1919872 :     inp[28] = CL_add( t[1], t[3] );
    1732             : 
    1733             : 
    1734             :     /* 6. FFT4 stage */
    1735             : 
    1736             :     /*  Pre-additions */
    1737     1919872 :     t[0] = CL_add( y[5], y[21] );
    1738     1919872 :     t[1] = CL_sub( y[5], y[21] );
    1739     1919872 :     t[2] = CL_add( y[13], y[29] );
    1740     1919872 :     t[3] = CL_mul_j( CL_sub( y[13], y[29] ) );
    1741             : 
    1742             : 
    1743             :     /*  Post-additions */
    1744     1919872 :     inp[5] = CL_add( t[0], t[2] );
    1745     1919872 :     inp[13] = CL_sub( t[1], t[3] );
    1746     1919872 :     inp[21] = CL_sub( t[0], t[2] );
    1747     1919872 :     inp[29] = CL_add( t[1], t[3] );
    1748             : 
    1749             : 
    1750             :     /* 7. FFT4 stage */
    1751             : 
    1752             :     /*  Pre-additions */
    1753     1919872 :     t[0] = CL_add( y[6], y[22] );
    1754     1919872 :     t[1] = CL_sub( y[6], y[22] );
    1755     1919872 :     t[2] = CL_add( y[14], y[30] );
    1756     1919872 :     t[3] = CL_mul_j( CL_sub( y[14], y[30] ) );
    1757             : 
    1758             : 
    1759             :     /*  Post-additions */
    1760     1919872 :     inp[6] = CL_add( t[0], t[2] );
    1761     1919872 :     inp[14] = CL_sub( t[1], t[3] );
    1762     1919872 :     inp[22] = CL_sub( t[0], t[2] );
    1763     1919872 :     inp[30] = CL_add( t[1], t[3] );
    1764             : 
    1765             : 
    1766             :     /* 8. FFT4 stage */
    1767             : 
    1768             :     /*  Pre-additions */
    1769     1919872 :     t[0] = CL_add( y[7], y[23] );
    1770     1919872 :     t[1] = CL_sub( y[7], y[23] );
    1771     1919872 :     t[2] = CL_add( y[15], y[31] );
    1772     1919872 :     t[3] = CL_mul_j( CL_sub( y[15], y[31] ) );
    1773             : 
    1774             : 
    1775             :     /*  Post-additions */
    1776     1919872 :     inp[7] = CL_add( t[0], t[2] );
    1777     1919872 :     inp[15] = CL_sub( t[1], t[3] );
    1778     1919872 :     inp[23] = CL_sub( t[0], t[2] );
    1779     1919872 :     inp[31] = CL_add( t[1], t[3] );
    1780             : 
    1781             : #ifdef WMOPS
    1782             :     multiCounter[currCounter].CL_move += 32;
    1783             : #endif
    1784     1919872 : }
    1785             : 
    1786             : 
    1787             : /**
    1788             :  * \brief Combined FFT
    1789             :  *
    1790             :  * \param    [i/o] pComplexBuf  complex data buffer; at least len
    1791             :  *                        elements are accessed (index i + j * dim2)
    1792             :  * \param    [i  ] W      rotation factor
    1793             :  * \param    [i  ] len    length of fft
    1794             :  * \param    [i  ] dim1   length of fft1
    1795             :  * \param    [i  ] dim2   length of fft2
    1796             :  *                        (len == dim1 * dim2 is asserted)
    1797             :  * \param    [i  ] sc     stride phase rotation coefficients
    1798             :  * \param    [tmp] x      32-bit workbuffer of length=2*len
    1799             :  * \param    [i  ] Woff   offset for addressing the rotation vector table
    1800             :  *
    1801             :  * \return void
    1802             :  */
    1803             : 
    1804     3362503 : static void fftN2(
    1805             :     cmplx *__restrict pComplexBuf,
    1806             :     const Word16 *__restrict W,
    1807             :     Word16 len,
    1808             :     Word16 dim1,
    1809             :     Word16 dim2,
    1810             :     Word16 sc,
    1811             :     Word32 *x,
    1812             :     Word16 Woff )
    1813             : {
    1814             :     Word16 i, j;
    1815     3362503 :     cmplx *x_cmplx = (cmplx *) x;
    1816             : 
    1817     3362503 :     assert( len == ( dim1 * dim2 ) );
    1818     3362503 :     assert( ( dim1 == 3 ) || ( dim1 == 5 ) || ( dim1 == 8 ) || ( dim1 == 10 ) || ( dim1 == 15 ) || ( dim1 == 16 ) || ( dim1 == 20 ) || ( dim1 == 30 ) || ( dim1 == 32 ) );
    1819     3362503 :     assert( ( dim2 == 4 ) || ( dim2 == 8 ) || ( dim2 == 10 ) || ( dim2 == 12 ) || ( dim2 == 16 ) || ( dim2 == 20 ) );
    1820             : 
    1821    46246317 :     FOR( i = 0; i < dim2; i++ )
    1822             :     {
    1823  1074212742 :         FOR( j = 0; j < dim1; j++ )
    1824             :         {
    1825  1031328928 :             x_cmplx[i * dim1 + j] = pComplexBuf[i + j * dim2];
    1826             : #ifdef WMOPS
    1827             :             multiCounter[currCounter].CL_move++;
    1828             : #endif
    1829             :         }
    1830             :     }
    1831             : 
    1832     3362503 :     SWITCH( dim1 )
    1833             :     {
    1834       12270 :         case 5:
    1835      110430 :             FOR( i = 0; i < dim2; i++ )
    1836             :             {
    1837       98160 :                 fft5_with_cmplx_data( &x_cmplx[i * dim1] );
    1838             :             }
    1839       12270 :             BREAK;
    1840      288013 :         case 8:
    1841     2592117 :             FOR( i = 0; i < dim2; i++ )
    1842             :             {
    1843     2304104 :                 fft8_with_cmplx_data( &x_cmplx[i * dim1] );
    1844             :             }
    1845      288013 :             BREAK;
    1846       81076 :         case 10:
    1847      729684 :             FOR( i = 0; i < dim2; i++ )
    1848             :             {
    1849      648608 :                 fft10_with_cmplx_data( &x_cmplx[i * dim1] );
    1850             :             }
    1851       81076 :             BREAK;
    1852             : 
    1853       66504 :         case 15:
    1854      598536 :             FOR( i = 0; i < dim2; i++ )
    1855             :             {
    1856      532032 :                 fft15_with_cmplx_data( &x_cmplx[i * dim1] );
    1857             :             }
    1858       66504 :             BREAK;
    1859      264449 :         case 16:
    1860     2380041 :             FOR( i = 0; i < dim2; i++ )
    1861             :             {
    1862     2115592 :                 fft16_with_cmplx_data( &x_cmplx[i * dim1], 1 );
    1863             :             }
    1864      264449 :             BREAK;
    1865     1167589 :         case 20:
    1866    16698591 :             FOR( i = 0; i < dim2; i++ )
    1867             :             {
    1868    15531002 :                 fft20_with_cmplx_data( &x_cmplx[i * dim1] );
    1869             :             }
    1870     1167589 :             BREAK;
    1871     1242618 :         case 30:
    1872    20977062 :             FOR( i = 0; i < dim2; i++ )
    1873             :             {
    1874    19734444 :                 fft30_with_cmplx_data( &x_cmplx[i * dim1] );
    1875             :             }
    1876     1242618 :             BREAK;
    1877      239984 :         case 32:
    1878     2159856 :             FOR( i = 0; i < dim2; i++ )
    1879             :             {
    1880     1919872 :                 fft32_with_cmplx_data( &x_cmplx[i * dim1] );
    1881             :             }
    1882      239984 :             BREAK;
    1883             :     }
    1884             : 
    1885     3362503 :     SWITCH( dim2 )
    1886             :     {
    1887     1359266 :         case 8:
    1888             :         {
    1889             :             cmplx y0, y1, y2, y3, y4, y5, y6, y7;
    1890             :             cmplx t0, t1, t2, t3, t4, t5, t6, t7;
    1891             :             cmplx s0, s1, s2, s3, s4, s5, s6, s7;
    1892             : 
    1893     1359266 :             i = 0;
    1894     1359266 :             move16();
    1895             :             {
    1896     1359266 :                 y0 = CL_shr( x_cmplx[i + 0 * dim1], 1 );
    1897     1359266 :                 y1 = CL_shr( x_cmplx[i + 1 * dim1], 1 );
    1898     1359266 :                 y2 = CL_shr( x_cmplx[i + 2 * dim1], 1 );
    1899     1359266 :                 y3 = CL_shr( x_cmplx[i + 3 * dim1], 1 );
    1900     1359266 :                 y4 = CL_shr( x_cmplx[i + 4 * dim1], 1 );
    1901     1359266 :                 y5 = CL_shr( x_cmplx[i + 5 * dim1], 1 );
    1902     1359266 :                 y6 = CL_shr( x_cmplx[i + 6 * dim1], 1 );
    1903     1359266 :                 y7 = CL_shr( x_cmplx[i + 7 * dim1], 1 );
    1904             : 
    1905     1359266 :                 t0 = CL_shr( CL_add( y0, y4 ), SCALEFACTORN2 - 1 );
    1906     1359266 :                 t1 = CL_shr( CL_sub( y0, y4 ), SCALEFACTORN2 - 1 );
    1907     1359266 :                 t2 = CL_shr( CL_add( y1, y5 ), SCALEFACTORN2 - 1 );
    1908     1359266 :                 t3 = CL_sub( y1, y5 );
    1909     1359266 :                 t4 = CL_shr( CL_add( y2, y6 ), SCALEFACTORN2 - 1 );
    1910     1359266 :                 t5 = CL_shr( CL_sub( y2, y6 ), SCALEFACTORN2 - 1 );
    1911     1359266 :                 t6 = CL_shr( CL_add( y3, y7 ), SCALEFACTORN2 - 1 );
    1912     1359266 :                 t7 = CL_sub( y3, y7 );
    1913             : 
    1914             : 
    1915     1359266 :                 s0 = CL_add( t0, t4 );
    1916     1359266 :                 s2 = CL_sub( t0, t4 );
    1917     1359266 :                 s4 = CL_mac_j( t1, t5 );
    1918     1359266 :                 s5 = CL_msu_j( t1, t5 );
    1919     1359266 :                 s1 = CL_add( t2, t6 );
    1920     1359266 :                 s3 = CL_mul_j( CL_sub( t2, t6 ) );
    1921     1359266 :                 t0 = CL_shr( CL_add( t3, t7 ), SCALEFACTORN2 - 1 );
    1922     1359266 :                 t1 = CL_shr( CL_sub( t3, t7 ), SCALEFACTORN2 - 1 );
    1923     1359266 :                 s6 = CL_scale_t( CL_msu_j( t1, t0 ), C81 );
    1924     1359266 :                 s7 = CL_dscale_t( CL_swap_real_imag( CL_msu_j( t0, t1 ) ), C81, C82 );
    1925             : 
    1926     1359266 :                 pComplexBuf[i + 0 * dim1] = CL_add( s0, s1 );
    1927     1359266 :                 pComplexBuf[i + 1 * dim1] = CL_add( s5, s6 );
    1928     1359266 :                 pComplexBuf[i + 2 * dim1] = CL_sub( s2, s3 );
    1929     1359266 :                 pComplexBuf[i + 3 * dim1] = CL_add( s4, s7 );
    1930     1359266 :                 pComplexBuf[i + 4 * dim1] = CL_sub( s0, s1 );
    1931     1359266 :                 pComplexBuf[i + 5 * dim1] = CL_sub( s5, s6 );
    1932     1359266 :                 pComplexBuf[i + 6 * dim1] = CL_add( s2, s3 );
    1933     1359266 :                 pComplexBuf[i + 7 * dim1] = CL_sub( s4, s7 );
    1934             :             }
    1935             : 
    1936             : 
    1937    24513326 :             FOR( i = 1; i < dim1; i++ )
    1938             :             {
    1939    23154060 :                 y0 = CL_shr( x_cmplx[i + 0 * dim1], 1 );
    1940    23154060 :                 y1 = CL_shr( CL_mult_32x16( x_cmplx[i + 1 * dim1], *(const cmplx_s *) &W[sc * i + sc * 1 * dim1 - Woff] ), 1 );
    1941    23154060 :                 y2 = CL_shr( CL_mult_32x16( x_cmplx[i + 2 * dim1], *(const cmplx_s *) &W[sc * i + sc * 2 * dim1 - Woff] ), 1 );
    1942    23154060 :                 y3 = CL_shr( CL_mult_32x16( x_cmplx[i + 3 * dim1], *(const cmplx_s *) &W[sc * i + sc * 3 * dim1 - Woff] ), 1 );
    1943    23154060 :                 y4 = CL_shr( CL_mult_32x16( x_cmplx[i + 4 * dim1], *(const cmplx_s *) &W[sc * i + sc * 4 * dim1 - Woff] ), 1 );
    1944    23154060 :                 y5 = CL_shr( CL_mult_32x16( x_cmplx[i + 5 * dim1], *(const cmplx_s *) &W[sc * i + sc * 5 * dim1 - Woff] ), 1 );
    1945    23154060 :                 y6 = CL_shr( CL_mult_32x16( x_cmplx[i + 6 * dim1], *(const cmplx_s *) &W[sc * i + sc * 6 * dim1 - Woff] ), 1 );
    1946    23154060 :                 y7 = CL_shr( CL_mult_32x16( x_cmplx[i + 7 * dim1], *(const cmplx_s *) &W[sc * i + sc * 7 * dim1 - Woff] ), 1 );
    1947             : 
    1948    23154060 :                 t0 = CL_shr( CL_add( y0, y4 ), SCALEFACTORN2 - 1 );
    1949    23154060 :                 t1 = CL_shr( CL_sub( y0, y4 ), SCALEFACTORN2 - 1 );
    1950    23154060 :                 t2 = CL_shr( CL_add( y1, y5 ), SCALEFACTORN2 - 1 );
    1951    23154060 :                 t3 = CL_sub( y1, y5 );
    1952    23154060 :                 t4 = CL_shr( CL_add( y2, y6 ), SCALEFACTORN2 - 1 );
    1953    23154060 :                 t5 = CL_shr( CL_sub( y2, y6 ), SCALEFACTORN2 - 1 );
    1954    23154060 :                 t6 = CL_shr( CL_add( y3, y7 ), SCALEFACTORN2 - 1 );
    1955    23154060 :                 t7 = CL_sub( y3, y7 );
    1956             : 
    1957             : 
    1958    23154060 :                 s0 = CL_add( t0, t4 );
    1959    23154060 :                 s2 = CL_sub( t0, t4 );
    1960    23154060 :                 s4 = CL_mac_j( t1, t5 );
    1961    23154060 :                 s5 = CL_msu_j( t1, t5 );
    1962    23154060 :                 s1 = CL_add( t2, t6 );
    1963    23154060 :                 s3 = CL_mul_j( CL_sub( t2, t6 ) );
    1964    23154060 :                 t0 = CL_shr( CL_add( t3, t7 ), SCALEFACTORN2 - 1 );
    1965    23154060 :                 t1 = CL_shr( CL_sub( t3, t7 ), SCALEFACTORN2 - 1 );
    1966    23154060 :                 s6 = CL_scale_t( CL_msu_j( t1, t0 ), C81 );
    1967    23154060 :                 s7 = CL_dscale_t( CL_swap_real_imag( CL_msu_j( t0, t1 ) ), C81, C82 );
    1968             : 
    1969    23154060 :                 pComplexBuf[i + 0 * dim1] = CL_add( s0, s1 );
    1970    23154060 :                 pComplexBuf[i + 1 * dim1] = CL_add( s5, s6 );
    1971    23154060 :                 pComplexBuf[i + 2 * dim1] = CL_sub( s2, s3 );
    1972    23154060 :                 pComplexBuf[i + 3 * dim1] = CL_add( s4, s7 );
    1973    23154060 :                 pComplexBuf[i + 4 * dim1] = CL_sub( s0, s1 );
    1974    23154060 :                 pComplexBuf[i + 5 * dim1] = CL_sub( s5, s6 );
    1975    23154060 :                 pComplexBuf[i + 6 * dim1] = CL_add( s2, s3 );
    1976    23154060 :                 pComplexBuf[i + 7 * dim1] = CL_sub( s4, s7 );
    1977             :             }
    1978             : 
    1979     1359266 :             BREAK;
    1980             :         }
    1981             : 
    1982       24855 :         case 10:
    1983             :         {
    1984             :             cmplx y[20];
    1985             :             {
    1986      273405 :                 FOR( j = 0; j < dim2; j++ )
    1987             :                 {
    1988      248550 :                     y[j] = CL_move( x_cmplx[j * dim1] );
    1989             :                 }
    1990       24855 :                 fft10_with_cmplx_data( &y[0] );
    1991      273405 :                 FOR( j = 0; j < dim2; j++ )
    1992             :                 {
    1993      248550 :                     pComplexBuf[j * dim1] = y[j];
    1994             :                 }
    1995      497100 :                 FOR( i = 1; i < dim1; i++ )
    1996             :                 {
    1997      472245 :                     y[0] = CL_move( x_cmplx[i] );
    1998     4722450 :                     FOR( j = 1; j < dim2; j++ )
    1999             :                     {
    2000     4250205 :                         y[j] = CL_mult_32x16( x_cmplx[i + j * dim1], *(const cmplx_s *) &W[sc * i + sc * j * dim1 - Woff] );
    2001             :                     }
    2002      472245 :                     fft10_with_cmplx_data( &y[0] );
    2003     5194695 :                     FOR( j = 0; j < dim2; j++ )
    2004             :                     {
    2005     4722450 :                         pComplexBuf[i + j * dim1] = y[j];
    2006             :                     }
    2007             :                 }
    2008             :             }
    2009       24855 :             BREAK;
    2010             :         }
    2011     1951626 :         case 16:
    2012             :         {
    2013             :             cmplx y[20];
    2014             : 
    2015    33177642 :             FOR( j = 0; j < dim2; j++ )
    2016             :             {
    2017    31226016 :                 y[j] = CL_shr( x_cmplx[0 + j * dim1], SCALEFACTOR16 );
    2018             :             }
    2019     1951626 :             fft16_with_cmplx_data( &y[0], 0 );
    2020             : 
    2021    33177642 :             FOR( j = 0; j < dim2; j++ )
    2022             :             {
    2023    31226016 :                 pComplexBuf[j * dim1] = y[j];
    2024             :             }
    2025    50958870 :             FOR( i = 1; i < dim1; i++ )
    2026             :             {
    2027    49007244 :                 y[0] = CL_shr( x_cmplx[i + ( 0 + 0 ) * dim1], SCALEFACTOR16 );
    2028    49007244 :                 y[1] = CL_shr( CL_mult_32x16( x_cmplx[i + dim1], *(const cmplx_s *) &W[len + sc * i + 0 * dim1 - Woff] ), SCALEFACTOR16 );
    2029             : 
    2030   392057952 :                 FOR( j = 2; j < dim2; j = j + 2 )
    2031             :                 {
    2032   343050708 :                     y[( j + 0 )] = CL_shr( CL_mult_32x16( x_cmplx[i + ( j + 0 ) * dim1], *(const cmplx_s *) &W[sc * i + j * dim1 - Woff] ), SCALEFACTOR16 );
    2033   343050708 :                     y[( j + 1 )] = CL_shr( CL_mult_32x16( x_cmplx[i + ( j + 1 ) * dim1], *(const cmplx_s *) &W[len + sc * i + j * dim1 - Woff] ), SCALEFACTOR16 );
    2034             :                 }
    2035    49007244 :                 fft16_with_cmplx_data( &y[0], 0 );
    2036   833123148 :                 FOR( j = 0; j < dim2; j++ )
    2037             :                 {
    2038   784115904 :                     pComplexBuf[i + j * dim1] = y[j];
    2039             :                 }
    2040             :             }
    2041             :         }
    2042     1951626 :             BREAK;
    2043             : 
    2044       26756 :         case 20:
    2045             : 
    2046       26756 :             assert( dim1 == 20 || dim1 == 30 ); /* cplxMpy4_10_0 contains shift values hardcoded FOR 20x10 */
    2047       26756 :             IF( EQ_16( dim1, 20 ) )
    2048             :             {
    2049             :                 cmplx y[20];
    2050      120141 :                 FOR( j = 0; j < dim2; j++ )
    2051             :                 {
    2052      114420 :                     y[j] = CL_move( x_cmplx[j * dim1] );
    2053             :                 }
    2054        5721 :                 fft20_with_cmplx_data( &y[0] );
    2055      120141 :                 FOR( j = 0; j < dim2; j++ )
    2056             :                 {
    2057      114420 :                     pComplexBuf[j * dim1] = y[j];
    2058             :                 }
    2059      114420 :                 FOR( i = 1; i < dim1; i++ )
    2060             :                 {
    2061      108699 :                     y[0] = CL_move( x_cmplx[i] );
    2062      108699 :                     y[1] = CL_mult_32x16( x_cmplx[i + dim1], *(const cmplx_s *) &W[len + sc * i + 0 * dim1 - Woff] );
    2063     1086990 :                     FOR( j = 2; j < dim2; j = j + 2 )
    2064             :                     {
    2065             : 
    2066      978291 :                         y[j + 0] = CL_mult_32x16( x_cmplx[i + ( j + 0 ) * dim1], *(const cmplx_s *) &W[sc * i + j * dim1 - Woff] );
    2067      978291 :                         y[j + 1] = CL_mult_32x16( x_cmplx[i + ( j + 1 ) * dim1], *(const cmplx_s *) &W[len + sc * i + j * dim1 - Woff] );
    2068             :                     }
    2069      108699 :                     fft20_with_cmplx_data( &y[0] );
    2070     2282679 :                     FOR( j = 0; j < dim2; j++ )
    2071             :                     {
    2072     2173980 :                         pComplexBuf[i + j * dim1] = y[j];
    2073             :                     }
    2074             :                 }
    2075             :             }
    2076             :             ELSE
    2077             :             {
    2078             :                 cmplx y[20];
    2079      441735 :                 FOR( j = 0; j < dim2; j++ )
    2080             :                 {
    2081      420700 :                     y[j] = CL_shl( x_cmplx[j * dim1], ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2082             :                 }
    2083       21035 :                 fft20_with_cmplx_data( &y[0] );
    2084      441735 :                 FOR( j = 0; j < dim2; j++ )
    2085             :                 {
    2086      420700 :                     pComplexBuf[j * dim1] = y[j];
    2087             :                 }
    2088      631050 :                 FOR( i = 1; i < dim1; i++ )
    2089             :                 {
    2090      610015 :                     y[0] = CL_shl( x_cmplx[i], ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2091      610015 :                     y[1] = CL_shl( CL_mult_32x16( x_cmplx[i + dim1], *(const cmplx_s *) &W[len + sc * i + 0 * dim1 - Woff] ), ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2092     6100150 :                     FOR( j = 2; j < dim2; j = j + 2 )
    2093             :                     {
    2094             : 
    2095     5490135 :                         y[j + 0] = CL_shl( CL_mult_32x16( x_cmplx[i + ( j + 0 ) * dim1], *(const cmplx_s *) &W[sc * i + j * dim1 - Woff] ), ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2096     5490135 :                         y[j + 1] = CL_shl( CL_mult_32x16( x_cmplx[i + ( j + 1 ) * dim1], *(const cmplx_s *) &W[len + sc * i + j * dim1 - Woff] ), ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2097             :                     }
    2098      610015 :                     fft20_with_cmplx_data( &y[0] );
    2099    12810315 :                     FOR( j = 0; j < dim2; j++ )
    2100             :                     {
    2101    12200300 :                         pComplexBuf[i + j * dim1] = y[j];
    2102             :                     }
    2103             :                 }
    2104             :             }
    2105       26756 :             BREAK;
    2106             :     }
    2107             : #ifdef WMOPS
    2108             :     multiCounter[currCounter].CL_move += len;
    2109             : #endif
    2110     3362503 : }
    2111             : 
    2112             : 
    2113             : /**
    2114             :  * \brief Complex valued FFT
    2115             :  *
    2116             :  * \param    [i/o] pComplexBuf complex input/output buffer handed to the FFT routine selected by sizeOfFft
    2117             :  * \param    [i  ] sizeOfFft   length of fft; must match one of the case labels of the SWITCH in the function body
    2118             :  * \param    [i/o] scale       scalefactor; on return set to add( *scale, SCALEFACTOR<length> ) for the selected length
    2119             :  * \param    [tmp] x           work buffer forwarded to fftN2() for lengths of 40 and above
    2120             :  *                             (declared with 2 * BASOP_CFFT_MAX_LENGTH Word32 elements)
    2121             :  *
    2122             :  * \return void
    2123             :  */
    2124     3761371 : void BASOP_cfft( cmplx *pComplexBuf, Word16 sizeOfFft, Word16 *scale, Word32 x[2 * BASOP_CFFT_MAX_LENGTH] )
    2125             : {
    2126             :     Word16 s;
    2127     3761371 :     s = 0; /* keeps this value if no case below assigns it */
    2128     3761371 :     move16();
    2129     3761371 :     SWITCH( sizeOfFft ) /* lengths up to 32 call a dedicated routine directly */
    2130             :     {
    2131           0 :         case 5:
    2132           0 :             fft5_with_cmplx_data( pComplexBuf );
    2133           0 :             s = add( *scale, SCALEFACTOR5 );
    2134           0 :             BREAK;
    2135             : 
    2136       64672 :         case 8:
    2137       64672 :             fft8_with_cmplx_data( pComplexBuf );
    2138       64672 :             s = add( *scale, SCALEFACTOR8 );
    2139       64672 :             BREAK;
    2140             : 
    2141      149236 :         case 10:
    2142      149236 :             fft10_with_cmplx_data( pComplexBuf );
    2143      149236 :             s = add( *scale, SCALEFACTOR10 );
    2144      149236 :             BREAK;
    2145             : 
    2146           0 :         case 16:
    2147           0 :             fft16_with_cmplx_data( pComplexBuf, 1 );
    2148           0 :             s = add( *scale, SCALEFACTOR16 );
    2149           0 :             BREAK;
    2150             : 
    2151       66236 :         case 20:
    2152       66236 :             fft20_with_cmplx_data( pComplexBuf );
    2153       66236 :             s = add( *scale, SCALEFACTOR20 );
    2154       66236 :             BREAK;
    2155             : 
    2156      118724 :         case 30:
    2157      118724 :             fft30_with_cmplx_data( pComplexBuf );
    2158      118724 :             s = add( *scale, SCALEFACTOR30 );
    2159      118724 :             BREAK;
    2160             : 
    2161           0 :         case 32:
    2162           0 :             fft32_with_cmplx_data( pComplexBuf );
    2163           0 :             s = add( *scale, SCALEFACTOR32 );
    2164           0 :             BREAK;
    2165             : 
    2166       12270 :         case 40: /* from here on: fftN2( pComplexBuf, W, len, dim1, dim2, sc, x, Woff ) with len == dim1 * dim2 */
    2167             :         {
    2168       12270 :             fftN2( pComplexBuf, RotVector_320, 40, 5, 8, 8, x, 40 );
    2169       12270 :             s = add( *scale, SCALEFACTOR40 );
    2170       12270 :             BREAK;
    2171             :         }
    2172             : 
    2173      288013 :         case 64:
    2174             :         {
    2175      288013 :             fftN2( pComplexBuf, RotVector_256, 64, 8, 8, 8, x, 64 );
    2176      288013 :             s = add( *scale, SCALEFACTOR64 );
    2177      288013 :             BREAK;
    2178             :         }
    2179             : 
    2180       81076 :         case 80:
    2181             :         {
    2182       81076 :             fftN2( pComplexBuf, RotVector_320, 80, 10, 8, 4, x, 40 );
    2183       81076 :             s = add( *scale, SCALEFACTOR80 );
    2184       81076 :             BREAK;
    2185             :         }
    2186           0 :         case 100:
    2187             :         {
    2188           0 :             fftN2( pComplexBuf, RotVector_400, 100, 10, 10, 4, x, 40 );
    2189           0 :             s = add( *scale, SCALEFACTOR100 );
    2190           0 :             BREAK;
    2191             :         }
    2192       66504 :         case 120:
    2193             :         {
    2194       66504 :             fftN2( pComplexBuf, RotVector_480, 120, 15, 8, 4, x, 60 );
    2195       66504 :             s = add( *scale, SCALEFACTOR120 );
    2196       66504 :             BREAK;
    2197             :         }
    2198             : 
    2199      264449 :         case 128:
    2200             :         {
    2201      264449 :             fftN2( pComplexBuf, RotVector_256, 128, 16, 8, 4, x, 64 );
    2202      264449 :             s = add( *scale, SCALEFACTOR128 );
    2203      264449 :             BREAK;
    2204             :         }
    2205             : 
    2206      378022 :         case 160:
    2207             :         {
    2208      378022 :             fftN2( pComplexBuf, RotVector_320, 160, 20, 8, 2, x, 40 );
    2209      378022 :             s = add( *scale, SCALEFACTOR160 );
    2210      378022 :             BREAK;
    2211             :         }
    2212             : 
    2213       24855 :         case 200:
    2214             :         {
    2215       24855 :             fftN2( pComplexBuf, RotVector_400, 200, 20, 10, 2, x, 40 );
    2216       24855 :             s = add( *scale, SCALEFACTOR200 );
    2217       24855 :             BREAK;
    2218             :         }
    2219             : 
    2220       28948 :         case 240:
    2221             :         {
    2222       28948 :             fftN2( pComplexBuf, RotVector_480, 240, 30, 8, 2, x, 60 );
    2223       28948 :             s = add( *scale, SCALEFACTOR240 );
    2224       28948 :             BREAK;
    2225             :         }
    2226             : 
    2227      239984 :         case 256:
    2228             :         {
    2229      239984 :             fftN2( pComplexBuf, RotVector_256, 256, 32, 8, 2, x, 64 );
    2230      239984 :             s = add( *scale, SCALEFACTOR256 );
    2231      239984 :             BREAK;
    2232             :         }
    2233             : 
    2234      758991 :         case 320:
    2235             :         {
    2236      758991 :             fftN2( pComplexBuf, RotVector_320, 320, 20, 16, 2, x, 40 );
    2237      758991 :             s = add( *scale, SCALEFACTOR320 );
    2238      758991 :             BREAK;
    2239             :         }
    2240             : 
    2241        5721 :         case 400:
    2242             :         {
    2243        5721 :             fftN2( pComplexBuf, RotVector_400, 400, 20, 20, 2, x, 40 );
    2244        5721 :             s = add( *scale, SCALEFACTOR400 );
    2245        5721 :             BREAK;
    2246             :         }
    2247             : 
    2248     1192635 :         case 480:
    2249             :         {
    2250     1192635 :             fftN2( pComplexBuf, RotVector_480, 480, 30, 16, 2, x, 60 );
    2251     1192635 :             s = add( *scale, SCALEFACTOR480 );
    2252     1192635 :             BREAK;
    2253             :         }
    2254       21035 :         case 600:
    2255             :         {
    2256       21035 :             fftN2( pComplexBuf, RotVector_600, 600, 30, 20, 2, x, 60 );
    2257       21035 :             s = add( *scale, SCALEFACTOR600 );
    2258       21035 :             BREAK;
    2259             :         }
    2260           0 :         default: /* NOTE(review): unsupported length only asserts; if assert() is compiled out, no FFT runs and *scale is overwritten with 0 below */
    2261           0 :             assert( 0 );
    2262             :     }
    2263     3761371 :     *scale = s; /* hand the updated scalefactor back to the caller */
    2264     3761371 :     move16();
    2265     3761371 : }
    2266             : 
    2267             : /* RFFT_TWIDDLE1: loads x[2 * i], x[2 * i + 1], x[sizeOfFft - 2 * i], x[sizeOfFft - 2 * i + 1] (each shifted right by 2) into xb0, xb1, xt0, xt1; with d = ( xb0 - xt0 ) + j * ( xb1 + xt1 ) it sets t3 + j * t4 = d * ( w1 + j * w2 ) using Mpy_32_16_1, then t1 = xb0 + xt0 and t2 = xb1 - xt1. Uses i and sizeOfFft from the enclosing scope. */
    2268             : #define RFFT_TWIDDLE1( x, t1, t2, t3, t4, w1, w2, xb0, xb1, xt0, xt1 ) \
    2269             :     {                                                                  \
    2270             :         xb0 = L_shr( x[2 * i + 0], 2 );                                \
    2271             :         xb1 = L_shr( x[2 * i + 1], 2 );                                \
    2272             :         xt0 = L_shr( x[sizeOfFft - 2 * i + 0], 2 );                    \
    2273             :         xt1 = L_shr( x[sizeOfFft - 2 * i + 1], 2 );                    \
    2274             :         t1 = L_sub( xb0, xt0 );                                        \
    2275             :         t2 = L_add( xb1, xt1 );                                        \
    2276             :         t3 = L_sub( Mpy_32_16_1( t1, w1 ), Mpy_32_16_1( t2, w2 ) );    \
    2277             :         t4 = L_add( Mpy_32_16_1( t1, w2 ), Mpy_32_16_1( t2, w1 ) );    \
    2278             :         t1 = L_add( xb0, xt0 );                                        \
    2279             :         t2 = L_sub( xb1, xt1 );                                        \
    2280             :     }
    2281             : /* RFFT_TWIDDLE2: same loads and final t1 / t2 as RFFT_TWIDDLE1, but t3 + j * t4 = d * ( w1 - j * w2 ), i.e. the product uses the conjugated factor. Uses i and sizeOfFft from the enclosing scope. */
    2282             : #define RFFT_TWIDDLE2( x, t1, t2, t3, t4, w1, w2, xb0, xb1, xt0, xt1 ) \
    2283             :     {                                                                  \
    2284             :         xb0 = L_shr( x[2 * i + 0], 2 );                                \
    2285             :         xb1 = L_shr( x[2 * i + 1], 2 );                                \
    2286             :         xt0 = L_shr( x[sizeOfFft - 2 * i + 0], 2 );                    \
    2287             :         xt1 = L_shr( x[sizeOfFft - 2 * i + 1], 2 );                    \
    2288             :         t1 = L_sub( xb0, xt0 );                                        \
    2289             :         t2 = L_add( xb1, xt1 );                                        \
    2290             :         t3 = L_add( Mpy_32_16_1( t1, w1 ), Mpy_32_16_1( t2, w2 ) );    \
    2291             :         t4 = L_sub( Mpy_32_16_1( t2, w1 ), Mpy_32_16_1( t1, w2 ) );    \
    2292             :         t1 = L_add( xb0, xt0 );                                        \
    2293             :         t2 = L_sub( xb1, xt1 );                                        \
    2294             :     }
    2295             : 
    2296             : /**
    2297             :  * \brief Real valued FFT
    2298             :  *
    2299             :  *        forward rFFT (isign == -1):
    2300             :  *        The input vector contains sizeOfFft real valued time samples. The output vector contains sizeOfFft/2 complex valued
    2301             :  *        spectral values. The spectral values reside interleaved in the output vector. x[1] contains re[sizeOfFft], because
    2302             :  *        the imaginary part that would otherwise be stored in x[1] is zero by default. This allows use of a sizeOfFft length buffer instead of sizeOfFft+1.
    2303             :  *
    2304             :  *        inverse rFFT (isign == +1):
    2305             :  *        The input vector contains sizeOfFft complex valued spectral values. The output vector contains sizeOfFft real valued
    2306             :  *        time samples. The spectral values reside interleaved in the input vector. x[1] contains re[sizeOfFft].
    2307             :  *        (see also forward rFFT)
    2308             :  *
    2309             :  * \param    [i/o] x           real input / real and imag output interleaved
    2310             :  * \param    [i  ] sizeOfFft   length of fft
    2311             :  * \param    [i/o] scale       scalefactor (updated through the pointer by BASOP_cfft())
    2312             :  * \param    [i  ] isign       forward (-1) / backward (+1)
    2313             :  *
    2314             :  * \return void
    2315             :  */
    2316       38851 : void BASOP_rfft( Word32 *x, Word16 sizeOfFft, Word16 *scale, Word16 isign )
    2317             : {
    2318       38851 :     Word16 i, s = 0, sizeOfFft2, sizeOfFft4, sizeOfFft8, wstride; /* clear s to calm down compiler */
    2319             :     Word32 t1, t2, t3, t4, xb0, xb1, xt0, xt1;
    2320             :     const PWord16 *w1;
    2321             :     Word16 c1;
    2322             :     Word16 c2;
    2323             :     Word32 workBuffer[2 * BASOP_CFFT_MAX_LENGTH]; /* scratch buffer handed to BASOP_cfft() in both branches */
    2324             :     /* Real FFT on x, built around one BASOP_cfft() call of sizeOfFft / 2 complex points on the same buffer. */
    2325             :     /* isign == -1: complex FFT first, then RFFT_TWIDDLE1 loops; isign == +1: RFFT_TWIDDLE2 loops first, then complex FFT. */
    2326       38851 :     sizeOfFft2 = shr( sizeOfFft, 1 ); /* sizeOfFft / 2: number of complex points given to BASOP_cfft() */
    2327       38851 :     sizeOfFft4 = shr( sizeOfFft, 2 ); /* sizeOfFft / 4: upper bound of the twiddle loops */
    2328       38851 :     sizeOfFft8 = shr( sizeOfFft, 3 ); /* sizeOfFft / 8: split point between the two twiddle loops */
    2329             : 
    2330       38851 :     BASOP_getTables( NULL, &w1, &wstride, sizeOfFft2 ); /* fills w1 and wstride for length sizeOfFft2; helper defined elsewhere - presumably twiddle table and its index stride */
    2331             : 
    2332       38851 :     SWITCH( isign )
    2333             :     {
    2334          15 :         case -1:
    2335             :             /* x is reinterpreted as sizeOfFft2 complex values; the result is read back from x below */
    2336          15 :             BASOP_cfft( (cmplx *) x, sizeOfFft2, scale, workBuffer );
    2337             :             /* entries 0 and 1: halve both, then x[0] = sum and x[1] = xb0 - xb1 */
    2338          15 :             xb0 = L_shr( x[0], 1 );
    2339          15 :             xb1 = L_shr( x[1], 1 );
    2340          15 :             x[0] = L_add( xb0, xb1 );
    2341          15 :             move32();
    2342          15 :             x[1] = L_sub( xb0, xb1 );
    2343          15 :             move32();
    2344             :             /* 1 <= i < sizeOfFft8: table entry w1[i * wstride], passed to the macro in (im, re) order; RFFT_TWIDDLE1 is defined outside this view and sets t1..t4 */
    2345        1088 :             FOR( i = 1; i < sizeOfFft8; i++ )
    2346             :             {
    2347        1073 :                 RFFT_TWIDDLE1( x, t1, t2, t3, t4, w1[i * wstride].v.im, w1[i * wstride].v.re, xb0, xb1, xt0, xt1 )
    2348        1073 :                 x[2 * i] = L_sub( t1, t3 );
    2349        1073 :                 move32();
    2350        1073 :                 x[2 * i + 1] = L_sub( t2, t4 );
    2351        1073 :                 move32();
    2352        1073 :                 x[sizeOfFft - 2 * i] = L_add( t1, t3 );
    2353        1073 :                 move32();
    2354        1073 :                 x[sizeOfFft - 2 * i + 1] = L_negate( L_add( t2, t4 ) );
    2355        1073 :                 move32();
    2356             :             }
    2357             :             /* sizeOfFft8 <= i < sizeOfFft4: table indexed backwards via ( sizeOfFft4 - i ), arguments passed in (re, im) order; same four stores as above */
    2358        1103 :             FOR( i = sizeOfFft8; i < sizeOfFft4; i++ )
    2359             :             {
    2360        1088 :                 RFFT_TWIDDLE1( x, t1, t2, t3, t4, w1[( sizeOfFft4 - i ) * wstride].v.re, w1[( sizeOfFft4 - i ) * wstride].v.im, xb0, xb1, xt0, xt1 )
    2361        1088 :                 x[2 * i] = L_sub( t1, t3 );
    2362        1088 :                 move32();
    2363        1088 :                 x[2 * i + 1] = L_sub( t2, t4 );
    2364        1088 :                 move32();
    2365        1088 :                 x[sizeOfFft - 2 * i] = L_add( t1, t3 );
    2366        1088 :                 move32();
    2367        1088 :                 x[sizeOfFft - 2 * i + 1] = L_negate( L_add( t2, t4 ) );
    2368        1088 :                 move32();
    2369             :             }
    2370             :             /* i == sizeOfFft4 after the loop; for sizeOfFft a multiple of 4 source and destination are the same pair x[sizeOfFft2], x[sizeOfFft2 + 1]: halve both, negate the second */
    2371          15 :             x[sizeOfFft - 2 * i] = L_shr( x[2 * i + 0], 1 );
    2372          15 :             move32();
    2373          15 :             x[sizeOfFft - 2 * i + 1] = L_negate( L_shr( x[2 * i + 1], 1 ) );
    2374          15 :             move32();
    2375             :             /* returned scale is one larger than the value left in *scale by BASOP_cfft() */
    2376          15 :             *scale = add( *scale, 1 );
    2377          15 :             move16();
    2378          15 :             BREAK;
    2379             : 
    2380       38836 :         case +1:
    2381             :             /* entries 0 and 1: shift both right by 2, then x[0] = sum and x[1] = xb1 - xb0 (operands swapped relative to the -1 branch) */
    2382       38836 :             xb0 = L_shr( x[0], 2 );
    2383       38836 :             xb1 = L_shr( x[1], 2 );
    2384       38836 :             x[0] = L_add( xb0, xb1 );
    2385       38836 :             move32();
    2386       38836 :             x[1] = L_sub( xb1, xb0 );
    2387       38836 :             move32();
    2388             :             /* 1 <= i < sizeOfFft8: RFFT_TWIDDLE2 (defined outside this view) sets t1..t4; odd-index stores are t4 - t2 and t2 + t4, where the -1 branch stores t2 - t4 and -( t2 + t4 ) */
    2389     2836960 :             FOR( i = 1; i < sizeOfFft8; i++ )
    2390             :             {
    2391     2798124 :                 RFFT_TWIDDLE2( x, t1, t2, t3, t4, w1[i * wstride].v.im, w1[i * wstride].v.re, xb0, xb1, xt0, xt1 )
    2392             : 
    2393     2798124 :                 x[2 * i] = L_sub( t1, t3 );
    2394     2798124 :                 move32();
    2395     2798124 :                 x[2 * i + 1] = L_sub( t4, t2 );
    2396     2798124 :                 move32();
    2397     2798124 :                 x[sizeOfFft - 2 * i] = L_add( t1, t3 );
    2398     2798124 :                 move32();
    2399     2798124 :                 x[sizeOfFft - 2 * i + 1] = L_add( t2, t4 );
    2400     2798124 :                 move32();
    2401             :             }
    2402             :             /* sizeOfFft8 <= i < sizeOfFft4: table indexed backwards via ( sizeOfFft4 - i ), arguments passed in (re, im) order */
    2403     2875796 :             FOR( i = sizeOfFft8; i < sizeOfFft4; i++ )
    2404             :             {
    2405     2836960 :                 RFFT_TWIDDLE2( x, t1, t2, t3, t4, w1[( sizeOfFft4 - i ) * wstride].v.re, w1[( sizeOfFft4 - i ) * wstride].v.im, xb0, xb1, xt0, xt1 )
    2406             : 
    2407     2836960 :                 x[2 * i] = L_sub( t1, t3 );
    2408     2836960 :                 move32();
    2409     2836960 :                 x[2 * i + 1] = L_sub( t4, t2 );
    2410     2836960 :                 move32();
    2411     2836960 :                 x[sizeOfFft - 2 * i] = L_add( t1, t3 );
    2412     2836960 :                 move32();
    2413     2836960 :                 x[sizeOfFft - 2 * i + 1] = L_add( t2, t4 );
    2414     2836960 :                 move32();
    2415             :             }
    2416             :             /* i == sizeOfFft4 after the loop; for sizeOfFft a multiple of 4 this halves x[sizeOfFft2] and x[sizeOfFft2 + 1] in place (no negation in this branch) */
    2417       38836 :             x[sizeOfFft - 2 * i] = L_shr( x[2 * i + 0], 1 );
    2418       38836 :             move32();
    2419       38836 :             x[sizeOfFft - 2 * i + 1] = L_shr( x[2 * i + 1], 1 );
    2420       38836 :             move32();
    2421             :             /* complex FFT of sizeOfFft2 points over the data prepared above */
    2422       38836 :             BASOP_cfft( (cmplx *) x, sizeOfFft2, scale, workBuffer );
    2423             :             /* size-dependent pass over all sizeOfFft2 pairs of x */
    2424       38836 :             SWITCH( sizeOfFft )
    2425             :             {
    2426       21966 :                 case 40:
    2427             :                 case 80:
    2428             :                 case 320:
    2429             :                 case 640:
    2430       21966 :                     c1 = FFTC( 0x66666680 ); /* 0x66666680 / 2^31 is about 0.8; FFTC is defined elsewhere - presumably converts the constant to Word16 */
    2431       21966 :                     move16();
    2432       21966 :                     c2 = FFTC( 0x99999980 ); /* 0x99999980 is the 32-bit two's complement negative of 0x66666680 */
    2433       21966 :                     move16();
    2434     7051086 :                     FOR( i = 0; i < sizeOfFft2; i++ )
    2435             :                     {
    2436     7029120 :                         x[2 * i] = Mpy_32_xx( x[2 * i], c1 ); /* even-indexed entries multiplied by c1 */
    2437     7029120 :                         move32();
    2438     7029120 :                         x[2 * i + 1] = Mpy_32_xx( x[2 * i + 1], c2 ); /* odd-indexed entries multiplied by c2 */
    2439     7029120 :                         move32();
    2440             :                     }
    2441       21966 :                     BREAK;
    2442             :                     /* the sizes below only get their odd-indexed entries negated; no multiplication is applied */
    2443       16870 :                 case 64:
    2444             :                 case 256:
    2445             :                 case 512:
    2446     4335590 :                     FOR( i = 0; i < sizeOfFft2; i++ )
    2447             :                     {
    2448     4318720 :                         x[2 * i + 1] = L_negate( x[2 * i + 1] );
    2449     4318720 :                         move32();
    2450             :                     }
    2451       16870 :                     BREAK;
    2452             : 
    2453           0 :                 default:
    2454           0 :                     assert( 0 );
    2455             :             }
    2456             :             /* NOTE(review): only 64, 512 and 640 are handled below; 40, 80, 320 and 256 pass the switch above but reach assert( 0 ) here, and with assertions compiled out s keeps its initial 0 so *scale is set to 0 - confirm intended */
    2457       38836 :             SWITCH( sizeOfFft )
    2458             :             {
    2459           0 :                 case 64:
    2460           0 :                     s = add( *scale, 2 - 6 );
    2461           0 :                     BREAK;
    2462             : 
    2463       16870 :                 case 512:
    2464       16870 :                     s = add( *scale, 2 - 9 );
    2465       16870 :                     BREAK;
    2466             : 
    2467       21966 :                 case 640:
    2468       21966 :                     s = add( *scale, 2 - 9 );
    2469       21966 :                     BREAK;
    2470             : 
    2471           0 :                 default:
    2472           0 :                     assert( 0 );
    2473             :             }
    2474       38836 :             *scale = s; /* value from BASOP_cfft() plus the size-specific offset chosen above */
    2475       38836 :             move16();
    2476       38836 :             BREAK;
    2477             :     }
    2478       38851 : }

Generated by: LCOV version 1.14