LCOV - code coverage report
Current view: top level - lib_com - fft_evs.c (source / functions) Hit Total Coverage
Test: Coverage on main -- dec/rend @ 4c82f1d24d39d0296b18d775f18a006f4c7d024b Lines: 1384 1422 97.3 %
Date: 2025-05-17 01:59:02 Functions: 11 12 91.7 %

          Line data    Source code
       1             : /*====================================================================================
       2             :     EVS Codec 3GPP TS26.452 Aug 12, 2021. Version 16.3.0
       3             :   ====================================================================================*/
       4             : 
       5             : #include <assert.h>
       6             : #include "prot_fx.h"
       7             : #include "basop_util.h"
       8             : #include "rom_basop_util.h"
       9             : #include "rom_com.h"
      10             : #include "options.h"
      11             : #include "stl.h"
      12             : /************************************************************************/
      13             : /* FFT                                                                  */
      14             : /************************************************************************/
      15             : #define SCALEFACTOR16 ( 5 )
      16             : #define SCALEFACTOR20 ( 5 )
      17             : 
      18             : 
      19             : void fft16_with_cmplx_data( cmplx *pInp, Word16 bsacle );
      20             : 
      21             : /**
      22             :  * \brief Profiling / Precision results
      23             :  *
      24             :  *        Profiling / Precision of complex valued FFTs: BASOP_cfft()
      25             :  *
      26             :  *                       WOPS BASOP  Precision BASOP
      27             :  *        FFT5                   87     16.96
      28             :  *        FFT8                  108     17.04
      29             :  *        FFT10                 194     16.70
      30             :  *        FFT15                 354     16.97
      31             :  *        FFT16                 288     16.62
      32             :  *        FFT20                 368     16.06
      33             :  *        FFT30                 828     16.80
      34             :  *        FFT32                 752     15.45   (cplx mult with 3 mult and 3 add)
      35             :  *        FFT32                 824     16.07   (cplx mult with 4 mult and 2 add)
      36             :  *        FFT64  ( 8x 8)      3.129     15.16
      37             :  *        FFT80  (10x 8)      4.385     15.55
      38             :  *        FFT100 (20x 5)      6.518     15.65
      39             :  *        FFT120 (15x 8)      7.029     15.38
      40             :  *        FFT128 (16x 8)      6.777     15.28
      41             :  *        FFT160 (20x 8)      9.033     14.95
      42             :  *        FFT240 (30x 8)     14.961     15.49
      43             :  *        FFT256 (32x 8)     14.905     14.61   (cplx mult with 3 mult and 3 add)
      44             :  *        FFT256 (32x 8)     15.265     15.04   (cplx mult with 4 mult and 2 add)
      45             :  *        FFT320 (20x16)     21.517     15.21
      46             :  *
      47             :  *
      48             :  *        Profiling / Precision of real valued FFTs / iFFTs: BASOP_rfft()
      49             :  *
      50             :  *                       WOPS BASOP  Precision BASOP
      51             :  *        rFFT40                955     15.68
      52             :  *        rFFT64               1635     16.17
      53             :  *
      54             :  *        irFFT40              1116     15.36
      55             :  *        irFFT64              1759     15.18
      56             :  *
      57             :  */
      58             : 
      59             : 
      60             : #define Mpy_32_xx Mpy_32_16_1
      61             : 
      62             : #define FFTC( x ) WORD322WORD16( (Word32) x )
      63             : 
      64             : #define C31 ( FFTC( 0x91261468 ) ) /* FL2WORD32( -0.86602540) -sqrt(3)/2 */
      65             : 
      66             : #define C51 ( FFTC( 0x79bc3854 ) ) /* FL2WORD32( 0.95105652)   */
      67             : #define C52 ( FFTC( 0x9d839db0 ) ) /* FL2WORD32(-1.53884180/2) */
      68             : #define C53 ( FFTC( 0xd18053ce ) ) /* FL2WORD32(-0.36327126)   */
      69             : #define C54 ( FFTC( 0x478dde64 ) ) /* FL2WORD32( 0.55901699)   */
      70             : #define C55 ( FFTC( 0xb0000001 ) ) /* FL2WORD32(-1.25/2)       */
      71             : 
      72             : #define C81 ( FFTC( 0x5a82799a ) ) /* FL2WORD32( 7.071067811865475e-1) */
      73             : #define C82 ( FFTC( 0xa57d8666 ) ) /* FL2WORD32(-7.071067811865475e-1) */
      74             : 
      75             : #define C161 ( FFTC( 0x5a82799a ) ) /* FL2WORD32( 7.071067811865475e-1)  INV_SQRT2    */
      76             : #define C162 ( FFTC( 0xa57d8666 ) ) /* FL2WORD32(-7.071067811865475e-1) -INV_SQRT2    */
      77             : 
      78             : #define C163 ( FFTC( 0x7641af3d ) ) /* FL2WORD32( 9.238795325112867e-1)  COS_PI_DIV8  */
      79             : #define C164 ( FFTC( 0x89be50c3 ) ) /* FL2WORD32(-9.238795325112867e-1) -COS_PI_DIV8  */
      80             : 
      81             : #define C165 ( FFTC( 0x30fbc54d ) ) /* FL2WORD32( 3.826834323650898e-1)  COS_3PI_DIV8 */
      82             : #define C166 ( FFTC( 0xcf043ab3 ) ) /* FL2WORD32(-3.826834323650898e-1) -COS_3PI_DIV8 */
      83             : 
      84             : 
      85             : #define cplxMpy4_8_0( re, im, a, b, c, d )                          \
      86             :     re = L_shr( L_sub( Mpy_32_xx( a, c ), Mpy_32_xx( b, d ) ), 1 ); \
      87             :     im = L_shr( L_add( Mpy_32_xx( a, d ), Mpy_32_xx( b, c ) ), 1 );
      88             : 
      89             : #define cplxMpy4_8_1( re, im, a, b ) \
      90             :     re = L_shr( a, 1 );              \
      91             :     im = L_shr( b, 1 );
      92             : 
      93             : 
      94             : /**
      95             :  * \brief    Performs a complex 5-point FFT on five consecutive cmplx values.
      96             :  *           The FFT is performed in place. Each input is first passed through
      97             :  *           CL_shr with SCALEFACTOR5 before the butterflies are computed.
      98             :  *
      99             :  *           WOPS with 32x16 bit multiplications:  88 cycles
     100             :  *
     101             :  * \param    [i/o] inp   complex input / output; elements inp[0]..inp[4] are
     102             :  *                       read and overwritten (input format Qx)
     103             :  *           There are no separate re / im / stride arguments.
     104             :  *
     105             :  * \return   void
     106             :  */
     107       56944 : static void fft5_with_cmplx_data( cmplx *inp /*Qx*/ )
     108             : {
     109             :     cmplx x0, x1, x2, x3, x4;
     110             :     cmplx y1, y2, y3, y4;
     111             :     cmplx t;
     112             : 
     113       56944 :     x0 = CL_shr( inp[0], SCALEFACTOR5 ); // Qx - 4
     114       56944 :     x1 = CL_shr( inp[1], SCALEFACTOR5 ); // Qx - 4
     115       56944 :     x2 = CL_shr( inp[2], SCALEFACTOR5 ); // Qx - 4
     116       56944 :     x3 = CL_shr( inp[3], SCALEFACTOR5 ); // Qx - 4
     117       56944 :     x4 = CL_shr( inp[4], SCALEFACTOR5 ); // Qx - 4
     118             : 
     119       56944 :     y1 = CL_add( x1, x4 );
     120       56944 :     y4 = CL_sub( x1, x4 );
     121       56944 :     y3 = CL_add( x2, x3 );
     122       56944 :     y2 = CL_sub( x2, x3 );
     123       56944 :     t = CL_scale_t( CL_sub( y1, y3 ), C54 );
     124       56944 :     y1 = CL_add( y1, y3 );
     125       56944 :     inp[0] = CL_add( x0, y1 );
     126             : 
     127             :     /* Bit shift left because of the constant C55 which was scaled with the factor 0.5 because of the representation of
     128             :     the values as fracts */
     129       56944 :     y1 = CL_add( inp[0], ( CL_shl( CL_scale_t( y1, C55 ), 1 ) ) );
     130       56944 :     y3 = CL_sub( y1, t );
     131       56944 :     y1 = CL_add( y1, t );
     132             : 
     133       56944 :     t = CL_scale_t( CL_add( y4, y2 ), C51 );
     134             :     /* Bit shift left because of the constant C52 which was scaled with the factor 0.5 because of the representation of
     135             :     the values as fracts */
     136       56944 :     y4 = CL_add( t, CL_shl( CL_scale_t( y4, C52 ), 1 ) );
     137       56944 :     y2 = CL_add( t, CL_scale_t( y2, C53 ) );
     138             : 
     139             : 
     140             :     /* combination: write the four remaining outputs inp[1]..inp[4] */
     141       56944 :     inp[1] = CL_msu_j( y1, y2 );
     142       56944 :     inp[4] = CL_mac_j( y1, y2 );
     143             : 
     144       56944 :     inp[2] = CL_mac_j( y3, y4 );
     145       56944 :     inp[3] = CL_msu_j( y3, y4 );
     146             : 
     147             : #ifdef WMOPS
     148             :     multiCounter[currCounter].CL_move += 5;
     149             : #endif
     150       56944 : }
     151             : 
     152             : /**
     153             :  * \brief    Performs a complex 8-point FFT on eight consecutive cmplx values.
     154             :  *           The FFT is performed in place. Each input is first passed through
     155             :  *           CL_shr with SCALEFACTOR8 before the butterflies are computed.
     156             :  *
     157             :  *           WOPS with 32x16 bit multiplications: 108 cycles
     158             :  *
     159             :  * \param    [i/o] inp   complex input / output; elements inp[0]..inp[7] are
     160             :  *                       read and overwritten (input format Qx)
     161             :  *           There are no separate re / im / stride arguments.
     162             :  *
     163             :  * \return   void
     164             :  */
     165      172456 : static void fft8_with_cmplx_data( cmplx *inp /*Qx*/ )
     166             : {
     167             :     cmplx x0, x1, x2, x3, x4, x5, x6, x7;
     168             :     cmplx s0, s1, s2, s3, s4, s5, s6, s7;
     169             :     cmplx t0, t1, t2, t3, t4, t5, t6, t7;
     170             : 
     171             :     /* Input pre-scaling: every element goes through CL_shr with SCALEFACTOR8 */
     172      172456 :     x0 = CL_shr( inp[0], SCALEFACTOR8 ); // Qx - 4
     173      172456 :     x1 = CL_shr( inp[1], SCALEFACTOR8 );
     174      172456 :     x2 = CL_shr( inp[2], SCALEFACTOR8 );
     175      172456 :     x3 = CL_shr( inp[3], SCALEFACTOR8 );
     176      172456 :     x4 = CL_shr( inp[4], SCALEFACTOR8 );
     177      172456 :     x5 = CL_shr( inp[5], SCALEFACTOR8 );
     178      172456 :     x6 = CL_shr( inp[6], SCALEFACTOR8 );
     179      172456 :     x7 = CL_shr( inp[7], SCALEFACTOR8 );
     180             : 
     181             :     /* Pre-additions on the pairs (x0,x4), (x1,x5), (x2,x6), (x3,x7); loops are unrolled */
     182             :     {
     183      172456 :         t0 = CL_add( x0, x4 );
     184      172456 :         t1 = CL_sub( x0, x4 );
     185             : 
     186      172456 :         t2 = CL_add( x1, x5 );
     187      172456 :         t3 = CL_sub( x1, x5 );
     188             : 
     189      172456 :         t4 = CL_add( x2, x6 );
     190      172456 :         t5 = CL_sub( x2, x6 );
     191             : 
     192      172456 :         t6 = CL_add( x3, x7 );
     193      172456 :         t7 = CL_sub( x3, x7 );
     194             :     }
     195             : 
     196             :     /* Pre-additions and core multiplications (t0 and t1 are reused as temporaries below) */
     197             : 
     198      172456 :     s0 = CL_add( t0, t4 );
     199      172456 :     s2 = CL_sub( t0, t4 );
     200             : 
     201      172456 :     s4 = CL_mac_j( t1, t5 );
     202      172456 :     s5 = CL_msu_j( t1, t5 );
     203             : 
     204      172456 :     s1 = CL_add( t2, t6 );
     205      172456 :     s3 = CL_sub( t2, t6 );
     206      172456 :     s3 = CL_mul_j( s3 );
     207             : 
     208      172456 :     t0 = CL_add( t3, t7 );
     209      172456 :     t1 = CL_sub( t3, t7 );
     210             : 
     211      172456 :     s6 = CL_scale_t( CL_msu_j( t1, t0 ), C81 );
     212      172456 :     s7 = CL_dscale_t( CL_swap_real_imag( CL_msu_j( t0, t1 ) ), C81, C82 );
     213             : 
     214             :     /* Post-additions: write all eight outputs back to inp[] */
     215             : 
     216      172456 :     inp[0] = CL_add( s0, s1 );
     217      172456 :     inp[4] = CL_sub( s0, s1 );
     218             : 
     219      172456 :     inp[2] = CL_sub( s2, s3 );
     220      172456 :     inp[6] = CL_add( s2, s3 );
     221             : 
     222      172456 :     inp[3] = CL_add( s4, s7 );
     223      172456 :     inp[7] = CL_sub( s4, s7 );
     224             : 
     225      172456 :     inp[1] = CL_add( s5, s6 );
     226      172456 :     inp[5] = CL_sub( s5, s6 );
     227             : #ifdef WMOPS
     228             :     multiCounter[currCounter].CL_move += 8;
     229             : #endif
     230      172456 : }
     231             : 
     232             : 
     233             : /**
     234             :  * \brief    Performs a complex 10-point FFT on ten consecutive cmplx values.
     235             :  *           The FFT is performed in place. Each input is first passed through
     236             :  *           CL_shr with SCALEFACTOR10; results are staged in a local y[10].
     237             :  *
     238             :  *           WOPS with 32x16 bit multiplications:  196 cycles
     239             :  *
     240             :  * \param    [i/o] inp_data  complex input / output; elements inp_data[0]..inp_data[9]
     241             :  *                           are read and overwritten (input format Qx)
     242             :  *           There are no separate re / im / stride arguments.
     243             :  *
     244             :  * \return   void
     245             :  */
     246             : 
     247      537464 : static void fft10_with_cmplx_data( cmplx *inp_data /*Qx*/ )
     248             : {
     249             :     cmplx r1, r2, r3, r4;
     250             :     cmplx x0, x1, x2, x3, x4, t;
     251             :     cmplx y[10];
     252             : 
     253             :     /* FOR i=0: inputs 0, 2, 4, 6, 8 -> y[0], y[2], y[4], y[6], y[8] */
     254             :     {
     255      537464 :         x0 = CL_shr( inp_data[0], SCALEFACTOR10 ); // Qx - 5
     256      537464 :         x1 = CL_shr( inp_data[2], SCALEFACTOR10 );
     257      537464 :         x2 = CL_shr( inp_data[4], SCALEFACTOR10 );
     258      537464 :         x3 = CL_shr( inp_data[6], SCALEFACTOR10 );
     259      537464 :         x4 = CL_shr( inp_data[8], SCALEFACTOR10 );
     260             : 
     261      537464 :         r1 = CL_add( x3, x2 );
     262      537464 :         r4 = CL_sub( x3, x2 );
     263      537464 :         r3 = CL_add( x1, x4 );
     264      537464 :         r2 = CL_sub( x1, x4 );
     265      537464 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     266      537464 :         r1 = CL_add( r1, r3 );
     267      537464 :         y[0] = CL_add( x0, r1 );
     268      537464 :         r1 = CL_add( y[0], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     269      537464 :         r3 = CL_sub( r1, t );
     270      537464 :         r1 = CL_add( r1, t );
     271      537464 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     272      537464 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     273      537464 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     274             : 
     275             : 
     276      537464 :         y[2] = CL_msu_j( r1, r2 );
     277      537464 :         y[8] = CL_mac_j( r1, r2 );
     278      537464 :         y[4] = CL_mac_j( r3, r4 );
     279      537464 :         y[6] = CL_msu_j( r3, r4 );
     280             :     }
     281             :     /* FOR i=1: inputs 5, 1, 3, 7, 9 -> y[1], y[3], y[5], y[7], y[9] */
     282             :     {
     283      537464 :         x0 = CL_shr( inp_data[5], SCALEFACTOR10 ); // Qx - 5
     284      537464 :         x1 = CL_shr( inp_data[1], SCALEFACTOR10 );
     285      537464 :         x2 = CL_shr( inp_data[3], SCALEFACTOR10 );
     286      537464 :         x3 = CL_shr( inp_data[7], SCALEFACTOR10 );
     287      537464 :         x4 = CL_shr( inp_data[9], SCALEFACTOR10 );
     288             : 
     289      537464 :         r1 = CL_add( x1, x4 );
     290      537464 :         r4 = CL_sub( x1, x4 );
     291      537464 :         r3 = CL_add( x3, x2 );
     292      537464 :         r2 = CL_sub( x3, x2 );
     293      537464 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     294      537464 :         r1 = CL_add( r1, r3 );
     295      537464 :         y[1] = CL_add( x0, r1 );
     296      537464 :         r1 = CL_add( y[1], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     297      537464 :         r3 = CL_sub( r1, t );
     298      537464 :         r1 = CL_add( r1, t );
     299      537464 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     300      537464 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     301      537464 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     302             : 
     303             : 
     304      537464 :         y[3] = CL_msu_j( r1, r2 );
     305      537464 :         y[9] = CL_mac_j( r1, r2 );
     306      537464 :         y[5] = CL_mac_j( r3, r4 );
     307      537464 :         y[7] = CL_msu_j( r3, r4 );
     308             :     }
     309             : 
     310             :     /* FOR i=0: CL_add / CL_sub of y[0], y[1] -> outputs 0 and 5 */
     311             :     {
     312      537464 :         inp_data[0] = CL_add( y[0], y[1] );
     313      537464 :         inp_data[5] = CL_sub( y[0], y[1] );
     314             :     }
     315             :     /* FOR i=2: CL_add / CL_sub of y[2], y[3] -> outputs 2 and 7 */
     316             :     {
     317      537464 :         inp_data[2] = CL_add( y[2], y[3] );
     318      537464 :         inp_data[7] = CL_sub( y[2], y[3] );
     319             :     }
     320             :     /* FOR i=4: CL_add / CL_sub of y[4], y[5] -> outputs 4 and 9 */
     321             :     {
     322      537464 :         inp_data[4] = CL_add( y[4], y[5] );
     323      537464 :         inp_data[9] = CL_sub( y[4], y[5] );
     324             :     }
     325             :     /* FOR i=6: CL_add / CL_sub of y[6], y[7] -> outputs 6 and 1 */
     326             :     {
     327      537464 :         inp_data[6] = CL_add( y[6], y[7] );
     328      537464 :         inp_data[1] = CL_sub( y[6], y[7] );
     329             :     }
     330             :     /* FOR i=8: CL_add / CL_sub of y[8], y[9] -> outputs 8 and 3 */
     331             :     {
     332      537464 :         inp_data[8] = CL_add( y[8], y[9] );
     333      537464 :         inp_data[3] = CL_sub( y[8], y[9] );
     334             :     }
     335             : 
     336             : #ifdef WMOPS
     337             :     multiCounter[currCounter].CL_move += 10;
     338             : #endif
     339      537464 : }
     340             : 
     341             : 
     342             : /**
     343             :  * \brief    Performs a complex 15-point FFT on fifteen consecutive cmplx values.
     344             :  *           The FFT is performed in place. Each input is first passed through
     345             :  *           CL_shr with SCALEFACTOR15; three FFT5 stages feed five FFT3 stages.
     346             :  *
     347             :  *           WOPS with 32x16 bit multiplications:  354 cycles
     348             :  *
     349             :  * \param    [i/o] inp_data  complex input / output; elements inp_data[0]..inp_data[14]
     350             :  *                           are read and overwritten (input format Qx)
     351             :  *           There are no separate re / im / stride arguments.
     352             :  *
     353             :  * \return   void
     354             :  */
     355             : 
     356      171984 : static void fft15_with_cmplx_data( cmplx *inp_data /*Qx*/ )
     357             : {
     358             :     cmplx c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14;
     359             :     cmplx c_z0, c_z1, c_z2, c_z3, c_z4, c_z5, c_z6, c_z7, c_z8, c_z9, c_z10, c_z11, c_z12, c_z13, c_z14;
     360             :     cmplx c_y1, c_y2, c_y3, c_y4;
     361             :     cmplx c_t;
     362             :     /* Load and pre-scale the inputs in permuted order: 0,3,6,9,12 / 5,8,11,14,2 / 10,13,1,4,7 */
     363      171984 :     c0 = CL_shr( inp_data[0], SCALEFACTOR15 ); // Qx - 5
     364      171984 :     c1 = CL_shr( inp_data[3], SCALEFACTOR15 );
     365      171984 :     c2 = CL_shr( inp_data[6], SCALEFACTOR15 );
     366      171984 :     c3 = CL_shr( inp_data[9], SCALEFACTOR15 );
     367      171984 :     c4 = CL_shr( inp_data[12], SCALEFACTOR15 );
     368      171984 :     c5 = CL_shr( inp_data[5], SCALEFACTOR15 );
     369      171984 :     c6 = CL_shr( inp_data[8], SCALEFACTOR15 );
     370      171984 :     c7 = CL_shr( inp_data[11], SCALEFACTOR15 );
     371      171984 :     c8 = CL_shr( inp_data[14], SCALEFACTOR15 );
     372      171984 :     c9 = CL_shr( inp_data[2], SCALEFACTOR15 );
     373      171984 :     c10 = CL_shr( inp_data[10], SCALEFACTOR15 );
     374      171984 :     c11 = CL_shr( inp_data[13], SCALEFACTOR15 );
     375      171984 :     c12 = CL_shr( inp_data[1], SCALEFACTOR15 );
     376      171984 :     c13 = CL_shr( inp_data[4], SCALEFACTOR15 );
     377      171984 :     c14 = CL_shr( inp_data[7], SCALEFACTOR15 );
     378             : 
     379             :     /* 1. FFT5 stage: c0..c4 -> c_z0..c_z4 */
     380      171984 :     c_y1 = CL_add( c1, c4 );
     381      171984 :     c_y4 = CL_sub( c1, c4 );
     382      171984 :     c_y3 = CL_add( c2, c3 );
     383      171984 :     c_y2 = CL_sub( c2, c3 );
     384      171984 :     c_t = CL_scale_t( CL_sub( c_y1, c_y3 ), C54 );
     385      171984 :     c_y1 = CL_add( c_y1, c_y3 );
     386      171984 :     c_z0 = CL_add( c0, c_y1 );
     387      171984 :     c_y1 = CL_add( c_z0, ( CL_shl( CL_scale_t( c_y1, C55 ), 1 ) ) );
     388      171984 :     c_y3 = CL_sub( c_y1, c_t );
     389      171984 :     c_y1 = CL_add( c_y1, c_t );
     390      171984 :     c_t = CL_scale_t( CL_add( c_y4, c_y2 ), C51 );
     391      171984 :     c_y4 = CL_add( c_t, CL_shl( CL_scale_t( c_y4, C52 ), 1 ) );
     392      171984 :     c_y2 = CL_add( c_t, CL_scale_t( c_y2, C53 ) );
     393             : 
     394             :     /* combination */
     395      171984 :     c_z1 = CL_msu_j( c_y1, c_y2 );
     396      171984 :     c_z2 = CL_mac_j( c_y3, c_y4 );
     397      171984 :     c_z3 = CL_msu_j( c_y3, c_y4 );
     398      171984 :     c_z4 = CL_mac_j( c_y1, c_y2 );
     399             : 
     400             : 
     401             :     /* 2. FFT5 stage: c5..c9 -> c_z5..c_z9 */
     402      171984 :     c_y1 = CL_add( c6, c9 );
     403      171984 :     c_y4 = CL_sub( c6, c9 );
     404      171984 :     c_y3 = CL_add( c7, c8 );
     405      171984 :     c_y2 = CL_sub( c7, c8 );
     406      171984 :     c_t = CL_scale_t( CL_sub( c_y1, c_y3 ), C54 );
     407      171984 :     c_y1 = CL_add( c_y1, c_y3 );
     408      171984 :     c_z5 = CL_add( c5, c_y1 );
     409      171984 :     c_y1 = CL_add( c_z5, ( CL_shl( CL_scale_t( c_y1, C55 ), 1 ) ) );
     410      171984 :     c_y3 = CL_sub( c_y1, c_t );
     411      171984 :     c_y1 = CL_add( c_y1, c_t );
     412      171984 :     c_t = CL_scale_t( CL_add( c_y4, c_y2 ), C51 );
     413      171984 :     c_y4 = CL_add( c_t, CL_shl( CL_scale_t( c_y4, C52 ), 1 ) );
     414      171984 :     c_y2 = CL_add( c_t, CL_scale_t( c_y2, C53 ) );
     415             :     /* combination */
     416      171984 :     c_z6 = CL_msu_j( c_y1, c_y2 );
     417      171984 :     c_z7 = CL_mac_j( c_y3, c_y4 );
     418      171984 :     c_z8 = CL_msu_j( c_y3, c_y4 );
     419      171984 :     c_z9 = CL_mac_j( c_y1, c_y2 );
     420             : 
     421             : 
     422             :     /* 3. FFT5 stage: c10..c14 -> c_z10..c_z14 */
     423             : 
     424      171984 :     c_y1 = CL_add( c11, c14 );
     425      171984 :     c_y4 = CL_sub( c11, c14 );
     426      171984 :     c_y3 = CL_add( c12, c13 );
     427      171984 :     c_y2 = CL_sub( c12, c13 );
     428      171984 :     c_t = CL_scale_t( CL_sub( c_y1, c_y3 ), C54 );
     429      171984 :     c_y1 = CL_add( c_y1, c_y3 );
     430      171984 :     c_z10 = CL_add( c10, c_y1 );
     431      171984 :     c_y1 = CL_add( c_z10, ( CL_shl( CL_scale_t( c_y1, C55 ), 1 ) ) );
     432      171984 :     c_y3 = CL_sub( c_y1, c_t );
     433      171984 :     c_y1 = CL_add( c_y1, c_t );
     434      171984 :     c_t = CL_scale_t( CL_add( c_y4, c_y2 ), C51 );
     435      171984 :     c_y4 = CL_add( c_t, CL_shl( CL_scale_t( c_y4, C52 ), 1 ) );
     436      171984 :     c_y2 = CL_add( c_t, CL_scale_t( c_y2, C53 ) );
     437             :     /* combination */
     438      171984 :     c_z11 = CL_msu_j( c_y1, c_y2 );
     439      171984 :     c_z12 = CL_mac_j( c_y3, c_y4 );
     440      171984 :     c_z13 = CL_msu_j( c_y3, c_y4 );
     441      171984 :     c_z14 = CL_mac_j( c_y1, c_y2 );
     442             : 
     443             : 
     444             :     /* 1. FFT3 stage: c_z0, c_z5, c_z10 -> outputs 0, 10, 5 */
     445      171984 :     c_y1 = CL_add( c_z5, c_z10 );
     446      171984 :     c_y2 = CL_scale_t( CL_sub( c_z5, c_z10 ), C31 );
     447      171984 :     inp_data[0] = CL_add( c_z0, c_y1 );
     448      171984 :     c_y1 = CL_sub( c_z0, CL_shr( c_y1, 1 ) );
     449      171984 :     inp_data[10] = CL_mac_j( c_y1, c_y2 );
     450      171984 :     inp_data[5] = CL_msu_j( c_y1, c_y2 );
     451             : 
     452             :     /* 2. FFT3 stage: c_z1, c_z6, c_z11 -> outputs 6, 1, 11 */
     453      171984 :     c_y1 = CL_add( c_z6, c_z11 );
     454      171984 :     c_y2 = CL_scale_t( CL_sub( c_z6, c_z11 ), C31 );
     455      171984 :     inp_data[6] = CL_add( c_z1, c_y1 );
     456      171984 :     c_y1 = CL_sub( c_z1, CL_shr( c_y1, 1 ) );
     457      171984 :     inp_data[1] = CL_mac_j( c_y1, c_y2 );
     458      171984 :     inp_data[11] = CL_msu_j( c_y1, c_y2 );
     459             : 
     460             :     /* 3. FFT3 stage: c_z2, c_z7, c_z12 -> outputs 12, 7, 2 */
     461      171984 :     c_y1 = CL_add( c_z7, c_z12 );
     462      171984 :     c_y2 = CL_scale_t( CL_sub( c_z7, c_z12 ), C31 );
     463      171984 :     inp_data[12] = CL_add( c_z2, c_y1 );
     464      171984 :     c_y1 = CL_sub( c_z2, CL_shr( c_y1, 1 ) );
     465      171984 :     inp_data[7] = CL_mac_j( c_y1, c_y2 );
     466      171984 :     inp_data[2] = CL_msu_j( c_y1, c_y2 );
     467             : 
     468             : 
     469             :     /* 4. FFT3 stage: c_z3, c_z8, c_z13 -> outputs 3, 13, 8 */
     470      171984 :     c_y1 = CL_add( c_z8, c_z13 );
     471      171984 :     c_y2 = CL_scale_t( CL_sub( c_z8, c_z13 ), C31 );
     472      171984 :     inp_data[3] = CL_add( c_z3, c_y1 );
     473      171984 :     c_y1 = CL_sub( c_z3, CL_shr( c_y1, 1 ) );
     474      171984 :     inp_data[13] = CL_mac_j( c_y1, c_y2 );
     475      171984 :     inp_data[8] = CL_msu_j( c_y1, c_y2 );
     476             : 
     477             : 
     478             :     /* 5. FFT3 stage: c_z4, c_z9, c_z14 -> outputs 9, 4, 14 */
     479      171984 :     c_y1 = CL_add( c_z9, c_z14 );
     480      171984 :     c_y2 = CL_scale_t( CL_sub( c_z9, c_z14 ), C31 );
     481      171984 :     inp_data[9] = CL_add( c_z4, c_y1 );
     482      171984 :     c_y1 = CL_sub( c_z4, CL_shr( c_y1, 1 ) );
     483      171984 :     inp_data[4] = CL_mac_j( c_y1, c_y2 );
     484      171984 :     inp_data[14] = CL_msu_j( c_y1, c_y2 );
     485             : 
     486             : #ifdef WMOPS
     487             :     multiCounter[currCounter].CL_move += 15;
     488             : #endif
     489      171984 : }
     490             : 
     491             : 
     492             : /**
     493             :  * \brief    Performs a complex 16-point FFT on strided re / im arrays.
     494             :  *           The FFT is performed in place. With bScale set, the inputs are
     495             :  *           pre-shifted by SCALEFACTOR16 bits (see fft16_with_cmplx_data).
     496             :  *           NOTE(review): for s == 2 only re is used, cast to cmplx* - presumably im == re + 1; confirm.
     497             :  *           WOPS with 32x16 bit multiplications (scale on ):  288 cycles
     498             :  *           WOPS with 32x16 bit multiplications (scale off):  256 cycles
     499             :  *
     500             :  * \param    [i/o] re    real input / output Qx
     501             :  * \param    [i/o] im    imag input / output Qx
     502             :  * \param    [i  ] s     stride real and imag input / output
     503             :  * \param    [i  ] bScale  scaling flag forwarded to fft16_with_cmplx_data
     504             :  * \return   void
     505             :  */
     506           0 : void fft16( Word32 *re, Word32 *im, Word16 s, Word16 bScale )
     507             : {
     508             :     Word16 i;
     509           0 :     if ( s == 2 )
     510             :     {
     511           0 :         fft16_with_cmplx_data( (cmplx *) re, bScale );
     512             :     }
     513             :     else
     514             :     {
     515             :         cmplx inp_data[16]; // contiguous scratch copy of the 16 strided points
     516           0 :         FOR( i = 0; i < 16; i++ )
     517             :         {
     518           0 :             inp_data[i] = CL_form( re[s * i], im[s * i] );
     519           0 :             move64();
     520             :         }
     521           0 :         fft16_with_cmplx_data( inp_data, bScale );
     522           0 :         FOR( i = 0; i < 16; i++ )
     523             :         {
     524           0 :             re[s * i] = CL_Extract_real( inp_data[i] );
     525           0 :             move32();
     526           0 :             im[s * i] = CL_Extract_imag( inp_data[i] );
     527           0 :             move32();
     528             :         }
     529             :     }
     530           0 : }
     531             : 
     532    28645768 : void fft16_with_cmplx_data( cmplx *input /*Qx*/, Word16 bScale )
     533             : {
     534             :     cmplx x0, x1, x2, x3, temp;
     535             :     cmplx t0, t2, t4, t6, t7;
     536             :     cmplx y[16];
     537             : 
     538    28645768 :     IF( bScale )
     539             :     {
     540             :         {
     541      393504 :             x0 = CL_shr( input[0], SCALEFACTOR16 ); // Qx - 5
     542      393504 :             x1 = CL_shr( input[4], SCALEFACTOR16 );
     543      393504 :             x2 = CL_shr( input[8], SCALEFACTOR16 );
     544      393504 :             x3 = CL_shr( input[12], SCALEFACTOR16 );
     545      393504 :             t0 = CL_add( x0, x2 );
     546      393504 :             t2 = CL_sub( x0, x2 );
     547      393504 :             t4 = CL_add( x1, x3 );
     548      393504 :             t6 = CL_sub( x1, x3 );
     549      393504 :             t6 = CL_mul_j( t6 );
     550      393504 :             y[0] = CL_add( t0, t4 );
     551      393504 :             y[1] = CL_sub( t2, t6 );
     552      393504 :             y[2] = CL_sub( t0, t4 );
     553      393504 :             y[3] = CL_add( t2, t6 );
     554             : 
     555             : 
     556      393504 :             x0 = CL_shr( input[1], SCALEFACTOR16 ); // Qx - 5
     557      393504 :             x1 = CL_shr( input[5], SCALEFACTOR16 );
     558      393504 :             x2 = CL_shr( input[9], SCALEFACTOR16 );
     559      393504 :             x3 = CL_shr( input[13], SCALEFACTOR16 );
     560      393504 :             t0 = CL_add( x0, x2 );
     561      393504 :             t2 = CL_sub( x0, x2 );
     562      393504 :             t4 = CL_add( x1, x3 );
     563      393504 :             t6 = CL_sub( x1, x3 );
     564      393504 :             t6 = CL_mul_j( t6 );
     565      393504 :             y[4] = CL_add( t0, t4 );
     566      393504 :             y[5] = CL_sub( t2, t6 );
     567      393504 :             y[6] = CL_sub( t0, t4 );
     568      393504 :             y[7] = CL_add( t2, t6 );
     569             : 
     570             : 
     571      393504 :             x0 = CL_shr( input[2], SCALEFACTOR16 ); // Qx - 5
     572      393504 :             x1 = CL_shr( input[6], SCALEFACTOR16 );
     573      393504 :             x2 = CL_shr( input[10], SCALEFACTOR16 );
     574      393504 :             x3 = CL_shr( input[14], SCALEFACTOR16 );
     575      393504 :             t0 = CL_add( x0, x2 );
     576      393504 :             t2 = CL_sub( x0, x2 );
     577      393504 :             t4 = CL_add( x1, x3 );
     578      393504 :             t6 = CL_sub( x1, x3 );
     579      393504 :             t6 = CL_mul_j( t6 );
     580      393504 :             y[8] = CL_add( t0, t4 );
     581      393504 :             y[9] = CL_sub( t2, t6 );
     582      393504 :             y[10] = CL_sub( t4, t0 );
     583      393504 :             y[10] = CL_mul_j( y[10] );
     584      393504 :             y[11] = CL_add( t2, t6 );
     585             : 
     586             : 
     587      393504 :             x0 = CL_shr( input[3], SCALEFACTOR16 ); // Qx - 5
     588      393504 :             x1 = CL_shr( input[7], SCALEFACTOR16 );
     589      393504 :             x2 = CL_shr( input[11], SCALEFACTOR16 );
     590      393504 :             x3 = CL_shr( input[15], SCALEFACTOR16 );
     591      393504 :             t0 = CL_add( x0, x2 );
     592      393504 :             t2 = CL_sub( x0, x2 );
     593      393504 :             t4 = CL_add( x1, x3 );
     594      393504 :             t6 = CL_sub( x1, x3 );
     595      393504 :             t6 = CL_mul_j( t6 );
     596      393504 :             y[12] = CL_add( t0, t4 );
     597      393504 :             y[13] = CL_sub( t2, t6 );
     598      393504 :             y[14] = CL_sub( t0, t4 );
     599      393504 :             y[15] = CL_add( t2, t6 );
     600             :         }
     601             :     }
     602             :     else
     603             :     {
     604             :         {
     605    28252264 :             t0 = CL_add( input[0], input[8] );
     606    28252264 :             t2 = CL_sub( input[0], input[8] );
     607    28252264 :             t4 = CL_add( input[4], input[12] );
     608    28252264 :             t7 = CL_sub( input[4], input[12] );
     609             : 
     610    28252264 :             y[0] = CL_add( t0, t4 );
     611    28252264 :             y[1] = CL_msu_j( t2, t7 );
     612    28252264 :             y[2] = CL_sub( t0, t4 );
     613    28252264 :             y[3] = CL_mac_j( t2, t7 );
     614             :         }
     615             :         /* i=1 */
     616             :         {
     617    28252264 :             t0 = CL_add( input[1], input[9] );
     618    28252264 :             t2 = CL_sub( input[1], input[9] );
     619    28252264 :             t4 = CL_add( input[5], input[13] );
     620    28252264 :             t7 = CL_sub( input[5], input[13] );
     621             : 
     622    28252264 :             y[4] = CL_add( t0, t4 );
     623    28252264 :             y[5] = CL_msu_j( t2, t7 );
     624    28252264 :             y[6] = CL_sub( t0, t4 );
     625    28252264 :             y[7] = CL_mac_j( t2, t7 );
     626             :         }
     627             :         /* i=2 */
     628             :         {
     629    28252264 :             t0 = CL_add( input[2], input[10] );
     630    28252264 :             t2 = CL_sub( input[2], input[10] );
     631    28252264 :             t4 = CL_add( input[6], input[14] );
     632    28252264 :             t7 = CL_sub( input[6], input[14] );
     633             : 
     634    28252264 :             y[8] = CL_add( t0, t4 );
     635    28252264 :             y[9] = CL_msu_j( t2, t7 );
     636    28252264 :             temp = CL_sub( t0, t4 );
     637    28252264 :             y[10] = CL_negate( CL_mul_j( temp ) );
     638    28252264 :             y[11] = CL_mac_j( t2, t7 );
     639             :         }
     640             :         /* i=3 */
     641             :         {
     642    28252264 :             t0 = CL_add( input[3], input[11] );
     643    28252264 :             t2 = CL_sub( input[3], input[11] );
     644    28252264 :             t4 = CL_add( input[7], input[15] );
     645    28252264 :             t7 = CL_sub( input[7], input[15] );
     646             : 
     647    28252264 :             y[12] = CL_add( t0, t4 );
     648    28252264 :             y[13] = CL_msu_j( t2, t7 );
     649    28252264 :             y[14] = CL_sub( t0, t4 );
     650    28252264 :             y[15] = CL_mac_j( t2, t7 );
     651             :         }
     652             :     }
     653             : 
     654    28645768 :     x0 = CL_scale_t( y[11], C162 );
     655    28645768 :     y[11] = CL_mac_j( x0, x0 );
     656             : 
     657    28645768 :     x0 = CL_scale_t( y[14], C162 );
     658    28645768 :     y[14] = CL_mac_j( x0, x0 );
     659             : 
     660    28645768 :     x0 = CL_scale_t( y[6], C161 );
     661    28645768 :     y[6] = CL_msu_j( x0, x0 );
     662             : 
     663    28645768 :     x0 = CL_scale_t( y[9], C161 );
     664    28645768 :     y[9] = CL_msu_j( x0, x0 );
     665             : 
     666    28645768 :     y[5] = CL_mac_j( CL_scale_t( y[5], C163 ), CL_scale_t( y[5], C166 ) );
     667    28645768 :     y[7] = CL_mac_j( CL_scale_t( y[7], C165 ), CL_scale_t( y[7], C164 ) );
     668    28645768 :     y[13] = CL_mac_j( CL_scale_t( y[13], C165 ), CL_scale_t( y[13], C164 ) );
     669    28645768 :     y[15] = CL_mac_j( CL_scale_t( y[15], C164 ), CL_scale_t( y[15], C165 ) );
     670             : 
     671             : 
     672             :     /* i=0 */
     673             :     {
     674    28645768 :         t0 = CL_add( y[0], y[8] );
     675    28645768 :         t2 = CL_sub( y[0], y[8] );
     676    28645768 :         t4 = CL_add( y[4], y[12] );
     677    28645768 :         t7 = CL_sub( y[4], y[12] );
     678             : 
     679    28645768 :         input[0] = CL_add( t0, t4 );
     680    28645768 :         input[4] = CL_msu_j( t2, t7 );
     681    28645768 :         input[8] = CL_sub( t0, t4 );
     682    28645768 :         input[12] = CL_mac_j( t2, t7 );
     683             :     }
     684             :     /* i=1 */
     685             :     {
     686    28645768 :         t0 = CL_add( y[1], y[9] );
     687    28645768 :         t2 = CL_sub( y[1], y[9] );
     688    28645768 :         t4 = CL_add( y[5], y[13] );
     689    28645768 :         t7 = CL_sub( y[5], y[13] );
     690             : 
     691    28645768 :         input[1] = CL_add( t0, t4 );
     692    28645768 :         input[5] = CL_msu_j( t2, t7 );
     693    28645768 :         input[9] = CL_sub( t0, t4 );
     694    28645768 :         input[13] = CL_mac_j( t2, t7 );
     695             :     }
     696             :     /* i=2 */
     697             :     {
     698    28645768 :         t0 = CL_add( y[2], y[10] );
     699    28645768 :         t2 = CL_sub( y[2], y[10] );
     700    28645768 :         t4 = CL_add( y[6], y[14] );
     701    28645768 :         t7 = CL_sub( y[6], y[14] );
     702             : 
     703    28645768 :         input[2] = CL_add( t0, t4 );
     704    28645768 :         input[6] = CL_msu_j( t2, t7 );
     705    28645768 :         input[10] = CL_sub( t0, t4 );
     706    28645768 :         input[14] = CL_mac_j( t2, t7 );
     707             :     }
     708             :     /* i=3 */
     709             :     {
     710    28645768 :         t0 = CL_add( y[3], y[11] );
     711    28645768 :         t2 = CL_sub( y[3], y[11] );
     712    28645768 :         t4 = CL_add( y[7], y[15] );
     713    28645768 :         t7 = CL_sub( y[7], y[15] );
     714             : 
     715    28645768 :         input[3] = CL_add( t0, t4 );
     716    28645768 :         input[7] = CL_msu_j( t2, t7 );
     717    28645768 :         input[11] = CL_sub( t0, t4 );
     718    28645768 :         input[15] = CL_mac_j( t2, t7 );
     719             :     }
     720             : #ifdef WMOPS
     721             :     multiCounter[currCounter].CL_move += 16;
     722             : #endif
     723    28645768 : }
     724             : 
     725             : 
     726             : /**
     727             :  * \brief    Function performs a complex 20-point FFT
     728             :  *           The FFT is performed in place. The result of the FFT
     729             :  *           is scaled by SCALEFACTOR20 bits.
     730             :  *
     731             :  *           WOPS with 32x16 bit multiplications:  432 cycles
     732             :  *
     733             :  * \param    [i/o] inp_data  complex input / output buffer (Qx); elements
     734             :  *                           inp_data[0] .. inp_data[19] are all read through
     735             :  *                           CL_shr( ., SCALEFACTOR20 ) and then all overwritten
     736             :  *
     737             :  * \return   void
     738             :  */
     739    11094998 : static void fft20_with_cmplx_data( cmplx *inp_data /*Qx*/ )
     740             : {
     741             :     cmplx r1, r2, r3, r4;
     742             :     cmplx x0, x1, x2, x3, x4;
     743             :     cmplx t, t0, t1, t2, t3;
     744             :     cmplx y[20]; /* intermediate buffer: filled by stage 1, consumed by stage 2 */
     745             :     cmplx *y0, *y1, *y2, *y3, *y4;
     746             : 
     747    11094998 :     y0 = y;      /* y0[k] is y[k]      */
     748    11094998 :     y1 = &y[4];  /* y1[k] is y[4 + k]  */
     749    11094998 :     y2 = &y[16]; /* y2[k] is y[16 + k] */
     750    11094998 :     y3 = &y[8];  /* y3[k] is y[8 + k]  */
     751    11094998 :     y4 = &y[12]; /* y4[k] is y[12 + k] */
     752             : 
     753             :     { /* stage 1, k = 0: inputs 0, 16, 12, 8, 4 -> y0[0] .. y4[0] */
     754    11094998 :         x0 = CL_shr( inp_data[0], SCALEFACTOR20 ); // Qx - 5
     755    11094998 :         x1 = CL_shr( inp_data[16], SCALEFACTOR20 );
     756    11094998 :         x2 = CL_shr( inp_data[12], SCALEFACTOR20 );
     757    11094998 :         x3 = CL_shr( inp_data[8], SCALEFACTOR20 );
     758    11094998 :         x4 = CL_shr( inp_data[4], SCALEFACTOR20 );
     759             : 
     760    11094998 :         r4 = CL_sub( x1, x4 );
     761    11094998 :         r2 = CL_sub( x2, x3 );
     762    11094998 :         r1 = CL_add( x1, x4 );
     763    11094998 :         r3 = CL_add( x2, x3 );
     764    11094998 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     765    11094998 :         r1 = CL_add( r1, r3 );
     766    11094998 :         y0[0] = CL_add( x0, r1 ); /* sum of all five scaled inputs */
     767    11094998 :         r1 = CL_add( y0[0], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     768    11094998 :         r3 = CL_sub( r1, t );
     769    11094998 :         r1 = CL_add( r1, t );
     770    11094998 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     771    11094998 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     772    11094998 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     773             : 
     774             : 
     775    11094998 :         y1[0] = CL_msu_j( r1, r2 );
     776    11094998 :         y2[0] = CL_mac_j( r1, r2 );
     777    11094998 :         y3[0] = CL_mac_j( r3, r4 );
     778    11094998 :         y4[0] = CL_msu_j( r3, r4 );
     779             :     }
     780             :     { /* stage 1, k = 1: inputs 5, 1, 17, 13, 9 -> y0[1] .. y4[1] */
     781    11094998 :         x0 = CL_shr( inp_data[5], SCALEFACTOR20 ); // Qx - 5
     782    11094998 :         x1 = CL_shr( inp_data[1], SCALEFACTOR20 );
     783    11094998 :         x2 = CL_shr( inp_data[17], SCALEFACTOR20 );
     784    11094998 :         x3 = CL_shr( inp_data[13], SCALEFACTOR20 );
     785    11094998 :         x4 = CL_shr( inp_data[9], SCALEFACTOR20 );
     786             : 
     787    11094998 :         r4 = CL_sub( x1, x4 );
     788    11094998 :         r2 = CL_sub( x2, x3 );
     789    11094998 :         r1 = CL_add( x1, x4 );
     790    11094998 :         r3 = CL_add( x2, x3 );
     791    11094998 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     792    11094998 :         r1 = CL_add( r1, r3 );
     793    11094998 :         y0[1] = CL_add( x0, r1 ); /* sum of all five scaled inputs */
     794    11094998 :         r1 = CL_add( y0[1], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     795    11094998 :         r3 = CL_sub( r1, t );
     796    11094998 :         r1 = CL_add( r1, t );
     797    11094998 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     798    11094998 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     799    11094998 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     800             : 
     801             : 
     802    11094998 :         y1[1] = CL_msu_j( r1, r2 );
     803    11094998 :         y2[1] = CL_mac_j( r1, r2 );
     804    11094998 :         y3[1] = CL_mac_j( r3, r4 );
     805    11094998 :         y4[1] = CL_msu_j( r3, r4 );
     806             :     }
     807             :     { /* stage 1, k = 2: inputs 10, 6, 2, 18, 14 -> y0[2] .. y4[2] */
     808    11094998 :         x0 = CL_shr( inp_data[10], SCALEFACTOR20 ); // Qx - 5
     809    11094998 :         x1 = CL_shr( inp_data[6], SCALEFACTOR20 );
     810    11094998 :         x2 = CL_shr( inp_data[2], SCALEFACTOR20 );
     811    11094998 :         x3 = CL_shr( inp_data[18], SCALEFACTOR20 );
     812    11094998 :         x4 = CL_shr( inp_data[14], SCALEFACTOR20 );
     813             : 
     814    11094998 :         r4 = CL_sub( x1, x4 );
     815    11094998 :         r2 = CL_sub( x2, x3 );
     816    11094998 :         r1 = CL_add( x1, x4 );
     817    11094998 :         r3 = CL_add( x2, x3 );
     818    11094998 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     819    11094998 :         r1 = CL_add( r1, r3 );
     820    11094998 :         y0[2] = CL_add( x0, r1 ); /* sum of all five scaled inputs */
     821    11094998 :         r1 = CL_add( y0[2], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     822    11094998 :         r3 = CL_sub( r1, t );
     823    11094998 :         r1 = CL_add( r1, t );
     824    11094998 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     825    11094998 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     826    11094998 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     827             : 
     828             : 
     829    11094998 :         y1[2] = CL_msu_j( r1, r2 );
     830    11094998 :         y2[2] = CL_mac_j( r1, r2 );
     831    11094998 :         y3[2] = CL_mac_j( r3, r4 );
     832    11094998 :         y4[2] = CL_msu_j( r3, r4 );
     833             :     }
     834             :     { /* stage 1, k = 3: inputs 15, 11, 7, 3, 19 -> y0[3] .. y4[3] */
     835    11094998 :         x0 = CL_shr( inp_data[15], SCALEFACTOR20 ); // Qx - 5
     836    11094998 :         x1 = CL_shr( inp_data[11], SCALEFACTOR20 );
     837    11094998 :         x2 = CL_shr( inp_data[7], SCALEFACTOR20 );
     838    11094998 :         x3 = CL_shr( inp_data[3], SCALEFACTOR20 );
     839    11094998 :         x4 = CL_shr( inp_data[19], SCALEFACTOR20 );
     840             : 
     841    11094998 :         r4 = CL_sub( x1, x4 );
     842    11094998 :         r2 = CL_sub( x2, x3 );
     843    11094998 :         r1 = CL_add( x1, x4 );
     844    11094998 :         r3 = CL_add( x2, x3 );
     845    11094998 :         t = CL_scale_t( CL_sub( r1, r3 ), C54 );
     846    11094998 :         r1 = CL_add( r1, r3 );
     847    11094998 :         y0[3] = CL_add( x0, r1 ); /* sum of all five scaled inputs */
     848    11094998 :         r1 = CL_add( y0[3], ( CL_shl( CL_scale_t( r1, C55 ), 1 ) ) );
     849    11094998 :         r3 = CL_sub( r1, t );
     850    11094998 :         r1 = CL_add( r1, t );
     851    11094998 :         t = CL_scale_t( ( CL_add( r4, r2 ) ), C51 );
     852    11094998 :         r4 = CL_add( t, CL_shl( CL_scale_t( r4, C52 ), 1 ) );
     853    11094998 :         r2 = CL_add( t, CL_scale_t( r2, C53 ) );
     854             : 
     855             : 
     856    11094998 :         y1[3] = CL_msu_j( r1, r2 );
     857    11094998 :         y2[3] = CL_mac_j( r1, r2 );
     858    11094998 :         y3[3] = CL_mac_j( r3, r4 );
     859    11094998 :         y4[3] = CL_msu_j( r3, r4 );
     860             :     }
     861             : 
     862             :     { /* stage 2: five 4-input butterflies over consecutive quadruples of y[] */
     863    11094998 :         cmplx *ptr_y = y;
     864             :         { /* y[0..3] (written via y0) -> inp_data[0], [5], [10], [15] */
     865             :             cmplx Cy0, Cy1, Cy2, Cy3;
     866             : 
     867    11094998 :             Cy0 = *ptr_y++;
     868    11094998 :             Cy1 = *ptr_y++;
     869    11094998 :             Cy2 = *ptr_y++;
     870    11094998 :             Cy3 = *ptr_y++;
     871             : 
     872             :             /*  Pre-additions */
     873    11094998 :             t0 = CL_add( Cy0, Cy2 );
     874    11094998 :             t1 = CL_sub( Cy0, Cy2 );
     875    11094998 :             t2 = CL_add( Cy1, Cy3 );
     876    11094998 :             t3 = CL_sub( Cy1, Cy3 );
     877             : 
     878             : 
     879    11094998 :             inp_data[0] = CL_add( t0, t2 );
     880    11094998 :             inp_data[5] = CL_msu_j( t1, t3 ); /* presumably t1 - j*t3 -- confirm against CL_msu_j */
     881    11094998 :             inp_data[10] = CL_sub( t0, t2 );
     882    11094998 :             inp_data[15] = CL_mac_j( t1, t3 ); /* presumably t1 + j*t3 -- confirm against CL_mac_j */
     883             :         }
     884             : 
     885             :         { /* y[4..7] (written via y1) -> inp_data[4], [9], [14], [19] */
     886             :             cmplx Cy0, Cy1, Cy2, Cy3;
     887             : 
     888    11094998 :             Cy0 = *ptr_y++;
     889    11094998 :             Cy1 = *ptr_y++;
     890    11094998 :             Cy2 = *ptr_y++;
     891    11094998 :             Cy3 = *ptr_y++;
     892             : 
     893             :             /*  Pre-additions */
     894    11094998 :             t0 = CL_add( Cy0, Cy2 );
     895    11094998 :             t1 = CL_sub( Cy0, Cy2 );
     896    11094998 :             t2 = CL_add( Cy1, Cy3 );
     897    11094998 :             t3 = CL_sub( Cy1, Cy3 );
     898             : 
     899             : 
     900    11094998 :             inp_data[4] = CL_add( t0, t2 );
     901    11094998 :             inp_data[9] = CL_msu_j( t1, t3 );
     902    11094998 :             inp_data[14] = CL_sub( t0, t2 );
     903    11094998 :             inp_data[19] = CL_mac_j( t1, t3 );
     904             :         }
     905             : 
     906             :         { /* y[8..11] (written via y3) -> inp_data[8], [13], [18], [3] */
     907             :             cmplx Cy0, Cy1, Cy2, Cy3;
     908             : 
     909    11094998 :             Cy0 = *ptr_y++;
     910    11094998 :             Cy1 = *ptr_y++;
     911    11094998 :             Cy2 = *ptr_y++;
     912    11094998 :             Cy3 = *ptr_y++;
     913             : 
     914             :             /*  Pre-additions */
     915    11094998 :             t0 = CL_add( Cy0, Cy2 );
     916    11094998 :             t1 = CL_sub( Cy0, Cy2 );
     917    11094998 :             t2 = CL_add( Cy1, Cy3 );
     918    11094998 :             t3 = CL_sub( Cy1, Cy3 );
     919             : 
     920             : 
     921    11094998 :             inp_data[8] = CL_add( t0, t2 );
     922    11094998 :             inp_data[13] = CL_msu_j( t1, t3 );
     923    11094998 :             inp_data[18] = CL_sub( t0, t2 );
     924    11094998 :             inp_data[3] = CL_mac_j( t1, t3 );
     925             :         }
     926             : 
     927             :         { /* y[12..15] (written via y4) -> inp_data[12], [17], [2], [7] */
     928             :             cmplx Cy0, Cy1, Cy2, Cy3;
     929             : 
     930    11094998 :             Cy0 = *ptr_y++;
     931    11094998 :             Cy1 = *ptr_y++;
     932    11094998 :             Cy2 = *ptr_y++;
     933    11094998 :             Cy3 = *ptr_y++;
     934             : 
     935             :             /*  Pre-additions */
     936    11094998 :             t0 = CL_add( Cy0, Cy2 );
     937    11094998 :             t1 = CL_sub( Cy0, Cy2 );
     938    11094998 :             t2 = CL_add( Cy1, Cy3 );
     939    11094998 :             t3 = CL_sub( Cy1, Cy3 );
     940             : 
     941    11094998 :             inp_data[12] = CL_add( t0, t2 );
     942    11094998 :             inp_data[17] = CL_msu_j( t1, t3 );
     943    11094998 :             inp_data[2] = CL_sub( t0, t2 );
     944    11094998 :             inp_data[7] = CL_mac_j( t1, t3 );
     945             :         }
     946             : 
     947             :         { /* y[16..19] (written via y2) -> inp_data[16], [1], [6], [11] */
     948             :             cmplx Cy0, Cy1, Cy2, Cy3;
     949             : 
     950    11094998 :             Cy0 = *ptr_y++;
     951    11094998 :             Cy1 = *ptr_y++;
     952    11094998 :             Cy2 = *ptr_y++;
     953    11094998 :             Cy3 = *ptr_y++;
     954             : 
     955             :             /*  Pre-additions */
     956    11094998 :             t0 = CL_add( Cy0, Cy2 );
     957    11094998 :             t1 = CL_sub( Cy0, Cy2 );
     958    11094998 :             t2 = CL_add( Cy1, Cy3 );
     959    11094998 :             t3 = CL_sub( Cy1, Cy3 );
     960             : 
     961             : 
     962    11094998 :             inp_data[16] = CL_add( t0, t2 );
     963    11094998 :             inp_data[1] = CL_msu_j( t1, t3 );
     964    11094998 :             inp_data[6] = CL_sub( t0, t2 );
     965    11094998 :             inp_data[11] = CL_mac_j( t1, t3 );
     966             :         }
     967             :     }
     968             : #ifdef WMOPS
     969             :     multiCounter[currCounter].CL_move += 20; /* WMOPS builds only: add 20 to the CL_move counter */
     970             : #endif
     971    11094998 : }
     972             : 
     973             : 
     974             : /**
     975             :  * \brief    Function performs a complex 30-point FFT
     976             :  *           The FFT is performed inplace. The result of the FFT
     977             :  *           is scaled by SCALEFACTOR30 bits.
     978             :  *
     979             :  *           WOPS with 32x16 bit multiplications:  828 cycles
     980             :  *
     981             :  * \param    [i/o] inp   complex input / output data (Qx), 30 elements:
     982             :  *                       inp[0] .. inp[29] are each read through
     983             :  *                       CL_shr( ., SCALEFACTOR30_1 ) by the two FFT15 stages
     984             :  *
     985             :  * \return   void
     986             :  */
     987             : 
     988     9223404 : static void fft30_with_cmplx_data( cmplx *inp /*Qx*/ )
     989             : {
     990     9223404 :     cmplx *l = &inp[0];
     991     9223404 :     cmplx *h = &inp[15];
     992             : 
     993             :     cmplx z[30], y[15], x[15], rs1, rs2, rs3, rs4, t;
     994             : 
     995             :     /* 1. FFT15 stage */
     996             : 
     997     9223404 :     x[0] = CL_shr( inp[0], SCALEFACTOR30_1 ); // Qx - 5
     998     9223404 :     x[1] = CL_shr( inp[18], SCALEFACTOR30_1 );
     999     9223404 :     x[2] = CL_shr( inp[6], SCALEFACTOR30_1 );
    1000     9223404 :     x[3] = CL_shr( inp[24], SCALEFACTOR30_1 );
    1001     9223404 :     x[4] = CL_shr( inp[12], SCALEFACTOR30_1 );
    1002             : 
    1003     9223404 :     x[5] = CL_shr( inp[20], SCALEFACTOR30_1 ); // Qx - 5
    1004     9223404 :     x[6] = CL_shr( inp[8], SCALEFACTOR30_1 );
    1005     9223404 :     x[7] = CL_shr( inp[26], SCALEFACTOR30_1 );
    1006     9223404 :     x[8] = CL_shr( inp[14], SCALEFACTOR30_1 );
    1007     9223404 :     x[9] = CL_shr( inp[2], SCALEFACTOR30_1 );
    1008             : 
    1009     9223404 :     x[10] = CL_shr( inp[10], SCALEFACTOR30_1 ); // Qx - 5
    1010     9223404 :     x[11] = CL_shr( inp[28], SCALEFACTOR30_1 );
    1011     9223404 :     x[12] = CL_shr( inp[16], SCALEFACTOR30_1 );
    1012     9223404 :     x[13] = CL_shr( inp[4], SCALEFACTOR30_1 );
    1013     9223404 :     x[14] = CL_shr( inp[22], SCALEFACTOR30_1 );
    1014             : 
    1015             : 
    1016             :     /* 1. FFT5 stage */
    1017     9223404 :     rs1 = CL_add( x[1], x[4] );
    1018     9223404 :     rs4 = CL_sub( x[1], x[4] );
    1019     9223404 :     rs3 = CL_add( x[2], x[3] );
    1020     9223404 :     rs2 = CL_sub( x[2], x[3] );
    1021     9223404 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1022     9223404 :     rs1 = CL_add( rs1, rs3 );
    1023     9223404 :     y[0] = CL_add( x[0], rs1 );
    1024     9223404 :     rs1 = CL_add( y[0], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1025     9223404 :     rs3 = CL_sub( rs1, t );
    1026     9223404 :     rs1 = CL_add( rs1, t );
    1027     9223404 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1028     9223404 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1029     9223404 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1030             : 
    1031             :     /* combination */
    1032     9223404 :     y[1] = CL_msu_j( rs1, rs2 );
    1033     9223404 :     y[4] = CL_mac_j( rs1, rs2 );
    1034     9223404 :     y[2] = CL_mac_j( rs3, rs4 );
    1035     9223404 :     y[3] = CL_msu_j( rs3, rs4 );
    1036             : 
    1037             : 
    1038             :     /* 2. FFT5 stage */
    1039     9223404 :     rs1 = CL_add( x[6], x[9] );
    1040     9223404 :     rs4 = CL_sub( x[6], x[9] );
    1041     9223404 :     rs3 = CL_add( x[7], x[8] );
    1042     9223404 :     rs2 = CL_sub( x[7], x[8] );
    1043     9223404 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1044     9223404 :     rs1 = CL_add( rs1, rs3 );
    1045     9223404 :     y[5] = CL_add( x[5], rs1 );
    1046     9223404 :     rs1 = CL_add( y[5], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1047     9223404 :     rs3 = CL_sub( rs1, t );
    1048     9223404 :     rs1 = CL_add( rs1, t );
    1049     9223404 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1050     9223404 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1051     9223404 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1052             : 
    1053             :     /* combination */
    1054     9223404 :     y[6] = CL_msu_j( rs1, rs2 );
    1055     9223404 :     y[9] = CL_mac_j( rs1, rs2 );
    1056     9223404 :     y[7] = CL_mac_j( rs3, rs4 );
    1057     9223404 :     y[8] = CL_msu_j( rs3, rs4 );
    1058             : 
    1059             : 
    1060             :     /* 3. FFT5 stage */
    1061     9223404 :     rs1 = CL_add( x[11], x[14] );
    1062     9223404 :     rs4 = CL_sub( x[11], x[14] );
    1063     9223404 :     rs3 = CL_add( x[12], x[13] );
    1064     9223404 :     rs2 = CL_sub( x[12], x[13] );
    1065     9223404 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1066     9223404 :     rs1 = CL_add( rs1, rs3 );
    1067     9223404 :     y[10] = CL_add( x[10], rs1 );
    1068     9223404 :     rs1 = CL_add( y[10], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1069     9223404 :     rs3 = CL_sub( rs1, t );
    1070     9223404 :     rs1 = CL_add( rs1, t );
    1071     9223404 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1072     9223404 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1073     9223404 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1074             : 
    1075             :     /* combination */
    1076     9223404 :     y[11] = CL_msu_j( rs1, rs2 );
    1077     9223404 :     y[14] = CL_mac_j( rs1, rs2 );
    1078     9223404 :     y[12] = CL_mac_j( rs3, rs4 );
    1079     9223404 :     y[13] = CL_msu_j( rs3, rs4 );
    1080             :     /*for (i=10; i<15; i++)
    1081             :     {
    1082             :     printf("%d,\t %d,\t",y[i].re, y[i].im);
    1083             :     }
    1084             :     printf("\n\n");*/
    1085             : 
    1086             : 
    1087             :     /* 1. FFT3 stage */
    1088             :     /* real part */
    1089     9223404 :     rs1 = CL_add( y[5], y[10] );
    1090     9223404 :     rs2 = CL_scale_t( CL_sub( y[5], y[10] ), C31 );
    1091     9223404 :     z[0] = CL_add( y[0], rs1 );
    1092     9223404 :     rs1 = CL_sub( y[0], CL_shr( rs1, 1 ) );
    1093             : 
    1094     9223404 :     z[10] = CL_mac_j( rs1, rs2 );
    1095     9223404 :     z[5] = CL_msu_j( rs1, rs2 );
    1096             : 
    1097             :     /* 2. FFT3 stage */
    1098     9223404 :     rs1 = CL_add( y[6], y[11] );
    1099     9223404 :     rs2 = CL_scale_t( CL_sub( y[6], y[11] ), C31 );
    1100     9223404 :     z[6] = CL_add( y[1], rs1 );
    1101     9223404 :     rs1 = CL_sub( y[1], CL_shr( rs1, 1 ) );
    1102             : 
    1103     9223404 :     z[1] = CL_mac_j( rs1, rs2 );
    1104     9223404 :     z[11] = CL_msu_j( rs1, rs2 );
    1105             : 
    1106             : 
    1107             :     /* 3. FFT3 stage */
    1108     9223404 :     rs1 = CL_add( y[7], y[12] );
    1109     9223404 :     rs2 = CL_scale_t( CL_sub( y[7], y[12] ), C31 );
    1110     9223404 :     z[12] = CL_add( y[2], rs1 );
    1111     9223404 :     rs1 = CL_sub( y[2], CL_shr( rs1, 1 ) );
    1112             : 
    1113     9223404 :     z[7] = CL_mac_j( rs1, rs2 );
    1114     9223404 :     z[2] = CL_msu_j( rs1, rs2 );
    1115             : 
    1116             : 
    1117             :     /* 4. FFT3 stage */
    1118     9223404 :     rs1 = CL_add( y[8], y[13] );
    1119     9223404 :     rs2 = CL_scale_t( CL_sub( y[8], y[13] ), C31 );
    1120     9223404 :     z[3] = CL_add( y[3], rs1 );
    1121     9223404 :     rs1 = CL_sub( y[3], CL_shr( rs1, 1 ) );
    1122             : 
    1123     9223404 :     z[13] = CL_mac_j( rs1, rs2 );
    1124     9223404 :     z[8] = CL_msu_j( rs1, rs2 );
    1125             : 
    1126             : 
    1127             :     /* 5. FFT3 stage */
    1128     9223404 :     rs1 = CL_add( y[9], y[14] );
    1129     9223404 :     rs2 = CL_scale_t( CL_sub( y[9], y[14] ), C31 );
    1130     9223404 :     z[9] = CL_add( y[4], rs1 );
    1131     9223404 :     rs1 = CL_sub( y[4], CL_shr( rs1, 1 ) );
    1132             : 
    1133     9223404 :     z[4] = CL_mac_j( rs1, rs2 );
    1134     9223404 :     z[14] = CL_msu_j( rs1, rs2 );
    1135             : 
    1136             :     /*for (i=0; i<15; i++)
    1137             :     printf("%d,\t %d,\t",z[i].re, z[i].im);
    1138             :     printf("\n\n");*/
    1139             : 
    1140             : 
    1141             :     /* 2. FFT15 stage */
    1142             : 
    1143     9223404 :     x[0] = CL_shr( inp[15], SCALEFACTOR30_1 ); // Qx - 5
    1144     9223404 :     x[1] = CL_shr( inp[3], SCALEFACTOR30_1 );
    1145     9223404 :     x[2] = CL_shr( inp[21], SCALEFACTOR30_1 );
    1146     9223404 :     x[3] = CL_shr( inp[9], SCALEFACTOR30_1 );
    1147     9223404 :     x[4] = CL_shr( inp[27], SCALEFACTOR30_1 );
    1148             : 
    1149     9223404 :     x[5] = CL_shr( inp[5], SCALEFACTOR30_1 ); // Qx - 5
    1150     9223404 :     x[6] = CL_shr( inp[23], SCALEFACTOR30_1 );
    1151     9223404 :     x[7] = CL_shr( inp[11], SCALEFACTOR30_1 );
    1152     9223404 :     x[8] = CL_shr( inp[29], SCALEFACTOR30_1 );
    1153     9223404 :     x[9] = CL_shr( inp[17], SCALEFACTOR30_1 );
    1154             : 
    1155     9223404 :     x[10] = CL_shr( inp[25], SCALEFACTOR30_1 ); // Qx - 5
    1156     9223404 :     x[11] = CL_shr( inp[13], SCALEFACTOR30_1 );
    1157     9223404 :     x[12] = CL_shr( inp[1], SCALEFACTOR30_1 );
    1158     9223404 :     x[13] = CL_shr( inp[19], SCALEFACTOR30_1 );
    1159     9223404 :     x[14] = CL_shr( inp[7], SCALEFACTOR30_1 );
    1160             : 
    1161             :     /* 1. FFT5 stage */
    1162     9223404 :     rs1 = CL_add( x[1], x[4] );
    1163     9223404 :     rs4 = CL_sub( x[1], x[4] );
    1164     9223404 :     rs3 = CL_add( x[2], x[3] );
    1165     9223404 :     rs2 = CL_sub( x[2], x[3] );
    1166     9223404 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1167     9223404 :     rs1 = CL_add( rs1, rs3 );
    1168     9223404 :     y[0] = CL_add( x[0], rs1 );
    1169     9223404 :     rs1 = CL_add( y[0], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1170     9223404 :     rs3 = CL_sub( rs1, t );
    1171     9223404 :     rs1 = CL_add( rs1, t );
    1172     9223404 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1173     9223404 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1174     9223404 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1175             : 
    1176             :     /* combination */
    1177     9223404 :     y[1] = CL_msu_j( rs1, rs2 );
    1178     9223404 :     y[4] = CL_mac_j( rs1, rs2 );
    1179     9223404 :     y[2] = CL_mac_j( rs3, rs4 );
    1180     9223404 :     y[3] = CL_msu_j( rs3, rs4 );
    1181             : 
    1182             : 
    1183             :     /* 2. FFT5 stage */
    1184     9223404 :     rs1 = CL_add( x[6], x[9] );
    1185     9223404 :     rs4 = CL_sub( x[6], x[9] );
    1186     9223404 :     rs3 = CL_add( x[7], x[8] );
    1187     9223404 :     rs2 = CL_sub( x[7], x[8] );
    1188     9223404 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1189     9223404 :     rs1 = CL_add( rs1, rs3 );
    1190     9223404 :     y[5] = CL_add( x[5], rs1 );
    1191     9223404 :     rs1 = CL_add( y[5], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1192     9223404 :     rs3 = CL_sub( rs1, t );
    1193     9223404 :     rs1 = CL_add( rs1, t );
    1194     9223404 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1195     9223404 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1196     9223404 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1197             : 
    1198             :     /* combination */
    1199     9223404 :     y[6] = CL_msu_j( rs1, rs2 );
    1200     9223404 :     y[9] = CL_mac_j( rs1, rs2 );
    1201     9223404 :     y[7] = CL_mac_j( rs3, rs4 );
    1202     9223404 :     y[8] = CL_msu_j( rs3, rs4 );
    1203             : 
    1204             : 
    1205             :     /* 3. FFT5 stage */
    1206     9223404 :     rs1 = CL_add( x[11], x[14] );
    1207     9223404 :     rs4 = CL_sub( x[11], x[14] );
    1208     9223404 :     rs3 = CL_add( x[12], x[13] );
    1209     9223404 :     rs2 = CL_sub( x[12], x[13] );
    1210     9223404 :     t = CL_scale_t( CL_sub( rs1, rs3 ), C54 );
    1211     9223404 :     rs1 = CL_add( rs1, rs3 );
    1212     9223404 :     y[10] = CL_add( x[10], rs1 );
    1213     9223404 :     rs1 = CL_add( y[10], ( CL_shl( CL_scale_t( rs1, C55 ), 1 ) ) );
    1214     9223404 :     rs3 = CL_sub( rs1, t );
    1215     9223404 :     rs1 = CL_add( rs1, t );
    1216     9223404 :     t = CL_scale_t( CL_add( rs4, rs2 ), C51 );
    1217     9223404 :     rs4 = CL_add( t, CL_shl( CL_scale_t( rs4, C52 ), 1 ) );
    1218     9223404 :     rs2 = CL_add( t, CL_scale_t( rs2, C53 ) );
    1219             : 
    1220             :     /* combination */
    1221     9223404 :     y[11] = CL_msu_j( rs1, rs2 );
    1222     9223404 :     y[14] = CL_mac_j( rs1, rs2 );
    1223     9223404 :     y[12] = CL_mac_j( rs3, rs4 );
    1224     9223404 :     y[13] = CL_msu_j( rs3, rs4 );
    1225             :     /*for (i=10; i<15; i++)
    1226             :     {
    1227             :     printf("%d,\t %d,\t",y[i].re, y[i].im);
    1228             :     }
    1229             :     printf("\n\n");*/
    1230             : 
    1231             : 
    1232             :     /* 1. FFT3 stage */
    1233             :     /* real part */
    1234     9223404 :     rs1 = CL_add( y[5], y[10] );
    1235     9223404 :     rs2 = CL_scale_t( CL_sub( y[5], y[10] ), C31 );
    1236     9223404 :     z[15] = CL_add( y[0], rs1 );
    1237     9223404 :     rs1 = CL_sub( y[0], CL_shr( rs1, 1 ) );
    1238             : 
    1239     9223404 :     z[25] = CL_mac_j( rs1, rs2 );
    1240     9223404 :     z[20] = CL_msu_j( rs1, rs2 );
    1241             : 
    1242             :     /* 2. FFT3 stage */
    1243     9223404 :     rs1 = CL_add( y[6], y[11] );
    1244     9223404 :     rs2 = CL_scale_t( CL_sub( y[6], y[11] ), C31 );
    1245     9223404 :     z[21] = CL_add( y[1], rs1 );
    1246     9223404 :     rs1 = CL_sub( y[1], CL_shr( rs1, 1 ) );
    1247             : 
    1248     9223404 :     z[16] = CL_mac_j( rs1, rs2 );
    1249     9223404 :     z[26] = CL_msu_j( rs1, rs2 );
    1250             : 
    1251             : 
    1252             :     /* 3. FFT3 stage */
    1253     9223404 :     rs1 = CL_add( y[7], y[12] );
    1254     9223404 :     rs2 = CL_scale_t( CL_sub( y[7], y[12] ), C31 );
    1255     9223404 :     z[27] = CL_add( y[2], rs1 );
    1256     9223404 :     rs1 = CL_sub( y[2], CL_shr( rs1, 1 ) );
    1257             : 
    1258     9223404 :     z[22] = CL_mac_j( rs1, rs2 );
    1259     9223404 :     z[17] = CL_msu_j( rs1, rs2 );
    1260             : 
    1261             : 
    1262             :     /* 4. FFT3 stage */
    1263     9223404 :     rs1 = CL_add( y[8], y[13] );
    1264     9223404 :     rs2 = CL_scale_t( CL_sub( y[8], y[13] ), C31 );
    1265     9223404 :     z[18] = CL_add( y[3], rs1 );
    1266     9223404 :     rs1 = CL_sub( y[3], CL_shr( rs1, 1 ) );
    1267             : 
    1268     9223404 :     z[28] = CL_mac_j( rs1, rs2 );
    1269     9223404 :     z[23] = CL_msu_j( rs1, rs2 );
    1270             : 
    1271             : 
    1272             :     /* 5. FFT3 stage */
    1273     9223404 :     rs1 = CL_add( y[9], y[14] );
    1274     9223404 :     rs2 = CL_scale_t( CL_sub( y[9], y[14] ), C31 );
    1275     9223404 :     z[24] = CL_add( y[4], rs1 );
    1276     9223404 :     rs1 = CL_sub( y[4], CL_shr( rs1, 1 ) );
    1277             : 
    1278     9223404 :     z[19] = CL_mac_j( rs1, rs2 );
    1279     9223404 :     z[29] = CL_msu_j( rs1, rs2 );
    1280             : 
    1281             :     /*for (i=0; i<30; i++)
    1282             :     printf("%d,\t %d,\t",z[i].re, z[i].im);
    1283             :     printf("\n\n");*/
    1284             : 
    1285             : 
    1286             :     /* 1. FFT2 stage */
    1287     9223404 :     rs1 = CL_shr( z[0], SCALEFACTOR30_2 );
    1288     9223404 :     rs2 = CL_shr( z[15], SCALEFACTOR30_2 );
    1289     9223404 :     *l = CL_add( rs1, rs2 );
    1290     9223404 :     *h = CL_sub( rs1, rs2 );
    1291     9223404 :     l += 1;
    1292     9223404 :     h += 1;
    1293             : 
    1294             :     /* 2. FFT2 stage */
    1295     9223404 :     rs1 = CL_shr( z[8], SCALEFACTOR30_2 );
    1296     9223404 :     rs2 = CL_shr( z[23], SCALEFACTOR30_2 );
    1297     9223404 :     *h = CL_add( rs1, rs2 );
    1298     9223404 :     *l = CL_sub( rs1, rs2 );
    1299     9223404 :     l += 1;
    1300     9223404 :     h += 1;
    1301             : 
    1302             : 
    1303             :     /* 3. FFT2 stage */
    1304     9223404 :     rs1 = CL_shr( z[1], SCALEFACTOR30_2 );
    1305     9223404 :     rs2 = CL_shr( z[16], SCALEFACTOR30_2 );
    1306     9223404 :     *l = CL_add( rs1, rs2 );
    1307     9223404 :     *h = CL_sub( rs1, rs2 );
    1308     9223404 :     l += 1;
    1309     9223404 :     h += 1;
    1310             : 
    1311             : 
    1312             :     /* 4. FFT2 stage */
    1313     9223404 :     rs1 = CL_shr( z[9], SCALEFACTOR30_2 );
    1314     9223404 :     rs2 = CL_shr( z[24], SCALEFACTOR30_2 );
    1315     9223404 :     *h = CL_add( rs1, rs2 );
    1316     9223404 :     *l = CL_sub( rs1, rs2 );
    1317     9223404 :     l += 1;
    1318     9223404 :     h += 1;
    1319             : 
    1320             :     /* 5. FFT2 stage */
    1321     9223404 :     rs1 = CL_shr( z[2], SCALEFACTOR30_2 );
    1322     9223404 :     rs2 = CL_shr( z[17], SCALEFACTOR30_2 );
    1323     9223404 :     *l = CL_add( rs1, rs2 );
    1324     9223404 :     *h = CL_sub( rs1, rs2 );
    1325     9223404 :     l += 1;
    1326     9223404 :     h += 1;
    1327             : 
    1328             :     /* 6. FFT2 stage */
    1329     9223404 :     rs1 = CL_shr( z[10], SCALEFACTOR30_2 );
    1330     9223404 :     rs2 = CL_shr( z[25], SCALEFACTOR30_2 );
    1331     9223404 :     *h = CL_add( rs1, rs2 );
    1332     9223404 :     *l = CL_sub( rs1, rs2 );
    1333     9223404 :     l += 1;
    1334     9223404 :     h += 1;
    1335             : 
    1336             :     /* 7. FFT2 stage */
    1337     9223404 :     rs1 = CL_shr( z[3], SCALEFACTOR30_2 );
    1338     9223404 :     rs2 = CL_shr( z[18], SCALEFACTOR30_2 );
    1339     9223404 :     *l = CL_add( rs1, rs2 );
    1340     9223404 :     *h = CL_sub( rs1, rs2 );
    1341     9223404 :     l += 1;
    1342     9223404 :     h += 1;
    1343             : 
    1344             :     /* 8. FFT2 stage */
    1345     9223404 :     rs1 = CL_shr( z[11], SCALEFACTOR30_2 );
    1346     9223404 :     rs2 = CL_shr( z[26], SCALEFACTOR30_2 );
    1347     9223404 :     *h = CL_add( rs1, rs2 );
    1348     9223404 :     *l = CL_sub( rs1, rs2 );
    1349     9223404 :     l += 1;
    1350     9223404 :     h += 1;
    1351             : 
    1352             :     /* 9. FFT2 stage */
    1353     9223404 :     rs1 = CL_shr( z[4], SCALEFACTOR30_2 );
    1354     9223404 :     rs2 = CL_shr( z[19], SCALEFACTOR30_2 );
    1355     9223404 :     *l = CL_add( rs1, rs2 );
    1356     9223404 :     *h = CL_sub( rs1, rs2 );
    1357     9223404 :     l += 1;
    1358     9223404 :     h += 1;
    1359             : 
    1360             :     /* 10. FFT2 stage */
    1361     9223404 :     rs1 = CL_shr( z[12], SCALEFACTOR30_2 );
    1362     9223404 :     rs2 = CL_shr( z[27], SCALEFACTOR30_2 );
    1363     9223404 :     *h = CL_add( rs1, rs2 );
    1364     9223404 :     *l = CL_sub( rs1, rs2 );
    1365     9223404 :     l += 1;
    1366     9223404 :     h += 1;
    1367             : 
    1368             :     /* 11. FFT2 stage */
    1369     9223404 :     rs1 = CL_shr( z[5], SCALEFACTOR30_2 );
    1370     9223404 :     rs2 = CL_shr( z[20], SCALEFACTOR30_2 );
    1371     9223404 :     *l = CL_add( rs1, rs2 );
    1372     9223404 :     *h = CL_sub( rs1, rs2 );
    1373     9223404 :     l += 1;
    1374     9223404 :     h += 1;
    1375             : 
    1376             :     /* 12. FFT2 stage */
    1377     9223404 :     rs1 = CL_shr( z[13], SCALEFACTOR30_2 );
    1378     9223404 :     rs2 = CL_shr( z[28], SCALEFACTOR30_2 );
    1379     9223404 :     *h = CL_add( rs1, rs2 );
    1380     9223404 :     *l = CL_sub( rs1, rs2 );
    1381     9223404 :     l += 1;
    1382     9223404 :     h += 1;
    1383             : 
    1384             :     /* 13. FFT2 stage */
    1385     9223404 :     rs1 = CL_shr( z[6], SCALEFACTOR30_2 );
    1386     9223404 :     rs2 = CL_shr( z[21], SCALEFACTOR30_2 );
    1387     9223404 :     *l = CL_add( rs1, rs2 );
    1388     9223404 :     *h = CL_sub( rs1, rs2 );
    1389     9223404 :     l += 1;
    1390     9223404 :     h += 1;
    1391             : 
    1392             :     /* 14. FFT2 stage */
    1393     9223404 :     rs1 = CL_shr( z[14], SCALEFACTOR30_2 );
    1394     9223404 :     rs2 = CL_shr( z[29], SCALEFACTOR30_2 );
    1395     9223404 :     *h = CL_add( rs1, rs2 );
    1396     9223404 :     *l = CL_sub( rs1, rs2 );
    1397     9223404 :     l += 1;
    1398     9223404 :     h += 1;
    1399             : 
    1400             :     /* 15. FFT2 stage */
    1401     9223404 :     rs1 = CL_shr( z[7], SCALEFACTOR30_2 );
    1402     9223404 :     rs2 = CL_shr( z[22], SCALEFACTOR30_2 );
    1403     9223404 :     *l = CL_add( rs1, rs2 );
    1404     9223404 :     *h = CL_sub( rs1, rs2 );
    1405     9223404 :     l += 1;
    1406     9223404 :     h += 1;
    1407             : 
    1408             : #ifdef WMOPS
    1409             :     multiCounter[currCounter].CL_move += 30;
    1410             : #endif
    1411     9223404 : }
    1412             : 
    1413             : /**
    1414             :  * \brief    Function performs a complex 32-point FFT
    1415             :  *           The FFT is performed inplace. The result of the FFT
    1416             :  *           is scaled by SCALEFACTOR32 bits.
    1417             :  *
    1418             :  *           WOPS with 32x16 bit multiplications:  752 cycles
    1419             :  *
    1420             :  * \param    [i/o] inp   complex input / output buffer; entries inp[0] .. inp[31]
    1421             :  *                       are read and then overwritten with the result
    1422             :  *                       (the signature annotates the input format as Qx)
    1423             :  *
    1424             :  * \return   void
    1425             :  */
    1426             : 
    1427             : 
    1428     1859336 : static void fft32_with_cmplx_data( cmplx *inp /*Qx*/ )
    1429             : {
    1430             :     cmplx x[32], y[32], t[32], s[32], temp, temp1; /* x: shifted stage inputs, t / s: butterfly temporaries, y: outputs of the FFT8 stages; only indices 0..7 of x, t and s are used */
    1431     1859336 :     const cmplx_s *pRotVector_32 = (const cmplx_s *) RotVector_32; /* rotation table RotVector_32 accessed as cmplx_s entries */
    1432             :     /* Structure: four 8-point FFTs over inp[k], inp[k + 4], ..., inp[k + 28] ( k = 0..3 ) fill y[], y[] is then scaled / rotated, and eight 4-point FFTs write the result back to inp[] */
    1433             :     /* 1. FFT8 stage: inp[0], inp[4], ..., inp[28] -> y[0..7] */
    1434             : 
    1435     1859336 :     x[0] = CL_shr( inp[0], SCALEFACTOR32_1 ); // Qx - 5
    1436     1859336 :     x[1] = CL_shr( inp[4], SCALEFACTOR32_1 );
    1437     1859336 :     x[2] = CL_shr( inp[8], SCALEFACTOR32_1 );
    1438     1859336 :     x[3] = CL_shr( inp[12], SCALEFACTOR32_1 );
    1439     1859336 :     x[4] = CL_shr( inp[16], SCALEFACTOR32_1 );
    1440     1859336 :     x[5] = CL_shr( inp[20], SCALEFACTOR32_1 );
    1441     1859336 :     x[6] = CL_shr( inp[24], SCALEFACTOR32_1 );
    1442     1859336 :     x[7] = CL_shr( inp[28], SCALEFACTOR32_1 );
    1443             : 
    1444             :     /* first butterfly layer: sum and difference of the inputs that are 4 slots apart in x[] */
    1445     1859336 :     t[0] = CL_add( x[0], x[4] );
    1446     1859336 :     t[1] = CL_sub( x[0], x[4] );
    1447     1859336 :     t[2] = CL_add( x[1], x[5] );
    1448     1859336 :     t[3] = CL_sub( x[1], x[5] );
    1449     1859336 :     t[4] = CL_add( x[2], x[6] );
    1450     1859336 :     t[5] = CL_sub( x[2], x[6] );
    1451     1859336 :     t[6] = CL_add( x[3], x[7] );
    1452     1859336 :     t[7] = CL_sub( x[3], x[7] );
    1453             : 
    1454             :     /* Pre-additions and core multiplications */
    1455             : 
    1456     1859336 :     s[0] = CL_add( t[0], t[4] );
    1457     1859336 :     s[2] = CL_sub( t[0], t[4] );
    1458     1859336 :     s[4] = CL_mac_j( t[1], t[5] );
    1459     1859336 :     s[5] = CL_msu_j( t[1], t[5] );
    1460     1859336 :     s[1] = CL_add( t[2], t[6] );
    1461     1859336 :     s[3] = CL_sub( t[2], t[6] );
    1462     1859336 :     s[3] = CL_mul_j( s[3] );
    1463             : 
    1464     1859336 :     temp = CL_add( t[3], t[7] );
    1465     1859336 :     temp1 = CL_sub( t[3], t[7] );
    1466     1859336 :     s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
    1467     1859336 :     s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
    1468             : 
    1469             :     /* Post-additions */
    1470     1859336 :     y[0] = CL_add( s[0], s[1] );
    1471     1859336 :     y[4] = CL_sub( s[0], s[1] );
    1472     1859336 :     y[2] = CL_sub( s[2], s[3] );
    1473     1859336 :     y[6] = CL_add( s[2], s[3] );
    1474     1859336 :     y[3] = CL_add( s[4], s[7] );
    1475     1859336 :     y[7] = CL_sub( s[4], s[7] );
    1476     1859336 :     y[1] = CL_add( s[5], s[6] );
    1477     1859336 :     y[5] = CL_sub( s[5], s[6] );
    1478             : 
    1479             :     /* 2. FFT8 stage: inp[1], inp[5], ..., inp[29] -> y[8..15] */
    1480             : 
    1481     1859336 :     x[0] = CL_shr( inp[1], SCALEFACTOR32_1 ); // Qx - 5
    1482     1859336 :     x[1] = CL_shr( inp[5], SCALEFACTOR32_1 );
    1483     1859336 :     x[2] = CL_shr( inp[9], SCALEFACTOR32_1 );
    1484     1859336 :     x[3] = CL_shr( inp[13], SCALEFACTOR32_1 );
    1485     1859336 :     x[4] = CL_shr( inp[17], SCALEFACTOR32_1 );
    1486     1859336 :     x[5] = CL_shr( inp[21], SCALEFACTOR32_1 );
    1487     1859336 :     x[6] = CL_shr( inp[25], SCALEFACTOR32_1 );
    1488     1859336 :     x[7] = CL_shr( inp[29], SCALEFACTOR32_1 );
    1489             : 
    1490             : 
    1491     1859336 :     t[0] = CL_add( x[0], x[4] );
    1492     1859336 :     t[1] = CL_sub( x[0], x[4] );
    1493     1859336 :     t[2] = CL_add( x[1], x[5] );
    1494     1859336 :     t[3] = CL_sub( x[1], x[5] );
    1495     1859336 :     t[4] = CL_add( x[2], x[6] );
    1496     1859336 :     t[5] = CL_sub( x[2], x[6] );
    1497     1859336 :     t[6] = CL_add( x[3], x[7] );
    1498     1859336 :     t[7] = CL_sub( x[3], x[7] );
    1499             : 
    1500             :     /* Pre-additions and core multiplications */
    1501             : 
    1502     1859336 :     s[0] = CL_add( t[0], t[4] );
    1503     1859336 :     s[2] = CL_sub( t[0], t[4] );
    1504     1859336 :     s[4] = CL_mac_j( t[1], t[5] );
    1505     1859336 :     s[5] = CL_msu_j( t[1], t[5] );
    1506     1859336 :     s[1] = CL_add( t[2], t[6] );
    1507     1859336 :     s[3] = CL_sub( t[2], t[6] );
    1508     1859336 :     s[3] = CL_mul_j( s[3] );
    1509             : 
    1510     1859336 :     temp = CL_add( t[3], t[7] );
    1511     1859336 :     temp1 = CL_sub( t[3], t[7] );
    1512     1859336 :     s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
    1513     1859336 :     s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
    1514             : 
    1515             :     /* Post-additions */
    1516             : 
    1517     1859336 :     y[8] = CL_add( s[0], s[1] );
    1518     1859336 :     y[12] = CL_sub( s[0], s[1] );
    1519     1859336 :     y[10] = CL_sub( s[2], s[3] );
    1520     1859336 :     y[14] = CL_add( s[2], s[3] );
    1521     1859336 :     y[11] = CL_add( s[4], s[7] );
    1522     1859336 :     y[15] = CL_sub( s[4], s[7] );
    1523     1859336 :     y[9] = CL_add( s[5], s[6] );
    1524     1859336 :     y[13] = CL_sub( s[5], s[6] );
    1525             : 
    1526             :     /* 3. FFT8 stage: inp[2], inp[6], ..., inp[30] -> y[16..23] */
    1527             : 
    1528     1859336 :     x[0] = CL_shr( inp[2], SCALEFACTOR32_1 ); // Qx - 5
    1529     1859336 :     x[1] = CL_shr( inp[6], SCALEFACTOR32_1 );
    1530     1859336 :     x[2] = CL_shr( inp[10], SCALEFACTOR32_1 );
    1531     1859336 :     x[3] = CL_shr( inp[14], SCALEFACTOR32_1 );
    1532     1859336 :     x[4] = CL_shr( inp[18], SCALEFACTOR32_1 );
    1533     1859336 :     x[5] = CL_shr( inp[22], SCALEFACTOR32_1 );
    1534     1859336 :     x[6] = CL_shr( inp[26], SCALEFACTOR32_1 );
    1535     1859336 :     x[7] = CL_shr( inp[30], SCALEFACTOR32_1 );
    1536             : 
    1537             : 
    1538     1859336 :     t[0] = CL_add( x[0], x[4] );
    1539     1859336 :     t[1] = CL_sub( x[0], x[4] );
    1540     1859336 :     t[2] = CL_add( x[1], x[5] );
    1541     1859336 :     t[3] = CL_sub( x[1], x[5] );
    1542     1859336 :     t[4] = CL_add( x[2], x[6] );
    1543     1859336 :     t[5] = CL_sub( x[2], x[6] );
    1544     1859336 :     t[6] = CL_add( x[3], x[7] );
    1545     1859336 :     t[7] = CL_sub( x[3], x[7] );
    1546             : 
    1547             :     /* Pre-additions and core multiplications */
    1548             : 
    1549     1859336 :     s[0] = CL_add( t[0], t[4] );
    1550     1859336 :     s[2] = CL_sub( t[0], t[4] );
    1551     1859336 :     s[4] = CL_mac_j( t[1], t[5] );
    1552     1859336 :     s[5] = CL_msu_j( t[1], t[5] );
    1553     1859336 :     s[1] = CL_add( t[2], t[6] );
    1554     1859336 :     s[3] = CL_sub( t[2], t[6] );
    1555     1859336 :     s[3] = CL_mul_j( s[3] );
    1556             : 
    1557     1859336 :     temp = CL_add( t[3], t[7] );
    1558     1859336 :     temp1 = CL_sub( t[3], t[7] );
    1559     1859336 :     s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
    1560     1859336 :     s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
    1561             : 
    1562             :     /* Post-additions */
    1563             : 
    1564     1859336 :     y[16] = CL_add( s[0], s[1] );
    1565     1859336 :     y[20] = CL_sub( s[0], s[1] );
    1566     1859336 :     y[18] = CL_sub( s[2], s[3] );
    1567     1859336 :     y[22] = CL_add( s[2], s[3] );
    1568     1859336 :     y[19] = CL_add( s[4], s[7] );
    1569     1859336 :     y[23] = CL_sub( s[4], s[7] );
    1570     1859336 :     y[17] = CL_add( s[5], s[6] );
    1571     1859336 :     y[21] = CL_sub( s[5], s[6] );
    1572             : 
    1573             :     /* 4. FFT8 stage: inp[3], inp[7], ..., inp[31] -> y[24..31] */
    1574             : 
    1575     1859336 :     x[0] = CL_shr( inp[3], SCALEFACTOR32_1 ); // Qx - 5
    1576     1859336 :     x[1] = CL_shr( inp[7], SCALEFACTOR32_1 );
    1577     1859336 :     x[2] = CL_shr( inp[11], SCALEFACTOR32_1 );
    1578     1859336 :     x[3] = CL_shr( inp[15], SCALEFACTOR32_1 );
    1579     1859336 :     x[4] = CL_shr( inp[19], SCALEFACTOR32_1 );
    1580     1859336 :     x[5] = CL_shr( inp[23], SCALEFACTOR32_1 );
    1581     1859336 :     x[6] = CL_shr( inp[27], SCALEFACTOR32_1 );
    1582     1859336 :     x[7] = CL_shr( inp[31], SCALEFACTOR32_1 );
    1583             : 
    1584             : 
    1585     1859336 :     t[0] = CL_add( x[0], x[4] );
    1586     1859336 :     t[1] = CL_sub( x[0], x[4] );
    1587     1859336 :     t[2] = CL_add( x[1], x[5] );
    1588     1859336 :     t[3] = CL_sub( x[1], x[5] );
    1589     1859336 :     t[4] = CL_add( x[2], x[6] );
    1590     1859336 :     t[5] = CL_sub( x[2], x[6] );
    1591     1859336 :     t[6] = CL_add( x[3], x[7] );
    1592     1859336 :     t[7] = CL_sub( x[3], x[7] );
    1593             : 
    1594             : 
    1595             :     /* Pre-additions and core multiplications */
    1596             : 
    1597     1859336 :     s[0] = CL_add( t[0], t[4] );
    1598     1859336 :     s[2] = CL_sub( t[0], t[4] );
    1599     1859336 :     s[4] = CL_mac_j( t[1], t[5] );
    1600     1859336 :     s[5] = CL_msu_j( t[1], t[5] );
    1601     1859336 :     s[1] = CL_add( t[2], t[6] );
    1602     1859336 :     s[3] = CL_sub( t[2], t[6] );
    1603     1859336 :     s[3] = CL_mul_j( s[3] );
    1604             : 
    1605     1859336 :     temp = CL_add( t[3], t[7] );
    1606     1859336 :     temp1 = CL_sub( t[3], t[7] );
    1607     1859336 :     s[6] = CL_scale_t( CL_msu_j( temp1, temp ), C81 );
    1608     1859336 :     s[7] = CL_dscale_t( CL_swap_real_imag( CL_msu_j( temp, temp1 ) ), C81, C82 );
    1609             : 
    1610             :     /* Post-additions */
    1611             : 
    1612     1859336 :     y[24] = CL_add( s[0], s[1] );
    1613     1859336 :     y[28] = CL_sub( s[0], s[1] );
    1614     1859336 :     y[26] = CL_sub( s[2], s[3] );
    1615     1859336 :     y[30] = CL_add( s[2], s[3] );
    1616     1859336 :     y[27] = CL_add( s[4], s[7] );
    1617     1859336 :     y[31] = CL_sub( s[4], s[7] );
    1618     1859336 :     y[25] = CL_add( s[5], s[6] );
    1619     1859336 :     y[29] = CL_sub( s[5], s[6] );
    1620             : 
    1621             : 
    1622             :     /* apply twiddle factors: y[0..8], y[16], y[24] and y[20] are only right-shifted by SCALEFACTOR32_2 (no rotation-table multiply) */
    1623     1859336 :     y[0] = CL_shr( y[0], SCALEFACTOR32_2 );
    1624     1859336 :     y[1] = CL_shr( y[1], SCALEFACTOR32_2 );
    1625     1859336 :     y[2] = CL_shr( y[2], SCALEFACTOR32_2 );
    1626     1859336 :     y[3] = CL_shr( y[3], SCALEFACTOR32_2 );
    1627     1859336 :     y[4] = CL_shr( y[4], SCALEFACTOR32_2 );
    1628     1859336 :     y[5] = CL_shr( y[5], SCALEFACTOR32_2 );
    1629     1859336 :     y[6] = CL_shr( y[6], SCALEFACTOR32_2 );
    1630     1859336 :     y[7] = CL_shr( y[7], SCALEFACTOR32_2 );
    1631     1859336 :     y[8] = CL_shr( y[8], SCALEFACTOR32_2 );
    1632     1859336 :     y[16] = CL_shr( y[16], SCALEFACTOR32_2 );
    1633     1859336 :     y[24] = CL_shr( y[24], SCALEFACTOR32_2 );
    1634     1859336 :     y[20] = CL_shr( y[20], SCALEFACTOR32_2 );
    1635             : 
    1636             :     /* all remaining y[] entries: right shift by 1, then multiply with one of the 20 entries pRotVector_32[0..19] */
    1637     1859336 :     y[9] = CL_mult_32x16( ( CL_shr( y[9], 1 ) ), pRotVector_32[0] );
    1638     1859336 :     y[10] = CL_mult_32x16( ( CL_shr( y[10], 1 ) ), pRotVector_32[1] );
    1639     1859336 :     y[11] = CL_mult_32x16( ( CL_shr( y[11], 1 ) ), pRotVector_32[2] );
    1640     1859336 :     y[12] = CL_mult_32x16( ( CL_shr( y[12], 1 ) ), pRotVector_32[3] );
    1641     1859336 :     y[13] = CL_mult_32x16( ( CL_shr( y[13], 1 ) ), pRotVector_32[4] );
    1642     1859336 :     y[14] = CL_mult_32x16( ( CL_shr( y[14], 1 ) ), pRotVector_32[5] );
    1643     1859336 :     y[15] = CL_mult_32x16( ( CL_shr( y[15], 1 ) ), pRotVector_32[6] );
    1644     1859336 :     y[17] = CL_mult_32x16( ( CL_shr( y[17], 1 ) ), pRotVector_32[7] );
    1645     1859336 :     y[18] = CL_mult_32x16( ( CL_shr( y[18], 1 ) ), pRotVector_32[8] );
    1646     1859336 :     y[19] = CL_mult_32x16( ( CL_shr( y[19], 1 ) ), pRotVector_32[9] );
    1647     1859336 :     y[21] = CL_mult_32x16( ( CL_shr( y[21], 1 ) ), pRotVector_32[10] );
    1648     1859336 :     y[22] = CL_mult_32x16( ( CL_shr( y[22], 1 ) ), pRotVector_32[11] );
    1649     1859336 :     y[23] = CL_mult_32x16( ( CL_shr( y[23], 1 ) ), pRotVector_32[12] );
    1650     1859336 :     y[25] = CL_mult_32x16( ( CL_shr( y[25], 1 ) ), pRotVector_32[13] );
    1651     1859336 :     y[26] = CL_mult_32x16( ( CL_shr( y[26], 1 ) ), pRotVector_32[14] );
    1652     1859336 :     y[27] = CL_mult_32x16( ( CL_shr( y[27], 1 ) ), pRotVector_32[15] );
    1653     1859336 :     y[28] = CL_mult_32x16( ( CL_shr( y[28], 1 ) ), pRotVector_32[16] );
    1654     1859336 :     y[29] = CL_mult_32x16( ( CL_shr( y[29], 1 ) ), pRotVector_32[17] );
    1655     1859336 :     y[30] = CL_mult_32x16( ( CL_shr( y[30], 1 ) ), pRotVector_32[18] );
    1656     1859336 :     y[31] = CL_mult_32x16( ( CL_shr( y[31], 1 ) ), pRotVector_32[19] );
    1657             : 
    1658             :     /* 1. FFT4 stage: y[0], y[8], y[16], y[24] -> inp[0], inp[8], inp[16], inp[24] */
    1659             : 
    1660             :     /*  Pre-additions */
    1661     1859336 :     t[0] = CL_add( y[0], y[16] );
    1662     1859336 :     t[1] = CL_sub( y[0], y[16] );
    1663     1859336 :     t[2] = CL_add( y[8], y[24] );
    1664     1859336 :     t[3] = CL_mul_j( CL_sub( y[8], y[24] ) );
    1665             : 
    1666             :     /*  Post-additions */
    1667     1859336 :     inp[0] = CL_add( t[0], t[2] );
    1668     1859336 :     inp[8] = CL_sub( t[1], t[3] );
    1669     1859336 :     inp[16] = CL_sub( t[0], t[2] );
    1670     1859336 :     inp[24] = CL_add( t[1], t[3] );
    1671             : 
    1672             :     /* 2. FFT4 stage: y[1], y[9], y[17], y[25] -> inp[1], inp[9], inp[17], inp[25] */
    1673             : 
    1674             :     /*  Pre-additions */
    1675     1859336 :     t[0] = CL_add( y[1], y[17] );
    1676     1859336 :     t[1] = CL_sub( y[1], y[17] );
    1677     1859336 :     t[2] = CL_add( y[9], y[25] );
    1678     1859336 :     t[3] = CL_mul_j( CL_sub( y[9], y[25] ) );
    1679             : 
    1680             :     /*  Post-additions */
    1681     1859336 :     inp[1] = CL_add( t[0], t[2] );
    1682     1859336 :     inp[9] = CL_sub( t[1], t[3] );
    1683     1859336 :     inp[17] = CL_sub( t[0], t[2] );
    1684     1859336 :     inp[25] = CL_add( t[1], t[3] );
    1685             : 
    1686             : 
    1687             :     /* 3. FFT4 stage: y[2], y[10], y[18], y[26] -> inp[2], inp[10], inp[18], inp[26] */
    1688             : 
    1689             :     /*  Pre-additions */
    1690     1859336 :     t[0] = CL_add( y[2], y[18] );
    1691     1859336 :     t[1] = CL_sub( y[2], y[18] );
    1692     1859336 :     t[2] = CL_add( y[10], y[26] );
    1693     1859336 :     t[3] = CL_mul_j( CL_sub( y[10], y[26] ) );
    1694             : 
    1695             :     /*  Post-additions */
    1696     1859336 :     inp[2] = CL_add( t[0], t[2] );
    1697     1859336 :     inp[10] = CL_sub( t[1], t[3] );
    1698     1859336 :     inp[18] = CL_sub( t[0], t[2] );
    1699     1859336 :     inp[26] = CL_add( t[1], t[3] );
    1700             : 
    1701             : 
    1702             :     /* 4. FFT4 stage: y[3], y[11], y[19], y[27] -> inp[3], inp[11], inp[19], inp[27] */
    1703             : 
    1704             :     /*  Pre-additions */
    1705     1859336 :     t[0] = CL_add( y[3], y[19] );
    1706     1859336 :     t[1] = CL_sub( y[3], y[19] );
    1707     1859336 :     t[2] = CL_add( y[11], y[27] );
    1708     1859336 :     t[3] = CL_mul_j( CL_sub( y[11], y[27] ) );
    1709             : 
    1710             : 
    1711             :     /*  Post-additions */
    1712     1859336 :     inp[3] = CL_add( t[0], t[2] );
    1713     1859336 :     inp[11] = CL_sub( t[1], t[3] );
    1714     1859336 :     inp[19] = CL_sub( t[0], t[2] );
    1715     1859336 :     inp[27] = CL_add( t[1], t[3] );
    1716             : 
    1717             : 
    1718             :     /* 5. FFT4 stage: y[4], y[12], y[20], y[28] -> inp[4], inp[12], inp[20], inp[28] */
    1719             : 
    1720             :     /*  Pre-additions; unlike the other FFT4 stages, y[4] and y[20] are combined with CL_msu_j / CL_mac_j instead of CL_add / CL_sub. NOTE(review): y[20] received no rotation-table multiply above, so presumably its rotation is folded in here - confirm against the CL_* definitions */
    1721     1859336 :     t[0] = CL_msu_j( y[4], y[20] );
    1722     1859336 :     t[1] = CL_mac_j( y[4], y[20] );
    1723     1859336 :     t[2] = CL_add( y[12], y[28] );
    1724     1859336 :     t[3] = CL_mul_j( CL_sub( y[12], y[28] ) );
    1725             : 
    1726             : 
    1727             :     /*  Post-additions */
    1728     1859336 :     inp[4] = CL_add( t[0], t[2] );
    1729     1859336 :     inp[12] = CL_sub( t[1], t[3] );
    1730     1859336 :     inp[20] = CL_sub( t[0], t[2] );
    1731     1859336 :     inp[28] = CL_add( t[1], t[3] );
    1732             : 
    1733             : 
    1734             :     /* 6. FFT4 stage: y[5], y[13], y[21], y[29] -> inp[5], inp[13], inp[21], inp[29] */
    1735             : 
    1736             :     /*  Pre-additions */
    1737     1859336 :     t[0] = CL_add( y[5], y[21] );
    1738     1859336 :     t[1] = CL_sub( y[5], y[21] );
    1739     1859336 :     t[2] = CL_add( y[13], y[29] );
    1740     1859336 :     t[3] = CL_mul_j( CL_sub( y[13], y[29] ) );
    1741             : 
    1742             : 
    1743             :     /*  Post-additions */
    1744     1859336 :     inp[5] = CL_add( t[0], t[2] );
    1745     1859336 :     inp[13] = CL_sub( t[1], t[3] );
    1746     1859336 :     inp[21] = CL_sub( t[0], t[2] );
    1747     1859336 :     inp[29] = CL_add( t[1], t[3] );
    1748             : 
    1749             : 
    1750             :     /* 7. FFT4 stage: y[6], y[14], y[22], y[30] -> inp[6], inp[14], inp[22], inp[30] */
    1751             : 
    1752             :     /*  Pre-additions */
    1753     1859336 :     t[0] = CL_add( y[6], y[22] );
    1754     1859336 :     t[1] = CL_sub( y[6], y[22] );
    1755     1859336 :     t[2] = CL_add( y[14], y[30] );
    1756     1859336 :     t[3] = CL_mul_j( CL_sub( y[14], y[30] ) );
    1757             : 
    1758             : 
    1759             :     /*  Post-additions */
    1760     1859336 :     inp[6] = CL_add( t[0], t[2] );
    1761     1859336 :     inp[14] = CL_sub( t[1], t[3] );
    1762     1859336 :     inp[22] = CL_sub( t[0], t[2] );
    1763     1859336 :     inp[30] = CL_add( t[1], t[3] );
    1764             : 
    1765             : 
    1766             :     /* 8. FFT4 stage: y[7], y[15], y[23], y[31] -> inp[7], inp[15], inp[23], inp[31] */
    1767             : 
    1768             :     /*  Pre-additions */
    1769     1859336 :     t[0] = CL_add( y[7], y[23] );
    1770     1859336 :     t[1] = CL_sub( y[7], y[23] );
    1771     1859336 :     t[2] = CL_add( y[15], y[31] );
    1772     1859336 :     t[3] = CL_mul_j( CL_sub( y[15], y[31] ) );
    1773             : 
    1774             : 
    1775             :     /*  Post-additions */
    1776     1859336 :     inp[7] = CL_add( t[0], t[2] );
    1777     1859336 :     inp[15] = CL_sub( t[1], t[3] );
    1778     1859336 :     inp[23] = CL_sub( t[0], t[2] );
    1779     1859336 :     inp[31] = CL_add( t[1], t[3] );
    1780             : 
    1781             : #ifdef WMOPS
    1782             :     multiCounter[currCounter].CL_move += 32; /* instrumentation only: adds 32 to the CL_move counter when WMOPS is defined */
    1783             : #endif
    1784     1859336 : }
    1785             : 
    1786             : 
    1787             : /**
    1788             :  * \brief Combined FFT
    1789             :  *
    1790             :  * \param    [i/o] pComplexBuf  complex data buffer (real and imag part
    1791             :  *                              of each sample held in one cmplx element)
    1792             :  * \param    [i  ] W      rotation factor
    1793             :  * \param    [i  ] len    length of fft, must equal dim1 * dim2
    1794             :  * \param    [i  ] dim1   length of fft1
    1795             :  * \param    [i  ] dim2   length of fft2
    1796             :  *                        (dim1 and dim2 are restricted to the sizes asserted in the body)
    1797             :  * \param    [i  ] sc     stride phase rotation coefficients
    1798             :  * \param    [tmp] x      32-bit workbuffer of length=2*len
    1799             :  * \param    [i  ] Woff   offset for addressing the rotation vector table
    1800             :  *
    1801             :  * \return void
    1802             :  */
    1803             : 
    1804     1720770 : static void fftN2(
    1805             :     cmplx *__restrict pComplexBuf,
    1806             :     const Word16 *__restrict W,
    1807             :     Word16 len,
    1808             :     Word16 dim1,
    1809             :     Word16 dim2,
    1810             :     Word16 sc,
    1811             :     Word32 *x,
    1812             :     Word16 Woff )
    1813             : {
    1814             :     Word16 i, j;
    1815     1720770 :     cmplx *x_cmplx = (cmplx *) x;
    1816             : 
    1817     1720770 :     assert( len == ( dim1 * dim2 ) );
    1818     1720770 :     assert( ( dim1 == 3 ) || ( dim1 == 5 ) || ( dim1 == 8 ) || ( dim1 == 10 ) || ( dim1 == 15 ) || ( dim1 == 16 ) || ( dim1 == 20 ) || ( dim1 == 30 ) || ( dim1 == 32 ) );
    1819     1720770 :     assert( ( dim2 == 4 ) || ( dim2 == 8 ) || ( dim2 == 10 ) || ( dim2 == 12 ) || ( dim2 == 16 ) || ( dim2 == 20 ) );
    1820             : 
    1821    24653356 :     FOR( i = 0; i < dim2; i++ )
    1822             :     {
    1823   587606914 :         FOR( j = 0; j < dim1; j++ )
    1824             :         {
    1825   564674328 :             x_cmplx[i * dim1 + j] = pComplexBuf[i + j * dim2];
    1826             : #ifdef WMOPS
    1827             :             multiCounter[currCounter].CL_move++;
    1828             : #endif
    1829             :         }
    1830             :     }
    1831             : 
    1832     1720770 :     SWITCH( dim1 )
    1833             :     {
    1834        7118 :         case 5:
    1835       64062 :             FOR( i = 0; i < dim2; i++ )
    1836             :             {
    1837       56944 :                 fft5_with_cmplx_data( &x_cmplx[i * dim1] );
    1838             :             }
    1839        7118 :             BREAK;
    1840       13473 :         case 8:
    1841      121257 :             FOR( i = 0; i < dim2; i++ )
    1842             :             {
    1843      107784 :                 fft8_with_cmplx_data( &x_cmplx[i * dim1] );
    1844             :             }
    1845       13473 :             BREAK;
    1846       42634 :         case 10:
    1847      383706 :             FOR( i = 0; i < dim2; i++ )
    1848             :             {
    1849      341072 :                 fft10_with_cmplx_data( &x_cmplx[i * dim1] );
    1850             :             }
    1851       42634 :             BREAK;
    1852             : 
    1853       21498 :         case 15:
    1854      193482 :             FOR( i = 0; i < dim2; i++ )
    1855             :             {
    1856      171984 :                 fft15_with_cmplx_data( &x_cmplx[i * dim1] );
    1857             :             }
    1858       21498 :             BREAK;
    1859       49188 :         case 16:
    1860      442692 :             FOR( i = 0; i < dim2; i++ )
    1861             :             {
    1862      393504 :                 fft16_with_cmplx_data( &x_cmplx[i * dim1], 1 );
    1863             :             }
    1864       49188 :             BREAK;
    1865      778281 :         case 20:
    1866    11609963 :             FOR( i = 0; i < dim2; i++ )
    1867             :             {
    1868    10831682 :                 fft20_with_cmplx_data( &x_cmplx[i * dim1] );
    1869             :             }
    1870      778281 :             BREAK;
    1871      576161 :         case 30:
    1872     9746441 :             FOR( i = 0; i < dim2; i++ )
    1873             :             {
    1874     9170280 :                 fft30_with_cmplx_data( &x_cmplx[i * dim1] );
    1875             :             }
    1876      576161 :             BREAK;
    1877      232417 :         case 32:
    1878     2091753 :             FOR( i = 0; i < dim2; i++ )
    1879             :             {
    1880     1859336 :                 fft32_with_cmplx_data( &x_cmplx[i * dim1] );
    1881             :             }
    1882      232417 :             BREAK;
    1883             :     }
    1884             : 
    1885     1720770 :     SWITCH( dim2 )
    1886             :     {
    1887      575820 :         case 8:
    1888             :         {
    1889             :             cmplx y0, y1, y2, y3, y4, y5, y6, y7;
    1890             :             cmplx t0, t1, t2, t3, t4, t5, t6, t7;
    1891             :             cmplx s0, s1, s2, s3, s4, s5, s6, s7;
    1892             : 
    1893      575820 :             i = 0;
    1894      575820 :             move16();
    1895             :             {
    1896      575820 :                 y0 = CL_shr( x_cmplx[i + 0 * dim1], 1 );
    1897      575820 :                 y1 = CL_shr( x_cmplx[i + 1 * dim1], 1 );
    1898      575820 :                 y2 = CL_shr( x_cmplx[i + 2 * dim1], 1 );
    1899      575820 :                 y3 = CL_shr( x_cmplx[i + 3 * dim1], 1 );
    1900      575820 :                 y4 = CL_shr( x_cmplx[i + 4 * dim1], 1 );
    1901      575820 :                 y5 = CL_shr( x_cmplx[i + 5 * dim1], 1 );
    1902      575820 :                 y6 = CL_shr( x_cmplx[i + 6 * dim1], 1 );
    1903      575820 :                 y7 = CL_shr( x_cmplx[i + 7 * dim1], 1 );
    1904             : 
    1905      575820 :                 t0 = CL_shr( CL_add( y0, y4 ), SCALEFACTORN2 - 1 );
    1906      575820 :                 t1 = CL_shr( CL_sub( y0, y4 ), SCALEFACTORN2 - 1 );
    1907      575820 :                 t2 = CL_shr( CL_add( y1, y5 ), SCALEFACTORN2 - 1 );
    1908      575820 :                 t3 = CL_sub( y1, y5 );
    1909      575820 :                 t4 = CL_shr( CL_add( y2, y6 ), SCALEFACTORN2 - 1 );
    1910      575820 :                 t5 = CL_shr( CL_sub( y2, y6 ), SCALEFACTORN2 - 1 );
    1911      575820 :                 t6 = CL_shr( CL_add( y3, y7 ), SCALEFACTORN2 - 1 );
    1912      575820 :                 t7 = CL_sub( y3, y7 );
    1913             : 
    1914             : 
    1915      575820 :                 s0 = CL_add( t0, t4 );
    1916      575820 :                 s2 = CL_sub( t0, t4 );
    1917      575820 :                 s4 = CL_mac_j( t1, t5 );
    1918      575820 :                 s5 = CL_msu_j( t1, t5 );
    1919      575820 :                 s1 = CL_add( t2, t6 );
    1920      575820 :                 s3 = CL_mul_j( CL_sub( t2, t6 ) );
    1921      575820 :                 t0 = CL_shr( CL_add( t3, t7 ), SCALEFACTORN2 - 1 );
    1922      575820 :                 t1 = CL_shr( CL_sub( t3, t7 ), SCALEFACTORN2 - 1 );
    1923      575820 :                 s6 = CL_scale_t( CL_msu_j( t1, t0 ), C81 );
    1924      575820 :                 s7 = CL_dscale_t( CL_swap_real_imag( CL_msu_j( t0, t1 ) ), C81, C82 );
    1925             : 
    1926      575820 :                 pComplexBuf[i + 0 * dim1] = CL_add( s0, s1 );
    1927      575820 :                 pComplexBuf[i + 1 * dim1] = CL_add( s5, s6 );
    1928      575820 :                 pComplexBuf[i + 2 * dim1] = CL_sub( s2, s3 );
    1929      575820 :                 pComplexBuf[i + 3 * dim1] = CL_add( s4, s7 );
    1930      575820 :                 pComplexBuf[i + 4 * dim1] = CL_sub( s0, s1 );
    1931      575820 :                 pComplexBuf[i + 5 * dim1] = CL_sub( s5, s6 );
    1932      575820 :                 pComplexBuf[i + 6 * dim1] = CL_add( s2, s3 );
    1933      575820 :                 pComplexBuf[i + 7 * dim1] = CL_sub( s4, s7 );
    1934             :             }
    1935             : 
    1936             : 
    1937    13397086 :             FOR( i = 1; i < dim1; i++ )
    1938             :             {
    1939    12821266 :                 y0 = CL_shr( x_cmplx[i + 0 * dim1], 1 );
    1940    12821266 :                 y1 = CL_shr( CL_mult_32x16( x_cmplx[i + 1 * dim1], *(const cmplx_s *) &W[sc * i + sc * 1 * dim1 - Woff] ), 1 );
    1941    12821266 :                 y2 = CL_shr( CL_mult_32x16( x_cmplx[i + 2 * dim1], *(const cmplx_s *) &W[sc * i + sc * 2 * dim1 - Woff] ), 1 );
    1942    12821266 :                 y3 = CL_shr( CL_mult_32x16( x_cmplx[i + 3 * dim1], *(const cmplx_s *) &W[sc * i + sc * 3 * dim1 - Woff] ), 1 );
    1943    12821266 :                 y4 = CL_shr( CL_mult_32x16( x_cmplx[i + 4 * dim1], *(const cmplx_s *) &W[sc * i + sc * 4 * dim1 - Woff] ), 1 );
    1944    12821266 :                 y5 = CL_shr( CL_mult_32x16( x_cmplx[i + 5 * dim1], *(const cmplx_s *) &W[sc * i + sc * 5 * dim1 - Woff] ), 1 );
    1945    12821266 :                 y6 = CL_shr( CL_mult_32x16( x_cmplx[i + 6 * dim1], *(const cmplx_s *) &W[sc * i + sc * 6 * dim1 - Woff] ), 1 );
    1946    12821266 :                 y7 = CL_shr( CL_mult_32x16( x_cmplx[i + 7 * dim1], *(const cmplx_s *) &W[sc * i + sc * 7 * dim1 - Woff] ), 1 );
    1947             : 
    1948    12821266 :                 t0 = CL_shr( CL_add( y0, y4 ), SCALEFACTORN2 - 1 );
    1949    12821266 :                 t1 = CL_shr( CL_sub( y0, y4 ), SCALEFACTORN2 - 1 );
    1950    12821266 :                 t2 = CL_shr( CL_add( y1, y5 ), SCALEFACTORN2 - 1 );
    1951    12821266 :                 t3 = CL_sub( y1, y5 );
    1952    12821266 :                 t4 = CL_shr( CL_add( y2, y6 ), SCALEFACTORN2 - 1 );
    1953    12821266 :                 t5 = CL_shr( CL_sub( y2, y6 ), SCALEFACTORN2 - 1 );
    1954    12821266 :                 t6 = CL_shr( CL_add( y3, y7 ), SCALEFACTORN2 - 1 );
    1955    12821266 :                 t7 = CL_sub( y3, y7 );
    1956             : 
    1957             : 
    1958    12821266 :                 s0 = CL_add( t0, t4 );
    1959    12821266 :                 s2 = CL_sub( t0, t4 );
    1960    12821266 :                 s4 = CL_mac_j( t1, t5 );
    1961    12821266 :                 s5 = CL_msu_j( t1, t5 );
    1962    12821266 :                 s1 = CL_add( t2, t6 );
    1963    12821266 :                 s3 = CL_mul_j( CL_sub( t2, t6 ) );
    1964    12821266 :                 t0 = CL_shr( CL_add( t3, t7 ), SCALEFACTORN2 - 1 );
    1965    12821266 :                 t1 = CL_shr( CL_sub( t3, t7 ), SCALEFACTORN2 - 1 );
    1966    12821266 :                 s6 = CL_scale_t( CL_msu_j( t1, t0 ), C81 );
    1967    12821266 :                 s7 = CL_dscale_t( CL_swap_real_imag( CL_msu_j( t0, t1 ) ), C81, C82 );
    1968             : 
    1969    12821266 :                 pComplexBuf[i + 0 * dim1] = CL_add( s0, s1 );
    1970    12821266 :                 pComplexBuf[i + 1 * dim1] = CL_add( s5, s6 );
    1971    12821266 :                 pComplexBuf[i + 2 * dim1] = CL_sub( s2, s3 );
    1972    12821266 :                 pComplexBuf[i + 3 * dim1] = CL_add( s4, s7 );
    1973    12821266 :                 pComplexBuf[i + 4 * dim1] = CL_sub( s0, s1 );
    1974    12821266 :                 pComplexBuf[i + 5 * dim1] = CL_sub( s5, s6 );
    1975    12821266 :                 pComplexBuf[i + 6 * dim1] = CL_add( s2, s3 );
    1976    12821266 :                 pComplexBuf[i + 7 * dim1] = CL_sub( s4, s7 );
    1977             :             }
    1978             : 
    1979      575820 :             BREAK;
    1980             :         }
    1981             : 
    1982        4529 :         case 10:
    1983             :         {
    1984             :             cmplx y[20];
    1985             :             {
    1986       49819 :                 FOR( j = 0; j < dim2; j++ )
    1987             :                 {
    1988       45290 :                     y[j] = CL_move( x_cmplx[j * dim1] );
    1989             :                 }
    1990        4529 :                 fft10_with_cmplx_data( &y[0] );
    1991       49819 :                 FOR( j = 0; j < dim2; j++ )
    1992             :                 {
    1993       45290 :                     pComplexBuf[j * dim1] = y[j];
    1994             :                 }
    1995       90580 :                 FOR( i = 1; i < dim1; i++ )
    1996             :                 {
    1997       86051 :                     y[0] = CL_move( x_cmplx[i] );
    1998      860510 :                     FOR( j = 1; j < dim2; j++ )
    1999             :                     {
    2000      774459 :                         y[j] = CL_mult_32x16( x_cmplx[i + j * dim1], *(const cmplx_s *) &W[sc * i + sc * j * dim1 - Woff] );
    2001             :                     }
    2002       86051 :                     fft10_with_cmplx_data( &y[0] );
    2003      946561 :                     FOR( j = 0; j < dim2; j++ )
    2004             :                     {
    2005      860510 :                         pComplexBuf[i + j * dim1] = y[j];
    2006             :                     }
    2007             :                 }
    2008             :             }
    2009        4529 :             BREAK;
    2010             :         }
    2011     1131921 :         case 16:
    2012             :         {
    2013             :             cmplx y[20];
    2014             : 
    2015    19242657 :             FOR( j = 0; j < dim2; j++ )
    2016             :             {
    2017    18110736 :                 y[j] = CL_shr( x_cmplx[0 + j * dim1], SCALEFACTOR16 );
    2018             :             }
    2019     1131921 :             fft16_with_cmplx_data( &y[0], 0 );
    2020             : 
    2021    19242657 :             FOR( j = 0; j < dim2; j++ )
    2022             :             {
    2023    18110736 :                 pComplexBuf[j * dim1] = y[j];
    2024             :             }
    2025    28248640 :             FOR( i = 1; i < dim1; i++ )
    2026             :             {
    2027    27116719 :                 y[0] = CL_shr( x_cmplx[i + ( 0 + 0 ) * dim1], SCALEFACTOR16 );
    2028    27116719 :                 y[1] = CL_shr( CL_mult_32x16( x_cmplx[i + dim1], *(const cmplx_s *) &W[len + sc * i + 0 * dim1 - Woff] ), SCALEFACTOR16 );
    2029             : 
    2030   216933752 :                 FOR( j = 2; j < dim2; j = j + 2 )
    2031             :                 {
    2032   189817033 :                     y[( j + 0 )] = CL_shr( CL_mult_32x16( x_cmplx[i + ( j + 0 ) * dim1], *(const cmplx_s *) &W[sc * i + j * dim1 - Woff] ), SCALEFACTOR16 );
    2033   189817033 :                     y[( j + 1 )] = CL_shr( CL_mult_32x16( x_cmplx[i + ( j + 1 ) * dim1], *(const cmplx_s *) &W[len + sc * i + j * dim1 - Woff] ), SCALEFACTOR16 );
    2034             :                 }
    2035    27116719 :                 fft16_with_cmplx_data( &y[0], 0 );
    2036   460984223 :                 FOR( j = 0; j < dim2; j++ )
    2037             :                 {
    2038   433867504 :                     pComplexBuf[i + j * dim1] = y[j];
    2039             :                 }
    2040             :             }
    2041             :         }
    2042     1131921 :             BREAK;
    2043             : 
    2044        8500 :         case 20:
    2045             : 
    2046        8500 :             assert( dim1 == 20 || dim1 == 30 ); /* cplxMpy4_10_0 contains shift values hardcoded FOR 20x10 */
    2047        8500 :             IF( EQ_16( dim1, 20 ) )
    2048             :             {
    2049             :                 cmplx y[20];
    2050       51072 :                 FOR( j = 0; j < dim2; j++ )
    2051             :                 {
    2052       48640 :                     y[j] = CL_move( x_cmplx[j * dim1] );
    2053             :                 }
    2054        2432 :                 fft20_with_cmplx_data( &y[0] );
    2055       51072 :                 FOR( j = 0; j < dim2; j++ )
    2056             :                 {
    2057       48640 :                     pComplexBuf[j * dim1] = y[j];
    2058             :                 }
    2059       48640 :                 FOR( i = 1; i < dim1; i++ )
    2060             :                 {
    2061       46208 :                     y[0] = CL_move( x_cmplx[i] );
    2062       46208 :                     y[1] = CL_mult_32x16( x_cmplx[i + dim1], *(const cmplx_s *) &W[len + sc * i + 0 * dim1 - Woff] );
    2063      462080 :                     FOR( j = 2; j < dim2; j = j + 2 )
    2064             :                     {
    2065             : 
    2066      415872 :                         y[j + 0] = CL_mult_32x16( x_cmplx[i + ( j + 0 ) * dim1], *(const cmplx_s *) &W[sc * i + j * dim1 - Woff] );
    2067      415872 :                         y[j + 1] = CL_mult_32x16( x_cmplx[i + ( j + 1 ) * dim1], *(const cmplx_s *) &W[len + sc * i + j * dim1 - Woff] );
    2068             :                     }
    2069       46208 :                     fft20_with_cmplx_data( &y[0] );
    2070      970368 :                     FOR( j = 0; j < dim2; j++ )
    2071             :                     {
    2072      924160 :                         pComplexBuf[i + j * dim1] = y[j];
    2073             :                     }
    2074             :                 }
    2075             :             }
    2076             :             ELSE
    2077             :             {
    2078             :                 cmplx y[20];
    2079      127428 :                 FOR( j = 0; j < dim2; j++ )
    2080             :                 {
    2081      121360 :                     y[j] = CL_shl( x_cmplx[j * dim1], ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2082             :                 }
    2083        6068 :                 fft20_with_cmplx_data( &y[0] );
    2084      127428 :                 FOR( j = 0; j < dim2; j++ )
    2085             :                 {
    2086      121360 :                     pComplexBuf[j * dim1] = y[j];
    2087             :                 }
    2088      182040 :                 FOR( i = 1; i < dim1; i++ )
    2089             :                 {
    2090      175972 :                     y[0] = CL_shl( x_cmplx[i], ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2091      175972 :                     y[1] = CL_shl( CL_mult_32x16( x_cmplx[i + dim1], *(const cmplx_s *) &W[len + sc * i + 0 * dim1 - Woff] ), ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2092     1759720 :                     FOR( j = 2; j < dim2; j = j + 2 )
    2093             :                     {
    2094             : 
    2095     1583748 :                         y[j + 0] = CL_shl( CL_mult_32x16( x_cmplx[i + ( j + 0 ) * dim1], *(const cmplx_s *) &W[sc * i + j * dim1 - Woff] ), ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2096     1583748 :                         y[j + 1] = CL_shl( CL_mult_32x16( x_cmplx[i + ( j + 1 ) * dim1], *(const cmplx_s *) &W[len + sc * i + j * dim1 - Woff] ), ( SCALEFACTOR30 - SCALEFACTOR20 ) );
    2097             :                     }
    2098      175972 :                     fft20_with_cmplx_data( &y[0] );
    2099     3695412 :                     FOR( j = 0; j < dim2; j++ )
    2100             :                     {
    2101     3519440 :                         pComplexBuf[i + j * dim1] = y[j];
    2102             :                     }
    2103             :                 }
    2104             :             }
    2105        8500 :             BREAK;
    2106             :     }
    2107             : #ifdef WMOPS
    2108             :     multiCounter[currCounter].CL_move += len;
    2109             : #endif
    2110     1720770 : }
    2111             : 
    2112             : 
    2113             : /**
    2114             :  * \brief Complex valued FFT; dispatches on sizeOfFft to a fixed-length kernel or to fftN2()
    2115             :  *
    2116             :  * \param    [i/o] pComplexBuf complex input/output buffer
    2117             :  * \param    [i  ] sizeOfFft   length of fft (one of the SWITCH cases below; any other value hits assert( 0 ))
    2118             :  * \param    [i/o] scale       scalefactor; on return it holds the input value plus the SCALEFACTOR constant of the length
    2119             :  * \param    [tmp] x           32-bit workbuffer, handed to fftN2() for the composite lengths 40..600
    2120             :  *
    2121             :  * \return void
    2122             :  *         NOTE(review): with asserts disabled, an unsupported length calls no FFT routine and stores 0 in *scale.
    2123             :  */
    2124     1977014 : void BASOP_cfft( cmplx *pComplexBuf, Word16 sizeOfFft, Word16 *scale, Word32 x[2 * BASOP_CFFT_MAX_LENGTH] )
    2125             : {
    2126             :     Word16 s;
    2127     1977014 :     s = 0;
    2128     1977014 :     move16();
    2129     1977014 :     SWITCH( sizeOfFft )
    2130             :     {
    2131           0 :         case 5:
    2132           0 :             fft5_with_cmplx_data( pComplexBuf );
    2133           0 :             s = add( *scale, SCALEFACTOR5 );
    2134           0 :             BREAK;
    2135             : 
    2136       64672 :         case 8:
    2137       64672 :             fft8_with_cmplx_data( pComplexBuf );
    2138       64672 :             s = add( *scale, SCALEFACTOR8 );
    2139       64672 :             BREAK;
    2140             : 
    2141      105812 :         case 10:
    2142      105812 :             fft10_with_cmplx_data( pComplexBuf );
    2143      105812 :             s = add( *scale, SCALEFACTOR10 );
    2144      105812 :             BREAK;
    2145             : 
    2146           0 :         case 16:
    2147           0 :             fft16_with_cmplx_data( pComplexBuf, 1 );
    2148           0 :             s = add( *scale, SCALEFACTOR16 );
    2149           0 :             BREAK;
    2150             : 
    2151       32636 :         case 20:
    2152       32636 :             fft20_with_cmplx_data( pComplexBuf );
    2153       32636 :             s = add( *scale, SCALEFACTOR20 );
    2154       32636 :             BREAK;
    2155             : 
    2156       53124 :         case 30:
    2157       53124 :             fft30_with_cmplx_data( pComplexBuf );
    2158       53124 :             s = add( *scale, SCALEFACTOR30 );
    2159       53124 :             BREAK;
    2160             : 
    2161           0 :         case 32:
    2162           0 :             fft32_with_cmplx_data( pComplexBuf );
    2163           0 :             s = add( *scale, SCALEFACTOR32 );
    2164           0 :             BREAK;
    2165             : 
    2166        7118 :         case 40:
    2167             :         {
    2168        7118 :             fftN2( pComplexBuf, RotVector_320, 40, 5, 8, 8, x, 40 ); /* arguments: W table, len, dim1, dim2, sc, work buffer, Woff */
    2169        7118 :             s = add( *scale, SCALEFACTOR40 );
    2170        7118 :             BREAK;
    2171             :         }
    2172             : 
    2173       13473 :         case 64:
    2174             :         {
    2175       13473 :             fftN2( pComplexBuf, RotVector_256, 64, 8, 8, 8, x, 64 );
    2176       13473 :             s = add( *scale, SCALEFACTOR64 );
    2177       13473 :             BREAK;
    2178             :         }
    2179             : 
    2180       42634 :         case 80:
    2181             :         {
    2182       42634 :             fftN2( pComplexBuf, RotVector_320, 80, 10, 8, 4, x, 40 );
    2183       42634 :             s = add( *scale, SCALEFACTOR80 );
    2184       42634 :             BREAK;
    2185             :         }
    2186           0 :         case 100:
    2187             :         {
    2188           0 :             fftN2( pComplexBuf, RotVector_400, 100, 10, 10, 4, x, 40 );
    2189           0 :             s = add( *scale, SCALEFACTOR100 );
    2190           0 :             BREAK;
    2191             :         }
    2192       21498 :         case 120:
    2193             :         {
    2194       21498 :             fftN2( pComplexBuf, RotVector_480, 120, 15, 8, 4, x, 60 );
    2195       21498 :             s = add( *scale, SCALEFACTOR120 );
    2196       21498 :             BREAK;
    2197             :         }
    2198             : 
    2199       49188 :         case 128:
    2200             :         {
    2201       49188 :             fftN2( pComplexBuf, RotVector_256, 128, 16, 8, 4, x, 64 );
    2202       49188 :             s = add( *scale, SCALEFACTOR128 );
    2203       49188 :             BREAK;
    2204             :         }
    2205             : 
    2206      200421 :         case 160:
    2207             :         {
    2208      200421 :             fftN2( pComplexBuf, RotVector_320, 160, 20, 8, 2, x, 40 );
    2209      200421 :             s = add( *scale, SCALEFACTOR160 );
    2210      200421 :             BREAK;
    2211             :         }
    2212             : 
    2213        4529 :         case 200:
    2214             :         {
    2215        4529 :             fftN2( pComplexBuf, RotVector_400, 200, 20, 10, 2, x, 40 );
    2216        4529 :             s = add( *scale, SCALEFACTOR200 );
    2217        4529 :             BREAK;
    2218             :         }
    2219             : 
    2220        9071 :         case 240:
    2221             :         {
    2222        9071 :             fftN2( pComplexBuf, RotVector_480, 240, 30, 8, 2, x, 60 );
    2223        9071 :             s = add( *scale, SCALEFACTOR240 );
    2224        9071 :             BREAK;
    2225             :         }
    2226             : 
    2227      232417 :         case 256:
    2228             :         {
    2229      232417 :             fftN2( pComplexBuf, RotVector_256, 256, 32, 8, 2, x, 64 );
    2230      232417 :             s = add( *scale, SCALEFACTOR256 );
    2231      232417 :             BREAK;
    2232             :         }
    2233             : 
    2234      570899 :         case 320:
    2235             :         {
    2236      570899 :             fftN2( pComplexBuf, RotVector_320, 320, 20, 16, 2, x, 40 );
    2237      570899 :             s = add( *scale, SCALEFACTOR320 );
    2238      570899 :             BREAK;
    2239             :         }
    2240             : 
    2241        2432 :         case 400:
    2242             :         {
    2243        2432 :             fftN2( pComplexBuf, RotVector_400, 400, 20, 20, 2, x, 40 );
    2244        2432 :             s = add( *scale, SCALEFACTOR400 );
    2245        2432 :             BREAK;
    2246             :         }
    2247             : 
    2248      561022 :         case 480:
    2249             :         {
    2250      561022 :             fftN2( pComplexBuf, RotVector_480, 480, 30, 16, 2, x, 60 );
    2251      561022 :             s = add( *scale, SCALEFACTOR480 );
    2252      561022 :             BREAK;
    2253             :         }
    2254        6068 :         case 600:
    2255             :         {
    2256        6068 :             fftN2( pComplexBuf, RotVector_600, 600, 30, 20, 2, x, 60 );
    2257        6068 :             s = add( *scale, SCALEFACTOR600 );
    2258        6068 :             BREAK;
    2259             :         }
    2260           0 :         default:
    2261           0 :             assert( 0 );
    2262             :     }
    2263     1977014 :     *scale = s; /* hand the updated scale factor back to the caller */
    2264     1977014 :     move16();
    2265     1977014 : }
    2266             : 
    2267             : /* RFFT_TWIDDLE1: loads x[2i], x[2i+1], x[sizeOfFft-2i], x[sizeOfFft-2i+1] shifted right by 2 into xb0, xb1, xt0, xt1; sets t3 + j*t4 = ( ( xb0 - xt0 ) + j*( xb1 + xt1 ) ) * ( w1 + j*w2 ) via Mpy_32_16_1, then t1 = xb0 + xt0 and t2 = xb1 - xt1. Reads i and sizeOfFft from the enclosing scope. */
    2268             : #define RFFT_TWIDDLE1( x, t1, t2, t3, t4, w1, w2, xb0, xb1, xt0, xt1 ) \
    2269             :     {                                                                  \
    2270             :         xb0 = L_shr( x[2 * i + 0], 2 );                                \
    2271             :         xb1 = L_shr( x[2 * i + 1], 2 );                                \
    2272             :         xt0 = L_shr( x[sizeOfFft - 2 * i + 0], 2 );                    \
    2273             :         xt1 = L_shr( x[sizeOfFft - 2 * i + 1], 2 );                    \
    2274             :         t1 = L_sub( xb0, xt0 );                                        \
    2275             :         t2 = L_add( xb1, xt1 );                                        \
    2276             :         t3 = L_sub( Mpy_32_16_1( t1, w1 ), Mpy_32_16_1( t2, w2 ) );    \
    2277             :         t4 = L_add( Mpy_32_16_1( t1, w2 ), Mpy_32_16_1( t2, w1 ) );    \
    2278             :         t1 = L_add( xb0, xt0 );                                        \
    2279             :         t2 = L_sub( xb1, xt1 );                                        \
    2280             :     }
    2281             : /* RFFT_TWIDDLE2: same loads and final t1/t2 as RFFT_TWIDDLE1, but the product uses the conjugated factor: t3 + j*t4 = ( ( xb0 - xt0 ) + j*( xb1 + xt1 ) ) * ( w1 - j*w2 ). Reads i and sizeOfFft from the enclosing scope. */
    2282             : #define RFFT_TWIDDLE2( x, t1, t2, t3, t4, w1, w2, xb0, xb1, xt0, xt1 ) \
    2283             :     {                                                                  \
    2284             :         xb0 = L_shr( x[2 * i + 0], 2 );                                \
    2285             :         xb1 = L_shr( x[2 * i + 1], 2 );                                \
    2286             :         xt0 = L_shr( x[sizeOfFft - 2 * i + 0], 2 );                    \
    2287             :         xt1 = L_shr( x[sizeOfFft - 2 * i + 1], 2 );                    \
    2288             :         t1 = L_sub( xb0, xt0 );                                        \
    2289             :         t2 = L_add( xb1, xt1 );                                        \
    2290             :         t3 = L_add( Mpy_32_16_1( t1, w1 ), Mpy_32_16_1( t2, w2 ) );    \
    2291             :         t4 = L_sub( Mpy_32_16_1( t2, w1 ), Mpy_32_16_1( t1, w2 ) );    \
    2292             :         t1 = L_add( xb0, xt0 );                                        \
    2293             :         t2 = L_sub( xb1, xt1 );                                        \
    2294             :     }
    2295             : 
    2296             : /**
    2297             :  * \brief Real valued FFT
    2298             :  *
    2299             :  *        forward rFFT (isign == -1):
    2300             :  *        The input vector contains sizeOfFft real valued time samples. The output vector contains sizeOfFft/2 complex valued
    2301             :  *        spectral values. The spectral values reside interleaved in the output vector. x[1] contains the real part of
    2302             :  *        bin sizeOfFft/2 (interleaved index sizeOfFft), because the imaginary part of bin 0 is zero by default. This allows use of a sizeOfFft length buffer instead of sizeOfFft+1.
    2303             :  *
    2304             :  *        inverse rFFT (isign == +1):
    2305             :  *        The input vector contains sizeOfFft complex valued spectral values. The output vector contains sizeOfFft real valued
    2306             :  *        time samples. The spectral values reside interleaved in the input vector. x[1] contains the real part of bin sizeOfFft/2 (interleaved index sizeOfFft).
    2307             :  *        (see also forward rFFT)
    2308             :  *
    2309             :  * \param    [i/o] x           real input / real and imag output interleaved
    2310             :  * \param    [i  ] sizeOfFft   length of fft
    2311             :  * \param    [i  ] scale       scalefactor
    2312             :  * \param    [i  ] isign       forward (-1) / backward (+1)
    2313             :  *
    2314             :  * \return void
    2315             :  */
    2316       23221 : void BASOP_rfft( Word32 *x, Word16 sizeOfFft, Word16 *scale, Word16 isign )
    2317             : { /* Real FFT of length sizeOfFft, done in place on x: a complex FFT of length sizeOfFft/2 (BASOP_cfft) combined with a twiddle pass. */
    2318       23221 :     Word16 i, s = 0, sizeOfFft2, sizeOfFft4, sizeOfFft8, wstride; /* s starts at 0 to silence compiler warnings; it is only assigned in the backward (+1) branch */
    2319             :     Word32 t1, t2, t3, t4, xb0, xb1, xt0, xt1; /* temporaries handed to the RFFT_TWIDDLE macros; xb0/xb1 also hold the shifted x[0]/x[1] */
    2320             :     const PWord16 *w1; /* twiddle table pointer, filled in by BASOP_getTables */
    2321             :     Word16 c1; /* multiplier applied to even-index slots in the backward branch (sizes 40/80/320/640) */
    2322             :     Word16 c2; /* multiplier applied to odd-index slots in the backward branch (sizes 40/80/320/640) */
    2323             :     Word32 workBuffer[2 * BASOP_CFFT_MAX_LENGTH]; /* scratch buffer passed to BASOP_cfft */
    2324             : 
    2325             : 
    2326       23221 :     sizeOfFft2 = shr( sizeOfFft, 1 ); /* sizeOfFft / 2: length of the complex FFT */
    2327       23221 :     sizeOfFft4 = shr( sizeOfFft, 2 ); /* sizeOfFft / 4: end of the twiddle loops */
    2328       23221 :     sizeOfFft8 = shr( sizeOfFft, 3 ); /* sizeOfFft / 8: split point between the two twiddle loops */
    2329             : 
    2330       23221 :     BASOP_getTables( NULL, &w1, &wstride, sizeOfFft2 ); /* requests w1 and wstride for the half length; NULL is passed for the first output (helper not visible here - presumably means "not needed") */
    2331             : 
    2332       23221 :     SWITCH( isign ) /* NOTE(review): there is no default case; an isign other than -1/+1 appears to leave x and *scale unmodified */
    2333             :     {
    2334          15 :         case -1: /* forward transform */
    2335             : /* The half-length complex FFT runs first; the real-FFT post-processing follows. */
    2336          15 :             BASOP_cfft( (cmplx *) x, sizeOfFft2, scale, workBuffer );
    2337             : /* Slots 0 and 1: both are halved, then replaced by their sum and difference. */
    2338          15 :             xb0 = L_shr( x[0], 1 );
    2339          15 :             xb1 = L_shr( x[1], 1 );
    2340          15 :             x[0] = L_add( xb0, xb1 );
    2341          15 :             move32();
    2342          15 :             x[1] = L_sub( xb0, xb1 );
    2343          15 :             move32();
    2344             : /* First part, i in [1, sizeOfFft8): twiddle read at w1[i * wstride], with .v.im passed first and .v.re second. */
    2345        1088 :             FOR( i = 1; i < sizeOfFft8; i++ )
    2346             :             {
    2347        1073 :                 RFFT_TWIDDLE1( x, t1, t2, t3, t4, w1[i * wstride].v.im, w1[i * wstride].v.re, xb0, xb1, xt0, xt1 ) /* NOTE(review): macro defined elsewhere and not passed i or sizeOfFft - presumably captures them from this scope, confirm before renaming locals */
    2348        1073 :                 x[2 * i] = L_sub( t1, t3 );
    2349        1073 :                 move32();
    2350        1073 :                 x[2 * i + 1] = L_sub( t2, t4 );
    2351        1073 :                 move32();
    2352        1073 :                 x[sizeOfFft - 2 * i] = L_add( t1, t3 ); /* mirrored pair at the upper end of the buffer */
    2353        1073 :                 move32();
    2354        1073 :                 x[sizeOfFft - 2 * i + 1] = L_negate( L_add( t2, t4 ) );
    2355        1073 :                 move32();
    2356             :             }
    2357             : /* Second part, i in [sizeOfFft8, sizeOfFft4): table index mirrored to ( sizeOfFft4 - i ) and .v.re/.v.im passed in swapped order. */
    2358        1103 :             FOR( i = sizeOfFft8; i < sizeOfFft4; i++ )
    2359             :             {
    2360        1088 :                 RFFT_TWIDDLE1( x, t1, t2, t3, t4, w1[( sizeOfFft4 - i ) * wstride].v.re, w1[( sizeOfFft4 - i ) * wstride].v.im, xb0, xb1, xt0, xt1 )
    2361        1088 :                 x[2 * i] = L_sub( t1, t3 );
    2362        1088 :                 move32();
    2363        1088 :                 x[2 * i + 1] = L_sub( t2, t4 );
    2364        1088 :                 move32();
    2365        1088 :                 x[sizeOfFft - 2 * i] = L_add( t1, t3 );
    2366        1088 :                 move32();
    2367        1088 :                 x[sizeOfFft - 2 * i + 1] = L_negate( L_add( t2, t4 ) );
    2368        1088 :                 move32();
    2369             :             }
    2370             : /* After the loop i equals sizeOfFft4 (FOR assumed to behave like a plain for). For sizeOfFft divisible by 4 both indices below address the same pair: it is halved and its odd slot negated. */
    2371          15 :             x[sizeOfFft - 2 * i] = L_shr( x[2 * i + 0], 1 );
    2372          15 :             move32();
    2373          15 :             x[sizeOfFft - 2 * i + 1] = L_negate( L_shr( x[2 * i + 1], 1 ) );
    2374          15 :             move32();
    2375             : /* The caller's scale factor is increased by one. */
    2376          15 :             *scale = add( *scale, 1 );
    2377          15 :             move16();
    2378          15 :             BREAK;
    2379             : 
    2380       23206 :         case +1: /* backward transform */
    2381             : /* Order is reversed relative to the forward branch: twiddle pre-processing first, BASOP_cfft afterwards. Slots 0 and 1 are shifted right by 2 here. */
    2382       23206 :             xb0 = L_shr( x[0], 2 );
    2383       23206 :             xb1 = L_shr( x[1], 2 );
    2384       23206 :             x[0] = L_add( xb0, xb1 );
    2385       23206 :             move32();
    2386       23206 :             x[1] = L_sub( xb1, xb0 ); /* operand order is the opposite of the forward branch */
    2387       23206 :             move32();
    2388             : /* First part, i in [1, sizeOfFft8): same table addressing as the forward branch, but using RFFT_TWIDDLE2. */
    2389     1707632 :             FOR( i = 1; i < sizeOfFft8; i++ )
    2390             :             {
    2391     1684426 :                 RFFT_TWIDDLE2( x, t1, t2, t3, t4, w1[i * wstride].v.im, w1[i * wstride].v.re, xb0, xb1, xt0, xt1 )
    2392             : /* Compared with the forward branch: the odd slot is t4 - t2 and the mirrored odd slot is not negated. */
    2393     1684426 :                 x[2 * i] = L_sub( t1, t3 );
    2394     1684426 :                 move32();
    2395     1684426 :                 x[2 * i + 1] = L_sub( t4, t2 );
    2396     1684426 :                 move32();
    2397     1684426 :                 x[sizeOfFft - 2 * i] = L_add( t1, t3 );
    2398     1684426 :                 move32();
    2399     1684426 :                 x[sizeOfFft - 2 * i + 1] = L_add( t2, t4 );
    2400     1684426 :                 move32();
    2401             :             }
    2402             : /* Second part, i in [sizeOfFft8, sizeOfFft4): mirrored table index with .v.re/.v.im swapped. */
    2403     1730838 :             FOR( i = sizeOfFft8; i < sizeOfFft4; i++ )
    2404             :             {
    2405     1707632 :                 RFFT_TWIDDLE2( x, t1, t2, t3, t4, w1[( sizeOfFft4 - i ) * wstride].v.re, w1[( sizeOfFft4 - i ) * wstride].v.im, xb0, xb1, xt0, xt1 )
    2406             : 
    2407     1707632 :                 x[2 * i] = L_sub( t1, t3 );
    2408     1707632 :                 move32();
    2409     1707632 :                 x[2 * i + 1] = L_sub( t4, t2 );
    2410     1707632 :                 move32();
    2411     1707632 :                 x[sizeOfFft - 2 * i] = L_add( t1, t3 );
    2412     1707632 :                 move32();
    2413     1707632 :                 x[sizeOfFft - 2 * i + 1] = L_add( t2, t4 );
    2414     1707632 :                 move32();
    2415             :             }
    2416             : /* i equals sizeOfFft4 here; the middle pair is halved in place, with no negation in this branch. */
    2417       23206 :             x[sizeOfFft - 2 * i] = L_shr( x[2 * i + 0], 1 );
    2418       23206 :             move32();
    2419       23206 :             x[sizeOfFft - 2 * i + 1] = L_shr( x[2 * i + 1], 1 );
    2420       23206 :             move32();
    2421             : /* Half-length complex FFT on the pre-processed data. */
    2422       23206 :             BASOP_cfft( (cmplx *) x, sizeOfFft2, scale, workBuffer );
    2423             : /* Size-dependent pass over all sizeOfFft2 pairs of the result. */
    2424       23206 :             SWITCH( sizeOfFft )
    2425             :             {
    2426       13903 :                 case 40:
    2427             :                 case 80:
    2428             :                 case 320:
    2429             :                 case 640:
    2430       13903 :                     c1 = FFTC( 0x66666680 ); /* about 0.8 if the literal is read as Q31; FFTC is defined elsewhere - TODO confirm the resulting format */
    2431       13903 :                     move16();
    2432       13903 :                     c2 = FFTC( 0x99999980 ); /* two's-complement negative of the c1 literal, i.e. about -0.8 in the same reading */
    2433       13903 :                     move16();
    2434     4462863 :                     FOR( i = 0; i < sizeOfFft2; i++ )
    2435             :                     {
    2436     4448960 :                         x[2 * i] = Mpy_32_xx( x[2 * i], c1 ); /* even slots multiplied by c1 */
    2437     4448960 :                         move32();
    2438     4448960 :                         x[2 * i + 1] = Mpy_32_xx( x[2 * i + 1], c2 ); /* odd slots multiplied by c2 */
    2439     4448960 :                         move32();
    2440             :                     }
    2441       13903 :                     BREAK;
    2442             : 
    2443        9303 :                 case 64:
    2444             :                 case 256:
    2445             :                 case 512:
    2446     2390871 :                     FOR( i = 0; i < sizeOfFft2; i++ )
    2447             :                     {
    2448     2381568 :                         x[2 * i + 1] = L_negate( x[2 * i + 1] ); /* only the odd slots change: they are negated */
    2449     2381568 :                         move32();
    2450             :                     }
    2451        9303 :                     BREAK;
    2452             : 
    2453           0 :                 default:
    2454           0 :                     assert( 0 ); /* any other length is rejected */
    2455             :             }
    2456             : /* Scale update. NOTE(review): only 64, 512 and 640 are handled here, although the switch above also accepts 40, 80, 256 and 320; those sizes reach assert( 0 ), and with NDEBUG *scale is overwritten with the initial s == 0. */
    2457       23206 :             SWITCH( sizeOfFft )
    2458             :             {
    2459           0 :                 case 64:
    2460           0 :                     s = add( *scale, 2 - 6 ); /* *scale - 4 */
    2461           0 :                     BREAK;
    2462             : 
    2463        9303 :                 case 512:
    2464        9303 :                     s = add( *scale, 2 - 9 ); /* *scale - 7 */
    2465        9303 :                     BREAK;
    2466             : 
    2467       13903 :                 case 640:
    2468       13903 :                     s = add( *scale, 2 - 9 ); /* same offset as for 512 */
    2469       13903 :                     BREAK;
    2470             : 
    2471           0 :                 default:
    2472           0 :                     assert( 0 );
    2473             :             }
    2474       23206 :             *scale = s; /* publish the adjusted scale factor to the caller */
    2475       23206 :             move16();
    2476       23206 :             BREAK;
    2477             :     }
    2478       23221 : }

Generated by: LCOV version 1.14