Cortex-R DSP Software Library: arm_dct4

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_dct4_f32.c      
00009 *      
00010 * Description:  Processing function of DCT4 & IDCT4 F32.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
00120 void arm_dct4_f32(     
00121   const arm_dct4_instance_f32 * S,     
00122   float32_t * pState,     
00123   float32_t * pInlineBuffer)     
00124 {     
00125   uint32_t i;                                    /* Loop counter */     
00126   float32_t *weights = S->pTwiddle;              /* Pointer to the Weights table */     
00127   float32_t *cosFact = S->pCosFactor;            /* Pointer to the cos factors table */     
00128   float32_t *pS1, *pS2, *pbuff;                  /* Temporary pointers for input buffer and pState buffer */     
00129   float32_t in;                                  /* Temporary variable */     
00130   float32_t in1, in2, in3, in4;  
00131      
00132      
00133   /* DCT4 computation involves DCT2 (which is calculated using RFFT)      
00134    * along with some pre-processing and post-processing.      
00135    * Computational procedure is explained as follows:      
00136    * (a) Pre-processing involves multiplying input with cos factor,      
00137    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))      
00138    *              where,      
00139    *                 r(n) -- output of preprocessing      
00140    *                 u(n) -- input to preprocessing(actual Source buffer)      
00141    * (b) Calculation of DCT2 using FFT is divided into three steps:      
00142    *                  Step1: Re-ordering of even and odd elements of input.      
00143    *                  Step2: Calculating FFT of the re-ordered input.      
00144    *                  Step3: Taking the real part of the product of FFT output and weights.      
00145    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:      
00146    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)      
00147    *                        where,      
00148    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output      
00149    * (d) Multiplying the output with the normalizing factor sqrt(2/N).      
00150    */     
00151      
00152         /*-------- Pre-processing ------------*/     
00153   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */     
00154   arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N);     
00155   arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N);     
00156      
00157   /* ----------------------------------------------------------------      
00158    * Step1: Re-ordering of even and odd elements as,      
00159    *             pState[i] =  pInlineBuffer[2*i] and      
00160    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2      
00161    ---------------------------------------------------------------------*/     
00162      
00163   /* pS1 initialized to pState */     
00164   pS1 = pState;     
00165      
00166   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */     
00167   pS2 = pState + (S->N - 1u);     
00168      
00169   /* pbuff initialized to input buffer */     
00170   pbuff = pInlineBuffer;     
00171      
00172   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */     
00173   i = (uint32_t) S->Nby2 >> 2u;     
00174      
00175   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.      
00176    ** a second loop below computes the remaining 1 to 3 samples. */     
00177   do     
00178   {     
00179     /* Re-ordering of even and odd elements */     
00180     pS2 -= 4u;    
00181     /* pState[i] =  pInlineBuffer[2*i] */     
00182     in1 = *pbuff++;  
00183     *pS1++ = in1;  
00184     /* pState[N-i-1] = pInlineBuffer[2*i+1] */     
00185     in2 = *pbuff++;  
00186     pS2[4] = in2;   
00187     in3 = *pbuff++;  
00188     *pS1++ = in3;  
00189     in4 = *pbuff++;  
00190     pS2[3] = in4;   
00191     in1 = *pbuff++;  
00192     *pS1++ = in1;  
00193     in2 = *pbuff++;  
00194     pS2[2] = in2;   
00195     in3 = *pbuff++;  
00196     *pS1++ = in3;  
00197     in4 = *pbuff++;  
00198     pS2[1] = in4;   
00199      
00200     /* Decrement the loop counter */     
00201     i--;     
00202   } while(i > 0u);     
00203      
00204   /* pbuff initialized to input buffer */     
00205   pbuff = pInlineBuffer;     
00206      
00207   /* pS1 initialized to pState */     
00208   pS1 = pState;     
00209      
00210   /* Initializing the loop counter to N/4 instead of N for loop unrolling */     
00211   i = (uint32_t) S->N >> 2u;     
00212      
00213   /* Processing with loop unrolling 4 times as N is always multiple of 4.      
00214    * Compute 4 outputs at a time */     
00215   do     
00216   {     
00217     /* Writing the re-ordered output back to inplace input buffer */     
00218     in1 = *pS1++;  
00219     *pbuff++ = in1;  
00220     in2 = *pS1++;  
00221     *pbuff++ = in2;  
00222     in3 = *pS1++;  
00223     *pbuff++ = in3;  
00224     in4 = *pS1++;    
00225     *pbuff++ = in4;  
00226      
00227     /* Decrement the loop counter */     
00228     i--;     
00229   } while(i > 0u);     
00230      
00231      
00232   /* ---------------------------------------------------------      
00233    *     Step2: Calculate RFFT for N-point input      
00234    * ---------------------------------------------------------- */     
00235   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */     
00236   arm_rfft_f32(S->pRfft, pInlineBuffer, pState);     
00237      
00238         /*----------------------------------------------------------------------      
00239      *  Step3: Multiply the FFT output with the weights.      
00240      *----------------------------------------------------------------------*/     
00241   arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N);     
00242      
00243   /* ----------- Post-processing ---------- */     
00244   /* DCT-IV can be obtained from DCT-II by the equation,      
00245    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)      
00246    *       Hence, Y4(0) = Y2(0)/2  */     
00247   /* Getting only real part from the output and Converting to DCT-IV */     
00248      
00249   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */     
00250   i = ((uint32_t) S->N - 1u) >> 2u;     
00251      
00252   /* pbuff initialized to input buffer. */     
00253   pbuff = pInlineBuffer;     
00254      
00255   /* pS1 initialized to pState */     
00256   pS1 = pState;     
00257      
00258   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */     
00259   in = *pS1++ * (float32_t) 0.5;     
00260   /* input buffer acts as inplace, so output values are stored in the input itself. */     
00261   *pbuff++ = in;     
00262      
00263   /* pState pointer is incremented twice as the real values are located alternatively in the array */     
00264   pS1++;     
00265      
00266   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.      
00267    ** a second loop below computes the remaining 1 to 3 samples. */     
00268   do     
00269   {     
00270     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */     
00271     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */     
00272     in1 = pS1[0];  
00273     in2 = pS1[2];  
00274   
00275     in1 = in1 - in;  
00276   
00277     in3 = pS1[4];  
00278   
00279     in2 = in2 - in1;  
00280   
00281     in4 = pS1[6];  
00282   
00283     *pbuff++ = in1;  
00284   
00285     in3 = in3 - in2;  
00286   
00287     *pbuff++ = in2;  
00288   
00289     in = in4 - in3;  
00290   
00291     *pbuff++ = in3;  
00292     *pbuff++ = in;  
00293   
00294     pS1 += 8u;       
00295      
00296     /* Decrement the loop counter */     
00297     i--;     
00298   } while(i > 0u);     
00299      
00300   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.      
00301    ** No loop unrolling is used. */     
00302   i = ((uint32_t) S->N - 1u) % 0x4u;     
00303      
00304   while(i > 0u)     
00305   {     
00306     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */     
00307     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */     
00308     in = *pS1++ - in;     
00309     *pbuff++ = in;     
00310     /* points to the next real value */     
00311     pS1++;     
00312      
00313     /* Decrement the loop counter */     
00314     i--;     
00315   }     
00316      
00317      
00318         /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/     
00319      
00320   /* Initializing the loop counter to N/4 instead of N for loop unrolling */     
00321   i = (uint32_t) S->N >> 2u;     
00322      
00323   /* pbuff initialized to the pInlineBuffer(now contains the output values) */     
00324   pbuff = pInlineBuffer;     
00325      
00326   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */     
00327   do     
00328   {     
00329     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */     
00330     in1 = pbuff[0];     
00331     in2 = pbuff[1];     
00332     in3 = pbuff[2];     
00333     in4 = pbuff[3];   
00334       
00335     in1 = in1 * S->normalize;    
00336     in2 = in2 * S->normalize;    
00337     in3 = in3 * S->normalize;    
00338     in4 = in4 * S->normalize;  
00339       
00340     pbuff[0] = in1;  
00341     pbuff[1] = in2;  
00342     pbuff[2] = in3;  
00343     pbuff[3] = in4;  
00344   
00345     pbuff += 4u;     
00346   
00347     /* Decrement the loop counter */     
00348     i--;     
00349   } while(i > 0u);     
00350      
00351 }     
00352