Cortex-R DSP Software Library: arm_dct4

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_dct4_q31.c      
00009 *      
00010 * Description:  Processing function of DCT4 & IDCT4 Q31.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
00047 void arm_dct4_q31(     
00048   const arm_dct4_instance_q31 * S,     
00049   q31_t * pState,     
00050   q31_t * pInlineBuffer)     
00051 {     
00052   uint16_t i;                                    /* Loop counter */     
00053   q31_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */     
00054   q31_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */     
00055   q31_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */     
00056   q31_t in;                                      /* Temporary variable */     
00057   q31_t in1, in2, in3, in4;  
00058      
00059      
00060   /* DCT4 computation involves DCT2 (which is calculated using RFFT)      
00061    * along with some pre-processing and post-processing.      
00062    * Computational procedure is explained as follows:      
00063    * (a) Pre-processing involves multiplying input with cos factor,      
00064    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))      
00065    *              where,      
00066    *                 r(n) -- output of preprocessing      
00067    *                 u(n) -- input to preprocessing(actual Source buffer)      
00068    * (b) Calculation of DCT2 using FFT is divided into three steps:      
00069    *                  Step1: Re-ordering of even and odd elements of input.      
00070    *                  Step2: Calculating FFT of the re-ordered input.      
00071    *                  Step3: Taking the real part of the product of FFT output and weights.      
00072    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:      
00073    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)      
00074    *                        where,      
00075    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output      
00076    * (d) Multiplying the output with the normalizing factor sqrt(2/N).      
00077    */     
00078      
00079         /*-------- Pre-processing ------------*/     
00080   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */     
00081   arm_mult_q31(pInlineBuffer, cosFact, pInlineBuffer, S->N);     
00082   arm_shift_q31(pInlineBuffer, 1, pInlineBuffer, S->N);     
00083      
00084   /* ----------------------------------------------------------------      
00085    * Step1: Re-ordering of even and odd elements as      
00086    *             pState[i] =  pInlineBuffer[2*i] and      
00087    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2      
00088    ---------------------------------------------------------------------*/     
00089      
00090   /* pS1 initialized to pState */     
00091   pS1 = pState;     
00092      
00093   /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */     
00094   pS2 = pState + (S->N - 1u);     
00095      
00096   /* pbuff initialized to input buffer */     
00097   pbuff = pInlineBuffer;     
00098      
00099   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */     
00100   i = S->Nby2 >> 2u;     
00101      
00102   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.      
00103    ** a second loop below computes the remaining 1 to 3 samples. */     
00104   do     
00105   {     
00106     /* Re-ordering of even and odd elements */     
00107     pS2 -= 4u;  
00108     /* pState[i] =  pInlineBuffer[2*i] */     
00109     in1 = *pbuff++;  
00110     *pS1++ = in1;  
00111     /* pState[N-i-1] = pInlineBuffer[2*i+1] */     
00112     in2 = *pbuff++;  
00113     pS2[4] = in2;  
00114     in3 = *pbuff++;  
00115     *pS1++ = in3;  
00116     in4 = *pbuff++;  
00117     pS2[3] = in4;  
00118     in1 = *pbuff++;  
00119     *pS1++ = in1;  
00120     in2 = *pbuff++;  
00121     pS2[2] = in2;  
00122     in3 = *pbuff++;  
00123     *pS1++ = in3;  
00124     in4 = *pbuff++;  
00125     pS2[1] = in4;  
00126   
00127   
00128      
00129     /* Decrement the loop counter */     
00130     i--;     
00131   } while(i > 0u);     
00132      
00133   /* pbuff initialized to input buffer */     
00134   pbuff = pInlineBuffer;     
00135      
00136   /* pS1 initialized to pState */     
00137   pS1 = pState;     
00138      
00139   /* Initializing the loop counter to N/4 instead of N for loop unrolling */     
00140   i = S->N >> 2u;     
00141      
00142   /* Processing with loop unrolling 4 times as N is always multiple of 4.      
00143    * Compute 4 outputs at a time */     
00144   do     
00145   {     
00146     /* Writing the re-ordered output back to inplace input buffer */     
00147     in1 = *pS1++;  
00148     *pbuff++ = in1;     
00149     in2 = *pS1++;  
00150     *pbuff++ = in2;     
00151     in3 = *pS1++;  
00152     *pbuff++ = in3;     
00153     in4 = *pS1++;  
00154     *pbuff++ = in4;     
00155      
00156     /* Decrement the loop counter */     
00157     i--;     
00158   } while(i > 0u);     
00159      
00160      
00161   /* ---------------------------------------------------------      
00162    *     Step2: Calculate RFFT for N-point input      
00163    * ---------------------------------------------------------- */     
00164   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */     
00165   arm_rfft_q31(S->pRfft, pInlineBuffer, pState);     
00166      
00167   /*----------------------------------------------------------------------      
00168    *  Step3: Multiply the FFT output with the weights.      
00169    *----------------------------------------------------------------------*/     
00170   arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N);     
00171      
00172   /* The output of complex multiplication is in 3.29 format.      
00173    * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */     
00174   arm_shift_q31(pState, 2, pState, S->N * 2);     
00175      
00176   /* ----------- Post-processing ---------- */     
00177   /* DCT-IV can be obtained from DCT-II by the equation,      
00178    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)      
00179    *       Hence, Y4(0) = Y2(0)/2  */     
00180   /* Getting only real part from the output and Converting to DCT-IV */     
00181      
00182   /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */     
00183   i = (S->N - 1u) >> 2u;     
00184      
00185   /* pbuff initialized to input buffer. */     
00186   pbuff = pInlineBuffer;     
00187      
00188   /* pS1 initialized to pState */     
00189   pS1 = pState;     
00190      
00191   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */     
00192   in = *pS1++ >> 1u;     
00193   /* input buffer acts as inplace, so output values are stored in the input itself. */     
00194   *pbuff++ = in;     
00195      
00196   /* pState pointer is incremented twice as the real values are located alternatively in the array */     
00197   pS1++;     
00198      
00199   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.      
00200    ** a second loop below computes the remaining 1 to 3 samples. */     
00201   do     
00202   {     
00203     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */     
00204     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */     
00205     in1 = pS1[0];  
00206     in2 = pS1[2];  
00207     in1 = in1 - in;  
00208     in3 = pS1[4];  
00209     in2 = in2 - in1;  
00210     in4 = pS1[6];  
00211     *pbuff++ = in1;  
00212   
00213     in3 = in3 - in2;  
00214     *pbuff++ = in2;  
00215     in = in4 - in3;  
00216   
00217     *pbuff++ = in3;  
00218     pS1 += 8u;  
00219     *pbuff++ = in;  
00220   
00221   
00222     /* Decrement the loop counter */     
00223     i--;     
00224   } while(i > 0u);     
00225      
00226   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.      
00227    ** No loop unrolling is used. */     
00228   i = (S->N - 1u) % 0x4u;     
00229      
00230   while(i > 0u)     
00231   {     
00232     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */     
00233     /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */     
00234     in = *pS1++ - in;     
00235     *pbuff++ = in;     
00236     /* points to the next real value */     
00237     pS1++;     
00238      
00239     /* Decrement the loop counter */     
00240     i--;     
00241   }     
00242      
00243      
00244         /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/     
00245      
00246   /* Initializing the loop counter to N/4 instead of N for loop unrolling */     
00247   i = S->N >> 2u;     
00248      
00249   /* pbuff initialized to the pInlineBuffer(now contains the output values) */     
00250   pbuff = pInlineBuffer;     
00251      
00252   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */     
00253   do     
00254   {     
00255     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */     
00256     in1 = pbuff[0];   
00257     in2 = pbuff[1];   
00258     in3 = pbuff[2];   
00259     in4 = pbuff[3];   
00260   
00261     in1 = ((q31_t) (((q63_t) in1 * S->normalize) >> 31));   
00262     in2 = ((q31_t) (((q63_t) in2 * S->normalize) >> 31));   
00263     in3 = ((q31_t) (((q63_t) in3 * S->normalize) >> 31));   
00264     in4 = ((q31_t) (((q63_t) in4 * S->normalize) >> 31));   
00265   
00266     *pbuff++ = in1;  
00267     *pbuff++ = in2;  
00268     *pbuff++ = in3;  
00269     *pbuff++ = in4;  
00270      
00271     /* Decrement the loop counter */     
00272     i--;     
00273   } while(i > 0u);     
00274      
00275 }     
00276