Cortex-R DSP Software Library: arm_dct4

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_dct4_q15.c      
00009 *      
00010 * Description:  Processing function of DCT4 & IDCT4 Q15.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
/**
 * Processing function for the Q15 DCT4/IDCT4; the transform is computed in place.
 *
 * @param[in]     S              instance structure; this function reads its N, Nby2,
 *                               normalize, pTwiddle, pCosFactor and pRfft members.
 * @param[in,out] pState         scratch buffer; holds the re-ordered input and then the
 *                               RFFT output (2*N q15 values, per the Step2 note below).
 * @param[in,out] pInlineBuffer  N input samples on entry, overwritten with the N outputs.
 *
 * Every unrolled stage is a do/while loop, so each body runs at least once. The
 * re-ordering stage moves 8 samples per iteration and has no tail loop, so N is
 * expected to be a non-zero multiple of 8 (assuming Nby2 == N/2 - TODO confirm
 * against the init function).
 */
00046 void arm_dct4_q15(
00047   const arm_dct4_instance_q15 * S,
00048   q15_t * pState,
00049   q15_t * pInlineBuffer)
00050 {
00051   uint32_t i;                                    /* Loop counter */
00052   q15_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
00053   q15_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
00054   q15_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
00055   q15_t in;                                      /* Running value Y4(k-1) used by the post-processing recursion */
00056   q31_t in1, in2, in3, in4;                      /* 32-bit words moved through the __SIMD32 accesses (two q15 samples each) */
00057   q15_t i1, i2, i3, i4;                          /* q15 temporaries for the post-processing and normalization loops */
00058
00059
00060   /* DCT4 computation involves DCT2 (which is calculated using RFFT)
00061    * along with some pre-processing and post-processing.
00062    * Computational procedure is explained as follows:
00063    * (a) Pre-processing involves multiplying input with cos factor,
00064    *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*N))
00065    *              where,
00066    *                 r(n) -- output of preprocessing
00067    *                 u(n) -- input to preprocessing(actual Source buffer)
00068    * (b) Calculation of DCT2 using FFT is divided into three steps:
00069    *                  Step1: Re-ordering of even and odd elements of input.
00070    *                  Step2: Calculating FFT of the re-ordered input.
00071    *                  Step3: Taking the real part of the product of FFT output and weights.
00072    * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
00073    *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
00074    *                        where,
00075    *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
00076    * (d) Multiplying the output with the normalizing factor sqrt(2/N).
00077    */
00078
00079         /*-------- Pre-processing ------------*/
00080   /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*N)).
           The factor of 2 is applied by the left shift by 1 that follows the multiply. */
00081   arm_mult_q15(pInlineBuffer, cosFact, pInlineBuffer, S->N);
00082   arm_shift_q15(pInlineBuffer, 1, pInlineBuffer, S->N);
00083
00084   /* ----------------------------------------------------------------
00085    * Step1: Re-ordering of even and odd elements as
00086    *             pState[i] =  pInlineBuffer[2*i] and
00087    *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 - 1
00088    ---------------------------------------------------------------------*/
00089
00090   /* pS1 initialized to pState */
00091   pS1 = pState;
00092
00093   /* pS2 initialized to pState+N-1, i.e. the last of the first N entries of the state buffer */
00094   pS2 = pState + (S->N - 1u);
00095
00096   /* pbuff initialized to input buffer */
00097   pbuff = pInlineBuffer;
00098
00099   /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
00100   i = (uint32_t) S->Nby2 >> 2u;
00101
00102   /* Unrolled re-ordering: each iteration consumes 8 input samples (four 32-bit reads),
00103    ** writes 4 entries through pS1 and 4 through pS2. There is no tail loop for this stage. */
00104   do
00105   {
00106     /* Re-ordering of even and odd elements */
00107     pS2 -= 4u;
00108     /* Intended mapping: pState[i] =  pInlineBuffer[2*i] */
00109      /* and pState[N-i-1] = pInlineBuffer[2*i+1].
              The (q15_t) casts below keep the low 16 bits of each 32-bit word and are stored
              at pS2[4]..pS2[1]; the __PKHTB results are stored through pS1.
              NOTE(review): which halfword is the even-indexed sample depends on endianness,
              and __PKHTB is defined outside this file - confirm that the little-endian
              branch really produces the mapping stated above rather than its mirror. */
00110     in1 = *__SIMD32(pbuff)++;
00111     in2 = *__SIMD32(pbuff)++;
00112
00113     pS2[4] = (q15_t)in1;
00114
00115     in3 = *__SIMD32(pbuff)++;
00116
00117 #ifndef ARM_MATH_BIG_ENDIAN
00118
00119     in1 = __PKHTB(in2, in1, 16);
00120
00121 #else
00122
00123     in1 = __PKHTB(in1, in2, 16);
00124
00125 #endif  //      #ifndef ARM_MATH_BIG_ENDIAN
00126
00127     in4 = *__SIMD32(pbuff)++;
00128
00129     pS2[3] = (q15_t)in2;
00130
00131 #ifndef ARM_MATH_BIG_ENDIAN
00132
00133     in2 = __PKHTB(in4, in3, 16);
00134
00135 #else
00136
00137     in2 = __PKHTB(in3, in4, 16);
00138
00139 #endif  //      #ifndef ARM_MATH_BIG_ENDIAN
00140
00141     pS2[2] = (q15_t)in3;
00142
00143     *__SIMD32(pS1)++ = in1;
00144
00145     pS2[1] = (q15_t)in4;
00146
00147     *__SIMD32(pS1)++ = in2;
00148
00149     /* Decrement the loop counter */
00150     i--;
00151   } while(i > 0u);
00152
00153   /* pbuff initialized to input buffer */
00154   pbuff = pInlineBuffer;
00155
00156   /* pS1 initialized to pState */
00157   pS1 = pState;
00158
00159   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
00160   i = (uint32_t) S->N >> 2u;
00161
00162   /* Processing with loop unrolling 4 times as N is always multiple of 4.
00163    * Each iteration copies 4 q15 samples as two 32-bit words. */
00164   do
00165   {
00166     /* Writing the re-ordered output back to inplace input buffer */
00167     in1 = *__SIMD32(pS1)++;
00168     in2 = *__SIMD32(pS1)++;
00169     *__SIMD32(pbuff)++ = in1;
00170     *__SIMD32(pbuff)++ = in2;
00171
00172     /* Decrement the loop counter */
00173     i--;
00174   } while(i > 0u);
00175
00176
00177   /* ---------------------------------------------------------
00178    *     Step2: Calculate RFFT for N-point input
00179    * ---------------------------------------------------------- */
00180   /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
00181   arm_rfft_q15(S->pRfft, pInlineBuffer, pState);
00182
00183  /*----------------------------------------------------------------------
00184   *  Step3: Multiply the FFT output with the weights.
00185   *----------------------------------------------------------------------*/
00186   arm_cmplx_mult_cmplx_q15(pState, weights, pState, S->N);
00187
00188   /* The output of complex multiplication is in 3.13 format.
00189    * Hence changing the format of N complex numbers (i.e. 2*N q15 elements) to 1.15 format by shifting left by 2 bits. */
00190   arm_shift_q15(pState, 2, pState, S->N * 2);
00191
00192   /* ----------- Post-processing ---------- */
00193   /* DCT-IV can be obtained from DCT-II by the equation,
00194    *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
00195    *       Hence, Y4(0) = Y2(0)/2  */
00196   /* Getting only real part from the output and Converting to DCT-IV */
00197
00198   /* Initializing the loop counter to (N - 1) >> 2 for loop unrolling by 4; Y4(0) is computed separately below */
00199   i = ((uint32_t) S->N - 1u) >> 2u;
00200
00201   /* pbuff initialized to input buffer. */
00202   pbuff = pInlineBuffer;
00203
00204   /* pS1 initialized to pState */
00205   pS1 = pState;
00206
00207   /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
00208   in = *pS1++ >> 1u;
00209   /* input buffer acts as inplace, so output values are stored in the input itself. */
00210   *pbuff++ = in;
00211
00212   /* Skip the imaginary part: the real values sit at every other element of pState */
00213   pS1++;
00214
00215   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
00216    ** The while loop below computes the remaining (N - 1) % 4 outputs. */
00217   do
00218   {
00219     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
00220     /* Real parts are read at pS1[0], pS1[2], pS1[4], pS1[6]; each difference is narrowed
               back to q15_t without saturation, and 'in' carries the last output to the next pass. */
00221     i1 = pS1[0];
00222     i2 = pS1[2];
00223     i3 = pS1[4];
00224     i4 = pS1[6];
00225
00226     i1 = i1 - in;
00227     i2 = i2 - i1;
00228     i3 = i3 - i2;
00229     in = i4 - i3;
00230
00231     *pbuff++ = i1;
00232     *pbuff++ = i2;
00233     *pbuff++ = i3;
00234     *pbuff++ = in;
00235
00236     pS1 += 8u;
00237
00238     /* Decrement the loop counter */
00239     i--;
00240   } while(i > 0u);
00241
00242   /* Compute the (N - 1) % 4 outputs left over from the unrolled loop above.
00243    ** No loop unrolling is used. */
00244   i = ((uint32_t) S->N - 1u) % 0x4u;
00245
00246   while(i > 0u)
00247   {
00248     /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
00249     /* Read the next real value and subtract the previous output from it */
00250     in = *pS1++ - in;
00251     *pbuff++ = in;
00252     /* skip the imaginary part so pS1 points to the next real value */
00253     pS1++;
00254
00255     /* Decrement the loop counter */
00256     i--;
00257   }
00258
00259
00260    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
00261
00262   /* Initializing the loop counter to N/4 instead of N for loop unrolling */
00263   i = (uint32_t) S->N >> 2u;
00264
00265   /* pbuff initialized to the pInlineBuffer(now contains the output values) */
00266   pbuff = pInlineBuffer;
00267
00268   /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
00269   do
00270   {
00271     /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N):
               the product is formed in q31_t, shifted right by 15 and narrowed to q15_t (no saturation). */
00272     i1 = pbuff[0];
00273     i2 = pbuff[1];
00274     i3 = pbuff[2];
00275     i4 = pbuff[3];
00276
00277     i1 = ((q15_t)(((q31_t) i1 * S->normalize) >> 15));
00278     i2 = ((q15_t)(((q31_t) i2 * S->normalize) >> 15));
00279     i3 = ((q15_t)(((q31_t) i3 * S->normalize) >> 15));
00280     i4 = ((q15_t)(((q31_t) i4 * S->normalize) >> 15));
00281
00282     pbuff[0] = i1;
00283     pbuff[1] = i2;
00284     pbuff[2] = i3;
00285     pbuff[3] = i4;
00286
00287     pbuff += 4u;
00288
00289     /* Decrement the loop counter */
00290     i--;
00291   } while(i > 0u);
00292
00293 }
00294