Cortex-R DSP Software Library: arm_fir_fast

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_fir_fast_q31.c      
00009 *      
00010 * Description:  Processing function for the Q31 Fast FIR filter.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
00057 void arm_fir_fast_q31(     
00058   const arm_fir_instance_q31 * S,     
00059   q31_t * pSrc,     
00060   q31_t * pDst,     
00061   uint32_t blockSize)     
00062 {     
00063   q31_t *pState = S->pState;                     /* State pointer */     
00064   q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */     
00065   q31_t *pStateCurnt;                            /* Points to the current sample of the state */     
00066   q31_t x0, x1, x2, x3;                          /* Temporary variables to hold state */     
00067   q31_t c0;                                      /* Temporary variable to hold coefficient value */     
00068   q31_t *px;                                     /* Temporary pointer for state */     
00069   q31_t *pb;                                     /* Temporary pointer for coefficient buffer */     
00070   q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */     
00071   uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */     
00072   uint32_t i, tapCnt, blkCnt;                    /* Loop counters */     
00073      
00074   /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */     
00075   /* pStateCurnt points to the location where the new input data should be written */     
00076   pStateCurnt = &(S->pState[(numTaps - 1u)]);     
00077      
00078   /* Apply loop unrolling and compute 4 output values simultaneously.      
00079    * The variables acc0 ... acc3 hold output values that are being computed:      
00080    *      
00081    *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]      
00082    *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]      
00083    *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]      
00084    *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]      
00085    */     
00086   blkCnt = blockSize >> 2;     
00087      
00088   /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.      
00089    ** a second loop below computes the remaining 1 to 3 samples. */     
00090   while(blkCnt > 0u)     
00091   {     
00092     /* Copy four new input samples into the state buffer */     
00093     *pStateCurnt++ = *pSrc++;     
00094     *pStateCurnt++ = *pSrc++;     
00095     *pStateCurnt++ = *pSrc++;     
00096     *pStateCurnt++ = *pSrc++;     
00097      
00098     /* Set all accumulators to zero */     
00099     acc0 = 0;     
00100     acc1 = 0;     
00101     acc2 = 0;     
00102     acc3 = 0;     
00103      
00104     /* Initialize state pointer */     
00105     px = pState;     
00106      
00107     /* Initialize coefficient pointer */     
00108     pb = pCoeffs;     
00109      
00110     /* Read the first three samples from the state buffer:      
00111      *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */     
00112     x0 = *(px++);     
00113     x1 = *(px++);     
00114     x2 = *(px++);     
00115      
00116     /* Loop unrolling.  Process 4 taps at a time. */     
00117     tapCnt = numTaps >> 2;     
00118     i = tapCnt;     
00119      
00120     while(i > 0u)     
00121     {     
00122       /* Read the b[numTaps] coefficient */     
00123       c0 = *(pb++);     
00124      
00125       /* Read x[n-numTaps-3] sample */     
00126       x3 = *(px++);     
00127      
00128       /* acc0 +=  b[numTaps] * x[n-numTaps] */  
00129       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);     
00130       //acc0 = (q31_t) ((((q63_t) x0 * c0) + (acc0 << 32)) >> 32);     
00131      
00132       /* acc1 +=  b[numTaps] * x[n-numTaps-1] */  
00133       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);     
00134       //acc1 = (q31_t) ((((q63_t) x1 * c0) + (acc1 << 32)) >> 32);     
00135      
00136       /* acc2 +=  b[numTaps] * x[n-numTaps-2] */  
00137       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);     
00138       //acc2 = (q31_t) ((((q63_t) x2 * c0) + (acc2 << 32)) >> 32);     
00139      
00140       /* acc3 +=  b[numTaps] * x[n-numTaps-3] */  
00141       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);     
00142       //acc3 = (q31_t) ((((q63_t) x3 * c0) + (acc3 << 32)) >> 32);     
00143      
00144       /* Read the b[numTaps-1] coefficient */     
00145       c0 = *(pb++);     
00146      
00147       /* Read x[n-numTaps-4] sample */     
00148       x0 = *(px++);     
00149      
00150       /* Perform the multiply-accumulates */  
00151       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);  
00152       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);  
00153       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);  
00154       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);  
00155            
00156 //      acc0 = (q31_t) ((((q63_t) x1 * c0) + (acc0 << 32)) >> 32);     
00157 //      acc1 = (q31_t) ((((q63_t) x2 * c0) + (acc1 << 32)) >> 32);     
00158 //      acc2 = (q31_t) ((((q63_t) x3 * c0) + (acc2 << 32)) >> 32);     
00159 //      acc3 = (q31_t) ((((q63_t) x0 * c0) + (acc3 << 32)) >> 32);     
00160      
00161       /* Read the b[numTaps-2] coefficient */     
00162       c0 = *(pb++);     
00163      
00164       /* Read x[n-numTaps-5] sample */     
00165       x1 = *(px++);     
00166      
00167       /* Perform the multiply-accumulates */   
00168       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);  
00169       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);  
00170       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);  
00171       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);  
00172           
00173 //      acc0 = (q31_t) ((((q63_t) x2 * c0) + (acc0 << 32)) >> 32);     
00174 //      acc1 = (q31_t) ((((q63_t) x3 * c0) + (acc1 << 32)) >> 32);     
00175 //      acc2 = (q31_t) ((((q63_t) x0 * c0) + (acc2 << 32)) >> 32);     
00176 //      acc3 = (q31_t) ((((q63_t) x1 * c0) + (acc3 << 32)) >> 32);     
00177      
00178       /* Read the b[numTaps-3] coefficients */     
00179       c0 = *(pb++);     
00180      
00181       /* Read x[n-numTaps-6] sample */     
00182       x2 = *(px++);     
00183      
00184       /* Perform the multiply-accumulates */  
00185       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);  
00186       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);  
00187       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);  
00188       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);  
00189            
00190 //      acc0 = (q31_t) ((((q63_t) x3 * c0) + (acc0 << 32)) >> 32);     
00191 //      acc1 = (q31_t) ((((q63_t) x0 * c0) + (acc1 << 32)) >> 32);     
00192 //      acc2 = (q31_t) ((((q63_t) x1 * c0) + (acc2 << 32)) >> 32);     
00193 //      acc3 = (q31_t) ((((q63_t) x2 * c0) + (acc3 << 32)) >> 32);   
00194           
00195       i--;     
00196     }     
00197      
00198     /* If the filter length is not a multiple of 4, compute the remaining filter taps */     
00199      
00200     i = numTaps - (tapCnt * 4u);     
00201     while(i > 0u)     
00202     {     
00203       /* Read coefficients */     
00204       c0 = *(pb++);     
00205      
00206       /* Fetch 1 state variable */     
00207       x3 = *(px++);     
00208      
00209   
00210       /* Perform the multiply-accumulates */  
00211       acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);  
00212       acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);  
00213       acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);  
00214       acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);  
00215            
00216 //      acc0 = (q31_t) ((((q63_t) x0 * c0) + (acc0 << 32)) >> 32);     
00217 //      acc1 = (q31_t) ((((q63_t) x1 * c0) + (acc1 << 32)) >> 32);     
00218 //      acc2 = (q31_t) ((((q63_t) x2 * c0) + (acc2 << 32)) >> 32);     
00219 //      acc3 = (q31_t) ((((q63_t) x3 * c0) + (acc3 << 32)) >> 32);     
00220      
00221       /* Reuse the present sample states for next sample */     
00222       x0 = x1;     
00223       x1 = x2;     
00224       x2 = x3;     
00225      
00226       /* Decrement the loop counter */     
00227       i--;     
00228     }     
00229      
00230     /* Advance the state pointer by 4 to process the next group of 4 samples */     
00231     pState = pState + 4;     
00232      
00233     /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.31      
00234      ** Then store the 4 outputs in the destination buffer. */     
00235     *pDst++ = (q31_t) (acc0 << 1);     
00236     *pDst++ = (q31_t) (acc1 << 1);     
00237     *pDst++ = (q31_t) (acc2 << 1);     
00238     *pDst++ = (q31_t) (acc3 << 1);     
00239      
00240     /* Decrement the samples loop counter */     
00241     blkCnt--;     
00242   }     
00243      
00244      
00245   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.      
00246    ** No loop unrolling is used. */     
00247   blkCnt = blockSize % 4u;     
00248      
00249   while(blkCnt > 0u)     
00250   {     
00251     /* Copy one sample at a time into state buffer */     
00252     *pStateCurnt++ = *pSrc++;     
00253      
00254     /* Set the accumulator to zero */     
00255     acc0 = 0;     
00256      
00257     /* Initialize state pointer */     
00258     px = pState;     
00259      
00260     /* Initialize Coefficient pointer */     
00261     pb = (pCoeffs);     
00262      
00263     i = numTaps;     
00264      
00265     /* Perform the multiply-accumulates */     
00266     do     
00267     {  
00268         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) (*px++) *  (*(pb++)))) >> 32);     
00269       //acc0 = (q31_t) ((((q63_t)  * ()) + (acc0 << 32)) >> 32);     
00270       i--;     
00271     } while(i > 0u);     
00272      
00273     /* The result is in 2.30 format.  Convert to 1.31      
00274      ** Then store the output in the destination buffer. */     
00275     *pDst++ = (q31_t) (acc0 << 1);     
00276      
00277     /* Advance state pointer by 1 for the next sample */     
00278     pState = pState + 1;     
00279      
00280     /* Decrement the samples loop counter */     
00281     blkCnt--;     
00282   }     
00283      
00284   /* Processing is complete.      
00285    ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.      
00286    ** This prepares the state buffer for the next function call. */     
00287      
00288   /* Points to the start of the state buffer */     
00289   pStateCurnt = S->pState;     
00290      
00291   tapCnt = (numTaps - 1u) >> 2u;     
00292      
00293   /* copy data */     
00294   while(tapCnt > 0u)     
00295   {   
00296      
00297       x0 =  *pState++;    
00298       x1 =  *pState++;   
00299       *pStateCurnt++ = x0;   
00300       *pStateCurnt++ = x1;   
00301    
00302       x0 =  *pState++;    
00303       x1 =  *pState++;   
00304       *pStateCurnt++ = x0;   
00305       *pStateCurnt++ = x1;   
00306      
00307     /* Decrement the loop counter */     
00308     tapCnt--;     
00309   }     
00310      
00311   /* Calculate remaining number of copies */     
00312   tapCnt = (numTaps - 1u) % 0x4u;     
00313      
00314   /* Copy the remaining q31_t data */     
00315   while(tapCnt > 0u)     
00316   {     
00317     *pStateCurnt++ = *pState++;     
00318      
00319     /* Decrement the loop counter */     
00320     tapCnt--;     
00321   }     
00322      
00323 }     
00324