00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_fir_fast_q31.c 00009 * 00010 * Description: Processing function for the Q31 Fast FIR filter. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 #include "arm_math.h" 00025 00057 void arm_fir_fast_q31( 00058 const arm_fir_instance_q31 * S, 00059 q31_t * pSrc, 00060 q31_t * pDst, 00061 uint32_t blockSize) 00062 { 00063 q31_t *pState = S->pState; /* State pointer */ 00064 q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ 00065 q31_t *pStateCurnt; /* Points to the current sample of the state */ 00066 q31_t x0, x1, x2, x3; /* Temporary variables to hold state */ 00067 q31_t c0; /* Temporary variable to hold coefficient value */ 00068 q31_t *px; /* Temporary pointer for state */ 00069 q31_t *pb; /* Temporary pointer for coefficient buffer */ 00070 q31_t acc0, acc1, acc2, acc3; /* Accumulators */ 00071 uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ 00072 uint32_t i, tapCnt, blkCnt; /* Loop counters */ 00073 00074 /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */ 00075 /* pStateCurnt points to the location where the new input data should be written */ 00076 pStateCurnt = &(S->pState[(numTaps - 1u)]); 00077 00078 /* Apply loop unrolling and compute 4 output values simultaneously. 00079 * The variables acc0 ... acc3 hold output values that are being computed: 00080 * 00081 * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] 00082 * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1] 00083 * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2] 00084 * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3] 00085 */ 00086 blkCnt = blockSize >> 2; 00087 00088 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00089 ** a second loop below computes the remaining 1 to 3 samples. */ 00090 while(blkCnt > 0u) 00091 { 00092 /* Copy four new input samples into the state buffer */ 00093 *pStateCurnt++ = *pSrc++; 00094 *pStateCurnt++ = *pSrc++; 00095 *pStateCurnt++ = *pSrc++; 00096 *pStateCurnt++ = *pSrc++; 00097 00098 /* Set all accumulators to zero */ 00099 acc0 = 0; 00100 acc1 = 0; 00101 acc2 = 0; 00102 acc3 = 0; 00103 00104 /* Initialize state pointer */ 00105 px = pState; 00106 00107 /* Initialize coefficient pointer */ 00108 pb = pCoeffs; 00109 00110 /* Read the first three samples from the state buffer: 00111 * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */ 00112 x0 = *(px++); 00113 x1 = *(px++); 00114 x2 = *(px++); 00115 00116 /* Loop unrolling. Process 4 taps at a time. */ 00117 tapCnt = numTaps >> 2; 00118 i = tapCnt; 00119 00120 while(i > 0u) 00121 { 00122 /* Read the b[numTaps] coefficient */ 00123 c0 = *(pb++); 00124 00125 /* Read x[n-numTaps-3] sample */ 00126 x3 = *(px++); 00127 00128 /* acc0 += b[numTaps] * x[n-numTaps] */ 00129 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00130 //acc0 = (q31_t) ((((q63_t) x0 * c0) + (acc0 << 32)) >> 32); 00131 00132 /* acc1 += b[numTaps] * x[n-numTaps-1] */ 00133 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00134 //acc1 = (q31_t) ((((q63_t) x1 * c0) + (acc1 << 32)) >> 32); 00135 00136 /* acc2 += b[numTaps] * x[n-numTaps-2] */ 00137 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00138 //acc2 = (q31_t) ((((q63_t) x2 * c0) + (acc2 << 32)) >> 32); 00139 00140 /* acc3 += b[numTaps] * x[n-numTaps-3] */ 00141 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00142 //acc3 = (q31_t) ((((q63_t) x3 * c0) + (acc3 << 32)) >> 32); 00143 00144 /* Read the b[numTaps-1] coefficient */ 00145 c0 = *(pb++); 00146 00147 /* Read x[n-numTaps-4] sample */ 00148 x0 = *(px++); 00149 00150 /* Perform the multiply-accumulates */ 00151 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00152 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00153 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00154 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00155 00156 // acc0 = (q31_t) ((((q63_t) x1 * c0) + (acc0 << 32)) >> 32); 00157 // acc1 = (q31_t) ((((q63_t) x2 * c0) + (acc1 << 32)) >> 32); 00158 // acc2 = (q31_t) ((((q63_t) x3 * c0) + (acc2 << 32)) >> 32); 00159 // acc3 = (q31_t) ((((q63_t) x0 * c0) + (acc3 << 32)) >> 32); 00160 00161 /* Read the b[numTaps-2] coefficient */ 00162 c0 = *(pb++); 00163 00164 /* Read x[n-numTaps-5] sample */ 00165 x1 = *(px++); 00166 00167 /* Perform the multiply-accumulates */ 00168 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00169 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00170 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00171 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00172 00173 // acc0 = (q31_t) ((((q63_t) x2 * c0) + (acc0 << 32)) >> 32); 00174 // acc1 = (q31_t) ((((q63_t) x3 * c0) + (acc1 << 32)) >> 32); 00175 // acc2 = (q31_t) ((((q63_t) x0 * c0) + (acc2 << 32)) >> 32); 00176 // acc3 = (q31_t) ((((q63_t) x1 * c0) + (acc3 << 32)) >> 32); 00177 00178 /* Read the b[numTaps-3] coefficients */ 00179 c0 = *(pb++); 00180 00181 /* Read x[n-numTaps-6] sample */ 00182 x2 = *(px++); 00183 00184 /* Perform the multiply-accumulates */ 00185 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00186 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00187 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00188 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00189 00190 // acc0 = (q31_t) ((((q63_t) x3 * c0) + (acc0 << 32)) >> 32); 00191 // acc1 = (q31_t) ((((q63_t) x0 * c0) + (acc1 << 32)) >> 32); 00192 // acc2 = (q31_t) ((((q63_t) x1 * c0) + (acc2 << 32)) >> 32); 00193 // acc3 = (q31_t) ((((q63_t) x2 * c0) + (acc3 << 32)) >> 32); 00194 00195 i--; 00196 } 00197 00198 /* If the filter length is not a multiple of 4, compute the remaining filter taps */ 00199 00200 i = numTaps - (tapCnt * 4u); 00201 while(i > 0u) 00202 { 00203 /* Read coefficients */ 00204 c0 = *(pb++); 00205 00206 /* Fetch 1 state variable */ 00207 x3 = *(px++); 00208 00209 00210 /* Perform the multiply-accumulates */ 00211 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00212 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00213 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00214 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00215 00216 // acc0 = (q31_t) ((((q63_t) x0 * c0) + (acc0 << 32)) >> 32); 00217 // acc1 = (q31_t) ((((q63_t) x1 * c0) + (acc1 << 32)) >> 32); 00218 // acc2 = (q31_t) ((((q63_t) x2 * c0) + (acc2 << 32)) >> 32); 00219 // acc3 = (q31_t) ((((q63_t) x3 * c0) + (acc3 << 32)) >> 32); 00220 00221 /* Reuse the present sample states for next sample */ 00222 x0 = x1; 00223 x1 = x2; 00224 x2 = x3; 00225 00226 /* Decrement the loop counter */ 00227 i--; 00228 } 00229 00230 /* Advance the state pointer by 4 to process the next group of 4 samples */ 00231 pState = pState + 4; 00232 00233 /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31 00234 ** Then store the 4 outputs in the destination buffer. */ 00235 *pDst++ = (q31_t) (acc0 << 1); 00236 *pDst++ = (q31_t) (acc1 << 1); 00237 *pDst++ = (q31_t) (acc2 << 1); 00238 *pDst++ = (q31_t) (acc3 << 1); 00239 00240 /* Decrement the samples loop counter */ 00241 blkCnt--; 00242 } 00243 00244 00245 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00246 ** No loop unrolling is used. */ 00247 blkCnt = blockSize % 4u; 00248 00249 while(blkCnt > 0u) 00250 { 00251 /* Copy one sample at a time into state buffer */ 00252 *pStateCurnt++ = *pSrc++; 00253 00254 /* Set the accumulator to zero */ 00255 acc0 = 0; 00256 00257 /* Initialize state pointer */ 00258 px = pState; 00259 00260 /* Initialize Coefficient pointer */ 00261 pb = (pCoeffs); 00262 00263 i = numTaps; 00264 00265 /* Perform the multiply-accumulates */ 00266 do 00267 { 00268 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) (*px++) * (*(pb++)))) >> 32); 00269 //acc0 = (q31_t) ((((q63_t) * ()) + (acc0 << 32)) >> 32); 00270 i--; 00271 } while(i > 0u); 00272 00273 /* The result is in 2.30 format. Convert to 1.31 00274 ** Then store the output in the destination buffer. */ 00275 *pDst++ = (q31_t) (acc0 << 1); 00276 00277 /* Advance state pointer by 1 for the next sample */ 00278 pState = pState + 1; 00279 00280 /* Decrement the samples loop counter */ 00281 blkCnt--; 00282 } 00283 00284 /* Processing is complete. 00285 ** Now copy the last numTaps - 1 samples to the satrt of the state buffer. 00286 ** This prepares the state buffer for the next function call. */ 00287 00288 /* Points to the start of the state buffer */ 00289 pStateCurnt = S->pState; 00290 00291 tapCnt = (numTaps - 1u) >> 2u; 00292 00293 /* copy data */ 00294 while(tapCnt > 0u) 00295 { 00296 00297 x0 = *pState++; 00298 x1 = *pState++; 00299 *pStateCurnt++ = x0; 00300 *pStateCurnt++ = x1; 00301 00302 x0 = *pState++; 00303 x1 = *pState++; 00304 *pStateCurnt++ = x0; 00305 *pStateCurnt++ = x1; 00306 00307 /* Decrement the loop counter */ 00308 tapCnt--; 00309 } 00310 00311 /* Calculate remaining number of copies */ 00312 tapCnt = (numTaps - 1u) % 0x4u; 00313 00314 /* Copy the remaining q31_t data */ 00315 while(tapCnt > 0u) 00316 { 00317 *pStateCurnt++ = *pState++; 00318 00319 /* Decrement the loop counter */ 00320 tapCnt--; 00321 } 00322 00323 } 00324