Cortex-R DSP Software Library: arm_conv_fast

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_conv_fast_q31.c      
00009 *      
00010 * Description:  Q31 Convolution (fast version).      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
/*
 * Fractional multiply-accumulate used by every stage of the convolution.
 *
 * Returns the upper 32 bits of ((acc * 2^32) + (x * y)), i.e. the previous
 * accumulator plus the high word of the 64-bit product; the low 32 bits of
 * the product are discarded.
 *
 * The sum is formed in unsigned 64-bit arithmetic so that a negative
 * accumulator is never left-shifted as a signed value (undefined behaviour
 * in C) and so that an out-of-range sum wraps instead of overflowing a
 * signed type.  The final conversion back to q31_t relies on the usual
 * two's-complement (modular) unsigned-to-signed conversion.
 */
static q31_t arm_conv_fast_q31_mac(
  q31_t acc,
  q31_t x,
  q31_t y)
{
  uint64_t wide;

  wide = ((uint64_t) (q63_t) acc << 32) + (uint64_t) ((q63_t) x * y);

  return (q31_t) (uint32_t) (wide >> 32);
}

/*
 * Doubles an accumulator produced by arm_conv_fast_q31_mac() before it is
 * stored.  The shift is done on the unsigned bit pattern because shifting a
 * negative signed value left is undefined behaviour; the resulting bits are
 * the same as those of "acc << 1" on a two's-complement machine.
 */
static q31_t arm_conv_fast_q31_scale(
  q31_t acc)
{
  return (q31_t) ((uint32_t) acc << 1);
}

/*
 * Computes one (unscaled) output sample:
 *
 *   x[0] * yTop[0] + x[1] * yTop[-1] + ... + x[taps-1] * yTop[-(taps-1)]
 *
 * px    points at the first input sample to use (read forwards).
 * pyTop points at the first coefficient to use (read backwards).
 * taps  is the number of multiply-accumulates to perform.
 *
 * The products are accumulated strictly in the order shown; because every
 * MAC truncates, the order is part of the result.  The loop is unrolled by
 * four, with a second loop for the remaining 0 to 3 taps.
 */
static q31_t arm_conv_fast_q31_dot(
  const q31_t * px,
  const q31_t * pyTop,
  uint32_t taps)
{
  q31_t sum = 0;                                 /* Accumulator */
  uint32_t t = 0u;                               /* Taps consumed so far */
  uint32_t k;                                    /* Loop counter */

  /* Four MACs per iteration */
  for(k = taps >> 2u; k > 0u; k--)
  {
    sum = arm_conv_fast_q31_mac(sum, px[t], *(pyTop - t));
    sum = arm_conv_fast_q31_mac(sum, px[t + 1u], *(pyTop - (t + 1u)));
    sum = arm_conv_fast_q31_mac(sum, px[t + 2u], *(pyTop - (t + 2u)));
    sum = arm_conv_fast_q31_mac(sum, px[t + 3u], *(pyTop - (t + 3u)));

    t += 4u;
  }

  /* Remaining taps when taps is not a multiple of 4 */
  for(k = taps & 0x3u; k > 0u; k--)
  {
    sum = arm_conv_fast_q31_mac(sum, px[t], *(pyTop - t));

    t++;
  }

  return sum;
}

/*
 * Computes four consecutive output samples that all use the same `taps`
 * coefficients, and stores them (scaled) in pOut[0..3].
 *
 * Output i (i = 0..3) is the reversed dot product of px[i .. i+taps-1]
 * with pyTop[0 .. -(taps-1)].  Each coefficient is loaded once and applied
 * to all four accumulators, and the input samples are kept in a rotating
 * window of four variables so that each sample is also loaded only once.
 *
 * Reads px[0 .. taps+2]; the caller must guarantee those samples exist.
 * taps must be at least 1.
 */
static void arm_conv_fast_q31_quad(
  const q31_t * px,
  const q31_t * pyTop,
  uint32_t taps,
  q31_t * pOut)
{
  q31_t acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;  /* One accumulator per output */
  q31_t x0, x1, x2, x3;                          /* Rotating window of input samples */
  q31_t c0;                                      /* Current coefficient */
  uint32_t t = 0u;                               /* Taps consumed so far */
  uint32_t k;                                    /* Loop counter */

  /* Prime the window with the first three samples */
  x0 = px[0];
  x1 = px[1];
  x2 = px[2];

  /* Four taps per iteration.  After each group of four the window is back
   * in its starting arrangement (x0, x1, x2 hold the next three samples). */
  for(k = taps >> 2u; k > 0u; k--)
  {
    /* Tap t: window is x0, x1, x2, x3 */
    c0 = *(pyTop - t);
    x3 = px[t + 3u];

    acc0 = arm_conv_fast_q31_mac(acc0, x0, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x1, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x2, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x3, c0);

    /* Tap t + 1: window is x1, x2, x3, x0 */
    c0 = *(pyTop - (t + 1u));
    x0 = px[t + 4u];

    acc0 = arm_conv_fast_q31_mac(acc0, x1, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x2, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x3, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x0, c0);

    /* Tap t + 2: window is x2, x3, x0, x1 */
    c0 = *(pyTop - (t + 2u));
    x1 = px[t + 5u];

    acc0 = arm_conv_fast_q31_mac(acc0, x2, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x3, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x0, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x1, c0);

    /* Tap t + 3: window is x3, x0, x1, x2 */
    c0 = *(pyTop - (t + 3u));
    x2 = px[t + 6u];

    acc0 = arm_conv_fast_q31_mac(acc0, x3, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x0, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x1, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x2, c0);

    t += 4u;
  }

  /* Remaining taps when taps is not a multiple of 4: one tap at a time,
   * sliding the window by one sample after each. */
  for(k = taps & 0x3u; k > 0u; k--)
  {
    c0 = *(pyTop - t);
    x3 = px[t + 3u];

    acc0 = arm_conv_fast_q31_mac(acc0, x0, c0);
    acc1 = arm_conv_fast_q31_mac(acc1, x1, c0);
    acc2 = arm_conv_fast_q31_mac(acc2, x2, c0);
    acc3 = arm_conv_fast_q31_mac(acc3, x3, c0);

    x0 = x1;
    x1 = x2;
    x2 = x3;

    t++;
  }

  pOut[0] = arm_conv_fast_q31_scale(acc0);
  pOut[1] = arm_conv_fast_q31_scale(acc1);
  pOut[2] = arm_conv_fast_q31_scale(acc2);
  pOut[3] = arm_conv_fast_q31_scale(acc3);
}

/**
 * @brief Convolution of Q31 sequences (fast version).
 * @param[in]  pSrcA    points to the first input sequence.
 * @param[in]  srcALen  length of the first input sequence.
 * @param[in]  pSrcB    points to the second input sequence.
 * @param[in]  srcBLen  length of the second input sequence.
 * @param[out] pDst     points to the output; srcALen + srcBLen - 1 samples
 *                      are written.
 * @return none.
 *
 * Each multiply-accumulate keeps only the upper 32 bits of the 64-bit
 * product (the low 32 bits are discarded) and accumulates in 32 bits with
 * no saturation; the finished accumulator is doubled before being stored.
 * The caller is responsible for scaling the inputs so that the accumulation
 * does not overflow.
 *
 * If either length is zero there is nothing to convolve and the function
 * returns without touching pDst.  (Without this check the "length - 1"
 * loop counts below would wrap around to a huge unsigned value.)
 */
void arm_conv_fast_q31(
  q31_t * pSrcA,
  uint32_t srcALen,
  q31_t * pSrcB,
  uint32_t srcBLen,
  q31_t * pDst)
{
  const q31_t *pLong;                            /* Longer (or equal) input: samples x[] */
  const q31_t *pShort;                           /* Shorter input: coefficients y[] */
  const q31_t *pShortTop;                        /* Points at y[shortLen - 1] */
  q31_t *pOut = pDst;                            /* Output pointer */
  uint32_t longLen, shortLen;                    /* Lengths of x[] and y[] */
  uint32_t fullCnt;                              /* Outputs that use every coefficient */
  uint32_t n, k;                                 /* Loop counters */

  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  /* The shorter sequence is always slid across the longer one, so the
   * inputs are swapped if necessary.  Convolution is commutative, so the
   * result is the same. */
  if(srcALen >= srcBLen)
  {
    pLong = pSrcA;
    longLen = srcALen;
    pShort = pSrcB;
    shortLen = srcBLen;
  }
  else
  {
    pLong = pSrcB;
    longLen = srcBLen;
    pShort = pSrcA;
    shortLen = srcALen;
  }

  pShortTop = pShort + (shortLen - 1u);

  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + ... + x[n-N+1] * y[N-1]
   *
   * The work is split into three stages according to how many products
   * contribute to an output sample:
   *   stage 1: the overlap grows by one product per output,
   *   stage 2: every output uses all shortLen coefficients,
   *   stage 3: the overlap shrinks by one product per output. */

  /* ------------------------
   * Stage 1: shortLen - 1 outputs
   * ------------------------
   * out[0] = x[0] * y[0]
   * out[1] = x[0] * y[1] + x[1] * y[0]
   * ....
   * Output number n - 1 uses n products, starting from x[0] and y[n - 1].
   */
  for(n = 1u; n < shortLen; n++)
  {
    *pOut++ =
      arm_conv_fast_q31_scale(arm_conv_fast_q31_dot(pLong,
                                                    pShort + (n - 1u), n));
  }

  /* ------------------------
   * Stage 2: longLen - shortLen + 1 outputs
   * ------------------------
   * out = x[n] * y[shortLen-1] + x[n+1] * y[shortLen-2] + ... + x[n+shortLen-1] * y[0]
   * for n = 0 .. longLen - shortLen.
   */
  fullCnt = longLen - (shortLen - 1u);
  n = 0u;

  /* Four outputs at a time, sharing coefficient and sample loads */
  for(k = fullCnt >> 2u; k > 0u; k--)
  {
    arm_conv_fast_q31_quad(pLong + n, pShortTop, shortLen, pOut);

    pOut += 4u;
    n += 4u;
  }

  /* Remaining outputs when fullCnt is not a multiple of 4 */
  for(k = fullCnt & 0x3u; k > 0u; k--)
  {
    *pOut++ =
      arm_conv_fast_q31_scale(arm_conv_fast_q31_dot(pLong + n,
                                                    pShortTop, shortLen));

    n++;
  }

  /* ------------------------
   * Stage 3: shortLen - 1 outputs
   * ------------------------
   * out = x[longLen-n] * y[shortLen-1] + ... + x[longLen-1] * y[shortLen-n]
   * for n = shortLen - 1 down to 1; n is the number of products.
   */
  for(n = shortLen - 1u; n > 0u; n--)
  {
    *pOut++ =
      arm_conv_fast_q31_scale(arm_conv_fast_q31_dot(pLong + (longLen - n),
                                                    pShortTop, n));
  }
}
00588