Cortex-R DSP Software Library: arm_conv

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_conv_q31.c      
00009 *      
00010 * Description:  Convolution of Q31 sequences.    
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024      
00025 #include "arm_math.h"     
00026      
00062 void arm_conv_q31(     
00063   q31_t * pSrcA,     
00064   uint32_t srcALen,     
00065   q31_t * pSrcB,     
00066   uint32_t srcBLen,     
00067   q31_t * pDst)     
00068 {     
00069   q31_t *pIn1;                                   /* inputA pointer */     
00070   q31_t *pIn2;                                   /* inputB pointer */     
00071   q31_t *pOut = pDst;                            /* output pointer */     
00072   q31_t *px;                                     /* Intermediate inputA pointer  */     
00073   q31_t *py;                                     /* Intermediate inputB pointer  */     
00074   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */     
00075   q63_t sum;                                     /* Accumulator */     
00076   q63_t acc0, acc1, acc2;                        /* Accumulator */     
00077   q31_t x0, x1, x2, c0;                          /* Temporary variables to hold input1 and input2 values */     
00078   uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3;     /* loop counter */     
00079   q31_t c1;                                      /* Temporary variable to hold input */  
00080      
00081   /* The algorithm implementation is based on the lengths of the inputs. */     
00082   /* srcB is always made to slide across srcA. */     
00083   /* So srcBLen is always considered as shorter or equal to srcALen */     
00084   if(srcALen >= srcBLen)     
00085   {     
00086     /* Initialization of inputA pointer */     
00087     pIn1 = pSrcA;     
00088      
00089     /* Initialization of inputB pointer */     
00090     pIn2 = pSrcB;     
00091   }     
00092   else     
00093   {     
00094     /* Initialization of inputA pointer */     
00095     pIn1 = pSrcB;     
00096      
00097     /* Initialization of inputB pointer */     
00098     pIn2 = pSrcA;     
00099      
00100     /* srcBLen is always considered as shorter or equal to srcALen */     
00101     j = srcBLen;     
00102     srcBLen = srcALen;     
00103     srcALen = j;     
00104   }     
00105      
00106   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */     
00107   /* The function is internally      
00108    * divided into three stages according to the number of multiplications that has to be      
00109    * taken place between inputA samples and inputB samples. In the first stage of the      
00110    * algorithm, the multiplications increase by one for every iteration.      
00111    * In the second stage of the algorithm, srcBLen number of multiplications are done.      
00112    * In the third stage of the algorithm, the multiplications decrease by one      
00113    * for every iteration. */     
00114      
00115   /* The algorithm is implemented in three stages.      
00116      The loop counters of each stage is initiated here. */     
00117   blockSize1 = srcBLen - 1u;     
00118   blockSize2 = srcALen - (srcBLen - 1u);     
00119   blockSize3 = blockSize1;     
00120      
00121   /* --------------------------      
00122    * Initializations of stage1      
00123    * -------------------------*/     
00124      
00125   /* sum = x[0] * y[0]      
00126    * sum = x[0] * y[1] + x[1] * y[0]      
00127    * ....      
00128    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]      
00129    */     
00130      
00131   /* In this stage the MAC operations are increased by 1 for every iteration.      
00132      The count variable holds the number of MAC operations performed */     
00133   count = 1u;     
00134      
00135   /* Working pointer of inputA */     
00136   px = pIn1;     
00137      
00138   /* Working pointer of inputB */     
00139   py = pIn2;     
00140      
00141      
00142   /* ------------------------      
00143    * Stage1 process      
00144    * ----------------------*/     
00145      
00146   /* The first stage starts here */     
00147   while(blockSize1 > 0u)     
00148   {     
00149     /* Accumulator is made zero for every iteration */     
00150     sum = 0;     
00151      
00152     /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00153     k = count >> 2u;     
00154      
00155     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00156      ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00157     while(k > 0u)     
00158     {  
00159       
00160       /* Read x[0] */  
00161       x0 = *px++;  
00162       /* Read y[srcBLen - 1] */  
00163       c0 = *py--;  
00164   
00165       /* Read x[1] */  
00166       x1 = *px++;  
00167       /* Read y[srcBLen - 2] */  
00168       c1 = *py--;  
00169   
00170       /* x[0] * y[srcBLen - 1] */     
00171       sum += (q63_t) x0 * c0;  
00172                
00173       /* x[1] * y[srcBLen - 2] */     
00174       sum += (q63_t) x1 * c1;     
00175   
00176       /* Read x[2] */       
00177       x0 = *px++;  
00178       /* Read y[srcBLen - 3] */  
00179       c0 = *py--;  
00180   
00181       /* Read x[3] */  
00182       x1 = *px++;  
00183       /* Read y[srcBLen - 4] */  
00184       c1 = *py--;  
00185         
00186       /* x[2] * y[srcBLen - 3] */     
00187       sum += (q63_t) x0 * c0;  
00188   
00189       /* x[3] * y[srcBLen - 4] */     
00190       sum += (q63_t) x1 * c1;     
00191      
00192       /* Decrement the loop counter */     
00193       k--;     
00194     }     
00195      
00196     /* If the count is not a multiple of 4, compute any remaining MACs here.      
00197      ** No loop unrolling is used. */     
00198     k = count % 0x4u;     
00199      
00200     while(k > 0u)     
00201     {  
00202       
00203       x0 = *px++;  
00204       c0 = *py--;  
00205            
00206       /* Perform the multiply-accumulate */     
00207       sum += (q63_t) x0 * c0;     
00208      
00209       /* Decrement the loop counter */     
00210       k--;     
00211     }     
00212      
00213     /* Store the result in the accumulator in the destination buffer. */     
00214     *pOut++ = (q31_t) (sum >> 31);     
00215      
00216     /* Update the inputA and inputB pointers for next MAC calculation */     
00217     py = pIn2 + count;     
00218     px = pIn1;     
00219      
00220     /* Increment the MAC count */     
00221     count++;     
00222      
00223     /* Decrement the loop counter */     
00224     blockSize1--;     
00225   }     
00226      
00227   /* --------------------------      
00228    * Initializations of stage2      
00229    * ------------------------*/     
00230      
00231   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]      
00232    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]      
00233    * ....      
00234    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]      
00235    */     
00236      
00237   /* Working pointer of inputA */     
00238   px = pIn1;     
00239      
00240   /* Working pointer of inputB */     
00241   pSrc2 = pIn2 + (srcBLen - 1u);     
00242   py = pSrc2;     
00243      
00244   /* count is index by which the pointer pIn1 to be incremented */     
00245   count = 0u;     
00246      
00247   /* -------------------      
00248    * Stage2 process      
00249    * ------------------*/     
00250      
00251   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.      
00252    * So, to loop unroll over blockSize2,      
00253    * srcBLen should be greater than or equal to 4 */     
00254   if(srcBLen >= 4u)     
00255   {     
00256     /* Loop unroll by 3 */     
00257     blkCnt = blockSize2 / 3;  
00258      
00259     while(blkCnt > 0u)     
00260     {     
00261       /* Set all accumulators to zero */     
00262       acc0 = 0;     
00263       acc1 = 0;     
00264       acc2 = 0;     
00265      
00266       /* read x[0], x[1], x[2] samples */     
00267       x0 = *(px++);     
00268       x1 = *(px++);     
00269      
00270       /* Apply loop unrolling and compute 3 MACs simultaneously. */     
00271       k = srcBLen / 3;  
00272      
00273       /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.      
00274        ** a second loop below computes MACs for the remaining 1 to 2 samples. */     
00275       do     
00276       {     
00277         /* Read y[srcBLen - 1] sample */     
00278         c0 = *(py);     
00279      
00280         /* Read x[3] sample */     
00281         x2 = *(px);     
00282      
00283         /* Perform the multiply-accumulates */     
00284         /* acc0 +=  x[0] * y[srcBLen - 1] */     
00285         acc0 += ((q63_t) x0 * c0);     
00286         /* acc1 +=  x[1] * y[srcBLen - 1] */     
00287         acc1 += ((q63_t) x1 * c0);     
00288         /* acc2 +=  x[2] * y[srcBLen - 1] */     
00289         acc2 += ((q63_t) x2 * c0);     
00290      
00291         /* Read y[srcBLen - 2] sample */     
00292         c0 = *(py - 1u);     
00293      
00294         /* Read x[4] sample */     
00295         x0 = *(px + 1u);     
00296      
00297         /* Perform the multiply-accumulate */     
00298         /* acc0 +=  x[1] * y[srcBLen - 2] */     
00299         acc0 += ((q63_t) x1 * c0);     
00300         /* acc1 +=  x[2] * y[srcBLen - 2] */     
00301         acc1 += ((q63_t) x2 * c0);     
00302         /* acc2 +=  x[3] * y[srcBLen - 2] */     
00303         acc2 += ((q63_t) x0 * c0);     
00304      
00305         /* Read y[srcBLen - 3] sample */     
00306         c0 = *(py - 2u);     
00307      
00308         /* Read x[5] sample */     
00309         x1 = *(px + 2u);     
00310      
00311         /* Perform the multiply-accumulates */     
00312         /* acc0 +=  x[2] * y[srcBLen - 3] */     
00313         acc0 += ((q63_t) x2 * c0);     
00314         /* acc1 +=  x[3] * y[srcBLen - 2] */     
00315         acc1 += ((q63_t) x0 * c0);     
00316         /* acc2 +=  x[4] * y[srcBLen - 2] */     
00317         acc2 += ((q63_t) x1 * c0);     
00318   
00319         /* update scratch pointers */  
00320         px += 3u;  
00321         py -= 3u;     
00322      
00323       } while(--k);     
00324      
00325       /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.      
00326        ** No loop unrolling is used. */     
00327       k = srcBLen - ( 3 * (srcBLen/3));  
00328      
00329       while(k > 0u)     
00330       {     
00331         /* Read y[srcBLen - 5] sample */     
00332         c0 = *(py--);     
00333      
00334         /* Read x[7] sample */     
00335         x2 = *(px++);     
00336      
00337         /* Perform the multiply-accumulates */     
00338         /* acc0 +=  x[4] * y[srcBLen - 5] */     
00339         acc0 += ((q63_t) x0 * c0);     
00340         /* acc1 +=  x[5] * y[srcBLen - 5] */     
00341         acc1 += ((q63_t) x1 * c0);     
00342         /* acc2 +=  x[6] * y[srcBLen - 5] */     
00343         acc2 += ((q63_t) x2 * c0);     
00344      
00345         /* Reuse the present samples for the next MAC */     
00346         x0 = x1;     
00347         x1 = x2;     
00348      
00349         /* Decrement the loop counter */     
00350         k--;     
00351       }     
00352      
00353       /* Store the results in the accumulators in the destination buffer. */     
00354       *pOut++ = (q31_t) (acc0 >> 31);     
00355       *pOut++ = (q31_t) (acc1 >> 31);     
00356       *pOut++ = (q31_t) (acc2 >> 31);     
00357      
00358       /* Increment the pointer pIn1 index, count by 1 */     
00359       count += 3u;     
00360      
00361       /* Update the inputA and inputB pointers for next MAC calculation */     
00362       px = pIn1 + count;     
00363       py = pSrc2;     
00364      
00365       /* Decrement the loop counter */     
00366       blkCnt--;     
00367     }     
00368      
00369     /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.      
00370      ** No loop unrolling is used. */     
00371     blkCnt =  blockSize2 - 3 * (blockSize2/3);    
00372      
00373     while(blkCnt > 0u)     
00374     {     
00375       /* Accumulator is made zero for every iteration */     
00376       sum = 0;     
00377      
00378       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00379       k = srcBLen >> 2u;     
00380      
00381       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00382        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00383       while(k > 0u)     
00384       {     
00385         /* Perform the multiply-accumulates */     
00386         sum += (q63_t) * px++ * (*py--);     
00387         sum += (q63_t) * px++ * (*py--);     
00388         sum += (q63_t) * px++ * (*py--);     
00389         sum += (q63_t) * px++ * (*py--);     
00390      
00391         /* Decrement the loop counter */     
00392         k--;     
00393       }     
00394      
00395       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00396        ** No loop unrolling is used. */     
00397       k = srcBLen % 0x4u;     
00398      
00399       while(k > 0u)     
00400       {     
00401         /* Perform the multiply-accumulate */     
00402         sum += (q63_t) * px++ * (*py--);     
00403      
00404         /* Decrement the loop counter */     
00405         k--;     
00406       }     
00407      
00408       /* Store the result in the accumulator in the destination buffer. */     
00409       *pOut++ = (q31_t) (sum >> 31);     
00410      
00411       /* Increment the MAC count */     
00412       count++;     
00413      
00414       /* Update the inputA and inputB pointers for next MAC calculation */     
00415       px = pIn1 + count;     
00416       py = pSrc2;     
00417      
00418       /* Decrement the loop counter */     
00419       blkCnt--;     
00420     }     
00421   }     
00422   else     
00423   {     
00424     /* If the srcBLen is not a multiple of 4,      
00425      * the blockSize2 loop cannot be unrolled by 4 */     
00426     blkCnt = blockSize2;     
00427      
00428     while(blkCnt > 0u)     
00429     {     
00430       /* Accumulator is made zero for every iteration */     
00431       sum = 0;     
00432      
00433       /* srcBLen number of MACS should be performed */     
00434       k = srcBLen;     
00435      
00436       while(k > 0u)     
00437       {     
00438         /* Perform the multiply-accumulate */     
00439         sum += (q63_t) * px++ * (*py--);     
00440      
00441         /* Decrement the loop counter */     
00442         k--;     
00443       }     
00444      
00445       /* Store the result in the accumulator in the destination buffer. */     
00446       *pOut++ = (q31_t) (sum >> 31);     
00447      
00448       /* Increment the MAC count */     
00449       count++;     
00450      
00451       /* Update the inputA and inputB pointers for next MAC calculation */     
00452       px = pIn1 + count;     
00453       py = pSrc2;     
00454      
00455       /* Decrement the loop counter */     
00456       blkCnt--;     
00457     }     
00458   }     
00459      
00460      
00461   /* --------------------------      
00462    * Initializations of stage3      
00463    * -------------------------*/     
00464      
00465   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]      
00466    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]      
00467    * ....      
00468    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]      
00469    * sum +=  x[srcALen-1] * y[srcBLen-1]      
00470    */     
00471      
00472   /* In this stage the MAC operations are decreased by 1 for every iteration.      
00473      The blockSize3 variable holds the number of MAC operations performed */     
00474      
00475   /* Working pointer of inputA */     
00476   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);     
00477   px = pSrc1;     
00478      
00479   /* Working pointer of inputB */     
00480   pSrc2 = pIn2 + (srcBLen - 1u);     
00481   py = pSrc2;     
00482      
00483   /* -------------------      
00484    * Stage3 process      
00485    * ------------------*/     
00486      
00487   while(blockSize3 > 0u)     
00488   {     
00489     /* Accumulator is made zero for every iteration */     
00490     sum = 0;     
00491      
00492     /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00493     k = blockSize3 >> 2u;     
00494      
00495     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00496      ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00497     while(k > 0u)     
00498     {  
00499   
00500       /* Read x[srcALen - srcBLen + 1] */  
00501       x0 = *px++;  
00502       /* Read y[srcBLen - 1] */  
00503       c0 = *py--;  
00504   
00505       /* Read x[srcALen - srcBLen + 2] */  
00506       x1 = *px++;  
00507       /* Read y[srcBLen - 2] */  
00508       c1 = *py--;  
00509   
00510       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */     
00511       sum += (q63_t) x0 * c0;    
00512            
00513       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */     
00514       sum += (q63_t) x1 * c1;   
00515         
00516       /* Read x[srcALen - srcBLen + 3] */  
00517       x0 = *px++;  
00518       /* Read y[srcBLen - 3] */  
00519       c0 = *py--;  
00520   
00521       /* Read x[srcALen - srcBLen + 4] */  
00522       x1 = *px++;  
00523       /* Read y[srcBLen - 4] */  
00524       c1 = *py--;  
00525               
00526       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */     
00527       sum += (q63_t) x0 * c0;     
00528       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */     
00529       sum += (q63_t) x1 * c1;     
00530      
00531       /* Decrement the loop counter */     
00532       k--;     
00533     }     
00534      
00535     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.      
00536      ** No loop unrolling is used. */     
00537     k = blockSize3 % 0x4u;     
00538      
00539     while(k > 0u)     
00540     {  
00541       
00542       x0 = *px++;  
00543       c0 = *py--;  
00544            
00545       /* Perform the multiply-accumulate */     
00546       sum += (q63_t) x0 * c0;     
00547      
00548       /* Decrement the loop counter */     
00549       k--;     
00550     }     
00551      
00552     /* Store the result in the accumulator in the destination buffer. */     
00553     *pOut++ = (q31_t) (sum >> 31);     
00554      
00555     /* Update the inputA and inputB pointers for next MAC calculation */     
00556     px = ++pSrc1;     
00557     py = pSrc2;     
00558      
00559     /* Decrement the loop counter */     
00560     blockSize3--;     
00561   }     
00562      
00563 }     
00564