Cortex-R DSP Software Library: arm_conv

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_conv_f32.c      
00009 *      
00010 * Description:  Convolution of floating-point sequences.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------------- */     
00024      
00025 #include "arm_math.h"     
00026      
/**
 * @brief Reversed dot product used by the partial-overlap stages.
 *
 * Computes  sum = pX[0] * pYTop[0] + pX[1] * pYTop[-1] + ... +
 *                 pX[numTaps - 1] * pYTop[-(numTaps - 1)]
 * accumulating strictly in that order, so the floating-point result does
 * not depend on how the loop is unrolled.
 *
 * @param[in] pX       first (lowest-index) sample of the ascending operand.
 * @param[in] pYTop    highest-index sample of the descending operand.
 * @param[in] numTaps  number of multiply-accumulates; 0 yields 0.0f.
 * @return the accumulated sum.
 */
static float32_t arm_conv_f32_dot(
  const float32_t * pX,
  const float32_t * pYTop,
  uint32_t numTaps)
{
  float32_t sum = 0.0f;                          /* Accumulator */
  uint32_t i = 0u;                               /* Offset of the next tap */
  uint32_t k;                                    /* Loop counter */

  /* Unrolled part: four MACs per iteration.  pYTop is addressed by offset
   * instead of being walked downwards, so no pointer to an element before
   * the start of the array is ever formed. */
  for(k = numTaps >> 2u; k > 0u; k--)
  {
    sum += pX[i] * *(pYTop - i);
    sum += pX[i + 1u] * *(pYTop - (i + 1u));
    sum += pX[i + 2u] * *(pYTop - (i + 2u));
    sum += pX[i + 3u] * *(pYTop - (i + 3u));

    i += 4u;
  }

  /* Remaining 0 to 3 MACs when numTaps is not a multiple of 4 */
  for(k = numTaps % 0x4u; k > 0u; k--)
  {
    sum += pX[i] * *(pYTop - i);
    i++;
  }

  return sum;
}

/**
 * @brief Computes four consecutive full-overlap output samples at once.
 *
 * For m = 0..3:
 *   pOut[m] = pX[m] * pYTop[0] + pX[m + 1] * pYTop[-1] + ... +
 *             pX[m + numTaps - 1] * pYTop[-(numTaps - 1)]
 * Each coefficient is loaded once and shared by the four accumulators, and
 * the input samples are kept in a rotating set of four temporaries.
 * Reads pX[0] .. pX[numTaps + 2].
 *
 * @param[in]  pX       first input sample of the first of the four outputs.
 * @param[in]  pYTop    highest-index coefficient.
 * @param[in]  numTaps  number of coefficients; must be at least 4.
 * @param[out] pOut     receives the four results.
 */
static void arm_conv_f32_quad(
  const float32_t * pX,
  const float32_t * pYTop,
  uint32_t numTaps,
  float32_t * pOut)
{
  float32_t acc0 = 0.0f, acc1 = 0.0f;            /* Accumulators of outputs 0, 1 */
  float32_t acc2 = 0.0f, acc3 = 0.0f;            /* Accumulators of outputs 2, 3 */
  float32_t x0, x1, x2, x3, c0;                  /* Rotating samples and current coefficient */
  uint32_t tap = 0u;                             /* Offset of the current coefficient below pYTop */
  uint32_t k;                                    /* Loop counter */

  /* Prime the sample window with x[0], x[1], x[2] */
  x0 = pX[0];
  x1 = pX[1];
  x2 = pX[2];
  pX += 3;

  /* Four coefficients per iteration; numTaps >= 4 guarantees k >= 1 */
  k = numTaps >> 2u;

  do
  {
    /* Coefficient y[top - tap], new sample x[tap + 3] */
    c0 = *(pYTop - tap);
    x3 = pX[0];
    acc0 += x0 * c0;
    acc1 += x1 * c0;
    acc2 += x2 * c0;
    acc3 += x3 * c0;

    /* Coefficient y[top - tap - 1], new sample x[tap + 4] */
    c0 = *(pYTop - (tap + 1u));
    x0 = pX[1];
    acc0 += x1 * c0;
    acc1 += x2 * c0;
    acc2 += x3 * c0;
    acc3 += x0 * c0;

    /* Coefficient y[top - tap - 2], new sample x[tap + 5] */
    c0 = *(pYTop - (tap + 2u));
    x1 = pX[2];
    acc0 += x2 * c0;
    acc1 += x3 * c0;
    acc2 += x0 * c0;
    acc3 += x1 * c0;

    /* Coefficient y[top - tap - 3], new sample x[tap + 6] */
    c0 = *(pYTop - (tap + 3u));
    x2 = pX[3];
    acc0 += x3 * c0;
    acc1 += x0 * c0;
    acc2 += x1 * c0;
    acc3 += x2 * c0;

    pX += 4;
    tap += 4u;

  } while(--k);

  /* Remaining 0 to 3 coefficients, one at a time */
  for(k = numTaps % 0x4u; k > 0u; k--)
  {
    c0 = *(pYTop - tap);
    tap++;

    x3 = *pX++;

    acc0 += x0 * c0;
    acc1 += x1 * c0;
    acc2 += x2 * c0;
    acc3 += x3 * c0;

    /* Slide the sample window by one for the next coefficient */
    x0 = x1;
    x1 = x2;
    x2 = x3;
  }

  pOut[0] = acc0;
  pOut[1] = acc1;
  pOut[2] = acc2;
  pOut[3] = acc3;
}

/**
 * @brief Convolution of floating-point sequences.
 *
 * Computes  pDst[n] = sum over k of  a[k] * b[n - k]  for
 * n = 0 .. srcALen + srcBLen - 2, where only in-range indices contribute.
 *
 * The shorter sequence is always slid across the longer one, so the result
 * does not depend on the order in which the two sequences are passed.
 * The work is split into three stages:
 *   - stage 1: the first  (shortLen - 1) outputs, overlap grows by one each;
 *   - stage 2: (longLen - shortLen + 1) outputs with full overlap;
 *   - stage 3: the last   (shortLen - 1) outputs, overlap shrinks by one each.
 *
 * @param[in]  *pSrcA   points to the first input sequence.
 * @param[in]  srcALen  length of the first input sequence.
 * @param[in]  *pSrcB   points to the second input sequence.
 * @param[in]  srcBLen  length of the second input sequence.
 * @param[out] *pDst    points to the output; srcALen + srcBLen - 1 samples
 *                      are written.
 * @return none.
 *
 * If either length is zero nothing is written and the function returns
 * immediately (previously the unsigned expression srcBLen - 1 wrapped around
 * and the loops ran far outside both buffers).
 */
void arm_conv_f32(
  float32_t * pSrcA,
  uint32_t srcALen,
  float32_t * pSrcB,
  uint32_t srcBLen,
  float32_t * pDst)
{
  const float32_t *pLong;                        /* Longer (or equal) input sequence */
  const float32_t *pShort;                       /* Shorter input sequence */
  const float32_t *pShortTop;                    /* Last element of the shorter sequence */
  float32_t *pOut = pDst;                        /* Output pointer */
  uint32_t longLen, shortLen;                    /* Lengths after ordering the inputs */
  uint32_t fullCount;                            /* Number of full-overlap outputs */
  uint32_t idx;                                  /* Start offset into the longer sequence */
  uint32_t taps;                                 /* MACs of the current partial output */
  uint32_t blkCnt;                               /* Loop counter */

  /* An empty sequence has no convolution output; also keeps the
   * (shortLen - 1u) expressions below from wrapping around. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  /* Order the inputs so that the shorter one slides across the longer one */
  if(srcALen >= srcBLen)
  {
    pLong = pSrcA;
    longLen = srcALen;
    pShort = pSrcB;
    shortLen = srcBLen;
  }
  else
  {
    pLong = pSrcB;
    longLen = srcBLen;
    pShort = pSrcA;
    shortLen = srcALen;
  }

  pShortTop = pShort + (shortLen - 1u);

  /* ------------------------
   * Stage 1: growing overlap
   * ----------------------*/

  /* out[0] = x[0] * y[0]
   * out[1] = x[0] * y[1] + x[1] * y[0]
   * ....
   * out[shortLen-2] = x[0] * y[shortLen-2] + ... + x[shortLen-2] * y[0]
   */
  for(taps = 1u; taps < shortLen; taps++)
  {
    *pOut++ = arm_conv_f32_dot(pLong, pShort + (taps - 1u), taps);
  }

  /* ------------------------
   * Stage 2: full overlap
   * ----------------------*/

  /* out = x[idx] * y[shortLen-1] + x[idx+1] * y[shortLen-2] + ...
   *       + x[idx+shortLen-1] * y[0]      for idx = 0 .. longLen - shortLen
   */
  fullCount = longLen - (shortLen - 1u);
  idx = 0u;

  /* Four outputs at a time is only possible with at least 4 coefficients */
  if(shortLen >= 4u)
  {
    for(blkCnt = fullCount >> 2u; blkCnt > 0u; blkCnt--)
    {
      arm_conv_f32_quad(pLong + idx, pShortTop, shortLen, pOut);

      pOut += 4;
      idx += 4u;
    }
  }

  /* Outputs left over from the blocks of four, or all of them when the
   * shorter sequence has fewer than 4 samples */
  while(idx < fullCount)
  {
    *pOut++ = arm_conv_f32_dot(pLong + idx, pShortTop, shortLen);
    idx++;
  }

  /* ------------------------
   * Stage 3: shrinking overlap
   * ----------------------*/

  /* out = x[longLen-shortLen+1] * y[shortLen-1] + ... + x[longLen-1] * y[1]
   * ....
   * out = x[longLen-2] * y[shortLen-1] + x[longLen-1] * y[shortLen-2]
   * out = x[longLen-1] * y[shortLen-1]
   */
  for(taps = shortLen - 1u; taps > 0u; taps--)
  {
    *pOut++ = arm_conv_f32_dot(pLong + (longLen - taps), pShortTop, taps);
  }
}
00597