Cortex-R DSP Software Library: arm_conv_partial

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_conv_partial_f32.c      
00009 *      
00010 * Description:  Partial convolution of floating-point sequences.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------------- */     
00024      
00025 #include "arm_math.h"     
00026      
00065 arm_status arm_conv_partial_f32(     
00066   float32_t * pSrcA,     
00067   uint32_t srcALen,     
00068   float32_t * pSrcB,     
00069   uint32_t srcBLen,     
00070   float32_t * pDst,     
00071   uint32_t firstIndex,     
00072   uint32_t numPoints)     
00073 {     
00074   float32_t *pIn1 = pSrcA;                       /* inputA pointer */     
00075   float32_t *pIn2 = pSrcB;                       /* inputB pointer */     
00076   float32_t *pOut = pDst;                        /* output pointer */     
00077   float32_t *px;                                 /* Intermediate inputA pointer */     
00078   float32_t *py;                                 /* Intermediate inputB pointer */     
00079   float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */     
00080   float32_t sum, acc0, acc1, acc2, acc3;         /* Accumulator */     
00081   float32_t x0, x1, x2, x3, c0;                  /* Temporary variables to hold state and coefficient values */     
00082   uint32_t j, k, count = 0u, blkCnt, check;     
00083   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters */     
00084   arm_status status;                             /* status of Partial convolution */     
00085      
00086      
00087   /* Check for range of output samples to be calculated */     
00088   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))     
00089   {     
00090     /* Set status as ARM_MATH_ARGUMENT_ERROR */     
00091     status = ARM_MATH_ARGUMENT_ERROR;     
00092   }     
00093   else     
00094   {     
00095      
00096     /* The algorithm implementation is based on the lengths of the inputs. */     
00097     /* srcB is always made to slide across srcA. */     
00098     /* So srcBLen is always considered as shorter or equal to srcALen */     
00099     if(srcALen >= srcBLen)     
00100     {     
00101       /* Initialization of inputA pointer */     
00102       pIn1 = pSrcA;     
00103      
00104       /* Initialization of inputB pointer */     
00105       pIn2 = pSrcB;     
00106     }     
00107     else     
00108     {     
00109       /* Initialization of inputA pointer */     
00110       pIn1 = pSrcB;     
00111      
00112       /* Initialization of inputB pointer */     
00113       pIn2 = pSrcA;     
00114      
00115       /* srcBLen is always considered as shorter or equal to srcALen */     
00116       j = srcBLen;     
00117       srcBLen = srcALen;     
00118       srcALen = j;     
00119     }     
00120      
00121     /* Conditions to check which loopCounter holds      
00122      * the first and last indices of the output samples to be calculated. */     
00123     check = firstIndex + numPoints;     
00124     blockSize3 = (int32_t) check - (int32_t) srcALen;     
00125     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;     
00126     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;     
00127     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :     
00128                                     (int32_t) numPoints) : 0;     
00129     blockSize2 = ((int32_t) check - blockSize3) -      
00130                  (blockSize1 + (int32_t) firstIndex);     
00131     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;     
00132      
00133     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */     
00134     /* The function is internally      
00135      * divided into three stages according to the number of multiplications that has to be      
00136      * taken place between inputA samples and inputB samples. In the first stage of the      
00137      * algorithm, the multiplications increase by one for every iteration.      
00138      * In the second stage of the algorithm, srcBLen number of multiplications are done.      
00139      * In the third stage of the algorithm, the multiplications decrease by one      
00140      * for every iteration. */     
00141      
00142     /* Set the output pointer to point to the firstIndex      
00143      * of the output sample to be calculated. */     
00144     pOut = pDst + firstIndex;     
00145      
00146     /* --------------------------      
00147      * Initializations of stage1      
00148      * -------------------------*/     
00149      
00150     /* sum = x[0] * y[0]      
00151      * sum = x[0] * y[1] + x[1] * y[0]      
00152      * ....      
00153      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]      
00154      */     
00155      
00156     /* In this stage the MAC operations are increased by 1 for every iteration.      
00157        The count variable holds the number of MAC operations performed.      
00158        Since the partial convolution starts from from firstIndex      
00159        Number of Macs to be performed is firstIndex + 1 */     
00160     count = 1u + firstIndex;     
00161      
00162     /* Working pointer of inputA */     
00163     px = pIn1;     
00164      
00165     /* Working pointer of inputB */     
00166     pSrc1 = pIn2 + firstIndex;     
00167     py = pSrc1;     
00168      
00169     /* ------------------------      
00170      * Stage1 process      
00171      * ----------------------*/     
00172      
00173     /* The first stage starts here */     
00174     while(blockSize1 > 0)     
00175     {     
00176       /* Accumulator is made zero for every iteration */     
00177       sum = 0.0f;     
00178      
00179       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00180       k = count >> 2u;     
00181      
00182       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00183        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00184       while(k > 0u)     
00185       {     
00186       
00187       /* Read x[0] */  
00188       x0 = *px++;  
00189   
00190       /* y[srcBLen - 1] */   
00191       c0 = *py--;  
00192           
00193       /* x[0] * y[srcBLen - 1] */     
00194       sum +=  x0 * c0;     
00195      
00196         /* x[1] * y[srcBLen - 2] */     
00197         sum += *px++ * *py--;     
00198      
00199         /* x[2] * y[srcBLen - 3] */     
00200         sum += *px++ * *py--;     
00201      
00202         /* x[3] * y[srcBLen - 4] */     
00203         sum += *px++ * *py--;     
00204      
00205         /* Decrement the loop counter */     
00206         k--;     
00207       }     
00208      
00209       /* If the count is not a multiple of 4, compute any remaining MACs here.      
00210        ** No loop unrolling is used. */     
00211       k = count % 0x4u;     
00212      
00213       while(k > 0u)     
00214       {     
00215   
00216       x0 = *px++;  
00217       c0 = *py--;  
00218   
00219       /* Perform the multiply-accumulate */     
00220       sum += x0 * c0;     
00221      
00222         /* Decrement the loop counter */     
00223         k--;     
00224       }     
00225      
00226       /* Store the result in the accumulator in the destination buffer. */     
00227       *pOut++ = sum;     
00228      
00229       /* Update the inputA and inputB pointers for next MAC calculation */     
00230       py = ++pSrc1;     
00231       px = pIn1;     
00232      
00233       /* Increment the MAC count */     
00234       count++;     
00235      
00236       /* Decrement the loop counter */     
00237       blockSize1--;     
00238     }     
00239      
00240     /* --------------------------      
00241      * Initializations of stage2      
00242      * ------------------------*/     
00243      
00244     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]      
00245      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]      
00246      * ....      
00247      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]      
00248      */     
00249      
00250     /* Working pointer of inputA */     
00251     px = pIn1;     
00252      
00253     /* Working pointer of inputB */     
00254     pSrc2 = pIn2 + (srcBLen - 1u);     
00255     py = pSrc2;     
00256      
00257     /* count is index by which the pointer pIn1 to be incremented */     
00258     count = 0u;     
00259      
00260     /* -------------------      
00261      * Stage2 process      
00262      * ------------------*/     
00263      
00264     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.      
00265      * So, to loop unroll over blockSize2,      
00266      * srcBLen should be greater than or equal to 4 */     
00267     if(srcBLen >= 4u)     
00268     {     
00269       /* Loop unroll over blockSize2, by 4 */     
00270       blkCnt = ((uint32_t) blockSize2 >> 2u);     
00271      
00272       while(blkCnt > 0u)     
00273       {     
00274         /* Set all accumulators to zero */     
00275         acc0 = 0.0f;     
00276         acc1 = 0.0f;     
00277         acc2 = 0.0f;     
00278         acc3 = 0.0f;     
00279      
00280         /* read x[0], x[1], x[2] samples */     
00281         x0 = *(px++);     
00282         x1 = *(px++);     
00283         x2 = *(px++);     
00284      
00285         /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00286         k = srcBLen >> 2u;     
00287      
00288         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00289          ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00290         do     
00291         {     
00292           /* Read y[srcBLen - 1] sample */     
00293           c0 = *(py--);     
00294      
00295           /* Read x[3] sample */     
00296           x3 = *(px);     
00297      
00298           /* Perform the multiply-accumulate */     
00299           /* acc0 +=  x[0] * y[srcBLen - 1] */     
00300           acc0 += x0 * c0;     
00301      
00302           /* acc1 +=  x[1] * y[srcBLen - 1] */     
00303           acc1 += x1 * c0;     
00304      
00305           /* acc2 +=  x[2] * y[srcBLen - 1] */     
00306           acc2 += x2 * c0;     
00307      
00308           /* acc3 +=  x[3] * y[srcBLen - 1] */     
00309           acc3 += x3 * c0;     
00310      
00311           /* Read y[srcBLen - 2] sample */     
00312           c0 = *(py--);     
00313      
00314           /* Read x[4] sample */     
00315           x0 = *(px + 1u);     
00316      
00317           /* Perform the multiply-accumulate */     
00318           /* acc0 +=  x[1] * y[srcBLen - 2] */     
00319           acc0 += x1 * c0;     
00320           /* acc1 +=  x[2] * y[srcBLen - 2] */     
00321           acc1 += x2 * c0;     
00322           /* acc2 +=  x[3] * y[srcBLen - 2] */     
00323           acc2 += x3 * c0;     
00324           /* acc3 +=  x[4] * y[srcBLen - 2] */     
00325           acc3 += x0 * c0;     
00326      
00327           /* Read y[srcBLen - 3] sample */     
00328           c0 = *(py--);     
00329      
00330           /* Read x[5] sample */     
00331           x1 = *(px + 2u);     
00332      
00333           /* Perform the multiply-accumulates */     
00334           /* acc0 +=  x[2] * y[srcBLen - 3] */     
00335           acc0 += x2 * c0;     
00336           /* acc1 +=  x[3] * y[srcBLen - 2] */     
00337           acc1 += x3 * c0;     
00338           /* acc2 +=  x[4] * y[srcBLen - 2] */     
00339           acc2 += x0 * c0;     
00340           /* acc3 +=  x[5] * y[srcBLen - 2] */     
00341           acc3 += x1 * c0;     
00342      
00343           /* Read y[srcBLen - 4] sample */     
00344           c0 = *(py--);     
00345      
00346           /* Read x[6] sample */     
00347           x2 = *(px + 3u);     
00348      
00349           /* Perform the multiply-accumulates */     
00350           /* acc0 +=  x[3] * y[srcBLen - 4] */     
00351           acc0 += x3 * c0;     
00352           /* acc1 +=  x[4] * y[srcBLen - 4] */     
00353           acc1 += x0 * c0;     
00354           /* acc2 +=  x[5] * y[srcBLen - 4] */     
00355           acc2 += x1 * c0;     
00356           /* acc3 +=  x[6] * y[srcBLen - 4] */     
00357           acc3 += x2 * c0;     
00358   
00359           px += 4u;  
00360      
00361      
00362         } while(--k);     
00363      
00364         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00365          ** No loop unrolling is used. */     
00366         k = srcBLen % 0x4u;     
00367      
00368         while(k > 0u)     
00369         {     
00370           /* Read y[srcBLen - 5] sample */     
00371           c0 = *(py--);     
00372      
00373           /* Read x[7] sample */     
00374           x3 = *(px++);     
00375      
00376           /* Perform the multiply-accumulates */     
00377           /* acc0 +=  x[4] * y[srcBLen - 5] */     
00378           acc0 += x0 * c0;     
00379           /* acc1 +=  x[5] * y[srcBLen - 5] */     
00380           acc1 += x1 * c0;     
00381           /* acc2 +=  x[6] * y[srcBLen - 5] */     
00382           acc2 += x2 * c0;     
00383           /* acc3 +=  x[7] * y[srcBLen - 5] */     
00384           acc3 += x3 * c0;     
00385      
00386           /* Reuse the present samples for the next MAC */     
00387           x0 = x1;     
00388           x1 = x2;     
00389           x2 = x3;     
00390      
00391           /* Decrement the loop counter */     
00392           k--;     
00393         }     
00394      
00395         /* Store the result in the accumulator in the destination buffer. */     
00396         *pOut++ = acc0;     
00397         *pOut++ = acc1;     
00398         *pOut++ = acc2;     
00399         *pOut++ = acc3;     
00400      
00401         /* Increment the pointer pIn1 index, count by 1 */     
00402         count += 4u;     
00403      
00404         /* Update the inputA and inputB pointers for next MAC calculation */     
00405         px = pIn1 + count;     
00406         py = pSrc2;     
00407      
00408         /* Decrement the loop counter */     
00409         blkCnt--;     
00410       }     
00411      
00412       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.      
00413        ** No loop unrolling is used. */     
00414       blkCnt = (uint32_t) blockSize2 % 0x4u;     
00415      
00416       while(blkCnt > 0u)     
00417       {     
00418         /* Accumulator is made zero for every iteration */     
00419         sum = 0.0f;     
00420      
00421         /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00422         k = srcBLen >> 2u;     
00423      
00424         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00425          ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00426         while(k > 0u)     
00427         {     
00428           /* Perform the multiply-accumulates */     
00429           sum += *px++ * *py--;     
00430           sum += *px++ * *py--;     
00431           sum += *px++ * *py--;     
00432           sum += *px++ * *py--;     
00433      
00434           /* Decrement the loop counter */     
00435           k--;     
00436         }     
00437      
00438         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00439          ** No loop unrolling is used. */     
00440         k = srcBLen % 0x4u;     
00441      
00442         while(k > 0u)     
00443         {     
00444           /* Perform the multiply-accumulate */     
00445           sum += *px++ * *py--;     
00446      
00447           /* Decrement the loop counter */     
00448           k--;     
00449         }     
00450      
00451         /* Increment the MAC count */     
00452         count++;     
00453      
00454         /* Store the result in the accumulator in the destination buffer. */     
00455         *pOut++ = sum;     
00456      
00457         /* Update the inputA and inputB pointers for next MAC calculation */     
00458         px = pIn1 + count;     
00459         py = pSrc2;     
00460      
00461         /* Decrement the loop counter */     
00462         blkCnt--;     
00463       }     
00464     }     
00465     else     
00466     {     
00467       /* If the srcBLen is not a multiple of 4,      
00468        * the blockSize2 loop cannot be unrolled by 4 */     
00469       blkCnt = (uint32_t) blockSize2;     
00470      
00471       while(blkCnt > 0u)     
00472       {     
00473         /* Accumulator is made zero for every iteration */     
00474         sum = 0.0f;     
00475      
00476         /* srcBLen number of MACS should be performed */     
00477         k = srcBLen;     
00478      
00479         while(k > 0u)     
00480         {     
00481           /* Perform the multiply-accumulate */     
00482           sum += *px++ * *py--;     
00483      
00484           /* Decrement the loop counter */     
00485           k--;     
00486         }     
00487      
00488         /* Increment the MAC count */     
00489         count++;     
00490      
00491         /* Store the result in the accumulator in the destination buffer. */     
00492         *pOut++ = sum;     
00493      
00494         /* Update the inputA and inputB pointers for next MAC calculation */     
00495         px = pIn1 + count;     
00496         py = pSrc2;     
00497      
00498         /* Decrement the loop counter */     
00499         blkCnt--;     
00500       }     
00501     }     
00502      
00503      
00504     /* --------------------------      
00505      * Initializations of stage3      
00506      * -------------------------*/     
00507      
00508     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]      
00509      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]      
00510      * ....      
00511      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]      
00512      * sum +=  x[srcALen-1] * y[srcBLen-1]      
00513      */     
00514      
00515     /* In this stage the MAC operations are decreased by 1 for every iteration.      
00516        The count variable holds the number of MAC operations performed */     
00517     count = srcBLen - 1u;     
00518      
00519     /* Working pointer of inputA */     
00520     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);     
00521     px = pSrc1;     
00522      
00523     /* Working pointer of inputB */     
00524     pSrc2 = pIn2 + (srcBLen - 1u);     
00525     py = pSrc2;     
00526      
00527     while(blockSize3 > 0)     
00528     {     
00529       /* Accumulator is made zero for every iteration */     
00530       sum = 0.0f;     
00531      
00532       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00533       k = count >> 2u;     
00534      
00535       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00536        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00537       while(k > 0u)     
00538       {   
00539       /* Read x[srcALen - srcBLen + 1] */  
00540         x0 = *px++;  
00541       /* Read y[srcBLen - 1] */  
00542         c0 = *py--;  
00543           
00544         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */     
00545         sum += x0 * c0;     
00546           
00547         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */     
00548         sum += *px++ * *py--;     
00549           
00550         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */     
00551         sum += *px++ * *py--;     
00552           
00553         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */     
00554         sum += *px++ * *py--;     
00555      
00556         /* Decrement the loop counter */     
00557         k--;     
00558       }     
00559      
00560       /* If the count is not a multiple of 4, compute any remaining MACs here.      
00561        ** No loop unrolling is used. */     
00562       k = count % 0x4u;     
00563      
00564       while(k > 0u)     
00565       {     
00566         /* Perform the multiply-accumulates */     
00567         /* sum +=  x[srcALen-1] * y[srcBLen-1] */     
00568         sum += *px++ * *py--;     
00569      
00570         /* Decrement the loop counter */     
00571         k--;     
00572       }     
00573      
00574       /* Store the result in the accumulator in the destination buffer. */     
00575       *pOut++ = sum;     
00576      
00577       /* Update the inputA and inputB pointers for next MAC calculation */     
00578       px = ++pSrc1;     
00579       py = pSrc2;     
00580      
00581       /* Decrement the MAC count */     
00582       count--;     
00583      
00584       /* Decrement the loop counter */     
00585       blockSize3--;     
00586      
00587     }     
00588      
00589     /* set status as ARM_MATH_SUCCESS */     
00590     status = ARM_MATH_SUCCESS;     
00591   }     
00592      
00593   /* Return to application */     
00594   return (status);     
00595      
00596 }     
00597