Cortex-R DSP Software Library: arm_conv_partial_fast

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_conv_partial_fast_q31.c      
00009 *      
00010 * Description:  Fast Q31 Partial convolution.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
00050 arm_status arm_conv_partial_fast_q31(     
00051   q31_t * pSrcA,     
00052   uint32_t srcALen,     
00053   q31_t * pSrcB,     
00054   uint32_t srcBLen,     
00055   q31_t * pDst,     
00056   uint32_t firstIndex,     
00057   uint32_t numPoints)     
00058 {     
00059   q31_t *pIn1;                                   /* inputA pointer               */     
00060   q31_t *pIn2;                                   /* inputB pointer               */     
00061   q31_t *pOut = pDst;                            /* output pointer               */     
00062   q31_t *px;                                     /* Intermediate inputA pointer  */     
00063   q31_t *py;                                     /* Intermediate inputB pointer  */     
00064   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */     
00065   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */     
00066   q31_t x0, x1, x2, x3, c0;     
00067   uint32_t j, k, count, check, blkCnt;     
00068   int32_t blockSize1, blockSize2, blockSize3;    /* loop counters                 */     
00069   arm_status status;                             /* status of Partial convolution */     
00070      
00071      
00072   /* Check for range of output samples to be calculated */     
00073   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))     
00074   {     
00075     /* Set status as ARM_MATH_ARGUMENT_ERROR */     
00076     status = ARM_MATH_ARGUMENT_ERROR;     
00077   }     
00078   else     
00079   {     
00080      
00081     /* The algorithm implementation is based on the lengths of the inputs. */     
00082     /* srcB is always made to slide across srcA. */     
00083     /* So srcBLen is always considered as shorter or equal to srcALen */     
00084     if(srcALen >= srcBLen)     
00085     {     
00086       /* Initialization of inputA pointer */     
00087       pIn1 = pSrcA;     
00088      
00089       /* Initialization of inputB pointer */     
00090       pIn2 = pSrcB;     
00091     }     
00092     else     
00093     {     
00094       /* Initialization of inputA pointer */     
00095       pIn1 = pSrcB;     
00096      
00097       /* Initialization of inputB pointer */     
00098       pIn2 = pSrcA;     
00099      
00100       /* srcBLen is always considered as shorter or equal to srcALen */     
00101       j = srcBLen;     
00102       srcBLen = srcALen;     
00103       srcALen = j;     
00104     }     
00105      
00106     /* Conditions to check which loopCounter holds      
00107      * the first and last indices of the output samples to be calculated. */     
00108     check = firstIndex + numPoints;     
00109     blockSize3 = ((int32_t) check - (int32_t) srcALen);     
00110     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;     
00111     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);     
00112     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :     
00113                                     (int32_t) numPoints) : 0;     
00114     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +      
00115                                     (int32_t) firstIndex);     
00116     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;     
00117      
00118     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */     
00119     /* The function is internally      
00120      * divided into three stages according to the number of multiplications that has to be      
00121      * taken place between inputA samples and inputB samples. In the first stage of the      
00122      * algorithm, the multiplications increase by one for every iteration.      
00123      * In the second stage of the algorithm, srcBLen number of multiplications are done.      
00124      * In the third stage of the algorithm, the multiplications decrease by one      
00125      * for every iteration. */     
00126      
00127     /* Set the output pointer to point to the firstIndex      
00128      * of the output sample to be calculated. */     
00129     pOut = pDst + firstIndex;     
00130      
00131     /* --------------------------      
00132      * Initializations of stage1      
00133      * -------------------------*/     
00134      
00135     /* sum = x[0] * y[0]      
00136      * sum = x[0] * y[1] + x[1] * y[0]      
00137      * ....      
00138      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]      
00139      */     
00140      
00141     /* In this stage the MAC operations are increased by 1 for every iteration.      
00142        The count variable holds the number of MAC operations performed.      
00143        Since the partial convolution starts from firstIndex      
00144        Number of Macs to be performed is firstIndex + 1 */     
00145     count = 1u + firstIndex;     
00146      
00147     /* Working pointer of inputA */     
00148     px = pIn1;     
00149      
00150     /* Working pointer of inputB */     
00151     pSrc2 = pIn2 + firstIndex;     
00152     py = pSrc2;     
00153      
00154     /* ------------------------      
00155      * Stage1 process      
00156      * ----------------------*/     
00157      
00158     /* The first loop starts here */     
00159     while(blockSize1 > 0)     
00160     {     
00161       /* Accumulator is made zero for every iteration */     
00162       sum = 0;     
00163      
00164       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00165       k = count >> 2u;     
00166      
00167       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00168        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00169       while(k > 0u)     
00170       {     
00171         /* x[0] * y[srcBLen - 1] */     
00172         sum = (q31_t) ((((q63_t) sum << 32) +      
00173                         ((q63_t) * px++ * (*py--))) >> 32);     
00174      
00175         /* x[1] * y[srcBLen - 2] */     
00176         sum = (q31_t) ((((q63_t) sum << 32) +      
00177                         ((q63_t) * px++ * (*py--))) >> 32);     
00178      
00179         /* x[2] * y[srcBLen - 3] */     
00180         sum = (q31_t) ((((q63_t) sum << 32) +      
00181                         ((q63_t) * px++ * (*py--))) >> 32);     
00182      
00183         /* x[3] * y[srcBLen - 4] */     
00184         sum = (q31_t) ((((q63_t) sum << 32) +      
00185                         ((q63_t) * px++ * (*py--))) >> 32);     
00186      
00187         /* Decrement the loop counter */     
00188         k--;     
00189       }     
00190      
00191       /* If the count is not a multiple of 4, compute any remaining MACs here.      
00192        ** No loop unrolling is used. */     
00193       k = count % 0x4u;     
00194      
00195       while(k > 0u)     
00196       {     
00197         /* Perform the multiply-accumulates */     
00198         sum = (q31_t) ((((q63_t) sum << 32) +      
00199                         ((q63_t) * px++ * (*py--))) >> 32);     
00200      
00201         /* Decrement the loop counter */     
00202         k--;     
00203       }     
00204      
00205       /* Store the result in the accumulator in the destination buffer. */     
00206       *pOut++ = sum << 1;     
00207      
00208       /* Update the inputA and inputB pointers for next MAC calculation */     
00209       py = ++pSrc2;     
00210       px = pIn1;     
00211      
00212       /* Increment the MAC count */     
00213       count++;     
00214      
00215       /* Decrement the loop counter */     
00216       blockSize1--;     
00217     }     
00218      
00219     /* --------------------------      
00220      * Initializations of stage2      
00221      * ------------------------*/     
00222      
00223     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]      
00224      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]      
00225      * ....      
00226      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]      
00227      */     
00228      
00229     /* Working pointer of inputA */     
00230     px = pIn1;     
00231      
00232     /* Working pointer of inputB */     
00233     pSrc2 = pIn2 + (srcBLen - 1u);     
00234     py = pSrc2;     
00235      
00236     /* count is index by which the pointer pIn1 to be incremented */     
00237     count = 0u;     
00238      
00239     /* -------------------      
00240      * Stage2 process      
00241      * ------------------*/     
00242      
00243     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.      
00244      * So, to loop unroll over blockSize2,      
00245      * srcBLen should be greater than or equal to 4 */     
00246     if(srcBLen >= 4u)     
00247     {     
00248       /* Loop unroll over blockSize2 */     
00249       blkCnt = ((uint32_t) blockSize2 >> 2u);     
00250      
00251       while(blkCnt > 0u)     
00252       {     
00253         /* Set all accumulators to zero */     
00254         acc0 = 0;     
00255         acc1 = 0;     
00256         acc2 = 0;     
00257         acc3 = 0;     
00258      
00259         /* read x[0], x[1], x[2] samples */     
00260         x0 = *(px++);     
00261         x1 = *(px++);     
00262         x2 = *(px++);     
00263      
00264         /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00265         k = srcBLen >> 2u;     
00266      
00267         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00268          ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00269         do     
00270         {     
00271           /* Read y[srcBLen - 1] sample */     
00272           c0 = *(py--);     
00273      
00274           /* Read x[3] sample */     
00275           x3 = *(px++);     
00276      
00277           /* Perform the multiply-accumulate */     
00278           /* acc0 +=  x[0] * y[srcBLen - 1] */     
00279           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);     
00280      
00281           /* acc1 +=  x[1] * y[srcBLen - 1] */     
00282           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);     
00283      
00284           /* acc2 +=  x[2] * y[srcBLen - 1] */     
00285           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);     
00286      
00287           /* acc3 +=  x[3] * y[srcBLen - 1] */     
00288           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);     
00289      
00290           /* Read y[srcBLen - 2] sample */     
00291           c0 = *(py--);     
00292      
00293           /* Read x[4] sample */     
00294           x0 = *(px++);     
00295      
00296           /* Perform the multiply-accumulate */     
00297           /* acc0 +=  x[1] * y[srcBLen - 2] */     
00298           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);     
00299           /* acc1 +=  x[2] * y[srcBLen - 2] */     
00300           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);     
00301           /* acc2 +=  x[3] * y[srcBLen - 2] */     
00302           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);     
00303           /* acc3 +=  x[4] * y[srcBLen - 2] */     
00304           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);     
00305      
00306           /* Read y[srcBLen - 3] sample */     
00307           c0 = *(py--);     
00308      
00309           /* Read x[5] sample */     
00310           x1 = *(px++);     
00311      
00312           /* Perform the multiply-accumulates */     
00313           /* acc0 +=  x[2] * y[srcBLen - 3] */     
00314           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);     
00315           /* acc1 +=  x[3] * y[srcBLen - 2] */     
00316           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);     
00317           /* acc2 +=  x[4] * y[srcBLen - 2] */     
00318           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);     
00319           /* acc3 +=  x[5] * y[srcBLen - 2] */     
00320           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);     
00321      
00322           /* Read y[srcBLen - 4] sample */     
00323           c0 = *(py--);     
00324      
00325           /* Read x[6] sample */     
00326           x2 = *(px++);     
00327      
00328           /* Perform the multiply-accumulates */     
00329           /* acc0 +=  x[3] * y[srcBLen - 4] */     
00330           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);     
00331           /* acc1 +=  x[4] * y[srcBLen - 4] */     
00332           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);     
00333           /* acc2 +=  x[5] * y[srcBLen - 4] */     
00334           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);     
00335           /* acc3 +=  x[6] * y[srcBLen - 4] */     
00336           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);     
00337      
00338      
00339         } while(--k);     
00340      
00341         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00342          ** No loop unrolling is used. */     
00343         k = srcBLen % 0x4u;     
00344      
00345         while(k > 0u)     
00346         {     
00347           /* Read y[srcBLen - 5] sample */     
00348           c0 = *(py--);     
00349      
00350           /* Read x[7] sample */     
00351           x3 = *(px++);     
00352      
00353           /* Perform the multiply-accumulates */     
00354           /* acc0 +=  x[4] * y[srcBLen - 5] */     
00355           acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);     
00356           /* acc1 +=  x[5] * y[srcBLen - 5] */     
00357           acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);     
00358           /* acc2 +=  x[6] * y[srcBLen - 5] */     
00359           acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);     
00360           /* acc3 +=  x[7] * y[srcBLen - 5] */     
00361           acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);     
00362      
00363           /* Reuse the present samples for the next MAC */     
00364           x0 = x1;     
00365           x1 = x2;     
00366           x2 = x3;     
00367      
00368           /* Decrement the loop counter */     
00369           k--;     
00370         }     
00371      
00372         /* Store the result in the accumulator in the destination buffer. */     
00373         *pOut++ = (q31_t) (acc0 << 1);     
00374         *pOut++ = (q31_t) (acc1 << 1);     
00375         *pOut++ = (q31_t) (acc2 << 1);     
00376         *pOut++ = (q31_t) (acc3 << 1);     
00377      
00378          /* Increment the pointer pIn1 index, count by 1 */     
00379         count += 4u;     
00380      
00381        /* Update the inputA and inputB pointers for next MAC calculation */     
00382         px = pIn1 + count;     
00383         py = pSrc2;     
00384      
00385         /* Decrement the loop counter */     
00386         blkCnt--;     
00387       }     
00388      
00389       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.      
00390        ** No loop unrolling is used. */     
00391       blkCnt = (uint32_t) blockSize2 % 0x4u;     
00392      
00393       while(blkCnt > 0u)     
00394       {     
00395         /* Accumulator is made zero for every iteration */     
00396         sum = 0;     
00397      
00398         /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00399         k = srcBLen >> 2u;     
00400      
00401         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00402          ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00403         while(k > 0u)     
00404         {     
00405           /* Perform the multiply-accumulates */     
00406           sum = (q31_t) ((((q63_t) sum << 32) +     
00407                           ((q63_t) * px++ * (*py--))) >> 32);     
00408           sum = (q31_t) ((((q63_t) sum << 32) +     
00409                           ((q63_t) * px++ * (*py--))) >> 32);     
00410           sum = (q31_t) ((((q63_t) sum << 32) +     
00411                           ((q63_t) * px++ * (*py--))) >> 32);     
00412           sum = (q31_t) ((((q63_t) sum << 32) +     
00413                           ((q63_t) * px++ * (*py--))) >> 32);     
00414      
00415           /* Decrement the loop counter */     
00416           k--;     
00417         }     
00418      
00419         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00420          ** No loop unrolling is used. */     
00421         k = srcBLen % 0x4u;     
00422      
00423         while(k > 0u)     
00424         {     
00425           /* Perform the multiply-accumulate */     
00426           sum = (q31_t) ((((q63_t) sum << 32) +     
00427                           ((q63_t) * px++ * (*py--))) >> 32);     
00428      
00429           /* Decrement the loop counter */     
00430           k--;     
00431         }     
00432      
00433         /* Store the result in the accumulator in the destination buffer. */     
00434         *pOut++ = sum << 1;     
00435      
00436         /* Increment the MAC count */     
00437         count++;     
00438      
00439         /* Update the inputA and inputB pointers for next MAC calculation */     
00440         px = pIn1 + count;     
00441         py = pSrc2;     
00442      
00443         /* Decrement the loop counter */     
00444         blkCnt--;     
00445       }     
00446     }     
00447     else     
00448     {     
00449       /* If the srcBLen is not a multiple of 4,      
00450        * the blockSize2 loop cannot be unrolled by 4 */     
00451       blkCnt = (uint32_t) blockSize2;     
00452      
00453       while(blkCnt > 0u)     
00454       {     
00455         /* Accumulator is made zero for every iteration */     
00456         sum = 0;     
00457      
00458         /* srcBLen number of MACS should be performed */     
00459         k = srcBLen;     
00460      
00461         while(k > 0u)     
00462         {     
00463           /* Perform the multiply-accumulate */     
00464           sum = (q31_t) ((((q63_t) sum << 32) +     
00465                           ((q63_t) * px++ * (*py--))) >> 32);     
00466      
00467           /* Decrement the loop counter */     
00468           k--;     
00469         }     
00470      
00471         /* Store the result in the accumulator in the destination buffer. */     
00472         *pOut++ = sum << 1;     
00473      
00474          /* Increment the MAC count */     
00475         count++;     
00476      
00477        /* Update the inputA and inputB pointers for next MAC calculation */     
00478         px = pIn1 + count;     
00479         py = pSrc2;     
00480      
00481         /* Decrement the loop counter */     
00482         blkCnt--;     
00483       }     
00484     }     
00485      
00486      
00487     /* --------------------------      
00488      * Initializations of stage3      
00489      * -------------------------*/     
00490      
00491     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]      
00492      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]      
00493      * ....      
00494      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]      
00495      * sum +=  x[srcALen-1] * y[srcBLen-1]      
00496      */     
00497      
00498     /* In this stage the MAC operations are decreased by 1 for every iteration.      
00499        The count variable holds the number of MAC operations performed */     
00500     count = srcBLen - 1u;     
00501      
00502     /* Working pointer of inputA */     
00503     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);     
00504     px = pSrc1;     
00505      
00506     /* Working pointer of inputB */     
00507     pSrc2 = pIn2 + (srcBLen - 1u);     
00508     py = pSrc2;     
00509      
00510     /* -------------------      
00511      * Stage3 process      
00512      * ------------------*/     
00513      
00514     while(blockSize3 > 0)     
00515     {     
00516       /* Accumulator is made zero for every iteration */     
00517       sum = 0;     
00518      
00519       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00520       k = count >> 2u;     
00521      
00522       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00523        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00524       while(k > 0u)     
00525       {     
00526         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */     
00527         sum = (q31_t) ((((q63_t) sum << 32) +      
00528                         ((q63_t) * px++ * (*py--))) >> 32);     
00529      
00530         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */     
00531         sum = (q31_t) ((((q63_t) sum << 32) +      
00532                         ((q63_t) * px++ * (*py--))) >> 32);     
00533      
00534         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */     
00535         sum = (q31_t) ((((q63_t) sum << 32) +      
00536                         ((q63_t) * px++ * (*py--))) >> 32);     
00537      
00538         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */     
00539         sum = (q31_t) ((((q63_t) sum << 32) +      
00540                         ((q63_t) * px++ * (*py--))) >> 32);     
00541      
00542         /* Decrement the loop counter */     
00543         k--;     
00544       }     
00545      
00546       /* If the count is not a multiple of 4, compute any remaining MACs here.      
00547        ** No loop unrolling is used. */     
00548       k = count % 0x4u;     
00549      
00550       while(k > 0u)     
00551       {     
00552         /* Perform the multiply-accumulates */     
00553         /* sum +=  x[srcALen-1] * y[srcBLen-1] */     
00554         sum = (q31_t) ((((q63_t) sum << 32) +      
00555                         ((q63_t) * px++ * (*py--))) >> 32);     
00556      
00557         /* Decrement the loop counter */     
00558         k--;     
00559       }     
00560      
00561       /* Store the result in the accumulator in the destination buffer. */     
00562       *pOut++ = sum << 1;     
00563      
00564       /* Update the inputA and inputB pointers for next MAC calculation */     
00565       px = ++pSrc1;     
00566       py = pSrc2;     
00567      
00568       /* Decrement the MAC count */     
00569       count--;     
00570      
00571       /* Decrement the loop counter */     
00572       blockSize3--;     
00573      
00574     }     
00575      
00576     /* set status as ARM_MATH_SUCCESS */     
00577     status = ARM_MATH_SUCCESS;     
00578   }     
00579      
00580   /* Return to application */     
00581   return (status);     
00582      
00583 }     
00584