Cortex-R DSP Software Library: arm_conv_partial_q31.c

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_conv_partial_q31.c      
00009 *      
00010 * Description:  Partial convolution of Q31 sequences.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024      
00025 #include "arm_math.h"     
00026      
00050 arm_status arm_conv_partial_q31(     
00051   q31_t * pSrcA,     
00052   uint32_t srcALen,     
00053   q31_t * pSrcB,     
00054   uint32_t srcBLen,     
00055   q31_t * pDst,     
00056   uint32_t firstIndex,     
00057   uint32_t numPoints)     
00058 {     
00059   q31_t *pIn1;                                   /* inputA pointer               */     
00060   q31_t *pIn2;                                   /* inputB pointer               */     
00061   q31_t *pOut = pDst;                            /* output pointer               */     
00062   q31_t *px;                                     /* Intermediate inputA pointer  */     
00063   q31_t *py;                                     /* Intermediate inputB pointer  */     
00064   q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers        */     
00065   q63_t sum, acc0, acc1, acc2;             /* Accumulator                  */     
00066   q31_t x0, x1, x2, c0;     
00067   uint32_t j, k, count, check, blkCnt;     
00068   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter                 */     
00069   arm_status status;                             /* status of Partial convolution */     
00070   q31_t c1;   
00071      
00072   /* Check for range of output samples to be calculated */     
00073   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))     
00074   {     
00075     /* Set status as ARM_MATH_ARGUMENT_ERROR */     
00076     status = ARM_MATH_ARGUMENT_ERROR;     
00077   }     
00078   else     
00079   {     
00080      
00081     /* The algorithm implementation is based on the lengths of the inputs. */     
00082     /* srcB is always made to slide across srcA. */     
00083     /* So srcBLen is always considered as shorter or equal to srcALen */     
00084     if(srcALen >= srcBLen)     
00085     {     
00086       /* Initialization of inputA pointer */     
00087       pIn1 = pSrcA;     
00088      
00089       /* Initialization of inputB pointer */     
00090       pIn2 = pSrcB;     
00091     }     
00092     else     
00093     {     
00094       /* Initialization of inputA pointer */     
00095       pIn1 = pSrcB;     
00096      
00097       /* Initialization of inputB pointer */     
00098       pIn2 = pSrcA;     
00099      
00100       /* srcBLen is always considered as shorter or equal to srcALen */     
00101       j = srcBLen;     
00102       srcBLen = srcALen;     
00103       srcALen = j;     
00104     }     
00105      
00106     /* Conditions to check which loopCounter holds      
00107      * the first and last indices of the output samples to be calculated. */     
00108     check = firstIndex + numPoints;     
00109     blockSize3 = ((int32_t) check - (int32_t) srcALen);     
00110     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;     
00111     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);     
00112     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :     
00113                                     (int32_t) numPoints) : 0;     
00114     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +     
00115                                     (int32_t) firstIndex);     
00116     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;     
00117      
00118     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */     
00119     /* The function is internally      
00120      * divided into three stages according to the number of multiplications that has to be      
00121      * taken place between inputA samples and inputB samples. In the first stage of the      
00122      * algorithm, the multiplications increase by one for every iteration.      
00123      * In the second stage of the algorithm, srcBLen number of multiplications are done.      
00124      * In the third stage of the algorithm, the multiplications decrease by one      
00125      * for every iteration. */     
00126      
00127     /* Set the output pointer to point to the firstIndex      
00128      * of the output sample to be calculated. */     
00129     pOut = pDst + firstIndex;     
00130      
00131     /* --------------------------      
00132      * Initializations of stage1      
00133      * -------------------------*/     
00134      
00135     /* sum = x[0] * y[0]      
00136      * sum = x[0] * y[1] + x[1] * y[0]      
00137      * ....      
00138      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]      
00139      */     
00140      
00141     /* In this stage the MAC operations are increased by 1 for every iteration.      
00142        The count variable holds the number of MAC operations performed.      
00143        Since the partial convolution starts from firstIndex      
00144        Number of Macs to be performed is firstIndex + 1 */     
00145     count = 1u + firstIndex;     
00146      
00147     /* Working pointer of inputA */     
00148     px = pIn1;     
00149      
00150     /* Working pointer of inputB */     
00151     pSrc2 = pIn2 + firstIndex;     
00152     py = pSrc2;     
00153      
00154     /* ------------------------      
00155      * Stage1 process      
00156      * ----------------------*/     
00157      
00158     /* The first loop starts here */     
00159     while(blockSize1 > 0)     
00160     {     
00161       /* Accumulator is made zero for every iteration */     
00162       sum = 0;     
00163      
00164       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00165       k = count >> 2u;     
00166      
00167       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00168        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00169       while(k > 0u)     
00170       {     
00171       x0 = *px++;  
00172       c0 = *py--;  
00173   
00174       x1 = *px++;  
00175       c1 = *py--;  
00176   
00177       /* x[0] * y[srcBLen - 1] */     
00178       sum += (q63_t) x0 * c0;  
00179                
00180       /* x[1] * y[srcBLen - 2] */     
00181       sum += (q63_t) x1 * c1;     
00182         
00183       x0 = *px++;  
00184       c0 = *py--;  
00185   
00186       x1 = *px++;  
00187       c1 = *py--;  
00188         
00189       /* x[2] * y[srcBLen - 3] */     
00190       sum += (q63_t) x0 * c0;  
00191   
00192       /* x[3] * y[srcBLen - 4] */     
00193       sum += (q63_t) x1 * c1;    
00194      
00195         /* Decrement the loop counter */     
00196         k--;     
00197       }     
00198      
00199       /* If the count is not a multiple of 4, compute any remaining MACs here.      
00200        ** No loop unrolling is used. */     
00201       k = count % 0x4u;     
00202      
00203       while(k > 0u)     
00204       {     
00205         /* Perform the multiply-accumulate */     
00206         sum += (q63_t) * px++ * (*py--);     
00207      
00208         /* Decrement the loop counter */     
00209         k--;     
00210       }     
00211      
00212       /* Store the result in the accumulator in the destination buffer. */     
00213       *pOut++ = (q31_t) (sum >> 31);     
00214      
00215       /* Update the inputA and inputB pointers for next MAC calculation */     
00216       py = ++pSrc2;     
00217       px = pIn1;     
00218      
00219       /* Increment the MAC count */     
00220       count++;     
00221      
00222       /* Decrement the loop counter */     
00223       blockSize1--;     
00224     }     
00225      
00226     /* --------------------------      
00227      * Initializations of stage2      
00228      * ------------------------*/     
00229      
00230     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]      
00231      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]      
00232      * ....      
00233      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]      
00234      */     
00235      
00236     /* Working pointer of inputA */     
00237     px = pIn1;     
00238      
00239     /* Working pointer of inputB */     
00240     pSrc2 = pIn2 + (srcBLen - 1u);     
00241     py = pSrc2;     
00242      
00243     /* count is index by which the pointer pIn1 to be incremented */     
00244     count = 0u;     
00245      
00246     /* -------------------      
00247      * Stage2 process      
00248      * ------------------*/     
00249      
00250     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.      
00251      * So, to loop unroll over blockSize2,      
00252      * srcBLen should be greater than or equal to 4 */     
00253     if(srcBLen >= 4u)     
00254     {     
00255       /* Loop unroll over blockSize2 */     
00256       //blkCnt = ((uint32_t) blockSize2 >> 2u);     
00257   
00258       blkCnt = blockSize2 / 3;  
00259      
00260       while(blkCnt > 0u)     
00261       {     
00262         /* Set all accumulators to zero */     
00263         acc0 = 0;     
00264         acc1 = 0;     
00265         acc2 = 0;     
00266         //acc3 = 0;     
00267      
00268         /* read x[0], x[1], x[2] samples */     
00269         x0 = *(px++);     
00270         x1 = *(px++);     
00271         //x2 = *(px++);     
00272      
00273         /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00274         //k = srcBLen >> 2u;     
00275   
00276         k = srcBLen / 3;  
00277      
00278         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00279          ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00280         do     
00281         {     
00282           /* Read y[srcBLen - 1] sample */     
00283           c0 = *(py);     
00284      
00285           /* Read x[3] sample */     
00286           x2 = *(px);     
00287      
00288           /* Perform the multiply-accumulates */     
00289           /* acc0 +=  x[0] * y[srcBLen - 1] */     
00290           acc0 += (q63_t) x0 *c0;     
00291           /* acc1 +=  x[1] * y[srcBLen - 1] */     
00292           acc1 += (q63_t) x1 *c0;     
00293           /* acc2 +=  x[2] * y[srcBLen - 1] */     
00294           acc2 += (q63_t) x2 *c0;     
00295           /* acc3 +=  x[3] * y[srcBLen - 1] */     
00296           //acc3 += (q63_t) x3 *c0;     
00297      
00298           /* Read y[srcBLen - 2] sample */     
00299           c0 = *(py - 1u);     
00300      
00301           /* Read x[4] sample */     
00302           x0 = *(px + 1u);     
00303      
00304           /* Perform the multiply-accumulate */     
00305           /* acc0 +=  x[1] * y[srcBLen - 2] */     
00306           acc0 += (q63_t) x1 *c0;     
00307           /* acc1 +=  x[2] * y[srcBLen - 2] */     
00308           acc1 += (q63_t) x2 *c0;     
00309           /* acc2 +=  x[3] * y[srcBLen - 2] */     
00310           acc2 += (q63_t) x0 *c0;     
00311           /* acc3 +=  x[4] * y[srcBLen - 2] */     
00312           //acc3 += (q63_t) x0 *c0;     
00313      
00314           /* Read y[srcBLen - 3] sample */     
00315           c0 = *(py - 2u);     
00316      
00317           /* Read x[5] sample */     
00318           x1 = *(px + 2u);     
00319      
00320           /* Perform the multiply-accumulates */     
00321           /* acc0 +=  x[2] * y[srcBLen - 3] */     
00322           acc0 += (q63_t) x2 *c0;     
00323           /* acc1 +=  x[3] * y[srcBLen - 2] */     
00324           acc1 += (q63_t) x0 *c0;     
00325           /* acc2 +=  x[4] * y[srcBLen - 2] */     
00326           acc2 += (q63_t) x1 *c0;     
00327           /* acc3 +=  x[5] * y[srcBLen - 2] */     
00328           //acc3 += (q63_t) x1 *c0;     
00329      
00330           px += 3u;  
00331   
00332           py -= 3u;  
00333   
00335           //c0 = *(py--);     
00336      
00338           //x2 = *(px++);     
00339      
00342           //acc0 += (q63_t) x3 *c0;     
00344           //acc1 += (q63_t) x0 *c0;     
00346           //acc2 += (q63_t) x1 *c0;     
00348           //acc3 += (q63_t) x2 *c0;     
00349      
00350         } while(--k);     
00351      
00352         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00353          ** No loop unrolling is used. */     
00354         //k = srcBLen % 0x4u;     
00355         k = srcBLen - ( 3 * (srcBLen/3));  
00356      
00357         while(k > 0u)     
00358         {     
00359           /* Read y[srcBLen - 5] sample */     
00360           c0 = *(py--);     
00361      
00362           /* Read x[7] sample */     
00363           x2 = *(px++);     
00364      
00365           /* Perform the multiply-accumulates */     
00366           /* acc0 +=  x[4] * y[srcBLen - 5] */     
00367           acc0 += (q63_t) x0 *c0;     
00368           /* acc1 +=  x[5] * y[srcBLen - 5] */     
00369           acc1 += (q63_t) x1 *c0;     
00370           /* acc2 +=  x[6] * y[srcBLen - 5] */     
00371           acc2 += (q63_t) x2 *c0;     
00372           /* acc3 +=  x[7] * y[srcBLen - 5] */     
00373           //acc3 += (q63_t) x3 *c0;     
00374      
00375           /* Reuse the present samples for the next MAC */     
00376           x0 = x1;     
00377           x1 = x2;     
00378           //x2 = x3;     
00379      
00380           /* Decrement the loop counter */     
00381           k--;     
00382         }     
00383      
00384          /* Increment the pointer pIn1 index, count by 1 */     
00385         count += 3u;     
00386      
00387        /* Store the result in the accumulator in the destination buffer. */     
00388         *pOut++ = (q31_t) (acc0 >> 31);     
00389         *pOut++ = (q31_t) (acc1 >> 31);     
00390         *pOut++ = (q31_t) (acc2 >> 31);     
00391         //*pOut++ = (q31_t) (acc3 >> 31);     
00392      
00393         /* Update the inputA and inputB pointers for next MAC calculation */     
00394         px = pIn1 + count;     
00395         py = pSrc2;     
00396      
00397         /* Decrement the loop counter */     
00398         blkCnt--;     
00399       }     
00400      
00401       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.      
00402        ** No loop unrolling is used. */     
00403       //blkCnt = (uint32_t) blockSize2 % 0x4u;     
00404         blkCnt =  blockSize2 - 3 * (blockSize2/3);  
00405   
00406      
00407       while(blkCnt > 0u)     
00408       {     
00409         /* Accumulator is made zero for every iteration */     
00410         sum = 0;     
00411      
00412         /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00413         k = srcBLen >> 2u;     
00414      
00415         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00416          ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00417         while(k > 0u)     
00418         {     
00419           /* Perform the multiply-accumulates */     
00420           sum += (q63_t) * px++ * (*py--);     
00421           sum += (q63_t) * px++ * (*py--);     
00422           sum += (q63_t) * px++ * (*py--);     
00423           sum += (q63_t) * px++ * (*py--);     
00424      
00425           /* Decrement the loop counter */     
00426           k--;     
00427         }     
00428      
00429         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00430          ** No loop unrolling is used. */     
00431         k = srcBLen % 0x4u;     
00432      
00433         while(k > 0u)     
00434         {     
00435           /* Perform the multiply-accumulate */     
00436           sum += (q63_t) * px++ * (*py--);     
00437      
00438           /* Decrement the loop counter */     
00439           k--;     
00440         }     
00441      
00442         /* Store the result in the accumulator in the destination buffer. */     
00443         *pOut++ = (q31_t) (sum >> 31);     
00444      
00445          /* Increment the MAC count */     
00446         count++;     
00447      
00448        /* Update the inputA and inputB pointers for next MAC calculation */     
00449         px = pIn1 + count;     
00450         py = pSrc2;     
00451      
00452         /* Decrement the loop counter */     
00453         blkCnt--;     
00454       }     
00455     }     
00456     else     
00457     {     
00458       /* If the srcBLen is not a multiple of 4,      
00459        * the blockSize2 loop cannot be unrolled by 4 */     
00460       blkCnt = (uint32_t) blockSize2;     
00461      
00462       while(blkCnt > 0u)     
00463       {     
00464         /* Accumulator is made zero for every iteration */     
00465         sum = 0;     
00466      
00467         /* srcBLen number of MACS should be performed */     
00468         k = srcBLen;     
00469      
00470         while(k > 0u)     
00471         {     
00472           /* Perform the multiply-accumulate */     
00473           sum += (q63_t) * px++ * (*py--);     
00474      
00475           /* Decrement the loop counter */     
00476           k--;     
00477         }     
00478      
00479         /* Store the result in the accumulator in the destination buffer. */     
00480         *pOut++ = (q31_t) (sum >> 31);     
00481      
00482         /* Increment the MAC count */     
00483         count++;     
00484      
00485         /* Update the inputA and inputB pointers for next MAC calculation */     
00486         px = pIn1 + count;     
00487         py = pSrc2;     
00488      
00489         /* Decrement the loop counter */     
00490         blkCnt--;     
00491       }     
00492     }     
00493      
00494      
00495     /* --------------------------      
00496      * Initializations of stage3      
00497      * -------------------------*/     
00498      
00499     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]      
00500      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]      
00501      * ....      
00502      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]      
00503      * sum +=  x[srcALen-1] * y[srcBLen-1]      
00504      */     
00505      
00506     /* In this stage the MAC operations are decreased by 1 for every iteration.      
00507        The blockSize3 variable holds the number of MAC operations performed */     
00508     count = srcBLen - 1u;     
00509      
00510     /* Working pointer of inputA */     
00511     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);     
00512     px = pSrc1;     
00513      
00514     /* Working pointer of inputB */     
00515     pSrc2 = pIn2 + (srcBLen - 1u);     
00516     py = pSrc2;     
00517      
00518     /* -------------------      
00519      * Stage3 process      
00520      * ------------------*/     
00521      
00522     while(blockSize3 > 0)     
00523     {     
00524       /* Accumulator is made zero for every iteration */     
00525       sum = 0;     
00526      
00527       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00528       k = count >> 2u;     
00529      
00530       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00531        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00532       while(k > 0u)     
00533       {     
00534         x0 = *px++;  
00535         c0 = *py--;  
00536   
00537         x1 = *px++;  
00538         c1 = *py--;  
00539   
00540         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */     
00541         sum += (q63_t) x0 * c0;    
00542                
00543         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */     
00544         sum += (q63_t) x1 * c1;   
00545             
00546         x0 = *px++;  
00547         c0 = *py--;  
00548   
00549         x1 = *px++;  
00550         c1 = *py--;  
00551                   
00552         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */     
00553         sum += (q63_t) x0 * c0;     
00554         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */     
00555         sum += (q63_t) x1 * c1;     
00556      
00557         /* Decrement the loop counter */     
00558         k--;     
00559       }     
00560      
00561       /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.      
00562        ** No loop unrolling is used. */     
00563       k = count % 0x4u;     
00564      
00565       while(k > 0u)     
00566       {     
00567         /* Perform the multiply-accumulate */     
00568         sum += (q63_t) * px++ * (*py--);     
00569      
00570         /* Decrement the loop counter */     
00571         k--;     
00572       }     
00573      
00574       /* Store the result in the accumulator in the destination buffer. */     
00575       *pOut++ = (q31_t) (sum >> 31);     
00576      
00577       /* Update the inputA and inputB pointers for next MAC calculation */     
00578       px = ++pSrc1;     
00579       py = pSrc2;     
00580      
00581       /* Decrement the MAC count */     
00582       count--;     
00583      
00584       /* Decrement the loop counter */     
00585       blockSize3--;     
00586      
00587     }     
00588      
00589     /* set status as ARM_MATH_SUCCESS */     
00590     status = ARM_MATH_SUCCESS;     
00591   }     
00592      
00593   /* Return to application */     
00594   return (status);     
00595      
00596 }     
00597