Cortex-R DSP Software Library: arm_correlate

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_correlate_q31.c      
00009 *      
00010 * Description:  Correlation of Q31 sequences.    
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024      
00025 #include "arm_math.h"     
00026      
00062 void arm_correlate_q31(     
00063   q31_t * pSrcA,     
00064   uint32_t srcALen,     
00065   q31_t * pSrcB,     
00066   uint32_t srcBLen,     
00067   q31_t * pDst)     
00068 {     
00069   q31_t *pIn1;                        /* inputA pointer               */     
00070   q31_t *pIn2;                        /* inputB pointer               */     
00071   q31_t *pOut = pDst;                 /* output pointer               */     
00072   q31_t *px;                          /* Intermediate inputA pointer  */     
00073   q31_t *py;                          /* Intermediate inputB pointer  */     
00074   q31_t *pSrc1;                       /* Intermediate pointers        */     
00075   q63_t sum, acc0, acc1, acc2;        /* Accumulators                  */     
00076   q31_t x0, x1, x2, c0, c1;           /* temporary variables for holding input and coefficient values */     
00077   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */     
00078   int32_t inc = 1;                    /* Destination address modifier */     
00079      
00080      
00081   /* The algorithm implementation is based on the lengths of the inputs. */     
00082   /* srcB is always made to slide across srcA. */     
00083   /* So srcBLen is always considered as shorter or equal to srcALen */     
00084   /* But CORR(x, y) is reverse of CORR(y, x) */     
00085   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */     
00086   /* and the destination pointer modifier, inc is set to -1 */     
00087   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */     
00088   /* But to improve the performance,      
00089    * we include zeroes in the output instead of zero padding either of the the inputs*/     
00090   /* If srcALen > srcBLen,      
00091    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */     
00092   /* If srcALen < srcBLen,      
00093    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */     
00094     
00095   if(srcALen >= srcBLen)     
00096   {     
00097     /* Initialization of inputA pointer */     
00098     pIn1 = (pSrcA);     
00099      
00100     /* Initialization of inputB pointer */     
00101     pIn2 = (pSrcB);     
00102      
00103     /* Number of output samples is calculated */     
00104     outBlockSize = (2u * srcALen) - 1u;     
00105      
00106     /* When srcALen > srcBLen, zero padding is done to srcB      
00107      * to make their lengths equal.      
00108      * Instead, (outBlockSize - (srcALen + srcBLen - 1))      
00109      * number of output samples are made zero */     
00110     j = outBlockSize - (srcALen + (srcBLen - 1u));     
00111      
00112     /* Updating the pointer position to non zero value */    
00113     pOut += j;    
00114      
00115   }     
00116   else     
00117   {     
00118     /* Initialization of inputA pointer */     
00119     pIn1 = (pSrcB);     
00120      
00121     /* Initialization of inputB pointer */     
00122     pIn2 = (pSrcA);     
00123      
00124     /* srcBLen is always considered as shorter or equal to srcALen */     
00125     j = srcBLen;     
00126     srcBLen = srcALen;     
00127     srcALen = j;     
00128      
00129     /* CORR(x, y) = Reverse order(CORR(y, x)) */     
00130     /* Hence set the destination pointer to point to the last output sample */     
00131     pOut = pDst + ((srcALen + srcBLen) - 2u);     
00132      
00133     /* Destination address modifier is set to -1 */     
00134     inc = -1;     
00135      
00136   }     
00137      
00138   /* The function is internally      
00139    * divided into three parts according to the number of multiplications that has to be      
00140    * taken place between inputA samples and inputB samples. In the first part of the      
00141    * algorithm, the multiplications increase by one for every iteration.      
00142    * In the second part of the algorithm, srcBLen number of multiplications are done.      
00143    * In the third part of the algorithm, the multiplications decrease by one      
00144    * for every iteration.*/     
00145   /* The algorithm is implemented in three stages.      
00146    * The loop counters of each stage is initiated here. */     
00147   blockSize1 = srcBLen - 1u;     
00148   blockSize2 = srcALen - (srcBLen - 1u);     
00149   blockSize3 = blockSize1;     
00150      
00151   /* --------------------------      
00152    * Initializations of stage1      
00153    * -------------------------*/     
00154      
00155   /* sum = x[0] * y[srcBlen - 1]      
00156    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]      
00157    * ....      
00158    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]      
00159    */     
00160      
00161   /* In this stage the MAC operations are increased by 1 for every iteration.      
00162      The count variable holds the number of MAC operations performed */     
00163   count = 1u;     
00164      
00165   /* Working pointer of inputA */     
00166   px = pIn1;     
00167      
00168   /* Working pointer of inputB */     
00169   pSrc1 = pIn2 + (srcBLen - 1u);     
00170   py = pSrc1;     
00171      
00172   /* ------------------------      
00173    * Stage1 process      
00174    * ----------------------*/     
00175      
00176   /* The first stage starts here */     
00177   while(blockSize1 > 0u)     
00178   {     
00179     /* Accumulator is made zero for every iteration */     
00180     sum = 0;     
00181      
00182     /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00183     k = count >> 2;     
00184      
00185     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00186      ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00187     while(k > 0u)     
00188     {     
00189       /* Read x[0] */  
00190       x0 = *px++;  
00191       /* Read y[srcBLen - 4] */  
00192       c0 = *py++;  
00193   
00194       /* Read x[1] */  
00195       x1 = *px++;  
00196       /* Read y[srcBLen - 3] */  
00197       c1 = *py++;  
00198   
00199       /* x[0] * y[srcBLen - 4] */     
00200       sum += (q63_t) x0 * c0;  
00201                
00202       /* x[1] * y[srcBLen - 3] */     
00203       sum += (q63_t) x1 * c1;     
00204         
00205       /* Read x[2] */       
00206       x0 = *px++;  
00207       /* Read y[srcBLen - 2] */  
00208       c0 = *py++;  
00209   
00210       /* Read x[3] */  
00211       x1 = *px++;  
00212       /* Read y[srcBLen - 1] */  
00213       c1 = *py++;  
00214         
00215       /* x[2] * y[srcBLen - 2] */     
00216       sum += (q63_t) x0 * c0;  
00217   
00218       /* x[3] * y[srcBLen - 1] */     
00219       sum += (q63_t) x1 * c1;     
00220      
00221       /* Decrement the loop counter */     
00222       k--;     
00223     }     
00224      
00225     /* If the count is not a multiple of 4, compute any remaining MACs here.      
00226      ** No loop unrolling is used. */     
00227     k = count % 0x4u;     
00228      
00229     while(k > 0u)     
00230     {     
00231       /* Perform the multiply-accumulates */     
00232       /* x[0] * y[srcBLen - 1] */     
00233       sum += (q63_t) * px++ * (*py++);     
00234      
00235       /* Decrement the loop counter */     
00236       k--;     
00237     }     
00238      
00239     /* Store the result in the accumulator in the destination buffer. */     
00240     *pOut = (q31_t) (sum >> 31);     
00241     /* Destination pointer is updated according to the address modifier, inc */     
00242     pOut += inc;     
00243      
00244     /* Update the inputA and inputB pointers for next MAC calculation */     
00245     py = pSrc1 - count;     
00246     px = pIn1;     
00247      
00248     /* Increment the MAC count */     
00249     count++;     
00250      
00251     /* Decrement the loop counter */     
00252     blockSize1--;     
00253   }     
00254      
00255   /* --------------------------      
00256    * Initializations of stage2      
00257    * ------------------------*/     
00258      
00259   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]      
00260    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]      
00261    * ....      
00262    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]      
00263    */     
00264      
00265   /* Working pointer of inputA */     
00266   px = pIn1;     
00267      
00268   /* Working pointer of inputB */     
00269   py = pIn2;     
00270      
00271   /* count is index by which the pointer pIn1 to be incremented */     
00272   count = 0u;     
00273      
00274   /* -------------------      
00275    * Stage2 process      
00276    * ------------------*/     
00277      
00278   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.      
00279    * So, to loop unroll over blockSize2,      
00280    * srcBLen should be greater than or equal to 4 */     
00281   if(srcBLen >= 4u)     
00282   {     
00283     /* Loop unroll by 3 */     
00284     blkCnt = blockSize2 / 3;  
00285      
00286     while(blkCnt > 0u)     
00287     {     
00288       /* Set all accumulators to zero */     
00289       acc0 = 0;     
00290       acc1 = 0;     
00291       acc2 = 0;     
00292      
00293       /* read x[0], x[1], x[2] samples */     
00294       x0 = *(px++);     
00295       x1 = *(px++);     
00296      
00297       /* Apply loop unrolling and compute 3 MACs simultaneously. */     
00298       k = srcBLen / 3;  
00299      
00300       /* First part of the processing with loop unrolling.  Compute 3 MACs at a time.      
00301        ** a second loop below computes MACs for the remaining 1 to 2 samples. */     
00302       do     
00303       {     
00304         /* Read y[0] sample */     
00305         c0 = *(py);     
00306      
00307         /* Read x[3] sample */     
00308         x2 = *(px);     
00309      
00310         /* Perform the multiply-accumulate */     
00311         /* acc0 +=  x[0] * y[0] */     
00312         acc0 += ((q63_t) x0 * c0);     
00313         /* acc1 +=  x[1] * y[0] */     
00314         acc1 += ((q63_t) x1 * c0);     
00315         /* acc2 +=  x[2] * y[0] */     
00316         acc2 += ((q63_t) x2 * c0);     
00317      
00318         /* Read y[1] sample */     
00319         c0 = *(py + 1u);     
00320      
00321         /* Read x[4] sample */     
00322         x0 = *(px + 1u);     
00323      
00324         /* Perform the multiply-accumulates */     
00325         /* acc0 +=  x[1] * y[1] */     
00326         acc0 += ((q63_t) x1 * c0);     
00327         /* acc1 +=  x[2] * y[1] */     
00328         acc1 += ((q63_t) x2 * c0);     
00329         /* acc2 +=  x[3] * y[1] */     
00330         acc2 += ((q63_t) x0 * c0);     
00331      
00332         /* Read y[2] sample */     
00333         c0 = *(py + 2u);     
00334      
00335         /* Read x[5] sample */     
00336         x1 = *(px + 2u);     
00337      
00338         /* Perform the multiply-accumulates */     
00339         /* acc0 +=  x[2] * y[2] */     
00340         acc0 += ((q63_t) x2 * c0);     
00341         /* acc1 +=  x[3] * y[2] */     
00342         acc1 += ((q63_t) x0 * c0);     
00343         /* acc2 +=  x[4] * y[2] */     
00344         acc2 += ((q63_t) x1 * c0);     
00345   
00346         /* update scratch pointers */  
00347         px += 3u;  
00348         py += 3u;  
00349      
00350       } while(--k);     
00351      
00352       /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.      
00353        ** No loop unrolling is used. */     
00354       k = srcBLen - ( 3 * (srcBLen/3));  
00355      
00356       while(k > 0u)     
00357       {     
00358         /* Read y[4] sample */     
00359         c0 = *(py++);     
00360      
00361         /* Read x[7] sample */     
00362         x2 = *(px++);     
00363      
00364         /* Perform the multiply-accumulates */     
00365         /* acc0 +=  x[4] * y[4] */     
00366         acc0 += ((q63_t) x0 * c0);     
00367         /* acc1 +=  x[5] * y[4] */     
00368         acc1 += ((q63_t) x1 * c0);     
00369         /* acc2 +=  x[6] * y[4] */     
00370         acc2 += ((q63_t) x2 * c0);     
00371      
00372         /* Reuse the present samples for the next MAC */     
00373         x0 = x1;     
00374         x1 = x2;     
00375      
00376         /* Decrement the loop counter */     
00377         k--;     
00378       }     
00379      
00380       /* Store the result in the accumulator in the destination buffer. */     
00381       *pOut = (q31_t) (acc0 >> 31);     
00382       /* Destination pointer is updated according to the address modifier, inc */     
00383       pOut += inc;     
00384      
00385       *pOut = (q31_t) (acc1 >> 31);     
00386       pOut += inc;     
00387      
00388       *pOut = (q31_t) (acc2 >> 31);     
00389       pOut += inc;     
00390      
00391       /* Increment the pointer pIn1 index, count by 1 */     
00392       count += 3u;     
00393      
00394       /* Update the inputA and inputB pointers for next MAC calculation */     
00395       px = pIn1 + count;     
00396       py = pIn2;     
00397      
00398       /* Decrement the loop counter */     
00399       blkCnt--;     
00400     }     
00401      
00402     /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.      
00403      ** No loop unrolling is used. */     
00404     blkCnt =  blockSize2 - 3 * (blockSize2/3);    
00405      
00406     while(blkCnt > 0u)     
00407     {     
00408       /* Accumulator is made zero for every iteration */     
00409       sum = 0;     
00410      
00411       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00412       k = srcBLen >> 2u;     
00413      
00414       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00415        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00416       while(k > 0u)     
00417       {     
00418         /* Perform the multiply-accumulates */     
00419         sum += (q63_t) * px++ * (*py++);     
00420         sum += (q63_t) * px++ * (*py++);     
00421         sum += (q63_t) * px++ * (*py++);     
00422         sum += (q63_t) * px++ * (*py++);     
00423      
00424         /* Decrement the loop counter */     
00425         k--;     
00426       }     
00427      
00428       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00429        ** No loop unrolling is used. */     
00430       k = srcBLen % 0x4u;     
00431      
00432       while(k > 0u)     
00433       {     
00434         /* Perform the multiply-accumulate */     
00435         sum += (q63_t) * px++ * (*py++);     
00436      
00437         /* Decrement the loop counter */     
00438         k--;     
00439       }     
00440      
00441       /* Store the result in the accumulator in the destination buffer. */     
00442       *pOut = (q31_t) (sum >> 31);     
00443       /* Destination pointer is updated according to the address modifier, inc */     
00444       pOut += inc;     
00445      
00446       /* Increment the MAC count */     
00447       count++;     
00448      
00449       /* Update the inputA and inputB pointers for next MAC calculation */     
00450       px = pIn1 + count;     
00451       py = pIn2;     
00452      
00453       /* Decrement the loop counter */     
00454       blkCnt--;     
00455     }     
00456   }     
00457   else     
00458   {     
00459     /* If the srcBLen is not a multiple of 4,      
00460      * the blockSize2 loop cannot be unrolled by 4 */     
00461     blkCnt = blockSize2;     
00462      
00463     while(blkCnt > 0u)     
00464     {     
00465       /* Accumulator is made zero for every iteration */     
00466       sum = 0;     
00467      
00468       /* srcBLen number of MACS should be performed */     
00469       k = srcBLen;     
00470      
00471       while(k > 0u)     
00472       {     
00473         /* Perform the multiply-accumulate */     
00474         sum += (q63_t) * px++ * (*py++);     
00475      
00476         /* Decrement the loop counter */     
00477         k--;     
00478       }     
00479      
00480       /* Store the result in the accumulator in the destination buffer. */     
00481       *pOut = (q31_t) (sum >> 31);     
00482       /* Destination pointer is updated according to the address modifier, inc */     
00483       pOut += inc;     
00484      
00485       /* Increment the MAC count */     
00486       count++;     
00487      
00488       /* Update the inputA and inputB pointers for next MAC calculation */     
00489       px = pIn1 + count;     
00490       py = pIn2;     
00491      
00492       /* Decrement the loop counter */     
00493       blkCnt--;     
00494     }     
00495   }     
00496      
00497   /* --------------------------      
00498    * Initializations of stage3      
00499    * -------------------------*/     
00500      
00501   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]      
00502    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]      
00503    * ....      
00504    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]      
00505    * sum +=  x[srcALen-1] * y[0]      
00506    */     
00507      
00508   /* In this stage the MAC operations are decreased by 1 for every iteration.      
00509      The count variable holds the number of MAC operations performed */     
00510   count = srcBLen - 1u;     
00511      
00512   /* Working pointer of inputA */     
00513   pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));     
00514   px = pSrc1;     
00515      
00516   /* Working pointer of inputB */     
00517   py = pIn2;     
00518      
00519   /* -------------------      
00520    * Stage3 process      
00521    * ------------------*/     
00522      
00523   while(blockSize3 > 0u)     
00524   {     
00525     /* Accumulator is made zero for every iteration */     
00526     sum = 0;     
00527      
00528     /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00529     k = count >> 2u;     
00530      
00531     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00532      ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00533     while(k > 0u)     
00534     {     
00535       x0 = *px++;  
00536       c0 = *py++;  
00537   
00538       x1 = *px++;  
00539       c1 = *py++;  
00540   
00541       /* Perform the multiply-accumulates */     
00542       /* sum += x[srcALen - srcBLen + 4] * y[3] */     
00543       sum += (q63_t) x0 * c0;    
00544       /* sum += x[srcALen - srcBLen + 3] * y[2] */     
00545       sum += (q63_t) x1 * c1;  
00546          
00547       x0 = *px++;  
00548       c0 = *py++;  
00549   
00550       x1 = *px++;  
00551       c1 = *py++;  
00552   
00553       /* sum += x[srcALen - srcBLen + 2] * y[1] */     
00554       sum += (q63_t) x0 * c0;     
00555       /* sum += x[srcALen - srcBLen + 1] * y[0] */     
00556       sum += (q63_t) x1 * c1;     
00557      
00558       /* Decrement the loop counter */     
00559       k--;     
00560     }     
00561      
00562     /* If the count is not a multiple of 4, compute any remaining MACs here.      
00563      ** No loop unrolling is used. */     
00564     k = count % 0x4u;     
00565      
00566     while(k > 0u)     
00567     {     
00568       /* Perform the multiply-accumulates */     
00569       sum += (q63_t) * px++ * (*py++);     
00570      
00571       /* Decrement the loop counter */     
00572       k--;     
00573     }     
00574      
00575     /* Store the result in the accumulator in the destination buffer. */     
00576     *pOut = (q31_t) (sum >> 31);     
00577     /* Destination pointer is updated according to the address modifier, inc */     
00578     pOut += inc;     
00579      
00580     /* Update the inputA and inputB pointers for next MAC calculation */     
00581     px = ++pSrc1;     
00582     py = pIn2;     
00583      
00584     /* Decrement the MAC count */     
00585     count--;     
00586      
00587     /* Decrement the loop counter */     
00588     blockSize3--;     
00589   }     
00590      
00591 }     
00592