/* Cortex-R DSP Software Library: arm_correlate_fast_q31.c (source listing) */

/* ----------------------------------------------------------------------
* Copyright (C) 2011 ARM Limited. All rights reserved.
*
* $Date:        15. December 2011
* $Revision:    V2.0.0
*
* Project:      Cortex-R DSP Library
* Title:        arm_correlate_fast_q31.c
*
* Description:  Fast Q31 Correlation.
*
* Target Processor:          Cortex-R4/R5
*
* Version 1.0.0 2011/03/08
*     Alpha release.
*
* Version 1.0.1 2011/09/30
*     Beta release.
*
* Version 2.0.0 2011/12/15
*     Final release.
*
* -------------------------------------------------------------------- */
#include "arm_math.h"
#include <stdint.h>

00064 void arm_correlate_fast_q31(     
00065   q31_t * pSrcA,     
00066   uint32_t srcALen,     
00067   q31_t * pSrcB,     
00068   uint32_t srcBLen,     
00069   q31_t * pDst)     
00070 {     
00071   q31_t *pIn1;                                   /* inputA pointer               */     
00072   q31_t *pIn2;                                   /* inputB pointer               */     
00073   q31_t *pOut = pDst;                            /* output pointer               */     
00074   q31_t *px;                                     /* Intermediate inputA pointer  */     
00075   q31_t *py;                                     /* Intermediate inputB pointer  */     
00076   q31_t *pSrc1;                                  /* Intermediate pointers        */     
00077   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators                  */     
00078   q31_t x0, x1, x2, x3, c0;                      /* temporary variables for holding input and coefficient values */     
00079   uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3;  /* loop counter                 */     
00080   int32_t inc = 1;                               /* Destination address modifier */     
00081      
00082      
00083   /* The algorithm implementation is based on the lengths of the inputs. */     
00084   /* srcB is always made to slide across srcA. */     
00085   /* So srcBLen is always considered as shorter or equal to srcALen */     
00086   if(srcALen >= srcBLen)     
00087   {     
00088     /* Initialization of inputA pointer */     
00089     pIn1 = (pSrcA);     
00090      
00091     /* Initialization of inputB pointer */     
00092     pIn2 = (pSrcB);     
00093      
00094     /* Number of output samples is calculated */     
00095     outBlockSize = (2u * srcALen) - 1u;     
00096      
00097     /* When srcALen > srcBLen, zero padding is done to srcB      
00098      * to make their lengths equal.      
00099      * Instead, (outBlockSize - (srcALen + srcBLen - 1))      
00100      * number of output samples are made zero */     
00101     j = outBlockSize - (srcALen + (srcBLen - 1u));     
00102      
00103     /* Updating the pointer position to non zero value */    
00104     pOut += j;    
00105      
00106   }     
00107   else     
00108   {     
00109     /* Initialization of inputA pointer */     
00110     pIn1 = (pSrcB);     
00111      
00112     /* Initialization of inputB pointer */     
00113     pIn2 = (pSrcA);     
00114      
00115     /* srcBLen is always considered as shorter or equal to srcALen */     
00116     j = srcBLen;     
00117     srcBLen = srcALen;     
00118     srcALen = j;     
00119      
00120     /* CORR(x, y) = Reverse order(CORR(y, x)) */     
00121     /* Hence set the destination pointer to point to the last output sample */     
00122     pOut = pDst + ((srcALen + srcBLen) - 2u);     
00123      
00124     /* Destination address modifier is set to -1 */     
00125     inc = -1;     
00126      
00127   }     
00128      
00129   /* The function is internally      
00130    * divided into three parts according to the number of multiplications that has to be      
00131    * taken place between inputA samples and inputB samples. In the first part of the      
00132    * algorithm, the multiplications increase by one for every iteration.      
00133    * In the second part of the algorithm, srcBLen number of multiplications are done.      
00134    * In the third part of the algorithm, the multiplications decrease by one      
00135    * for every iteration.*/     
00136   /* The algorithm is implemented in three stages.      
00137    * The loop counters of each stage is initiated here. */     
00138   blockSize1 = srcBLen - 1u;     
00139   blockSize2 = srcALen - (srcBLen - 1u);     
00140   blockSize3 = blockSize1;     
00141      
00142   /* --------------------------      
00143    * Initializations of stage1      
00144    * -------------------------*/     
00145      
00146   /* sum = x[0] * y[srcBlen - 1]      
00147    * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]      
00148    * ....      
00149    * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]      
00150    */     
00151      
00152   /* In this stage the MAC operations are increased by 1 for every iteration.      
00153      The count variable holds the number of MAC operations performed */     
00154   count = 1u;     
00155      
00156   /* Working pointer of inputA */     
00157   px = pIn1;     
00158      
00159   /* Working pointer of inputB */     
00160   pSrc1 = pIn2 + (srcBLen - 1u);     
00161   py = pSrc1;     
00162      
00163   /* ------------------------      
00164    * Stage1 process      
00165    * ----------------------*/     
00166      
00167   /* The first stage starts here */     
00168   while(blockSize1 > 0u)     
00169   {     
00170     /* Accumulator is made zero for every iteration */     
00171     sum = 0;     
00172      
00173     /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00174     k = count >> 2;     
00175      
00176     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00177      ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00178     while(k > 0u)     
00179     {     
00180   
00181   
00182       /* x[0] * y[srcBLen - 4] */     
00183       sum = (q31_t) ((((q63_t) sum << 32) +     
00184                       ((q63_t) * px++ * (*py++))) >> 32);     
00185       /* x[1] * y[srcBLen - 3] */     
00186       sum = (q31_t) ((((q63_t) sum << 32) +     
00187                       ((q63_t) * px++ * (*py++))) >> 32);     
00188   
00189       /* x[2] * y[srcBLen - 2] */     
00190       sum = (q31_t) ((((q63_t) sum << 32) +     
00191                       ((q63_t) * px++ * (*py++))) >> 32);     
00192       /* x[3] * y[srcBLen - 1] */     
00193       sum = (q31_t) ((((q63_t) sum << 32) +     
00194                       ((q63_t) * px++ * (*py++))) >> 32);     
00195      
00196       /* Decrement the loop counter */     
00197       k--;     
00198     }     
00199      
00200     /* If the count is not a multiple of 4, compute any remaining MACs here.      
00201      ** No loop unrolling is used. */     
00202     k = count % 0x4u;     
00203      
00204     while(k > 0u)     
00205     {     
00206       /* Perform the multiply-accumulates */     
00207       /* x[0] * y[srcBLen - 1] */     
00208       sum = (q31_t) ((((q63_t) sum << 32) +     
00209                       ((q63_t) * px++ * (*py++))) >> 32);     
00210      
00211       /* Decrement the loop counter */     
00212       k--;     
00213     }     
00214      
00215     /* Store the result in the accumulator in the destination buffer. */     
00216     *pOut = sum << 1;     
00217     /* Destination pointer is updated according to the address modifier, inc */     
00218     pOut += inc;     
00219      
00220     /* Update the inputA and inputB pointers for next MAC calculation */     
00221     py = pSrc1 - count;     
00222     px = pIn1;     
00223      
00224     /* Increment the MAC count */     
00225     count++;     
00226      
00227     /* Decrement the loop counter */     
00228     blockSize1--;     
00229   }     
00230      
00231   /* --------------------------      
00232    * Initializations of stage2      
00233    * ------------------------*/     
00234      
00235   /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]      
00236    * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]      
00237    * ....      
00238    * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]      
00239    */     
00240      
00241   /* Working pointer of inputA */     
00242   px = pIn1;     
00243      
00244   /* Working pointer of inputB */     
00245   py = pIn2;     
00246      
00247   /* count is index by which the pointer pIn1 to be incremented */     
00248   count = 0u;     
00249      
00250   /* -------------------      
00251    * Stage2 process      
00252    * ------------------*/     
00253      
00254   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.      
00255    * So, to loop unroll over blockSize2,      
00256    * srcBLen should be greater than or equal to 4 */     
00257   if(srcBLen >= 4u)     
00258   {     
00259     /* Loop unroll over blockSize2, by 4 */     
00260     blkCnt = blockSize2 >> 2u;     
00261      
00262     while(blkCnt > 0u)     
00263     {     
00264       /* Set all accumulators to zero */     
00265       acc0 = 0;     
00266       acc1 = 0;     
00267       acc2 = 0;     
00268       acc3 = 0;     
00269      
00270       /* read x[0], x[1], x[2] samples */     
00271       x0 = *(px++);     
00272       x1 = *(px++);     
00273       x2 = *(px++);     
00274      
00275       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00276       k = srcBLen >> 2u;     
00277      
00278       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00279        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00280       do     
00281       {     
00282         /* Read y[0] sample */     
00283         c0 = *(py++);     
00284      
00285         /* Read x[3] sample */     
00286         x3 = *(px++);     
00287      
00288         /* Perform the multiply-accumulate */     
00289         /* acc0 +=  x[0] * y[0] */     
00290         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);     
00291         /* acc1 +=  x[1] * y[0] */     
00292         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);     
00293         /* acc2 +=  x[2] * y[0] */     
00294         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);     
00295         /* acc3 +=  x[3] * y[0] */     
00296         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);     
00297      
00298         /* Read y[1] sample */     
00299         c0 = *(py++);     
00300      
00301         /* Read x[4] sample */     
00302         x0 = *(px++);     
00303      
00304         /* Perform the multiply-accumulates */     
00305         /* acc0 +=  x[1] * y[1] */     
00306         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);     
00307         /* acc1 +=  x[2] * y[1] */     
00308         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);     
00309         /* acc2 +=  x[3] * y[1] */     
00310         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);     
00311         /* acc3 +=  x[4] * y[1] */     
00312         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);     
00313      
00314         /* Read y[2] sample */     
00315         c0 = *(py++);     
00316      
00317         /* Read x[5] sample */     
00318         x1 = *(px++);     
00319      
00320         /* Perform the multiply-accumulates */     
00321         /* acc0 +=  x[2] * y[2] */     
00322         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);     
00323         /* acc1 +=  x[3] * y[2] */     
00324         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);     
00325         /* acc2 +=  x[4] * y[2] */     
00326         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);     
00327         /* acc3 +=  x[5] * y[2] */     
00328         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);     
00329      
00330         /* Read y[3] sample */     
00331         c0 = *(py++);     
00332      
00333         /* Read x[6] sample */     
00334         x2 = *(px++);     
00335      
00336         /* Perform the multiply-accumulates */     
00337         /* acc0 +=  x[3] * y[3] */     
00338         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);     
00339         /* acc1 +=  x[4] * y[3] */     
00340         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);     
00341         /* acc2 +=  x[5] * y[3] */     
00342         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);     
00343         /* acc3 +=  x[6] * y[3] */     
00344         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);     
00345      
00346      
00347       } while(--k);     
00348      
00349       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00350        ** No loop unrolling is used. */     
00351       k = srcBLen % 0x4u;     
00352      
00353       while(k > 0u)     
00354       {     
00355         /* Read y[4] sample */     
00356         c0 = *(py++);     
00357      
00358         /* Read x[7] sample */     
00359         x3 = *(px++);     
00360      
00361         /* Perform the multiply-accumulates */     
00362         /* acc0 +=  x[4] * y[4] */     
00363         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);     
00364         /* acc1 +=  x[5] * y[4] */     
00365         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);     
00366         /* acc2 +=  x[6] * y[4] */     
00367         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);     
00368         /* acc3 +=  x[7] * y[4] */     
00369         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);     
00370      
00371         /* Reuse the present samples for the next MAC */     
00372         x0 = x1;     
00373         x1 = x2;     
00374         x2 = x3;     
00375      
00376         /* Decrement the loop counter */     
00377         k--;     
00378       }     
00379      
00380       /* Store the result in the accumulator in the destination buffer. */     
00381       *pOut = (q31_t) (acc0 << 1);     
00382       /* Destination pointer is updated according to the address modifier, inc */     
00383       pOut += inc;     
00384      
00385       *pOut = (q31_t) (acc1 << 1);     
00386       pOut += inc;     
00387      
00388       *pOut = (q31_t) (acc2 << 1);     
00389       pOut += inc;     
00390      
00391       *pOut = (q31_t) (acc3 << 1);     
00392       pOut += inc;     
00393      
00394       /* Increment the pointer pIn1 index, count by 1 */     
00395       count += 4u;     
00396      
00397       /* Update the inputA and inputB pointers for next MAC calculation */     
00398       px = pIn1 + count;     
00399       py = pIn2;     
00400      
00401       /* Decrement the loop counter */     
00402       blkCnt--;     
00403     }     
00404      
00405     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.      
00406      ** No loop unrolling is used. */     
00407     blkCnt = blockSize2 % 0x4u;     
00408      
00409     while(blkCnt > 0u)     
00410     {     
00411       /* Accumulator is made zero for every iteration */     
00412       sum = 0;     
00413      
00414       /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00415       k = srcBLen >> 2u;     
00416      
00417       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00418        ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00419       while(k > 0u)     
00420       {     
00421         /* Perform the multiply-accumulates */     
00422         sum = (q31_t) ((((q63_t) sum << 32) +     
00423                         ((q63_t) * px++ * (*py++))) >> 32);     
00424         sum = (q31_t) ((((q63_t) sum << 32) +     
00425                         ((q63_t) * px++ * (*py++))) >> 32);     
00426         sum = (q31_t) ((((q63_t) sum << 32) +     
00427                         ((q63_t) * px++ * (*py++))) >> 32);     
00428         sum = (q31_t) ((((q63_t) sum << 32) +     
00429                         ((q63_t) * px++ * (*py++))) >> 32);     
00430      
00431         /* Decrement the loop counter */     
00432         k--;     
00433       }     
00434      
00435       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.      
00436        ** No loop unrolling is used. */     
00437       k = srcBLen % 0x4u;     
00438      
00439       while(k > 0u)     
00440       {     
00441         /* Perform the multiply-accumulate */     
00442         sum = (q31_t) ((((q63_t) sum << 32) +     
00443                         ((q63_t) * px++ * (*py++))) >> 32);     
00444      
00445         /* Decrement the loop counter */     
00446         k--;     
00447       }     
00448      
00449       /* Store the result in the accumulator in the destination buffer. */     
00450       *pOut = sum << 1;     
00451       /* Destination pointer is updated according to the address modifier, inc */     
00452       pOut += inc;     
00453      
00454       /* Increment the MAC count */     
00455       count++;     
00456      
00457       /* Update the inputA and inputB pointers for next MAC calculation */     
00458       px = pIn1 + count;     
00459       py = pIn2;     
00460      
00461       /* Decrement the loop counter */     
00462       blkCnt--;     
00463     }     
00464   }     
00465   else     
00466   {     
00467     /* If the srcBLen is not a multiple of 4,      
00468      * the blockSize2 loop cannot be unrolled by 4 */     
00469     blkCnt = blockSize2;     
00470      
00471     while(blkCnt > 0u)     
00472     {     
00473       /* Accumulator is made zero for every iteration */     
00474       sum = 0;     
00475      
00476       /* Loop over srcBLen */     
00477       k = srcBLen;     
00478      
00479       while(k > 0u)     
00480       {     
00481         /* Perform the multiply-accumulate */     
00482         sum = (q31_t) ((((q63_t) sum << 32) +     
00483                         ((q63_t) * px++ * (*py++))) >> 32);     
00484      
00485         /* Decrement the loop counter */     
00486         k--;     
00487       }     
00488      
00489       /* Store the result in the accumulator in the destination buffer. */     
00490       *pOut = sum << 1;     
00491       /* Destination pointer is updated according to the address modifier, inc */     
00492       pOut += inc;     
00493      
00494       /* Increment the MAC count */     
00495       count++;     
00496      
00497       /* Update the inputA and inputB pointers for next MAC calculation */     
00498       px = pIn1 + count;     
00499       py = pIn2;     
00500      
00501       /* Decrement the loop counter */     
00502       blkCnt--;     
00503     }     
00504   }     
00505      
00506   /* --------------------------      
00507    * Initializations of stage3      
00508    * -------------------------*/     
00509      
00510   /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]      
00511    * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]      
00512    * ....      
00513    * sum +=  x[srcALen-2] * y[0] + x[srcALen-1] * y[1]      
00514    * sum +=  x[srcALen-1] * y[0]      
00515    */     
00516      
00517   /* In this stage the MAC operations are decreased by 1 for every iteration.      
00518      The count variable holds the number of MAC operations performed */     
00519   count = srcBLen - 1u;     
00520      
00521   /* Working pointer of inputA */     
00522   pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u;     
00523   px = pSrc1;     
00524      
00525   /* Working pointer of inputB */     
00526   py = pIn2;     
00527      
00528   /* -------------------      
00529    * Stage3 process      
00530    * ------------------*/     
00531      
00532   while(blockSize3 > 0u)     
00533   {     
00534     /* Accumulator is made zero for every iteration */     
00535     sum = 0;     
00536      
00537     /* Apply loop unrolling and compute 4 MACs simultaneously. */     
00538     k = count >> 2u;     
00539      
00540     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.      
00541      ** a second loop below computes MACs for the remaining 1 to 3 samples. */     
00542     while(k > 0u)     
00543     {     
00544       /* Perform the multiply-accumulates */     
00545       /* sum += x[srcALen - srcBLen + 4] * y[3] */     
00546       sum = (q31_t) ((((q63_t) sum << 32) +     
00547                       ((q63_t) * px++ * (*py++))) >> 32);     
00548       /* sum += x[srcALen - srcBLen + 3] * y[2] */     
00549       sum = (q31_t) ((((q63_t) sum << 32) +     
00550                       ((q63_t) * px++ * (*py++))) >> 32);     
00551       /* sum += x[srcALen - srcBLen + 2] * y[1] */     
00552       sum = (q31_t) ((((q63_t) sum << 32) +     
00553                       ((q63_t) * px++ * (*py++))) >> 32);     
00554       /* sum += x[srcALen - srcBLen + 1] * y[0] */     
00555       sum = (q31_t) ((((q63_t) sum << 32) +     
00556                       ((q63_t) * px++ * (*py++))) >> 32);     
00557      
00558       /* Decrement the loop counter */     
00559       k--;     
00560     }     
00561      
00562     /* If the count is not a multiple of 4, compute any remaining MACs here.      
00563      ** No loop unrolling is used. */     
00564     k = count % 0x4u;     
00565      
00566     while(k > 0u)     
00567     {     
00568       /* Perform the multiply-accumulates */     
00569       sum = (q31_t) ((((q63_t) sum << 32) +     
00570                       ((q63_t) * px++ * (*py++))) >> 32);     
00571      
00572       /* Decrement the loop counter */     
00573       k--;     
00574     }     
00575      
00576     /* Store the result in the accumulator in the destination buffer. */     
00577     *pOut = sum << 1;     
00578     /* Destination pointer is updated according to the address modifier, inc */     
00579     pOut += inc;     
00580      
00581     /* Update the inputA and inputB pointers for next MAC calculation */     
00582     px = ++pSrc1;     
00583     py = pIn2;     
00584      
00585     /* Decrement the MAC count */     
00586     count--;     
00587      
00588     /* Decrement the loop counter */     
00589     blockSize3--;     
00590   }     
00591      
00592 }     
00593