Cortex-R DSP Software Library: arm_correlate

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_correlate_f32.c      
00009 *      
00010 * Description:   Correlation of floating-point sequences.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------------- */     
00024      
00025 #include "arm_math.h"     
00026      
/**
 * @brief Single-accumulator dot product used by the correlation stages.
 *
 * Computes pX[0]*pY[0] + pX[1]*pY[1] + ... + pX[numMacs-1]*pY[numMacs-1].
 * The products are added to one accumulator strictly in index order; the
 * loop is unrolled by four only to reduce loop overhead, so the summation
 * order is the same as that of a plain element-by-element loop.
 *
 * @param[in] pX       first operand, at least numMacs readable samples
 * @param[in] pY       second operand, at least numMacs readable samples
 * @param[in] numMacs  number of multiply-accumulates; 0 yields 0.0f
 * @return the accumulated sum
 */
static float32_t arm_correlate_f32_dot(
  const float32_t * pX,
  const float32_t * pY,
  uint32_t numMacs)
{
  float32_t sum = 0.0f;                          /* Accumulator */
  uint32_t k;                                    /* Loop counter */

  /* Unrolled part: 4 MACs per iteration */
  k = numMacs >> 2u;

  while(k > 0u)
  {
    sum += *pX++ * *pY++;
    sum += *pX++ * *pY++;
    sum += *pX++ * *pY++;
    sum += *pX++ * *pY++;

    k--;
  }

  /* Remaining 0 to 3 MACs */
  k = numMacs % 0x4u;

  while(k > 0u)
  {
    sum += *pX++ * *pY++;

    k--;
  }

  return sum;
}

/**
 * @brief Correlation of two floating-point sequences.
 *
 * The shorter sequence is slid across the longer one and one dot product is
 * produced per alignment, (srcALen + srcBLen - 1) results in total.
 *
 * Layout of the results in pDst:
 *  - srcALen >= srcBLen: the first (srcALen - srcBLen) elements of pDst are
 *    skipped and the results are stored in increasing address order from
 *    pDst[srcALen - srcBLen] up to pDst[2 * srcALen - 2].
 *  - srcALen <  srcBLen: the inputs are swapped internally and the results
 *    are stored in decreasing address order from pDst[srcALen + srcBLen - 2]
 *    down to pDst[0], because CORR(x, y) is the reverse of CORR(y, x).
 *
 * The padding elements are NOT written by this function. A caller that wants
 * the zero-padded result of length 2 * max(srcALen, srcBLen) - 1 must clear
 * pDst before the call.
 *
 * If either length is 0 the function returns without touching pDst.
 *
 * @param[in]  pSrcA    first input sequence
 * @param[in]  srcALen  number of samples in pSrcA
 * @param[in]  pSrcB    second input sequence
 * @param[in]  srcBLen  number of samples in pSrcB
 * @param[out] pDst     output buffer, see the layout description above
 */
void arm_correlate_f32(
  float32_t * pSrcA,
  uint32_t srcALen,
  float32_t * pSrcB,
  uint32_t srcBLen,
  float32_t * pDst)
{
  float32_t *pIn1;                               /* Longer input ("x") */
  float32_t *pIn2;                               /* Shorter input ("y") */
  float32_t *pOut = pDst;                        /* Output pointer */
  float32_t *px;                                 /* Working pointer into x */
  float32_t *py;                                 /* Working pointer into y */
  float32_t acc0, acc1, acc2, acc3;              /* Accumulators of the 4-output kernel */
  float32_t x0, x1, x2, x3, c0;                  /* Cached x samples and current y sample */
  uint32_t j, k, count, blkCnt, outBlockSize;    /* Loop counters and sizes */
  uint32_t blockSize1, blockSize2, blockSize3;   /* Number of outputs of each stage */
  int32_t inc = 1;                               /* Destination address modifier */

  /* Without this guard (srcBLen - 1u) below would wrap around for an empty
   * input and the stage loops would run far outside the buffers. */
  if((srcALen == 0u) || (srcBLen == 0u))
  {
    return;
  }

  /* The rest of the function requires srcBLen <= srcALen, i.e. y is the
   * sequence that slides across x. */
  if(srcALen >= srcBLen)
  {
    pIn1 = pSrcA;
    pIn2 = pSrcB;

    /* Length of the zero-padded result */
    outBlockSize = (2u * srcALen) - 1u;

    /* Number of leading padding samples: outBlockSize minus the
     * (srcALen + srcBLen - 1) results that are actually computed.
     * They are skipped here, not written. */
    j = outBlockSize - (srcALen + (srcBLen - 1u));

    pOut += j;
  }
  else
  {
    /* Swap the roles of the inputs so that y is the shorter one */
    pIn1 = pSrcB;
    pIn2 = pSrcA;

    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* CORR(x, y) = reverse order of CORR(y, x):
     * start at the last computed sample and walk backwards. */
    pOut = pDst + ((srcALen + srcBLen) - 2u);
    inc = -1;
  }

  /* Stage 1: the overlap grows from 1 to srcBLen - 1 samples.
   * Stage 2: y overlaps x completely, srcBLen MACs per output.
   * Stage 3: the overlap shrinks from srcBLen - 1 to 1 samples. */
  blockSize1 = srcBLen - 1u;
  blockSize2 = srcALen - (srcBLen - 1u);
  blockSize3 = blockSize1;

  /* ------------------------
   * Stage 1
   * ----------------------*/

  /* out = x[0] * y[srcBLen-1]
   * out = x[0] * y[srcBLen-2] + x[1] * y[srcBLen-1]
   * ....
   * out = x[0] * y[1] + ... + x[srcBLen-2] * y[srcBLen-1]
   */
  count = 1u;                                    /* MACs of the current output */
  py = pIn2 + (srcBLen - 1u);                    /* Starts at the last y sample */

  while(blockSize1 > 0u)
  {
    *pOut = arm_correlate_f32_dot(pIn1, py, count);
    pOut += inc;

    /* The next output uses one more sample of y, starting one earlier */
    py--;
    count++;

    blockSize1--;
  }

  /* ------------------------
   * Stage 2
   * ----------------------*/

  /* out = x[0] * y[0] + x[1] * y[1] + ... + x[srcBLen-1] * y[srcBLen-1]
   * out = x[1] * y[0] + x[2] * y[1] + ... + x[srcBLen]   * y[srcBLen-1]
   * ....
   * out = x[srcALen-srcBLen] * y[0] + ... + x[srcALen-1] * y[srcBLen-1]
   */
  count = 0u;                                    /* Offset into x of the current output */

  /* The 4-output kernel unrolls the inner loop by four and enters it
   * through a do-while, so it needs at least four samples in y. */
  if(srcBLen >= 4u)
  {
    blkCnt = blockSize2 >> 2u;

    while(blkCnt > 0u)
    {
      px = pIn1 + count;
      py = pIn2;

      acc0 = 0.0f;
      acc1 = 0.0f;
      acc2 = 0.0f;
      acc3 = 0.0f;

      /* Preload the first three x samples; every further x sample is
       * loaded once and shared by the four accumulators. */
      x0 = *(px++);
      x1 = *(px++);
      x2 = *(px++);

      k = srcBLen >> 2u;

      /* k >= 1 is guaranteed by the srcBLen >= 4 check above */
      do
      {
        /* y[0] against x[0..3] */
        c0 = *(py);
        x3 = *(px++);

        acc0 += x0 * c0;
        acc1 += x1 * c0;
        acc2 += x2 * c0;
        acc3 += x3 * c0;

        /* y[1] against x[1..4] */
        c0 = *(py + 1u);
        x0 = *(px++);

        acc0 += x1 * c0;
        acc1 += x2 * c0;
        acc2 += x3 * c0;
        acc3 += x0 * c0;

        /* y[2] against x[2..5] */
        c0 = *(py + 2u);
        x1 = *(px++);

        acc0 += x2 * c0;
        acc1 += x3 * c0;
        acc2 += x0 * c0;
        acc3 += x1 * c0;

        /* y[3] against x[3..6] */
        c0 = *(py + 3u);
        x2 = *(px++);

        acc0 += x3 * c0;
        acc1 += x0 * c0;
        acc2 += x1 * c0;
        acc3 += x2 * c0;

        py += 4u;

      } while(--k);

      /* Remaining 0 to 3 samples of y, one at a time */
      k = srcBLen % 0x4u;

      while(k > 0u)
      {
        c0 = *(py++);
        x3 = *(px++);

        acc0 += x0 * c0;
        acc1 += x1 * c0;
        acc2 += x2 * c0;
        acc3 += x3 * c0;

        /* Shift the cached x samples by one position */
        x0 = x1;
        x1 = x2;
        x2 = x3;

        k--;
      }

      *pOut = acc0;
      pOut += inc;

      *pOut = acc1;
      pOut += inc;

      *pOut = acc2;
      pOut += inc;

      *pOut = acc3;
      pOut += inc;

      /* Four outputs were produced */
      count += 4u;

      blkCnt--;
    }

    /* Outputs left over by the 4-output kernel */
    blkCnt = blockSize2 % 0x4u;
  }
  else
  {
    /* y is shorter than four samples: every output is computed singly */
    blkCnt = blockSize2;
  }

  while(blkCnt > 0u)
  {
    *pOut = arm_correlate_f32_dot(pIn1 + count, pIn2, srcBLen);
    pOut += inc;

    count++;

    blkCnt--;
  }

  /* ------------------------
   * Stage 3
   * ----------------------*/

  /* out = x[srcALen-srcBLen+1] * y[0] + ... + x[srcALen-1] * y[srcBLen-2]
   * out = x[srcALen-srcBLen+2] * y[0] + ... + x[srcALen-1] * y[srcBLen-3]
   * ....
   * out = x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
   * out = x[srcALen-1] * y[0]
   */
  count = srcBLen - 1u;                          /* MACs of the current output */
  px = pIn1 + (srcALen - (srcBLen - 1u));

  while(blockSize3 > 0u)
  {
    *pOut = arm_correlate_f32_dot(px, pIn2, count);
    pOut += inc;

    /* The next output starts one x sample later and uses one MAC less */
    px++;
    count--;

    blockSize3--;
  }

}
00618