Cortex-R DSP Software Library: arm_dot_prod_f32.c

Go to the documentation of this file.
/* ----------------------------------------------------------------------
* Copyright (C) 2011 ARM Limited. All rights reserved.
*
* $Date:        15. December 2011
* $Revision:    V2.0.0
*
* Project:      Cortex-R DSP Library
* Title:        arm_dot_prod_f32.c
*
* Description:  Floating-point dot product.
*
* Target Processor:          Cortex-R4/R5
*
* Version 1.0.0 2011/03/08
*     Alpha release.
*
* Version 1.0.1 2011/09/30
*     Beta release.
*
* Version 2.0.0 2011/12/15
*     Final release.
*
* ---------------------------------------------------------------------------- */
00024 #include "arm_math.h"     
00025      
/**
 * @brief Dot product of two floating-point vectors.
 * @param[in]  pSrcA      points to the first input vector; blockSize elements are read
 * @param[in]  pSrcB      points to the second input vector; blockSize elements are read
 * @param[in]  blockSize  number of samples in each vector
 * @param[out] result     location where the dot product is written
 * @return none.
 *
 * Computes
 *   *result = A[0]*B[0] + A[1]*B[1] + ... + A[blockSize-1]*B[blockSize-1]
 *
 * Within every complete group of eight samples the products are accumulated
 * into four separate partial sums (elements 0 and 4, 1 and 5, 2 and 6,
 * 3 and 7).  The partial sums are added together after the unrolled loop and
 * the remaining blockSize % 8 products are then added one by one.  Because of
 * this ordering the rounding of the result can differ from that of a strictly
 * sequential left-to-right summation.
 *
 * A blockSize of 0 stores 0.0f in *result.  The function only reads through
 * pSrcA and pSrcB.
 */
void arm_dot_prod_f32(
  float32_t * pSrcA,
  float32_t * pSrcB,
  uint32_t blockSize,
  float32_t * result)
{
  float32_t sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f;  /* partial sums */
  float32_t inA1, inA2, inA3, inA4;              /* samples read from pSrcA */
  float32_t inB1, inB2, inB3, inB4;              /* samples read from pSrcB */
  uint32_t blkCnt;                               /* loop counter */

  /* Number of complete groups of eight samples. */
  blkCnt = blockSize >> 3u;

  /* Unrolled loop: eight products per iteration, all contributing to the one
   * output value.  The loop further below handles the remaining 0 to 7
   * samples.
   *
   * The reads for a product are placed ahead of the multiply-accumulate of
   * the previous product, and consecutive multiply-accumulates target
   * different partial sums, so no statement depends on the one directly
   * before it.
   * NOTE(review): presumably arranged this way to hide load and
   * multiply-accumulate latency on the target core - confirm before
   * restructuring. */
  while(blkCnt > 0u)
  {
    /* samples 0 and 1 */
    inA1 = *pSrcA;
    inB1 = *pSrcB;
    inA2 = *(pSrcA + 1);
    inB2 = *(pSrcB + 1);

    sum1 += inA1 * inB1;

    /* sample 2 */
    inA3 = *(pSrcA + 2);
    inB3 = *(pSrcB + 2);

    sum2 += inA2 * inB2;

    /* sample 3 */
    inA4 = *(pSrcA + 3);
    inB4 = *(pSrcB + 3);

    sum3 += inA3 * inB3;

    /* sample 4 (the temporaries are reused for the second half of the group) */
    inA1 = *(pSrcA + 4);
    inB1 = *(pSrcB + 4);

    sum4 += inA4 * inB4;

    /* sample 5 */
    inA2 = *(pSrcA + 5);
    inB2 = *(pSrcB + 5);

    sum1 += inA1 * inB1;

    /* sample 6 */
    inA3 = *(pSrcA + 6);
    inB3 = *(pSrcB + 6);

    sum2 += inA2 * inB2;

    /* sample 7 */
    inA4 = *(pSrcA + 7);
    inB4 = *(pSrcB + 7);

    sum3 += inA3 * inB3;

    /* Step both pointers past the group; sample 7 is already held in
     * inA4/inB4, so its product can still be accumulated afterwards. */
    pSrcA += 8u;
    pSrcB += 8u;

    sum4 += inA4 * inB4;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Combine the four partial sums, left to right. */
  sum1 = sum1 + sum2 + sum3 + sum4;

  /* Samples left over when blockSize is not a multiple of 8.
   * No loop unrolling is used. */
  blkCnt = blockSize % 0x8u;

  while(blkCnt > 0u)
  {
    inA1 = *pSrcA++;
    inB1 = *pSrcB++;
    sum1 += (inA1 * inB1);

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Store the result in the destination */
  *result = sum1;
}
00168