Cortex-R DSP Software Library: arm_dot_prod

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_dot_prod_q15.c      
00009 *      
00010 * Description:  Q15 dot product.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
00056 void arm_dot_prod_q15(     
00057   q15_t * pSrcA,     
00058   q15_t * pSrcB,     
00059   uint32_t blockSize,     
00060   q63_t * result)     
00061 {     
00062   q63_t sum = 0;                                 /* Temporary result storage */     
00063   uint32_t blkCnt;                               /* loop counter */     
00064   q31_t inA1, inA2, inB1, inB2;                  /* Temporary variables to store input data */  
00065   q31_t inA3, inA4, inB3, inB4;  
00066      
00067   /*loop Unrolling */     
00068   blkCnt = blockSize >> 3u;     
00069      
00070   /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.      
00071    ** a second loop below computes the remaining 1 to 7 samples. */     
00072   while(blkCnt > 0u)     
00073   {     
00074     /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */     
00075     /* Calculate dot product and then store the result in a temporary buffer. */     
00076     /* read two samples at a time from soruceA buffer */  
00077     inA1 = _SIMD32_OFFSET(pSrcA);  
00078     /* read two samples at a time from soruceB buffer */  
00079     inB1 = _SIMD32_OFFSET(pSrcB);  
00080     /* read two samples at a time from soruceA buffer */  
00081     inA2 = _SIMD32_OFFSET(pSrcA+2);  
00082   
00083     /* multiply and accumulate two samples at a time */  
00084     sum = __SMLALD(inA1, inB1, sum);   
00085         
00086     /* read two samples at a time from soruceB buffer */  
00087     inB2 = _SIMD32_OFFSET(pSrcB+2);  
00088     /* read two samples at a time from soruceA buffer */  
00089     inA3 = _SIMD32_OFFSET(pSrcA+4);  
00090     /* read two samples at a time from soruceB buffer */  
00091     inB3 = _SIMD32_OFFSET(pSrcB+4);  
00092   
00093     /* multiply and accumulate two samples at a time */  
00094     sum = __SMLALD(inA2, inB2, sum);  
00095   
00096     /* read two samples at a time from soruceA buffer */  
00097     inA4 = _SIMD32_OFFSET(pSrcA+6);  
00098     /* read two samples at a time from soruceB buffer */  
00099     inB4 = _SIMD32_OFFSET(pSrcB+6);  
00100   
00101     /* increment source A buffer by 8 */  
00102     pSrcA += 8u;  
00103     /* increment sourceB buffer by 8 */  
00104     pSrcB += 8u;  
00105   
00106     /* multiply and accumulate two samples at a time */  
00107     sum = __SMLALD(inA3, inB3, sum);  
00108     sum = __SMLALD(inA4, inB4, sum);  
00109   
00110     /* Decrement the loop counter */     
00111     blkCnt--;     
00112   }     
00113   /* If the blockSize is not a multiple of 8, compute any remaining output samples here.      
00114    ** No loop unrolling is used. */     
00115   blkCnt = blockSize % 0x8u;     
00116      
00117   while(blkCnt > 0u)     
00118   {     
00119     /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */     
00120     /* Calculate dot product and then store the results in a temporary buffer. */     
00121     sum = __SMLALD(*pSrcA++, *pSrcB++, sum);     
00122      
00123     /* Decrement the loop counter */     
00124     blkCnt--;     
00125   }     
00126      
00127   /* Store the result in the destination buffer in 34.30 format */     
00128   *result = sum;     
00129 }     
00130