00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_dot_prod_f32.c 00009 * 00010 * Description: Floating-point dot product. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * ---------------------------------------------------------------------------- */ 00024 #include "arm_math.h" 00025 00053 void arm_dot_prod_f32( 00054 float32_t * pSrcA, 00055 float32_t * pSrcB, 00056 uint32_t blockSize, 00057 float32_t * result) 00058 { 00059 float32_t sum1 = 0.0f; 00060 float32_t sum2 = 0.0f, sum3 = 0.0f, sum4 = 0.0f; /* Temporary result storage */ 00061 float32_t inA1, inA2, inB1, inB2; 00062 float32_t inA3, inA4, inB3, inB4; 00063 uint32_t blkCnt; /* loop counter */ 00064 00065 /*loop Unrolling */ 00066 blkCnt = blockSize >> 3u; 00067 00068 /* First part of the processing with loop unrolling. Compute 8 outputs at a time. 00069 ** a second loop below computes the remaining 1 to 7 samples. */ 00070 while(blkCnt > 0u) 00071 { 00072 /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ 00073 /* Calculate dot product and then store the result in a temporary buffer */ 00074 00075 /* read input from sourceA */ 00076 inA1 = *pSrcA; 00077 /* read input from sourceB */ 00078 inB1 = *pSrcB; 00079 /* read input from sourceA */ 00080 inA2 = *(pSrcA + 1); 00081 /* read input from sourceB */ 00082 inB2 = *(pSrcB + 1); 00083 00084 /* multiply and accumulate */ 00085 sum1 += inA1 * inB1; 00086 00087 /* read input from sourceA */ 00088 inA3 = *(pSrcA + 2); 00089 /* read input from sourceB */ 00090 inB3 = *(pSrcB + 2); 00091 00092 /* multiply and accumulate */ 00093 sum2 += inA2 * inB2; 00094 00095 /* read input from sourceA */ 00096 inA4 = *(pSrcA + 3); 00097 /* read input from sourceB */ 00098 inB4 = *(pSrcB + 3); 00099 00100 /* multiply and accumulate */ 00101 sum3 += inA3 * inB3; 00102 00103 /* read input from sourceA */ 00104 inA1 = *(pSrcA + 4); 00105 /* read input from sourceB */ 00106 inB1 = *(pSrcB + 4); 00107 00108 /* multiply and accumulate */ 00109 sum4 += inA4 * inB4; 00110 00111 /* read input from sourceA */ 00112 inA2 = *(pSrcA + 5); 00113 /* read input from sourceB */ 00114 inB2 = *(pSrcB + 5); 00115 00116 /* multiply and accumulate */ 00117 sum1 += inA1 * inB1; 00118 00119 /* read input from sourceA */ 00120 inA3 = *(pSrcA + 6); 00121 /* read input from sourceB */ 00122 inB3 = *(pSrcB + 6); 00123 00124 /* multiply and accumulate */ 00125 sum2 += inA2 * inB2; 00126 00127 /* read input from sourceA */ 00128 inA4 = *(pSrcA + 7); 00129 /* read input from sourceB */ 00130 inB4 = *(pSrcB + 7); 00131 00132 /* multiply and accumulate */ 00133 sum3 += inA3 * inB3; 00134 00135 /* increment sourceA pointer by 8 */ 00136 pSrcA += 8u; 00137 /* increment sourceB pointer by 8 */ 00138 pSrcB += 8u; 00139 00140 /* multiply and accumulate */ 00141 sum4 += inA4 * inB4; 00142 00143 /* Decrement the loop counter */ 00144 blkCnt--; 00145 } 00146 /* add accumulators */ 00147 sum1 = sum1 + sum2 + sum3 + sum4; 00148 00149 /* If the blockSize is not a multiple of 8, compute any remaining output samples here. 00150 ** No loop unrolling is used. */ 00151 blkCnt = blockSize % 0x8u; 00152 00153 while(blkCnt > 0u) 00154 { 00155 /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */ 00156 /* Calculate dot product and then store the result in a temporary buffer. */ 00157 inA1 = *pSrcA++; 00158 inB1 = *pSrcB++; 00159 sum1 += (inA1 * inB1); 00160 00161 /* Decrement the loop counter */ 00162 blkCnt--; 00163 } 00164 00165 /* Store the result back in the destination buffer */ 00166 *result = sum1; 00167 } 00168