00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_mult_f32.c 00009 * 00010 * Description: Floating-point vector multiplication. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 00025 #include "arm_math.h" 00026 00057 void arm_mult_f32( 00058 float32_t * pSrcA, 00059 float32_t * pSrcB, 00060 float32_t * pDst, 00061 uint32_t blockSize) 00062 { 00063 uint32_t blkCnt; /* loop counters */ 00064 float32_t inA1, inA2, inA3, inA4; /* temporary input variables */ 00065 float32_t inB1, inB2, inB3, inB4; /* temporary input variables */ 00066 float32_t out1, out2, out3, out4; /* temporary output variables */ 00067 00068 /* loop Unrolling */ 00069 blkCnt = blockSize >> 3u; 00070 00071 /* First part of the processing with loop unrolling. Compute 8 outputs at a time. 00072 ** a second loop below computes the remaining 1 to 7 samples. */ 00073 while(blkCnt > 0u) 00074 { 00075 /* C = A * B */ 00076 /* Multiply the inputs and store the results in output buffer */ 00077 /* read sample from sourceA */ 00078 inA1 = *pSrcA; 00079 /* read sample from sourceB */ 00080 inB1 = *pSrcB; 00081 /* read sample from sourceA */ 00082 inA2 = *(pSrcA + 1); 00083 /* read sample from sourceB */ 00084 inB2 = *(pSrcB + 1); 00085 00086 /* out = sourceA * sourceB */ 00087 out1 = inA1 * inB1; 00088 00089 /* read sample from sourceA */ 00090 inA3 = *(pSrcA + 2); 00091 /* read sample from sourceB */ 00092 inB3 = *(pSrcB + 2); 00093 00094 /* out = sourceA * sourceB */ 00095 out2 = inA2 * inB2; 00096 00097 /* read sample from sourceA */ 00098 inA4 = *(pSrcA + 3); 00099 00100 /* store result to destination buffer */ 00101 *pDst = out1; 00102 00103 /* read sample from sourceB */ 00104 inB4 = *(pSrcB + 3); 00105 00106 /* out = sourceA * sourceB */ 00107 out3 = inA3 * inB3; 00108 00109 /* read sample from sourceA */ 00110 inA1 = *(pSrcA + 4); 00111 00112 /* store result to destination buffer */ 00113 *(pDst + 1) = out2; 00114 00115 /* read sample from sourceB */ 00116 inB1 = *(pSrcB + 4); 00117 00118 /* out = sourceA * sourceB */ 00119 out4 = inA4 * inB4; 00120 00121 /* read sample from sourceA */ 00122 inA2 = *(pSrcA + 5); 00123 00124 /* store result to destination buffer */ 00125 *(pDst + 2) = out3; 00126 00127 /* read sample from sourceB */ 00128 inB2 = *(pSrcB + 5); 00129 00130 /* out = sourceA * sourceB */ 00131 out1 = inA1 * inB1; 00132 00133 /* read sample from sourceA */ 00134 inA3 = *(pSrcA + 6); 00135 00136 /* store result to destination buffer */ 00137 *(pDst + 3) = out4; 00138 00139 /* read sample from sourceB */ 00140 inB3 = *(pSrcB + 6); 00141 00142 /* out = sourceA * sourceB */ 00143 out2 = inA2 * inB2; 00144 00145 /* read sample from sourceA */ 00146 inA4 = *(pSrcA + 7); 00147 /* read sample from sourceB */ 00148 inB4 = *(pSrcB + 7); 00149 00150 /* store result to destination buffer */ 00151 *(pDst + 4) = out1; 00152 00153 /* out = sourceA * sourceB */ 00154 out3 = inA3 * inB3; 00155 00156 /* store result to destination buffer */ 00157 *(pDst + 5) = out2; 00158 00159 /* out = sourceA * sourceB */ 00160 out4 = inA4 * inB4; 00161 00162 /* increment sourceA by 8 to process next samples */ 00163 pSrcA += 8u; 00164 00165 /* store result to destination buffer */ 00166 *(pDst + 6) = out3; 00167 00168 /* increment sourceB by 8 to process next samples */ 00169 pSrcB += 8u; 00170 00171 /* store result to destination buffer */ 00172 *(pDst + 7) = out4; 00173 00174 /* increment destination buffer by 8 to process next samples */ 00175 pDst += 8u; 00176 00177 /* Decrement the blockSize loop counter */ 00178 blkCnt--; 00179 } 00180 00181 /* If the blockSize is not a multiple of 8, compute any remaining output samples here. 00182 ** No loop unrolling is used. */ 00183 blkCnt = blockSize % 0x8u; 00184 00185 while(blkCnt > 0u) 00186 { 00187 /* C = A * B */ 00188 /* Multiply the inputs and store the results in output buffer */ 00189 inA1 = *pSrcA++; 00190 inB1 = *pSrcB++; 00191 00192 *pDst++ = inA1 * inB1; 00193 00194 /* Decrement the blockSize loop counter */ 00195 blkCnt--; 00196 } 00197 } 00198