00001 /* ---------------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_conv_f32.c 00009 * 00010 * Description: Convolution of floating-point sequences. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------------- */ 00024 00025 #include "arm_math.h" 00026 00094 void arm_conv_f32( 00095 float32_t * pSrcA, 00096 uint32_t srcALen, 00097 float32_t * pSrcB, 00098 uint32_t srcBLen, 00099 float32_t * pDst) 00100 { 00101 float32_t *pIn1; /* inputA pointer */ 00102 float32_t *pIn2; /* inputB pointer */ 00103 float32_t *pOut = pDst; /* output pointer */ 00104 float32_t *px; /* Intermediate inputA pointer */ 00105 float32_t *py; /* Intermediate inputB pointer */ 00106 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00107 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00108 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00109 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */ 00110 00111 00112 /* The algorithm implementation is based on the lengths of the inputs. */ 00113 /* srcB is always made to slide across srcA. */ 00114 /* So srcBLen is always considered as shorter or equal to srcALen */ 00115 if(srcALen >= srcBLen) 00116 { 00117 /* Initialization of inputA pointer */ 00118 pIn1 = pSrcA; 00119 00120 /* Initialization of inputB pointer */ 00121 pIn2 = pSrcB; 00122 00123 } 00124 else 00125 { 00126 /* Initialization of inputA pointer */ 00127 pIn1 = pSrcB; 00128 00129 /* Initialization of inputB pointer */ 00130 pIn2 = pSrcA; 00131 00132 /* srcBLen is always considered as shorter or equal to srcALen */ 00133 j = srcBLen; 00134 srcBLen = srcALen; 00135 srcALen = j; 00136 00137 } 00138 00139 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00140 /* The function is internally 00141 * divided into three stages according to the number of multiplications that has to be 00142 * taken place between inputA samples and inputB samples. In the first stage of the 00143 * algorithm, the multiplications increase by one for every iteration. 00144 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00145 * In the third stage of the algorithm, the multiplications decrease by one 00146 * for every iteration. */ 00147 00148 /* The algorithm is implemented in three stages. 00149 The loop counters of each stage is initiated here. */ 00150 blockSize1 = srcBLen - 1u; 00151 blockSize2 = srcALen - (srcBLen - 1u); 00152 blockSize3 = blockSize1; 00153 00154 /* -------------------------- 00155 * initializations of stage1 00156 * -------------------------*/ 00157 00158 /* sum = x[0] * y[0] 00159 * sum = x[0] * y[1] + x[1] * y[0] 00160 * .... 00161 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00162 */ 00163 00164 /* In this stage the MAC operations are increased by 1 for every iteration. 00165 The count variable holds the number of MAC operations performed */ 00166 count = 1u; 00167 00168 /* Working pointer of inputA */ 00169 px = pIn1; 00170 00171 /* Working pointer of inputB */ 00172 py = pIn2; 00173 00174 00175 /* ------------------------ 00176 * Stage1 process 00177 * ----------------------*/ 00178 00179 /* The first stage starts here */ 00180 while(blockSize1 > 0u) 00181 { 00182 /* Accumulator is made zero for every iteration */ 00183 sum = 0.0f; 00184 00185 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00186 k = count >> 2u; 00187 00188 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00189 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00190 while(k > 0u) 00191 { 00192 00193 /* Read x[0] */ 00194 x0 = *px++; 00195 00196 /* y[srcBLen - 1] */ 00197 c0 = *py--; 00198 00199 /* x[0] * y[srcBLen - 1] */ 00200 sum += x0 * c0; 00201 00202 /* x[1] * y[srcBLen - 2] */ 00203 sum += *px++ * *py--; 00204 00205 /* x[2] * y[srcBLen - 3] */ 00206 sum += *px++ * *py--; 00207 00208 /* x[3] * y[srcBLen - 4] */ 00209 sum += *px++ * *py--; 00210 00211 /* Decrement the loop counter */ 00212 k--; 00213 } 00214 00215 /* If the count is not a multiple of 4, compute any remaining MACs here. 00216 ** No loop unrolling is used. */ 00217 k = count % 0x4u; 00218 00219 while(k > 0u) 00220 { 00221 00222 x0 = *px++; 00223 c0 = *py--; 00224 00225 /* Perform the multiply-accumulate */ 00226 sum += x0 * c0; 00227 00228 /* Decrement the loop counter */ 00229 k--; 00230 } 00231 00232 /* Store the result in the accumulator in the destination buffer. */ 00233 *pOut++ = sum; 00234 00235 /* Update the inputA and inputB pointers for next MAC calculation */ 00236 py = pIn2 + count; 00237 px = pIn1; 00238 00239 /* Increment the MAC count */ 00240 count++; 00241 00242 /* Decrement the loop counter */ 00243 blockSize1--; 00244 } 00245 00246 /* -------------------------- 00247 * Initializations of stage2 00248 * ------------------------*/ 00249 00250 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00251 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00252 * .... 00253 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00254 */ 00255 00256 /* Working pointer of inputA */ 00257 px = pIn1; 00258 00259 /* Working pointer of inputB */ 00260 pSrc2 = pIn2 + (srcBLen - 1u); 00261 py = pSrc2; 00262 00263 /* count is index by which the pointer pIn1 to be incremented */ 00264 count = 0u; 00265 00266 /* ------------------- 00267 * Stage2 process 00268 * ------------------*/ 00269 00270 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00271 * So, to loop unroll over blockSize2, 00272 * srcBLen should be greater than or equal to 4 */ 00273 if(srcBLen >= 4u) 00274 { 00275 /* Loop unroll over blockSize2, by 4 */ 00276 blkCnt = blockSize2 >> 2u; 00277 00278 while(blkCnt > 0u) 00279 { 00280 /* Set all accumulators to zero */ 00281 acc0 = 0.0f; 00282 acc1 = 0.0f; 00283 acc2 = 0.0f; 00284 acc3 = 0.0f; 00285 00286 /* read x[0], x[1], x[2] samples */ 00287 x0 = *(px++); 00288 x1 = *(px++); 00289 x2 = *(px++); 00290 00291 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00292 k = srcBLen >> 2u; 00293 00294 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00295 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00296 do 00297 { 00298 /* Read y[srcBLen - 1] sample */ 00299 c0 = *(py--); 00300 00301 /* Read x[3] sample */ 00302 x3 = *(px); 00303 00304 /* Perform the multiply-accumulate */ 00305 /* acc0 += x[0] * y[srcBLen - 1] */ 00306 acc0 += x0 * c0; 00307 00308 /* acc1 += x[1] * y[srcBLen - 1] */ 00309 acc1 += x1 * c0; 00310 00311 /* acc2 += x[2] * y[srcBLen - 1] */ 00312 acc2 += x2 * c0; 00313 00314 /* acc3 += x[3] * y[srcBLen - 1] */ 00315 acc3 += x3 * c0; 00316 00317 /* Read y[srcBLen - 2] sample */ 00318 c0 = *(py--); 00319 00320 /* Read x[4] sample */ 00321 x0 = *(px + 1u); 00322 00323 /* Perform the multiply-accumulate */ 00324 /* acc0 += x[1] * y[srcBLen - 2] */ 00325 acc0 += x1 * c0; 00326 /* acc1 += x[2] * y[srcBLen - 2] */ 00327 acc1 += x2 * c0; 00328 /* acc2 += x[3] * y[srcBLen - 2] */ 00329 acc2 += x3 * c0; 00330 /* acc3 += x[4] * y[srcBLen - 2] */ 00331 acc3 += x0 * c0; 00332 00333 /* Read y[srcBLen - 3] sample */ 00334 c0 = *(py--); 00335 00336 /* Read x[5] sample */ 00337 x1 = *(px + 2u); 00338 00339 /* Perform the multiply-accumulates */ 00340 /* acc0 += x[2] * y[srcBLen - 3] */ 00341 acc0 += x2 * c0; 00342 /* acc1 += x[3] * y[srcBLen - 2] */ 00343 acc1 += x3 * c0; 00344 /* acc2 += x[4] * y[srcBLen - 2] */ 00345 acc2 += x0 * c0; 00346 /* acc3 += x[5] * y[srcBLen - 2] */ 00347 acc3 += x1 * c0; 00348 00349 /* Read y[srcBLen - 4] sample */ 00350 c0 = *(py--); 00351 00352 /* Read x[6] sample */ 00353 x2 = *(px + 3u); 00354 00355 /* Perform the multiply-accumulates */ 00356 /* acc0 += x[3] * y[srcBLen - 4] */ 00357 acc0 += x3 * c0; 00358 /* acc1 += x[4] * y[srcBLen - 4] */ 00359 acc1 += x0 * c0; 00360 /* acc2 += x[5] * y[srcBLen - 4] */ 00361 acc2 += x1 * c0; 00362 /* acc3 += x[6] * y[srcBLen - 4] */ 00363 acc3 += x2 * c0; 00364 00365 px += 4u; 00366 00367 00368 } while(--k); 00369 00370 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00371 ** No loop unrolling is used. */ 00372 k = srcBLen % 0x4u; 00373 00374 while(k > 0u) 00375 { 00376 /* Read y[srcBLen - 5] sample */ 00377 c0 = *(py--); 00378 00379 /* Read x[7] sample */ 00380 x3 = *(px++); 00381 00382 /* Perform the multiply-accumulates */ 00383 /* acc0 += x[4] * y[srcBLen - 5] */ 00384 acc0 += x0 * c0; 00385 /* acc1 += x[5] * y[srcBLen - 5] */ 00386 acc1 += x1 * c0; 00387 /* acc2 += x[6] * y[srcBLen - 5] */ 00388 acc2 += x2 * c0; 00389 /* acc3 += x[7] * y[srcBLen - 5] */ 00390 acc3 += x3 * c0; 00391 00392 /* Reuse the present samples for the next MAC */ 00393 x0 = x1; 00394 x1 = x2; 00395 x2 = x3; 00396 00397 /* Decrement the loop counter */ 00398 k--; 00399 } 00400 00401 /* Store the result in the accumulator in the destination buffer. */ 00402 *pOut++ = acc0; 00403 *pOut++ = acc1; 00404 *pOut++ = acc2; 00405 *pOut++ = acc3; 00406 00407 /* Increment the pointer pIn1 index, count by 1 */ 00408 count += 4u; 00409 00410 /* Update the inputA and inputB pointers for next MAC calculation */ 00411 px = pIn1 + count; 00412 py = pSrc2; 00413 00414 /* Decrement the loop counter */ 00415 blkCnt--; 00416 } 00417 00418 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00419 ** No loop unrolling is used. */ 00420 blkCnt = blockSize2 % 0x4u; 00421 00422 while(blkCnt > 0u) 00423 { 00424 /* Accumulator is made zero for every iteration */ 00425 sum = 0.0f; 00426 00427 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00428 k = srcBLen >> 2u; 00429 00430 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00431 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00432 while(k > 0u) 00433 { 00434 00435 /* Perform the multiply-accumulates */ 00436 sum += *px++ * *py--; 00437 sum += *px++ * *py--; 00438 sum += *px++ * *py--; 00439 sum += *px++ * *py--; 00440 00441 /* Decrement the loop counter */ 00442 k--; 00443 } 00444 00445 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00446 ** No loop unrolling is used. */ 00447 k = srcBLen % 0x4u; 00448 00449 while(k > 0u) 00450 { 00451 /* Perform the multiply-accumulate */ 00452 sum += *px++ * *py--; 00453 00454 /* Decrement the loop counter */ 00455 k--; 00456 } 00457 00458 /* Store the result in the accumulator in the destination buffer. */ 00459 *pOut++ = sum; 00460 00461 /* Increment the MAC count */ 00462 count++; 00463 00464 /* Update the inputA and inputB pointers for next MAC calculation */ 00465 px = pIn1 + count; 00466 py = pSrc2; 00467 00468 /* Decrement the loop counter */ 00469 blkCnt--; 00470 } 00471 } 00472 else 00473 { 00474 /* If the srcBLen is not a multiple of 4, 00475 * the blockSize2 loop cannot be unrolled by 4 */ 00476 blkCnt = blockSize2; 00477 00478 while(blkCnt > 0u) 00479 { 00480 /* Accumulator is made zero for every iteration */ 00481 sum = 0.0f; 00482 00483 /* srcBLen number of MACS should be performed */ 00484 k = srcBLen; 00485 00486 while(k > 0u) 00487 { 00488 /* Perform the multiply-accumulate */ 00489 sum += *px++ * *py--; 00490 00491 /* Decrement the loop counter */ 00492 k--; 00493 } 00494 00495 /* Store the result in the accumulator in the destination buffer. */ 00496 *pOut++ = sum; 00497 00498 /* Increment the MAC count */ 00499 count++; 00500 00501 /* Update the inputA and inputB pointers for next MAC calculation */ 00502 px = pIn1 + count; 00503 py = pSrc2; 00504 00505 /* Decrement the loop counter */ 00506 blkCnt--; 00507 } 00508 } 00509 00510 00511 /* -------------------------- 00512 * Initializations of stage3 00513 * -------------------------*/ 00514 00515 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00516 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00517 * .... 00518 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00519 * sum += x[srcALen-1] * y[srcBLen-1] 00520 */ 00521 00522 /* In this stage the MAC operations are decreased by 1 for every iteration. 00523 The blockSize3 variable holds the number of MAC operations performed */ 00524 00525 /* Working pointer of inputA */ 00526 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00527 px = pSrc1; 00528 00529 /* Working pointer of inputB */ 00530 pSrc2 = pIn2 + (srcBLen - 1u); 00531 py = pSrc2; 00532 00533 /* ------------------- 00534 * Stage3 process 00535 * ------------------*/ 00536 00537 while(blockSize3 > 0u) 00538 { 00539 /* Accumulator is made zero for every iteration */ 00540 sum = 0.0f; 00541 00542 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00543 k = blockSize3 >> 2u; 00544 00545 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00546 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00547 while(k > 0u) 00548 { 00549 /* Read x[srcALen - srcBLen + 1] */ 00550 x0 = *px++; 00551 00552 /* Read y[srcBLen - 1] */ 00553 c0 = *py--; 00554 00555 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00556 sum += x0 * c0; 00557 00558 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00559 sum += *px++ * *py--; 00560 00561 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00562 sum += *px++ * *py--; 00563 00564 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00565 sum += *px++ * *py--; 00566 00567 /* Decrement the loop counter */ 00568 k--; 00569 } 00570 00571 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00572 ** No loop unrolling is used. */ 00573 k = blockSize3 % 0x4u; 00574 00575 while(k > 0u) 00576 { 00577 /* Perform the multiply-accumulates */ 00578 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00579 sum += *px++ * *py--; 00580 00581 /* Decrement the loop counter */ 00582 k--; 00583 } 00584 00585 /* Store the result in the accumulator in the destination buffer. */ 00586 *pOut++ = sum; 00587 00588 /* Update the inputA and inputB pointers for next MAC calculation */ 00589 px = ++pSrc1; 00590 py = pSrc2; 00591 00592 /* Decrement the loop counter */ 00593 blockSize3--; 00594 } 00595 00596 } 00597