00001 /* ---------------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_correlate_f32.c 00009 * 00010 * Description: Correlation of floating-point sequences. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------------- */ 00024 00025 #include "arm_math.h" 00026 00086 void arm_correlate_f32( 00087 float32_t * pSrcA, 00088 uint32_t srcALen, 00089 float32_t * pSrcB, 00090 uint32_t srcBLen, 00091 float32_t * pDst) 00092 { 00093 float32_t *pIn1; /* inputA pointer */ 00094 float32_t *pIn2; /* inputB pointer */ 00095 float32_t *pOut = pDst; /* output pointer */ 00096 float32_t *px; /* Intermediate inputA pointer */ 00097 float32_t *py; /* Intermediate inputB pointer */ 00098 float32_t *pSrc1; /* Intermediate pointers */ 00099 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00100 float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00101 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */ 00102 int32_t inc = 1; /* Destination address modifier */ 00103 00104 00105 /* The algorithm implementation is based on the lengths of the inputs. */ 00106 /* srcB is always made to slide across srcA. */ 00107 /* So srcBLen is always considered as shorter or equal to srcALen */ 00108 /* But CORR(x, y) is reverse of CORR(y, x) */ 00109 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00110 /* and the destination pointer modifier, inc is set to -1 */ 00111 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00112 /* But to improve the performance, 00113 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00114 /* If srcALen > srcBLen, 00115 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00116 /* If srcALen < srcBLen, 00117 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00118 if(srcALen >= srcBLen) 00119 { 00120 /* Initialization of inputA pointer */ 00121 pIn1 = pSrcA; 00122 00123 /* Initialization of inputB pointer */ 00124 pIn2 = pSrcB; 00125 00126 /* Number of output samples is calculated */ 00127 outBlockSize = (2u * srcALen) - 1u; 00128 00129 /* When srcALen > srcBLen, zero padding has to be done to srcB 00130 * to make their lengths equal. 00131 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00132 * number of output samples are made zero */ 00133 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00134 00135 /* Updating the pointer position to non zero value */ 00136 pOut += j; 00137 00138 00139 } 00140 else 00141 { 00142 /* Initialization of inputA pointer */ 00143 pIn1 = pSrcB; 00144 00145 /* Initialization of inputB pointer */ 00146 pIn2 = pSrcA; 00147 00148 /* srcBLen is always considered as shorter or equal to srcALen */ 00149 j = srcBLen; 00150 srcBLen = srcALen; 00151 srcALen = j; 00152 00153 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00154 /* Hence set the destination pointer to point to the last output sample */ 00155 pOut = pDst + ((srcALen + srcBLen) - 2u); 00156 00157 /* Destination address modifier is set to -1 */ 00158 inc = -1; 00159 00160 } 00161 00162 /* The function is internally 00163 * divided into three parts according to the number of multiplications that has to be 00164 * taken place between inputA samples and inputB samples. In the first part of the 00165 * algorithm, the multiplications increase by one for every iteration. 00166 * In the second part of the algorithm, srcBLen number of multiplications are done. 00167 * In the third part of the algorithm, the multiplications decrease by one 00168 * for every iteration.*/ 00169 /* The algorithm is implemented in three stages. 00170 * The loop counters of each stage is initiated here. */ 00171 blockSize1 = srcBLen - 1u; 00172 blockSize2 = srcALen - (srcBLen - 1u); 00173 blockSize3 = blockSize1; 00174 00175 /* -------------------------- 00176 * Initializations of stage1 00177 * -------------------------*/ 00178 00179 /* sum = x[0] * y[srcBlen - 1] 00180 * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1] 00181 * .... 00182 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00183 */ 00184 00185 /* In this stage the MAC operations are increased by 1 for every iteration. 00186 The count variable holds the number of MAC operations performed */ 00187 count = 1u; 00188 00189 /* Working pointer of inputA */ 00190 px = pIn1; 00191 00192 /* Working pointer of inputB */ 00193 pSrc1 = pIn2 + (srcBLen - 1u); 00194 py = pSrc1; 00195 00196 /* ------------------------ 00197 * Stage1 process 00198 * ----------------------*/ 00199 00200 /* The first stage starts here */ 00201 while(blockSize1 > 0u) 00202 { 00203 /* Accumulator is made zero for every iteration */ 00204 sum = 0.0f; 00205 00206 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00207 k = count >> 2u; 00208 00209 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00210 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00211 while(k > 0u) 00212 { 00213 /* Read x[0] */ 00214 x0 = *px++; 00215 /* y[srcBLen - 4] */ 00216 c0 = *py++; 00217 00218 /* x[0] * y[srcBLen - 4] */ 00219 sum += x0 * c0; 00220 00221 /* x[1] * y[srcBLen - 3] */ 00222 sum += *px++ * *py++; 00223 /* x[2] * y[srcBLen - 2] */ 00224 sum += *px++ * *py++; 00225 /* x[3] * y[srcBLen - 1] */ 00226 sum += *px++ * *py++; 00227 00228 /* Decrement the loop counter */ 00229 k--; 00230 } 00231 00232 /* If the count is not a multiple of 4, compute any remaining MACs here. 00233 ** No loop unrolling is used. */ 00234 k = count % 0x4u; 00235 00236 while(k > 0u) 00237 { 00238 /* Perform the multiply-accumulate */ 00239 /* x[0] * y[srcBLen - 1] */ 00240 sum += *px++ * *py++; 00241 00242 /* Decrement the loop counter */ 00243 k--; 00244 } 00245 00246 /* Store the result in the accumulator in the destination buffer. */ 00247 *pOut = sum; 00248 /* Destination pointer is updated according to the address modifier, inc */ 00249 pOut += inc; 00250 00251 /* Update the inputA and inputB pointers for next MAC calculation */ 00252 py = pSrc1 - count; 00253 px = pIn1; 00254 00255 /* Increment the MAC count */ 00256 count++; 00257 00258 /* Decrement the loop counter */ 00259 blockSize1--; 00260 } 00261 00262 /* -------------------------- 00263 * Initializations of stage2 00264 * ------------------------*/ 00265 00266 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00267 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00268 * .... 00269 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00270 */ 00271 00272 /* Working pointer of inputA */ 00273 px = pIn1; 00274 00275 /* Working pointer of inputB */ 00276 py = pIn2; 00277 00278 /* count is index by which the pointer pIn1 to be incremented */ 00279 count = 0u; 00280 00281 /* ------------------- 00282 * Stage2 process 00283 * ------------------*/ 00284 00285 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00286 * So, to loop unroll over blockSize2, 00287 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ 00288 if(srcBLen >= 4u) 00289 { 00290 /* Loop unroll over blockSize2, by 4 */ 00291 blkCnt = blockSize2 >> 2u; 00292 00293 while(blkCnt > 0u) 00294 { 00295 /* Set all accumulators to zero */ 00296 acc0 = 0.0f; 00297 acc1 = 0.0f; 00298 acc2 = 0.0f; 00299 acc3 = 0.0f; 00300 00301 /* read x[0], x[1], x[2] samples */ 00302 x0 = *(px++); 00303 x1 = *(px++); 00304 x2 = *(px++); 00305 00306 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00307 k = srcBLen >> 2u; 00308 00309 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00310 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00311 do 00312 { 00313 /* Read y[0] sample */ 00314 c0 = *(py); 00315 00316 /* Read x[3] sample */ 00317 x3 = *(px++); 00318 00319 /* Perform the multiply-accumulate */ 00320 /* acc0 += x[0] * y[0] */ 00321 acc0 += x0 * c0; 00322 /* acc1 += x[1] * y[0] */ 00323 acc1 += x1 * c0; 00324 /* acc2 += x[2] * y[0] */ 00325 acc2 += x2 * c0; 00326 /* acc3 += x[3] * y[0] */ 00327 acc3 += x3 * c0; 00328 00329 /* Read y[1] sample */ 00330 c0 = *(py + 1u); 00331 00332 /* Read x[4] sample */ 00333 x0 = *(px++); 00334 00335 /* Perform the multiply-accumulate */ 00336 /* acc0 += x[1] * y[1] */ 00337 acc0 += x1 * c0; 00338 /* acc1 += x[2] * y[1] */ 00339 acc1 += x2 * c0; 00340 /* acc2 += x[3] * y[1] */ 00341 acc2 += x3 * c0; 00342 /* acc3 += x[4] * y[1] */ 00343 acc3 += x0 * c0; 00344 00345 /* Read y[2] sample */ 00346 c0 = *(py + 2u); 00347 00348 /* Read x[5] sample */ 00349 x1 = *(px++); 00350 00351 /* Perform the multiply-accumulates */ 00352 /* acc0 += x[2] * y[2] */ 00353 acc0 += x2 * c0; 00354 /* acc1 += x[3] * y[2] */ 00355 acc1 += x3 * c0; 00356 /* acc2 += x[4] * y[2] */ 00357 acc2 += x0 * c0; 00358 /* acc3 += x[5] * y[2] */ 00359 acc3 += x1 * c0; 00360 00361 /* Read y[3] sample */ 00362 c0 = *(py + 3u); 00363 00364 /* Read x[6] sample */ 00365 x2 = *(px++); 00366 00367 /* Perform the multiply-accumulates */ 00368 /* acc0 += x[3] * y[3] */ 00369 acc0 += x3 * c0; 00370 /* acc1 += x[4] * y[3] */ 00371 acc1 += x0 * c0; 00372 /* acc2 += x[5] * y[3] */ 00373 acc2 += x1 * c0; 00374 /* acc3 += x[6] * y[3] */ 00375 acc3 += x2 * c0; 00376 00377 py += 4u; 00378 00379 00380 } while(--k); 00381 00382 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00383 ** No loop unrolling is used. */ 00384 k = srcBLen % 0x4u; 00385 00386 while(k > 0u) 00387 { 00388 /* Read y[4] sample */ 00389 c0 = *(py++); 00390 00391 /* Read x[7] sample */ 00392 x3 = *(px++); 00393 00394 /* Perform the multiply-accumulates */ 00395 /* acc0 += x[4] * y[4] */ 00396 acc0 += x0 * c0; 00397 /* acc1 += x[5] * y[4] */ 00398 acc1 += x1 * c0; 00399 /* acc2 += x[6] * y[4] */ 00400 acc2 += x2 * c0; 00401 /* acc3 += x[7] * y[4] */ 00402 acc3 += x3 * c0; 00403 00404 /* Reuse the present samples for the next MAC */ 00405 x0 = x1; 00406 x1 = x2; 00407 x2 = x3; 00408 00409 /* Decrement the loop counter */ 00410 k--; 00411 } 00412 00413 /* Store the result in the accumulator in the destination buffer. */ 00414 *pOut = acc0; 00415 /* Destination pointer is updated according to the address modifier, inc */ 00416 pOut += inc; 00417 00418 *pOut = acc1; 00419 pOut += inc; 00420 00421 *pOut = acc2; 00422 pOut += inc; 00423 00424 *pOut = acc3; 00425 pOut += inc; 00426 00427 /* Increment the pointer pIn1 index, count by 1 */ 00428 count += 4u; 00429 00430 /* Update the inputA and inputB pointers for next MAC calculation */ 00431 px = pIn1 + count; 00432 py = pIn2; 00433 00434 /* Decrement the loop counter */ 00435 blkCnt--; 00436 } 00437 00438 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00439 ** No loop unrolling is used. */ 00440 blkCnt = blockSize2 % 0x4u; 00441 00442 while(blkCnt > 0u) 00443 { 00444 /* Accumulator is made zero for every iteration */ 00445 sum = 0.0f; 00446 00447 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00448 k = srcBLen >> 2u; 00449 00450 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00451 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00452 while(k > 0u) 00453 { 00454 /* Perform the multiply-accumulates */ 00455 sum += *px++ * *py++; 00456 sum += *px++ * *py++; 00457 sum += *px++ * *py++; 00458 sum += *px++ * *py++; 00459 00460 /* Decrement the loop counter */ 00461 k--; 00462 } 00463 00464 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00465 ** No loop unrolling is used. */ 00466 k = srcBLen % 0x4u; 00467 00468 while(k > 0u) 00469 { 00470 /* Perform the multiply-accumulate */ 00471 sum += *px++ * *py++; 00472 00473 /* Decrement the loop counter */ 00474 k--; 00475 } 00476 00477 /* Store the result in the accumulator in the destination buffer. */ 00478 *pOut = sum; 00479 /* Destination pointer is updated according to the address modifier, inc */ 00480 pOut += inc; 00481 00482 /* Increment the pointer pIn1 index, count by 1 */ 00483 count++; 00484 00485 /* Update the inputA and inputB pointers for next MAC calculation */ 00486 px = pIn1 + count; 00487 py = pIn2; 00488 00489 /* Decrement the loop counter */ 00490 blkCnt--; 00491 } 00492 } 00493 else 00494 { 00495 /* If the srcBLen is not a multiple of 4, 00496 * the blockSize2 loop cannot be unrolled by 4 */ 00497 blkCnt = blockSize2; 00498 00499 while(blkCnt > 0u) 00500 { 00501 /* Accumulator is made zero for every iteration */ 00502 sum = 0.0f; 00503 00504 /* Loop over srcBLen */ 00505 k = srcBLen; 00506 00507 while(k > 0u) 00508 { 00509 /* Perform the multiply-accumulate */ 00510 sum += *px++ * *py++; 00511 00512 /* Decrement the loop counter */ 00513 k--; 00514 } 00515 00516 /* Store the result in the accumulator in the destination buffer. */ 00517 *pOut = sum; 00518 /* Destination pointer is updated according to the address modifier, inc */ 00519 pOut += inc; 00520 00521 /* Increment the pointer pIn1 index, count by 1 */ 00522 count++; 00523 00524 /* Update the inputA and inputB pointers for next MAC calculation */ 00525 px = pIn1 + count; 00526 py = pIn2; 00527 00528 /* Decrement the loop counter */ 00529 blkCnt--; 00530 } 00531 } 00532 00533 /* -------------------------- 00534 * Initializations of stage3 00535 * -------------------------*/ 00536 00537 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00538 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00539 * .... 00540 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00541 * sum += x[srcALen-1] * y[0] 00542 */ 00543 00544 /* In this stage the MAC operations are decreased by 1 for every iteration. 00545 The count variable holds the number of MAC operations performed */ 00546 count = srcBLen - 1u; 00547 00548 /* Working pointer of inputA */ 00549 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00550 px = pSrc1; 00551 00552 /* Working pointer of inputB */ 00553 py = pIn2; 00554 00555 /* ------------------- 00556 * Stage3 process 00557 * ------------------*/ 00558 00559 while(blockSize3 > 0u) 00560 { 00561 /* Accumulator is made zero for every iteration */ 00562 sum = 0.0f; 00563 00564 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00565 k = count >> 2u; 00566 00567 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00568 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00569 while(k > 0u) 00570 { 00571 x0 = *px++; 00572 c0 = *py++; 00573 /* Perform the multiply-accumulates */ 00574 00575 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00576 sum += x0 * c0; 00577 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00578 sum += *px++ * *py++; 00579 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00580 sum += *px++ * *py++; 00581 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00582 sum += *px++ * *py++; 00583 00584 /* Decrement the loop counter */ 00585 k--; 00586 } 00587 00588 /* If the count is not a multiple of 4, compute any remaining MACs here. 00589 ** No loop unrolling is used. */ 00590 k = count % 0x4u; 00591 00592 while(k > 0u) 00593 { 00594 /* Perform the multiply-accumulates */ 00595 sum += *px++ * *py++; 00596 00597 /* Decrement the loop counter */ 00598 k--; 00599 } 00600 00601 /* Store the result in the accumulator in the destination buffer. */ 00602 *pOut = sum; 00603 /* Destination pointer is updated according to the address modifier, inc */ 00604 pOut += inc; 00605 00606 /* Update the inputA and inputB pointers for next MAC calculation */ 00607 px = ++pSrc1; 00608 py = pIn2; 00609 00610 /* Decrement the MAC count */ 00611 count--; 00612 00613 /* Decrement the loop counter */ 00614 blockSize3--; 00615 } 00616 00617 } 00618