00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_correlate_fast_q31.c 00009 * 00010 * Description: Fast Q31 Correlation. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 #include "arm_math.h" 00025 00064 void arm_correlate_fast_q31( 00065 q31_t * pSrcA, 00066 uint32_t srcALen, 00067 q31_t * pSrcB, 00068 uint32_t srcBLen, 00069 q31_t * pDst) 00070 { 00071 q31_t *pIn1; /* inputA pointer */ 00072 q31_t *pIn2; /* inputB pointer */ 00073 q31_t *pOut = pDst; /* output pointer */ 00074 q31_t *px; /* Intermediate inputA pointer */ 00075 q31_t *py; /* Intermediate inputB pointer */ 00076 q31_t *pSrc1; /* Intermediate pointers */ 00077 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00078 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00079 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00080 int32_t inc = 1; /* Destination address modifier */ 00081 00082 00083 /* The algorithm implementation is based on the lengths of the inputs. */ 00084 /* srcB is always made to slide across srcA. */ 00085 /* So srcBLen is always considered as shorter or equal to srcALen */ 00086 if(srcALen >= srcBLen) 00087 { 00088 /* Initialization of inputA pointer */ 00089 pIn1 = (pSrcA); 00090 00091 /* Initialization of inputB pointer */ 00092 pIn2 = (pSrcB); 00093 00094 /* Number of output samples is calculated */ 00095 outBlockSize = (2u * srcALen) - 1u; 00096 00097 /* When srcALen > srcBLen, zero padding is done to srcB 00098 * to make their lengths equal. 00099 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00100 * number of output samples are made zero */ 00101 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00102 00103 /* Updating the pointer position to non zero value */ 00104 pOut += j; 00105 00106 } 00107 else 00108 { 00109 /* Initialization of inputA pointer */ 00110 pIn1 = (pSrcB); 00111 00112 /* Initialization of inputB pointer */ 00113 pIn2 = (pSrcA); 00114 00115 /* srcBLen is always considered as shorter or equal to srcALen */ 00116 j = srcBLen; 00117 srcBLen = srcALen; 00118 srcALen = j; 00119 00120 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00121 /* Hence set the destination pointer to point to the last output sample */ 00122 pOut = pDst + ((srcALen + srcBLen) - 2u); 00123 00124 /* Destination address modifier is set to -1 */ 00125 inc = -1; 00126 00127 } 00128 00129 /* The function is internally 00130 * divided into three parts according to the number of multiplications that has to be 00131 * taken place between inputA samples and inputB samples. In the first part of the 00132 * algorithm, the multiplications increase by one for every iteration. 00133 * In the second part of the algorithm, srcBLen number of multiplications are done. 00134 * In the third part of the algorithm, the multiplications decrease by one 00135 * for every iteration.*/ 00136 /* The algorithm is implemented in three stages. 00137 * The loop counters of each stage is initiated here. */ 00138 blockSize1 = srcBLen - 1u; 00139 blockSize2 = srcALen - (srcBLen - 1u); 00140 blockSize3 = blockSize1; 00141 00142 /* -------------------------- 00143 * Initializations of stage1 00144 * -------------------------*/ 00145 00146 /* sum = x[0] * y[srcBlen - 1] 00147 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00148 * .... 00149 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00150 */ 00151 00152 /* In this stage the MAC operations are increased by 1 for every iteration. 00153 The count variable holds the number of MAC operations performed */ 00154 count = 1u; 00155 00156 /* Working pointer of inputA */ 00157 px = pIn1; 00158 00159 /* Working pointer of inputB */ 00160 pSrc1 = pIn2 + (srcBLen - 1u); 00161 py = pSrc1; 00162 00163 /* ------------------------ 00164 * Stage1 process 00165 * ----------------------*/ 00166 00167 /* The first stage starts here */ 00168 while(blockSize1 > 0u) 00169 { 00170 /* Accumulator is made zero for every iteration */ 00171 sum = 0; 00172 00173 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00174 k = count >> 2; 00175 00176 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00177 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00178 while(k > 0u) 00179 { 00180 00181 00182 /* x[0] * y[srcBLen - 4] */ 00183 sum = (q31_t) ((((q63_t) sum << 32) + 00184 ((q63_t) * px++ * (*py++))) >> 32); 00185 /* x[1] * y[srcBLen - 3] */ 00186 sum = (q31_t) ((((q63_t) sum << 32) + 00187 ((q63_t) * px++ * (*py++))) >> 32); 00188 00189 /* x[2] * y[srcBLen - 2] */ 00190 sum = (q31_t) ((((q63_t) sum << 32) + 00191 ((q63_t) * px++ * (*py++))) >> 32); 00192 /* x[3] * y[srcBLen - 1] */ 00193 sum = (q31_t) ((((q63_t) sum << 32) + 00194 ((q63_t) * px++ * (*py++))) >> 32); 00195 00196 /* Decrement the loop counter */ 00197 k--; 00198 } 00199 00200 /* If the count is not a multiple of 4, compute any remaining MACs here. 00201 ** No loop unrolling is used. */ 00202 k = count % 0x4u; 00203 00204 while(k > 0u) 00205 { 00206 /* Perform the multiply-accumulates */ 00207 /* x[0] * y[srcBLen - 1] */ 00208 sum = (q31_t) ((((q63_t) sum << 32) + 00209 ((q63_t) * px++ * (*py++))) >> 32); 00210 00211 /* Decrement the loop counter */ 00212 k--; 00213 } 00214 00215 /* Store the result in the accumulator in the destination buffer. */ 00216 *pOut = sum << 1; 00217 /* Destination pointer is updated according to the address modifier, inc */ 00218 pOut += inc; 00219 00220 /* Update the inputA and inputB pointers for next MAC calculation */ 00221 py = pSrc1 - count; 00222 px = pIn1; 00223 00224 /* Increment the MAC count */ 00225 count++; 00226 00227 /* Decrement the loop counter */ 00228 blockSize1--; 00229 } 00230 00231 /* -------------------------- 00232 * Initializations of stage2 00233 * ------------------------*/ 00234 00235 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00236 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00237 * .... 00238 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00239 */ 00240 00241 /* Working pointer of inputA */ 00242 px = pIn1; 00243 00244 /* Working pointer of inputB */ 00245 py = pIn2; 00246 00247 /* count is index by which the pointer pIn1 to be incremented */ 00248 count = 0u; 00249 00250 /* ------------------- 00251 * Stage2 process 00252 * ------------------*/ 00253 00254 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00255 * So, to loop unroll over blockSize2, 00256 * srcBLen should be greater than or equal to 4 */ 00257 if(srcBLen >= 4u) 00258 { 00259 /* Loop unroll over blockSize2, by 4 */ 00260 blkCnt = blockSize2 >> 2u; 00261 00262 while(blkCnt > 0u) 00263 { 00264 /* Set all accumulators to zero */ 00265 acc0 = 0; 00266 acc1 = 0; 00267 acc2 = 0; 00268 acc3 = 0; 00269 00270 /* read x[0], x[1], x[2] samples */ 00271 x0 = *(px++); 00272 x1 = *(px++); 00273 x2 = *(px++); 00274 00275 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00276 k = srcBLen >> 2u; 00277 00278 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00279 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00280 do 00281 { 00282 /* Read y[0] sample */ 00283 c0 = *(py++); 00284 00285 /* Read x[3] sample */ 00286 x3 = *(px++); 00287 00288 /* Perform the multiply-accumulate */ 00289 /* acc0 += x[0] * y[0] */ 00290 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00291 /* acc1 += x[1] * y[0] */ 00292 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00293 /* acc2 += x[2] * y[0] */ 00294 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00295 /* acc3 += x[3] * y[0] */ 00296 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00297 00298 /* Read y[1] sample */ 00299 c0 = *(py++); 00300 00301 /* Read x[4] sample */ 00302 x0 = *(px++); 00303 00304 /* Perform the multiply-accumulates */ 00305 /* acc0 += x[1] * y[1] */ 00306 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00307 /* acc1 += x[2] * y[1] */ 00308 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00309 /* acc2 += x[3] * y[1] */ 00310 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00311 /* acc3 += x[4] * y[1] */ 00312 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00313 00314 /* Read y[2] sample */ 00315 c0 = *(py++); 00316 00317 /* Read x[5] sample */ 00318 x1 = *(px++); 00319 00320 /* Perform the multiply-accumulates */ 00321 /* acc0 += x[2] * y[2] */ 00322 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00323 /* acc1 += x[3] * y[2] */ 00324 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00325 /* acc2 += x[4] * y[2] */ 00326 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00327 /* acc3 += x[5] * y[2] */ 00328 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00329 00330 /* Read y[3] sample */ 00331 c0 = *(py++); 00332 00333 /* Read x[6] sample */ 00334 x2 = *(px++); 00335 00336 /* Perform the multiply-accumulates */ 00337 /* acc0 += x[3] * y[3] */ 00338 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00339 /* acc1 += x[4] * y[3] */ 00340 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00341 /* acc2 += x[5] * y[3] */ 00342 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00343 /* acc3 += x[6] * y[3] */ 00344 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00345 00346 00347 } while(--k); 00348 00349 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00350 ** No loop unrolling is used. */ 00351 k = srcBLen % 0x4u; 00352 00353 while(k > 0u) 00354 { 00355 /* Read y[4] sample */ 00356 c0 = *(py++); 00357 00358 /* Read x[7] sample */ 00359 x3 = *(px++); 00360 00361 /* Perform the multiply-accumulates */ 00362 /* acc0 += x[4] * y[4] */ 00363 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00364 /* acc1 += x[5] * y[4] */ 00365 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00366 /* acc2 += x[6] * y[4] */ 00367 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00368 /* acc3 += x[7] * y[4] */ 00369 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00370 00371 /* Reuse the present samples for the next MAC */ 00372 x0 = x1; 00373 x1 = x2; 00374 x2 = x3; 00375 00376 /* Decrement the loop counter */ 00377 k--; 00378 } 00379 00380 /* Store the result in the accumulator in the destination buffer. */ 00381 *pOut = (q31_t) (acc0 << 1); 00382 /* Destination pointer is updated according to the address modifier, inc */ 00383 pOut += inc; 00384 00385 *pOut = (q31_t) (acc1 << 1); 00386 pOut += inc; 00387 00388 *pOut = (q31_t) (acc2 << 1); 00389 pOut += inc; 00390 00391 *pOut = (q31_t) (acc3 << 1); 00392 pOut += inc; 00393 00394 /* Increment the pointer pIn1 index, count by 1 */ 00395 count += 4u; 00396 00397 /* Update the inputA and inputB pointers for next MAC calculation */ 00398 px = pIn1 + count; 00399 py = pIn2; 00400 00401 /* Decrement the loop counter */ 00402 blkCnt--; 00403 } 00404 00405 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00406 ** No loop unrolling is used. */ 00407 blkCnt = blockSize2 % 0x4u; 00408 00409 while(blkCnt > 0u) 00410 { 00411 /* Accumulator is made zero for every iteration */ 00412 sum = 0; 00413 00414 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00415 k = srcBLen >> 2u; 00416 00417 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00418 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00419 while(k > 0u) 00420 { 00421 /* Perform the multiply-accumulates */ 00422 sum = (q31_t) ((((q63_t) sum << 32) + 00423 ((q63_t) * px++ * (*py++))) >> 32); 00424 sum = (q31_t) ((((q63_t) sum << 32) + 00425 ((q63_t) * px++ * (*py++))) >> 32); 00426 sum = (q31_t) ((((q63_t) sum << 32) + 00427 ((q63_t) * px++ * (*py++))) >> 32); 00428 sum = (q31_t) ((((q63_t) sum << 32) + 00429 ((q63_t) * px++ * (*py++))) >> 32); 00430 00431 /* Decrement the loop counter */ 00432 k--; 00433 } 00434 00435 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00436 ** No loop unrolling is used. */ 00437 k = srcBLen % 0x4u; 00438 00439 while(k > 0u) 00440 { 00441 /* Perform the multiply-accumulate */ 00442 sum = (q31_t) ((((q63_t) sum << 32) + 00443 ((q63_t) * px++ * (*py++))) >> 32); 00444 00445 /* Decrement the loop counter */ 00446 k--; 00447 } 00448 00449 /* Store the result in the accumulator in the destination buffer. */ 00450 *pOut = sum << 1; 00451 /* Destination pointer is updated according to the address modifier, inc */ 00452 pOut += inc; 00453 00454 /* Increment the MAC count */ 00455 count++; 00456 00457 /* Update the inputA and inputB pointers for next MAC calculation */ 00458 px = pIn1 + count; 00459 py = pIn2; 00460 00461 /* Decrement the loop counter */ 00462 blkCnt--; 00463 } 00464 } 00465 else 00466 { 00467 /* If the srcBLen is not a multiple of 4, 00468 * the blockSize2 loop cannot be unrolled by 4 */ 00469 blkCnt = blockSize2; 00470 00471 while(blkCnt > 0u) 00472 { 00473 /* Accumulator is made zero for every iteration */ 00474 sum = 0; 00475 00476 /* Loop over srcBLen */ 00477 k = srcBLen; 00478 00479 while(k > 0u) 00480 { 00481 /* Perform the multiply-accumulate */ 00482 sum = (q31_t) ((((q63_t) sum << 32) + 00483 ((q63_t) * px++ * (*py++))) >> 32); 00484 00485 /* Decrement the loop counter */ 00486 k--; 00487 } 00488 00489 /* Store the result in the accumulator in the destination buffer. */ 00490 *pOut = sum << 1; 00491 /* Destination pointer is updated according to the address modifier, inc */ 00492 pOut += inc; 00493 00494 /* Increment the MAC count */ 00495 count++; 00496 00497 /* Update the inputA and inputB pointers for next MAC calculation */ 00498 px = pIn1 + count; 00499 py = pIn2; 00500 00501 /* Decrement the loop counter */ 00502 blkCnt--; 00503 } 00504 } 00505 00506 /* -------------------------- 00507 * Initializations of stage3 00508 * -------------------------*/ 00509 00510 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00511 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00512 * .... 00513 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00514 * sum += x[srcALen-1] * y[0] 00515 */ 00516 00517 /* In this stage the MAC operations are decreased by 1 for every iteration. 00518 The count variable holds the number of MAC operations performed */ 00519 count = srcBLen - 1u; 00520 00521 /* Working pointer of inputA */ 00522 pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u; 00523 px = pSrc1; 00524 00525 /* Working pointer of inputB */ 00526 py = pIn2; 00527 00528 /* ------------------- 00529 * Stage3 process 00530 * ------------------*/ 00531 00532 while(blockSize3 > 0u) 00533 { 00534 /* Accumulator is made zero for every iteration */ 00535 sum = 0; 00536 00537 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00538 k = count >> 2u; 00539 00540 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00541 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00542 while(k > 0u) 00543 { 00544 /* Perform the multiply-accumulates */ 00545 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00546 sum = (q31_t) ((((q63_t) sum << 32) + 00547 ((q63_t) * px++ * (*py++))) >> 32); 00548 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00549 sum = (q31_t) ((((q63_t) sum << 32) + 00550 ((q63_t) * px++ * (*py++))) >> 32); 00551 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00552 sum = (q31_t) ((((q63_t) sum << 32) + 00553 ((q63_t) * px++ * (*py++))) >> 32); 00554 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00555 sum = (q31_t) ((((q63_t) sum << 32) + 00556 ((q63_t) * px++ * (*py++))) >> 32); 00557 00558 /* Decrement the loop counter */ 00559 k--; 00560 } 00561 00562 /* If the count is not a multiple of 4, compute any remaining MACs here. 00563 ** No loop unrolling is used. */ 00564 k = count % 0x4u; 00565 00566 while(k > 0u) 00567 { 00568 /* Perform the multiply-accumulates */ 00569 sum = (q31_t) ((((q63_t) sum << 32) + 00570 ((q63_t) * px++ * (*py++))) >> 32); 00571 00572 /* Decrement the loop counter */ 00573 k--; 00574 } 00575 00576 /* Store the result in the accumulator in the destination buffer. */ 00577 *pOut = sum << 1; 00578 /* Destination pointer is updated according to the address modifier, inc */ 00579 pOut += inc; 00580 00581 /* Update the inputA and inputB pointers for next MAC calculation */ 00582 px = ++pSrc1; 00583 py = pIn2; 00584 00585 /* Decrement the MAC count */ 00586 count--; 00587 00588 /* Decrement the loop counter */ 00589 blockSize3--; 00590 } 00591 00592 } 00593