00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_correlate_q31.c 00009 * 00010 * Description: Correlation of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 00025 #include "arm_math.h" 00026 00062 void arm_correlate_q31( 00063 q31_t * pSrcA, 00064 uint32_t srcALen, 00065 q31_t * pSrcB, 00066 uint32_t srcBLen, 00067 q31_t * pDst) 00068 { 00069 q31_t *pIn1; /* inputA pointer */ 00070 q31_t *pIn2; /* inputB pointer */ 00071 q31_t *pOut = pDst; /* output pointer */ 00072 q31_t *px; /* Intermediate inputA pointer */ 00073 q31_t *py; /* Intermediate inputB pointer */ 00074 q31_t *pSrc1; /* Intermediate pointers */ 00075 q63_t sum, acc0, acc1, acc2; /* Accumulators */ 00076 q31_t x0, x1, x2, c0, c1; /* temporary variables for holding input and coefficient values */ 00077 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00078 int32_t inc = 1; /* Destination address modifier */ 00079 00080 00081 /* The algorithm implementation is based on the lengths of the inputs. */ 00082 /* srcB is always made to slide across srcA. */ 00083 /* So srcBLen is always considered as shorter or equal to srcALen */ 00084 /* But CORR(x, y) is reverse of CORR(y, x) */ 00085 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00086 /* and the destination pointer modifier, inc is set to -1 */ 00087 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00088 /* But to improve the performance, 00089 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00090 /* If srcALen > srcBLen, 00091 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00092 /* If srcALen < srcBLen, 00093 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00094 00095 if(srcALen >= srcBLen) 00096 { 00097 /* Initialization of inputA pointer */ 00098 pIn1 = (pSrcA); 00099 00100 /* Initialization of inputB pointer */ 00101 pIn2 = (pSrcB); 00102 00103 /* Number of output samples is calculated */ 00104 outBlockSize = (2u * srcALen) - 1u; 00105 00106 /* When srcALen > srcBLen, zero padding is done to srcB 00107 * to make their lengths equal. 00108 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00109 * number of output samples are made zero */ 00110 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00111 00112 /* Updating the pointer position to non zero value */ 00113 pOut += j; 00114 00115 } 00116 else 00117 { 00118 /* Initialization of inputA pointer */ 00119 pIn1 = (pSrcB); 00120 00121 /* Initialization of inputB pointer */ 00122 pIn2 = (pSrcA); 00123 00124 /* srcBLen is always considered as shorter or equal to srcALen */ 00125 j = srcBLen; 00126 srcBLen = srcALen; 00127 srcALen = j; 00128 00129 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00130 /* Hence set the destination pointer to point to the last output sample */ 00131 pOut = pDst + ((srcALen + srcBLen) - 2u); 00132 00133 /* Destination address modifier is set to -1 */ 00134 inc = -1; 00135 00136 } 00137 00138 /* The function is internally 00139 * divided into three parts according to the number of multiplications that has to be 00140 * taken place between inputA samples and inputB samples. In the first part of the 00141 * algorithm, the multiplications increase by one for every iteration. 00142 * In the second part of the algorithm, srcBLen number of multiplications are done. 00143 * In the third part of the algorithm, the multiplications decrease by one 00144 * for every iteration.*/ 00145 /* The algorithm is implemented in three stages. 00146 * The loop counters of each stage is initiated here. */ 00147 blockSize1 = srcBLen - 1u; 00148 blockSize2 = srcALen - (srcBLen - 1u); 00149 blockSize3 = blockSize1; 00150 00151 /* -------------------------- 00152 * Initializations of stage1 00153 * -------------------------*/ 00154 00155 /* sum = x[0] * y[srcBlen - 1] 00156 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00157 * .... 00158 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00159 */ 00160 00161 /* In this stage the MAC operations are increased by 1 for every iteration. 00162 The count variable holds the number of MAC operations performed */ 00163 count = 1u; 00164 00165 /* Working pointer of inputA */ 00166 px = pIn1; 00167 00168 /* Working pointer of inputB */ 00169 pSrc1 = pIn2 + (srcBLen - 1u); 00170 py = pSrc1; 00171 00172 /* ------------------------ 00173 * Stage1 process 00174 * ----------------------*/ 00175 00176 /* The first stage starts here */ 00177 while(blockSize1 > 0u) 00178 { 00179 /* Accumulator is made zero for every iteration */ 00180 sum = 0; 00181 00182 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00183 k = count >> 2; 00184 00185 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00186 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00187 while(k > 0u) 00188 { 00189 /* Read x[0] */ 00190 x0 = *px++; 00191 /* Read y[srcBLen - 4] */ 00192 c0 = *py++; 00193 00194 /* Read x[1] */ 00195 x1 = *px++; 00196 /* Read y[srcBLen - 3] */ 00197 c1 = *py++; 00198 00199 /* x[0] * y[srcBLen - 4] */ 00200 sum += (q63_t) x0 * c0; 00201 00202 /* x[1] * y[srcBLen - 3] */ 00203 sum += (q63_t) x1 * c1; 00204 00205 /* Read x[2] */ 00206 x0 = *px++; 00207 /* Read y[srcBLen - 2] */ 00208 c0 = *py++; 00209 00210 /* Read x[3] */ 00211 x1 = *px++; 00212 /* Read y[srcBLen - 1] */ 00213 c1 = *py++; 00214 00215 /* x[2] * y[srcBLen - 2] */ 00216 sum += (q63_t) x0 * c0; 00217 00218 /* x[3] * y[srcBLen - 1] */ 00219 sum += (q63_t) x1 * c1; 00220 00221 /* Decrement the loop counter */ 00222 k--; 00223 } 00224 00225 /* If the count is not a multiple of 4, compute any remaining MACs here. 00226 ** No loop unrolling is used. */ 00227 k = count % 0x4u; 00228 00229 while(k > 0u) 00230 { 00231 /* Perform the multiply-accumulates */ 00232 /* x[0] * y[srcBLen - 1] */ 00233 sum += (q63_t) * px++ * (*py++); 00234 00235 /* Decrement the loop counter */ 00236 k--; 00237 } 00238 00239 /* Store the result in the accumulator in the destination buffer. */ 00240 *pOut = (q31_t) (sum >> 31); 00241 /* Destination pointer is updated according to the address modifier, inc */ 00242 pOut += inc; 00243 00244 /* Update the inputA and inputB pointers for next MAC calculation */ 00245 py = pSrc1 - count; 00246 px = pIn1; 00247 00248 /* Increment the MAC count */ 00249 count++; 00250 00251 /* Decrement the loop counter */ 00252 blockSize1--; 00253 } 00254 00255 /* -------------------------- 00256 * Initializations of stage2 00257 * ------------------------*/ 00258 00259 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00260 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00261 * .... 00262 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00263 */ 00264 00265 /* Working pointer of inputA */ 00266 px = pIn1; 00267 00268 /* Working pointer of inputB */ 00269 py = pIn2; 00270 00271 /* count is index by which the pointer pIn1 to be incremented */ 00272 count = 0u; 00273 00274 /* ------------------- 00275 * Stage2 process 00276 * ------------------*/ 00277 00278 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00279 * So, to loop unroll over blockSize2, 00280 * srcBLen should be greater than or equal to 4 */ 00281 if(srcBLen >= 4u) 00282 { 00283 /* Loop unroll by 3 */ 00284 blkCnt = blockSize2 / 3; 00285 00286 while(blkCnt > 0u) 00287 { 00288 /* Set all accumulators to zero */ 00289 acc0 = 0; 00290 acc1 = 0; 00291 acc2 = 0; 00292 00293 /* read x[0], x[1], x[2] samples */ 00294 x0 = *(px++); 00295 x1 = *(px++); 00296 00297 /* Apply loop unrolling and compute 3 MACs simultaneously. */ 00298 k = srcBLen / 3; 00299 00300 /* First part of the processing with loop unrolling. Compute 3 MACs at a time. 00301 ** a second loop below computes MACs for the remaining 1 to 2 samples. */ 00302 do 00303 { 00304 /* Read y[0] sample */ 00305 c0 = *(py); 00306 00307 /* Read x[3] sample */ 00308 x2 = *(px); 00309 00310 /* Perform the multiply-accumulate */ 00311 /* acc0 += x[0] * y[0] */ 00312 acc0 += ((q63_t) x0 * c0); 00313 /* acc1 += x[1] * y[0] */ 00314 acc1 += ((q63_t) x1 * c0); 00315 /* acc2 += x[2] * y[0] */ 00316 acc2 += ((q63_t) x2 * c0); 00317 00318 /* Read y[1] sample */ 00319 c0 = *(py + 1u); 00320 00321 /* Read x[4] sample */ 00322 x0 = *(px + 1u); 00323 00324 /* Perform the multiply-accumulates */ 00325 /* acc0 += x[1] * y[1] */ 00326 acc0 += ((q63_t) x1 * c0); 00327 /* acc1 += x[2] * y[1] */ 00328 acc1 += ((q63_t) x2 * c0); 00329 /* acc2 += x[3] * y[1] */ 00330 acc2 += ((q63_t) x0 * c0); 00331 00332 /* Read y[2] sample */ 00333 c0 = *(py + 2u); 00334 00335 /* Read x[5] sample */ 00336 x1 = *(px + 2u); 00337 00338 /* Perform the multiply-accumulates */ 00339 /* acc0 += x[2] * y[2] */ 00340 acc0 += ((q63_t) x2 * c0); 00341 /* acc1 += x[3] * y[2] */ 00342 acc1 += ((q63_t) x0 * c0); 00343 /* acc2 += x[4] * y[2] */ 00344 acc2 += ((q63_t) x1 * c0); 00345 00346 /* update scratch pointers */ 00347 px += 3u; 00348 py += 3u; 00349 00350 } while(--k); 00351 00352 /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. 00353 ** No loop unrolling is used. */ 00354 k = srcBLen - ( 3 * (srcBLen/3)); 00355 00356 while(k > 0u) 00357 { 00358 /* Read y[4] sample */ 00359 c0 = *(py++); 00360 00361 /* Read x[7] sample */ 00362 x2 = *(px++); 00363 00364 /* Perform the multiply-accumulates */ 00365 /* acc0 += x[4] * y[4] */ 00366 acc0 += ((q63_t) x0 * c0); 00367 /* acc1 += x[5] * y[4] */ 00368 acc1 += ((q63_t) x1 * c0); 00369 /* acc2 += x[6] * y[4] */ 00370 acc2 += ((q63_t) x2 * c0); 00371 00372 /* Reuse the present samples for the next MAC */ 00373 x0 = x1; 00374 x1 = x2; 00375 00376 /* Decrement the loop counter */ 00377 k--; 00378 } 00379 00380 /* Store the result in the accumulator in the destination buffer. */ 00381 *pOut = (q31_t) (acc0 >> 31); 00382 /* Destination pointer is updated according to the address modifier, inc */ 00383 pOut += inc; 00384 00385 *pOut = (q31_t) (acc1 >> 31); 00386 pOut += inc; 00387 00388 *pOut = (q31_t) (acc2 >> 31); 00389 pOut += inc; 00390 00391 /* Increment the pointer pIn1 index, count by 1 */ 00392 count += 3u; 00393 00394 /* Update the inputA and inputB pointers for next MAC calculation */ 00395 px = pIn1 + count; 00396 py = pIn2; 00397 00398 /* Decrement the loop counter */ 00399 blkCnt--; 00400 } 00401 00402 /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. 00403 ** No loop unrolling is used. */ 00404 blkCnt = blockSize2 - 3 * (blockSize2/3); 00405 00406 while(blkCnt > 0u) 00407 { 00408 /* Accumulator is made zero for every iteration */ 00409 sum = 0; 00410 00411 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00412 k = srcBLen >> 2u; 00413 00414 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00415 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00416 while(k > 0u) 00417 { 00418 /* Perform the multiply-accumulates */ 00419 sum += (q63_t) * px++ * (*py++); 00420 sum += (q63_t) * px++ * (*py++); 00421 sum += (q63_t) * px++ * (*py++); 00422 sum += (q63_t) * px++ * (*py++); 00423 00424 /* Decrement the loop counter */ 00425 k--; 00426 } 00427 00428 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00429 ** No loop unrolling is used. */ 00430 k = srcBLen % 0x4u; 00431 00432 while(k > 0u) 00433 { 00434 /* Perform the multiply-accumulate */ 00435 sum += (q63_t) * px++ * (*py++); 00436 00437 /* Decrement the loop counter */ 00438 k--; 00439 } 00440 00441 /* Store the result in the accumulator in the destination buffer. */ 00442 *pOut = (q31_t) (sum >> 31); 00443 /* Destination pointer is updated according to the address modifier, inc */ 00444 pOut += inc; 00445 00446 /* Increment the MAC count */ 00447 count++; 00448 00449 /* Update the inputA and inputB pointers for next MAC calculation */ 00450 px = pIn1 + count; 00451 py = pIn2; 00452 00453 /* Decrement the loop counter */ 00454 blkCnt--; 00455 } 00456 } 00457 else 00458 { 00459 /* If the srcBLen is not a multiple of 4, 00460 * the blockSize2 loop cannot be unrolled by 4 */ 00461 blkCnt = blockSize2; 00462 00463 while(blkCnt > 0u) 00464 { 00465 /* Accumulator is made zero for every iteration */ 00466 sum = 0; 00467 00468 /* srcBLen number of MACS should be performed */ 00469 k = srcBLen; 00470 00471 while(k > 0u) 00472 { 00473 /* Perform the multiply-accumulate */ 00474 sum += (q63_t) * px++ * (*py++); 00475 00476 /* Decrement the loop counter */ 00477 k--; 00478 } 00479 00480 /* Store the result in the accumulator in the destination buffer. */ 00481 *pOut = (q31_t) (sum >> 31); 00482 /* Destination pointer is updated according to the address modifier, inc */ 00483 pOut += inc; 00484 00485 /* Increment the MAC count */ 00486 count++; 00487 00488 /* Update the inputA and inputB pointers for next MAC calculation */ 00489 px = pIn1 + count; 00490 py = pIn2; 00491 00492 /* Decrement the loop counter */ 00493 blkCnt--; 00494 } 00495 } 00496 00497 /* -------------------------- 00498 * Initializations of stage3 00499 * -------------------------*/ 00500 00501 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00502 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00503 * .... 00504 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00505 * sum += x[srcALen-1] * y[0] 00506 */ 00507 00508 /* In this stage the MAC operations are decreased by 1 for every iteration. 00509 The count variable holds the number of MAC operations performed */ 00510 count = srcBLen - 1u; 00511 00512 /* Working pointer of inputA */ 00513 pSrc1 = pIn1 + (srcALen - (srcBLen - 1u)); 00514 px = pSrc1; 00515 00516 /* Working pointer of inputB */ 00517 py = pIn2; 00518 00519 /* ------------------- 00520 * Stage3 process 00521 * ------------------*/ 00522 00523 while(blockSize3 > 0u) 00524 { 00525 /* Accumulator is made zero for every iteration */ 00526 sum = 0; 00527 00528 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00529 k = count >> 2u; 00530 00531 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00532 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00533 while(k > 0u) 00534 { 00535 x0 = *px++; 00536 c0 = *py++; 00537 00538 x1 = *px++; 00539 c1 = *py++; 00540 00541 /* Perform the multiply-accumulates */ 00542 /* sum += x[srcALen - srcBLen + 4] * y[3] */ 00543 sum += (q63_t) x0 * c0; 00544 /* sum += x[srcALen - srcBLen + 3] * y[2] */ 00545 sum += (q63_t) x1 * c1; 00546 00547 x0 = *px++; 00548 c0 = *py++; 00549 00550 x1 = *px++; 00551 c1 = *py++; 00552 00553 /* sum += x[srcALen - srcBLen + 2] * y[1] */ 00554 sum += (q63_t) x0 * c0; 00555 /* sum += x[srcALen - srcBLen + 1] * y[0] */ 00556 sum += (q63_t) x1 * c1; 00557 00558 /* Decrement the loop counter */ 00559 k--; 00560 } 00561 00562 /* If the count is not a multiple of 4, compute any remaining MACs here. 00563 ** No loop unrolling is used. */ 00564 k = count % 0x4u; 00565 00566 while(k > 0u) 00567 { 00568 /* Perform the multiply-accumulates */ 00569 sum += (q63_t) * px++ * (*py++); 00570 00571 /* Decrement the loop counter */ 00572 k--; 00573 } 00574 00575 /* Store the result in the accumulator in the destination buffer. */ 00576 *pOut = (q31_t) (sum >> 31); 00577 /* Destination pointer is updated according to the address modifier, inc */ 00578 pOut += inc; 00579 00580 /* Update the inputA and inputB pointers for next MAC calculation */ 00581 px = ++pSrc1; 00582 py = pIn2; 00583 00584 /* Decrement the MAC count */ 00585 count--; 00586 00587 /* Decrement the loop counter */ 00588 blockSize3--; 00589 } 00590 00591 } 00592