00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_conv_fast_q31.c 00009 * 00010 * Description: Q31 Convolution (fast version). 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 #include "arm_math.h" 00025 00062 void arm_conv_fast_q31( 00063 q31_t * pSrcA, 00064 uint32_t srcALen, 00065 q31_t * pSrcB, 00066 uint32_t srcBLen, 00067 q31_t * pDst) 00068 { 00069 q31_t *pIn1; /* inputA pointer */ 00070 q31_t *pIn2; /* inputB pointer */ 00071 q31_t *pOut = pDst; /* output pointer */ 00072 q31_t *px; /* Intermediate inputA pointer */ 00073 q31_t *py; /* Intermediate inputB pointer */ 00074 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00075 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00076 q31_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold input1 and input2 values */ 00077 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00078 00079 /* The algorithm implementation is based on the lengths of the inputs. */ 00080 /* srcB is always made to slide across srcA. */ 00081 /* So srcBLen is always considered as shorter or equal to srcALen */ 00082 if(srcALen >= srcBLen) 00083 { 00084 /* Initialization of inputA pointer */ 00085 pIn1 = pSrcA; 00086 00087 /* Initialization of inputB pointer */ 00088 pIn2 = pSrcB; 00089 } 00090 else 00091 { 00092 /* Initialization of inputA pointer */ 00093 pIn1 = pSrcB; 00094 00095 /* Initialization of inputB pointer */ 00096 pIn2 = pSrcA; 00097 00098 /* srcBLen is always considered as shorter or equal to srcALen */ 00099 j = srcBLen; 00100 srcBLen = srcALen; 00101 srcALen = j; 00102 } 00103 00104 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00105 /* The function is internally 00106 * divided into three stages according to the number of multiplications that has to be 00107 * taken place between inputA samples and inputB samples. In the first stage of the 00108 * algorithm, the multiplications increase by one for every iteration. 00109 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00110 * In the third stage of the algorithm, the multiplications decrease by one 00111 * for every iteration. */ 00112 00113 /* The algorithm is implemented in three stages. 00114 The loop counters of each stage is initiated here. */ 00115 blockSize1 = srcBLen - 1u; 00116 blockSize2 = srcALen - (srcBLen - 1u); 00117 blockSize3 = blockSize1; 00118 00119 /* -------------------------- 00120 * Initializations of stage1 00121 * -------------------------*/ 00122 00123 /* sum = x[0] * y[0] 00124 * sum = x[0] * y[1] + x[1] * y[0] 00125 * .... 00126 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00127 */ 00128 00129 /* In this stage the MAC operations are increased by 1 for every iteration. 00130 The count variable holds the number of MAC operations performed */ 00131 count = 1u; 00132 00133 /* Working pointer of inputA */ 00134 px = pIn1; 00135 00136 /* Working pointer of inputB */ 00137 py = pIn2; 00138 00139 00140 /* ------------------------ 00141 * Stage1 process 00142 * ----------------------*/ 00143 00144 /* The first stage starts here */ 00145 while(blockSize1 > 0u) 00146 { 00147 /* Accumulator is made zero for every iteration */ 00148 sum = 0; 00149 00150 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00151 k = count >> 2u; 00152 00153 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00154 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00155 while(k > 0u) 00156 { 00157 00158 /* Read x[0] */ 00159 x0 = *px++; 00160 /* Read y[srcBLen - 1] */ 00161 c0 = *py--; 00162 00163 /* Read x[1] */ 00164 x1 = *px++; 00165 /* Read y[srcBLen - 2] */ 00166 c1 = *py--; 00167 00168 /* x[0] * y[srcBLen - 1] */ 00169 sum = (q31_t) ((((q63_t) sum << 32) + 00170 ((q63_t) x0 * c0)) >> 32); 00171 00172 /* x[1] * y[srcBLen - 2] */ 00173 sum = (q31_t) ((((q63_t) sum << 32) + 00174 ((q63_t) x1 * c1)) >> 32); 00175 00176 /* Read x[2] */ 00177 x0 = *px++; 00178 /* Read y[srcBLen - 3] */ 00179 c0 = *py--; 00180 00181 /* Read x[3] */ 00182 x1 = *px++; 00183 /* Read y[srcBLen - 4] */ 00184 c1 = *py--; 00185 00186 /* x[2] * y[srcBLen - 3] */ 00187 sum = (q31_t) ((((q63_t) sum << 32) + 00188 ((q63_t) x0 * c0)) >> 32); 00189 00190 /* x[3] * y[srcBLen - 4] */ 00191 sum = (q31_t) ((((q63_t) sum << 32) + 00192 ((q63_t) x1 * c1)) >> 32); 00193 00194 /* Decrement the loop counter */ 00195 k--; 00196 } 00197 00198 /* If the count is not a multiple of 4, compute any remaining MACs here. 00199 ** No loop unrolling is used. */ 00200 k = count % 0x4u; 00201 00202 while(k > 0u) 00203 { 00204 00205 /* Perform the multiply-accumulate */ 00206 sum = (q31_t) ((((q63_t) sum << 32) + 00207 ((q63_t) * px++ * (*py--))) >> 32); 00208 00209 /* Decrement the loop counter */ 00210 k--; 00211 } 00212 00213 /* Store the result in the accumulator in the destination buffer. */ 00214 *pOut++ = sum << 1; 00215 00216 /* Update the inputA and inputB pointers for next MAC calculation */ 00217 py = pIn2 + count; 00218 px = pIn1; 00219 00220 /* Increment the MAC count */ 00221 count++; 00222 00223 /* Decrement the loop counter */ 00224 blockSize1--; 00225 } 00226 00227 /* -------------------------- 00228 * Initializations of stage2 00229 * ------------------------*/ 00230 00231 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00232 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00233 * .... 00234 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00235 */ 00236 00237 /* Working pointer of inputA */ 00238 px = pIn1; 00239 00240 /* Working pointer of inputB */ 00241 pSrc2 = pIn2 + (srcBLen - 1u); 00242 py = pSrc2; 00243 00244 /* count is index by which the pointer pIn1 to be incremented */ 00245 count = 0u; 00246 00247 /* ------------------- 00248 * Stage2 process 00249 * ------------------*/ 00250 00251 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00252 * So, to loop unroll over blockSize2, 00253 * srcBLen should be greater than or equal to 4 */ 00254 if(srcBLen >= 4u) 00255 { 00256 /* Loop unroll over blockSize2, by 4 */ 00257 blkCnt = blockSize2 >> 2u; 00258 00259 while(blkCnt > 0u) 00260 { 00261 /* Set all accumulators to zero */ 00262 acc0 = 0; 00263 acc1 = 0; 00264 acc2 = 0; 00265 acc3 = 0; 00266 00267 /* read x[0], x[1], x[2] samples */ 00268 x0 = *(px++); 00269 x1 = *(px++); 00270 x2 = *(px++); 00271 00272 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00273 k = srcBLen >> 2u; 00274 00275 00276 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00277 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00278 do 00279 { 00280 /* Read y[srcBLen - 1] sample */ 00281 c0 = *(py); 00282 00283 /* Read x[3] sample */ 00284 x3 = *(px); 00285 00286 /* Perform the multiply-accumulates */ 00287 /* acc0 += x[0] * y[srcBLen - 1] */ 00288 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00289 00290 /* acc1 += x[1] * y[srcBLen - 1] */ 00291 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00292 00293 /* acc2 += x[2] * y[srcBLen - 1] */ 00294 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00295 00296 /* acc3 += x[3] * y[srcBLen - 1] */ 00297 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00298 00299 /* Read y[srcBLen - 2] sample */ 00300 c0 = *(py - 1u); 00301 00302 /* Read x[4] sample */ 00303 x0 = *(px + 1u); 00304 00305 /* Perform the multiply-accumulate */ 00306 /* acc0 += x[1] * y[srcBLen - 2] */ 00307 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00308 /* acc1 += x[2] * y[srcBLen - 2] */ 00309 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00310 /* acc2 += x[3] * y[srcBLen - 2] */ 00311 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00312 /* acc3 += x[4] * y[srcBLen - 2] */ 00313 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00314 00315 /* Read y[srcBLen - 3] sample */ 00316 c0 = *(py - 2u); 00317 00318 /* Read x[5] sample */ 00319 x1 = *(px + 2u); 00320 00321 /* Perform the multiply-accumulates */ 00322 /* acc0 += x[2] * y[srcBLen - 3] */ 00323 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00324 /* acc1 += x[3] * y[srcBLen - 3] */ 00325 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00326 /* acc2 += x[4] * y[srcBLen - 3] */ 00327 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00328 /* acc3 += x[5] * y[srcBLen - 3] */ 00329 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00330 00331 /* Read y[srcBLen - 4] sample */ 00332 c0 = *(py - 3u); 00333 00334 /* Read x[6] sample */ 00335 x2 = *(px + 3u); 00336 00337 /* Perform the multiply-accumulates */ 00338 /* acc0 += x[3] * y[srcBLen - 4] */ 00339 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00340 /* acc1 += x[4] * y[srcBLen - 4] */ 00341 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00342 /* acc2 += x[5] * y[srcBLen - 4] */ 00343 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00344 /* acc3 += x[6] * y[srcBLen - 4] */ 00345 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00346 00347 /* update scratch pointers */ 00348 px += 4u; 00349 py -= 4u; 00350 00351 00352 } while(--k); 00353 00354 /* If the srcBLen is not a multiple of 5, compute any remaining MACs here. 00355 ** No loop unrolling is used. */ 00356 k = srcBLen % 0x4u; 00357 00358 while(k > 0u) 00359 { 00360 /* Read y[srcBLen - 5] sample */ 00361 c0 = *(py--); 00362 00363 /* Read x[7] sample */ 00364 x3 = *(px++); 00365 00366 /* Perform the multiply-accumulates */ 00367 /* acc0 += x[4] * y[srcBLen - 5] */ 00368 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00369 /* acc1 += x[5] * y[srcBLen - 5] */ 00370 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00371 /* acc2 += x[6] * y[srcBLen - 5] */ 00372 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00373 /* acc3 += x[7] * y[srcBLen - 5] */ 00374 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00375 00376 00377 /* Reuse the present samples for the next MAC */ 00378 x0 = x1; 00379 x1 = x2; 00380 x2 = x3; 00381 00382 00383 /* Decrement the loop counter */ 00384 k--; 00385 } 00386 00387 /* Store the results in the accumulators in the destination buffer. */ 00388 *pOut++ = (q31_t) (acc0 << 1); 00389 *pOut++ = (q31_t) (acc1 << 1); 00390 *pOut++ = (q31_t) (acc2 << 1); 00391 *pOut++ = (q31_t) (acc3 << 1); 00392 00393 00394 /* Increment the pointer pIn1 index, count by 1 */ 00395 count += 4u; 00396 00397 /* Update the inputA and inputB pointers for next MAC calculation */ 00398 px = pIn1 + count; 00399 py = pSrc2; 00400 00401 /* Decrement the loop counter */ 00402 blkCnt--; 00403 } 00404 00405 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00406 ** No loop unrolling is used. */ 00407 blkCnt = blockSize2 % 0x4u; 00408 00409 while(blkCnt > 0u) 00410 { 00411 /* Accumulator is made zero for every iteration */ 00412 sum = 0; 00413 00414 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00415 k = srcBLen >> 2u; 00416 00417 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00418 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00419 while(k > 0u) 00420 { 00421 /* Perform the multiply-accumulates */ 00422 sum = (q31_t) ((((q63_t) sum << 32) + 00423 ((q63_t) * px++ * (*py--))) >> 32); 00424 sum = (q31_t) ((((q63_t) sum << 32) + 00425 ((q63_t) * px++ * (*py--))) >> 32); 00426 sum = (q31_t) ((((q63_t) sum << 32) + 00427 ((q63_t) * px++ * (*py--))) >> 32); 00428 sum = (q31_t) ((((q63_t) sum << 32) + 00429 ((q63_t) * px++ * (*py--))) >> 32); 00430 00431 /* Decrement the loop counter */ 00432 k--; 00433 } 00434 00435 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00436 ** No loop unrolling is used. */ 00437 k = srcBLen % 0x4u; 00438 00439 while(k > 0u) 00440 { 00441 /* Perform the multiply-accumulate */ 00442 sum = (q31_t) ((((q63_t) sum << 32) + 00443 ((q63_t) * px++ * (*py--))) >> 32); 00444 00445 /* Decrement the loop counter */ 00446 k--; 00447 } 00448 00449 /* Store the result in the accumulator in the destination buffer. */ 00450 *pOut++ = sum << 1; 00451 00452 /* Increment the MAC count */ 00453 count++; 00454 00455 /* Update the inputA and inputB pointers for next MAC calculation */ 00456 px = pIn1 + count; 00457 py = pSrc2; 00458 00459 /* Decrement the loop counter */ 00460 blkCnt--; 00461 } 00462 } 00463 else 00464 { 00465 /* If the srcBLen is not a multiple of 4, 00466 * the blockSize2 loop cannot be unrolled by 4 */ 00467 blkCnt = blockSize2; 00468 00469 while(blkCnt > 0u) 00470 { 00471 /* Accumulator is made zero for every iteration */ 00472 sum = 0; 00473 00474 /* srcBLen number of MACS should be performed */ 00475 k = srcBLen; 00476 00477 while(k > 0u) 00478 { 00479 /* Perform the multiply-accumulate */ 00480 sum = (q31_t) ((((q63_t) sum << 32) + 00481 ((q63_t) * px++ * (*py--))) >> 32); 00482 00483 /* Decrement the loop counter */ 00484 k--; 00485 } 00486 00487 /* Store the result in the accumulator in the destination buffer. */ 00488 *pOut++ = sum << 1; 00489 00490 /* Increment the MAC count */ 00491 count++; 00492 00493 /* Update the inputA and inputB pointers for next MAC calculation */ 00494 px = pIn1 + count; 00495 py = pSrc2; 00496 00497 /* Decrement the loop counter */ 00498 blkCnt--; 00499 } 00500 } 00501 00502 00503 /* -------------------------- 00504 * Initializations of stage3 00505 * -------------------------*/ 00506 00507 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00508 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00509 * .... 00510 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00511 * sum += x[srcALen-1] * y[srcBLen-1] 00512 */ 00513 00514 /* In this stage the MAC operations are decreased by 1 for every iteration. 00515 The blockSize3 variable holds the number of MAC operations performed */ 00516 00517 /* Working pointer of inputA */ 00518 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00519 px = pSrc1; 00520 00521 /* Working pointer of inputB */ 00522 pSrc2 = pIn2 + (srcBLen - 1u); 00523 py = pSrc2; 00524 00525 /* ------------------- 00526 * Stage3 process 00527 * ------------------*/ 00528 00529 while(blockSize3 > 0u) 00530 { 00531 /* Accumulator is made zero for every iteration */ 00532 sum = 0; 00533 00534 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00535 k = blockSize3 >> 2u; 00536 00537 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00538 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00539 while(k > 0u) 00540 { 00541 00542 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00543 sum = (q31_t) ((((q63_t) sum << 32) + 00544 ((q63_t) * px++ * (*py--))) >> 32); 00545 00546 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00547 sum = (q31_t) ((((q63_t) sum << 32) + 00548 ((q63_t) * px++ * (*py--))) >> 32); 00549 00550 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00551 sum = (q31_t) ((((q63_t) sum << 32) + 00552 ((q63_t) * px++ * (*py--))) >> 32); 00553 00554 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00555 sum = (q31_t) ((((q63_t) sum << 32) + 00556 ((q63_t) * px++ * (*py--))) >> 32); 00557 00558 /* Decrement the loop counter */ 00559 k--; 00560 } 00561 00562 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00563 ** No loop unrolling is used. */ 00564 k = blockSize3 % 0x4u; 00565 00566 while(k > 0u) 00567 { 00568 /* Perform the multiply-accumulate */ 00569 sum = (q31_t) ((((q63_t) sum << 32) + 00570 ((q63_t) * px++ * (*py--))) >> 32); 00571 00572 /* Decrement the loop counter */ 00573 k--; 00574 } 00575 00576 /* Store the result in the accumulator in the destination buffer. */ 00577 *pOut++ = sum << 1; 00578 00579 /* Update the inputA and inputB pointers for next MAC calculation */ 00580 px = ++pSrc1; 00581 py = pSrc2; 00582 00583 /* Decrement the loop counter */ 00584 blockSize3--; 00585 } 00586 00587 } 00588