00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_conv_partial_fast_q31.c 00009 * 00010 * Description: Fast Q31 Partial convolution. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 #include "arm_math.h" 00025 00050 arm_status arm_conv_partial_fast_q31( 00051 q31_t * pSrcA, 00052 uint32_t srcALen, 00053 q31_t * pSrcB, 00054 uint32_t srcBLen, 00055 q31_t * pDst, 00056 uint32_t firstIndex, 00057 uint32_t numPoints) 00058 { 00059 q31_t *pIn1; /* inputA pointer */ 00060 q31_t *pIn2; /* inputB pointer */ 00061 q31_t *pOut = pDst; /* output pointer */ 00062 q31_t *px; /* Intermediate inputA pointer */ 00063 q31_t *py; /* Intermediate inputB pointer */ 00064 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00065 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00066 q31_t x0, x1, x2, x3, c0; 00067 uint32_t j, k, count, check, blkCnt; 00068 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00069 arm_status status; /* status of Partial convolution */ 00070 00071 00072 /* Check for range of output samples to be calculated */ 00073 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00074 { 00075 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00076 status = ARM_MATH_ARGUMENT_ERROR; 00077 } 00078 else 00079 { 00080 00081 /* The algorithm implementation is based on the lengths of the inputs. */ 00082 /* srcB is always made to slide across srcA. */ 00083 /* So srcBLen is always considered as shorter or equal to srcALen */ 00084 if(srcALen >= srcBLen) 00085 { 00086 /* Initialization of inputA pointer */ 00087 pIn1 = pSrcA; 00088 00089 /* Initialization of inputB pointer */ 00090 pIn2 = pSrcB; 00091 } 00092 else 00093 { 00094 /* Initialization of inputA pointer */ 00095 pIn1 = pSrcB; 00096 00097 /* Initialization of inputB pointer */ 00098 pIn2 = pSrcA; 00099 00100 /* srcBLen is always considered as shorter or equal to srcALen */ 00101 j = srcBLen; 00102 srcBLen = srcALen; 00103 srcALen = j; 00104 } 00105 00106 /* Conditions to check which loopCounter holds 00107 * the first and last indices of the output samples to be calculated. */ 00108 check = firstIndex + numPoints; 00109 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00110 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00111 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00112 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00113 (int32_t) numPoints) : 0; 00114 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00115 (int32_t) firstIndex); 00116 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00117 00118 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00119 /* The function is internally 00120 * divided into three stages according to the number of multiplications that has to be 00121 * taken place between inputA samples and inputB samples. In the first stage of the 00122 * algorithm, the multiplications increase by one for every iteration. 00123 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00124 * In the third stage of the algorithm, the multiplications decrease by one 00125 * for every iteration. */ 00126 00127 /* Set the output pointer to point to the firstIndex 00128 * of the output sample to be calculated. */ 00129 pOut = pDst + firstIndex; 00130 00131 /* -------------------------- 00132 * Initializations of stage1 00133 * -------------------------*/ 00134 00135 /* sum = x[0] * y[0] 00136 * sum = x[0] * y[1] + x[1] * y[0] 00137 * .... 00138 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00139 */ 00140 00141 /* In this stage the MAC operations are increased by 1 for every iteration. 00142 The count variable holds the number of MAC operations performed. 00143 Since the partial convolution starts from firstIndex 00144 Number of Macs to be performed is firstIndex + 1 */ 00145 count = 1u + firstIndex; 00146 00147 /* Working pointer of inputA */ 00148 px = pIn1; 00149 00150 /* Working pointer of inputB */ 00151 pSrc2 = pIn2 + firstIndex; 00152 py = pSrc2; 00153 00154 /* ------------------------ 00155 * Stage1 process 00156 * ----------------------*/ 00157 00158 /* The first loop starts here */ 00159 while(blockSize1 > 0) 00160 { 00161 /* Accumulator is made zero for every iteration */ 00162 sum = 0; 00163 00164 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00165 k = count >> 2u; 00166 00167 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00168 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00169 while(k > 0u) 00170 { 00171 /* x[0] * y[srcBLen - 1] */ 00172 sum = (q31_t) ((((q63_t) sum << 32) + 00173 ((q63_t) * px++ * (*py--))) >> 32); 00174 00175 /* x[1] * y[srcBLen - 2] */ 00176 sum = (q31_t) ((((q63_t) sum << 32) + 00177 ((q63_t) * px++ * (*py--))) >> 32); 00178 00179 /* x[2] * y[srcBLen - 3] */ 00180 sum = (q31_t) ((((q63_t) sum << 32) + 00181 ((q63_t) * px++ * (*py--))) >> 32); 00182 00183 /* x[3] * y[srcBLen - 4] */ 00184 sum = (q31_t) ((((q63_t) sum << 32) + 00185 ((q63_t) * px++ * (*py--))) >> 32); 00186 00187 /* Decrement the loop counter */ 00188 k--; 00189 } 00190 00191 /* If the count is not a multiple of 4, compute any remaining MACs here. 00192 ** No loop unrolling is used. */ 00193 k = count % 0x4u; 00194 00195 while(k > 0u) 00196 { 00197 /* Perform the multiply-accumulates */ 00198 sum = (q31_t) ((((q63_t) sum << 32) + 00199 ((q63_t) * px++ * (*py--))) >> 32); 00200 00201 /* Decrement the loop counter */ 00202 k--; 00203 } 00204 00205 /* Store the result in the accumulator in the destination buffer. */ 00206 *pOut++ = sum << 1; 00207 00208 /* Update the inputA and inputB pointers for next MAC calculation */ 00209 py = ++pSrc2; 00210 px = pIn1; 00211 00212 /* Increment the MAC count */ 00213 count++; 00214 00215 /* Decrement the loop counter */ 00216 blockSize1--; 00217 } 00218 00219 /* -------------------------- 00220 * Initializations of stage2 00221 * ------------------------*/ 00222 00223 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00224 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00225 * .... 00226 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00227 */ 00228 00229 /* Working pointer of inputA */ 00230 px = pIn1; 00231 00232 /* Working pointer of inputB */ 00233 pSrc2 = pIn2 + (srcBLen - 1u); 00234 py = pSrc2; 00235 00236 /* count is index by which the pointer pIn1 to be incremented */ 00237 count = 0u; 00238 00239 /* ------------------- 00240 * Stage2 process 00241 * ------------------*/ 00242 00243 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00244 * So, to loop unroll over blockSize2, 00245 * srcBLen should be greater than or equal to 4 */ 00246 if(srcBLen >= 4u) 00247 { 00248 /* Loop unroll over blockSize2 */ 00249 blkCnt = ((uint32_t) blockSize2 >> 2u); 00250 00251 while(blkCnt > 0u) 00252 { 00253 /* Set all accumulators to zero */ 00254 acc0 = 0; 00255 acc1 = 0; 00256 acc2 = 0; 00257 acc3 = 0; 00258 00259 /* read x[0], x[1], x[2] samples */ 00260 x0 = *(px++); 00261 x1 = *(px++); 00262 x2 = *(px++); 00263 00264 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00265 k = srcBLen >> 2u; 00266 00267 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00268 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00269 do 00270 { 00271 /* Read y[srcBLen - 1] sample */ 00272 c0 = *(py--); 00273 00274 /* Read x[3] sample */ 00275 x3 = *(px++); 00276 00277 /* Perform the multiply-accumulate */ 00278 /* acc0 += x[0] * y[srcBLen - 1] */ 00279 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00280 00281 /* acc1 += x[1] * y[srcBLen - 1] */ 00282 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00283 00284 /* acc2 += x[2] * y[srcBLen - 1] */ 00285 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00286 00287 /* acc3 += x[3] * y[srcBLen - 1] */ 00288 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00289 00290 /* Read y[srcBLen - 2] sample */ 00291 c0 = *(py--); 00292 00293 /* Read x[4] sample */ 00294 x0 = *(px++); 00295 00296 /* Perform the multiply-accumulate */ 00297 /* acc0 += x[1] * y[srcBLen - 2] */ 00298 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00299 /* acc1 += x[2] * y[srcBLen - 2] */ 00300 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00301 /* acc2 += x[3] * y[srcBLen - 2] */ 00302 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00303 /* acc3 += x[4] * y[srcBLen - 2] */ 00304 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00305 00306 /* Read y[srcBLen - 3] sample */ 00307 c0 = *(py--); 00308 00309 /* Read x[5] sample */ 00310 x1 = *(px++); 00311 00312 /* Perform the multiply-accumulates */ 00313 /* acc0 += x[2] * y[srcBLen - 3] */ 00314 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00315 /* acc1 += x[3] * y[srcBLen - 2] */ 00316 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00317 /* acc2 += x[4] * y[srcBLen - 2] */ 00318 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00319 /* acc3 += x[5] * y[srcBLen - 2] */ 00320 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00321 00322 /* Read y[srcBLen - 4] sample */ 00323 c0 = *(py--); 00324 00325 /* Read x[6] sample */ 00326 x2 = *(px++); 00327 00328 /* Perform the multiply-accumulates */ 00329 /* acc0 += x[3] * y[srcBLen - 4] */ 00330 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00331 /* acc1 += x[4] * y[srcBLen - 4] */ 00332 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00333 /* acc2 += x[5] * y[srcBLen - 4] */ 00334 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00335 /* acc3 += x[6] * y[srcBLen - 4] */ 00336 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00337 00338 00339 } while(--k); 00340 00341 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00342 ** No loop unrolling is used. */ 00343 k = srcBLen % 0x4u; 00344 00345 while(k > 0u) 00346 { 00347 /* Read y[srcBLen - 5] sample */ 00348 c0 = *(py--); 00349 00350 /* Read x[7] sample */ 00351 x3 = *(px++); 00352 00353 /* Perform the multiply-accumulates */ 00354 /* acc0 += x[4] * y[srcBLen - 5] */ 00355 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00356 /* acc1 += x[5] * y[srcBLen - 5] */ 00357 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00358 /* acc2 += x[6] * y[srcBLen - 5] */ 00359 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00360 /* acc3 += x[7] * y[srcBLen - 5] */ 00361 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00362 00363 /* Reuse the present samples for the next MAC */ 00364 x0 = x1; 00365 x1 = x2; 00366 x2 = x3; 00367 00368 /* Decrement the loop counter */ 00369 k--; 00370 } 00371 00372 /* Store the result in the accumulator in the destination buffer. */ 00373 *pOut++ = (q31_t) (acc0 << 1); 00374 *pOut++ = (q31_t) (acc1 << 1); 00375 *pOut++ = (q31_t) (acc2 << 1); 00376 *pOut++ = (q31_t) (acc3 << 1); 00377 00378 /* Increment the pointer pIn1 index, count by 1 */ 00379 count += 4u; 00380 00381 /* Update the inputA and inputB pointers for next MAC calculation */ 00382 px = pIn1 + count; 00383 py = pSrc2; 00384 00385 /* Decrement the loop counter */ 00386 blkCnt--; 00387 } 00388 00389 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00390 ** No loop unrolling is used. */ 00391 blkCnt = (uint32_t) blockSize2 % 0x4u; 00392 00393 while(blkCnt > 0u) 00394 { 00395 /* Accumulator is made zero for every iteration */ 00396 sum = 0; 00397 00398 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00399 k = srcBLen >> 2u; 00400 00401 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00402 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00403 while(k > 0u) 00404 { 00405 /* Perform the multiply-accumulates */ 00406 sum = (q31_t) ((((q63_t) sum << 32) + 00407 ((q63_t) * px++ * (*py--))) >> 32); 00408 sum = (q31_t) ((((q63_t) sum << 32) + 00409 ((q63_t) * px++ * (*py--))) >> 32); 00410 sum = (q31_t) ((((q63_t) sum << 32) + 00411 ((q63_t) * px++ * (*py--))) >> 32); 00412 sum = (q31_t) ((((q63_t) sum << 32) + 00413 ((q63_t) * px++ * (*py--))) >> 32); 00414 00415 /* Decrement the loop counter */ 00416 k--; 00417 } 00418 00419 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00420 ** No loop unrolling is used. */ 00421 k = srcBLen % 0x4u; 00422 00423 while(k > 0u) 00424 { 00425 /* Perform the multiply-accumulate */ 00426 sum = (q31_t) ((((q63_t) sum << 32) + 00427 ((q63_t) * px++ * (*py--))) >> 32); 00428 00429 /* Decrement the loop counter */ 00430 k--; 00431 } 00432 00433 /* Store the result in the accumulator in the destination buffer. */ 00434 *pOut++ = sum << 1; 00435 00436 /* Increment the MAC count */ 00437 count++; 00438 00439 /* Update the inputA and inputB pointers for next MAC calculation */ 00440 px = pIn1 + count; 00441 py = pSrc2; 00442 00443 /* Decrement the loop counter */ 00444 blkCnt--; 00445 } 00446 } 00447 else 00448 { 00449 /* If the srcBLen is not a multiple of 4, 00450 * the blockSize2 loop cannot be unrolled by 4 */ 00451 blkCnt = (uint32_t) blockSize2; 00452 00453 while(blkCnt > 0u) 00454 { 00455 /* Accumulator is made zero for every iteration */ 00456 sum = 0; 00457 00458 /* srcBLen number of MACS should be performed */ 00459 k = srcBLen; 00460 00461 while(k > 0u) 00462 { 00463 /* Perform the multiply-accumulate */ 00464 sum = (q31_t) ((((q63_t) sum << 32) + 00465 ((q63_t) * px++ * (*py--))) >> 32); 00466 00467 /* Decrement the loop counter */ 00468 k--; 00469 } 00470 00471 /* Store the result in the accumulator in the destination buffer. */ 00472 *pOut++ = sum << 1; 00473 00474 /* Increment the MAC count */ 00475 count++; 00476 00477 /* Update the inputA and inputB pointers for next MAC calculation */ 00478 px = pIn1 + count; 00479 py = pSrc2; 00480 00481 /* Decrement the loop counter */ 00482 blkCnt--; 00483 } 00484 } 00485 00486 00487 /* -------------------------- 00488 * Initializations of stage3 00489 * -------------------------*/ 00490 00491 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00492 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00493 * .... 00494 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00495 * sum += x[srcALen-1] * y[srcBLen-1] 00496 */ 00497 00498 /* In this stage the MAC operations are decreased by 1 for every iteration. 00499 The count variable holds the number of MAC operations performed */ 00500 count = srcBLen - 1u; 00501 00502 /* Working pointer of inputA */ 00503 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00504 px = pSrc1; 00505 00506 /* Working pointer of inputB */ 00507 pSrc2 = pIn2 + (srcBLen - 1u); 00508 py = pSrc2; 00509 00510 /* ------------------- 00511 * Stage3 process 00512 * ------------------*/ 00513 00514 while(blockSize3 > 0) 00515 { 00516 /* Accumulator is made zero for every iteration */ 00517 sum = 0; 00518 00519 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00520 k = count >> 2u; 00521 00522 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00523 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00524 while(k > 0u) 00525 { 00526 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00527 sum = (q31_t) ((((q63_t) sum << 32) + 00528 ((q63_t) * px++ * (*py--))) >> 32); 00529 00530 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00531 sum = (q31_t) ((((q63_t) sum << 32) + 00532 ((q63_t) * px++ * (*py--))) >> 32); 00533 00534 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00535 sum = (q31_t) ((((q63_t) sum << 32) + 00536 ((q63_t) * px++ * (*py--))) >> 32); 00537 00538 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00539 sum = (q31_t) ((((q63_t) sum << 32) + 00540 ((q63_t) * px++ * (*py--))) >> 32); 00541 00542 /* Decrement the loop counter */ 00543 k--; 00544 } 00545 00546 /* If the count is not a multiple of 4, compute any remaining MACs here. 00547 ** No loop unrolling is used. */ 00548 k = count % 0x4u; 00549 00550 while(k > 0u) 00551 { 00552 /* Perform the multiply-accumulates */ 00553 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00554 sum = (q31_t) ((((q63_t) sum << 32) + 00555 ((q63_t) * px++ * (*py--))) >> 32); 00556 00557 /* Decrement the loop counter */ 00558 k--; 00559 } 00560 00561 /* Store the result in the accumulator in the destination buffer. */ 00562 *pOut++ = sum << 1; 00563 00564 /* Update the inputA and inputB pointers for next MAC calculation */ 00565 px = ++pSrc1; 00566 py = pSrc2; 00567 00568 /* Decrement the MAC count */ 00569 count--; 00570 00571 /* Decrement the loop counter */ 00572 blockSize3--; 00573 00574 } 00575 00576 /* set status as ARM_MATH_SUCCESS */ 00577 status = ARM_MATH_SUCCESS; 00578 } 00579 00580 /* Return to application */ 00581 return (status); 00582 00583 } 00584