00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_conv_partial_q31.c 00009 * 00010 * Description: Partial convolution of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 00025 #include "arm_math.h" 00026 00050 arm_status arm_conv_partial_q31( 00051 q31_t * pSrcA, 00052 uint32_t srcALen, 00053 q31_t * pSrcB, 00054 uint32_t srcBLen, 00055 q31_t * pDst, 00056 uint32_t firstIndex, 00057 uint32_t numPoints) 00058 { 00059 q31_t *pIn1; /* inputA pointer */ 00060 q31_t *pIn2; /* inputB pointer */ 00061 q31_t *pOut = pDst; /* output pointer */ 00062 q31_t *px; /* Intermediate inputA pointer */ 00063 q31_t *py; /* Intermediate inputB pointer */ 00064 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00065 q63_t sum, acc0, acc1, acc2; /* Accumulator */ 00066 q31_t x0, x1, x2, c0; 00067 uint32_t j, k, count, check, blkCnt; 00068 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00069 arm_status status; /* status of Partial convolution */ 00070 q31_t c1; 00071 00072 /* Check for range of output samples to be calculated */ 00073 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00074 { 00075 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00076 status = ARM_MATH_ARGUMENT_ERROR; 00077 } 00078 else 00079 { 00080 00081 /* The algorithm implementation is based on the lengths of the inputs. */ 00082 /* srcB is always made to slide across srcA. */ 00083 /* So srcBLen is always considered as shorter or equal to srcALen */ 00084 if(srcALen >= srcBLen) 00085 { 00086 /* Initialization of inputA pointer */ 00087 pIn1 = pSrcA; 00088 00089 /* Initialization of inputB pointer */ 00090 pIn2 = pSrcB; 00091 } 00092 else 00093 { 00094 /* Initialization of inputA pointer */ 00095 pIn1 = pSrcB; 00096 00097 /* Initialization of inputB pointer */ 00098 pIn2 = pSrcA; 00099 00100 /* srcBLen is always considered as shorter or equal to srcALen */ 00101 j = srcBLen; 00102 srcBLen = srcALen; 00103 srcALen = j; 00104 } 00105 00106 /* Conditions to check which loopCounter holds 00107 * the first and last indices of the output samples to be calculated. */ 00108 check = firstIndex + numPoints; 00109 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00110 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00111 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00112 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00113 (int32_t) numPoints) : 0; 00114 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00115 (int32_t) firstIndex); 00116 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00117 00118 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00119 /* The function is internally 00120 * divided into three stages according to the number of multiplications that has to be 00121 * taken place between inputA samples and inputB samples. In the first stage of the 00122 * algorithm, the multiplications increase by one for every iteration. 00123 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00124 * In the third stage of the algorithm, the multiplications decrease by one 00125 * for every iteration. */ 00126 00127 /* Set the output pointer to point to the firstIndex 00128 * of the output sample to be calculated. */ 00129 pOut = pDst + firstIndex; 00130 00131 /* -------------------------- 00132 * Initializations of stage1 00133 * -------------------------*/ 00134 00135 /* sum = x[0] * y[0] 00136 * sum = x[0] * y[1] + x[1] * y[0] 00137 * .... 00138 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00139 */ 00140 00141 /* In this stage the MAC operations are increased by 1 for every iteration. 00142 The count variable holds the number of MAC operations performed. 00143 Since the partial convolution starts from firstIndex 00144 Number of Macs to be performed is firstIndex + 1 */ 00145 count = 1u + firstIndex; 00146 00147 /* Working pointer of inputA */ 00148 px = pIn1; 00149 00150 /* Working pointer of inputB */ 00151 pSrc2 = pIn2 + firstIndex; 00152 py = pSrc2; 00153 00154 /* ------------------------ 00155 * Stage1 process 00156 * ----------------------*/ 00157 00158 /* The first loop starts here */ 00159 while(blockSize1 > 0) 00160 { 00161 /* Accumulator is made zero for every iteration */ 00162 sum = 0; 00163 00164 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00165 k = count >> 2u; 00166 00167 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00168 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00169 while(k > 0u) 00170 { 00171 x0 = *px++; 00172 c0 = *py--; 00173 00174 x1 = *px++; 00175 c1 = *py--; 00176 00177 /* x[0] * y[srcBLen - 1] */ 00178 sum += (q63_t) x0 * c0; 00179 00180 /* x[1] * y[srcBLen - 2] */ 00181 sum += (q63_t) x1 * c1; 00182 00183 x0 = *px++; 00184 c0 = *py--; 00185 00186 x1 = *px++; 00187 c1 = *py--; 00188 00189 /* x[2] * y[srcBLen - 3] */ 00190 sum += (q63_t) x0 * c0; 00191 00192 /* x[3] * y[srcBLen - 4] */ 00193 sum += (q63_t) x1 * c1; 00194 00195 /* Decrement the loop counter */ 00196 k--; 00197 } 00198 00199 /* If the count is not a multiple of 4, compute any remaining MACs here. 00200 ** No loop unrolling is used. */ 00201 k = count % 0x4u; 00202 00203 while(k > 0u) 00204 { 00205 /* Perform the multiply-accumulate */ 00206 sum += (q63_t) * px++ * (*py--); 00207 00208 /* Decrement the loop counter */ 00209 k--; 00210 } 00211 00212 /* Store the result in the accumulator in the destination buffer. */ 00213 *pOut++ = (q31_t) (sum >> 31); 00214 00215 /* Update the inputA and inputB pointers for next MAC calculation */ 00216 py = ++pSrc2; 00217 px = pIn1; 00218 00219 /* Increment the MAC count */ 00220 count++; 00221 00222 /* Decrement the loop counter */ 00223 blockSize1--; 00224 } 00225 00226 /* -------------------------- 00227 * Initializations of stage2 00228 * ------------------------*/ 00229 00230 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00231 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00232 * .... 00233 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00234 */ 00235 00236 /* Working pointer of inputA */ 00237 px = pIn1; 00238 00239 /* Working pointer of inputB */ 00240 pSrc2 = pIn2 + (srcBLen - 1u); 00241 py = pSrc2; 00242 00243 /* count is index by which the pointer pIn1 to be incremented */ 00244 count = 0u; 00245 00246 /* ------------------- 00247 * Stage2 process 00248 * ------------------*/ 00249 00250 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00251 * So, to loop unroll over blockSize2, 00252 * srcBLen should be greater than or equal to 4 */ 00253 if(srcBLen >= 4u) 00254 { 00255 /* Loop unroll over blockSize2 */ 00256 //blkCnt = ((uint32_t) blockSize2 >> 2u); 00257 00258 blkCnt = blockSize2 / 3; 00259 00260 while(blkCnt > 0u) 00261 { 00262 /* Set all accumulators to zero */ 00263 acc0 = 0; 00264 acc1 = 0; 00265 acc2 = 0; 00266 //acc3 = 0; 00267 00268 /* read x[0], x[1], x[2] samples */ 00269 x0 = *(px++); 00270 x1 = *(px++); 00271 //x2 = *(px++); 00272 00273 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00274 //k = srcBLen >> 2u; 00275 00276 k = srcBLen / 3; 00277 00278 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00279 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00280 do 00281 { 00282 /* Read y[srcBLen - 1] sample */ 00283 c0 = *(py); 00284 00285 /* Read x[3] sample */ 00286 x2 = *(px); 00287 00288 /* Perform the multiply-accumulates */ 00289 /* acc0 += x[0] * y[srcBLen - 1] */ 00290 acc0 += (q63_t) x0 *c0; 00291 /* acc1 += x[1] * y[srcBLen - 1] */ 00292 acc1 += (q63_t) x1 *c0; 00293 /* acc2 += x[2] * y[srcBLen - 1] */ 00294 acc2 += (q63_t) x2 *c0; 00295 /* acc3 += x[3] * y[srcBLen - 1] */ 00296 //acc3 += (q63_t) x3 *c0; 00297 00298 /* Read y[srcBLen - 2] sample */ 00299 c0 = *(py - 1u); 00300 00301 /* Read x[4] sample */ 00302 x0 = *(px + 1u); 00303 00304 /* Perform the multiply-accumulate */ 00305 /* acc0 += x[1] * y[srcBLen - 2] */ 00306 acc0 += (q63_t) x1 *c0; 00307 /* acc1 += x[2] * y[srcBLen - 2] */ 00308 acc1 += (q63_t) x2 *c0; 00309 /* acc2 += x[3] * y[srcBLen - 2] */ 00310 acc2 += (q63_t) x0 *c0; 00311 /* acc3 += x[4] * y[srcBLen - 2] */ 00312 //acc3 += (q63_t) x0 *c0; 00313 00314 /* Read y[srcBLen - 3] sample */ 00315 c0 = *(py - 2u); 00316 00317 /* Read x[5] sample */ 00318 x1 = *(px + 2u); 00319 00320 /* Perform the multiply-accumulates */ 00321 /* acc0 += x[2] * y[srcBLen - 3] */ 00322 acc0 += (q63_t) x2 *c0; 00323 /* acc1 += x[3] * y[srcBLen - 2] */ 00324 acc1 += (q63_t) x0 *c0; 00325 /* acc2 += x[4] * y[srcBLen - 2] */ 00326 acc2 += (q63_t) x1 *c0; 00327 /* acc3 += x[5] * y[srcBLen - 2] */ 00328 //acc3 += (q63_t) x1 *c0; 00329 00330 px += 3u; 00331 00332 py -= 3u; 00333 00335 //c0 = *(py--); 00336 00338 //x2 = *(px++); 00339 00342 //acc0 += (q63_t) x3 *c0; 00344 //acc1 += (q63_t) x0 *c0; 00346 //acc2 += (q63_t) x1 *c0; 00348 //acc3 += (q63_t) x2 *c0; 00349 00350 } while(--k); 00351 00352 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00353 ** No loop unrolling is used. */ 00354 //k = srcBLen % 0x4u; 00355 k = srcBLen - ( 3 * (srcBLen/3)); 00356 00357 while(k > 0u) 00358 { 00359 /* Read y[srcBLen - 5] sample */ 00360 c0 = *(py--); 00361 00362 /* Read x[7] sample */ 00363 x2 = *(px++); 00364 00365 /* Perform the multiply-accumulates */ 00366 /* acc0 += x[4] * y[srcBLen - 5] */ 00367 acc0 += (q63_t) x0 *c0; 00368 /* acc1 += x[5] * y[srcBLen - 5] */ 00369 acc1 += (q63_t) x1 *c0; 00370 /* acc2 += x[6] * y[srcBLen - 5] */ 00371 acc2 += (q63_t) x2 *c0; 00372 /* acc3 += x[7] * y[srcBLen - 5] */ 00373 //acc3 += (q63_t) x3 *c0; 00374 00375 /* Reuse the present samples for the next MAC */ 00376 x0 = x1; 00377 x1 = x2; 00378 //x2 = x3; 00379 00380 /* Decrement the loop counter */ 00381 k--; 00382 } 00383 00384 /* Increment the pointer pIn1 index, count by 1 */ 00385 count += 3u; 00386 00387 /* Store the result in the accumulator in the destination buffer. */ 00388 *pOut++ = (q31_t) (acc0 >> 31); 00389 *pOut++ = (q31_t) (acc1 >> 31); 00390 *pOut++ = (q31_t) (acc2 >> 31); 00391 //*pOut++ = (q31_t) (acc3 >> 31); 00392 00393 /* Update the inputA and inputB pointers for next MAC calculation */ 00394 px = pIn1 + count; 00395 py = pSrc2; 00396 00397 /* Decrement the loop counter */ 00398 blkCnt--; 00399 } 00400 00401 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00402 ** No loop unrolling is used. */ 00403 //blkCnt = (uint32_t) blockSize2 % 0x4u; 00404 blkCnt = blockSize2 - 3 * (blockSize2/3); 00405 00406 00407 while(blkCnt > 0u) 00408 { 00409 /* Accumulator is made zero for every iteration */ 00410 sum = 0; 00411 00412 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00413 k = srcBLen >> 2u; 00414 00415 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00416 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00417 while(k > 0u) 00418 { 00419 /* Perform the multiply-accumulates */ 00420 sum += (q63_t) * px++ * (*py--); 00421 sum += (q63_t) * px++ * (*py--); 00422 sum += (q63_t) * px++ * (*py--); 00423 sum += (q63_t) * px++ * (*py--); 00424 00425 /* Decrement the loop counter */ 00426 k--; 00427 } 00428 00429 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00430 ** No loop unrolling is used. */ 00431 k = srcBLen % 0x4u; 00432 00433 while(k > 0u) 00434 { 00435 /* Perform the multiply-accumulate */ 00436 sum += (q63_t) * px++ * (*py--); 00437 00438 /* Decrement the loop counter */ 00439 k--; 00440 } 00441 00442 /* Store the result in the accumulator in the destination buffer. */ 00443 *pOut++ = (q31_t) (sum >> 31); 00444 00445 /* Increment the MAC count */ 00446 count++; 00447 00448 /* Update the inputA and inputB pointers for next MAC calculation */ 00449 px = pIn1 + count; 00450 py = pSrc2; 00451 00452 /* Decrement the loop counter */ 00453 blkCnt--; 00454 } 00455 } 00456 else 00457 { 00458 /* If the srcBLen is not a multiple of 4, 00459 * the blockSize2 loop cannot be unrolled by 4 */ 00460 blkCnt = (uint32_t) blockSize2; 00461 00462 while(blkCnt > 0u) 00463 { 00464 /* Accumulator is made zero for every iteration */ 00465 sum = 0; 00466 00467 /* srcBLen number of MACS should be performed */ 00468 k = srcBLen; 00469 00470 while(k > 0u) 00471 { 00472 /* Perform the multiply-accumulate */ 00473 sum += (q63_t) * px++ * (*py--); 00474 00475 /* Decrement the loop counter */ 00476 k--; 00477 } 00478 00479 /* Store the result in the accumulator in the destination buffer. */ 00480 *pOut++ = (q31_t) (sum >> 31); 00481 00482 /* Increment the MAC count */ 00483 count++; 00484 00485 /* Update the inputA and inputB pointers for next MAC calculation */ 00486 px = pIn1 + count; 00487 py = pSrc2; 00488 00489 /* Decrement the loop counter */ 00490 blkCnt--; 00491 } 00492 } 00493 00494 00495 /* -------------------------- 00496 * Initializations of stage3 00497 * -------------------------*/ 00498 00499 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00500 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00501 * .... 00502 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00503 * sum += x[srcALen-1] * y[srcBLen-1] 00504 */ 00505 00506 /* In this stage the MAC operations are decreased by 1 for every iteration. 00507 The blockSize3 variable holds the number of MAC operations performed */ 00508 count = srcBLen - 1u; 00509 00510 /* Working pointer of inputA */ 00511 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00512 px = pSrc1; 00513 00514 /* Working pointer of inputB */ 00515 pSrc2 = pIn2 + (srcBLen - 1u); 00516 py = pSrc2; 00517 00518 /* ------------------- 00519 * Stage3 process 00520 * ------------------*/ 00521 00522 while(blockSize3 > 0) 00523 { 00524 /* Accumulator is made zero for every iteration */ 00525 sum = 0; 00526 00527 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00528 k = count >> 2u; 00529 00530 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00531 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00532 while(k > 0u) 00533 { 00534 x0 = *px++; 00535 c0 = *py--; 00536 00537 x1 = *px++; 00538 c1 = *py--; 00539 00540 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00541 sum += (q63_t) x0 * c0; 00542 00543 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00544 sum += (q63_t) x1 * c1; 00545 00546 x0 = *px++; 00547 c0 = *py--; 00548 00549 x1 = *px++; 00550 c1 = *py--; 00551 00552 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00553 sum += (q63_t) x0 * c0; 00554 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00555 sum += (q63_t) x1 * c1; 00556 00557 /* Decrement the loop counter */ 00558 k--; 00559 } 00560 00561 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00562 ** No loop unrolling is used. */ 00563 k = count % 0x4u; 00564 00565 while(k > 0u) 00566 { 00567 /* Perform the multiply-accumulate */ 00568 sum += (q63_t) * px++ * (*py--); 00569 00570 /* Decrement the loop counter */ 00571 k--; 00572 } 00573 00574 /* Store the result in the accumulator in the destination buffer. */ 00575 *pOut++ = (q31_t) (sum >> 31); 00576 00577 /* Update the inputA and inputB pointers for next MAC calculation */ 00578 px = ++pSrc1; 00579 py = pSrc2; 00580 00581 /* Decrement the MAC count */ 00582 count--; 00583 00584 /* Decrement the loop counter */ 00585 blockSize3--; 00586 00587 } 00588 00589 /* set status as ARM_MATH_SUCCESS */ 00590 status = ARM_MATH_SUCCESS; 00591 } 00592 00593 /* Return to application */ 00594 return (status); 00595 00596 } 00597