00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_conv_q31.c 00009 * 00010 * Description: Convolution of Q31 sequences. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 00025 #include "arm_math.h" 00026 00062 void arm_conv_q31( 00063 q31_t * pSrcA, 00064 uint32_t srcALen, 00065 q31_t * pSrcB, 00066 uint32_t srcBLen, 00067 q31_t * pDst) 00068 { 00069 q31_t *pIn1; /* inputA pointer */ 00070 q31_t *pIn2; /* inputB pointer */ 00071 q31_t *pOut = pDst; /* output pointer */ 00072 q31_t *px; /* Intermediate inputA pointer */ 00073 q31_t *py; /* Intermediate inputB pointer */ 00074 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00075 q63_t sum; /* Accumulator */ 00076 q63_t acc0, acc1, acc2; /* Accumulator */ 00077 q31_t x0, x1, x2, c0; /* Temporary variables to hold input1 and input2 values */ 00078 uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */ 00079 q31_t c1; /* Temporary variable to hold input */ 00080 00081 /* The algorithm implementation is based on the lengths of the inputs. */ 00082 /* srcB is always made to slide across srcA. */ 00083 /* So srcBLen is always considered as shorter or equal to srcALen */ 00084 if(srcALen >= srcBLen) 00085 { 00086 /* Initialization of inputA pointer */ 00087 pIn1 = pSrcA; 00088 00089 /* Initialization of inputB pointer */ 00090 pIn2 = pSrcB; 00091 } 00092 else 00093 { 00094 /* Initialization of inputA pointer */ 00095 pIn1 = pSrcB; 00096 00097 /* Initialization of inputB pointer */ 00098 pIn2 = pSrcA; 00099 00100 /* srcBLen is always considered as shorter or equal to srcALen */ 00101 j = srcBLen; 00102 srcBLen = srcALen; 00103 srcALen = j; 00104 } 00105 00106 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00107 /* The function is internally 00108 * divided into three stages according to the number of multiplications that has to be 00109 * taken place between inputA samples and inputB samples. In the first stage of the 00110 * algorithm, the multiplications increase by one for every iteration. 00111 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00112 * In the third stage of the algorithm, the multiplications decrease by one 00113 * for every iteration. */ 00114 00115 /* The algorithm is implemented in three stages. 00116 The loop counters of each stage is initiated here. */ 00117 blockSize1 = srcBLen - 1u; 00118 blockSize2 = srcALen - (srcBLen - 1u); 00119 blockSize3 = blockSize1; 00120 00121 /* -------------------------- 00122 * Initializations of stage1 00123 * -------------------------*/ 00124 00125 /* sum = x[0] * y[0] 00126 * sum = x[0] * y[1] + x[1] * y[0] 00127 * .... 00128 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00129 */ 00130 00131 /* In this stage the MAC operations are increased by 1 for every iteration. 00132 The count variable holds the number of MAC operations performed */ 00133 count = 1u; 00134 00135 /* Working pointer of inputA */ 00136 px = pIn1; 00137 00138 /* Working pointer of inputB */ 00139 py = pIn2; 00140 00141 00142 /* ------------------------ 00143 * Stage1 process 00144 * ----------------------*/ 00145 00146 /* The first stage starts here */ 00147 while(blockSize1 > 0u) 00148 { 00149 /* Accumulator is made zero for every iteration */ 00150 sum = 0; 00151 00152 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00153 k = count >> 2u; 00154 00155 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00156 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00157 while(k > 0u) 00158 { 00159 00160 /* Read x[0] */ 00161 x0 = *px++; 00162 /* Read y[srcBLen - 1] */ 00163 c0 = *py--; 00164 00165 /* Read x[1] */ 00166 x1 = *px++; 00167 /* Read y[srcBLen - 2] */ 00168 c1 = *py--; 00169 00170 /* x[0] * y[srcBLen - 1] */ 00171 sum += (q63_t) x0 * c0; 00172 00173 /* x[1] * y[srcBLen - 2] */ 00174 sum += (q63_t) x1 * c1; 00175 00176 /* Read x[2] */ 00177 x0 = *px++; 00178 /* Read y[srcBLen - 3] */ 00179 c0 = *py--; 00180 00181 /* Read x[3] */ 00182 x1 = *px++; 00183 /* Read y[srcBLen - 4] */ 00184 c1 = *py--; 00185 00186 /* x[2] * y[srcBLen - 3] */ 00187 sum += (q63_t) x0 * c0; 00188 00189 /* x[3] * y[srcBLen - 4] */ 00190 sum += (q63_t) x1 * c1; 00191 00192 /* Decrement the loop counter */ 00193 k--; 00194 } 00195 00196 /* If the count is not a multiple of 4, compute any remaining MACs here. 00197 ** No loop unrolling is used. */ 00198 k = count % 0x4u; 00199 00200 while(k > 0u) 00201 { 00202 00203 x0 = *px++; 00204 c0 = *py--; 00205 00206 /* Perform the multiply-accumulate */ 00207 sum += (q63_t) x0 * c0; 00208 00209 /* Decrement the loop counter */ 00210 k--; 00211 } 00212 00213 /* Store the result in the accumulator in the destination buffer. */ 00214 *pOut++ = (q31_t) (sum >> 31); 00215 00216 /* Update the inputA and inputB pointers for next MAC calculation */ 00217 py = pIn2 + count; 00218 px = pIn1; 00219 00220 /* Increment the MAC count */ 00221 count++; 00222 00223 /* Decrement the loop counter */ 00224 blockSize1--; 00225 } 00226 00227 /* -------------------------- 00228 * Initializations of stage2 00229 * ------------------------*/ 00230 00231 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00232 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00233 * .... 00234 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00235 */ 00236 00237 /* Working pointer of inputA */ 00238 px = pIn1; 00239 00240 /* Working pointer of inputB */ 00241 pSrc2 = pIn2 + (srcBLen - 1u); 00242 py = pSrc2; 00243 00244 /* count is index by which the pointer pIn1 to be incremented */ 00245 count = 0u; 00246 00247 /* ------------------- 00248 * Stage2 process 00249 * ------------------*/ 00250 00251 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00252 * So, to loop unroll over blockSize2, 00253 * srcBLen should be greater than or equal to 4 */ 00254 if(srcBLen >= 4u) 00255 { 00256 /* Loop unroll by 3 */ 00257 blkCnt = blockSize2 / 3; 00258 00259 while(blkCnt > 0u) 00260 { 00261 /* Set all accumulators to zero */ 00262 acc0 = 0; 00263 acc1 = 0; 00264 acc2 = 0; 00265 00266 /* read x[0], x[1], x[2] samples */ 00267 x0 = *(px++); 00268 x1 = *(px++); 00269 00270 /* Apply loop unrolling and compute 3 MACs simultaneously. */ 00271 k = srcBLen / 3; 00272 00273 /* First part of the processing with loop unrolling. Compute 3 MACs at a time. 00274 ** a second loop below computes MACs for the remaining 1 to 2 samples. */ 00275 do 00276 { 00277 /* Read y[srcBLen - 1] sample */ 00278 c0 = *(py); 00279 00280 /* Read x[3] sample */ 00281 x2 = *(px); 00282 00283 /* Perform the multiply-accumulates */ 00284 /* acc0 += x[0] * y[srcBLen - 1] */ 00285 acc0 += ((q63_t) x0 * c0); 00286 /* acc1 += x[1] * y[srcBLen - 1] */ 00287 acc1 += ((q63_t) x1 * c0); 00288 /* acc2 += x[2] * y[srcBLen - 1] */ 00289 acc2 += ((q63_t) x2 * c0); 00290 00291 /* Read y[srcBLen - 2] sample */ 00292 c0 = *(py - 1u); 00293 00294 /* Read x[4] sample */ 00295 x0 = *(px + 1u); 00296 00297 /* Perform the multiply-accumulate */ 00298 /* acc0 += x[1] * y[srcBLen - 2] */ 00299 acc0 += ((q63_t) x1 * c0); 00300 /* acc1 += x[2] * y[srcBLen - 2] */ 00301 acc1 += ((q63_t) x2 * c0); 00302 /* acc2 += x[3] * y[srcBLen - 2] */ 00303 acc2 += ((q63_t) x0 * c0); 00304 00305 /* Read y[srcBLen - 3] sample */ 00306 c0 = *(py - 2u); 00307 00308 /* Read x[5] sample */ 00309 x1 = *(px + 2u); 00310 00311 /* Perform the multiply-accumulates */ 00312 /* acc0 += x[2] * y[srcBLen - 3] */ 00313 acc0 += ((q63_t) x2 * c0); 00314 /* acc1 += x[3] * y[srcBLen - 2] */ 00315 acc1 += ((q63_t) x0 * c0); 00316 /* acc2 += x[4] * y[srcBLen - 2] */ 00317 acc2 += ((q63_t) x1 * c0); 00318 00319 /* update scratch pointers */ 00320 px += 3u; 00321 py -= 3u; 00322 00323 } while(--k); 00324 00325 /* If the srcBLen is not a multiple of 3, compute any remaining MACs here. 00326 ** No loop unrolling is used. */ 00327 k = srcBLen - ( 3 * (srcBLen/3)); 00328 00329 while(k > 0u) 00330 { 00331 /* Read y[srcBLen - 5] sample */ 00332 c0 = *(py--); 00333 00334 /* Read x[7] sample */ 00335 x2 = *(px++); 00336 00337 /* Perform the multiply-accumulates */ 00338 /* acc0 += x[4] * y[srcBLen - 5] */ 00339 acc0 += ((q63_t) x0 * c0); 00340 /* acc1 += x[5] * y[srcBLen - 5] */ 00341 acc1 += ((q63_t) x1 * c0); 00342 /* acc2 += x[6] * y[srcBLen - 5] */ 00343 acc2 += ((q63_t) x2 * c0); 00344 00345 /* Reuse the present samples for the next MAC */ 00346 x0 = x1; 00347 x1 = x2; 00348 00349 /* Decrement the loop counter */ 00350 k--; 00351 } 00352 00353 /* Store the results in the accumulators in the destination buffer. */ 00354 *pOut++ = (q31_t) (acc0 >> 31); 00355 *pOut++ = (q31_t) (acc1 >> 31); 00356 *pOut++ = (q31_t) (acc2 >> 31); 00357 00358 /* Increment the pointer pIn1 index, count by 1 */ 00359 count += 3u; 00360 00361 /* Update the inputA and inputB pointers for next MAC calculation */ 00362 px = pIn1 + count; 00363 py = pSrc2; 00364 00365 /* Decrement the loop counter */ 00366 blkCnt--; 00367 } 00368 00369 /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here. 00370 ** No loop unrolling is used. */ 00371 blkCnt = blockSize2 - 3 * (blockSize2/3); 00372 00373 while(blkCnt > 0u) 00374 { 00375 /* Accumulator is made zero for every iteration */ 00376 sum = 0; 00377 00378 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00379 k = srcBLen >> 2u; 00380 00381 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00382 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00383 while(k > 0u) 00384 { 00385 /* Perform the multiply-accumulates */ 00386 sum += (q63_t) * px++ * (*py--); 00387 sum += (q63_t) * px++ * (*py--); 00388 sum += (q63_t) * px++ * (*py--); 00389 sum += (q63_t) * px++ * (*py--); 00390 00391 /* Decrement the loop counter */ 00392 k--; 00393 } 00394 00395 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00396 ** No loop unrolling is used. */ 00397 k = srcBLen % 0x4u; 00398 00399 while(k > 0u) 00400 { 00401 /* Perform the multiply-accumulate */ 00402 sum += (q63_t) * px++ * (*py--); 00403 00404 /* Decrement the loop counter */ 00405 k--; 00406 } 00407 00408 /* Store the result in the accumulator in the destination buffer. */ 00409 *pOut++ = (q31_t) (sum >> 31); 00410 00411 /* Increment the MAC count */ 00412 count++; 00413 00414 /* Update the inputA and inputB pointers for next MAC calculation */ 00415 px = pIn1 + count; 00416 py = pSrc2; 00417 00418 /* Decrement the loop counter */ 00419 blkCnt--; 00420 } 00421 } 00422 else 00423 { 00424 /* If the srcBLen is not a multiple of 4, 00425 * the blockSize2 loop cannot be unrolled by 4 */ 00426 blkCnt = blockSize2; 00427 00428 while(blkCnt > 0u) 00429 { 00430 /* Accumulator is made zero for every iteration */ 00431 sum = 0; 00432 00433 /* srcBLen number of MACS should be performed */ 00434 k = srcBLen; 00435 00436 while(k > 0u) 00437 { 00438 /* Perform the multiply-accumulate */ 00439 sum += (q63_t) * px++ * (*py--); 00440 00441 /* Decrement the loop counter */ 00442 k--; 00443 } 00444 00445 /* Store the result in the accumulator in the destination buffer. */ 00446 *pOut++ = (q31_t) (sum >> 31); 00447 00448 /* Increment the MAC count */ 00449 count++; 00450 00451 /* Update the inputA and inputB pointers for next MAC calculation */ 00452 px = pIn1 + count; 00453 py = pSrc2; 00454 00455 /* Decrement the loop counter */ 00456 blkCnt--; 00457 } 00458 } 00459 00460 00461 /* -------------------------- 00462 * Initializations of stage3 00463 * -------------------------*/ 00464 00465 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00466 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00467 * .... 00468 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00469 * sum += x[srcALen-1] * y[srcBLen-1] 00470 */ 00471 00472 /* In this stage the MAC operations are decreased by 1 for every iteration. 00473 The blockSize3 variable holds the number of MAC operations performed */ 00474 00475 /* Working pointer of inputA */ 00476 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00477 px = pSrc1; 00478 00479 /* Working pointer of inputB */ 00480 pSrc2 = pIn2 + (srcBLen - 1u); 00481 py = pSrc2; 00482 00483 /* ------------------- 00484 * Stage3 process 00485 * ------------------*/ 00486 00487 while(blockSize3 > 0u) 00488 { 00489 /* Accumulator is made zero for every iteration */ 00490 sum = 0; 00491 00492 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00493 k = blockSize3 >> 2u; 00494 00495 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00496 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00497 while(k > 0u) 00498 { 00499 00500 /* Read x[srcALen - srcBLen + 1] */ 00501 x0 = *px++; 00502 /* Read y[srcBLen - 1] */ 00503 c0 = *py--; 00504 00505 /* Read x[srcALen - srcBLen + 2] */ 00506 x1 = *px++; 00507 /* Read y[srcBLen - 2] */ 00508 c1 = *py--; 00509 00510 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00511 sum += (q63_t) x0 * c0; 00512 00513 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00514 sum += (q63_t) x1 * c1; 00515 00516 /* Read x[srcALen - srcBLen + 3] */ 00517 x0 = *px++; 00518 /* Read y[srcBLen - 3] */ 00519 c0 = *py--; 00520 00521 /* Read x[srcALen - srcBLen + 4] */ 00522 x1 = *px++; 00523 /* Read y[srcBLen - 4] */ 00524 c1 = *py--; 00525 00526 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00527 sum += (q63_t) x0 * c0; 00528 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00529 sum += (q63_t) x1 * c1; 00530 00531 /* Decrement the loop counter */ 00532 k--; 00533 } 00534 00535 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here. 00536 ** No loop unrolling is used. */ 00537 k = blockSize3 % 0x4u; 00538 00539 while(k > 0u) 00540 { 00541 00542 x0 = *px++; 00543 c0 = *py--; 00544 00545 /* Perform the multiply-accumulate */ 00546 sum += (q63_t) x0 * c0; 00547 00548 /* Decrement the loop counter */ 00549 k--; 00550 } 00551 00552 /* Store the result in the accumulator in the destination buffer. */ 00553 *pOut++ = (q31_t) (sum >> 31); 00554 00555 /* Update the inputA and inputB pointers for next MAC calculation */ 00556 px = ++pSrc1; 00557 py = pSrc2; 00558 00559 /* Decrement the loop counter */ 00560 blockSize3--; 00561 } 00562 00563 } 00564