00001 /* ---------------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_conv_partial_f32.c 00009 * 00010 * Description: Partial convolution of floating-point sequences. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------------- */ 00024 00025 #include "arm_math.h" 00026 00065 arm_status arm_conv_partial_f32( 00066 float32_t * pSrcA, 00067 uint32_t srcALen, 00068 float32_t * pSrcB, 00069 uint32_t srcBLen, 00070 float32_t * pDst, 00071 uint32_t firstIndex, 00072 uint32_t numPoints) 00073 { 00074 float32_t *pIn1 = pSrcA; /* inputA pointer */ 00075 float32_t *pIn2 = pSrcB; /* inputB pointer */ 00076 float32_t *pOut = pDst; /* output pointer */ 00077 float32_t *px; /* Intermediate inputA pointer */ 00078 float32_t *py; /* Intermediate inputB pointer */ 00079 float32_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00080 float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00081 float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */ 00082 uint32_t j, k, count = 0u, blkCnt, check; 00083 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00084 arm_status status; /* status of Partial convolution */ 00085 00086 00087 /* Check for range of output samples to be calculated */ 00088 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00089 { 00090 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00091 status = ARM_MATH_ARGUMENT_ERROR; 00092 } 00093 else 00094 { 00095 00096 /* The algorithm implementation is based on the lengths of the inputs. */ 00097 /* srcB is always made to slide across srcA. */ 00098 /* So srcBLen is always considered as shorter or equal to srcALen */ 00099 if(srcALen >= srcBLen) 00100 { 00101 /* Initialization of inputA pointer */ 00102 pIn1 = pSrcA; 00103 00104 /* Initialization of inputB pointer */ 00105 pIn2 = pSrcB; 00106 } 00107 else 00108 { 00109 /* Initialization of inputA pointer */ 00110 pIn1 = pSrcB; 00111 00112 /* Initialization of inputB pointer */ 00113 pIn2 = pSrcA; 00114 00115 /* srcBLen is always considered as shorter or equal to srcALen */ 00116 j = srcBLen; 00117 srcBLen = srcALen; 00118 srcALen = j; 00119 } 00120 00121 /* Conditions to check which loopCounter holds 00122 * the first and last indices of the output samples to be calculated. */ 00123 check = firstIndex + numPoints; 00124 blockSize3 = (int32_t) check - (int32_t) srcALen; 00125 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00126 blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex; 00127 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00128 (int32_t) numPoints) : 0; 00129 blockSize2 = ((int32_t) check - blockSize3) - 00130 (blockSize1 + (int32_t) firstIndex); 00131 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00132 00133 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00134 /* The function is internally 00135 * divided into three stages according to the number of multiplications that has to be 00136 * taken place between inputA samples and inputB samples. In the first stage of the 00137 * algorithm, the multiplications increase by one for every iteration. 00138 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00139 * In the third stage of the algorithm, the multiplications decrease by one 00140 * for every iteration. */ 00141 00142 /* Set the output pointer to point to the firstIndex 00143 * of the output sample to be calculated. */ 00144 pOut = pDst + firstIndex; 00145 00146 /* -------------------------- 00147 * Initializations of stage1 00148 * -------------------------*/ 00149 00150 /* sum = x[0] * y[0] 00151 * sum = x[0] * y[1] + x[1] * y[0] 00152 * .... 00153 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00154 */ 00155 00156 /* In this stage the MAC operations are increased by 1 for every iteration. 00157 The count variable holds the number of MAC operations performed. 00158 Since the partial convolution starts from from firstIndex 00159 Number of Macs to be performed is firstIndex + 1 */ 00160 count = 1u + firstIndex; 00161 00162 /* Working pointer of inputA */ 00163 px = pIn1; 00164 00165 /* Working pointer of inputB */ 00166 pSrc1 = pIn2 + firstIndex; 00167 py = pSrc1; 00168 00169 /* ------------------------ 00170 * Stage1 process 00171 * ----------------------*/ 00172 00173 /* The first stage starts here */ 00174 while(blockSize1 > 0) 00175 { 00176 /* Accumulator is made zero for every iteration */ 00177 sum = 0.0f; 00178 00179 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00180 k = count >> 2u; 00181 00182 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00183 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00184 while(k > 0u) 00185 { 00186 00187 /* Read x[0] */ 00188 x0 = *px++; 00189 00190 /* y[srcBLen - 1] */ 00191 c0 = *py--; 00192 00193 /* x[0] * y[srcBLen - 1] */ 00194 sum += x0 * c0; 00195 00196 /* x[1] * y[srcBLen - 2] */ 00197 sum += *px++ * *py--; 00198 00199 /* x[2] * y[srcBLen - 3] */ 00200 sum += *px++ * *py--; 00201 00202 /* x[3] * y[srcBLen - 4] */ 00203 sum += *px++ * *py--; 00204 00205 /* Decrement the loop counter */ 00206 k--; 00207 } 00208 00209 /* If the count is not a multiple of 4, compute any remaining MACs here. 00210 ** No loop unrolling is used. */ 00211 k = count % 0x4u; 00212 00213 while(k > 0u) 00214 { 00215 00216 x0 = *px++; 00217 c0 = *py--; 00218 00219 /* Perform the multiply-accumulate */ 00220 sum += x0 * c0; 00221 00222 /* Decrement the loop counter */ 00223 k--; 00224 } 00225 00226 /* Store the result in the accumulator in the destination buffer. */ 00227 *pOut++ = sum; 00228 00229 /* Update the inputA and inputB pointers for next MAC calculation */ 00230 py = ++pSrc1; 00231 px = pIn1; 00232 00233 /* Increment the MAC count */ 00234 count++; 00235 00236 /* Decrement the loop counter */ 00237 blockSize1--; 00238 } 00239 00240 /* -------------------------- 00241 * Initializations of stage2 00242 * ------------------------*/ 00243 00244 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00245 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00246 * .... 00247 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00248 */ 00249 00250 /* Working pointer of inputA */ 00251 px = pIn1; 00252 00253 /* Working pointer of inputB */ 00254 pSrc2 = pIn2 + (srcBLen - 1u); 00255 py = pSrc2; 00256 00257 /* count is index by which the pointer pIn1 to be incremented */ 00258 count = 0u; 00259 00260 /* ------------------- 00261 * Stage2 process 00262 * ------------------*/ 00263 00264 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00265 * So, to loop unroll over blockSize2, 00266 * srcBLen should be greater than or equal to 4 */ 00267 if(srcBLen >= 4u) 00268 { 00269 /* Loop unroll over blockSize2, by 4 */ 00270 blkCnt = ((uint32_t) blockSize2 >> 2u); 00271 00272 while(blkCnt > 0u) 00273 { 00274 /* Set all accumulators to zero */ 00275 acc0 = 0.0f; 00276 acc1 = 0.0f; 00277 acc2 = 0.0f; 00278 acc3 = 0.0f; 00279 00280 /* read x[0], x[1], x[2] samples */ 00281 x0 = *(px++); 00282 x1 = *(px++); 00283 x2 = *(px++); 00284 00285 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00286 k = srcBLen >> 2u; 00287 00288 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00289 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00290 do 00291 { 00292 /* Read y[srcBLen - 1] sample */ 00293 c0 = *(py--); 00294 00295 /* Read x[3] sample */ 00296 x3 = *(px); 00297 00298 /* Perform the multiply-accumulate */ 00299 /* acc0 += x[0] * y[srcBLen - 1] */ 00300 acc0 += x0 * c0; 00301 00302 /* acc1 += x[1] * y[srcBLen - 1] */ 00303 acc1 += x1 * c0; 00304 00305 /* acc2 += x[2] * y[srcBLen - 1] */ 00306 acc2 += x2 * c0; 00307 00308 /* acc3 += x[3] * y[srcBLen - 1] */ 00309 acc3 += x3 * c0; 00310 00311 /* Read y[srcBLen - 2] sample */ 00312 c0 = *(py--); 00313 00314 /* Read x[4] sample */ 00315 x0 = *(px + 1u); 00316 00317 /* Perform the multiply-accumulate */ 00318 /* acc0 += x[1] * y[srcBLen - 2] */ 00319 acc0 += x1 * c0; 00320 /* acc1 += x[2] * y[srcBLen - 2] */ 00321 acc1 += x2 * c0; 00322 /* acc2 += x[3] * y[srcBLen - 2] */ 00323 acc2 += x3 * c0; 00324 /* acc3 += x[4] * y[srcBLen - 2] */ 00325 acc3 += x0 * c0; 00326 00327 /* Read y[srcBLen - 3] sample */ 00328 c0 = *(py--); 00329 00330 /* Read x[5] sample */ 00331 x1 = *(px + 2u); 00332 00333 /* Perform the multiply-accumulates */ 00334 /* acc0 += x[2] * y[srcBLen - 3] */ 00335 acc0 += x2 * c0; 00336 /* acc1 += x[3] * y[srcBLen - 2] */ 00337 acc1 += x3 * c0; 00338 /* acc2 += x[4] * y[srcBLen - 2] */ 00339 acc2 += x0 * c0; 00340 /* acc3 += x[5] * y[srcBLen - 2] */ 00341 acc3 += x1 * c0; 00342 00343 /* Read y[srcBLen - 4] sample */ 00344 c0 = *(py--); 00345 00346 /* Read x[6] sample */ 00347 x2 = *(px + 3u); 00348 00349 /* Perform the multiply-accumulates */ 00350 /* acc0 += x[3] * y[srcBLen - 4] */ 00351 acc0 += x3 * c0; 00352 /* acc1 += x[4] * y[srcBLen - 4] */ 00353 acc1 += x0 * c0; 00354 /* acc2 += x[5] * y[srcBLen - 4] */ 00355 acc2 += x1 * c0; 00356 /* acc3 += x[6] * y[srcBLen - 4] */ 00357 acc3 += x2 * c0; 00358 00359 px += 4u; 00360 00361 00362 } while(--k); 00363 00364 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00365 ** No loop unrolling is used. */ 00366 k = srcBLen % 0x4u; 00367 00368 while(k > 0u) 00369 { 00370 /* Read y[srcBLen - 5] sample */ 00371 c0 = *(py--); 00372 00373 /* Read x[7] sample */ 00374 x3 = *(px++); 00375 00376 /* Perform the multiply-accumulates */ 00377 /* acc0 += x[4] * y[srcBLen - 5] */ 00378 acc0 += x0 * c0; 00379 /* acc1 += x[5] * y[srcBLen - 5] */ 00380 acc1 += x1 * c0; 00381 /* acc2 += x[6] * y[srcBLen - 5] */ 00382 acc2 += x2 * c0; 00383 /* acc3 += x[7] * y[srcBLen - 5] */ 00384 acc3 += x3 * c0; 00385 00386 /* Reuse the present samples for the next MAC */ 00387 x0 = x1; 00388 x1 = x2; 00389 x2 = x3; 00390 00391 /* Decrement the loop counter */ 00392 k--; 00393 } 00394 00395 /* Store the result in the accumulator in the destination buffer. */ 00396 *pOut++ = acc0; 00397 *pOut++ = acc1; 00398 *pOut++ = acc2; 00399 *pOut++ = acc3; 00400 00401 /* Increment the pointer pIn1 index, count by 1 */ 00402 count += 4u; 00403 00404 /* Update the inputA and inputB pointers for next MAC calculation */ 00405 px = pIn1 + count; 00406 py = pSrc2; 00407 00408 /* Decrement the loop counter */ 00409 blkCnt--; 00410 } 00411 00412 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00413 ** No loop unrolling is used. */ 00414 blkCnt = (uint32_t) blockSize2 % 0x4u; 00415 00416 while(blkCnt > 0u) 00417 { 00418 /* Accumulator is made zero for every iteration */ 00419 sum = 0.0f; 00420 00421 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00422 k = srcBLen >> 2u; 00423 00424 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00425 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00426 while(k > 0u) 00427 { 00428 /* Perform the multiply-accumulates */ 00429 sum += *px++ * *py--; 00430 sum += *px++ * *py--; 00431 sum += *px++ * *py--; 00432 sum += *px++ * *py--; 00433 00434 /* Decrement the loop counter */ 00435 k--; 00436 } 00437 00438 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00439 ** No loop unrolling is used. */ 00440 k = srcBLen % 0x4u; 00441 00442 while(k > 0u) 00443 { 00444 /* Perform the multiply-accumulate */ 00445 sum += *px++ * *py--; 00446 00447 /* Decrement the loop counter */ 00448 k--; 00449 } 00450 00451 /* Increment the MAC count */ 00452 count++; 00453 00454 /* Store the result in the accumulator in the destination buffer. */ 00455 *pOut++ = sum; 00456 00457 /* Update the inputA and inputB pointers for next MAC calculation */ 00458 px = pIn1 + count; 00459 py = pSrc2; 00460 00461 /* Decrement the loop counter */ 00462 blkCnt--; 00463 } 00464 } 00465 else 00466 { 00467 /* If the srcBLen is not a multiple of 4, 00468 * the blockSize2 loop cannot be unrolled by 4 */ 00469 blkCnt = (uint32_t) blockSize2; 00470 00471 while(blkCnt > 0u) 00472 { 00473 /* Accumulator is made zero for every iteration */ 00474 sum = 0.0f; 00475 00476 /* srcBLen number of MACS should be performed */ 00477 k = srcBLen; 00478 00479 while(k > 0u) 00480 { 00481 /* Perform the multiply-accumulate */ 00482 sum += *px++ * *py--; 00483 00484 /* Decrement the loop counter */ 00485 k--; 00486 } 00487 00488 /* Increment the MAC count */ 00489 count++; 00490 00491 /* Store the result in the accumulator in the destination buffer. */ 00492 *pOut++ = sum; 00493 00494 /* Update the inputA and inputB pointers for next MAC calculation */ 00495 px = pIn1 + count; 00496 py = pSrc2; 00497 00498 /* Decrement the loop counter */ 00499 blkCnt--; 00500 } 00501 } 00502 00503 00504 /* -------------------------- 00505 * Initializations of stage3 00506 * -------------------------*/ 00507 00508 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00509 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00510 * .... 00511 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00512 * sum += x[srcALen-1] * y[srcBLen-1] 00513 */ 00514 00515 /* In this stage the MAC operations are decreased by 1 for every iteration. 00516 The count variable holds the number of MAC operations performed */ 00517 count = srcBLen - 1u; 00518 00519 /* Working pointer of inputA */ 00520 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00521 px = pSrc1; 00522 00523 /* Working pointer of inputB */ 00524 pSrc2 = pIn2 + (srcBLen - 1u); 00525 py = pSrc2; 00526 00527 while(blockSize3 > 0) 00528 { 00529 /* Accumulator is made zero for every iteration */ 00530 sum = 0.0f; 00531 00532 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00533 k = count >> 2u; 00534 00535 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00536 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00537 while(k > 0u) 00538 { 00539 /* Read x[srcALen - srcBLen + 1] */ 00540 x0 = *px++; 00541 /* Read y[srcBLen - 1] */ 00542 c0 = *py--; 00543 00544 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00545 sum += x0 * c0; 00546 00547 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00548 sum += *px++ * *py--; 00549 00550 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00551 sum += *px++ * *py--; 00552 00553 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00554 sum += *px++ * *py--; 00555 00556 /* Decrement the loop counter */ 00557 k--; 00558 } 00559 00560 /* If the count is not a multiple of 4, compute any remaining MACs here. 00561 ** No loop unrolling is used. */ 00562 k = count % 0x4u; 00563 00564 while(k > 0u) 00565 { 00566 /* Perform the multiply-accumulates */ 00567 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00568 sum += *px++ * *py--; 00569 00570 /* Decrement the loop counter */ 00571 k--; 00572 } 00573 00574 /* Store the result in the accumulator in the destination buffer. */ 00575 *pOut++ = sum; 00576 00577 /* Update the inputA and inputB pointers for next MAC calculation */ 00578 px = ++pSrc1; 00579 py = pSrc2; 00580 00581 /* Decrement the MAC count */ 00582 count--; 00583 00584 /* Decrement the loop counter */ 00585 blockSize3--; 00586 00587 } 00588 00589 /* set status as ARM_MATH_SUCCESS */ 00590 status = ARM_MATH_SUCCESS; 00591 } 00592 00593 /* Return to application */ 00594 return (status); 00595 00596 } 00597