00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "arm_math.h"
00025
00059 #ifndef UNALIGNED_SUPPORT_DISABLE
00060
00061 arm_status arm_conv_partial_fast_q15(
00062 q15_t * pSrcA,
00063 uint32_t srcALen,
00064 q15_t * pSrcB,
00065 uint32_t srcBLen,
00066 q15_t * pDst,
00067 uint32_t firstIndex,
00068 uint32_t numPoints,
00069 q15_t * pScratch1,
00070 q15_t *pScratch2)
00071 {
00072
00073 q15_t *pOut = pDst;
00074 q15_t *pScr1 = pScratch1;
00075 q15_t *pScr2 = pScratch2;
00076 q31_t acc0, acc1, acc2, acc3;
00077 q31_t x1, x2, x3;
00078 q31_t y1, y2;
00079 q15_t *pIn1;
00080 q15_t *pIn2;
00081 q15_t *px;
00082 q15_t *py;
00083 uint32_t j, k, blkCnt;
00084 arm_status status;
00085
00086 uint32_t tapCnt;
00087
00088 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00089 {
00090
00091 status = ARM_MATH_ARGUMENT_ERROR;
00092 }
00093 else
00094 {
00095
00096
00097
00098
00099 if(srcALen >= srcBLen)
00100 {
00101
00102 pIn1 = pSrcA;
00103
00104
00105 pIn2 = pSrcB;
00106 }
00107 else
00108 {
00109
00110 pIn1 = pSrcB;
00111
00112
00113 pIn2 = pSrcA;
00114
00115
00116 j = srcBLen;
00117 srcBLen = srcALen;
00118 srcALen = j;
00119 }
00120
00121
00122 py = pScratch2;
00123
00124
00125 pScr2 = pScratch2 + srcBLen - 1;
00126
00127
00128 px = pIn2;
00129
00130
00131 k = srcBLen >> 2u;
00132
00133
00134
00135
00136
00137 while(k > 0u)
00138 {
00139
00140 *pScr2-- = *px++;
00141 *pScr2-- = *px++;
00142 *pScr2-- = *px++;
00143 *pScr2-- = *px++;
00144
00145
00146 k--;
00147 }
00148
00149
00150
00151 k = srcBLen % 0x4u;
00152
00153 while(k > 0u)
00154 {
00155
00156 *pScr2-- = *px++;
00157
00158
00159 k--;
00160 }
00161
00162
00163 pScr1 = pScratch1;
00164
00165
00166
00167 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00168
00169
00170 pScr1 += (srcBLen - 1u);
00171
00172
00173
00174
00175 arm_copy_q15(pIn1, pScr1, srcALen);
00176
00177
00178 pScr1 += srcALen;
00179
00180
00181 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00182
00183
00184 pScr1 += (srcBLen - 1u);
00185
00186
00187 pIn2 = py;
00188
00189 pScratch1 += firstIndex;
00190
00191 pOut = pDst + firstIndex;
00192
00193
00194
00195
00196
00197 blkCnt = (numPoints) >> 2;
00198
00199 while(blkCnt > 0)
00200 {
00201
00202 pScr1 = pScratch1;
00203
00204
00205 acc0 = 0;
00206 acc1 = 0;
00207 acc2 = 0;
00208 acc3 = 0;
00209
00210
00211 x1 = *__SIMD32(pScr1)++;
00212
00213
00214 x2 = *__SIMD32(pScr1)++;
00215
00216 tapCnt = (srcBLen) >> 2u;
00217
00218 while(tapCnt > 0u)
00219 {
00220
00221
00222 y1 = _SIMD32_OFFSET(pIn2);
00223 y2 = _SIMD32_OFFSET(pIn2 + 2u);
00224
00225
00226 acc0 = __SMLAD(x1, y1, acc0);
00227 acc2 = __SMLAD(x2, y1, acc2);
00228
00229
00230 #ifndef ARM_MATH_BIG_ENDIAN
00231 x3 = __PKHBT(x2, x1, 0);
00232 #else
00233 x3 = __PKHBT(x1, x2, 0);
00234 #endif
00235
00236
00237 acc1 = __SMLADX(x3, y1, acc1);
00238
00239
00240 x1 = _SIMD32_OFFSET(pScr1);
00241
00242
00243 acc0 = __SMLAD(x2, y2, acc0);
00244
00245 acc2 = __SMLAD(x1, y2, acc2);
00246
00247
00248 #ifndef ARM_MATH_BIG_ENDIAN
00249 x3 = __PKHBT(x1, x2, 0);
00250 #else
00251 x3 = __PKHBT(x2, x1, 0);
00252 #endif
00253
00254 acc3 = __SMLADX(x3, y1, acc3);
00255 acc1 = __SMLADX(x3, y2, acc1);
00256
00257 x2 = _SIMD32_OFFSET(pScr1 + 2u);
00258
00259 #ifndef ARM_MATH_BIG_ENDIAN
00260 x3 = __PKHBT(x2, x1, 0);
00261 #else
00262 x3 = __PKHBT(x1, x2, 0);
00263 #endif
00264
00265 acc3 = __SMLADX(x3, y2, acc3);
00266
00267
00268 pIn2 += 4u;
00269 pScr1 += 4u;
00270
00271
00272
00273 tapCnt--;
00274 }
00275
00276
00277 pScr1 -= 4u;
00278
00279
00280 tapCnt = (srcBLen) & 3u;
00281
00282 while(tapCnt > 0u)
00283 {
00284
00285
00286 acc0 += (*pScr1++ * *pIn2);
00287 acc1 += (*pScr1++ * *pIn2);
00288 acc2 += (*pScr1++ * *pIn2);
00289 acc3 += (*pScr1++ * *pIn2++);
00290
00291 pScr1 -= 3u;
00292
00293
00294 tapCnt--;
00295 }
00296
00297 blkCnt--;
00298
00299
00300
00301
00302 #ifndef ARM_MATH_BIG_ENDIAN
00303
00304 #ifdef CCS
00305 *__SIMD32(pOut)++ =
00306 __PKHBT(__SSATA(acc0, 15, 16), __SSATA(acc1, 15, 16), 16);
00307 *__SIMD32(pOut)++ =
00308 __PKHBT(__SSATA(acc2, 15, 16), __SSATA(acc3, 15, 16), 16);
00309 #else
00310 *__SIMD32(pOut)++ =
00311 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00312 *__SIMD32(pOut)++ =
00313 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00314 #endif
00315
00316 #else
00317
00318 #ifdef CCS
00319 *__SIMD32(pOut)++ =
00320 __PKHBT(__SSATA(acc1, 15, 16), __SSATA(acc0, 15, 16), 16);
00321 *__SIMD32(pOut)++ =
00322 __PKHBT(__SSATA(acc3, 15, 16), __SSATA(acc2, 15, 16), 16);
00323
00324 #else
00325 *__SIMD32(pOut)++ =
00326 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00327 *__SIMD32(pOut)++ =
00328 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00329 #endif
00330
00331 #endif
00332
00333
00334 pIn2 = py;
00335
00336 pScratch1 += 4u;
00337
00338 }
00339
00340
00341 blkCnt = numPoints & 0x3;
00342
00343
00344 while(blkCnt > 0)
00345 {
00346
00347 pScr1 = pScratch1;
00348
00349
00350 acc0 = 0;
00351
00352 tapCnt = (srcBLen) >> 1u;
00353
00354 while(tapCnt > 0u)
00355 {
00356
00357
00358 x1 = *__SIMD32(pScr1)++;
00359
00360
00361 y1 = *__SIMD32(pIn2)++;
00362
00363 acc0 = __SMLAD(x1, y1, acc0);
00364
00365
00366 tapCnt--;
00367 }
00368
00369 tapCnt = (srcBLen) & 1u;
00370
00371
00372 while(tapCnt > 0u)
00373 {
00374
00375
00376 acc0 += (*pScr1++ * *pIn2++);
00377
00378
00379 tapCnt--;
00380 }
00381
00382 blkCnt--;
00383
00384
00385
00386 #ifdef CCS
00387 *pOut++ = (q15_t) (__SSATA(acc0, 15, 16));
00388 #else
00389 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00390 #endif
00391
00392
00393
00394 pIn2 = py;
00395
00396 pScratch1 += 1u;
00397
00398 }
00399
00400 status = ARM_MATH_SUCCESS;
00401 }
00402
00403 return (status);
00404 }
00405
00406 #else
00407
00408 arm_status arm_conv_partial_fast_q15(
00409 q15_t * pSrcA,
00410 uint32_t srcALen,
00411 q15_t * pSrcB,
00412 uint32_t srcBLen,
00413 q15_t * pDst,
00414 uint32_t firstIndex,
00415 uint32_t numPoints,
00416 q15_t * pScratch1,
00417 q15_t *pScratch2)
00418 {
00419
00420 q15_t *pOut = pDst;
00421 q15_t *pScr1 = pScratch1;
00422 q15_t *pScr2 = pScratch2;
00423 q31_t acc0, acc1, acc2, acc3;
00424 q15_t *pIn1;
00425 q15_t *pIn2;
00426 q15_t *px;
00427 q15_t *py;
00428 uint32_t j, k, blkCnt;
00429 arm_status status;
00430 uint32_t tapCnt;
00431 q15_t x10, x11, x20, x21;
00432 q15_t y10, y11;
00433
00434
00435
00436 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00437 {
00438
00439 status = ARM_MATH_ARGUMENT_ERROR;
00440 }
00441 else
00442 {
00443
00444
00445
00446
00447 if(srcALen >= srcBLen)
00448 {
00449
00450 pIn1 = pSrcA;
00451
00452
00453 pIn2 = pSrcB;
00454 }
00455 else
00456 {
00457
00458 pIn1 = pSrcB;
00459
00460
00461 pIn2 = pSrcA;
00462
00463
00464 j = srcBLen;
00465 srcBLen = srcALen;
00466 srcALen = j;
00467 }
00468
00469
00470 py = pScratch2;
00471
00472
00473 pScr2 = pScratch2 + srcBLen - 1;
00474
00475
00476 px = pIn2;
00477
00478
00479 k = srcBLen >> 2u;
00480
00481
00482
00483 while(k > 0u)
00484 {
00485
00486 *pScr2-- = *px++;
00487 *pScr2-- = *px++;
00488 *pScr2-- = *px++;
00489 *pScr2-- = *px++;
00490
00491
00492 k--;
00493 }
00494
00495
00496
00497 k = srcBLen % 0x4u;
00498
00499 while(k > 0u)
00500 {
00501
00502 *pScr2-- = *px++;
00503
00504
00505 k--;
00506 }
00507
00508
00509 pScr1 = pScratch1;
00510
00511
00512 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00513
00514
00515 pScr1 += (srcBLen - 1u);
00516
00517
00518
00519
00520
00521 k = srcALen >> 2u;
00522
00523
00524
00525 while(k > 0u)
00526 {
00527
00528 *pScr1++ = *pIn1++;
00529 *pScr1++ = *pIn1++;
00530 *pScr1++ = *pIn1++;
00531 *pScr1++ = *pIn1++;
00532
00533
00534 k--;
00535 }
00536
00537
00538
00539 k = srcALen % 0x4u;
00540
00541 while(k > 0u)
00542 {
00543
00544 *pScr1++ = *pIn1++;
00545
00546
00547 k--;
00548 }
00549
00550
00551
00552 k = (srcBLen - 1u) >> 2u;
00553
00554
00555
00556 while(k > 0u)
00557 {
00558
00559 *pScr1++ = 0;
00560 *pScr1++ = 0;
00561 *pScr1++ = 0;
00562 *pScr1++ = 0;
00563
00564
00565 k--;
00566 }
00567
00568
00569
00570 k = (srcBLen - 1u) % 0x4u;
00571
00572 while(k > 0u)
00573 {
00574
00575 *pScr1++ = 0;
00576
00577
00578 k--;
00579 }
00580
00581
00582
00583 pIn2 = py;
00584
00585 pScratch1 += firstIndex;
00586
00587 pOut = pDst + firstIndex;
00588
00589
00590 blkCnt = (numPoints) >> 2;
00591
00592 while(blkCnt > 0)
00593 {
00594
00595 pScr1 = pScratch1;
00596
00597
00598 acc0 = 0;
00599 acc1 = 0;
00600 acc2 = 0;
00601 acc3 = 0;
00602
00603
00604 x10 = *pScr1++;
00605 x11 = *pScr1++;
00606
00607
00608 x20 = *pScr1++;
00609 x21 = *pScr1++;
00610
00611 tapCnt = (srcBLen) >> 2u;
00612
00613 while(tapCnt > 0u)
00614 {
00615
00616
00617 y10 = *pIn2;
00618 y11 = *(pIn2 + 1u);
00619
00620
00621 acc0 += (q31_t) x10 * y10;
00622 acc0 += (q31_t) x11 * y11;
00623 acc2 += (q31_t) x20 * y10;
00624 acc2 += (q31_t) x21 * y11;
00625
00626
00627 acc1 += (q31_t) x11 * y10;
00628 acc1 += (q31_t) x20 * y11;
00629
00630
00631 x10 = *pScr1;
00632 x11 = *(pScr1 + 1u);
00633
00634
00635 acc3 += (q31_t) x21 * y10;
00636 acc3 += (q31_t) x10 * y11;
00637
00638
00639 y10 = *(pIn2 + 2u);
00640 y11 = *(pIn2 + 3u);
00641
00642
00643 acc0 += (q31_t) x20 * y10;
00644 acc0 += (q31_t) x21 * y11;
00645 acc2 += (q31_t) x10 * y10;
00646 acc2 += (q31_t) x11 * y11;
00647 acc1 += (q31_t) x21 * y10;
00648 acc1 += (q31_t) x10 * y11;
00649
00650
00651 x20 = *(pScr1 + 2);
00652 x21 = *(pScr1 + 3);
00653
00654
00655 acc3 += (q31_t) x11 * y10;
00656 acc3 += (q31_t) x20 * y11;
00657
00658
00659 pIn2 += 4u;
00660 pScr1 += 4u;
00661
00662
00663 tapCnt--;
00664 }
00665
00666
00667 pScr1 -= 4u;
00668
00669
00670 tapCnt = (srcBLen) & 3u;
00671
00672 while(tapCnt > 0u)
00673 {
00674
00675 acc0 += (*pScr1++ * *pIn2);
00676 acc1 += (*pScr1++ * *pIn2);
00677 acc2 += (*pScr1++ * *pIn2);
00678 acc3 += (*pScr1++ * *pIn2++);
00679
00680 pScr1 -= 3u;
00681
00682
00683 tapCnt--;
00684 }
00685
00686 blkCnt--;
00687
00688
00689
00690 #ifdef CCS
00691 *pOut++ = __SSATA(acc0, 15, 16);
00692 *pOut++ = __SSATA(acc1, 15, 16);
00693 *pOut++ = __SSATA(acc2, 15, 16);
00694 *pOut++ = __SSATA(acc3, 15, 16);
00695 #else
00696 *pOut++ = __SSAT((acc0 >> 15), 16);
00697 *pOut++ = __SSAT((acc1 >> 15), 16);
00698 *pOut++ = __SSAT((acc2 >> 15), 16);
00699 *pOut++ = __SSAT((acc3 >> 15), 16);
00700 #endif
00701
00702
00703 pIn2 = py;
00704
00705 pScratch1 += 4u;
00706
00707 }
00708
00709
00710 blkCnt = numPoints & 0x3;
00711
00712
00713 while(blkCnt > 0)
00714 {
00715
00716 pScr1 = pScratch1;
00717
00718
00719 acc0 = 0;
00720
00721 tapCnt = (srcBLen) >> 1u;
00722
00723 while(tapCnt > 0u)
00724 {
00725
00726
00727 x10 = *pScr1++;
00728 x11 = *pScr1++;
00729
00730
00731 y10 = *pIn2++;
00732 y11 = *pIn2++;
00733
00734
00735 acc0 += (q31_t) x10 * y10;
00736 acc0 += (q31_t) x11 * y11;
00737
00738
00739 tapCnt--;
00740 }
00741
00742 tapCnt = (srcBLen) & 1u;
00743
00744
00745 while(tapCnt > 0u)
00746 {
00747
00748
00749 acc0 += (*pScr1++ * *pIn2++);
00750
00751
00752 tapCnt--;
00753 }
00754
00755 blkCnt--;
00756
00757
00758 #ifdef CCS
00759 *pOut++ = (q15_t) (__SSATA(acc0, 15, 16));
00760 #else
00761 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00762 #endif
00763
00764
00765 pIn2 = py;
00766
00767 pScratch1 += 1u;
00768
00769 }
00770
00771
00772 status = ARM_MATH_SUCCESS;
00773
00774 }
00775
00776
00777 return (status);
00778 }
00779
00780 #endif
00781
00782