00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "arm_math.h"
00026
00063 #ifndef UNALIGNED_SUPPORT_DISABLE
00064
/**
 * Partial convolution of two Q15 sequences. This is the variant compiled when
 * UNALIGNED_SUPPORT_DISABLE is not defined; it reads and writes pairs of
 * 16-bit samples through 32-bit accesses and uses the dual 16-bit
 * multiply-accumulate intrinsics.
 *
 * Method, as implemented below:
 *   1. The shorter input is copied in reverse order into pScratch2.
 *   2. pScratch1 is laid out as (shorter length - 1) zeros, the longer input,
 *      then (shorter length - 1) zeros.
 *   3. Output sample n is the sum of products of the window
 *      pScratch1[n .. n + shorter length - 1] with pScratch2, accumulated in
 *      64 bits, shifted right by 15 and saturated to 16 bits.
 *   Outputs are produced four at a time, then one at a time for the rest.
 *
 * @param pSrcA       first input sequence
 * @param srcALen     number of samples in pSrcA
 * @param pSrcB       second input sequence
 * @param srcBLen     number of samples in pSrcB
 * @param pDst        output buffer; results are written starting at
 *                    pDst[firstIndex], earlier entries are not touched
 * @param firstIndex  index of the first output sample to compute
 * @param numPoints   number of output samples to compute
 * @param pScratch1   work buffer; max(srcALen, srcBLen) +
 *                    2 * (min(srcALen, srcBLen) - 1) samples are written to it
 * @param pScratch2   work buffer; min(srcALen, srcBLen) samples are written
 * @return ARM_MATH_ARGUMENT_ERROR when firstIndex + numPoints exceeds
 *         srcALen + srcBLen - 1 (nothing is computed in that case),
 *         otherwise ARM_MATH_SUCCESS.
 *
 * NOTE(review): firstIndex + numPoints is evaluated in uint32_t and could wrap
 * for very large arguments; srcBLen - 1u also wraps if the shorter length is
 * 0. Presumably callers guarantee non-empty inputs - TODO confirm.
 * NOTE(review): the four-output loop loads samples ahead of the window it is
 * accumulating; for the last outputs this appears to read slightly past the
 * region filled in pScratch1 - confirm the scratch buffer is sized for that.
 */
00065 arm_status arm_conv_partial_q15(
00066 q15_t * pSrcA,
00067 uint32_t srcALen,
00068 q15_t * pSrcB,
00069 uint32_t srcBLen,
00070 q15_t * pDst,
00071 uint32_t firstIndex,
00072 uint32_t numPoints,
00073 q15_t * pScratch1,
00074 q15_t *pScratch2)
00075 {
00076
00077 q15_t *pOut = pDst; /* output write pointer */
00078 q15_t *pScr1 = pScratch1; /* cursor into scratch buffer 1 */
00079 q15_t *pScr2 = pScratch2; /* cursor into scratch buffer 2 */
00080 q63_t acc0, acc1, acc2, acc3; /* 64-bit accumulators for four consecutive outputs */
00081 q31_t x1, x2, x3; /* packed pairs of samples taken from scratch buffer 1 */
00082 q31_t y1, y2; /* packed pairs of samples taken from scratch buffer 2 */
00083 q15_t *pIn1; /* the longer input (pSrcA when lengths are equal) */
00084 q15_t *pIn2; /* the shorter input; later the reversed copy in pScratch2 */
00085 q15_t *px; /* read cursor used while reversing the shorter input */
00086 q15_t *py; /* start of the reversed copy (pScratch2) */
00087 uint32_t j, k, blkCnt; /* swap temporary, copy loop counter, output loop counter */
00088 arm_status status; /* value returned to the caller */
00089 uint32_t tapCnt; /* inner loop counter over the shorter sequence */
00090
00091 /* The full convolution has srcALen + srcBLen - 1 samples; reject ranges beyond it */
00092 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00093 {
00094 /* Requested range is out of bounds: report it and compute nothing */
00095 status = ARM_MATH_ARGUMENT_ERROR;
00096 }
00097 else
00098 {
00099
00100 /* Arrange for pIn1 / srcALen to describe the longer sequence and */
00101 /* pIn2 / srcBLen the shorter one; the rest of the code relies on this */
00102
00103 if(srcALen >= srcBLen)
00104 {
00105 /* pSrcA is at least as long as pSrcB: keep the order as given */
00106 pIn1 = pSrcA;
00107
00108
00109 pIn2 = pSrcB;
00110 }
00111 else
00112 {
00113 /* pSrcB is longer: exchange the roles of the two inputs */
00114 pIn1 = pSrcB;
00115
00116
00117 pIn2 = pSrcA;
00118
00119 /* Exchange the lengths as well, using j as the temporary */
00120 j = srcBLen;
00121 srcBLen = srcALen;
00122 srcALen = j;
00123 }
00124
00125 /* Remember where the reversed copy of the shorter sequence starts */
00126 py = pScratch2;
00127
00128 /* Start writing at the last slot so the copy ends up in reverse order */
00129 pScr2 = pScratch2 + srcBLen - 1;
00130
00131 /* Read the shorter sequence from its first sample */
00132 px = pIn2;
00133
00134 /* Reverse copy, unrolled by four */
00135 k = srcBLen >> 2u;
00136
00137
00138
00139 while(k > 0u)
00140 {
00141 /* Source advances while destination walks backwards */
00142 *pScr2-- = *px++;
00143 *pScr2-- = *px++;
00144 *pScr2-- = *px++;
00145 *pScr2-- = *px++;
00146
00147
00148 k--;
00149 }
00150
00151
00152 /* Reverse copy of the remaining 0 to 3 samples */
00153 k = srcBLen % 0x4u;
00154
00155 while(k > 0u)
00156 {
00157
00158 *pScr2-- = *px++;
00159
00160
00161 k--;
00162 }
00163
00164 /* Build the padded copy of the longer sequence in scratch buffer 1 */
00165 pScr1 = pScratch1;
00166
00167 /* Leading padding: srcBLen - 1 zeros */
00168 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00169
00170 /* Step over the leading padding */
00171 pScr1 += (srcBLen - 1u);
00172
00173
00174
00175 /* Copy the srcALen samples of the longer sequence after the padding */
00176 arm_copy_q15(pIn1, pScr1, srcALen);
00177
00178 /* Step over the copied samples */
00179 pScr1 += srcALen;
00180
00181 /* Trailing padding: another srcBLen - 1 zeros */
00182 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00183
00184 /* Step over the trailing padding */
00185 pScr1 += (srcBLen - 1u);
00186
00187 /* From here on pIn2 walks the reversed copy in scratch buffer 2 */
00188 pIn2 = py;
00189 /* Window for output firstIndex starts at offset firstIndex in scratch buffer 1 */
00190 pScratch1 += firstIndex;
00191 /* Results are stored at their absolute index in pDst */
00192 pOut = pDst + firstIndex;
00193
00194 /* Main loop: four output samples per iteration */
00195 blkCnt = (numPoints) >> 2;
00196
00197 while(blkCnt > 0)
00198 {
00199 /* Restart at the window base for this group of four outputs */
00200 pScr1 = pScratch1;
00201
00202 /* Clear the four accumulators */
00203 acc0 = 0;
00204 acc1 = 0;
00205 acc2 = 0;
00206 acc3 = 0;
00207
00208 /* Preload window samples 0 and 1 as one packed word */
00209 x1 = *__SIMD32(pScr1)++;
00210
00211 /* Preload window samples 2 and 3; pScr1 is now 4 samples past the base */
00212 x2 = *__SIMD32(pScr1)++;
00213 /* Process the reversed sequence four samples at a time */
00214 tapCnt = (srcBLen) >> 2u;
00215
00216 while(tapCnt > 0u)
00217 {
00218
00219 /* Load four samples of the reversed sequence as two packed words */
00220 y1 = _SIMD32_OFFSET(pIn2);
00221 y2 = _SIMD32_OFFSET(pIn2 + 2u);
00222
00223 /* Dual multiply-accumulate: samples 0,1 feed acc0 and samples 2,3 feed acc2 */
00224 acc0 = __SMLALD(x1, y1, acc0);
00225 acc2 = __SMLALD(x2, y1, acc2);
00226
00227 /* x3 = one halfword from each of x1 and x2 (samples 1 and 2); operand order is endian specific */
00228 #ifndef ARM_MATH_BIG_ENDIAN
00229 x3 = __PKHBT(x2, x1, 0);
00230 #else
00231 x3 = __PKHBT(x1, x2, 0);
00232 #endif
00233
00234 /* Exchanged dual multiply-accumulate of the straddling pair into acc1 */
00235 acc1 = __SMLALDX(x3, y1, acc1);
00236
00237 /* Load samples 4 and 5 without advancing pScr1 */
00238 x1 = _SIMD32_OFFSET(pScr1);
00239
00240 /* Second pair of reversed samples: samples 2,3 feed acc0 and samples 4,5 feed acc2 */
00241 acc0 = __SMLALD(x2, y2, acc0);
00242 acc2 = __SMLALD(x1, y2, acc2);
00243
00244 /* x3 = samples 3 and 4, again assembled from halves of x2 and x1 */
00245 #ifndef ARM_MATH_BIG_ENDIAN
00246 x3 = __PKHBT(x1, x2, 0);
00247 #else
00248 x3 = __PKHBT(x2, x1, 0);
00249 #endif
00250 /* Samples 3,4 contribute to acc3 with y1 and to acc1 with y2 */
00251 acc3 = __SMLALDX(x3, y1, acc3);
00252 acc1 = __SMLALDX(x3, y2, acc1);
00253 /* Load samples 6 and 7; they also serve as samples 2,3 of the next iteration */
00254 x2 = _SIMD32_OFFSET(pScr1 + 2u);
00255 /* x3 = samples 5 and 6 */
00256 #ifndef ARM_MATH_BIG_ENDIAN
00257 x3 = __PKHBT(x2, x1, 0);
00258 #else
00259 x3 = __PKHBT(x1, x2, 0);
00260 #endif
00261 /* Samples 5,6 complete this iteration for acc3 */
00262 acc3 = __SMLALDX(x3, y2, acc3);
00263
00264 /* Advance both cursors by the four samples just consumed */
00265 pIn2 += 4u;
00266 pScr1 += 4u;
00267
00268
00269
00270 tapCnt--;
00271 }
00272
00273 /* Undo the four-sample preload so pScr1 points at the first unprocessed sample for acc0 */
00274 pScr1 -= 4u;
00275
00276 /* Remaining 0 to 3 samples of the reversed sequence, one at a time */
00277 tapCnt = (srcBLen) & 3u;
00278
00279 while(tapCnt > 0u)
00280 {
00281 /* One reversed sample times four consecutive window samples, one per accumulator */
00282 acc0 += (*pScr1++ * *pIn2);
00283 acc1 += (*pScr1++ * *pIn2);
00284 acc2 += (*pScr1++ * *pIn2);
00285 acc3 += (*pScr1++ * *pIn2++);
00286 /* Net advance of one sample per iteration */
00287 pScr1 -= 3u;
00288
00289
00290 tapCnt--;
00291 }
00292
00293 blkCnt--;
00294
00295
00296 /* Scale each sum down by 15 bits, saturate to 16 bits, and store two results per 32-bit write */
00297 /* The CCS build uses __SSATA(acc, 15, 16) in place of __SSAT(acc >> 15, 16) */
00298 #ifndef ARM_MATH_BIG_ENDIAN
00299 /* Little endian: acc0 / acc2 go in the low halfword, acc1 / acc3 in the high halfword */
00300 #ifdef CCS
00301 *__SIMD32(pOut)++ =
00302 __PKHBT(__SSATA(acc0, 15, 16), __SSATA(acc1, 15, 16), 16);
00303 *__SIMD32(pOut)++ =
00304 __PKHBT(__SSATA(acc2, 15, 16), __SSATA(acc3, 15, 16), 16);
00305 #else
00306 *__SIMD32(pOut)++ =
00307 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
00308 *__SIMD32(pOut)++ =
00309 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
00310 #endif
00311
00312 #else
00313 /* Big endian: the halfword order inside each packed word is swapped */
00314 #ifdef CCS
00315 *__SIMD32(pOut)++ =
00316 __PKHBT(__SSATA(acc1, 15, 16), __SSATA(acc0, 15, 16), 16);
00317 *__SIMD32(pOut)++ =
00318 __PKHBT(__SSATA(acc3, 15, 16), __SSATA(acc2, 15, 16), 16);
00319
00320 #else
00321 *__SIMD32(pOut)++ =
00322 __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
00323 *__SIMD32(pOut)++ =
00324 __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
00325 #endif
00326
00327 #endif
00328
00329 /* Rewind to the start of the reversed sequence for the next group */
00330 pIn2 = py;
00331 /* Slide the window base forward by the four outputs just produced */
00332 pScratch1 += 4u;
00333
00334 }
00335
00336 /* Remaining 0 to 3 output samples, computed one per iteration */
00337 blkCnt = numPoints & 0x3;
00338
00339
00340 while(blkCnt > 0)
00341 {
00342 /* Restart at the window base for this output */
00343 pScr1 = pScratch1;
00344
00345 /* Single accumulator for this output */
00346 acc0 = 0;
00347 /* Process the reversed sequence two samples at a time */
00348 tapCnt = (srcBLen) >> 1u;
00349
00350 while(tapCnt > 0u)
00351 {
00352
00353 /* Two window samples as one packed word */
00354 x1 = *__SIMD32(pScr1)++;
00355
00356 /* Two samples of the reversed sequence as one packed word */
00357 y1 = *__SIMD32(pIn2)++;
00358 /* Dual multiply-accumulate of both pairs */
00359 acc0 = __SMLALD(x1, y1, acc0);
00360
00361
00362 tapCnt--;
00363 }
00364 /* One sample is left over when the shorter length is odd */
00365 tapCnt = (srcBLen) & 1u;
00366
00367
00368 while(tapCnt > 0u)
00369 {
00370
00371 /* Plain multiply-accumulate of the last pair of samples */
00372 acc0 += (*pScr1++ * *pIn2++);
00373
00374
00375 tapCnt--;
00376 }
00377
00378 blkCnt--;
00379
00380 /* Scale down by 15 bits, saturate to 16 bits and store one result */
00381 #ifdef CCS
00382 *pOut++ = (q15_t) (__SSATA(acc0, 15, 16));
00383 #else
00384 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00385 #endif
00386
00387 /* Rewind to the start of the reversed sequence */
00388 pIn2 = py;
00389 /* Slide the window base forward by one output */
00390 pScratch1 += 1u;
00391
00392 }
00393
00394 /* All requested samples were computed */
00395 status = ARM_MATH_SUCCESS;
00396
00397 }
00398
00399 /* Report either the argument error or success */
00400 return (status);
00401 }
00402
00403 #else
00404
/**
 * Partial convolution of two Q15 sequences. This is the variant compiled when
 * UNALIGNED_SUPPORT_DISABLE is defined; every sample is read and written
 * individually through q15_t pointers, with no packed 32-bit accesses.
 *
 * Method, as implemented below:
 *   1. The shorter input is copied in reverse order into pScratch2.
 *   2. pScratch1 is laid out as (shorter length - 1) zeros, the longer input,
 *      then (shorter length - 1) zeros.
 *   3. Output sample n is the sum of products of the window
 *      pScratch1[n .. n + shorter length - 1] with pScratch2, accumulated in
 *      64 bits, shifted right by 15 and saturated to 16 bits.
 *   Outputs are produced four at a time, then one at a time for the rest.
 *
 * @param pSrcA       first input sequence
 * @param srcALen     number of samples in pSrcA
 * @param pSrcB       second input sequence
 * @param srcBLen     number of samples in pSrcB
 * @param pDst        output buffer; results are written starting at
 *                    pDst[firstIndex], earlier entries are not touched
 * @param firstIndex  index of the first output sample to compute
 * @param numPoints   number of output samples to compute
 * @param pScratch1   work buffer; max(srcALen, srcBLen) +
 *                    2 * (min(srcALen, srcBLen) - 1) samples are written to it
 * @param pScratch2   work buffer; min(srcALen, srcBLen) samples are written
 * @return ARM_MATH_ARGUMENT_ERROR when firstIndex + numPoints exceeds
 *         srcALen + srcBLen - 1 (nothing is computed in that case),
 *         otherwise ARM_MATH_SUCCESS.
 *
 * NOTE(review): firstIndex + numPoints is evaluated in uint32_t and could wrap
 * for very large arguments; srcBLen - 1u also wraps if the shorter length is
 * 0. Presumably callers guarantee non-empty inputs - TODO confirm.
 * NOTE(review): the four-output loop loads samples ahead of the window it is
 * accumulating; for the last outputs this appears to read slightly past the
 * region filled in pScratch1 - confirm the scratch buffer is sized for that.
 */
00405 arm_status arm_conv_partial_q15(
00406 q15_t * pSrcA,
00407 uint32_t srcALen,
00408 q15_t * pSrcB,
00409 uint32_t srcBLen,
00410 q15_t * pDst,
00411 uint32_t firstIndex,
00412 uint32_t numPoints,
00413 q15_t * pScratch1,
00414 q15_t *pScratch2)
00415 {
00416
00417 q15_t *pOut = pDst; /* output write pointer */
00418 q15_t *pScr1 = pScratch1; /* cursor into scratch buffer 1 */
00419 q15_t *pScr2 = pScratch2; /* cursor into scratch buffer 2 */
00420 q63_t acc0, acc1, acc2, acc3; /* 64-bit accumulators for four consecutive outputs */
00421 q15_t *pIn1; /* the longer input (pSrcA when lengths are equal) */
00422 q15_t *pIn2; /* the shorter input; later the reversed copy in pScratch2 */
00423 q15_t *px; /* read cursor used while reversing the shorter input */
00424 q15_t *py; /* start of the reversed copy (pScratch2) */
00425 uint32_t j, k, blkCnt; /* swap temporary, copy loop counter, output loop counter */
00426 arm_status status; /* value returned to the caller */
00427 uint32_t tapCnt; /* inner loop counter over the shorter sequence */
00428 q15_t x10, x11, x20, x21; /* four window samples held from scratch buffer 1 */
00429 q15_t y10, y11; /* two samples of the reversed sequence */
00430
00431
00432 /* The full convolution has srcALen + srcBLen - 1 samples; reject ranges beyond it */
00433 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00434 {
00435 /* Requested range is out of bounds: report it and compute nothing */
00436 status = ARM_MATH_ARGUMENT_ERROR;
00437 }
00438 else
00439 {
00440
00441 /* Arrange for pIn1 / srcALen to describe the longer sequence and */
00442 /* pIn2 / srcBLen the shorter one; the rest of the code relies on this */
00443
00444 if(srcALen >= srcBLen)
00445 {
00446 /* pSrcA is at least as long as pSrcB: keep the order as given */
00447 pIn1 = pSrcA;
00448
00449
00450 pIn2 = pSrcB;
00451 }
00452 else
00453 {
00454 /* pSrcB is longer: exchange the roles of the two inputs */
00455 pIn1 = pSrcB;
00456
00457
00458 pIn2 = pSrcA;
00459
00460 /* Exchange the lengths as well, using j as the temporary */
00461 j = srcBLen;
00462 srcBLen = srcALen;
00463 srcALen = j;
00464 }
00465
00466 /* Remember where the reversed copy of the shorter sequence starts */
00467 py = pScratch2;
00468
00469 /* Start writing at the last slot so the copy ends up in reverse order */
00470 pScr2 = pScratch2 + srcBLen - 1;
00471
00472 /* Read the shorter sequence from its first sample */
00473 px = pIn2;
00474
00475 /* Reverse copy, unrolled by four */
00476 k = srcBLen >> 2u;
00477
00478
00479
00480 while(k > 0u)
00481 {
00482 /* Source advances while destination walks backwards */
00483 *pScr2-- = *px++;
00484 *pScr2-- = *px++;
00485 *pScr2-- = *px++;
00486 *pScr2-- = *px++;
00487
00488
00489 k--;
00490 }
00491
00492
00493 /* Reverse copy of the remaining 0 to 3 samples */
00494 k = srcBLen % 0x4u;
00495
00496 while(k > 0u)
00497 {
00498
00499 *pScr2-- = *px++;
00500
00501
00502 k--;
00503 }
00504
00505 /* Build the padded copy of the longer sequence in scratch buffer 1 */
00506 pScr1 = pScratch1;
00507
00508 /* Leading padding: srcBLen - 1 zeros */
00509 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00510
00511 /* Step over the leading padding */
00512 pScr1 += (srcBLen - 1u);
00513
00514
00515
00516
00517 /* Copy the longer sequence after the padding, unrolled by four */
00518 k = srcALen >> 2u;
00519
00520
00521
00522 while(k > 0u)
00523 {
00524
00525 *pScr1++ = *pIn1++;
00526 *pScr1++ = *pIn1++;
00527 *pScr1++ = *pIn1++;
00528 *pScr1++ = *pIn1++;
00529
00530
00531 k--;
00532 }
00533
00534
00535 /* Copy the remaining 0 to 3 samples of the longer sequence */
00536 k = srcALen % 0x4u;
00537
00538 while(k > 0u)
00539 {
00540
00541 *pScr1++ = *pIn1++;
00542
00543
00544 k--;
00545 }
00546
00547
00548 /* Trailing padding of srcBLen - 1 zeros, unrolled by four */
00549 k = (srcBLen - 1u) >> 2u;
00550
00551
00552
00553 while(k > 0u)
00554 {
00555
00556 *pScr1++ = 0;
00557 *pScr1++ = 0;
00558 *pScr1++ = 0;
00559 *pScr1++ = 0;
00560
00561
00562 k--;
00563 }
00564
00565
00566 /* Remaining 0 to 3 zeros of the trailing padding */
00567 k = (srcBLen - 1u) % 0x4u;
00568
00569 while(k > 0u)
00570 {
00571
00572 *pScr1++ = 0;
00573
00574
00575 k--;
00576 }
00577
00578
00579 /* From here on pIn2 walks the reversed copy in scratch buffer 2 */
00580 pIn2 = py;
00581 /* Window for output firstIndex starts at offset firstIndex in scratch buffer 1 */
00582 pScratch1 += firstIndex;
00583 /* Results are stored at their absolute index in pDst */
00584 pOut = pDst + firstIndex;
00585
00586 /* Main loop: four output samples per iteration */
00587 blkCnt = (numPoints) >> 2;
00588
00589 while(blkCnt > 0)
00590 {
00591 /* Restart at the window base for this group of four outputs */
00592 pScr1 = pScratch1;
00593
00594 /* Clear the four accumulators */
00595 acc0 = 0;
00596 acc1 = 0;
00597 acc2 = 0;
00598 acc3 = 0;
00599
00600 /* Preload window samples 0 and 1 */
00601 x10 = *pScr1++;
00602 x11 = *pScr1++;
00603
00604 /* Preload window samples 2 and 3; pScr1 is now 4 samples past the base */
00605 x20 = *pScr1++;
00606 x21 = *pScr1++;
00607 /* Process the reversed sequence four samples at a time */
00608 tapCnt = (srcBLen) >> 2u;
00609
00610 while(tapCnt > 0u)
00611 {
00612
00613 /* First two samples of the reversed sequence for this iteration */
00614 y10 = *pIn2;
00615 y11 = *(pIn2 + 1u);
00616
00617 /* Samples 0,1 feed acc0 and samples 2,3 feed acc2 */
00618 acc0 += (q63_t) x10 * y10;
00619 acc0 += (q63_t) x11 * y11;
00620 acc2 += (q63_t) x20 * y10;
00621 acc2 += (q63_t) x21 * y11;
00622
00623 /* Samples 1,2 feed acc1 */
00624 acc1 += (q63_t) x11 * y10;
00625 acc1 += (q63_t) x20 * y11;
00626
00627 /* Load samples 4 and 5 without advancing pScr1; x10 / x11 are reused for them */
00628 x10 = *pScr1;
00629 x11 = *(pScr1 + 1u);
00630
00631 /* Samples 3,4 feed acc3 */
00632 acc3 += (q63_t) x21 * y10;
00633 acc3 += (q63_t) x10 * y11;
00634
00635 /* Next two samples of the reversed sequence */
00636 y10 = *(pIn2 + 2u);
00637 y11 = *(pIn2 + 3u);
00638
00639 /* Samples 2,3 feed acc0; samples 4,5 feed acc2; samples 3,4 feed acc1 */
00640 acc0 += (q63_t) x20 * y10;
00641 acc0 += (q63_t) x21 * y11;
00642 acc2 += (q63_t) x10 * y10;
00643 acc2 += (q63_t) x11 * y11;
00644 acc1 += (q63_t) x21 * y10;
00645 acc1 += (q63_t) x10 * y11;
00646
00647 /* Load samples 6 and 7; they also serve as samples 2,3 of the next iteration */
00648 x20 = *(pScr1 + 2);
00649 x21 = *(pScr1 + 3);
00650
00651 /* Samples 5,6 complete this iteration for acc3 */
00652 acc3 += (q63_t) x11 * y10;
00653 acc3 += (q63_t) x20 * y11;
00654
00655 /* Advance both cursors by the four samples just consumed */
00656 pIn2 += 4u;
00657 pScr1 += 4u;
00658
00659
00660 tapCnt--;
00661 }
00662
00663 /* Undo the four-sample preload so pScr1 points at the first unprocessed sample for acc0 */
00664 pScr1 -= 4u;
00665
00666 /* Remaining 0 to 3 samples of the reversed sequence, one at a time */
00667 tapCnt = (srcBLen) & 3u;
00668
00669 while(tapCnt > 0u)
00670 {
00671 /* One reversed sample times four consecutive window samples, one per accumulator */
00672 acc0 += (*pScr1++ * *pIn2);
00673 acc1 += (*pScr1++ * *pIn2);
00674 acc2 += (*pScr1++ * *pIn2);
00675 acc3 += (*pScr1++ * *pIn2++);
00676 /* Net advance of one sample per iteration */
00677 pScr1 -= 3u;
00678
00679
00680 tapCnt--;
00681 }
00682
00683 blkCnt--;
00684
00685 /* Scale each sum down by 15 bits, saturate to 16 bits and store the four results in order */
00686 /* The CCS build uses __SSATA(acc, 15, 16) in place of __SSAT(acc >> 15, 16) */
00687 #ifdef CCS
00688 *pOut++ = __SSATA(acc0, 15, 16);
00689 *pOut++ = __SSATA(acc1, 15, 16);
00690 *pOut++ = __SSATA(acc2, 15, 16);
00691 *pOut++ = __SSATA(acc3, 15, 16);
00692 #else
00693 *pOut++ = __SSAT((acc0 >> 15), 16);
00694 *pOut++ = __SSAT((acc1 >> 15), 16);
00695 *pOut++ = __SSAT((acc2 >> 15), 16);
00696 *pOut++ = __SSAT((acc3 >> 15), 16);
00697 #endif
00698
00699 /* Rewind to the start of the reversed sequence for the next group */
00700 pIn2 = py;
00701 /* Slide the window base forward by the four outputs just produced */
00702 pScratch1 += 4u;
00703
00704 }
00705
00706 /* Remaining 0 to 3 output samples, computed one per iteration */
00707 blkCnt = numPoints & 0x3;
00708
00709
00710 while(blkCnt > 0)
00711 {
00712 /* Restart at the window base for this output */
00713 pScr1 = pScratch1;
00714
00715 /* Single accumulator for this output */
00716 acc0 = 0;
00717 /* Process the reversed sequence two samples at a time */
00718 tapCnt = (srcBLen) >> 1u;
00719
00720 while(tapCnt > 0u)
00721 {
00722
00723 /* Two window samples */
00724 x10 = *pScr1++;
00725 x11 = *pScr1++;
00726
00727 /* Two samples of the reversed sequence */
00728 y10 = *pIn2++;
00729 y11 = *pIn2++;
00730
00731 /* Accumulate both products in 64 bits */
00732 acc0 += (q63_t) x10 * y10;
00733 acc0 += (q63_t) x11 * y11;
00734
00735
00736 tapCnt--;
00737 }
00738 /* One sample is left over when the shorter length is odd */
00739 tapCnt = (srcBLen) & 1u;
00740
00741
00742 while(tapCnt > 0u)
00743 {
00744
00745 /* Plain multiply-accumulate of the last pair of samples */
00746 acc0 += (*pScr1++ * *pIn2++);
00747
00748
00749 tapCnt--;
00750 }
00751
00752 blkCnt--;
00753
00754 /* Scale down by 15 bits, saturate to 16 bits and store one result */
00755 #ifdef CCS
00756 *pOut++ = (q15_t) (__SSATA(acc0, 15, 16));
00757 #else
00758 *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
00759 #endif
00760
00761 /* Rewind to the start of the reversed sequence */
00762 pIn2 = py;
00763 /* Slide the window base forward by one output */
00764 pScratch1 += 1u;
00765
00766 }
00767
00768 /* All requested samples were computed */
00769 status = ARM_MATH_SUCCESS;
00770
00771 }
00772
00773 /* Report either the argument error or success */
00774 return (status);
00775 }
00776
00777 #endif
00778