00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "arm_math.h"
00026
00058 #ifndef UNALIGNED_SUPPORT_DISABLE
00059
/*
 * Partial convolution of two q7_t sequences using caller-supplied q15_t
 * scratch buffers. This definition is the one compiled when
 * UNALIGNED_SUPPORT_DISABLE is not defined; it reads the scratch buffers
 * and writes the output through 32-bit accesses (__SIMD32 / _SIMD32_OFFSET,
 * macros supplied by arm_math.h).
 *
 * Parameters:
 *   pSrcA, srcALen  first input sequence and its length
 *   pSrcB, srcBLen  second input sequence and its length
 *   pDst            output buffer; results are stored at
 *                   pDst[firstIndex] .. pDst[firstIndex + numPoints - 1]
 *   firstIndex      index of the first output sample to compute
 *   numPoints       number of output samples to compute
 *   pScratch1       receives (shorter - 1) zeros, the longer input widened
 *                   to q15_t, then (shorter - 1) zeros
 *   pScratch2       receives the shorter input, reversed and widened to q15_t
 *
 * Returns ARM_MATH_ARGUMENT_ERROR when firstIndex + numPoints exceeds
 * srcALen + srcBLen - 1 (nothing is written in that case), otherwise
 * ARM_MATH_SUCCESS.
 *
 * NOTE(review): the four-output loop loads samples ahead of the taps it
 * applies, so it may read a few q15_t beyond the data written to pScratch1;
 * confirm the caller's scratch buffer is sized for that.
 */
00060 arm_status arm_conv_partial_q7(
00061 q7_t * pSrcA,
00062 uint32_t srcALen,
00063 q7_t * pSrcB,
00064 uint32_t srcBLen,
00065 q7_t * pDst,
00066 uint32_t firstIndex,
00067 uint32_t numPoints,
00068 q15_t * pScratch1,
00069 q15_t *pScratch2)
00070 {
00071 /* Working pointers, counters and accumulators */
00072 q15_t *pScr2, *pScr1;
00073 q15_t x4;
00074 q7_t *pIn1, *pIn2;
00075 uint32_t j, k, blkCnt, tapCnt;
00076 q7_t *px;
00077 q15_t *py;
00078 q31_t acc0, acc1, acc2, acc3;
00079 q31_t x1, x2, x3, y1;
00080 arm_status status;
00081 q7_t *pOut = pDst;
00082 q7_t out0, out1, out2, out3;
00083
00084 /* Reject a request where firstIndex + numPoints exceeds srcALen + srcBLen - 1 */
00085 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00086 {
00087 /* Only the status is set on this path; no buffer is touched */
00088 status = ARM_MATH_ARGUMENT_ERROR;
00089 }
00090 else
00091 {
00092
00093 /* Arrange for pIn1 to address the longer sequence and pIn2 the shorter, */
00094 /* so that srcALen >= srcBLen holds for the rest of the function. */
00095
00096 if(srcALen >= srcBLen)
00097 {
00098 /* A is at least as long as B: use the inputs as given */
00099 pIn1 = pSrcA;
00100
00101
00102 pIn2 = pSrcB;
00103 }
00104 else
00105 {
00106 /* B is longer: exchange the roles of the two inputs */
00107 pIn1 = pSrcB;
00108
00109
00110 pIn2 = pSrcA;
00111
00112 /* Exchange the lengths as well (j is only a temporary) */
00113 j = srcBLen;
00114 srcBLen = srcALen;
00115 srcALen = j;
00116 }
00117
00118 /* Start writing at the beginning of scratch buffer 2 */
00119 pScr2 = pScratch2;
00120
00121 /* px walks the shorter sequence backwards from its last sample */
00122 px = pIn2 + srcBLen - 1;
00123
00124 /* Number of groups of four samples in the shorter sequence */
00125 k = srcBLen >> 2u;
00126
00127 /* Copy the shorter sequence into scratch 2 in reverse order, */
00128 /* widening each q7_t to q15_t, four samples per pass. */
00129 while(k > 0u)
00130 {
00131
00132 x4 = (q15_t) *px--;
00133 *pScr2++ = x4;
00134 x4 = (q15_t) *px--;
00135 *pScr2++ = x4;
00136 x4 = (q15_t) *px--;
00137 *pScr2++ = x4;
00138 x4 = (q15_t) *px--;
00139 *pScr2++ = x4;
00140
00141
00142 k--;
00143 }
00144
00145
00146 /* Copy the 0-3 samples left over after the groups of four */
00147 k = srcBLen % 0x4u;
00148
00149 while(k > 0u)
00150 {
00151
00152 x4 = (q15_t) *px--;
00153 *pScr2++ = x4;
00154
00155
00156 k--;
00157 }
00158
00159 /* Build the zero-padded copy of the longer sequence in scratch buffer 1 */
00160 pScr1 = pScratch1;
00161
00162 /* Leading pad of srcBLen - 1 zeros. NOTE(review): srcBLen - 1u wraps if the shorter length is 0 -- confirm callers never pass it */
00163 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00164
00165 /* Skip past the leading zeros */
00166 pScr1 += (srcBLen - 1u);
00167
00168
00169 /* Number of groups of four samples in the longer sequence */
00170 k = srcALen >> 2u;
00171
00172
00173 /* Widen the longer sequence to q15_t and append it after the leading zeros */
00174 while(k > 0u)
00175 {
00176
00177 x4 = (q15_t) *pIn1++;
00178 *pScr1++ = x4;
00179 x4 = (q15_t) *pIn1++;
00180 *pScr1++ = x4;
00181 x4 = (q15_t) *pIn1++;
00182 *pScr1++ = x4;
00183 x4 = (q15_t) *pIn1++;
00184 *pScr1++ = x4;
00185
00186
00187 k--;
00188 }
00189
00190
00191 /* Remaining 0-3 samples of the longer sequence */
00192 k = srcALen % 0x4u;
00193
00194 while(k > 0u)
00195 {
00196
00197 x4 = (q15_t) *pIn1++;
00198 *pScr1++ = x4;
00199
00200
00201 k--;
00202 }
00203
00204 /* Trailing pad: another srcBLen - 1 zeros */
00205 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00206
00207 /* Advance past the trailing zeros (pScr1 is reassigned before its next use) */
00208 pScr1 += (srcBLen - 1u);
00209
00210
00211 /* py remembers the start of the reversed sequence so pScr2 can be rewound */
00212 py = pScratch2;
00213
00214 /* pIn2 is not read again after this assignment */
00215 pIn2 = (q7_t *)py;
00216
00217 pScr2 = py;
00218 /* Outputs are stored starting at pDst[firstIndex] */
00219 pOut = pDst + firstIndex;
00220 /* Start the sliding window firstIndex samples into the padded buffer */
00221 pScratch1 += firstIndex;
00222
00223 /* Number of groups of four output samples */
00224 blkCnt = (numPoints) >> 2;
00225
00226 /* Main loop: four consecutive output samples per pass */
00227 while(blkCnt > 0)
00228 {
00229 /* Window start for this group of outputs */
00230 pScr1 = pScratch1;
00231
00232 /* One accumulator per output sample */
00233 acc0 = 0;
00234 acc1 = 0;
00235 acc2 = 0;
00236 acc3 = 0;
00237
00238 /* Pre-load the start of the window as a 32-bit word (__SIMD32 presumably reinterprets pScr1 as a 32-bit pointer, i.e. two q15_t per read -- see arm_math.h) */
00239 x1 = *__SIMD32(pScr1)++;
00240
00241 /* Pre-load the following word */
00242 x2 = *__SIMD32(pScr1)++;
00243 /* Taps are consumed four at a time */
00244 tapCnt = (srcBLen) >> 2u;
00245
00246 while(tapCnt > 0u)
00247 {
00248
00249 /* Fetch the first tap word of this group; pScr2 itself is advanced only at the end of the pass */
00250 y1 = _SIMD32_OFFSET(pScr2);
00251
00252 /* __SMLAD is an intrinsic from arm_math.h; presumably a dual 16x16 multiply-accumulate */
00253 acc0 = __SMLAD(x1, y1, acc0);
00254 acc2 = __SMLAD(x2, y1, acc2);
00255
00256 /* Combine halves of x1 and x2 (presumably the sample pair starting one sample later); operand order depends on endianness */
00257 #ifndef ARM_MATH_BIG_ENDIAN
00258 x3 = __PKHBT(x2, x1, 0);
00259 #else
00260 x3 = __PKHBT(x1, x2, 0);
00261 #endif
00262
00263 /* acc1 accumulates the combined pair against the same taps */
00264 acc1 = __SMLADX(x3, y1, acc1);
00265
00266 /* Fetch the next word of samples */
00267 x1 = *__SIMD32(pScr1)++;
00268
00269 /* Combined pair feeding acc3 */
00270 #ifndef ARM_MATH_BIG_ENDIAN
00271 x3 = __PKHBT(x1, x2, 0);
00272 #else
00273 x3 = __PKHBT(x2, x1, 0);
00274 #endif
00275
00276 acc3 = __SMLADX(x3, y1, acc3);
00277
00278 /* Second tap word of the group, at offset 2 from pScr2 */
00279 y1 = _SIMD32_OFFSET(pScr2 + 2u);
00280
00281 acc0 = __SMLAD(x2, y1, acc0);
00282
00283 acc2 = __SMLAD(x1, y1, acc2);
00284
00285 acc1 = __SMLADX(x3, y1, acc1);
00286 /* Fetch one more word of samples for acc3 and for the next pass */
00287 x2 = *__SIMD32(pScr1)++;
00288
00289 #ifndef ARM_MATH_BIG_ENDIAN
00290 x3 = __PKHBT(x2, x1, 0);
00291 #else
00292 x3 = __PKHBT(x1, x2, 0);
00293 #endif
00294
00295 acc3 = __SMLADX(x3, y1, acc3);
00296 /* Move on by the four taps just used */
00297 pScr2 += 4u;
00298
00299
00300
00301 tapCnt--;
00302 }
00303
00304
00305 /* Step the sample pointer back by four before handling leftover taps. */
00306 /* TODO confirm: this undoes the read-ahead only if each __SIMD32 read advanced pScr1 by two q15_t */
00307 pScr1 -= 4u;
00308
00309
00310 /* Leftover taps when srcBLen is not a multiple of four */
00311 tapCnt = (srcBLen) & 3u;
00312
00313 while(tapCnt > 0u)
00314 {
00315
00316 /* One tap applied to four consecutive samples, one product per accumulator */
00317 acc0 += (*pScr1++ * *pScr2);
00318 acc1 += (*pScr1++ * *pScr2);
00319 acc2 += (*pScr1++ * *pScr2);
00320 acc3 += (*pScr1++ * *pScr2++);
00321 /* Net advance of one sample per tap */
00322 pScr1 -= 3u;
00323
00324
00325 tapCnt--;
00326 }
00327
00328 blkCnt--;
00329
00330 /* Scale each accumulator down by 7 bits and limit it to 8 bits (__SSAT / __SSATA come from arm_math.h; presumably signed saturation) */
00331 #ifdef CCS
00332 out0 = (q7_t) (__SSATA(acc0, 7u, 8));
00333 out1 = (q7_t) (__SSATA(acc1, 7u, 8));
00334 out2 = (q7_t) (__SSATA(acc2, 7u, 8));
00335 out3 = (q7_t) (__SSATA(acc3, 7u, 8));
00336 #else
00337 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
00338 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
00339 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
00340 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
00341 #endif
00342 /* Pack the four results and store them with a single 32-bit write */
00343 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
00344
00345 /* Rewind the tap pointer for the next group of outputs */
00346 pScr2 = py;
00347 /* Slide the window by four samples */
00348 pScratch1 += 4u;
00349
00350 }
00351
00352 blkCnt = (numPoints) & 0x3;
00353
00354 /* Remaining 0-3 outputs are computed one at a time */
00355 while(blkCnt > 0)
00356 {
00357 /* Window start for this output */
00358 pScr1 = pScratch1;
00359
00360 /* Single accumulator */
00361 acc0 = 0;
00362 /* Taps are consumed in pairs */
00363 tapCnt = (srcBLen) >> 1u;
00364
00365 while(tapCnt > 0u)
00366 {
00367
00368 /* Next word of the padded sequence */
00369 x1 = *__SIMD32(pScr1)++;
00370
00371 /* Next word of the reversed sequence */
00372 y1 = *__SIMD32(pScr2)++;
00373
00374 acc0 = __SMLAD(x1, y1, acc0);
00375
00376
00377 tapCnt--;
00378 }
00379
00380 tapCnt = (srcBLen) & 1u;
00381
00382 /* Final tap when srcBLen is odd */
00383 while(tapCnt > 0u)
00384 {
00385
00386
00387 acc0 += (*pScr1++ * *pScr2++);
00388
00389
00390 tapCnt--;
00391 }
00392
00393 blkCnt--;
00394
00395 /* Scale down by 7 bits, limit to 8 bits and store one result */
00396 #ifdef CCS
00397 *pOut++ = (q7_t) (__SSATA(acc0, 7u, 8));
00398 #else
00399 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00400 #endif
00401
00402 /* Rewind the tap pointer */
00403 pScr2 = py;
00404 /* Slide the window by one sample */
00405 pScratch1 += 1u;
00406
00407 }
00408
00409 /* All requested outputs were produced */
00410 status = ARM_MATH_SUCCESS;
00411
00412
00413 }
00414
00415 return (status);
00416
00417 }
00418
00419 #else
00420
/*
 * Partial convolution of two q7_t sequences using caller-supplied q15_t
 * scratch buffers. This definition is the one compiled when
 * UNALIGNED_SUPPORT_DISABLE is defined; the multiply-accumulate loops use
 * plain q15_t loads and q31_t arithmetic.
 *
 * Parameters:
 *   pSrcA, srcALen  first input sequence and its length
 *   pSrcB, srcBLen  second input sequence and its length
 *   pDst            output buffer; results are stored at
 *                   pDst[firstIndex] .. pDst[firstIndex + numPoints - 1]
 *   firstIndex      index of the first output sample to compute
 *   numPoints       number of output samples to compute
 *   pScratch1       receives (shorter - 1) zeros, the longer input widened
 *                   to q15_t, then (shorter - 1) zeros
 *   pScratch2       receives the shorter input, reversed and widened to q15_t
 *
 * Returns ARM_MATH_ARGUMENT_ERROR when firstIndex + numPoints exceeds
 * srcALen + srcBLen - 1 (nothing is written in that case), otherwise
 * ARM_MATH_SUCCESS.
 *
 * NOTE(review): the four-output loop loads samples ahead of the taps it
 * applies, so it may read a few q15_t beyond the data written to pScratch1;
 * confirm the caller's scratch buffer is sized for that.
 */
00421 arm_status arm_conv_partial_q7(
00422 q7_t * pSrcA,
00423 uint32_t srcALen,
00424 q7_t * pSrcB,
00425 uint32_t srcBLen,
00426 q7_t * pDst,
00427 uint32_t firstIndex,
00428 uint32_t numPoints,
00429 q15_t * pScratch1,
00430 q15_t *pScratch2)
00431 {
00432 /* Working pointers, counters and accumulators */
00433 q15_t *pScr2, *pScr1;
00434 q15_t x4;
00435 q7_t *pIn1, *pIn2;
00436 uint32_t j, k, blkCnt, tapCnt;
00437 q7_t *px;
00438 q15_t *py;
00439 q31_t acc0, acc1, acc2, acc3;
00440 arm_status status;
00441 q7_t *pOut = pDst;
00442 q15_t x10, x11, x20, x21;
00443 q15_t y10, y11;
00444 q7_t out0, out1, out2, out3;
00445
00446 /* Reject a request where firstIndex + numPoints exceeds srcALen + srcBLen - 1 */
00447 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00448 {
00449 /* Only the status is set on this path; no buffer is touched */
00450 status = ARM_MATH_ARGUMENT_ERROR;
00451 }
00452 else
00453 {
00454
00455 /* Arrange for pIn1 to address the longer sequence and pIn2 the shorter, */
00456 /* so that srcALen >= srcBLen holds for the rest of the function. */
00457
00458 if(srcALen >= srcBLen)
00459 {
00460 /* A is at least as long as B: use the inputs as given */
00461 pIn1 = pSrcA;
00462
00463
00464 pIn2 = pSrcB;
00465 }
00466 else
00467 {
00468 /* B is longer: exchange the roles of the two inputs */
00469 pIn1 = pSrcB;
00470
00471
00472 pIn2 = pSrcA;
00473
00474 /* Exchange the lengths as well (j is only a temporary) */
00475 j = srcBLen;
00476 srcBLen = srcALen;
00477 srcALen = j;
00478 }
00479
00480 /* Start writing at the beginning of scratch buffer 2 */
00481 pScr2 = pScratch2;
00482
00483 /* px walks the shorter sequence backwards from its last sample */
00484 px = pIn2 + srcBLen - 1;
00485
00486 /* Number of groups of four samples in the shorter sequence */
00487 k = srcBLen >> 2u;
00488
00489 /* Copy the shorter sequence into scratch 2 in reverse order, */
00490 /* widening each q7_t to q15_t, four samples per pass. */
00491 while(k > 0u)
00492 {
00493
00494 x4 = (q15_t) *px--;
00495 *pScr2++ = x4;
00496 x4 = (q15_t) *px--;
00497 *pScr2++ = x4;
00498 x4 = (q15_t) *px--;
00499 *pScr2++ = x4;
00500 x4 = (q15_t) *px--;
00501 *pScr2++ = x4;
00502
00503
00504 k--;
00505 }
00506
00507
00508 /* Copy the 0-3 samples left over after the groups of four */
00509 k = srcBLen % 0x4u;
00510
00511 while(k > 0u)
00512 {
00513
00514 x4 = (q15_t) *px--;
00515 *pScr2++ = x4;
00516
00517
00518 k--;
00519 }
00520
00521 /* Build the zero-padded copy of the longer sequence in scratch buffer 1 */
00522 pScr1 = pScratch1;
00523
00524 /* Leading pad of srcBLen - 1 zeros. NOTE(review): srcBLen - 1u wraps if the shorter length is 0 -- confirm callers never pass it */
00525 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00526
00527 /* Skip past the leading zeros */
00528 pScr1 += (srcBLen - 1u);
00529
00530
00531 /* Number of groups of four samples in the longer sequence */
00532 k = srcALen >> 2u;
00533
00534
00535 /* Widen the longer sequence to q15_t and append it after the leading zeros */
00536 while(k > 0u)
00537 {
00538
00539 x4 = (q15_t) *pIn1++;
00540 *pScr1++ = x4;
00541 x4 = (q15_t) *pIn1++;
00542 *pScr1++ = x4;
00543 x4 = (q15_t) *pIn1++;
00544 *pScr1++ = x4;
00545 x4 = (q15_t) *pIn1++;
00546 *pScr1++ = x4;
00547
00548
00549 k--;
00550 }
00551
00552
00553 /* Remaining 0-3 samples of the longer sequence */
00554 k = srcALen % 0x4u;
00555
00556 while(k > 0u)
00557 {
00558
00559 x4 = (q15_t) *pIn1++;
00560 *pScr1++ = x4;
00561
00562
00563 k--;
00564 }
00565
00566 /* Trailing pad of srcBLen - 1 zeros, written four at a time first */
00567 k = (srcBLen - 1u) >> 2u;
00568
00569
00570
00571 while(k > 0u)
00572 {
00573
00574 *pScr1++ = 0;
00575 *pScr1++ = 0;
00576 *pScr1++ = 0;
00577 *pScr1++ = 0;
00578
00579
00580 k--;
00581 }
00582
00583
00584 /* Remaining 0-3 zeros of the trailing pad */
00585 k = (srcBLen - 1u) % 0x4u;
00586
00587 while(k > 0u)
00588 {
00589
00590 *pScr1++ = 0;
00591
00592
00593 k--;
00594 }
00595
00596
00597 /* py remembers the start of the reversed sequence so pScr2 can be rewound */
00598 py = pScratch2;
00599
00600 /* pIn2 is not read again after this assignment */
00601 pIn2 = (q7_t *)py;
00602
00603 pScr2 = py;
00604 /* Outputs are stored starting at pDst[firstIndex] */
00605 pOut = pDst + firstIndex;
00606 /* Start the sliding window firstIndex samples into the padded buffer */
00607 pScratch1 += firstIndex;
00608
00609 /* Number of groups of four output samples */
00610 blkCnt = (numPoints) >> 2;
00611
00612 /* Main loop: four consecutive output samples per pass */
00613 while(blkCnt > 0)
00614 {
00615 /* Window start for this group of outputs */
00616 pScr1 = pScratch1;
00617
00618 /* One accumulator per output sample */
00619 acc0 = 0;
00620 acc1 = 0;
00621 acc2 = 0;
00622 acc3 = 0;
00623
00624 /* Pre-load window samples 0 and 1 */
00625 x10 = *pScr1++;
00626 x11 = *pScr1++;
00627
00628 /* Pre-load window samples 2 and 3 */
00629 x20 = *pScr1++;
00630 x21 = *pScr1++;
00631 /* Taps are consumed four at a time */
00632 tapCnt = (srcBLen) >> 2u;
00633
00634 while(tapCnt > 0u)
00635 {
00636
00637 /* First two taps of the current group of four */
00638 y10 = *pScr2;
00639 y11 = *(pScr2 + 1u);
00640
00641 /* acc0 and acc2: these taps against the sample pairs at offsets 0 and 2 */
00642 acc0 += (q31_t) x10 *y10;
00643 acc0 += (q31_t) x11 *y11;
00644 acc2 += (q31_t) x20 *y10;
00645 acc2 += (q31_t) x21 *y11;
00646
00647 /* acc1: same taps against the pair starting one sample later */
00648 acc1 += (q31_t) x11 *y10;
00649 acc1 += (q31_t) x20 *y11;
00650
00651 /* Read the next two samples; pScr1 is advanced at the end of the pass */
00652 x10 = *pScr1;
00653 x11 = *(pScr1 + 1u);
00654
00655 /* acc3: same taps against the pair starting three samples in */
00656 acc3 += (q31_t) x21 * y10;
00657 acc3 += (q31_t) x10 * y11;
00658
00659 /* Last two taps of the group */
00660 y10 = *(pScr2 + 2u);
00661 y11 = *(pScr2 + 3u);
00662
00663 /* Accumulate these taps for acc0, acc2 and acc1 */
00664 acc0 += (q31_t) x20 * y10;
00665 acc0 += (q31_t) x21 * y11;
00666 acc2 += (q31_t) x10 * y10;
00667 acc2 += (q31_t) x11 * y11;
00668 acc1 += (q31_t) x21 * y10;
00669 acc1 += (q31_t) x10 * y11;
00670
00671 /* Read two more samples, needed by acc3 now and by the next pass */
00672 x20 = *(pScr1 + 2);
00673 x21 = *(pScr1 + 3);
00674
00675 /* Finish acc3 for this group of taps */
00676 acc3 += (q31_t) x11 * y10;
00677 acc3 += (q31_t) x20 * y11;
00678
00679
00680 /* Advance both pointers by the four samples and four taps consumed */
00681 pScr1 += 4u;
00682 pScr2 += 4u;
00683
00684
00685 tapCnt--;
00686 }
00687
00688
00689
00690 /* Undo the four-sample read-ahead before handling leftover taps */
00691 pScr1 -= 4u;
00692
00693
00694 /* Leftover taps when srcBLen is not a multiple of four */
00695 tapCnt = (srcBLen) & 3u;
00696
00697 while(tapCnt > 0u)
00698 {
00699
00700 /* One tap applied to four consecutive samples, one product per accumulator */
00701 acc0 += (*pScr1++ * *pScr2);
00702 acc1 += (*pScr1++ * *pScr2);
00703 acc2 += (*pScr1++ * *pScr2);
00704 acc3 += (*pScr1++ * *pScr2++);
00705 /* Net advance of one sample per tap */
00706 pScr1 -= 3u;
00707
00708
00709 tapCnt--;
00710 }
00711
00712 blkCnt--;
00713
00714 /* Scale each accumulator down by 7 bits and limit it to 8 bits (__SSAT / __SSATA come from arm_math.h; presumably signed saturation) */
00715 #ifdef CCS
00716 out0 = (q7_t) (__SSATA(acc0, 7u, 8));
00717 out1 = (q7_t) (__SSATA(acc1, 7u, 8));
00718 out2 = (q7_t) (__SSATA(acc2, 7u, 8));
00719 out3 = (q7_t) (__SSATA(acc3, 7u, 8));
00720 #else
00721 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
00722 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
00723 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
00724 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
00725 #endif
00726 /* Pack the four results and store them with a single 32-bit write. NOTE(review): pOut is pDst + firstIndex, so its alignment depends on the caller, yet this variant is built with UNALIGNED_SUPPORT_DISABLE defined -- confirm a word store is safe here */
00727 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
00728
00729 /* Rewind the tap pointer for the next group of outputs */
00730 pScr2 = py;
00731 /* Slide the window by four samples */
00732 pScratch1 += 4u;
00733
00734 }
00735
00736 blkCnt = (numPoints) & 0x3;
00737
00738 /* Remaining 0-3 outputs are computed one at a time */
00739 while(blkCnt > 0)
00740 {
00741 /* Window start for this output */
00742 pScr1 = pScratch1;
00743
00744 /* Single accumulator */
00745 acc0 = 0;
00746 /* Taps are consumed in pairs */
00747 tapCnt = (srcBLen) >> 1u;
00748
00749 while(tapCnt > 0u)
00750 {
00751
00752 /* Next two samples of the padded sequence */
00753 x10 = *pScr1++;
00754 x11 = *pScr1++;
00755
00756 /* Next two taps of the reversed sequence */
00757 y10 = *pScr2++;
00758 y11 = *pScr2++;
00759
00760 /* Products are formed in q31_t */
00761 acc0 += (q31_t) x10 *y10;
00762 acc0 += (q31_t) x11 *y11;
00763
00764
00765 tapCnt--;
00766 }
00767
00768 tapCnt = (srcBLen) & 1u;
00769
00770 /* Final tap when srcBLen is odd */
00771 while(tapCnt > 0u)
00772 {
00773
00774
00775 acc0 += (*pScr1++ * *pScr2++);
00776
00777
00778 tapCnt--;
00779 }
00780
00781 blkCnt--;
00782
00783 /* Scale down by 7 bits, limit to 8 bits and store one result */
00784 #ifdef CCS
00785 *pOut++ = (q7_t) (__SSATA(acc0, 7u, 8));
00786 #else
00787 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00788 #endif
00789
00790 /* Rewind the tap pointer */
00791 pScr2 = py;
00792 /* Slide the window by one sample */
00793 pScratch1 += 1u;
00794
00795 }
00796
00797 /* All requested outputs were produced */
00798 status = ARM_MATH_SUCCESS;
00799
00800 }
00801
00802 return (status);
00803
00804 }
00805
00806 #endif
00807