00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "arm_math.h"
00026
/**
 * Correlation of two Q7 sequences, computed through caller-supplied Q15
 * scratch buffers.
 *
 * Both inputs are widened to q15_t. The longer one is copied into pScratch1
 * with (shorter length - 1) zeros written before and after it; the shorter
 * one is copied into pScratch2. The function then produces
 * srcALen + srcBLen - 1 output samples. Each output is a sum of products of
 * scratch samples, shifted right by 7 and passed through __SSAT(.., 8)
 * before being stored as q7_t.
 *
 * @param pSrcA     first input sequence
 * @param srcALen   number of samples in pSrcA
 * @param pSrcB     second input sequence
 * @param srcBLen   number of samples in pSrcB
 * @param pDst      output buffer.
 *                  If srcALen >= srcBLen, writing starts at
 *                  pDst + (srcALen - srcBLen) and moves forward; the slots
 *                  before that position are not written by this function.
 *                  Otherwise writing starts at pDst + srcALen + srcBLen - 2
 *                  and moves backward.
 * @param pScratch1 q15 work buffer; this function writes
 *                  max(srcALen, srcBLen) + 2 * (min(srcALen, srcBLen) - 1)
 *                  elements into it.
 * @param pScratch2 q15 work buffer; this function writes
 *                  min(srcALen, srcBLen) elements into it.
 *
 * NOTE(review): there is no guard for a zero length. If the shorter length
 * is 0, (srcBLen - 1u) wraps around as an unsigned value and is used as a
 * fill count and pointer offset. Callers presumably must pass non-zero
 * lengths; confirm against the library documentation.
 *
 * NOTE(review): the blocked loops read the scratch buffers through 32-bit
 * accesses (__SIMD32 / _SIMD32_OFFSET) and read pScratch1 ahead of the taps
 * actually consumed, so alignment and some readable slack beyond the written
 * region are presumably required; confirm.
 */
00065 void arm_correlate_q7(
00066 q7_t * pSrcA,
00067 uint32_t srcALen,
00068 q7_t * pSrcB,
00069 uint32_t srcBLen,
00070 q7_t * pDst,
00071 q15_t * pScratch1,
00072 q15_t * pScratch2)
00073 {
00074 q7_t *pOut = pDst; /* output write cursor */
00075 q15_t *pScr1 = pScratch1; /* cursor into scratch 1 (padded longer sequence) */
00076 q15_t *pScr2 = pScratch2; /* cursor into scratch 2 (shorter sequence) */
00077 q7_t *pIn1; /* longer input sequence */
00078 q7_t *pIn2; /* shorter input sequence */
00079 q15_t *py; /* remembers the start of scratch 2 */
00080 q31_t acc0, acc1, acc2, acc3; /* accumulators for four adjacent outputs */
00081 uint32_t j, k = 0u, blkCnt; /* temporaries and loop counters */
00082 int32_t inc = 1; /* output step: +1 forward, -1 backward */
00083 uint32_t outBlockSize; /* output length used to derive the start offset */
00084 q15_t x4; /* one widened input sample */
00085 uint32_t tapCnt; /* inner (tap) loop counter */
00086 q31_t x1, x2, x3, y1; /* packed pairs of q15 samples */
00087
00088 /* Overview: */
00089 /* 1. Choose the longer input as pIn1 and the shorter as pIn2; */
00090 /*    when the inputs are swapped the output is written backward. */
00091 /* 2. Widen pIn2 into scratch 2. */
00092 /* 3. Build scratch 1 as: (srcBLen - 1) zeros, widened pIn1, */
00093 /*    (srcBLen - 1) zeros. */
00094 /* 4. Slide a srcBLen-tap window over scratch 1, four outputs per */
00095 /*    pass, then one output per pass for the remainder. */
00096 /* 5. Scale each accumulator down by 7 bits, saturate, store. */
00097
00098
00099
00100
00101 if(srcALen >= srcBLen)
00102 {
00103 /* A is the longer (or equal) sequence */
00104 pIn1 = (pSrcA);
00105
00106 /* B is the shorter sequence */
00107 pIn2 = (pSrcB);
00108
00109 /* Output length if both inputs had srcALen samples */
00110 outBlockSize = (2u * srcALen) - 1u;
00111
00112 /* j = outBlockSize minus the srcALen + srcBLen - 1 samples that */
00113 /* are actually computed below, i.e. srcALen - srcBLen. */
00114
00115
00116 j = outBlockSize - (srcALen + (srcBLen - 1u));
00117
00118 /* Skip the first j output slots; they are not written here */
00119 pOut += j;
00120
00121 }
00122 else
00123 {
00124 /* B is the longer sequence */
00125 pIn1 = (pSrcB);
00126
00127 /* A is the shorter sequence */
00128 pIn2 = (pSrcA);
00129
00130 /* Swap the lengths so srcALen is the longer one from here on */
00131 j = srcBLen;
00132 srcBLen = srcALen;
00133 srcALen = j;
00134
00135 /* Start at the last of the srcALen + srcBLen - 1 output slots */
00136
00137 pOut = pDst + ((srcALen + srcBLen) - 2u);
00138
00139 /* Inputs were swapped, so store the results in reverse order */
00140 inc = -1;
00141
00142 }
00143
00144
00145 /* Widen the shorter sequence into scratch 2, four samples per pass */
00146 k = srcBLen >> 2u;
00147
00148
00149
00150 while(k > 0u)
00151 {
00152 /* q7 -> q15 conversion is a plain cast (value-preserving) */
00153 x4 = (q15_t) *pIn2++;
00154 *pScr2++ = x4;
00155 x4 = (q15_t) *pIn2++;
00156 *pScr2++ = x4;
00157 x4 = (q15_t) *pIn2++;
00158 *pScr2++ = x4;
00159 x4 = (q15_t) *pIn2++;
00160 *pScr2++ = x4;
00161
00162
00163 k--;
00164 }
00165
00166
00167 /* Copy the 0..3 samples left over from the unrolled loop */
00168 k = srcBLen % 0x4u;
00169
00170 while(k > 0u)
00171 {
00172
00173 x4 = (q15_t) *pIn2++;
00174 *pScr2++ = x4;
00175
00176
00177 k--;
00178 }
00179
00180 /* Leading zero padding of scratch 1: srcBLen - 1 zeros */
00181 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00182
00183 /* Step past the padding just written */
00184 pScr1 += (srcBLen - 1u);
00185
00186 /* Widen the longer sequence into scratch 1, four samples per pass */
00187 k = srcALen >> 2u;
00188
00189
00190
00191 while(k > 0u)
00192 {
00193
00194 x4 = (q15_t) *pIn1++;
00195 *pScr1++ = x4;
00196 x4 = (q15_t) *pIn1++;
00197 *pScr1++ = x4;
00198 x4 = (q15_t) *pIn1++;
00199 *pScr1++ = x4;
00200 x4 = (q15_t) *pIn1++;
00201 *pScr1++ = x4;
00202
00203
00204 k--;
00205 }
00206
00207
00208 /* Copy the 0..3 samples left over from the unrolled loop */
00209 k = srcALen % 0x4u;
00210
00211 while(k > 0u)
00212 {
00213
00214 x4 = (q15_t) *pIn1++;
00215 *pScr1++ = x4;
00216
00217
00218 k--;
00219 }
00220
00221 #ifdef UNALIGNED_SUPPORT_ENABLE
00222
00223 /* Trailing zero padding of scratch 1, via the library fill routine */
00224 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00225
00226
00227 pScr1 += (srcBLen - 1u);
00228
00229 #else
00230
00231 /* Same trailing srcBLen - 1 zeros, written one element at a time */
00232 k = (srcBLen - 1u) >> 2u;
00233
00234
00235
00236 while(k > 0u)
00237 {
00238
00239 *pScr1++ = 0;
00240 *pScr1++ = 0;
00241 *pScr1++ = 0;
00242 *pScr1++ = 0;
00243
00244
00245 k--;
00246 }
00247
00248
00249 /* Remaining 0..3 zeros */
00250 k = (srcBLen - 1u) % 0x4u;
00251
00252 while(k > 0u)
00253 {
00254
00255 *pScr1++ = 0;
00256
00257
00258 k--;
00259 }
00260
00261 #endif
00262
00263 /* Remember the start of scratch 2 so it can be rewound per output */
00264 py = pScratch2;
00265
00266 /* Rewind the scratch 2 cursor to its first element */
00267 pScr2 = pScratch2;
00268
00269 /* Number of passes that each produce four outputs */
00270 blkCnt = (srcALen + srcBLen - 1u) >> 2;
00271
00272 while(blkCnt > 0)
00273 {
00274 /* pScratch1 itself is advanced per pass and marks the window start */
00275 pScr1 = pScratch1;
00276
00277
00278 acc0 = 0;
00279 acc1 = 0;
00280 acc2 = 0;
00281 acc3 = 0;
00282
00283 /* Pre-load window samples 0 and 1 as one 32-bit word */
00284 x1 = *__SIMD32(pScr1)++;
00285
00286 /* Pre-load window samples 2 and 3 */
00287 x2 = *__SIMD32(pScr1)++;
00288 /* Each pass of the inner loop consumes four taps */
00289 tapCnt = (srcBLen) >> 2u;
00290
00291 while(tapCnt > 0u)
00292 {
00293
00294 /* First pair of taps from scratch 2 */
00295 y1 = _SIMD32_OFFSET(pScr2);
00296
00297 /* Even outputs: x1 and x2 are already lined up with the tap pair */
00298 acc0 = __SMLAD(x1, y1, acc0);
00299 acc2 = __SMLAD(x2, y1, acc2);
00300
00301 /* Combine one halfword of x1 and one of x2; operand order is endian-dependent */
00302 #ifndef ARM_MATH_BIG_ENDIAN
00303 x3 = __PKHBT(x2, x1, 0);
00304 #else
00305 x3 = __PKHBT(x1, x2, 0);
00306 #endif
00307
00308 /* Odd output 1 uses the repacked pair with the exchanging variant */
00309 acc1 = __SMLADX(x3, y1, acc1);
00310
00311 /* Load the next two window samples, replacing x1 */
00312 x1 = *__SIMD32(pScr1)++;
00313
00314 /* Repack from x2 and the new x1 for output 3 */
00315 #ifndef ARM_MATH_BIG_ENDIAN
00316 x3 = __PKHBT(x1, x2, 0);
00317 #else
00318 x3 = __PKHBT(x2, x1, 0);
00319 #endif
00320
00321 acc3 = __SMLADX(x3, y1, acc3);
00322
00323 /* Second pair of taps (two q15 elements further on) */
00324 y1 = _SIMD32_OFFSET(pScr2 + 2u);
00325
00326 acc0 = __SMLAD(x2, y1, acc0);
00327
00328 acc2 = __SMLAD(x1, y1, acc2);
00329
00330 acc1 = __SMLADX(x3, y1, acc1);
00331 /* Load the next two window samples, replacing x2 */
00332 x2 = *__SIMD32(pScr1)++;
00333
00334 #ifndef ARM_MATH_BIG_ENDIAN
00335 x3 = __PKHBT(x2, x1, 0);
00336 #else
00337 x3 = __PKHBT(x1, x2, 0);
00338 #endif
00339
00340 acc3 = __SMLADX(x3, y1, acc3);
00341 /* Four taps consumed */
00342 pScr2 += 4u;
00343
00344
00345
00346 tapCnt--;
00347 }
00348
00349
00350 /* x1 and x2 always hold four samples read ahead of the taps used, */
00351 /* so step the window cursor back by those four samples. */
00352 pScr1 -= 4u;
00353
00354
00355 /* Handle the 0..3 taps left over, one tap per pass */
00356 tapCnt = (srcBLen) & 3u;
00357
00358 while(tapCnt > 0u)
00359 {
00360
00361 /* One tap multiplied into four consecutive window samples */
00362 acc0 += (*pScr1++ * *pScr2);
00363 acc1 += (*pScr1++ * *pScr2);
00364 acc2 += (*pScr1++ * *pScr2);
00365 acc3 += (*pScr1++ * *pScr2++);
00366 /* Net advance of one window sample per tap */
00367 pScr1 -= 3u;
00368
00369
00370 tapCnt--;
00371 }
00372
00373 blkCnt--;
00374
00375 #ifdef CCS
00376 /* NOTE(review): __SSATA presumably shifts by 7 and saturates in one step; confirm */
00377 *pOut = (q7_t) (__SSATA(acc0, 7u, 8));
00378 pOut += inc;
00379 *pOut = (q7_t) (__SSATA(acc1, 7u, 8));
00380 pOut += inc;
00381 *pOut = (q7_t) (__SSATA(acc2, 7u, 8));
00382 pOut += inc;
00383 *pOut = (q7_t) (__SSATA(acc3, 7u, 8));
00384 pOut += inc;
00385
00386 #else
00387 /* Drop 7 bits, saturate to 8 bits, store; step by inc */
00388 *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8));
00389 pOut += inc;
00390 *pOut = (q7_t) (__SSAT(acc1 >> 7u, 8));
00391 pOut += inc;
00392 *pOut = (q7_t) (__SSAT(acc2 >> 7u, 8));
00393 pOut += inc;
00394 *pOut = (q7_t) (__SSAT(acc3 >> 7u, 8));
00395 pOut += inc;
00396 #endif
00397
00398
00399 /* Rewind the taps for the next group of outputs */
00400 pScr2 = py;
00401 /* Slide the window start forward by the four outputs just produced */
00402 pScratch1 += 4u;
00403
00404 }
00405
00406 /* Remaining 0..3 outputs are computed one at a time */
00407 blkCnt = (srcALen + srcBLen - 1u) & 0x3;
00408
00409
00410 while(blkCnt > 0)
00411 {
00412
00413 pScr1 = pScratch1;
00414
00415
00416 acc0 = 0;
00417 /* Two taps per pass */
00418 tapCnt = (srcBLen) >> 1u;
00419
00420 while(tapCnt > 0u)
00421 {
00422
00423 /* Two window samples */
00424 x1 = *__SIMD32(pScr1)++;
00425
00426 /* Two taps */
00427 y1 = *__SIMD32(pScr2)++;
00428
00429 acc0 = __SMLAD(x1, y1, acc0);
00430
00431
00432 tapCnt--;
00433 }
00434 /* At most one tap is left when srcBLen is odd */
00435 tapCnt = (srcBLen) & 1u;
00436
00437
00438 while(tapCnt > 0u)
00439 {
00440
00441
00442 acc0 += (*pScr1++ * *pScr2++);
00443
00444
00445 tapCnt--;
00446 }
00447
00448 blkCnt--;
00449
00450 /* Same scaling and saturation as in the four-output loop */
00451 #ifdef CCS
00452 *pOut = (q7_t) (__SSATA(acc0, 7u, 8));
00453 #else
00454 *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8));
00455 #endif
00456 pOut += inc;
00457
00458 /* Rewind the taps for the next output */
00459 pScr2 = py;
00460 /* Slide the window start forward by one sample */
00461 pScratch1 += 1u;
00462
00463 }
00464
00465 }
00466