00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "arm_math.h"
00026
00074 void arm_conv_q7(
00075 q7_t * pSrcA,
00076 uint32_t srcALen,
00077 q7_t * pSrcB,
00078 uint32_t srcBLen,
00079 q7_t * pDst,
00080 q15_t * pScratch1,
00081 q15_t *pScratch2 )
00082 {
00083
00084 q15_t *pScr2, *pScr1;
00085 q15_t x4;
00086 q7_t *pIn1, *pIn2;
00087 uint32_t j, k, blkCnt, tapCnt;
00088 q7_t *px;
00089 q15_t *py;
00090 q31_t acc0, acc1, acc2, acc3;
00091 q31_t x1, x2, x3, y1;
00092 q7_t *pOut = pDst;
00093 q7_t out0, out1, out2, out3;
00094
00095
00096
00097
00098 if(srcALen >= srcBLen)
00099 {
00100
00101 pIn1 = pSrcA;
00102
00103
00104 pIn2 = pSrcB;
00105 }
00106 else
00107 {
00108
00109 pIn1 = pSrcB;
00110
00111
00112 pIn2 = pSrcA;
00113
00114
00115 j = srcBLen;
00116 srcBLen = srcALen;
00117 srcALen = j;
00118 }
00119
00120
00121 pScr2 = pScratch2;
00122
00123
00124 px = pIn2 + srcBLen - 1;
00125
00126
00127 k = srcBLen >> 2u;
00128
00129
00130
00131 while(k > 0u)
00132 {
00133
00134 x4 = (q15_t) *px--;
00135 *pScr2++ = x4;
00136 x4 = (q15_t) *px--;
00137 *pScr2++ = x4;
00138 x4 = (q15_t) *px--;
00139 *pScr2++ = x4;
00140 x4 = (q15_t) *px--;
00141 *pScr2++ = x4;
00142
00143
00144 k--;
00145 }
00146
00147
00148
00149 k = srcBLen % 0x4u;
00150
00151 while(k > 0u)
00152 {
00153
00154 x4 = (q15_t) *px--;
00155 *pScr2++ = x4;
00156
00157
00158 k--;
00159 }
00160
00161
00162 pScr1 = pScratch1;
00163
00164
00165 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00166
00167
00168 pScr1 += (srcBLen - 1u);
00169
00170
00171
00172 k = srcALen >> 2u;
00173
00174
00175
00176 while(k > 0u)
00177 {
00178
00179 x4 = (q15_t) *pIn1++;
00180 *pScr1++ = x4;
00181 x4 = (q15_t) *pIn1++;
00182 *pScr1++ = x4;
00183 x4 = (q15_t) *pIn1++;
00184 *pScr1++ = x4;
00185 x4 = (q15_t) *pIn1++;
00186 *pScr1++ = x4;
00187
00188
00189 k--;
00190 }
00191
00192
00193
00194 k = srcALen % 0x4u;
00195
00196 while(k > 0u)
00197 {
00198
00199 x4 = (q15_t) *pIn1++;
00200 *pScr1++ = x4;
00201
00202
00203 k--;
00204 }
00205
00206 #ifndef UNALIGNED_SUPPORT_DISABLE
00207
00208
00209 arm_fill_q15(0, pScr1, (srcBLen - 1u));
00210
00211
00212 pScr1 += (srcBLen - 1u);
00213
00214 #else
00215
00216
00217 k = (srcBLen - 1u) >> 2u;
00218
00219
00220
00221 while(k > 0u)
00222 {
00223
00224 *pScr1++ = 0;
00225 *pScr1++ = 0;
00226 *pScr1++ = 0;
00227 *pScr1++ = 0;
00228
00229
00230 k--;
00231 }
00232
00233
00234
00235 k = (srcBLen - 1u) % 0x4u;
00236
00237 while(k > 0u)
00238 {
00239
00240 *pScr1++ = 0;
00241
00242
00243 k--;
00244 }
00245
00246 #endif
00247
00248
00249 py = pScratch2;
00250
00251
00252 pIn2 = (q7_t *)py;
00253
00254 pScr2 = py;
00255
00256
00257 blkCnt = (srcALen + srcBLen - 1u) >> 2;
00258
00259 while(blkCnt > 0)
00260 {
00261
00262 pScr1 = pScratch1;
00263
00264
00265 acc0 = 0;
00266 acc1 = 0;
00267 acc2 = 0;
00268 acc3 = 0;
00269
00270
00271 x1 = *__SIMD32(pScr1)++;
00272
00273
00274 x2 = *__SIMD32(pScr1)++;
00275
00276 tapCnt = (srcBLen) >> 2u;
00277
00278 while(tapCnt > 0u)
00279 {
00280
00281
00282 y1 = _SIMD32_OFFSET(pScr2);
00283
00284
00285 acc0 = __SMLAD(x1, y1, acc0);
00286 acc2 = __SMLAD(x2, y1, acc2);
00287
00288
00289 #ifndef ARM_MATH_BIG_ENDIAN
00290 x3 = __PKHBT(x2, x1, 0);
00291 #else
00292 x3 = __PKHBT(x1, x2, 0);
00293 #endif
00294
00295
00296 acc1 = __SMLADX(x3, y1, acc1);
00297
00298
00299 x1 = *__SIMD32(pScr1)++;
00300
00301
00302 #ifndef ARM_MATH_BIG_ENDIAN
00303 x3 = __PKHBT(x1, x2, 0);
00304 #else
00305 x3 = __PKHBT(x2, x1, 0);
00306 #endif
00307
00308 acc3 = __SMLADX(x3, y1, acc3);
00309
00310
00311 y1 = _SIMD32_OFFSET(pScr2 + 2u);
00312
00313 acc0 = __SMLAD(x2, y1, acc0);
00314
00315 acc2 = __SMLAD(x1, y1, acc2);
00316
00317 acc1 = __SMLADX(x3, y1, acc1);
00318
00319 x2 = *__SIMD32(pScr1)++;
00320
00321 #ifndef ARM_MATH_BIG_ENDIAN
00322 x3 = __PKHBT(x2, x1, 0);
00323 #else
00324 x3 = __PKHBT(x1, x2, 0);
00325 #endif
00326
00327 acc3 = __SMLADX(x3, y1, acc3);
00328
00329 pScr2 += 4u;
00330
00331
00332
00333 tapCnt--;
00334 }
00335
00336
00337
00338
00339 pScr1 -= 4u;
00340
00341
00342
00343 tapCnt = (srcBLen) & 3u;
00344
00345 while(tapCnt > 0u)
00346 {
00347
00348
00349 acc0 += (*pScr1++ * *pScr2);
00350 acc1 += (*pScr1++ * *pScr2);
00351 acc2 += (*pScr1++ * *pScr2);
00352 acc3 += (*pScr1++ * *pScr2++);
00353
00354 pScr1 -= 3u;
00355
00356
00357 tapCnt--;
00358 }
00359
00360 blkCnt--;
00361
00362
00363 #ifdef CCS
00364 out0 = (q7_t) (__SSATA(acc0, 7u, 8));
00365 out1 = (q7_t) (__SSATA(acc1, 7u, 8));
00366 out2 = (q7_t) (__SSATA(acc2, 7u, 8));
00367 out3 = (q7_t) (__SSATA(acc3, 7u, 8));
00368 #else
00369 out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
00370 out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
00371 out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
00372 out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
00373 #endif
00374
00375 *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
00376
00377
00378 pScr2 = py;
00379
00380 pScratch1 += 4u;
00381
00382 }
00383
00384
00385 blkCnt = (srcALen + srcBLen - 1u) & 0x3;
00386
00387
00388 while(blkCnt > 0)
00389 {
00390
00391 pScr1 = pScratch1;
00392
00393
00394 acc0 = 0;
00395
00396 tapCnt = (srcBLen) >> 1u;
00397
00398 while(tapCnt > 0u)
00399 {
00400
00401
00402 x1 = *__SIMD32(pScr1)++;
00403
00404
00405 y1 = *__SIMD32(pScr2)++;
00406
00407 acc0 = __SMLAD(x1, y1, acc0);
00408
00409
00410 tapCnt--;
00411 }
00412
00413 tapCnt = (srcBLen) & 1u;
00414
00415
00416 while(tapCnt > 0u)
00417 {
00418
00419
00420 acc0 += (*pScr1++ * *pScr2++);
00421
00422
00423 tapCnt--;
00424 }
00425
00426 blkCnt--;
00427
00428
00429 #ifdef CCS
00430 *pOut++ = (q7_t) (__SSATA(acc0, 7u, 8));
00431 #else
00432 *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00433 #endif
00434
00435
00436 pScr2 = py;
00437
00438 pScratch1 += 1u;
00439
00440 }
00441
00442 }
00443