Cortex-R DSP Software Library: arm_sub

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_sub_q7.c      
00009 *      
00010 * Description:  Q7 vector subtraction.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
00053 void arm_sub_q7(     
00054   q7_t * pSrcA,     
00055   q7_t * pSrcB,     
00056   q7_t * pDst,     
00057   uint32_t blockSize)     
00058 {     
00059   uint32_t blkCnt;                               /* loop counter */     
00060   q31_t inA1, inB1, inA2, inB2;                  /* temporary input variabels */  
00061   q7_t inA, inB;                                 /* temporary variables */  
00062   q31_t out1, out2, out3, out4;                  /* temporary output variables */  
00063      
00064      
00065   /*loop Unrolling */     
00066   blkCnt = blockSize >> 4u;     
00067      
00068   /* First part of the processing with loop unrolling.  Compute 16 outputs at a time.      
00069    ** a second loop below computes the remaining 1 to 15 samples. */     
00070   while(blkCnt > 0u)     
00071   {     
00072     /* C = A - B */     
00073     /* Subtract and then store the results in the destination buffer 4 samples at a time. */     
00074     /* read 4 samples at a time from sourceA */  
00075     inA1 = _SIMD32_OFFSET(pSrcA);  
00076     /* read 4 samples at a time from sourceB */  
00077     inB1 = _SIMD32_OFFSET(pSrcB);  
00078     /* read 4 samples at a time from sourceA */  
00079     inA2 = _SIMD32_OFFSET(pSrcA + 4);  
00080   
00081     /* out = saturate(sourceA - sourceB) four samples at a time */  
00082     out1 = __QSUB8(inA1, inB1);  
00083   
00084     /* read 4 samples at a time from sourceB */  
00085     inB2 = _SIMD32_OFFSET(pSrcB + 4);  
00086   
00087     /* store result to destination four samples at a time */  
00088     _SIMD32_OFFSET(pDst) = out1;  
00089   
00090     /* out = saturate(sourceA - sourceB) four samples at a time */  
00091     out2 = __QSUB8(inA2, inB2);  
00092   
00093     /* read 4 samples at a time from sourceA */  
00094     inA1 = _SIMD32_OFFSET(pSrcA + 8);  
00095     /* read 4 samples at a time from sourceB */  
00096     inB1 = _SIMD32_OFFSET(pSrcB + 8);  
00097     /* read 4 samples at a time from sourceA */  
00098     inA2 = _SIMD32_OFFSET(pSrcA + 12);  
00099   
00100     /* out = saturate(sourceA - sourceB) four samples at a time */  
00101     out3 = __QSUB8(inA1, inB1);  
00102   
00103     /* read 4 samples at a time from sourceB */  
00104     inB2 = _SIMD32_OFFSET(pSrcB + 12);  
00105       
00106     /* increment sourceA pointer by 16 to process next samples */  
00107     pSrcA += 16u;  
00108   
00109     /* store result to destination four samples at a time */  
00110     _SIMD32_OFFSET(pDst + 4) = out2;  
00111   
00112     /* out = saturate(sourceA - sourceB) four samples at a time */  
00113     out4 = __QSUB8(inA2, inB2);  
00114       
00115     /* store result to destination four samples at a time */  
00116     _SIMD32_OFFSET(pDst + 8) = out3;  
00117   
00118     /* Update source pointer to process next sampels */  
00119     pSrcB += 16u;  
00120   
00121     /* store result to destination four samples at a time */  
00122     _SIMD32_OFFSET(pDst + 12) = out4;  
00123   
00124     /* Update destination pointer to process next sampels */  
00125     pDst += 16u;  
00126   
00127     /* Decrement the loop counter */     
00128     blkCnt--;     
00129   }     
00130      
00131   /* If the blockSize is not a multiple of 16, compute any remaining output samples here.      
00132    ** No loop unrolling is used. */     
00133   blkCnt = blockSize % 0x10u;     
00134      
00135   while(blkCnt > 0u)     
00136   {     
00137     /* C = A - B */     
00138     /* Subtract and then store the result in the destination buffer. */     
00139     inA = *pSrcA++;  
00140     inB = *pSrcB++;  
00141 #ifdef CCS   
00142     *pDst++ = __SSATA(inA - inB, 0, 8);     
00143 #else   
00144     *pDst++ = __SSAT(inA - inB, 8);     
00145 #endif  //#ifdef CCS   
00146      
00147     /* Decrement the loop counter */     
00148     blkCnt--;     
00149   }     
00150      
00151 }     
00152