Cortex-R DSP Software Library: arm_add

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_add_q7.c      
00009 *      
00010 * Description:  Q7 vector addition.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
00053 void arm_add_q7(     
00054   q7_t * pSrcA,     
00055   q7_t * pSrcB,     
00056   q7_t * pDst,     
00057   uint32_t blockSize)     
00058 {     
00059   uint32_t blkCnt;                               /* loop counter */     
00060   q31_t inA1, inA2, inB2, inB1;                  /* temporary input variables */  
00061   q31_t out1, out2, out3, out4;                  /* temporary output variables */  
00062      
00063      
00064   /*loop Unrolling */     
00065   blkCnt = blockSize >> 4u;  
00066     
00067   /* First part of the processing with loop unrolling.  Compute 16 outputs at a time.      
00068    ** a second loop below computes the remaining 1 to 15 samples. */     
00069   while(blkCnt)     
00070   {     
00071     /* C = A + B */     
00072     /* Add and then store the results in the destination buffer. */  
00073     /* read four input samples at a time from sourceA*/     
00074     inA1 = _SIMD32_OFFSET(pSrcA);  
00075     /* read four input samples at a time from sourceB*/     
00076     inB1 = _SIMD32_OFFSET(pSrcB);  
00077     /* read four input samples at a time from sourceA*/     
00078     inA2 = _SIMD32_OFFSET(pSrcA + 4);  
00079   
00080     /* add, saturate and store four samples to destination at a time*/  
00081     out1 = __QADD8(inA1, inB1);  
00082   
00083     /* read four input samples at a time */  
00084     inB2 = _SIMD32_OFFSET(pSrcB + 4);  
00085   
00086     /* store four samples at a time to destination */  
00087     _SIMD32_OFFSET(pDst) = out1;  
00088   
00089     /* add, saturate and store four samples to destination at a time*/  
00090     out2 = __QADD8(inA2, inB2);  
00091   
00092     /* read four input samples at a time from sourceA*/     
00093     inA1 = _SIMD32_OFFSET(pSrcA + 8);  
00094     /* read four input samples at a time from sourceA*/     
00095     inB1 = _SIMD32_OFFSET(pSrcB + 8);  
00096     /* read four input samples at a time from sourceA*/     
00097     inA2 = _SIMD32_OFFSET(pSrcA + 12);  
00098   
00099     /* add, saturate and store four samples to destination at a time*/  
00100     out3 = __QADD8(inA1, inB1);  
00101   
00102     /* read four input samples at a time from sourceB*/     
00103     inB2 = _SIMD32_OFFSET(pSrcB + 12);  
00104   
00105     /* increment sourceA pointer by 16 */  
00106     pSrcA += 16u;  
00107   
00108     /* store four samples at a time to destination */  
00109     _SIMD32_OFFSET(pDst + 4) = out2;  
00110   
00111     /* add, saturate and store four samples to destination at a time*/  
00112     out4 = __QADD8(inA2, inB2);  
00113   
00114     /* store four samples at a time to destination */  
00115     _SIMD32_OFFSET(pDst + 8) = out3;  
00116   
00117     /* increment sourecB pointer by 16 */  
00118     pSrcB += 16u;  
00119   
00120     /* store four samples at a time to destination */  
00121     _SIMD32_OFFSET(pDst + 12) = out4;  
00122   
00123     /* incremnet destination pointer by 16 */  
00124     pDst += 16u;                           
00125   
00126     /* Decrement the loop counter */     
00127     blkCnt--;     
00128   }    
00129      
00130   /* If the blockSize is not a multiple of 16, compute any remaining output samples here.      
00131    ** No loop unrolling is used. */     
00132   blkCnt = blockSize % 0x10u;     
00133      
00134   while(blkCnt > 0u)     
00135   {     
00136     /* C = A + B */     
00137     /* Add and then store the results in the destination buffer. */     
00138 #ifdef CCS   
00139     *pDst++ = (q7_t) __SSATA(*pSrcA++ + *pSrcB++, 0, 8);     
00140 #else   
00141     *pDst++ = (q7_t) __SSAT(*pSrcA++ + *pSrcB++, 8);     
00142 #endif // #ifdef CCS       
00143      
00144     /* Decrement the loop counter */     
00145     blkCnt--;     
00146   }     
00147      
00148 }     
00149