Cortex-R DSP Software Library: arm_add

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_add_f32.c      
00009 *      
00010 * Description:  Floating-point vector addition.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * ---------------------------------------------------------------------------- */     
00024 #include "arm_math.h"     
00025      
00056 void arm_add_f32(     
00057   float32_t * pSrcA,     
00058   float32_t * pSrcB,     
00059   float32_t * pDst,     
00060   uint32_t blockSize)     
00061 {     
00062   uint32_t blkCnt;                               /* loop counter */     
00063   float32_t inA1, inA2, inA3, inA4;              /* temporary input variabels */  
00064   float32_t inB1, inB2, inB3, inB4;              /* temporary input variables */  
00065      
00066   /*loop Unrolling */     
00067   blkCnt = blockSize >> 3u;  
00068      
00069   /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.      
00070    ** a second loop below computes the remaining 1 to 7 samples. */     
00071   while(blkCnt > 0u)     
00072   {     
00073     /* C = A + B */     
00074     /* Add and then store the results in the destination buffer. */  
00075    
00076     /* read four inputs from sourceA and four inputs from sourceB */  
00077     inA1 = *pSrcA;  
00078     inB1 = *pSrcB;  
00079     inA2 = *(pSrcA + 1);  
00080     inB2 = *(pSrcB + 1);  
00081     inA3 = *(pSrcA + 2);  
00082     inB3 = *(pSrcB + 2);  
00083     inA4 = *(pSrcA + 3);  
00084     inB4 = *(pSrcB + 3);  
00085   
00086     /* C = A + B */     
00087     /* add and store result to destination */  
00088     *pDst = inA1 + inB1;  
00089     *(pDst + 1) = inA2 + inB2;  
00090     *(pDst + 2) = inA3 + inB3;  
00091     *(pDst + 3) = inA4 + inB4;  
00092   
00093     /* read four inputs from sourceA and four inputs from sourceB */  
00094     inA1 = *(pSrcA + 4);  
00095     inB1 = *(pSrcB + 4);  
00096     inA2 = *(pSrcA + 5);  
00097     inB2 = *(pSrcB + 5);  
00098     inA3 = *(pSrcA + 6);  
00099     inB3 = *(pSrcB + 6);  
00100     inA4 = *(pSrcA + 7);  
00101     inB4 = *(pSrcB + 7);  
00102   
00103     /* C = A + B */     
00104     /* add and store result to destination */  
00105     *(pDst + 4) = inA1 + inB1;  
00106     *(pDst + 5) = inA2 + inB2;  
00107     *(pDst + 6) = inA3 + inB3;  
00108     *(pDst + 7) = inA4 + inB4;  
00109   
00110     /* Update pointers to process next sampels */  
00111     pSrcA += 8u;  
00112     pSrcB += 8u;  
00113     pDst += 8u;  
00114   
00115     /* Decrement the loop counter */     
00116     blkCnt--;     
00117   }     
00118      
00119   /* If the blockSize is not a multiple of 8, compute any remaining output samples here.      
00120    ** No loop unrolling is used. */     
00121   blkCnt = blockSize % 0x8u;     
00122      
00123   while(blkCnt > 0u)     
00124   {     
00125     /* C = A + B */     
00126     /* Add and then store the results in the destination buffer. */   
00127     inA1 = *pSrcA++;  
00128     inB1 = *pSrcB++;  
00129   
00130     *pDst++ = inA1 + inB1;  
00131      
00132     /* Decrement the loop counter */     
00133     blkCnt--;     
00134   }     
00135 }     
00136