Cortex-R DSP Software Library: arm_mult_f32.c Source File

Go to the documentation of this file.
00001 /* ----------------------------------------------------------------------      
00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 
00003 *      
00004 * $Date:        15. December 2011   
00005 * $Revision:    V2.0.0  
00006 *      
00007 * Project:      Cortex-R DSP Library 
00008 * Title:        arm_mult_f32.c      
00009 *      
00010 * Description:  Floating-point vector multiplication.      
00011 *      
00012 * Target Processor:          Cortex-R4/R5
00013 *
00014 * Version 1.0.0 2011/03/08
00015 *     Alpha release.
00016 *
00017 * Version 1.0.1 2011/09/30
00018 *     Beta release.
00019 *
00020 * Version 2.0.0 2011/12/15
00021 *     Final release. 
00022 * 
00023 * -------------------------------------------------------------------- */     
00024      
00025 #include "arm_math.h"     
00026      
00057 void arm_mult_f32(     
00058   float32_t * pSrcA,     
00059   float32_t * pSrcB,     
00060   float32_t * pDst,     
00061   uint32_t blockSize)     
00062 {     
00063   uint32_t blkCnt;                               /* loop counters */     
00064   float32_t inA1, inA2, inA3, inA4;              /* temporary input variables */  
00065   float32_t inB1, inB2, inB3, inB4;              /* temporary input variables */  
00066   float32_t out1, out2, out3, out4;              /* temporary output variables */  
00067      
00068   /* loop Unrolling */     
00069   blkCnt = blockSize >> 3u;     
00070      
00071   /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.      
00072    ** a second loop below computes the remaining 1 to 7 samples. */     
00073   while(blkCnt > 0u)     
00074   {     
00075     /* C = A * B */     
00076     /* Multiply the inputs and store the results in output buffer */  
00077     /* read sample from sourceA */  
00078     inA1 = *pSrcA;  
00079     /* read sample from sourceB */  
00080     inB1 = *pSrcB;  
00081     /* read sample from sourceA */  
00082     inA2 = *(pSrcA + 1);  
00083     /* read sample from sourceB */  
00084     inB2 = *(pSrcB + 1);  
00085   
00086     /* out = sourceA * sourceB */  
00087     out1 = inA1 * inB1;  
00088   
00089     /* read sample from sourceA */  
00090     inA3 = *(pSrcA + 2);  
00091     /* read sample from sourceB */  
00092     inB3 = *(pSrcB + 2);  
00093   
00094     /* out = sourceA * sourceB */  
00095     out2 = inA2 * inB2;  
00096   
00097     /* read sample from sourceA */  
00098     inA4 = *(pSrcA + 3);  
00099   
00100     /* store result to destination buffer */  
00101     *pDst = out1;  
00102   
00103     /* read sample from sourceB */  
00104     inB4 = *(pSrcB + 3);  
00105   
00106     /* out = sourceA * sourceB */  
00107     out3 = inA3 * inB3;  
00108   
00109     /* read sample from sourceA */  
00110     inA1 = *(pSrcA + 4);  
00111   
00112     /* store result to destination buffer */  
00113     *(pDst + 1) = out2;  
00114   
00115     /* read sample from sourceB */  
00116     inB1 = *(pSrcB + 4);  
00117   
00118     /* out = sourceA * sourceB */  
00119     out4 = inA4 * inB4;  
00120   
00121     /* read sample from sourceA */  
00122     inA2 = *(pSrcA + 5);  
00123   
00124     /* store result to destination buffer */  
00125     *(pDst + 2) = out3;  
00126   
00127     /* read sample from sourceB */  
00128     inB2 = *(pSrcB + 5);  
00129   
00130     /* out = sourceA * sourceB */  
00131     out1 = inA1 * inB1;  
00132   
00133     /* read sample from sourceA */  
00134     inA3 = *(pSrcA + 6);  
00135   
00136     /* store result to destination buffer */  
00137     *(pDst + 3) = out4;  
00138   
00139     /* read sample from sourceB */  
00140     inB3 = *(pSrcB + 6);  
00141   
00142     /* out = sourceA * sourceB */  
00143     out2 = inA2 * inB2;  
00144   
00145     /* read sample from sourceA */  
00146     inA4 = *(pSrcA + 7);  
00147     /* read sample from sourceB */  
00148     inB4 = *(pSrcB + 7);  
00149   
00150     /* store result to destination buffer */  
00151     *(pDst + 4) = out1;  
00152   
00153     /* out = sourceA * sourceB */  
00154     out3 = inA3 * inB3;  
00155   
00156     /* store result to destination buffer */  
00157     *(pDst + 5) = out2;  
00158   
00159     /* out = sourceA * sourceB */  
00160     out4 = inA4 * inB4;  
00161   
00162     /* increment sourceA by 8 to process next samples */  
00163     pSrcA += 8u;  
00164   
00165     /* store result to destination buffer */  
00166     *(pDst + 6) = out3;  
00167   
00168     /* increment sourceB by 8 to process next samples */  
00169     pSrcB += 8u;  
00170   
00171     /* store result to destination buffer */  
00172     *(pDst + 7) = out4;  
00173   
00174     /* increment destination buffer by 8 to process next samples */  
00175     pDst += 8u;  
00176   
00177     /* Decrement the blockSize loop counter */     
00178     blkCnt--;     
00179   }     
00180      
00181   /* If the blockSize is not a multiple of 8, compute any remaining output samples here.      
00182    ** No loop unrolling is used. */     
00183   blkCnt = blockSize % 0x8u;     
00184      
00185   while(blkCnt > 0u)     
00186   {     
00187     /* C = A * B */     
00188     /* Multiply the inputs and store the results in output buffer */     
00189     inA1 = *pSrcA++;  
00190     inB1 = *pSrcB++;  
00191   
00192     *pDst++ = inA1 * inB1;     
00193      
00194     /* Decrement the blockSize loop counter */     
00195     blkCnt--;     
00196   }     
00197 }     
00198