00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_dct4_f32.c 00009 * 00010 * Description: Processing function of DCT4 & IDCT4 F32. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 #include "arm_math.h" 00025 00120 void arm_dct4_f32( 00121 const arm_dct4_instance_f32 * S, 00122 float32_t * pState, 00123 float32_t * pInlineBuffer) 00124 { 00125 uint32_t i; /* Loop counter */ 00126 float32_t *weights = S->pTwiddle; /* Pointer to the Weights table */ 00127 float32_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */ 00128 float32_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */ 00129 float32_t in; /* Temporary variable */ 00130 float32_t in1, in2, in3, in4; 00131 00132 00133 /* DCT4 computation involves DCT2 (which is calculated using RFFT) 00134 * along with some pre-processing and post-processing. 00135 * Computational procedure is explained as follows: 00136 * (a) Pre-processing involves multiplying input with cos factor, 00137 * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n)) 00138 * where, 00139 * r(n) -- output of preprocessing 00140 * u(n) -- input to preprocessing(actual Source buffer) 00141 * (b) Calculation of DCT2 using FFT is divided into three steps: 00142 * Step1: Re-ordering of even and odd elements of input. 00143 * Step2: Calculating FFT of the re-ordered input. 00144 * Step3: Taking the real part of the product of FFT output and weights. 00145 * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation: 00146 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00147 * where, 00148 * Y4 -- DCT4 output, Y2 -- DCT2 output 00149 * (d) Multiplying the output with the normalizing factor sqrt(2/N). 00150 */ 00151 00152 /*-------- Pre-processing ------------*/ 00153 /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ 00154 arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N); 00155 arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N); 00156 00157 /* ---------------------------------------------------------------- 00158 * Step1: Re-ordering of even and odd elements as, 00159 * pState[i] = pInlineBuffer[2*i] and 00160 * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 00161 ---------------------------------------------------------------------*/ 00162 00163 /* pS1 initialized to pState */ 00164 pS1 = pState; 00165 00166 /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ 00167 pS2 = pState + (S->N - 1u); 00168 00169 /* pbuff initialized to input buffer */ 00170 pbuff = pInlineBuffer; 00171 00172 /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ 00173 i = (uint32_t) S->Nby2 >> 2u; 00174 00175 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00176 ** a second loop below computes the remaining 1 to 3 samples. */ 00177 do 00178 { 00179 /* Re-ordering of even and odd elements */ 00180 pS2 -= 4u; 00181 /* pState[i] = pInlineBuffer[2*i] */ 00182 in1 = *pbuff++; 00183 *pS1++ = in1; 00184 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 00185 in2 = *pbuff++; 00186 pS2[4] = in2; 00187 in3 = *pbuff++; 00188 *pS1++ = in3; 00189 in4 = *pbuff++; 00190 pS2[3] = in4; 00191 in1 = *pbuff++; 00192 *pS1++ = in1; 00193 in2 = *pbuff++; 00194 pS2[2] = in2; 00195 in3 = *pbuff++; 00196 *pS1++ = in3; 00197 in4 = *pbuff++; 00198 pS2[1] = in4; 00199 00200 /* Decrement the loop counter */ 00201 i--; 00202 } while(i > 0u); 00203 00204 /* pbuff initialized to input buffer */ 00205 pbuff = pInlineBuffer; 00206 00207 /* pS1 initialized to pState */ 00208 pS1 = pState; 00209 00210 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00211 i = (uint32_t) S->N >> 2u; 00212 00213 /* Processing with loop unrolling 4 times as N is always multiple of 4. 00214 * Compute 4 outputs at a time */ 00215 do 00216 { 00217 /* Writing the re-ordered output back to inplace input buffer */ 00218 in1 = *pS1++; 00219 *pbuff++ = in1; 00220 in2 = *pS1++; 00221 *pbuff++ = in2; 00222 in3 = *pS1++; 00223 *pbuff++ = in3; 00224 in4 = *pS1++; 00225 *pbuff++ = in4; 00226 00227 /* Decrement the loop counter */ 00228 i--; 00229 } while(i > 0u); 00230 00231 00232 /* --------------------------------------------------------- 00233 * Step2: Calculate RFFT for N-point input 00234 * ---------------------------------------------------------- */ 00235 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 00236 arm_rfft_f32(S->pRfft, pInlineBuffer, pState); 00237 00238 /*---------------------------------------------------------------------- 00239 * Step3: Multiply the FFT output with the weights. 00240 *----------------------------------------------------------------------*/ 00241 arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N); 00242 00243 /* ----------- Post-processing ---------- */ 00244 /* DCT-IV can be obtained from DCT-II by the equation, 00245 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00246 * Hence, Y4(0) = Y2(0)/2 */ 00247 /* Getting only real part from the output and Converting to DCT-IV */ 00248 00249 /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ 00250 i = ((uint32_t) S->N - 1u) >> 2u; 00251 00252 /* pbuff initialized to input buffer. */ 00253 pbuff = pInlineBuffer; 00254 00255 /* pS1 initialized to pState */ 00256 pS1 = pState; 00257 00258 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 00259 in = *pS1++ * (float32_t) 0.5; 00260 /* input buffer acts as inplace, so output values are stored in the input itself. */ 00261 *pbuff++ = in; 00262 00263 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 00264 pS1++; 00265 00266 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00267 ** a second loop below computes the remaining 1 to 3 samples. */ 00268 do 00269 { 00270 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00271 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00272 in1 = pS1[0]; 00273 in2 = pS1[2]; 00274 00275 in1 = in1 - in; 00276 00277 in3 = pS1[4]; 00278 00279 in2 = in2 - in1; 00280 00281 in4 = pS1[6]; 00282 00283 *pbuff++ = in1; 00284 00285 in3 = in3 - in2; 00286 00287 *pbuff++ = in2; 00288 00289 in = in4 - in3; 00290 00291 *pbuff++ = in3; 00292 *pbuff++ = in; 00293 00294 pS1 += 8u; 00295 00296 /* Decrement the loop counter */ 00297 i--; 00298 } while(i > 0u); 00299 00300 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00301 ** No loop unrolling is used. */ 00302 i = ((uint32_t) S->N - 1u) % 0x4u; 00303 00304 while(i > 0u) 00305 { 00306 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00307 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00308 in = *pS1++ - in; 00309 *pbuff++ = in; 00310 /* points to the next real value */ 00311 pS1++; 00312 00313 /* Decrement the loop counter */ 00314 i--; 00315 } 00316 00317 00318 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 00319 00320 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00321 i = (uint32_t) S->N >> 2u; 00322 00323 /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 00324 pbuff = pInlineBuffer; 00325 00326 /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */ 00327 do 00328 { 00329 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 00330 in1 = pbuff[0]; 00331 in2 = pbuff[1]; 00332 in3 = pbuff[2]; 00333 in4 = pbuff[3]; 00334 00335 in1 = in1 * S->normalize; 00336 in2 = in2 * S->normalize; 00337 in3 = in3 * S->normalize; 00338 in4 = in4 * S->normalize; 00339 00340 pbuff[0] = in1; 00341 pbuff[1] = in2; 00342 pbuff[2] = in3; 00343 pbuff[3] = in4; 00344 00345 pbuff += 4u; 00346 00347 /* Decrement the loop counter */ 00348 i--; 00349 } while(i > 0u); 00350 00351 } 00352