00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2011 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. December 2011 00005 * $Revision: V2.0.0 00006 * 00007 * Project: Cortex-R DSP Library 00008 * Title: arm_dct4_q31.c 00009 * 00010 * Description: Processing function of DCT4 & IDCT4 Q31. 00011 * 00012 * Target Processor: Cortex-R4/R5 00013 * 00014 * Version 1.0.0 2011/03/08 00015 * Alpha release. 00016 * 00017 * Version 1.0.1 2011/09/30 00018 * Beta release. 00019 * 00020 * Version 2.0.0 2011/12/15 00021 * Final release. 00022 * 00023 * -------------------------------------------------------------------- */ 00024 #include "arm_math.h" 00025 00047 void arm_dct4_q31( 00048 const arm_dct4_instance_q31 * S, 00049 q31_t * pState, 00050 q31_t * pInlineBuffer) 00051 { 00052 uint16_t i; /* Loop counter */ 00053 q31_t *weights = S->pTwiddle; /* Pointer to the Weights table */ 00054 q31_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */ 00055 q31_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */ 00056 q31_t in; /* Temporary variable */ 00057 q31_t in1, in2, in3, in4; 00058 00059 00060 /* DCT4 computation involves DCT2 (which is calculated using RFFT) 00061 * along with some pre-processing and post-processing. 00062 * Computational procedure is explained as follows: 00063 * (a) Pre-processing involves multiplying input with cos factor, 00064 * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n)) 00065 * where, 00066 * r(n) -- output of preprocessing 00067 * u(n) -- input to preprocessing(actual Source buffer) 00068 * (b) Calculation of DCT2 using FFT is divided into three steps: 00069 * Step1: Re-ordering of even and odd elements of input. 00070 * Step2: Calculating FFT of the re-ordered input. 00071 * Step3: Taking the real part of the product of FFT output and weights. 00072 * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation: 00073 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00074 * where, 00075 * Y4 -- DCT4 output, Y2 -- DCT2 output 00076 * (d) Multiplying the output with the normalizing factor sqrt(2/N). 00077 */ 00078 00079 /*-------- Pre-processing ------------*/ 00080 /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ 00081 arm_mult_q31(pInlineBuffer, cosFact, pInlineBuffer, S->N); 00082 arm_shift_q31(pInlineBuffer, 1, pInlineBuffer, S->N); 00083 00084 /* ---------------------------------------------------------------- 00085 * Step1: Re-ordering of even and odd elements as 00086 * pState[i] = pInlineBuffer[2*i] and 00087 * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 00088 ---------------------------------------------------------------------*/ 00089 00090 /* pS1 initialized to pState */ 00091 pS1 = pState; 00092 00093 /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ 00094 pS2 = pState + (S->N - 1u); 00095 00096 /* pbuff initialized to input buffer */ 00097 pbuff = pInlineBuffer; 00098 00099 /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ 00100 i = S->Nby2 >> 2u; 00101 00102 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00103 ** a second loop below computes the remaining 1 to 3 samples. */ 00104 do 00105 { 00106 /* Re-ordering of even and odd elements */ 00107 pS2 -= 4u; 00108 /* pState[i] = pInlineBuffer[2*i] */ 00109 in1 = *pbuff++; 00110 *pS1++ = in1; 00111 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 00112 in2 = *pbuff++; 00113 pS2[4] = in2; 00114 in3 = *pbuff++; 00115 *pS1++ = in3; 00116 in4 = *pbuff++; 00117 pS2[3] = in4; 00118 in1 = *pbuff++; 00119 *pS1++ = in1; 00120 in2 = *pbuff++; 00121 pS2[2] = in2; 00122 in3 = *pbuff++; 00123 *pS1++ = in3; 00124 in4 = *pbuff++; 00125 pS2[1] = in4; 00126 00127 00128 00129 /* Decrement the loop counter */ 00130 i--; 00131 } while(i > 0u); 00132 00133 /* pbuff initialized to input buffer */ 00134 pbuff = pInlineBuffer; 00135 00136 /* pS1 initialized to pState */ 00137 pS1 = pState; 00138 00139 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00140 i = S->N >> 2u; 00141 00142 /* Processing with loop unrolling 4 times as N is always multiple of 4. 00143 * Compute 4 outputs at a time */ 00144 do 00145 { 00146 /* Writing the re-ordered output back to inplace input buffer */ 00147 in1 = *pS1++; 00148 *pbuff++ = in1; 00149 in2 = *pS1++; 00150 *pbuff++ = in2; 00151 in3 = *pS1++; 00152 *pbuff++ = in3; 00153 in4 = *pS1++; 00154 *pbuff++ = in4; 00155 00156 /* Decrement the loop counter */ 00157 i--; 00158 } while(i > 0u); 00159 00160 00161 /* --------------------------------------------------------- 00162 * Step2: Calculate RFFT for N-point input 00163 * ---------------------------------------------------------- */ 00164 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 00165 arm_rfft_q31(S->pRfft, pInlineBuffer, pState); 00166 00167 /*---------------------------------------------------------------------- 00168 * Step3: Multiply the FFT output with the weights. 00169 *----------------------------------------------------------------------*/ 00170 arm_cmplx_mult_cmplx_q31(pState, weights, pState, S->N); 00171 00172 /* The output of complex multiplication is in 3.29 format. 00173 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */ 00174 arm_shift_q31(pState, 2, pState, S->N * 2); 00175 00176 /* ----------- Post-processing ---------- */ 00177 /* DCT-IV can be obtained from DCT-II by the equation, 00178 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 00179 * Hence, Y4(0) = Y2(0)/2 */ 00180 /* Getting only real part from the output and Converting to DCT-IV */ 00181 00182 /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ 00183 i = (S->N - 1u) >> 2u; 00184 00185 /* pbuff initialized to input buffer. */ 00186 pbuff = pInlineBuffer; 00187 00188 /* pS1 initialized to pState */ 00189 pS1 = pState; 00190 00191 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 00192 in = *pS1++ >> 1u; 00193 /* input buffer acts as inplace, so output values are stored in the input itself. */ 00194 *pbuff++ = in; 00195 00196 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 00197 pS1++; 00198 00199 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 00200 ** a second loop below computes the remaining 1 to 3 samples. */ 00201 do 00202 { 00203 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00204 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00205 in1 = pS1[0]; 00206 in2 = pS1[2]; 00207 in1 = in1 - in; 00208 in3 = pS1[4]; 00209 in2 = in2 - in1; 00210 in4 = pS1[6]; 00211 *pbuff++ = in1; 00212 00213 in3 = in3 - in2; 00214 *pbuff++ = in2; 00215 in = in4 - in3; 00216 00217 *pbuff++ = in3; 00218 pS1 += 8u; 00219 *pbuff++ = in; 00220 00221 00222 /* Decrement the loop counter */ 00223 i--; 00224 } while(i > 0u); 00225 00226 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 00227 ** No loop unrolling is used. */ 00228 i = (S->N - 1u) % 0x4u; 00229 00230 while(i > 0u) 00231 { 00232 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 00233 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 00234 in = *pS1++ - in; 00235 *pbuff++ = in; 00236 /* points to the next real value */ 00237 pS1++; 00238 00239 /* Decrement the loop counter */ 00240 i--; 00241 } 00242 00243 00244 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 00245 00246 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 00247 i = S->N >> 2u; 00248 00249 /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 00250 pbuff = pInlineBuffer; 00251 00252 /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */ 00253 do 00254 { 00255 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 00256 in1 = pbuff[0]; 00257 in2 = pbuff[1]; 00258 in3 = pbuff[2]; 00259 in4 = pbuff[3]; 00260 00261 in1 = ((q31_t) (((q63_t) in1 * S->normalize) >> 31)); 00262 in2 = ((q31_t) (((q63_t) in2 * S->normalize) >> 31)); 00263 in3 = ((q31_t) (((q63_t) in3 * S->normalize) >> 31)); 00264 in4 = ((q31_t) (((q63_t) in4 * S->normalize) >> 31)); 00265 00266 *pbuff++ = in1; 00267 *pbuff++ = in2; 00268 *pbuff++ = in3; 00269 *pbuff++ = in4; 00270 00271 /* Decrement the loop counter */ 00272 i--; 00273 } while(i > 0u); 00274 00275 } 00276