#if defined(RPI)

/* NOTE(review): the header name of the first #include was missing from the
 * source as received. <stdint.h> is used here because this file relies on the
 * fixed-width integer types (int16_t, int32_t, uint32_t) it declares; confirm
 * against the original project. */
#include <stdint.h>
#include "arm_math_rpi.h"

/*
 * 32-bit "SIMD" access helpers: read or write two adjacent 16-bit samples
 * through a single 32-bit access.
 *
 * NOTE(review): both helpers reinterpret a q15_t pointer as an int32_t
 * pointer. That relies on the compiler tolerating the aliasing and on the
 * target accepting accesses at 16-bit-aligned addresses.
 */
#define __SIMD32_TYPE int32_t
#define __SIMD32(addr) (*(__SIMD32_TYPE **) & (addr))
#define _SIMD32_OFFSET(addr) (*(__SIMD32_TYPE *) (addr))

/*
 * C replacement for the PKHBT instruction: the low halfword of the result is
 * the low halfword of ARG1, the high halfword is taken from (ARG2 << ARG3).
 * The shift is carried out on an unsigned value so that a negative ARG2 does
 * not trigger undefined behaviour; the result is converted back to int32_t.
 */
#define __PKHBT(ARG1, ARG2, ARG3) \
  ((int32_t)(((uint32_t)(int32_t)(ARG1) & 0x0000FFFFu) | \
             (((uint32_t)(int32_t)(ARG2) << (ARG3)) & 0xFFFF0000u)))

/*
 * @brief C replacement for the SMLAD instruction.
 *
 * Treats x and y as pairs of signed 16-bit halfwords and returns
 *   lo(x) * lo(y) + hi(x) * hi(y) + sum
 * with the additions performed modulo 2^32.
 */
static uint32_t __SMLAD(
  uint32_t x,
  uint32_t y,
  uint32_t sum)
{
  /* Sign-extend each halfword; each product fits in 32 bits. */
  int32_t xLo = (int16_t)(x & 0xFFFFu);
  int32_t xHi = (int16_t)(x >> 16);
  int32_t yLo = (int16_t)(y & 0xFFFFu);
  int32_t yHi = (int16_t)(y >> 16);

  /* Add in unsigned arithmetic so that overflow wraps instead of being
   * undefined. */
  return ((uint32_t)(xLo * yLo) + (uint32_t)(xHi * yHi) + sum);
}

/*
 * @brief C replacement for the SMLADX instruction.
 *
 * Same as __SMLAD but with the halfwords of y exchanged:
 *   lo(x) * hi(y) + hi(x) * lo(y) + sum
 * with the additions performed modulo 2^32.
 */
static uint32_t __SMLADX(
  uint32_t x,
  uint32_t y,
  uint32_t sum)
{
  int32_t xLo = (int16_t)(x & 0xFFFFu);
  int32_t xHi = (int16_t)(x >> 16);
  int32_t yLo = (int16_t)(y & 0xFFFFu);
  int32_t yHi = (int16_t)(y >> 16);

  return ((uint32_t)(xLo * yHi) + (uint32_t)(xHi * yLo) + sum);
}

/**
 * @brief Q15 FIR interpolator (polyphase form).
 *
 * For every input sample, S->L output samples are produced; each output is a
 * dot product of S->phaseLength state samples with every S->L-th coefficient.
 *
 * @param[in]  S         filter instance; S->pState, S->pCoeffs, S->L and
 *                       S->phaseLength are read, and the state buffer is
 *                       updated in place.
 * @param[in]  pSrc      input buffer, blockSize samples are read.
 * @param[out] pDst      output buffer, blockSize * S->L samples are written.
 * @param[in]  blockSize number of input samples to process.
 */
void arm_fir_interpolate_q15(
  const arm_fir_interpolate_instance_q15 * S,
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  q15_t *pState = S->pState;                     /* State pointer */
  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  q15_t *pStateCurnt;                            /* Points to the current sample of the state */
  q15_t *ptr1, *ptr2;                            /* Temporary pointers for state and coefficient buffers */
  q63_t sum;                                     /* Accumulator */
  q15_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
  uint32_t i, blkCnt, tapCnt;                    /* Loop counters */
  uint16_t phaseLen = S->phaseLength;            /* Length of each polyphase filter component */

  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
  /* pStateCurnt points to the location where the new input data should be written */
  pStateCurnt = S->pState + (phaseLen - 1u);

  /* Total number of input samples */
  blkCnt = blockSize;

  /* Loop over the blockSize. */
  while(blkCnt > 0u)
  {
    /* Copy new input sample into the state buffer */
    *pStateCurnt++ = *pSrc++;

    /* Loop over the interpolation factor: one output per polyphase branch. */
    i = S->L;

    while(i > 0u)
    {
      /* Set accumulator to zero */
      sum = 0;

      /* Initialize state pointer */
      ptr1 = pState;

      /* Initialize coefficient pointer to the first coefficient of branch (i - 1) */
      ptr2 = pCoeffs + (i - 1u);

      /* Loop over the polyphase length */
      tapCnt = (uint32_t) phaseLen;

      while(tapCnt > 0u)
      {
        /* Read the coefficient */
        c0 = *ptr2;

        /* Step the coefficient pointer by the interpolation factor */
        ptr2 += S->L;

        /* Read the input sample */
        x0 = *ptr1++;

        /* Perform the multiply-accumulate */
        sum += ((q31_t) x0 * c0);

        /* Decrement the loop counter */
        tapCnt--;
      }

      /* Store the result after converting to 1.15 format in the destination buffer */
      *pDst++ = (q15_t) (__SSAT((sum >> 15), 16));

      /* Decrement the loop counter */
      i--;
    }

    /* Advance the state pointer by 1
     * to process the next group of interpolation factor number samples */
    pState = pState + 1;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Processing is complete.
   ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
   ** This prepares the state buffer for the next function call. */

  /* Points to the start of the state buffer */
  pStateCurnt = S->pState;

  i = (uint32_t) phaseLen - 1u;

  while(i > 0u)
  {
    *pStateCurnt++ = *pState++;

    /* Decrement the loop counter */
    i--;
  }
}

/**
 * @brief Q15 FIR filter, fast variant using 32-bit accumulators.
 *
 * Output k of a group is sum over j of pCoeffs[j] * state[j + k], where
 * "state" is the window starting at the current state pointer. Four outputs
 * are computed per pass of the main loop; leftover outputs are computed one
 * at a time.
 *
 * NOTE(review): the tap loops process coefficients two or four at a time, so
 * the code assumes S->numTaps is even; the 32-bit accumulators are not
 * protected against overflow.
 *
 * @param[in]  S         filter instance; S->pState, S->pCoeffs and S->numTaps
 *                       are read, and the state buffer is updated in place.
 * @param[in]  pSrc      input buffer, blockSize samples are read.
 * @param[out] pDst      output buffer, blockSize samples are written.
 * @param[in]  blockSize number of samples to process.
 */
void arm_fir_fast_q15(
  const arm_fir_instance_q15 * S,
  q15_t * pSrc,
  q15_t * pDst,
  uint32_t blockSize)
{
  q15_t *pState = S->pState;                     /* State pointer */
  q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
  q15_t *pStateCurnt;                            /* Points to the current sample of the state */
  q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
  q15_t *pb;                                     /* Temporary pointer for coefficient buffer */
  q15_t *px;                                     /* Temporary pointer for state buffer accesses */
  q31_t x0, x1, x2, c0;                          /* Packed pairs of state samples and coefficients */
  uint32_t numTaps = S->numTaps;                 /* Number of taps in the filter */
  uint32_t tapCnt, blkCnt;                       /* Loop counters */

  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  /* pStateCurnt points to the location where the new input data should be written */
  pStateCurnt = &(S->pState[(numTaps - 1u)]);

  /* Apply loop unrolling and compute 4 output values simultaneously.
   * acc0 ... acc3 hold the four outputs being computed; accK uses the state
   * window that starts K samples after pState. */
  blkCnt = blockSize >> 2;

  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
   ** A second loop below computes the remaining 1 to 3 samples. */
  while(blkCnt > 0u)
  {
    /* Copy four new input samples into the state buffer, one at a time. */
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;
    *pStateCurnt++ = *pSrc++;

    /* Set all accumulators to zero */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* Start of the state window for this group of outputs */
    px = pState;

    /* Start of the coefficient array */
    pb = pCoeffs;

    /* Read the first two samples of the window as one 32-bit word */
    x0 = *__SIMD32(px)++;

    /* Read the third and fourth samples of the window as one 32-bit word */
    x2 = *__SIMD32(px)++;

    /* Loop over the number of taps.  Unroll by a factor of 4.
     ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
    tapCnt = numTaps >> 2;

    while(tapCnt > 0)
    {
      /* Read the first two coefficients of this group of four */
      c0 = *__SIMD32(pb)++;

      /* acc0 uses the pair held in x0 */
      acc0 = __SMLAD(x0, c0, acc0);

      /* acc2 uses the pair held in x2 */
      acc2 = __SMLAD(x2, c0, acc2);

      /* Pack the second and third samples (the pair straddling x0 and x2) */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x2, x0, 0);
#else
      x1 = __PKHBT(x0, x2, 0);
#endif

      /* Read the next pair of state samples */
      x0 = _SIMD32_OFFSET(px);

      /* acc1 uses the straddling pair; SMLADX undoes the halfword swap */
      acc1 = __SMLADX(x1, c0, acc1);

      /* Pack the pair straddling x2 and the newly read x0 */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x0, x2, 0);
#else
      x1 = __PKHBT(x2, x0, 0);
#endif

      acc3 = __SMLADX(x1, c0, acc3);

      /* Read the last two coefficients of this group of four */
      c0 = *__SIMD32(pb)++;

      acc0 = __SMLAD(x2, c0, acc0);

      /* Read the following pair of state samples (two samples past px) */
      x2 = _SIMD32_OFFSET(px + 2u);

      acc2 = __SMLAD(x0, c0, acc2);

      /* x1 still holds the pair straddling the old x2 and x0 */
      acc1 = __SMLADX(x1, c0, acc1);

      /* Pack the pair straddling x0 and the newly read x2 */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x2, x0, 0);
#else
      x1 = __PKHBT(x0, x2, 0);
#endif

      acc3 = __SMLADX(x1, c0, acc3);

      /* Update state pointer for next state reading */
      px += 4u;

      /* Decrement tap count */
      tapCnt--;
    }

    /* If the filter length is not a multiple of 4, compute the remaining filter taps.
     ** Exactly two more taps are processed here, which matches an even filter length. */
    if((numTaps & 0x3u) != 0u)
    {
      /* Read last two coefficients */
      c0 = *__SIMD32(pb)++;

      /* Perform the multiply-accumulates */
      acc0 = __SMLAD(x0, c0, acc0);
      acc2 = __SMLAD(x2, c0, acc2);

      /* pack state variables */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x2, x0, 0);
#else
      x1 = __PKHBT(x0, x2, 0);
#endif

      /* Read last state variables */
      x0 = *__SIMD32(px);

      /* Perform the multiply-accumulates */
      acc1 = __SMLADX(x1, c0, acc1);

      /* pack state variables */
#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(x0, x2, 0);
#else
      x1 = __PKHBT(x2, x0, 0);
#endif

      /* Perform the multiply-accumulates */
      acc3 = __SMLADX(x1, c0, acc3);
    }

    /* The results in the 4 accumulators are in 2.30 format.  Convert to 1.15 with saturation.
     ** Then store the 4 outputs in the destination buffer, two per 32-bit write. */
#ifndef ARM_MATH_BIG_ENDIAN
    *__SIMD32(pDst)++ =
      __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
    *__SIMD32(pDst)++ =
      __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
#else
    *__SIMD32(pDst)++ =
      __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
    *__SIMD32(pDst)++ =
      __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* Advance the state pointer by 4 to process the next group of 4 samples */
    pState = pState + 4u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
   ** No loop unrolling is used. */
  blkCnt = blockSize % 0x4u;

  while(blkCnt > 0u)
  {
    /* Copy one new input sample into the state buffer */
    *pStateCurnt++ = *pSrc++;

    /* Set the accumulator to zero */
    acc0 = 0;

    /* Start of the state window and of the coefficient array */
    px = pState;
    pb = pCoeffs;

    /* Two taps per iteration.  A pre-tested loop is used so that a tap-pair
     * count of zero cannot wrap the counter around. */
    tapCnt = numTaps >> 1u;

    while(tapCnt > 0u)
    {
      acc0 += (q31_t) * px++ * *pb++;
      acc0 += (q31_t) * px++ * *pb++;

      tapCnt--;
    }

    /* The result is in 2.30 format.  Convert to 1.15 with saturation.
     ** Then store the output in the destination buffer. */
    *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));

    /* Advance state pointer by 1 for the next sample */
    pState = pState + 1u;

    /* Decrement the loop counter */
    blkCnt--;
  }

  /* Processing is complete.
   ** Now copy the last numTaps - 1 samples to the start of the state buffer.
   ** This prepares the state buffer for the next function call. */

  /* Points to the start of the state buffer */
  pStateCurnt = S->pState;

  /* Number of groups of four samples to copy */
  tapCnt = (numTaps - 1u) >> 2;

  while(tapCnt > 0u)
  {
    *pStateCurnt++ = *pState++;
    *pStateCurnt++ = *pState++;
    *pStateCurnt++ = *pState++;
    *pStateCurnt++ = *pState++;

    tapCnt--;
  }

  /* Number of samples left over after the groups of four */
  tapCnt = (numTaps - 1u) % 0x4u;

  /* copy remaining data */
  while(tapCnt > 0u)
  {
    *pStateCurnt++ = *pState++;

    /* Decrement the loop counter */
    tapCnt--;
  }
}

/**
 * @brief Q31 cascade of direct-form-I biquad stages.
 *
 * Each stage reads five coefficients (b0, b1, b2, a1, a2) and four state
 * values (x[n-1], x[n-2], y[n-1], y[n-2]) and computes
 *   y[n] = (b0*x[n] + b1*x[n-1] + b2*x[n-2] + a1*y[n-1] + a2*y[n-2]) >> lShift
 * with a 64-bit accumulator, where lShift = 32 - (S->postShift + 1).
 * The first stage filters pSrc into pDst; later stages filter pDst in place.
 * Nothing is read or written when S->numStages is zero.
 *
 * @param[in]  S         cascade instance; S->pState, S->pCoeffs, S->postShift
 *                       and S->numStages are read, and the state buffer is
 *                       updated in place.
 * @param[in]  pSrc      input buffer, blockSize samples are read.
 * @param[out] pDst      output buffer, blockSize samples are written.
 * @param[in]  blockSize number of samples to process.
 */
void arm_biquad_cascade_df1_q31(
  const arm_biquad_casd_df1_inst_q31 * S,
  q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q63_t acc;                                     /* accumulator */
  uint32_t uShift = ((uint32_t) S->postShift + 1u);
  uint32_t lShift = 32u - uShift;                /* Shift to be applied to the output */
  q31_t *pIn = pSrc;                             /* input pointer initialization */
  q31_t *pOut = pDst;                            /* output pointer initialization */
  q31_t *pState = S->pState;                     /* pState pointer initialization */
  q31_t *pCoeffs = S->pCoeffs;                   /* coeff pointer initialization */
  q31_t Xn1, Xn2, Yn1, Yn2;                      /* Filter state variables */
  q31_t b0, b1, b2, a1, a2;                      /* Filter coefficients */
  q31_t Xn;                                      /* temporary input */
  uint32_t sample, stage = S->numStages;         /* loop counters */

  /* Pre-tested loop: a stage count of zero must not wrap the counter. */
  while(stage > 0u)
  {
    /* Reading the coefficients */
    b0 = *pCoeffs++;
    b1 = *pCoeffs++;
    b2 = *pCoeffs++;
    a1 = *pCoeffs++;
    a2 = *pCoeffs++;

    /* Reading the state values */
    Xn1 = pState[0];
    Xn2 = pState[1];
    Yn1 = pState[2];
    Yn2 = pState[3];

    sample = blockSize;

    while(sample > 0u)
    {
      /* Read the input */
      Xn = *pIn++;

      /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
      acc = (q63_t) b0 * Xn;
      acc += (q63_t) b1 * Xn1;
      acc += (q63_t) b2 * Xn2;
      acc += (q63_t) a1 * Yn1;
      acc += (q63_t) a2 * Yn2;

      /* Scale the 64-bit result down before truncating it to q31_t */
      acc = acc >> lShift;

      /* Shift the delay line: x[n-2] <- x[n-1] <- x[n], y[n-2] <- y[n-1] <- acc */
      Xn2 = Xn1;
      Xn1 = Xn;
      Yn2 = Yn1;
      Yn1 = (q31_t) acc;

      /* Store the output in the destination buffer. */
      *pOut++ = (q31_t) acc;

      /* decrement the loop counter */
      sample--;
    }

    /* The first stage goes from the input buffer to the output buffer. */
    /* Subsequent stages occur in-place in the output buffer */
    pIn = pDst;

    /* Reset to destination pointer */
    pOut = pDst;

    /* Store the updated state variables back into the pState array */
    *pState++ = Xn1;
    *pState++ = Xn2;
    *pState++ = Yn1;
    *pState++ = Yn2;

    stage--;
  }
}

/**
 * @brief Converts a block of q15_t values to q31_t.
 *
 * Each output is the input multiplied by 65536 (2^16), i.e. the 16-bit value
 * moved into the upper half of the 32-bit result.
 *
 * @param[in]  pSrc      input buffer, blockSize values are read.
 * @param[out] pDst      output buffer, blockSize values are written.
 * @param[in]  blockSize number of values to convert.
 */
void arm_q15_to_q31(
  q15_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
  q15_t *pIn = pSrc;                             /* Src pointer */
  uint32_t blkCnt;                               /* loop counter */

  /* Loop over blockSize number of values */
  blkCnt = blockSize;

  while(blkCnt > 0u)
  {
    /* C = (q31_t)A * 2^16.  A multiplication is used instead of "<< 16"
     * because left-shifting a negative value is undefined in C; the product
     * always fits in 32 bits for a 16-bit input. */
    *pDst++ = (q31_t) *pIn++ * 65536;

    /* Decrement the loop counter */
    blkCnt--;
  }
}

#endif