mirror of
https://github.com/lubosz/overte.git
synced 2025-04-08 15:43:24 +02:00
AVX2 implementation of biquad2_4x4()
This commit is contained in:
parent
3e17556cc1
commit
6cc31e7397
2 changed files with 50 additions and 1 deletions
|
@ -457,8 +457,12 @@ static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, f
|
|||
interleave_4x4_SSE(src0, src1, src2, src3, dst, numFrames);
|
||||
}
|
||||
|
||||
void biquad2_4x4_AVX2(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames);
|
||||
|
||||
static void biquad2_4x4(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) {
|
||||
biquad2_4x4_SSE(src, dst, coef, state, numFrames);
|
||||
|
||||
static auto f = cpuSupportsAVX2() ? biquad2_4x4_AVX2 : biquad2_4x4_SSE;
|
||||
(*f)(src, dst, coef, state, numFrames); // dispatch
|
||||
}
|
||||
|
||||
static void crossfade_4x2(float* src, float* dst, const float* win, int numFrames) {
|
||||
|
|
|
@ -87,4 +87,49 @@ void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3
|
|||
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
// process 2 cascaded biquads on 4 channels (interleaved)
|
||||
// biquads are computed in parallel, by adding one sample of delay
|
||||
void biquad2_4x4_AVX2(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) {
|
||||
|
||||
// enable flush-to-zero mode to prevent denormals
|
||||
unsigned int ftz = _MM_GET_FLUSH_ZERO_MODE();
|
||||
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
|
||||
|
||||
// restore state
|
||||
__m256 x0 = _mm256_setzero_ps();
|
||||
__m256 y0 = _mm256_loadu_ps(state[0]);
|
||||
__m256 w1 = _mm256_loadu_ps(state[1]);
|
||||
__m256 w2 = _mm256_loadu_ps(state[2]);
|
||||
|
||||
// biquad coefs
|
||||
__m256 b0 = _mm256_loadu_ps(coef[0]);
|
||||
__m256 b1 = _mm256_loadu_ps(coef[1]);
|
||||
__m256 b2 = _mm256_loadu_ps(coef[2]);
|
||||
__m256 a1 = _mm256_loadu_ps(coef[3]);
|
||||
__m256 a2 = _mm256_loadu_ps(coef[4]);
|
||||
|
||||
for (int i = 0; i < numFrames; i++) {
|
||||
|
||||
// x0 = (first biquad output << 128) | input
|
||||
x0 = _mm256_insertf128_ps(_mm256_permute2f128_ps(y0, y0, 0x01), _mm_loadu_ps(&src[4*i]), 0);
|
||||
|
||||
// transposed Direct Form II
|
||||
y0 = _mm256_fmadd_ps(x0, b0, w1);
|
||||
w1 = _mm256_fmadd_ps(x0, b1, w2);
|
||||
w2 = _mm256_mul_ps(x0, b2);
|
||||
w1 = _mm256_fnmadd_ps(y0, a1, w1);
|
||||
w2 = _mm256_fnmadd_ps(y0, a2, w2);
|
||||
|
||||
_mm_storeu_ps(&dst[4*i], _mm256_extractf128_ps(y0, 1)); // second biquad output
|
||||
}
|
||||
|
||||
// save state
|
||||
_mm256_storeu_ps(state[0], y0);
|
||||
_mm256_storeu_ps(state[1], w1);
|
||||
_mm256_storeu_ps(state[2], w2);
|
||||
|
||||
_MM_SET_FLUSH_ZERO_MODE(ftz);
|
||||
_mm256_zeroupper();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in a new issue