diff --git a/libraries/audio/src/AudioHRTF.cpp b/libraries/audio/src/AudioHRTF.cpp index 7895a70595..0f87df2346 100644 --- a/libraries/audio/src/AudioHRTF.cpp +++ b/libraries/audio/src/AudioHRTF.cpp @@ -457,8 +457,12 @@ static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, f interleave_4x4_SSE(src0, src1, src2, src3, dst, numFrames); } +void biquad2_4x4_AVX2(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames); + static void biquad2_4x4(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) { - biquad2_4x4_SSE(src, dst, coef, state, numFrames); + + static auto f = cpuSupportsAVX2() ? biquad2_4x4_AVX2 : biquad2_4x4_SSE; + (*f)(src, dst, coef, state, numFrames); // dispatch } static void crossfade_4x2(float* src, float* dst, const float* win, int numFrames) { diff --git a/libraries/audio/src/avx2/AudioHRTF_avx2.cpp b/libraries/audio/src/avx2/AudioHRTF_avx2.cpp index 501f010236..0c3c6b5096 100644 --- a/libraries/audio/src/avx2/AudioHRTF_avx2.cpp +++ b/libraries/audio/src/avx2/AudioHRTF_avx2.cpp @@ -87,4 +87,49 @@ void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3 _mm256_zeroupper(); } +// process 2 cascaded biquads on 4 channels (interleaved) +// biquads are computed in parallel, by adding one sample of delay +void biquad2_4x4_AVX2(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) { + + // enable flush-to-zero mode to prevent denormals + unsigned int ftz = _MM_GET_FLUSH_ZERO_MODE(); + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + + // restore state + __m256 x0 = _mm256_setzero_ps(); + __m256 y0 = _mm256_loadu_ps(state[0]); + __m256 w1 = _mm256_loadu_ps(state[1]); + __m256 w2 = _mm256_loadu_ps(state[2]); + + // biquad coefs + __m256 b0 = _mm256_loadu_ps(coef[0]); + __m256 b1 = _mm256_loadu_ps(coef[1]); + __m256 b2 = _mm256_loadu_ps(coef[2]); + __m256 a1 = _mm256_loadu_ps(coef[3]); + __m256 a2 = _mm256_loadu_ps(coef[4]); + + for (int i = 0; i < numFrames; i++) { + + // x0 = (first biquad output << 128) | input + x0 = _mm256_insertf128_ps(_mm256_permute2f128_ps(y0, y0, 0x01), _mm_loadu_ps(&src[4*i]), 0); + + // transposed Direct Form II + y0 = _mm256_fmadd_ps(x0, b0, w1); + w1 = _mm256_fmadd_ps(x0, b1, w2); + w2 = _mm256_mul_ps(x0, b2); + w1 = _mm256_fnmadd_ps(y0, a1, w1); + w2 = _mm256_fnmadd_ps(y0, a2, w2); + + _mm_storeu_ps(&dst[4*i], _mm256_extractf128_ps(y0, 1)); // second biquad output + } + + // save state + _mm256_storeu_ps(state[0], y0); + _mm256_storeu_ps(state[1], w1); + _mm256_storeu_ps(state[2], w2); + + _MM_SET_FLUSH_ZERO_MODE(ftz); + _mm256_zeroupper(); +} + #endif