diff --git a/libraries/audio/src/AudioHRTF.cpp b/libraries/audio/src/AudioHRTF.cpp index 23adcde0c5..aef4ef3326 100644 --- a/libraries/audio/src/AudioHRTF.cpp +++ b/libraries/audio/src/AudioHRTF.cpp @@ -276,23 +276,8 @@ static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float } } -// -// Runtime CPU dispatch -// - -#include "CPUDetect.h" - -void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames); -void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames); - -static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) { - - static auto f = cpuSupportsAVX512() ? FIR_1x4_AVX512 : (cpuSupportsAVX2() ? FIR_1x4_AVX2 : FIR_1x4_SSE); - (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames); // dispatch -} - // 4 channel planar to interleaved -static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, float* dst, int numFrames) { +static void interleave_4x4_SSE(float* src0, float* src1, float* src2, float* src3, float* dst, int numFrames) { assert(numFrames % 4 == 0); @@ -323,7 +308,7 @@ static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, f // process 2 cascaded biquads on 4 channels (interleaved) // biquads computed in parallel, by adding one sample of delay -static void biquad2_4x4(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) { +static void biquad2_4x4_SSE(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) { // enable flush-to-zero mode to prevent denormals unsigned int ftz = _MM_GET_FLUSH_ZERO_MODE(); @@ -388,7 +373,7 @@ static void biquad2_4x4(float* src, float* dst, float coef[5][8], float state[3] } // crossfade 4 inputs into 2 outputs with accumulation (interleaved) -static void crossfade_4x2(float* src, float* dst, const float* win, int numFrames) { +static void crossfade_4x2_SSE(float* src, float* dst, const float* win, int numFrames) { assert(numFrames % 4 == 0); @@ -435,7 +420,7 @@ static void crossfade_4x2(float* src, float* dst, const float* win, int numFrame } // linear interpolation with gain -static void interpolate(float* dst, const float* src0, const float* src1, float frac, float gain) { +static void interpolate_SSE(float* dst, const float* src0, const float* src1, float frac, float gain) { __m128 f0 = _mm_set1_ps(gain * (1.0f - frac)); __m128 f1 = _mm_set1_ps(gain * frac); @@ -453,6 +438,37 @@ static void interpolate(float* dst, const float* src0, const float* src1, float } } +// +// Runtime CPU dispatch +// + +#include "CPUDetect.h" + +void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames); +void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames); + +static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) { + + static auto f = cpuSupportsAVX512() ? FIR_1x4_AVX512 : (cpuSupportsAVX2() ? FIR_1x4_AVX2 : FIR_1x4_SSE); + (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames); // dispatch +} + +static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, float* dst, int numFrames) { + interleave_4x4_SSE(src0, src1, src2, src3, dst, numFrames); +} + +static void biquad2_4x4(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) { + biquad2_4x4_SSE(src, dst, coef, state, numFrames); +} + +static void crossfade_4x2(float* src, float* dst, const float* win, int numFrames) { + crossfade_4x2_SSE(src, dst, win, numFrames); +} + +static void interpolate(float* dst, const float* src0, const float* src1, float frac, float gain) { + interpolate_SSE(dst, src0, src1, frac, gain); +} + #else // portable reference code // 1 channel input, 4 channel output