Prepare for runtime CPU dispatch of all SIMD functions

2025-04-08 15:43:24 +02:00 · 2018-08-29 09:55:54 -07:00 · 2018-08-29 09:55:54 -07:00 · bf348f43aa
commit bf348f43aa
parent 89bd24dd54
1 changed files with 35 additions and 19 deletions
--- a/libraries/audio/src/AudioHRTF.cpp
+++ b/libraries/audio/src/AudioHRTF.cpp
@@ -276,23 +276,8 @@ static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float
    }
 }

-//
-// Runtime CPU dispatch
-//
-
-#include "CPUDetect.h"
-
-void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
-void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
-
-static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
-
-    static auto f = cpuSupportsAVX512() ? FIR_1x4_AVX512 : (cpuSupportsAVX2() ? FIR_1x4_AVX2 : FIR_1x4_SSE);
-    (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames); // dispatch
-}
-
 // 4 channel planar to interleaved
-static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, float* dst, int numFrames) {
+static void interleave_4x4_SSE(float* src0, float* src1, float* src2, float* src3, float* dst, int numFrames) {

    assert(numFrames % 4 == 0);

@@ -323,7 +308,7 @@ static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, f

 // process 2 cascaded biquads on 4 channels (interleaved)
 // biquads computed in parallel, by adding one sample of delay
-static void biquad2_4x4(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) {
+static void biquad2_4x4_SSE(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) {

    // enable flush-to-zero mode to prevent denormals
    unsigned int ftz = _MM_GET_FLUSH_ZERO_MODE();
@@ -388,7 +373,7 @@ static void biquad2_4x4(float* src, float* dst, float coef[5][8], float state[3]
 }

 // crossfade 4 inputs into 2 outputs with accumulation (interleaved)
-static void crossfade_4x2(float* src, float* dst, const float* win, int numFrames) {
+static void crossfade_4x2_SSE(float* src, float* dst, const float* win, int numFrames) {

    assert(numFrames % 4 == 0);

@@ -435,7 +420,7 @@ static void crossfade_4x2(float* src, float* dst, const float* win, int numFrame
 }

 // linear interpolation with gain
-static void interpolate(float* dst, const float* src0, const float* src1, float frac, float gain) {
+static void interpolate_SSE(float* dst, const float* src0, const float* src1, float frac, float gain) {

    __m128 f0 = _mm_set1_ps(gain * (1.0f - frac));
    __m128 f1 = _mm_set1_ps(gain * frac);
@@ -453,6 +438,37 @@ static void interpolate(float* dst, const float* src0, const float* src1, float
    }
 }

+//
+// Runtime CPU dispatch
+//
+
+#include "CPUDetect.h"
+
+void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
+void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
+
+static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {  // entry point: forwards all arguments unchanged to one of the FIR_1x4_* variants
+
+    static auto f = cpuSupportsAVX512() ? FIR_1x4_AVX512 : (cpuSupportsAVX2() ? FIR_1x4_AVX2 : FIR_1x4_SSE);  // function-local static, so the choice is made once; preference order is AVX512, then AVX2, then the SSE fallback
+    (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames); // dispatch: call through the selected function pointer
+}
+
+static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, float* dst, int numFrames) {  // dispatch-section wrapper around interleave_4x4_SSE
+    interleave_4x4_SSE(src0, src1, src2, src3, dst, numFrames);  // always the SSE implementation; no CPU feature check is made here
+}
+
+static void biquad2_4x4(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) {  // dispatch-section wrapper around biquad2_4x4_SSE
+    biquad2_4x4_SSE(src, dst, coef, state, numFrames);  // always the SSE implementation; no CPU feature check is made here
+}
+
+static void crossfade_4x2(float* src, float* dst, const float* win, int numFrames) {  // dispatch-section wrapper around crossfade_4x2_SSE
+    crossfade_4x2_SSE(src, dst, win, numFrames);  // always the SSE implementation; no CPU feature check is made here
+}
+
+static void interpolate(float* dst, const float* src0, const float* src1, float frac, float gain) {  // dispatch-section wrapper around interpolate_SSE
+    interpolate_SSE(dst, src0, src1, frac, gain);  // always the SSE implementation; no CPU feature check is made here
+}
+
 #else   // portable reference code

 // 1 channel input, 4 channel output