From dfab91e50a75c80493e3994df0792740dcc538a1 Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Sat, 1 Jun 2019 19:53:18 -0700 Subject: [PATCH] Reduce the performance impact of loopback resampler (AVX2 optimizations) --- libraries/audio/src/avx2/AudioSRC_avx2.cpp | 36 +++++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/libraries/audio/src/avx2/AudioSRC_avx2.cpp b/libraries/audio/src/avx2/AudioSRC_avx2.cpp index 0e31a58ce7..e5ac08746c 100644 --- a/libraries/audio/src/avx2/AudioSRC_avx2.cpp +++ b/libraries/audio/src/avx2/AudioSRC_avx2.cpp @@ -34,15 +34,26 @@ int AudioSRC::multirateFilter1_AVX2(const float* input0, float* output0, int inp const float* c0 = &_polyphaseFilter[_numTaps * _phase]; __m256 acc0 = _mm256_setzero_ps(); + __m256 acc1 = _mm256_setzero_ps(); - for (int j = 0; j < _numTaps; j += 8) { + int j = 0; + for (; j < _numTaps - 15; j += 16) { // unrolled x 2 //float coef = c0[j]; - __m256 coef0 = _mm256_loadu_ps(&c0[j]); + __m256 coef0 = _mm256_loadu_ps(&c0[j + 0]); + __m256 coef1 = _mm256_loadu_ps(&c0[j + 8]); //acc += input[i + j] * coef; + acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j + 0]), coef0, acc0); + acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j + 8]), coef1, acc1); + } + if (j < _numTaps) { + + __m256 coef0 = _mm256_loadu_ps(&c0[j]); + acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0); } + acc0 = _mm256_add_ps(acc0, acc1); // horizontal sum acc0 = _mm256_hadd_ps(acc0, acc0); @@ -73,19 +84,36 @@ int AudioSRC::multirateFilter1_AVX2(const float* input0, float* output0, int inp const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; __m256 acc0 = _mm256_setzero_ps(); + __m256 acc1 = _mm256_setzero_ps(); __m256 frac = _mm256_broadcast_ss(&ftmp); - for (int j = 0; j < _numTaps; j += 8) { + int j = 0; + for (; j < _numTaps - 15; j += 16) { // unrolled x 2 //float coef = c0[j] + frac * (c1[j] - c0[j]); + __m256 coef0 = _mm256_loadu_ps(&c0[j + 0]); + __m256 coef1 = _mm256_loadu_ps(&c1[j + 0]); + __m256 coef2 = _mm256_loadu_ps(&c0[j + 8]); + __m256 coef3 = _mm256_loadu_ps(&c1[j + 8]); + coef1 = _mm256_sub_ps(coef1, coef0); + coef3 = _mm256_sub_ps(coef3, coef2); + coef0 = _mm256_fmadd_ps(coef1, frac, coef0); + coef2 = _mm256_fmadd_ps(coef3, frac, coef2); + + //acc += input[i + j] * coef; + acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j + 0]), coef0, acc0); + acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j + 8]), coef2, acc1); + } + if (j < _numTaps) { + __m256 coef0 = _mm256_loadu_ps(&c0[j]); __m256 coef1 = _mm256_loadu_ps(&c1[j]); coef1 = _mm256_sub_ps(coef1, coef0); coef0 = _mm256_fmadd_ps(coef1, frac, coef0); - //acc += input[i + j] * coef; acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0); } + acc0 = _mm256_add_ps(acc0, acc1); // horizontal sum acc0 = _mm256_hadd_ps(acc0, acc0);