AVX2 implementation of interpolate()

This commit is contained in:
Ken Cooke 2018-09-03 13:06:49 -07:00
parent 96aa0549be
commit 94e8ee99f5
2 changed files with 27 additions and 1 deletion

View file

@ -477,8 +477,12 @@ static void crossfade_4x2(float* src, float* dst, const float* win, int numFrame
(*f)(src, dst, win, numFrames); // dispatch
}
// AVX2 kernel; declared here so the dispatcher below can reference it.
void interpolate_AVX2(const float* src0, const float* src1, float* dst, float frac, float gain);

// Linear interpolation with gain: forwards all arguments unchanged to the
// AVX2 kernel when cpuSupportsAVX2() reports support, otherwise to the SSE kernel.
// Exactly one kernel runs per call.
static void interpolate(const float* src0, const float* src1, float* dst, float frac, float gain) {
    // function-local static: the kernel is selected once, the first time this is called
    static auto f = cpuSupportsAVX2() ? interpolate_AVX2 : interpolate_SSE;
    (*f)(src0, src1, dst, frac, gain); // dispatch
}
#else // portable reference code

View file

@ -226,4 +226,26 @@ void crossfade_4x2_AVX2(float* src, float* dst, const float* win, int numFrames)
_mm256_zeroupper();
}
// Linear interpolation with gain (AVX2 + FMA):
//   dst[k] = gain * (1 - frac) * src0[k] + gain * frac * src1[k],  for k in [0, HRTF_TAPS)
void interpolate_AVX2(const float* src0, const float* src1, float* dst, float frac, float gain) {

    // the loop below consumes 8 floats per step and has no scalar tail
    static_assert(HRTF_TAPS % 8 == 0, "HRTF_TAPS must be a multiple of 8");

    // fold the gain into both blend weights once, then broadcast each to all 8 lanes
    const __m256 weight0 = _mm256_set1_ps(gain * (1.0f - frac));
    const __m256 weight1 = _mm256_set1_ps(gain * frac);

    for (int i = 0; i < HRTF_TAPS; i += 8) {
        // unaligned loads: the buffers are not required to be 32-byte aligned
        const __m256 a = _mm256_loadu_ps(src0 + i);
        const __m256 b = _mm256_loadu_ps(src1 + i);

        // weight1 * b + (weight0 * a); the outer multiply-add is fused
        const __m256 blended = _mm256_fmadd_ps(weight1, b, _mm256_mul_ps(weight0, a));

        _mm256_storeu_ps(dst + i, blended);
    }

    // zero the upper halves of the YMM registers before returning to non-AVX code
    _mm256_zeroupper();
}
#endif