//
//  AudioHRTF_avx2.cpp
//  libraries/audio/src
//
//  Created by Ken Cooke on 1/17/16.
//  Copyright 2016 High Fidelity, Inc.
//
//  Distributed under the Apache License, Version 2.0.
//  See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
//

#ifdef __AVX2__

#include <assert.h>
#include <immintrin.h>

#include "../AudioHRTF.h"

#if defined(__GNUC__) && !defined(__clang__)
// for some reason, GCC -O2 results in poorly optimized code
#pragma GCC optimize("Os")
#endif

// 1 channel input, 4 channel output
void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {

    float* coef0 = coef[0] + HRTF_TAPS - 1;     // process backwards
    float* coef1 = coef[1] + HRTF_TAPS - 1;
    float* coef2 = coef[2] + HRTF_TAPS - 1;
    float* coef3 = coef[3] + HRTF_TAPS - 1;

    assert(numFrames % 8 == 0);

    for (int i = 0; i < numFrames; i += 8) {

        __m256 acc0 = _mm256_setzero_ps();
        __m256 acc1 = _mm256_setzero_ps();
        __m256 acc2 = _mm256_setzero_ps();
        __m256 acc3 = _mm256_setzero_ps();
        __m256 acc4 = _mm256_setzero_ps();
        __m256 acc5 = _mm256_setzero_ps();
        __m256 acc6 = _mm256_setzero_ps();
        __m256 acc7 = _mm256_setzero_ps();

        float* ps = &src[i - HRTF_TAPS + 1];    // process forwards

        static_assert(HRTF_TAPS % 4 == 0, "HRTF_TAPS must be a multiple of 4");
        for (int k = 0; k < HRTF_TAPS; k += 4) {

            __m256 x0 = _mm256_loadu_ps(&ps[k+0]);
            acc0 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef0[-k-0]), x0, acc0);
            acc1 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef1[-k-0]), x0, acc1);
            acc2 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef2[-k-0]), x0, acc2);
            acc3 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef3[-k-0]), x0, acc3);

            __m256 x1 = _mm256_loadu_ps(&ps[k+1]);
            acc4 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef0[-k-1]), x1, acc4);
            acc5 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef1[-k-1]), x1, acc5);
            acc6 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef2[-k-1]), x1, acc6);
            acc7 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef3[-k-1]), x1, acc7);

            __m256 x2 = _mm256_loadu_ps(&ps[k+2]);
            acc0 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef0[-k-2]), x2, acc0);
            acc1 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef1[-k-2]), x2, acc1);
            acc2 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef2[-k-2]), x2, acc2);
            acc3 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef3[-k-2]), x2, acc3);

            __m256 x3 = _mm256_loadu_ps(&ps[k+3]);
            acc4 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef0[-k-3]), x3, acc4);
            acc5 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef1[-k-3]), x3, acc5);
            acc6 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef2[-k-3]), x3, acc6);
            acc7 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef3[-k-3]), x3, acc7);
        }

        acc0 = _mm256_add_ps(acc0, acc4);
        acc1 = _mm256_add_ps(acc1, acc5);
        acc2 = _mm256_add_ps(acc2, acc6);
        acc3 = _mm256_add_ps(acc3, acc7);

        _mm256_storeu_ps(&dst0[i], acc0);
        _mm256_storeu_ps(&dst1[i], acc1);
        _mm256_storeu_ps(&dst2[i], acc2);
        _mm256_storeu_ps(&dst3[i], acc3);
    }

    _mm256_zeroupper();
}
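#if 0
// Reference sketch, not compiled: a plain scalar version of the convolution that
// FIR_1x4_AVX2 computes, kept here only to document the indexing. The backwards
// coef pointers and the forwards src pointer above evaluate the same sum. The name
// refFIR_1x4 and the dst[4] pointer array are illustrative, not part of the API.
static void refFIR_1x4(float* src, float* dst[4], float coef[4][HRTF_TAPS], int numFrames) {
    for (int i = 0; i < numFrames; i++) {
        for (int ch = 0; ch < 4; ch++) {
            float acc = 0.0f;
            // dst[ch][i] = sum over k of coef[ch][k] * src[i - k],
            // where src[] is readable at negative offsets (history before the block)
            for (int k = 0; k < HRTF_TAPS; k++) {
                acc += coef[ch][k] * src[i - k];
            }
            dst[ch][i] = acc;
        }
    }
}
#endif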
// 4 channel planar to interleaved
void interleave_4x4_AVX2(float* src0, float* src1, float* src2, float* src3, float* dst, int numFrames) {

    assert(numFrames % 8 == 0);

    for (int i = 0; i < numFrames; i += 8) {

        __m256 x0 = _mm256_loadu_ps(&src0[i]);
        __m256 x1 = _mm256_loadu_ps(&src1[i]);
        __m256 x2 = _mm256_loadu_ps(&src2[i]);
        __m256 x3 = _mm256_loadu_ps(&src3[i]);

        // interleave (4x4 matrix transpose)
        __m256 t0 = _mm256_unpacklo_ps(x0, x1);
        __m256 t1 = _mm256_unpackhi_ps(x0, x1);
        __m256 t2 = _mm256_unpacklo_ps(x2, x3);
        __m256 t3 = _mm256_unpackhi_ps(x2, x3);

        x0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1,0,1,0));
        x1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3,2,3,2));
        x2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1,0,1,0));
        x3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3,2,3,2));

        t0 = _mm256_permute2f128_ps(x0, x1, 0x20);
        t1 = _mm256_permute2f128_ps(x2, x3, 0x20);
        t2 = _mm256_permute2f128_ps(x0, x1, 0x31);
        t3 = _mm256_permute2f128_ps(x2, x3, 0x31);

        _mm256_storeu_ps(&dst[4*i+0], t0);
        _mm256_storeu_ps(&dst[4*i+8], t1);
        _mm256_storeu_ps(&dst[4*i+16], t2);
        _mm256_storeu_ps(&dst[4*i+24], t3);
    }

    _mm256_zeroupper();
}

// process 2 cascaded biquads on 4 channels (interleaved)
// biquads are computed in parallel, by adding one sample of delay
void biquad2_4x4_AVX2(float* src, float* dst, float coef[5][8], float state[3][8], int numFrames) {

    // enable flush-to-zero mode to prevent denormals
    unsigned int ftz = _MM_GET_FLUSH_ZERO_MODE();
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

    // restore state
    __m256 x0 = _mm256_setzero_ps();
    __m256 y0 = _mm256_loadu_ps(state[0]);
    __m256 w1 = _mm256_loadu_ps(state[1]);
    __m256 w2 = _mm256_loadu_ps(state[2]);

    // biquad coefs
    __m256 b0 = _mm256_loadu_ps(coef[0]);
    __m256 b1 = _mm256_loadu_ps(coef[1]);
    __m256 b2 = _mm256_loadu_ps(coef[2]);
    __m256 a1 = _mm256_loadu_ps(coef[3]);
    __m256 a2 = _mm256_loadu_ps(coef[4]);

    for (int i = 0; i < numFrames; i++) {

        // x0 = (first biquad output << 128) | input
        x0 = _mm256_insertf128_ps(_mm256_permute2f128_ps(y0, y0, 0x01), _mm_loadu_ps(&src[4*i]), 0);

        // transposed Direct Form II
        y0 = _mm256_fmadd_ps(x0, b0, w1);
        w1 = _mm256_fmadd_ps(x0, b1, w2);
        w2 = _mm256_mul_ps(x0, b2);
        w1 = _mm256_fnmadd_ps(y0, a1, w1);
        w2 = _mm256_fnmadd_ps(y0, a2, w2);

        _mm_storeu_ps(&dst[4*i], _mm256_extractf128_ps(y0, 1));    // second biquad output
    }

    // save state
    _mm256_storeu_ps(state[0], y0);
    _mm256_storeu_ps(state[1], w1);
    _mm256_storeu_ps(state[2], w2);

    _MM_SET_FLUSH_ZERO_MODE(ftz);
    _mm256_zeroupper();
}

// crossfade 4 inputs into 2 outputs with accumulation (interleaved)
void crossfade_4x2_AVX2(float* src, float* dst, const float* win, int numFrames) {

    assert(numFrames % 8 == 0);

    for (int i = 0; i < numFrames; i += 8) {

        __m256 f0 = _mm256_loadu_ps(&win[i]);

        __m256 x0 = _mm256_castps128_ps256(_mm_loadu_ps(&src[4*i+0]));
        __m256 x1 = _mm256_castps128_ps256(_mm_loadu_ps(&src[4*i+4]));
        __m256 x2 = _mm256_castps128_ps256(_mm_loadu_ps(&src[4*i+8]));
        __m256 x3 = _mm256_castps128_ps256(_mm_loadu_ps(&src[4*i+12]));

        x0 = _mm256_insertf128_ps(x0, _mm_loadu_ps(&src[4*i+16]), 1);
        x1 = _mm256_insertf128_ps(x1, _mm_loadu_ps(&src[4*i+20]), 1);
        x2 = _mm256_insertf128_ps(x2, _mm_loadu_ps(&src[4*i+24]), 1);
        x3 = _mm256_insertf128_ps(x3, _mm_loadu_ps(&src[4*i+28]), 1);

        __m256 y0 = _mm256_loadu_ps(&dst[2*i+0]);
        __m256 y1 = _mm256_loadu_ps(&dst[2*i+8]);

        // deinterleave (4x4 matrix transpose)
        __m256 t0 = _mm256_unpacklo_ps(x0, x1);
        __m256 t1 = _mm256_unpackhi_ps(x0, x1);
        __m256 t2 = _mm256_unpacklo_ps(x2, x3);
        __m256 t3 = _mm256_unpackhi_ps(x2, x3);

        x0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1,0,1,0));
        x1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3,2,3,2));
        x2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1,0,1,0));
        x3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3,2,3,2));

        // crossfade
        x0 = _mm256_sub_ps(x0, x2);
        x1 = _mm256_sub_ps(x1, x3);
        x2 = _mm256_fmadd_ps(f0, x0, x2);
        x3 = _mm256_fmadd_ps(f0, x1, x3);

        // interleave
        t0 = _mm256_unpacklo_ps(x2, x3);
        t1 = _mm256_unpackhi_ps(x2, x3);

        x0 = _mm256_permute2f128_ps(t0, t1, 0x20);
        x1 = _mm256_permute2f128_ps(t0, t1, 0x31);

        // accumulate
        y0 = _mm256_add_ps(y0, x0);
        y1 = _mm256_add_ps(y1, x1);

        _mm256_storeu_ps(&dst[2*i+0], y0);
        _mm256_storeu_ps(&dst[2*i+8], y1);
    }

    _mm256_zeroupper();
}
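#if 0
// Reference sketch, not compiled: scalar equivalent of crossfade_4x2_AVX2 above.
// src is 4-channel interleaved, dst is 2-channel interleaved, win holds one weight
// per frame; channels 0/1 are weighted by win[i], channels 2/3 by (1 - win[i]),
// and the result is accumulated into dst. The name refCrossfade_4x2 is illustrative.
static void refCrossfade_4x2(float* src, float* dst, const float* win, int numFrames) {
    for (int i = 0; i < numFrames; i++) {
        float f = win[i];
        // out = f*x + (1-f)*y, computed as y + f*(x - y) just like the SIMD code
        dst[2*i+0] += src[4*i+2] + f * (src[4*i+0] - src[4*i+2]);
        dst[2*i+1] += src[4*i+3] + f * (src[4*i+1] - src[4*i+3]);
    }
}
#endif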
// linear interpolation with gain
void interpolate_AVX2(const float* src0, const float* src1, float* dst, float frac, float gain) {

    __m256 f0 = _mm256_set1_ps(gain * (1.0f - frac));
    __m256 f1 = _mm256_set1_ps(gain * frac);

    static_assert(HRTF_TAPS % 8 == 0, "HRTF_TAPS must be a multiple of 8");
    for (int k = 0; k < HRTF_TAPS; k += 8) {

        __m256 x0 = _mm256_loadu_ps(&src0[k]);
        __m256 x1 = _mm256_loadu_ps(&src1[k]);

        x0 = _mm256_mul_ps(f0, x0);
        x0 = _mm256_fmadd_ps(f1, x1, x0);

        _mm256_storeu_ps(&dst[k], x0);
    }

    _mm256_zeroupper();
}

#endif
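#if 0
// Reference sketch, not compiled: scalar equivalent of interpolate_AVX2, shown only
// to spell out the math. Each tap is a linear blend of the two source filters scaled
// by the overall gain: dst[k] = gain * ((1 - frac) * src0[k] + frac * src1[k]).
// The name refInterpolate is illustrative only.
static void refInterpolate(const float* src0, const float* src1, float* dst, float frac, float gain) {
    for (int k = 0; k < HRTF_TAPS; k++) {
        dst[k] = gain * ((1.0f - frac) * src0[k] + frac * src1[k]);
    }
}
#endif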