From 3d684dd0e72f63dc748ce0c624405ebc1b40a1f7 Mon Sep 17 00:00:00 2001
From: Ken Cooke
Date: Sat, 6 Feb 2016 10:55:23 -0800
Subject: [PATCH] Refactor AVX code for separate compilation and runtime dispatch

---
 libraries/audio/src/AudioHRTF.cpp         | 140 +++++++---------
 libraries/audio/src/avx/AudioHRTF_avx.cpp |  96 +++++++++++++++
 2 files changed, 140 insertions(+), 96 deletions(-)
 create mode 100644 libraries/audio/src/avx/AudioHRTF_avx.cpp

diff --git a/libraries/audio/src/AudioHRTF.cpp b/libraries/audio/src/AudioHRTF.cpp
index a5c1f19241..a8fa7ab747 100644
--- a/libraries/audio/src/AudioHRTF.cpp
+++ b/libraries/audio/src/AudioHRTF.cpp
@@ -64,82 +64,7 @@ static const float crossfadeTable[HRTF_BLOCK] = {
 //
 #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
 
-#include <immintrin.h>  // AVX
-
-#if 0   // AVX disabled for now..
-// 1 channel input, 4 channel output
-static void FIR_1x4_AVX(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
-
-    float* coef0 = coef[0] + HRTF_TAPS - 1;     // process backwards
-    float* coef1 = coef[1] + HRTF_TAPS - 1;
-    float* coef2 = coef[2] + HRTF_TAPS - 1;
-    float* coef3 = coef[3] + HRTF_TAPS - 1;
-
-    assert(numFrames % 8 == 0);
-
-    for (int i = 0; i < numFrames; i += 8) {
-
-        __m256 acc0 = _mm256_setzero_ps();
-        __m256 acc1 = _mm256_setzero_ps();
-        __m256 acc2 = _mm256_setzero_ps();
-        __m256 acc3 = _mm256_setzero_ps();
-
-        float* ps = &src[i - HRTF_TAPS + 1];    // process forwards
-
-        assert(HRTF_TAPS % 8 == 0);
-
-        for (int k = 0; k < HRTF_TAPS; k += 8) {
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-0]), _mm256_loadu_ps(&ps[k+0])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-0]), _mm256_loadu_ps(&ps[k+0])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-0]), _mm256_loadu_ps(&ps[k+0])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-0]), _mm256_loadu_ps(&ps[k+0])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-1]), _mm256_loadu_ps(&ps[k+1])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-1]), _mm256_loadu_ps(&ps[k+1])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-1]), _mm256_loadu_ps(&ps[k+1])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-1]), _mm256_loadu_ps(&ps[k+1])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-2]), _mm256_loadu_ps(&ps[k+2])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-2]), _mm256_loadu_ps(&ps[k+2])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-2]), _mm256_loadu_ps(&ps[k+2])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-2]), _mm256_loadu_ps(&ps[k+2])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-3]), _mm256_loadu_ps(&ps[k+3])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-3]), _mm256_loadu_ps(&ps[k+3])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-3]), _mm256_loadu_ps(&ps[k+3])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-3]), _mm256_loadu_ps(&ps[k+3])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-4]), _mm256_loadu_ps(&ps[k+4])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-4]), _mm256_loadu_ps(&ps[k+4])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-4]), _mm256_loadu_ps(&ps[k+4])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-4]), _mm256_loadu_ps(&ps[k+4])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-5]), _mm256_loadu_ps(&ps[k+5])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-5]), _mm256_loadu_ps(&ps[k+5])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-5]), _mm256_loadu_ps(&ps[k+5])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-5]), _mm256_loadu_ps(&ps[k+5])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-6]), _mm256_loadu_ps(&ps[k+6])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-6]), _mm256_loadu_ps(&ps[k+6])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-6]), _mm256_loadu_ps(&ps[k+6])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-6]), _mm256_loadu_ps(&ps[k+6])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-7]), _mm256_loadu_ps(&ps[k+7])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-7]), _mm256_loadu_ps(&ps[k+7])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-7]), _mm256_loadu_ps(&ps[k+7])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-7]), _mm256_loadu_ps(&ps[k+7])));
-        }
-
-        _mm256_storeu_ps(&dst0[i], acc0);
-        _mm256_storeu_ps(&dst1[i], acc1);
-        _mm256_storeu_ps(&dst2[i], acc2);
-        _mm256_storeu_ps(&dst3[i], acc3);
-    }
-
-    _mm256_zeroupper();
-}
-#endif
+#include <emmintrin.h>
 
 // 1 channel input, 4 channel output
 static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
@@ -193,23 +118,22 @@ static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float
 }
 
 //
-// Runtime CPU dispatch
+// Detect AVX/AVX2 support
 //
-#if 0
-//#if defined(_MSC_VER)
+#if defined(_MSC_VER)
 
 #include <intrin.h>
 
-// detect AVX support
-static bool cpuSupportsAVX() {
+static bool cpuSupportsAVX() {
+
     int info[4];
-    int mask = (1<<27) | (1<<28);   // OSXSAVE and AVX
+    int mask = (1 << 27) | (1 << 28);   // OSXSAVE and AVX
 
     __cpuidex(info, 0x1, 0);
 
     bool result = false;
     if ((info[2] & mask) == mask) {
+
         if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
             result = true;
         }
@@ -217,33 +141,57 @@ static bool cpuSupportsAVX() {
     return result;
 }
 
-typedef void (*t_FIR_1x4)(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
+static bool cpuSupportsAVX2() {
+
+    int info[4];
+    int mask = (1 << 5);                // AVX2
 
-// dispatch stub
-static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
-    static t_FIR_1x4 f = cpuSupportsAVX() ? FIR_1x4_AVX : FIR_1x4_SSE;  // init on first call
-    (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames);  // dispatch
+    bool result = false;
+    if (cpuSupportsAVX()) {
+
+        __cpuidex(info, 0x7, 0);
+
+        if ((info[1] & mask) == mask) {
+            result = true;
+        }
+    }
+    return result;
 }
 
-//#elif defined(__GNU__)
+#elif defined(__GNUC__) || defined(__clang__)
 
-typedef void (*t_FIR_1x4)(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
+static bool cpuSupportsAVX() {
+    return __builtin_cpu_supports("avx");
+}
 
-// dispatch stub
-static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
-    static t_FIR_1x4 f = __builtin_cpu_supports("avx") ? FIR_1x4_AVX : FIR_1x4_SSE;  // init on first call
-    (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames);  // dispatch
+static bool cpuSupportsAVX2() {
+    return __builtin_cpu_supports("avx2");
 }
 
 #else
 
-// always use SSE version
-static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
-    FIR_1x4_SSE(src, dst0, dst1, dst2, dst3, coef, numFrames);
+static bool cpuSupportsAVX() {
+    return false;
+}
+
+static bool cpuSupportsAVX2() {
+    return false;
 }
 
 #endif
 
+//
+// Runtime CPU dispatch
+//
+
+typedef void FIR_1x4_t(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
+FIR_1x4_t FIR_1x4_AVX;  // separate compilation with VEX-encoding enabled
+
+static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
+
+    static FIR_1x4_t* f = cpuSupportsAVX() ? FIR_1x4_AVX : FIR_1x4_SSE;  // init on first call
+    (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames);                  // dispatch
+}
+
 // 4 channel planar to interleaved
 static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, float* dst, int numFrames) {
diff --git a/libraries/audio/src/avx/AudioHRTF_avx.cpp b/libraries/audio/src/avx/AudioHRTF_avx.cpp
new file mode 100644
index 0000000000..b103bf015c
--- /dev/null
+++ b/libraries/audio/src/avx/AudioHRTF_avx.cpp
@@ -0,0 +1,96 @@
+//
+//  AudioHRTF_avx.cpp
+//  libraries/audio/src/avx
+//
+//  Created by Ken Cooke on 1/17/16.
+//  Copyright 2016 High Fidelity, Inc.
+//
+//  Distributed under the Apache License, Version 2.0.
+//  See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
+//
+
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "../AudioHRTF.h"
+
+#ifndef __AVX__
+#error Must be compiled with /arch:AVX or -mavx.
+#endif
+
+// 1 channel input, 4 channel output
+void FIR_1x4_AVX(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
+
+    float* coef0 = coef[0] + HRTF_TAPS - 1;     // process backwards
+    float* coef1 = coef[1] + HRTF_TAPS - 1;
+    float* coef2 = coef[2] + HRTF_TAPS - 1;
+    float* coef3 = coef[3] + HRTF_TAPS - 1;
+
+    assert(numFrames % 8 == 0);
+
+    for (int i = 0; i < numFrames; i += 8) {
+
+        __m256 acc0 = _mm256_setzero_ps();
+        __m256 acc1 = _mm256_setzero_ps();
+        __m256 acc2 = _mm256_setzero_ps();
+        __m256 acc3 = _mm256_setzero_ps();
+
+        float* ps = &src[i - HRTF_TAPS + 1];    // process forwards
+
+        assert(HRTF_TAPS % 8 == 0);
+
+        for (int k = 0; k < HRTF_TAPS; k += 8) {
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-0]), _mm256_loadu_ps(&ps[k+0])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-0]), _mm256_loadu_ps(&ps[k+0])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-0]), _mm256_loadu_ps(&ps[k+0])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-0]), _mm256_loadu_ps(&ps[k+0])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-1]), _mm256_loadu_ps(&ps[k+1])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-1]), _mm256_loadu_ps(&ps[k+1])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-1]), _mm256_loadu_ps(&ps[k+1])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-1]), _mm256_loadu_ps(&ps[k+1])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-2]), _mm256_loadu_ps(&ps[k+2])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-2]), _mm256_loadu_ps(&ps[k+2])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-2]), _mm256_loadu_ps(&ps[k+2])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-2]), _mm256_loadu_ps(&ps[k+2])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-3]), _mm256_loadu_ps(&ps[k+3])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-3]), _mm256_loadu_ps(&ps[k+3])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-3]), _mm256_loadu_ps(&ps[k+3])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-3]), _mm256_loadu_ps(&ps[k+3])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-4]), _mm256_loadu_ps(&ps[k+4])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-4]), _mm256_loadu_ps(&ps[k+4])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-4]), _mm256_loadu_ps(&ps[k+4])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-4]), _mm256_loadu_ps(&ps[k+4])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-5]), _mm256_loadu_ps(&ps[k+5])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-5]), _mm256_loadu_ps(&ps[k+5])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-5]), _mm256_loadu_ps(&ps[k+5])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-5]), _mm256_loadu_ps(&ps[k+5])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-6]), _mm256_loadu_ps(&ps[k+6])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-6]), _mm256_loadu_ps(&ps[k+6])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-6]), _mm256_loadu_ps(&ps[k+6])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-6]), _mm256_loadu_ps(&ps[k+6])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-7]), _mm256_loadu_ps(&ps[k+7])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-7]), _mm256_loadu_ps(&ps[k+7])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-7]), _mm256_loadu_ps(&ps[k+7])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-7]), _mm256_loadu_ps(&ps[k+7])));
+        }
+
+        _mm256_storeu_ps(&dst0[i], acc0);
+        _mm256_storeu_ps(&dst1[i], acc1);
+        _mm256_storeu_ps(&dst2[i], acc2);
+        _mm256_storeu_ps(&dst3[i], acc3);
+    }
+
+    _mm256_zeroupper();
+}
+
+#endif
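
Note (not part of the patch): the new avx/AudioHRTF_avx.cpp must be built with VEX encoding enabled (/arch:AVX on MSVC, -mavx on GCC/Clang), which the #ifndef __AVX__ guard enforces; the matching build-system change is not included in this diff. The patch also adds cpuSupportsAVX2() without a caller yet. A minimal sketch of how the dispatch stub could later prefer an AVX2 kernel, assuming a hypothetical FIR_1x4_AVX2 compiled separately with /arch:AVX2 or -mavx2 (that function does not exist in this patch):

    typedef void FIR_1x4_t(float* src, float* dst0, float* dst1, float* dst2, float* dst3,
                           float coef[4][HRTF_TAPS], int numFrames);
    FIR_1x4_t FIR_1x4_AVX;     // provided by this patch (src/avx/)
    FIR_1x4_t FIR_1x4_AVX2;    // hypothetical future kernel, compiled separately with AVX2 flags

    static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3,
                        float coef[4][HRTF_TAPS], int numFrames) {

        // first call selects the best available kernel; since C++11 the
        // function-local static is initialized exactly once, even with threads
        static FIR_1x4_t* f = cpuSupportsAVX2() ? FIR_1x4_AVX2 :
                              cpuSupportsAVX()  ? FIR_1x4_AVX  :
                                                  FIR_1x4_SSE;
        (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames);    // dispatch
    }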
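
Note (not part of the patch): FIR_1x4 processes numFrames in multiples of 8 and reads HRTF_TAPS - 1 samples of history behind the src pointer (ps = &src[i - HRTF_TAPS + 1]). A minimal caller sketch, assuming the HRTF_TAPS and HRTF_BLOCK constants from AudioHRTF.h and that HRTF_BLOCK is a multiple of 8; illustrative only, since FIR_1x4 is file-static inside AudioHRTF.cpp:

    #include <algorithm>
    #include <vector>

    // Hypothetical caller: one block of mono input convolved into 4 output channels.
    void renderBlock(const float* input, float coef[4][HRTF_TAPS],
                     float dst0[HRTF_BLOCK], float dst1[HRTF_BLOCK],
                     float dst2[HRTF_BLOCK], float dst3[HRTF_BLOCK]) {

        // keep HRTF_TAPS-1 samples of history in front of the block being filtered
        // (zero history here; a real caller carries it across blocks)
        std::vector<float> buf(HRTF_TAPS - 1 + HRTF_BLOCK, 0.0f);
        std::copy(input, input + HRTF_BLOCK, buf.begin() + HRTF_TAPS - 1);

        FIR_1x4(buf.data() + HRTF_TAPS - 1, dst0, dst1, dst2, dst3, coef, HRTF_BLOCK);
    }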