Merge pull request #10780 from kencooke/audio-hrtf-avx512

HRTF optimizations
This commit is contained in:
Brad Hefta-Gaub 2017-06-22 13:34:05 -07:00 committed by GitHub
commit 197006e3d3
7 changed files with 190 additions and 113 deletions

View file

@ -35,6 +35,23 @@ macro(SETUP_HIFI_LIBRARY)
endif()
endforeach()
# add compiler flags to AVX512 source files, if supported by compiler
include(CheckCXXCompilerFlag)
file(GLOB_RECURSE AVX512_SRCS "src/avx512/*.cpp" "src/avx512/*.c")
foreach(SRC ${AVX512_SRCS})
if (WIN32)
check_cxx_compiler_flag("/arch:AVX512" COMPILER_SUPPORTS_AVX512)
if (COMPILER_SUPPORTS_AVX512)
set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS /arch:AVX512)
endif()
elseif (APPLE OR UNIX)
check_cxx_compiler_flag("-mavx512f" COMPILER_SUPPORTS_AVX512)
if (COMPILER_SUPPORTS_AVX512)
set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS -mavx512f)
endif()
endif()
endforeach()
setup_memory_debugger()
# create a library and set the property so it can be referenced later

View file

@ -239,10 +239,11 @@ static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float
#include "CPUDetect.h"
void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
static auto f = cpuSupportsAVX2() ? FIR_1x4_AVX2 : FIR_1x4_SSE;
static auto f = cpuSupportsAVX512() ? FIR_1x4_AVX512 : (cpuSupportsAVX2() ? FIR_1x4_AVX2 : FIR_1x4_SSE);
(*f)(src, dst0, dst1, dst2, dst3, coef, numFrames); // dispatch
}

View file

@ -9,15 +9,11 @@
// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
//
#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
#ifdef __AVX2__
#include <stdint.h>
#include <assert.h>
#include <immintrin.h> // AVX2
#ifndef __AVX2__
#error Must be compiled with /arch:AVX2 or -mavx2 -mfma.
#endif
#include <immintrin.h>
#define _mm256_permute4x64_ps(ymm, imm) _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(ymm), imm));

View file

@ -9,17 +9,13 @@
// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
//
#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
#ifdef __AVX2__
#include <assert.h>
#include <immintrin.h> // AVX2
#include <immintrin.h>
#include "../AudioHRTF.h"
#ifndef __AVX2__
#error Must be compiled with /arch:AVX2 or -mavx2 -mfma.
#endif
#if defined(__GNUC__) && !defined(__clang__)
// for some reason, GCC -O2 results in poorly optimized code
#pragma GCC optimize("Os")

View file

@ -9,17 +9,13 @@
// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
//
#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
#ifdef __AVX2__
#include <assert.h>
#include <immintrin.h>
#include "../AudioSRC.h"
#ifndef __AVX2__
#error Must be compiled with /arch:AVX2 or -mavx2 -mfma.
#endif
// high/low part of int64_t
#define LO32(a) ((uint32_t)(a))
#define HI32(a) ((int32_t)((a) >> 32))

View file

@ -0,0 +1,101 @@
//
// AudioHRTF_avx512.cpp
// libraries/audio/src
//
// Created by Ken Cooke on 6/20/17.
// Copyright 2017 High Fidelity, Inc.
//
// Distributed under the Apache License, Version 2.0.
// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
//
#if defined(__AVX512F__)
#include <assert.h>
#include <immintrin.h>
#include "../AudioHRTF.h"
#if defined(__GNUC__) && !defined(__clang__)
// for some reason, GCC -O2 results in poorly optimized code
#pragma GCC optimize("Os")
#endif
// 1 channel input, 4 channel output
void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
float* coef0 = coef[0] + HRTF_TAPS - 1; // process backwards
float* coef1 = coef[1] + HRTF_TAPS - 1;
float* coef2 = coef[2] + HRTF_TAPS - 1;
float* coef3 = coef[3] + HRTF_TAPS - 1;
assert(numFrames % 16 == 0);
for (int i = 0; i < numFrames; i += 16) {
__m512 acc0 = _mm512_setzero_ps();
__m512 acc1 = _mm512_setzero_ps();
__m512 acc2 = _mm512_setzero_ps();
__m512 acc3 = _mm512_setzero_ps();
__m512 acc4 = _mm512_setzero_ps();
__m512 acc5 = _mm512_setzero_ps();
__m512 acc6 = _mm512_setzero_ps();
__m512 acc7 = _mm512_setzero_ps();
float* ps = &src[i - HRTF_TAPS + 1]; // process forwards
assert(HRTF_TAPS % 4 == 0);
for (int k = 0; k < HRTF_TAPS; k += 4) {
__m512 x0 = _mm512_loadu_ps(&ps[k+0]);
acc0 = _mm512_fmadd_ps(_mm512_set1_ps(coef0[-k-0]), x0, acc0); // vfmadd231ps acc0, x0, dword ptr [coef]{1to16}
acc1 = _mm512_fmadd_ps(_mm512_set1_ps(coef1[-k-0]), x0, acc1);
acc2 = _mm512_fmadd_ps(_mm512_set1_ps(coef2[-k-0]), x0, acc2);
acc3 = _mm512_fmadd_ps(_mm512_set1_ps(coef3[-k-0]), x0, acc3);
__m512 x1 = _mm512_loadu_ps(&ps[k+1]);
acc4 = _mm512_fmadd_ps(_mm512_set1_ps(coef0[-k-1]), x1, acc4);
acc5 = _mm512_fmadd_ps(_mm512_set1_ps(coef1[-k-1]), x1, acc5);
acc6 = _mm512_fmadd_ps(_mm512_set1_ps(coef2[-k-1]), x1, acc6);
acc7 = _mm512_fmadd_ps(_mm512_set1_ps(coef3[-k-1]), x1, acc7);
__m512 x2 = _mm512_loadu_ps(&ps[k+2]);
acc0 = _mm512_fmadd_ps(_mm512_set1_ps(coef0[-k-2]), x2, acc0);
acc1 = _mm512_fmadd_ps(_mm512_set1_ps(coef1[-k-2]), x2, acc1);
acc2 = _mm512_fmadd_ps(_mm512_set1_ps(coef2[-k-2]), x2, acc2);
acc3 = _mm512_fmadd_ps(_mm512_set1_ps(coef3[-k-2]), x2, acc3);
__m512 x3 = _mm512_loadu_ps(&ps[k+3]);
acc4 = _mm512_fmadd_ps(_mm512_set1_ps(coef0[-k-3]), x3, acc4);
acc5 = _mm512_fmadd_ps(_mm512_set1_ps(coef1[-k-3]), x3, acc5);
acc6 = _mm512_fmadd_ps(_mm512_set1_ps(coef2[-k-3]), x3, acc6);
acc7 = _mm512_fmadd_ps(_mm512_set1_ps(coef3[-k-3]), x3, acc7);
}
acc0 = _mm512_add_ps(acc0, acc4);
acc1 = _mm512_add_ps(acc1, acc5);
acc2 = _mm512_add_ps(acc2, acc6);
acc3 = _mm512_add_ps(acc3, acc7);
_mm512_storeu_ps(&dst0[i], acc0);
_mm512_storeu_ps(&dst1[i], acc1);
_mm512_storeu_ps(&dst2[i], acc2);
_mm512_storeu_ps(&dst3[i], acc3);
}
_mm256_zeroupper();
}
// FIXME: this fallback can be removed, once we require VS2017
#elif defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
#include "../AudioHRTF.h"
void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
FIR_1x4_AVX2(src, dst0, dst1, dst2, dst3, coef, numFrames);
}
#endif

View file

@ -2,8 +2,8 @@
// CPUDetect.h
// libraries/shared/src
//
// Created by Ken Cooke on 6/6/16.
// Copyright 2016 High Fidelity, Inc.
// Created by Ken Cooke on 6/16/17.
// Copyright 2017 High Fidelity, Inc.
//
// Distributed under the Apache License, Version 2.0.
// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
@ -13,28 +13,68 @@
#define hifi_CPUDetect_h
//
// Lightweight functions to detect SSE/AVX/AVX2 support
// Lightweight functions to detect SSE/AVX/AVX2/AVX512 support
//
#define MASK_SSE3 (1 << 0) // SSE3
#define MASK_SSSE3 (1 << 9) // SSSE3
#define MASK_SSE41 (1 << 19) // SSE4.1
#define MASK_SSE42 ((1 << 20) | (1 << 23)) // SSE4.2 and POPCNT
#define MASK_OSXSAVE (1 << 27) // OSXSAVE
#define MASK_AVX ((1 << 27) | (1 << 28)) // OSXSAVE and AVX
#define MASK_AVX2 (1 << 5) // AVX2
#define MASK_AVX512 ((1 << 16) | (1 << 17) | (1 << 28) | (1 << 30) | (1 << 31)) // AVX512 F,DQ,CD,BW,VL (SKX)
#define MASK_XCR0_YMM ((1 << 1) | (1 << 2)) // XMM,YMM
#define MASK_XCR0_ZMM ((1 << 1) | (1 << 2) | (7 << 5)) // XMM,YMM,ZMM
#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
#define ARCH_X86
#endif
#define MASK_SSE3 (1 << 0) // SSE3
#define MASK_SSSE3 (1 << 9) // SSSE3
#define MASK_SSE41 (1 << 19) // SSE4.1
#define MASK_SSE42 ((1 << 20) | (1 << 23)) // SSE4.2 and POPCNT
#define MASK_AVX ((1 << 27) | (1 << 28)) // OSXSAVE and AVX
#define MASK_AVX2 (1 << 5) // AVX2
#if defined(ARCH_X86) && defined(_MSC_VER)
#include <intrin.h>
// use MSVC intrinsics
#define cpuidex(info, eax, ecx) __cpuidex(info, eax, ecx)
#define xgetbv(ecx) _xgetbv(ecx)
#elif defined(ARCH_X86) && defined(__GNUC__)
#include <cpuid.h>
// use GCC intrinics/asm
static inline void cpuidex(int info[4], int eax, int ecx) {
__cpuid_count(eax, ecx, info[0], info[1], info[2], info[3]);
}
static inline unsigned long long xgetbv(unsigned int ecx){
unsigned int eax, edx;
__asm__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
return ((unsigned long long)edx << 32) | eax;
}
#else
static inline void cpuidex(int info[4], int eax, int ecx) {
info[0] = 0;
info[1] = 0;
info[2] = 0;
info[3] = 0;
}
static inline unsigned long long xgetbv(unsigned int ecx){
return 0ULL;
}
#endif
static inline bool cpuSupportsSSE3() {
int info[4];
__cpuidex(info, 0x1, 0);
cpuidex(info, 0x1, 0);
return ((info[2] & MASK_SSE3) == MASK_SSE3);
}
@ -42,7 +82,7 @@ static inline bool cpuSupportsSSE3() {
static inline bool cpuSupportsSSSE3() {
int info[4];
__cpuidex(info, 0x1, 0);
cpuidex(info, 0x1, 0);
return ((info[2] & MASK_SSSE3) == MASK_SSSE3);
}
@ -50,7 +90,7 @@ static inline bool cpuSupportsSSSE3() {
static inline bool cpuSupportsSSE41() {
int info[4];
__cpuidex(info, 0x1, 0);
cpuidex(info, 0x1, 0);
return ((info[2] & MASK_SSE41) == MASK_SSE41);
}
@ -58,7 +98,7 @@ static inline bool cpuSupportsSSE41() {
static inline bool cpuSupportsSSE42() {
int info[4];
__cpuidex(info, 0x1, 0);
cpuidex(info, 0x1, 0);
return ((info[2] & MASK_SSE42) == MASK_SSE42);
}
@ -66,13 +106,13 @@ static inline bool cpuSupportsSSE42() {
static inline bool cpuSupportsAVX() {
int info[4];
__cpuidex(info, 0x1, 0);
cpuidex(info, 0x1, 0);
bool result = false;
if ((info[2] & MASK_AVX) == MASK_AVX) {
// verify OS support for YMM state
if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
if ((xgetbv(0) & MASK_XCR0_YMM) == MASK_XCR0_YMM) {
result = true;
}
}
@ -85,7 +125,7 @@ static inline bool cpuSupportsAVX2() {
bool result = false;
if (cpuSupportsAVX()) {
__cpuidex(info, 0x7, 0);
cpuidex(info, 0x7, 0);
if ((info[1] & MASK_AVX2) == MASK_AVX2) {
result = true;
@ -94,62 +134,20 @@ static inline bool cpuSupportsAVX2() {
return result;
}
#elif defined(ARCH_X86) && defined(__GNUC__)
static inline bool cpuSupportsAVX512() {
int info[4];
#include <cpuid.h>
static inline bool cpuSupportsSSE3() {
unsigned int eax, ebx, ecx, edx;
return __get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_SSE3) == MASK_SSE3);
}
static inline bool cpuSupportsSSSE3() {
unsigned int eax, ebx, ecx, edx;
return __get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_SSSE3) == MASK_SSSE3);
}
static inline bool cpuSupportsSSE41() {
unsigned int eax, ebx, ecx, edx;
return __get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_SSE41) == MASK_SSE41);
}
static inline bool cpuSupportsSSE42() {
unsigned int eax, ebx, ecx, edx;
return __get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_SSE42) == MASK_SSE42);
}
static inline bool cpuSupportsAVX() {
unsigned int eax, ebx, ecx, edx;
cpuidex(info, 0x1, 0);
bool result = false;
if (__get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_AVX) == MASK_AVX)) {
if ((info[2] & MASK_OSXSAVE) == MASK_OSXSAVE) {
// verify OS support for YMM state
__asm__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
if ((eax & 0x6) == 0x6) {
result = true;
}
}
return result;
}
// verify OS support for ZMM state
if ((xgetbv(0) & MASK_XCR0_ZMM) == MASK_XCR0_ZMM) {
static inline bool cpuSupportsAVX2() {
unsigned int eax, ebx, ecx, edx;
cpuidex(info, 0x7, 0);
bool result = false;
if (cpuSupportsAVX()) {
// Work around a bug where __get_cpuid(0x7) returns wrong values on older GCC
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77756
if (__get_cpuid(0x0, &eax, &ebx, &ecx, &edx) && (eax >= 0x7)) {
__cpuid_count(0x7, 0x0, eax, ebx, ecx, edx);
if ((ebx & MASK_AVX2) == MASK_AVX2) {
if ((info[1] & MASK_AVX512) == MASK_AVX512) {
result = true;
}
}
@ -157,32 +155,4 @@ static inline bool cpuSupportsAVX2() {
return result;
}
#else
static inline bool cpuSupportsSSE3() {
return false;
}
static inline bool cpuSupportsSSSE3() {
return false;
}
static inline bool cpuSupportsSSE41() {
return false;
}
static inline bool cpuSupportsSSE42() {
return false;
}
static inline bool cpuSupportsAVX() {
return false;
}
static inline bool cpuSupportsAVX2() {
return false;
}
#endif
#endif // hifi_CPUDetect_h