diff --git a/cmake/macros/SetupHifiLibrary.cmake b/cmake/macros/SetupHifiLibrary.cmake index e4a286cf3f..d0fc58af0c 100644 --- a/cmake/macros/SetupHifiLibrary.cmake +++ b/cmake/macros/SetupHifiLibrary.cmake @@ -35,6 +35,23 @@ macro(SETUP_HIFI_LIBRARY) endif() endforeach() + # add compiler flags to AVX512 source files, if supported by compiler + include(CheckCXXCompilerFlag) + file(GLOB_RECURSE AVX512_SRCS "src/avx512/*.cpp" "src/avx512/*.c") + foreach(SRC ${AVX512_SRCS}) + if (WIN32) + check_cxx_compiler_flag("/arch:AVX512" COMPILER_SUPPORTS_AVX512) + if (COMPILER_SUPPORTS_AVX512) + set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS /arch:AVX512) + endif() + elseif (APPLE OR UNIX) + check_cxx_compiler_flag("-mavx512f" COMPILER_SUPPORTS_AVX512) + if (COMPILER_SUPPORTS_AVX512) + set_source_files_properties(${SRC} PROPERTIES COMPILE_FLAGS -mavx512f) + endif() + endif() + endforeach() + setup_memory_debugger() # create a library and set the property so it can be referenced later diff --git a/libraries/audio/src/AudioHRTF.cpp b/libraries/audio/src/AudioHRTF.cpp index 2a191b5821..1d5b074db7 100644 --- a/libraries/audio/src/AudioHRTF.cpp +++ b/libraries/audio/src/AudioHRTF.cpp @@ -239,10 +239,11 @@ static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float #include "CPUDetect.h" void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames); +void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames); static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) { - static auto f = cpuSupportsAVX2() ? FIR_1x4_AVX2 : FIR_1x4_SSE; + static auto f = cpuSupportsAVX512() ? FIR_1x4_AVX512 : (cpuSupportsAVX2() ? FIR_1x4_AVX2 : FIR_1x4_SSE); (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames); // dispatch } diff --git a/libraries/audio/src/avx2/AudioFOA_avx2.cpp b/libraries/audio/src/avx2/AudioFOA_avx2.cpp index de5dfcd0b5..880f40b185 100644 --- a/libraries/audio/src/avx2/AudioFOA_avx2.cpp +++ b/libraries/audio/src/avx2/AudioFOA_avx2.cpp @@ -9,15 +9,11 @@ // See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html // -#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) +#ifdef __AVX2__ #include #include -#include // AVX2 - -#ifndef __AVX2__ -#error Must be compiled with /arch:AVX2 or -mavx2 -mfma. -#endif +#include #define _mm256_permute4x64_ps(ymm, imm) _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(ymm), imm)); diff --git a/libraries/audio/src/avx2/AudioHRTF_avx2.cpp b/libraries/audio/src/avx2/AudioHRTF_avx2.cpp index 452ceb7f4c..e89128b173 100644 --- a/libraries/audio/src/avx2/AudioHRTF_avx2.cpp +++ b/libraries/audio/src/avx2/AudioHRTF_avx2.cpp @@ -9,17 +9,13 @@ // See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html // -#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) +#ifdef __AVX2__ #include -#include // AVX2 +#include #include "../AudioHRTF.h" -#ifndef __AVX2__ -#error Must be compiled with /arch:AVX2 or -mavx2 -mfma. -#endif - #if defined(__GNUC__) && !defined(__clang__) // for some reason, GCC -O2 results in poorly optimized code #pragma GCC optimize("Os") diff --git a/libraries/audio/src/avx2/AudioSRC_avx2.cpp b/libraries/audio/src/avx2/AudioSRC_avx2.cpp index 693bad7fc6..0e31a58ce7 100644 --- a/libraries/audio/src/avx2/AudioSRC_avx2.cpp +++ b/libraries/audio/src/avx2/AudioSRC_avx2.cpp @@ -9,17 +9,13 @@ // See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html // -#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) +#ifdef __AVX2__ #include #include #include "../AudioSRC.h" -#ifndef __AVX2__ -#error Must be compiled with /arch:AVX2 or -mavx2 -mfma. -#endif - // high/low part of int64_t #define LO32(a) ((uint32_t)(a)) #define HI32(a) ((int32_t)((a) >> 32)) diff --git a/libraries/audio/src/avx512/AudioHRTF_avx512.cpp b/libraries/audio/src/avx512/AudioHRTF_avx512.cpp new file mode 100644 index 0000000000..682f5f2f77 --- /dev/null +++ b/libraries/audio/src/avx512/AudioHRTF_avx512.cpp @@ -0,0 +1,101 @@ +// +// AudioHRTF_avx512.cpp +// libraries/audio/src +// +// Created by Ken Cooke on 6/20/17. +// Copyright 2017 High Fidelity, Inc. +// +// Distributed under the Apache License, Version 2.0. +// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html +// + +#if defined(__AVX512F__) + +#include +#include + +#include "../AudioHRTF.h" + +#if defined(__GNUC__) && !defined(__clang__) +// for some reason, GCC -O2 results in poorly optimized code +#pragma GCC optimize("Os") +#endif + +// 1 channel input, 4 channel output +void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) { + + float* coef0 = coef[0] + HRTF_TAPS - 1; // process backwards + float* coef1 = coef[1] + HRTF_TAPS - 1; + float* coef2 = coef[2] + HRTF_TAPS - 1; + float* coef3 = coef[3] + HRTF_TAPS - 1; + + assert(numFrames % 16 == 0); + + for (int i = 0; i < numFrames; i += 16) { + + __m512 acc0 = _mm512_setzero_ps(); + __m512 acc1 = _mm512_setzero_ps(); + __m512 acc2 = _mm512_setzero_ps(); + __m512 acc3 = _mm512_setzero_ps(); + __m512 acc4 = _mm512_setzero_ps(); + __m512 acc5 = _mm512_setzero_ps(); + __m512 acc6 = _mm512_setzero_ps(); + __m512 acc7 = _mm512_setzero_ps(); + + float* ps = &src[i - HRTF_TAPS + 1]; // process forwards + + assert(HRTF_TAPS % 4 == 0); + + for (int k = 0; k < HRTF_TAPS; k += 4) { + + __m512 x0 = _mm512_loadu_ps(&ps[k+0]); + acc0 = _mm512_fmadd_ps(_mm512_set1_ps(coef0[-k-0]), x0, acc0); // vfmadd231ps acc0, x0, dword ptr [coef]{1to16} + acc1 = _mm512_fmadd_ps(_mm512_set1_ps(coef1[-k-0]), x0, acc1); + acc2 = _mm512_fmadd_ps(_mm512_set1_ps(coef2[-k-0]), x0, acc2); + acc3 = _mm512_fmadd_ps(_mm512_set1_ps(coef3[-k-0]), x0, acc3); + + __m512 x1 = _mm512_loadu_ps(&ps[k+1]); + acc4 = _mm512_fmadd_ps(_mm512_set1_ps(coef0[-k-1]), x1, acc4); + acc5 = _mm512_fmadd_ps(_mm512_set1_ps(coef1[-k-1]), x1, acc5); + acc6 = _mm512_fmadd_ps(_mm512_set1_ps(coef2[-k-1]), x1, acc6); + acc7 = _mm512_fmadd_ps(_mm512_set1_ps(coef3[-k-1]), x1, acc7); + + __m512 x2 = _mm512_loadu_ps(&ps[k+2]); + acc0 = _mm512_fmadd_ps(_mm512_set1_ps(coef0[-k-2]), x2, acc0); + acc1 = _mm512_fmadd_ps(_mm512_set1_ps(coef1[-k-2]), x2, acc1); + acc2 = _mm512_fmadd_ps(_mm512_set1_ps(coef2[-k-2]), x2, acc2); + acc3 = _mm512_fmadd_ps(_mm512_set1_ps(coef3[-k-2]), x2, acc3); + + __m512 x3 = _mm512_loadu_ps(&ps[k+3]); + acc4 = _mm512_fmadd_ps(_mm512_set1_ps(coef0[-k-3]), x3, acc4); + acc5 = _mm512_fmadd_ps(_mm512_set1_ps(coef1[-k-3]), x3, acc5); + acc6 = _mm512_fmadd_ps(_mm512_set1_ps(coef2[-k-3]), x3, acc6); + acc7 = _mm512_fmadd_ps(_mm512_set1_ps(coef3[-k-3]), x3, acc7); + } + + acc0 = _mm512_add_ps(acc0, acc4); + acc1 = _mm512_add_ps(acc1, acc5); + acc2 = _mm512_add_ps(acc2, acc6); + acc3 = _mm512_add_ps(acc3, acc7); + + _mm512_storeu_ps(&dst0[i], acc0); + _mm512_storeu_ps(&dst1[i], acc1); + _mm512_storeu_ps(&dst2[i], acc2); + _mm512_storeu_ps(&dst3[i], acc3); + } + + _mm256_zeroupper(); +} + +// FIXME: this fallback can be removed, once we require VS2017 +#elif defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) + +#include "../AudioHRTF.h" + +void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames); + +void FIR_1x4_AVX512(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) { + FIR_1x4_AVX2(src, dst0, dst1, dst2, dst3, coef, numFrames); +} + +#endif diff --git a/libraries/shared/src/CPUDetect.h b/libraries/shared/src/CPUDetect.h index ea6d23d8d6..a2320dcdc1 100644 --- a/libraries/shared/src/CPUDetect.h +++ b/libraries/shared/src/CPUDetect.h @@ -2,8 +2,8 @@ // CPUDetect.h // libraries/shared/src // -// Created by Ken Cooke on 6/6/16. -// Copyright 2016 High Fidelity, Inc. +// Created by Ken Cooke on 6/16/17. +// Copyright 2017 High Fidelity, Inc. // // Distributed under the Apache License, Version 2.0. // See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html @@ -13,28 +13,68 @@ #define hifi_CPUDetect_h // -// Lightweight functions to detect SSE/AVX/AVX2 support +// Lightweight functions to detect SSE/AVX/AVX2/AVX512 support // +#define MASK_SSE3 (1 << 0) // SSE3 +#define MASK_SSSE3 (1 << 9) // SSSE3 +#define MASK_SSE41 (1 << 19) // SSE4.1 +#define MASK_SSE42 ((1 << 20) | (1 << 23)) // SSE4.2 and POPCNT +#define MASK_OSXSAVE (1 << 27) // OSXSAVE +#define MASK_AVX ((1 << 27) | (1 << 28)) // OSXSAVE and AVX +#define MASK_AVX2 (1 << 5) // AVX2 + +#define MASK_AVX512 ((1 << 16) | (1 << 17) | (1 << 28) | (1 << 30) | (1 << 31)) // AVX512 F,DQ,CD,BW,VL (SKX) + +#define MASK_XCR0_YMM ((1 << 1) | (1 << 2)) // XMM,YMM +#define MASK_XCR0_ZMM ((1 << 1) | (1 << 2) | (7 << 5)) // XMM,YMM,ZMM + #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) #define ARCH_X86 #endif -#define MASK_SSE3 (1 << 0) // SSE3 -#define MASK_SSSE3 (1 << 9) // SSSE3 -#define MASK_SSE41 (1 << 19) // SSE4.1 -#define MASK_SSE42 ((1 << 20) | (1 << 23)) // SSE4.2 and POPCNT -#define MASK_AVX ((1 << 27) | (1 << 28)) // OSXSAVE and AVX -#define MASK_AVX2 (1 << 5) // AVX2 - #if defined(ARCH_X86) && defined(_MSC_VER) #include +// use MSVC intrinsics +#define cpuidex(info, eax, ecx) __cpuidex(info, eax, ecx) +#define xgetbv(ecx) _xgetbv(ecx) + +#elif defined(ARCH_X86) && defined(__GNUC__) + +#include + +// use GCC intrinics/asm +static inline void cpuidex(int info[4], int eax, int ecx) { + __cpuid_count(eax, ecx, info[0], info[1], info[2], info[3]); +} + +static inline unsigned long long xgetbv(unsigned int ecx){ + unsigned int eax, edx; + __asm__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx)); + return ((unsigned long long)edx << 32) | eax; +} + +#else + +static inline void cpuidex(int info[4], int eax, int ecx) { + info[0] = 0; + info[1] = 0; + info[2] = 0; + info[3] = 0; +} + +static inline unsigned long long xgetbv(unsigned int ecx){ + return 0ULL; +} + +#endif + static inline bool cpuSupportsSSE3() { int info[4]; - __cpuidex(info, 0x1, 0); + cpuidex(info, 0x1, 0); return ((info[2] & MASK_SSE3) == MASK_SSE3); } @@ -42,7 +82,7 @@ static inline bool cpuSupportsSSE3() { static inline bool cpuSupportsSSSE3() { int info[4]; - __cpuidex(info, 0x1, 0); + cpuidex(info, 0x1, 0); return ((info[2] & MASK_SSSE3) == MASK_SSSE3); } @@ -50,7 +90,7 @@ static inline bool cpuSupportsSSSE3() { static inline bool cpuSupportsSSE41() { int info[4]; - __cpuidex(info, 0x1, 0); + cpuidex(info, 0x1, 0); return ((info[2] & MASK_SSE41) == MASK_SSE41); } @@ -58,7 +98,7 @@ static inline bool cpuSupportsSSE41() { static inline bool cpuSupportsSSE42() { int info[4]; - __cpuidex(info, 0x1, 0); + cpuidex(info, 0x1, 0); return ((info[2] & MASK_SSE42) == MASK_SSE42); } @@ -66,13 +106,13 @@ static inline bool cpuSupportsSSE42() { static inline bool cpuSupportsAVX() { int info[4]; - __cpuidex(info, 0x1, 0); + cpuidex(info, 0x1, 0); bool result = false; if ((info[2] & MASK_AVX) == MASK_AVX) { // verify OS support for YMM state - if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) { + if ((xgetbv(0) & MASK_XCR0_YMM) == MASK_XCR0_YMM) { result = true; } } @@ -85,7 +125,7 @@ static inline bool cpuSupportsAVX2() { bool result = false; if (cpuSupportsAVX()) { - __cpuidex(info, 0x7, 0); + cpuidex(info, 0x7, 0); if ((info[1] & MASK_AVX2) == MASK_AVX2) { result = true; @@ -94,62 +134,20 @@ static inline bool cpuSupportsAVX2() { return result; } -#elif defined(ARCH_X86) && defined(__GNUC__) +static inline bool cpuSupportsAVX512() { + int info[4]; -#include - -static inline bool cpuSupportsSSE3() { - unsigned int eax, ebx, ecx, edx; - - return __get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_SSE3) == MASK_SSE3); -} - -static inline bool cpuSupportsSSSE3() { - unsigned int eax, ebx, ecx, edx; - - return __get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_SSSE3) == MASK_SSSE3); -} - -static inline bool cpuSupportsSSE41() { - unsigned int eax, ebx, ecx, edx; - - return __get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_SSE41) == MASK_SSE41); -} - -static inline bool cpuSupportsSSE42() { - unsigned int eax, ebx, ecx, edx; - - return __get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_SSE42) == MASK_SSE42); -} - -static inline bool cpuSupportsAVX() { - unsigned int eax, ebx, ecx, edx; + cpuidex(info, 0x1, 0); bool result = false; - if (__get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & MASK_AVX) == MASK_AVX)) { + if ((info[2] & MASK_OSXSAVE) == MASK_OSXSAVE) { - // verify OS support for YMM state - __asm__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0)); - if ((eax & 0x6) == 0x6) { - result = true; - } - } - return result; -} + // verify OS support for ZMM state + if ((xgetbv(0) & MASK_XCR0_ZMM) == MASK_XCR0_ZMM) { -static inline bool cpuSupportsAVX2() { - unsigned int eax, ebx, ecx, edx; + cpuidex(info, 0x7, 0); - bool result = false; - if (cpuSupportsAVX()) { - - // Work around a bug where __get_cpuid(0x7) returns wrong values on older GCC - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77756 - if (__get_cpuid(0x0, &eax, &ebx, &ecx, &edx) && (eax >= 0x7)) { - - __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx); - - if ((ebx & MASK_AVX2) == MASK_AVX2) { + if ((info[1] & MASK_AVX512) == MASK_AVX512) { result = true; } } @@ -157,32 +155,4 @@ static inline bool cpuSupportsAVX2() { return result; } -#else - -static inline bool cpuSupportsSSE3() { - return false; -} - -static inline bool cpuSupportsSSSE3() { - return false; -} - -static inline bool cpuSupportsSSE41() { - return false; -} - -static inline bool cpuSupportsSSE42() { - return false; -} - -static inline bool cpuSupportsAVX() { - return false; -} - -static inline bool cpuSupportsAVX2() { - return false; -} - -#endif - #endif // hifi_CPUDetect_h