diff --git a/libraries/audio/src/AudioHRTF.cpp b/libraries/audio/src/AudioHRTF.cpp index 5984187203..84e3622498 100644 --- a/libraries/audio/src/AudioHRTF.cpp +++ b/libraries/audio/src/AudioHRTF.cpp @@ -16,6 +16,14 @@ #include "AudioHRTF.h" #include "AudioHRTFData.h" +#if defined(_MSC_VER) +#define ALIGN32 __declspec(align(32)) +#elif defined(__GNUC__) +#define ALIGN32 __attribute__((aligned(32))) +#else +#define ALIGN32 +#endif + #ifndef MAX #define MAX(a,b) (((a) > (b)) ? (a) : (b)) #endif @@ -30,7 +38,7 @@ // Transients in the time-varying Thiran allpass filter are eliminated by the initial delay. // Valimaki, Laakso. "Elimination of Transients in Time-Varying Allpass Fractional Delay Filters" // -static const float crossfadeTable[HRTF_BLOCK] = { +ALIGN32 static const float crossfadeTable[HRTF_BLOCK] = { 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.9999545513f, 0.9998182135f, 0.9995910114f, 0.9992729863f, 0.9988641959f, 0.9983647147f, 0.9977746334f, 0.9970940592f, 0.9963231160f, 0.9954619438f, 0.9945106993f, 0.9934695553f, 0.9923387012f, 0.9911183425f, 0.9898087010f, 0.9884100149f, @@ -192,25 +200,29 @@ static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float for (int k = 0; k < HRTF_TAPS; k += 4) { - acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_load1_ps(&coef0[-k-0]), _mm_loadu_ps(&ps[k+0]))); - acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_load1_ps(&coef1[-k-0]), _mm_loadu_ps(&ps[k+0]))); - acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_load1_ps(&coef2[-k-0]), _mm_loadu_ps(&ps[k+0]))); - acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_load1_ps(&coef3[-k-0]), _mm_loadu_ps(&ps[k+0]))); + __m128 x0 = _mm_loadu_ps(&ps[k+0]); + acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_load1_ps(&coef0[-k-0]), x0)); + acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_load1_ps(&coef1[-k-0]), x0)); + acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_load1_ps(&coef2[-k-0]), x0)); + acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_load1_ps(&coef3[-k-0]), x0)); - acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_load1_ps(&coef0[-k-1]), _mm_loadu_ps(&ps[k+1]))); - acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_load1_ps(&coef1[-k-1]), _mm_loadu_ps(&ps[k+1]))); - acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_load1_ps(&coef2[-k-1]), _mm_loadu_ps(&ps[k+1]))); - acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_load1_ps(&coef3[-k-1]), _mm_loadu_ps(&ps[k+1]))); + __m128 x1 = _mm_loadu_ps(&ps[k+1]); + acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_load1_ps(&coef0[-k-1]), x1)); + acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_load1_ps(&coef1[-k-1]), x1)); + acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_load1_ps(&coef2[-k-1]), x1)); + acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_load1_ps(&coef3[-k-1]), x1)); - acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_load1_ps(&coef0[-k-2]), _mm_loadu_ps(&ps[k+2]))); - acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_load1_ps(&coef1[-k-2]), _mm_loadu_ps(&ps[k+2]))); - acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_load1_ps(&coef2[-k-2]), _mm_loadu_ps(&ps[k+2]))); - acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_load1_ps(&coef3[-k-2]), _mm_loadu_ps(&ps[k+2]))); + __m128 x2 = _mm_loadu_ps(&ps[k+2]); + acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_load1_ps(&coef0[-k-2]), x2)); + acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_load1_ps(&coef1[-k-2]), x2)); + acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_load1_ps(&coef2[-k-2]), x2)); + acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_load1_ps(&coef3[-k-2]), x2)); - acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_load1_ps(&coef0[-k-3]), _mm_loadu_ps(&ps[k+3]))); - acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_load1_ps(&coef1[-k-3]), _mm_loadu_ps(&ps[k+3]))); - acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_load1_ps(&coef2[-k-3]), _mm_loadu_ps(&ps[k+3]))); - acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_load1_ps(&coef3[-k-3]), _mm_loadu_ps(&ps[k+3]))); + __m128 x3 = _mm_loadu_ps(&ps[k+3]); + acc0 = _mm_add_ps(acc0, _mm_mul_ps(_mm_load1_ps(&coef0[-k-3]), x3)); + acc1 = _mm_add_ps(acc1, _mm_mul_ps(_mm_load1_ps(&coef1[-k-3]), x3)); + acc2 = _mm_add_ps(acc2, _mm_mul_ps(_mm_load1_ps(&coef2[-k-3]), x3)); + acc3 = _mm_add_ps(acc3, _mm_mul_ps(_mm_load1_ps(&coef3[-k-3]), x3)); } _mm_storeu_ps(&dst0[i], acc0); @@ -226,11 +238,11 @@ static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float #include "CPUDetect.h" -void FIR_1x4_AVX(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames); +void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames); static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) { - static auto f = cpuSupportsAVX() ? FIR_1x4_AVX : FIR_1x4_SSE; + static auto f = cpuSupportsAVX2() ? FIR_1x4_AVX2 : FIR_1x4_SSE; (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames); // dispatch } @@ -842,12 +854,12 @@ void AudioHRTF::render(int16_t* input, float* output, int index, float azimuth, assert(index < HRTF_TABLES); assert(numFrames == HRTF_BLOCK); - float in[HRTF_TAPS + HRTF_BLOCK]; // mono - float firCoef[4][HRTF_TAPS]; // 4-channel - float firBuffer[4][HRTF_DELAY + HRTF_BLOCK]; // 4-channel - float bqCoef[5][8]; // 4-channel (interleaved) - float bqBuffer[4 * HRTF_BLOCK]; // 4-channel (interleaved) - int delay[4]; // 4-channel (interleaved) + ALIGN32 float in[HRTF_TAPS + HRTF_BLOCK]; // mono + ALIGN32 float firCoef[4][HRTF_TAPS]; // 4-channel + ALIGN32 float firBuffer[4][HRTF_DELAY + HRTF_BLOCK]; // 4-channel + ALIGN32 float bqCoef[5][8]; // 4-channel (interleaved) + ALIGN32 float bqBuffer[4 * HRTF_BLOCK]; // 4-channel (interleaved) + int delay[4]; // 4-channel (interleaved) // to avoid polluting the cache, old filters are recomputed instead of stored setFilters(firCoef, bqCoef, delay, index, _azimuthState, _distanceState, _gainState, L0); diff --git a/libraries/audio/src/AudioHRTFData.h b/libraries/audio/src/AudioHRTFData.h index e317dee7c7..b2d3ebf847 100644 --- a/libraries/audio/src/AudioHRTFData.h +++ b/libraries/audio/src/AudioHRTFData.h @@ -30,6 +30,14 @@ // 6) Truncate filter length to 2.5ms using rectangular window with 8-tap Hanning taper // +#if defined(_MSC_VER) +#define ALIGN32 __declspec(align(32)) +#elif defined(__GNUC__) +#define ALIGN32 __attribute__((aligned(32))) +#else +#define ALIGN32 +#endif + static const float itd_1002_table[HRTF_AZIMUTHS] = { -0.07851f, 0.85414f, 1.77170f, 2.71137f, 3.71065f, 4.74907f, 5.79892f, 6.82396f, 7.82837f, 8.80796f, 9.75426f, 10.68332f, 11.59979f, 12.48520f, 13.36135f, 14.19234f, @@ -42,7 +50,7 @@ static const float itd_1002_table[HRTF_AZIMUTHS] = { -8.39670f, -7.23606f, -6.09663f, -5.05593f, -4.06186f, -3.07465f, -2.06122f, -1.05417f, }; -static const float ir_1002_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1002_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 8.341559e-01f, 1.886116e-02f, 2.677664e-01f, -7.037183e-02f, -4.147236e-02f, -2.761588e-01f, 2.310035e-01f, -1.643133e-01f, @@ -1497,7 +1505,7 @@ static const float itd_1003_table[HRTF_AZIMUTHS] = { -6.64380f, -5.73462f, -4.83364f, -3.97025f, -3.08925f, -2.16621f, -1.19364f, -0.20709f, }; -static const float ir_1003_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1003_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 9.266240e-01f, 1.260510e-01f, 5.051008e-02f, -3.536678e-01f, 2.462246e-02f, 4.465557e-02f, 6.813228e-02f, -6.063477e-02f, @@ -2952,7 +2960,7 @@ static const float itd_1004_table[HRTF_AZIMUTHS] = { -7.55720f, -6.55578f, -5.59246f, -4.69657f, -3.80733f, -2.88567f, -1.90337f, -0.89923f, }; -static const float ir_1004_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1004_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 7.326633e-01f, 4.279429e-01f, -5.910516e-02f, -2.480760e-01f, -9.903029e-02f, 9.215562e-02f, -2.893536e-02f, 5.464364e-02f, @@ -4407,7 +4415,7 @@ static const float itd_1005_table[HRTF_AZIMUTHS] = { -6.80079f, -6.03878f, -5.25100f, -4.34973f, -3.39268f, -2.41226f, -1.45444f, -0.50375f, }; -static const float ir_1005_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1005_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 8.515557e-01f, 1.208618e-01f, 3.238278e-01f, -3.605847e-01f, -3.354420e-02f, -1.829174e-01f, 2.309960e-01f, -1.744711e-01f, @@ -5862,7 +5870,7 @@ static const float itd_1007_table[HRTF_AZIMUTHS] = { -7.68135f, -6.69801f, -5.72186f, -4.72708f, -3.74413f, -2.77373f, -1.79032f, -0.81823f, }; -static const float ir_1007_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1007_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 6.544936e-01f, 2.820574e-01f, 1.850652e-01f, -2.597811e-01f, -5.585250e-02f, -7.975905e-02f, 8.143960e-02f, -5.044548e-02f, @@ -7317,7 +7325,7 @@ static const float itd_1012_table[HRTF_AZIMUTHS] = { -7.32159f, -6.30684f, -5.31969f, -4.40260f, -3.50567f, -2.60925f, -1.70893f, -0.80401f, }; -static const float ir_1012_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1012_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 8.505165e-01f, 9.074762e-02f, 3.296598e-01f, -5.213905e-01f, 1.348379e-01f, -1.828924e-01f, 1.400077e-01f, -4.071996e-02f, @@ -8772,7 +8780,7 @@ static const float itd_1014_table[HRTF_AZIMUTHS] = { -7.51312f, -6.52705f, -5.56262f, -4.72113f, -3.90664f, -3.07768f, -2.22719f, -1.37514f, }; -static const float ir_1014_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1014_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 6.542071e-01f, 4.575563e-01f, 1.118072e-02f, -1.823464e-01f, -2.222339e-01f, 1.371357e-01f, 7.027919e-03f, -5.534852e-02f, @@ -10227,7 +10235,7 @@ static const float itd_1017_table[HRTF_AZIMUTHS] = { -7.46925f, -6.49073f, -5.52501f, -4.62178f, -3.74041f, -2.86207f, -1.97362f, -1.07512f, }; -static const float ir_1017_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1017_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 7.470867e-01f, 2.686078e-01f, 2.097923e-01f, -2.935018e-01f, -8.687224e-02f, -4.547367e-02f, 6.920631e-03f, 3.752071e-02f, @@ -11682,7 +11690,7 @@ static const float itd_1020_table[HRTF_AZIMUTHS] = { -8.28071f, -7.36311f, -6.43732f, -5.49298f, -4.53728f, -3.57601f, -2.59830f, -1.63297f, }; -static const float ir_1020_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1020_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 6.953847e-01f, 3.081256e-01f, 2.474324e-01f, -3.025226e-01f, -1.119181e-01f, -4.966299e-02f, 5.727889e-02f, 6.715016e-03f, @@ -13137,7 +13145,7 @@ static const float itd_1021_table[HRTF_AZIMUTHS] = { -8.12772f, -7.17689f, -6.23068f, -5.27554f, -4.32391f, -3.38489f, -2.46445f, -1.54407f, }; -static const float ir_1021_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1021_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 7.807186e-01f, 3.835520e-01f, 1.208801e-01f, -4.044311e-01f, -5.188029e-02f, -7.750225e-02f, 1.739668e-01f, -6.599168e-02f, @@ -14592,7 +14600,7 @@ static const float itd_1022_table[HRTF_AZIMUTHS] = { -7.19675f, -6.30334f, -5.39609f, -4.47018f, -3.53964f, -2.62393f, -1.75389f, -0.90222f, }; -static const float ir_1022_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1022_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 7.053226e-01f, 2.645844e-01f, 2.462055e-01f, -2.145682e-01f, -1.333283e-01f, -1.751403e-01f, 2.721890e-01f, -1.743790e-01f, @@ -16047,7 +16055,7 @@ static const float itd_1026_table[HRTF_AZIMUTHS] = { -7.45209f, -6.46598f, -5.49746f, -4.54220f, -3.60610f, -2.68084f, -1.74087f, -0.80841f, }; -static const float ir_1026_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1026_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 7.150396e-01f, 3.144234e-01f, 9.132840e-02f, -2.128668e-01f, -1.899010e-01f, 1.362356e-01f, -4.105226e-02f, 4.896281e-02f, @@ -17502,7 +17510,7 @@ static const float itd_1028_table[HRTF_AZIMUTHS] = { -7.80099f, -6.89255f, -5.95721f, -5.04107f, -4.11968f, -3.20233f, -2.33316f, -1.46289f, }; -static const float ir_1028_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1028_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 9.491360e-01f, 2.952796e-01f, -1.585342e-01f, -3.497386e-01f, 1.204260e-01f, -4.886012e-02f, 5.238760e-02f, -8.209077e-03f, @@ -18957,7 +18965,7 @@ static const float itd_1038_table[HRTF_AZIMUTHS] = { -6.69661f, -5.65906f, -4.62851f, -3.63493f, -2.66802f, -1.71997f, -0.76853f, 0.18497f, }; -static const float ir_1038_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1038_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 9.325991e-01f, 1.817283e-01f, 5.397613e-02f, -4.121773e-01f, -7.921759e-03f, -4.009945e-02f, 1.499187e-01f, -1.838252e-02f, @@ -20412,7 +20420,7 @@ static const float itd_1041_table[HRTF_AZIMUTHS] = { -7.03257f, -6.07458f, -5.13664f, -4.24453f, -3.37177f, -2.49083f, -1.55807f, -0.62014f, }; -static const float ir_1041_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1041_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 7.012368e-01f, 2.006662e-01f, 3.173636e-01f, -2.865733e-01f, 1.345042e-01f, -5.030394e-01f, 3.717757e-01f, -1.138039e-01f, @@ -21867,7 +21875,7 @@ static const float itd_1042_table[HRTF_AZIMUTHS] = { -7.79822f, -6.84403f, -5.88862f, -4.94525f, -3.99704f, -3.03547f, -2.06207f, -1.07916f, }; -static const float ir_1042_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1042_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 9.114429e-01f, 2.201994e-03f, 3.703525e-01f, -4.825957e-01f, 1.210277e-01f, -2.471091e-01f, 1.766662e-01f, -5.840113e-03f, @@ -23322,7 +23330,7 @@ static const float itd_1043_table[HRTF_AZIMUTHS] = { -6.81973f, -5.86664f, -4.92096f, -3.99232f, -3.07973f, -2.16321f, -1.20142f, -0.22538f, }; -static const float ir_1043_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1043_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 7.339447e-01f, 1.339343e-01f, 4.031645e-01f, -4.891909e-01f, 8.751389e-02f, -2.110783e-01f, 2.573841e-01f, -1.050324e-01f, @@ -24777,7 +24785,7 @@ static const float itd_1044_table[HRTF_AZIMUTHS] = { -7.31965f, -6.37963f, -5.45379f, -4.54748f, -3.59370f, -2.59525f, -1.67705f, -0.73882f, }; -static const float ir_1044_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1044_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 7.028871e-01f, 2.381998e-01f, 4.686725e-01f, -5.412304e-01f, 1.262568e-01f, -3.198619e-01f, 1.963468e-01f, -4.016186e-02f, @@ -26232,7 +26240,7 @@ static const float itd_1047_table[HRTF_AZIMUTHS] = { -9.01225f, -7.93667f, -6.85884f, -5.78919f, -4.72064f, -3.66640f, -2.66295f, -1.65780f, }; -static const float ir_1047_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1047_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 7.788578e-01f, 1.598904e-01f, 2.366520e-01f, -3.524184e-01f, -8.784474e-03f, -5.144472e-02f, 8.679429e-02f, -1.634258e-02f, @@ -27687,7 +27695,7 @@ static const float itd_1048_table[HRTF_AZIMUTHS] = { -7.15985f, -6.30472f, -5.41513f, -4.54994f, -3.62385f, -2.66142f, -1.79111f, -0.94033f, }; -static const float ir_1048_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1048_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 8.865287e-01f, 2.972076e-01f, -1.305391e-01f, -1.213860e-01f, -1.948535e-01f, 1.458427e-01f, -8.912857e-02f, 9.493978e-02f, @@ -29142,7 +29150,7 @@ static const float itd_1050_table[HRTF_AZIMUTHS] = { -6.52690f, -5.58085f, -4.64474f, -3.71658f, -2.80444f, -1.92096f, -1.07543f, -0.23450f, }; -static const float ir_1050_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1050_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 9.005889e-01f, -6.452200e-02f, 3.675525e-01f, -4.309962e-01f, 7.086621e-02f, -9.161573e-02f, -4.290351e-02f, 9.057393e-02f, @@ -30597,7 +30605,7 @@ static const float itd_1052_table[HRTF_AZIMUTHS] = { -6.50194f, -5.61262f, -4.72534f, -3.84869f, -2.97504f, -2.10269f, -1.23783f, -0.36766f, }; -static const float ir_1052_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1052_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 6.650009e-01f, 3.507944e-01f, -3.274164e-02f, -1.830690e-01f, -7.720853e-02f, 1.030789e-01f, 3.877069e-02f, -5.674440e-02f, @@ -32052,7 +32060,7 @@ static const float itd_1054_table[HRTF_AZIMUTHS] = { -7.35642f, -6.36606f, -5.37262f, -4.40394f, -3.44967f, -2.51333f, -1.59834f, -0.68300f, }; -static const float ir_1054_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1054_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 8.629450e-01f, 1.677356e-01f, 1.467365e-01f, -3.248726e-01f, -5.105235e-02f, -5.031096e-02f, 1.796471e-01f, -1.298094e-01f, @@ -33507,7 +33515,7 @@ static const float itd_1056_table[HRTF_AZIMUTHS] = { -6.99437f, -5.82430f, -4.73408f, -3.76713f, -2.88870f, -2.05251f, -1.18172f, -0.32736f, }; -static const float ir_1056_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1056_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 8.031418e-01f, 2.411323e-01f, 1.417951e-01f, -2.476192e-01f, -1.076012e-01f, 1.009190e-01f, 7.761394e-02f, -1.250722e-01f, @@ -34962,7 +34970,7 @@ static const float itd_1058_table[HRTF_AZIMUTHS] = { -7.78555f, -6.81447f, -5.85685f, -4.89466f, -3.93902f, -2.98660f, -2.01925f, -1.05758f, }; -static const float ir_1058_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { +ALIGN32 static const float ir_1058_table[HRTF_AZIMUTHS][2][HRTF_TAPS] = { // azimuth = 0 {{ 9.307292e-01f, 5.592706e-02f, 2.567367e-01f, -4.525413e-01f, 1.378666e-01f, -2.503950e-01f, 1.983286e-01f, 5.925522e-03f, diff --git a/libraries/audio/src/avx/AudioHRTF_avx.cpp b/libraries/audio/src/avx/AudioHRTF_avx.cpp deleted file mode 100644 index b103bf015c..0000000000 --- a/libraries/audio/src/avx/AudioHRTF_avx.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// -// AudioHRTF_avx.cpp -// libraries/audio/src/avx -// -// Created by Ken Cooke on 1/17/16. -// Copyright 2016 High Fidelity, Inc. -// -// Distributed under the Apache License, Version 2.0. -// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html -// - -#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) - -#include -#include - -#include "../AudioHRTF.h" - -#ifndef __AVX__ -#error Must be compiled with /arch:AVX or -mavx. -#endif - -// 1 channel input, 4 channel output -void FIR_1x4_AVX(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) { - - float* coef0 = coef[0] + HRTF_TAPS - 1; // process backwards - float* coef1 = coef[1] + HRTF_TAPS - 1; - float* coef2 = coef[2] + HRTF_TAPS - 1; - float* coef3 = coef[3] + HRTF_TAPS - 1; - - assert(numFrames % 8 == 0); - - for (int i = 0; i < numFrames; i += 8) { - - __m256 acc0 = _mm256_setzero_ps(); - __m256 acc1 = _mm256_setzero_ps(); - __m256 acc2 = _mm256_setzero_ps(); - __m256 acc3 = _mm256_setzero_ps(); - - float* ps = &src[i - HRTF_TAPS + 1]; // process forwards - - assert(HRTF_TAPS % 8 == 0); - - for (int k = 0; k < HRTF_TAPS; k += 8) { - - acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-0]), _mm256_loadu_ps(&ps[k+0]))); - acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-0]), _mm256_loadu_ps(&ps[k+0]))); - acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-0]), _mm256_loadu_ps(&ps[k+0]))); - acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-0]), _mm256_loadu_ps(&ps[k+0]))); - - acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-1]), _mm256_loadu_ps(&ps[k+1]))); - acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-1]), _mm256_loadu_ps(&ps[k+1]))); - acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-1]), _mm256_loadu_ps(&ps[k+1]))); - acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-1]), _mm256_loadu_ps(&ps[k+1]))); - - acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-2]), _mm256_loadu_ps(&ps[k+2]))); - acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-2]), _mm256_loadu_ps(&ps[k+2]))); - acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-2]), _mm256_loadu_ps(&ps[k+2]))); - acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-2]), _mm256_loadu_ps(&ps[k+2]))); - - acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-3]), _mm256_loadu_ps(&ps[k+3]))); - acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-3]), _mm256_loadu_ps(&ps[k+3]))); - acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-3]), _mm256_loadu_ps(&ps[k+3]))); - acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-3]), _mm256_loadu_ps(&ps[k+3]))); - - acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-4]), _mm256_loadu_ps(&ps[k+4]))); - acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-4]), _mm256_loadu_ps(&ps[k+4]))); - acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-4]), _mm256_loadu_ps(&ps[k+4]))); - acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-4]), _mm256_loadu_ps(&ps[k+4]))); - - acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-5]), _mm256_loadu_ps(&ps[k+5]))); - acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-5]), _mm256_loadu_ps(&ps[k+5]))); - acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-5]), _mm256_loadu_ps(&ps[k+5]))); - acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-5]), _mm256_loadu_ps(&ps[k+5]))); - - acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-6]), _mm256_loadu_ps(&ps[k+6]))); - acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-6]), _mm256_loadu_ps(&ps[k+6]))); - acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-6]), _mm256_loadu_ps(&ps[k+6]))); - acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-6]), _mm256_loadu_ps(&ps[k+6]))); - - acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-7]), _mm256_loadu_ps(&ps[k+7]))); - acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-7]), _mm256_loadu_ps(&ps[k+7]))); - acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-7]), _mm256_loadu_ps(&ps[k+7]))); - acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-7]), _mm256_loadu_ps(&ps[k+7]))); - } - - _mm256_storeu_ps(&dst0[i], acc0); - _mm256_storeu_ps(&dst1[i], acc1); - _mm256_storeu_ps(&dst2[i], acc2); - _mm256_storeu_ps(&dst3[i], acc3); - } - - _mm256_zeroupper(); -} - -#endif diff --git a/libraries/audio/src/avx2/AudioHRTF_avx2.cpp b/libraries/audio/src/avx2/AudioHRTF_avx2.cpp new file mode 100644 index 0000000000..452ceb7f4c --- /dev/null +++ b/libraries/audio/src/avx2/AudioHRTF_avx2.cpp @@ -0,0 +1,94 @@ +// +// AudioHRTF_avx2.cpp +// libraries/audio/src +// +// Created by Ken Cooke on 1/17/16. +// Copyright 2016 High Fidelity, Inc. +// +// Distributed under the Apache License, Version 2.0. +// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html +// + +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) + +#include +#include // AVX2 + +#include "../AudioHRTF.h" + +#ifndef __AVX2__ +#error Must be compiled with /arch:AVX2 or -mavx2 -mfma. +#endif + +#if defined(__GNUC__) && !defined(__clang__) +// for some reason, GCC -O2 results in poorly optimized code +#pragma GCC optimize("Os") +#endif + +// 1 channel input, 4 channel output +void FIR_1x4_AVX2(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) { + + float* coef0 = coef[0] + HRTF_TAPS - 1; // process backwards + float* coef1 = coef[1] + HRTF_TAPS - 1; + float* coef2 = coef[2] + HRTF_TAPS - 1; + float* coef3 = coef[3] + HRTF_TAPS - 1; + + assert(numFrames % 8 == 0); + + for (int i = 0; i < numFrames; i += 8) { + + __m256 acc0 = _mm256_setzero_ps(); + __m256 acc1 = _mm256_setzero_ps(); + __m256 acc2 = _mm256_setzero_ps(); + __m256 acc3 = _mm256_setzero_ps(); + __m256 acc4 = _mm256_setzero_ps(); + __m256 acc5 = _mm256_setzero_ps(); + __m256 acc6 = _mm256_setzero_ps(); + __m256 acc7 = _mm256_setzero_ps(); + + float* ps = &src[i - HRTF_TAPS + 1]; // process forwards + + assert(HRTF_TAPS % 4 == 0); + + for (int k = 0; k < HRTF_TAPS; k += 4) { + + __m256 x0 = _mm256_loadu_ps(&ps[k+0]); + acc0 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef0[-k-0]), x0, acc0); + acc1 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef1[-k-0]), x0, acc1); + acc2 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef2[-k-0]), x0, acc2); + acc3 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef3[-k-0]), x0, acc3); + + __m256 x1 = _mm256_loadu_ps(&ps[k+1]); + acc4 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef0[-k-1]), x1, acc4); + acc5 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef1[-k-1]), x1, acc5); + acc6 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef2[-k-1]), x1, acc6); + acc7 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef3[-k-1]), x1, acc7); + + __m256 x2 = _mm256_loadu_ps(&ps[k+2]); + acc0 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef0[-k-2]), x2, acc0); + acc1 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef1[-k-2]), x2, acc1); + acc2 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef2[-k-2]), x2, acc2); + acc3 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef3[-k-2]), x2, acc3); + + __m256 x3 = _mm256_loadu_ps(&ps[k+3]); + acc4 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef0[-k-3]), x3, acc4); + acc5 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef1[-k-3]), x3, acc5); + acc6 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef2[-k-3]), x3, acc6); + acc7 = _mm256_fmadd_ps(_mm256_broadcast_ss(&coef3[-k-3]), x3, acc7); + } + + acc0 = _mm256_add_ps(acc0, acc4); + acc1 = _mm256_add_ps(acc1, acc5); + acc2 = _mm256_add_ps(acc2, acc6); + acc3 = _mm256_add_ps(acc3, acc7); + + _mm256_storeu_ps(&dst0[i], acc0); + _mm256_storeu_ps(&dst1[i], acc1); + _mm256_storeu_ps(&dst2[i], acc2); + _mm256_storeu_ps(&dst3[i], acc3); + } + + _mm256_zeroupper(); +} + +#endif