AVX2 code must be in /avx for CMAKE to compile it properly

2025-04-23 23:33:48 +02:00 · 2016-06-06 11:57:17 -07:00 · 2016-06-06 11:57:17 -07:00 · 4f35d3df02
commit 4f35d3df02
parent 34f46b860b
1 changed files with 201 additions and 0 deletions
--- a/libraries/audio/src/avx/AudioSRC_avx.cpp
+++ b/libraries/audio/src/avx/AudioSRC_avx.cpp
@ -0,0 +1,201 @@
+//
+//  AudioSRC_avx.cpp
+//  libraries/audio/src
+//
+//  Created by Ken Cooke on 6/5/16.
+//  Copyright 2016 High Fidelity, Inc.
+//
+//  Distributed under the Apache License, Version 2.0.
+//  See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
+//
+
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "../AudioSRC.h"
+
+#ifndef __AVX__
+#error Must be compiled with /arch:AVX or -mavx.
+#endif
+
+// high/low part of int64_t
+#define LO32(a)   ((uint32_t)(a))
+#define HI32(a)   ((int32_t)((a) >> 32))
+
+int AudioSRC::multirateFilter1_AVX2(const float* input0, float* output0, int inputFrames) {
+    int outputFrames = 0;
+
+    assert(_numTaps % 8 == 0);  // SIMD8
+
+    if (_step == 0) {   // rational
+
+        int32_t i = HI32(_offset);
+
+        while (i < inputFrames) {
+
+            const float* c0 = &_polyphaseFilter[_numTaps * _phase];
+
+            __m256 acc0 = _mm256_setzero_ps();
+
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                //float coef = c0[j];
+                __m256 coef0 = _mm256_loadu_ps(&c0[j]);
+
+                //acc += input[i + j] * coef;
+                acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
+            }
+
+            // horizontal sum
+            acc0 = _mm256_hadd_ps(acc0, acc0);
+            __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
+            t0 = _mm_add_ps(t0, _mm_movehdup_ps(t0));
+
+            _mm_store_ss(&output0[outputFrames], t0);
+            outputFrames += 1;
+
+            i += _stepTable[_phase];
+            if (++_phase == _upFactor) {
+                _phase = 0;
+            }
+        }
+        _offset = (int64_t)(i - inputFrames) << 32;
+
+    } else {    // irrational
+
+        while (HI32(_offset) < inputFrames) {
+
+            int32_t i = HI32(_offset);
+            uint32_t f = LO32(_offset);
+
+            uint32_t phase = f >> SRC_FRACBITS;
+            float ftmp = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT;
+
+            const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)];
+            const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)];
+
+            __m256 acc0 = _mm256_setzero_ps();
+            __m256 frac = _mm256_broadcast_ss(&ftmp);
+
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                //float coef = c0[j] + frac * (c1[j] - c0[j]);
+                __m256 coef0 = _mm256_loadu_ps(&c0[j]);
+                __m256 coef1 = _mm256_loadu_ps(&c1[j]);
+                coef1 = _mm256_sub_ps(coef1, coef0);
+                coef0 = _mm256_fmadd_ps(coef1, frac, coef0);
+
+                //acc += input[i + j] * coef;
+                acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
+            }
+
+            // horizontal sum
+            acc0 = _mm256_hadd_ps(acc0, acc0);
+            __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
+            t0 = _mm_add_ps(t0, _mm_movehdup_ps(t0));
+
+            _mm_store_ss(&output0[outputFrames], t0);
+            outputFrames += 1;
+
+            _offset += _step;
+        }
+        _offset -= (int64_t)inputFrames << 32;
+    }
+    _mm256_zeroupper();
+
+    return outputFrames;
+}
+
+int AudioSRC::multirateFilter2_AVX2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) {
+    int outputFrames = 0;
+
+    assert(_numTaps % 8 == 0);  // SIMD8
+
+    if (_step == 0) {   // rational
+
+        int32_t i = HI32(_offset);
+
+        while (i < inputFrames) {
+
+            const float* c0 = &_polyphaseFilter[_numTaps * _phase];
+
+            __m256 acc0 = _mm256_setzero_ps();
+            __m256 acc1 = _mm256_setzero_ps();
+
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                //float coef = c0[j];
+                __m256 coef0 = _mm256_loadu_ps(&c0[j]);
+
+                //acc += input[i + j] * coef;
+                acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
+                acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1);
+            }
+
+            // horizontal sum
+            acc0 = _mm256_hadd_ps(acc0, acc1);
+            __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
+            t0 = _mm_add_ps(t0, _mm_movehdup_ps(t0));
+
+            _mm_store_ss(&output0[outputFrames], t0);
+            _mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2)));
+            outputFrames += 1;
+
+            i += _stepTable[_phase];
+            if (++_phase == _upFactor) {
+                _phase = 0;
+            }
+        }
+        _offset = (int64_t)(i - inputFrames) << 32;
+
+    } else {    // irrational
+
+        while (HI32(_offset) < inputFrames) {
+
+            int32_t i = HI32(_offset);
+            uint32_t f = LO32(_offset);
+
+            uint32_t phase = f >> SRC_FRACBITS;
+            float ftmp = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT;
+
+            const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)];
+            const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)];
+
+            __m256 acc0 = _mm256_setzero_ps();
+            __m256 acc1 = _mm256_setzero_ps();
+            __m256 frac = _mm256_broadcast_ss(&ftmp);
+
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                //float coef = c0[j] + frac * (c1[j] - c0[j]);
+                __m256 coef0 = _mm256_loadu_ps(&c0[j]);
+                __m256 coef1 = _mm256_loadu_ps(&c1[j]);
+                coef1 = _mm256_sub_ps(coef1, coef0);
+                coef0 = _mm256_fmadd_ps(coef1, frac, coef0);
+
+                //acc += input[i + j] * coef;
+                acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
+                acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1);
+            }
+
+            // horizontal sum
+            acc0 = _mm256_hadd_ps(acc0, acc1);
+            __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
+            t0 = _mm_add_ps(t0, _mm_movehdup_ps(t0));
+
+            _mm_store_ss(&output0[outputFrames], t0);
+            _mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2)));
+            outputFrames += 1;
+
+            _offset += _step;
+        }
+        _offset -= (int64_t)inputFrames << 32;
+    }
+    _mm256_zeroupper();
+
+    return outputFrames;
+}
+
+#endif