diff --git a/libraries/audio/src/avx/AudioSRC_avx.cpp b/libraries/audio/src/avx/AudioSRC_avx.cpp
new file mode 100644
index 0000000000..dcc56b94ad
--- /dev/null
+++ b/libraries/audio/src/avx/AudioSRC_avx.cpp
@@ -0,0 +1,201 @@
+//
+//  AudioSRC_avx.cpp
+//  libraries/audio/src
+//
+//  Created by Ken Cooke on 6/5/16.
+//  Copyright 2016 High Fidelity, Inc.
+//
+//  Distributed under the Apache License, Version 2.0.
+//  See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
+//
+
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "../AudioSRC.h"
+
+#ifndef __AVX2__
+#error Must be compiled with /arch:AVX2 or -mavx2 -mfma.
+#endif
+
+// high/low part of int64_t
+#define LO32(a) ((uint32_t)(a))
+#define HI32(a) ((int32_t)((a) >> 32))
+
+int AudioSRC::multirateFilter1_AVX2(const float* input0, float* output0, int inputFrames) {
+    int outputFrames = 0;
+
+    assert(_numTaps % 8 == 0);  // SIMD8
+
+    if (_step == 0) {   // rational
+
+        int32_t i = HI32(_offset);
+
+        while (i < inputFrames) {
+
+            const float* c0 = &_polyphaseFilter[_numTaps * _phase];
+
+            __m256 acc0 = _mm256_setzero_ps();
+
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                //float coef = c0[j];
+                __m256 coef0 = _mm256_loadu_ps(&c0[j]);
+
+                //acc += input[i + j] * coef;
+                acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
+            }
+
+            // horizontal sum
+            acc0 = _mm256_hadd_ps(acc0, acc0);
+            __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
+            t0 = _mm_add_ps(t0, _mm_movehdup_ps(t0));
+
+            _mm_store_ss(&output0[outputFrames], t0);
+            outputFrames += 1;
+
+            i += _stepTable[_phase];
+            if (++_phase == _upFactor) {
+                _phase = 0;
+            }
+        }
+        _offset = (int64_t)(i - inputFrames) << 32;
+
+    } else {    // irrational
+
+        while (HI32(_offset) < inputFrames) {
+
+            int32_t i = HI32(_offset);
+            uint32_t f = LO32(_offset);
+
+            uint32_t phase = f >> SRC_FRACBITS;
+            float ftmp = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT;
+
+            const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)];
+            const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)];
+
+            __m256 acc0 = _mm256_setzero_ps();
+            __m256 frac = _mm256_broadcast_ss(&ftmp);
+
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                //float coef = c0[j] + frac * (c1[j] - c0[j]);
+                __m256 coef0 = _mm256_loadu_ps(&c0[j]);
+                __m256 coef1 = _mm256_loadu_ps(&c1[j]);
+                coef1 = _mm256_sub_ps(coef1, coef0);
+                coef0 = _mm256_fmadd_ps(coef1, frac, coef0);
+
+                //acc += input[i + j] * coef;
+                acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
+            }
+
+            // horizontal sum
+            acc0 = _mm256_hadd_ps(acc0, acc0);
+            __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
+            t0 = _mm_add_ps(t0, _mm_movehdup_ps(t0));
+
+            _mm_store_ss(&output0[outputFrames], t0);
+            outputFrames += 1;
+
+            _offset += _step;
+        }
+        _offset -= (int64_t)inputFrames << 32;
+    }
+    _mm256_zeroupper();
+
+    return outputFrames;
+}
+
+int AudioSRC::multirateFilter2_AVX2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) {
+    int outputFrames = 0;
+
+    assert(_numTaps % 8 == 0);  // SIMD8
+
+    if (_step == 0) {   // rational
+
+        int32_t i = HI32(_offset);
+
+        while (i < inputFrames) {
+
+            const float* c0 = &_polyphaseFilter[_numTaps * _phase];
+
+            __m256 acc0 = _mm256_setzero_ps();
+            __m256 acc1 = _mm256_setzero_ps();
+
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                //float coef = c0[j];
+                __m256 coef0 = _mm256_loadu_ps(&c0[j]);
+
+                //acc += input[i + j] * coef;
+                acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
+                acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1);
+            }
+
+            // horizontal sum
+            acc0 = _mm256_hadd_ps(acc0, acc1);
+            __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
+            t0 = _mm_add_ps(t0, _mm_movehdup_ps(t0));
+
+            _mm_store_ss(&output0[outputFrames], t0);
+            _mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2)));
+            outputFrames += 1;
+
+            i += _stepTable[_phase];
+            if (++_phase == _upFactor) {
+                _phase = 0;
+            }
+        }
+        _offset = (int64_t)(i - inputFrames) << 32;
+
+    } else {    // irrational
+
+        while (HI32(_offset) < inputFrames) {
+
+            int32_t i = HI32(_offset);
+            uint32_t f = LO32(_offset);
+
+            uint32_t phase = f >> SRC_FRACBITS;
+            float ftmp = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT;
+
+            const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)];
+            const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)];
+
+            __m256 acc0 = _mm256_setzero_ps();
+            __m256 acc1 = _mm256_setzero_ps();
+            __m256 frac = _mm256_broadcast_ss(&ftmp);
+
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                //float coef = c0[j] + frac * (c1[j] - c0[j]);
+                __m256 coef0 = _mm256_loadu_ps(&c0[j]);
+                __m256 coef1 = _mm256_loadu_ps(&c1[j]);
+                coef1 = _mm256_sub_ps(coef1, coef0);
+                coef0 = _mm256_fmadd_ps(coef1, frac, coef0);
+
+                //acc += input[i + j] * coef;
+                acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
+                acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1);
+            }
+
+            // horizontal sum
+            acc0 = _mm256_hadd_ps(acc0, acc1);
+            __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
+            t0 = _mm_add_ps(t0, _mm_movehdup_ps(t0));
+
+            _mm_store_ss(&output0[outputFrames], t0);
+            _mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2)));
+            outputFrames += 1;
+
+            _offset += _step;
+        }
+        _offset -= (int64_t)inputFrames << 32;
+    }
+    _mm256_zeroupper();
+
+    return outputFrames;
+}
+
+#endif
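Note: each AVX2 inner loop above vectorizes, eight taps at a time, the scalar formula given in the inline comments: blend two adjacent polyphase coefficient sets by the fractional phase, then accumulate the FIR dot product. The sketch below is a minimal scalar reference for the irrational-rate path, assuming the same 32.32 fixed-point offset split (HI32 = integer input index, LO32 = fractional position) and that QFRAC_TO_FLOAT scales by 1/2^SRC_FRACBITS; the standalone names (resampleOneScalar, fracBits, polyphaseFilter, ...) are hypothetical stand-ins for the AudioSRC members, not part of the library.

    // Scalar sketch (assumed names, not the AudioSRC API): one output sample
    // of the irrational-rate path.
    #include <cstdint>

    static float resampleOneScalar(const float* input, const float* polyphaseFilter,
                                   int numTaps, int fracBits, int64_t offset) {
        int32_t  i = (int32_t)(offset >> 32);   // HI32: integer input index
        uint32_t f = (uint32_t)offset;          // LO32: fractional position

        uint32_t phase = f >> fracBits;         // selects two adjacent filter phases
        float frac = (f & ((1u << fracBits) - 1)) / (float)(1u << fracBits);

        const float* c0 = &polyphaseFilter[numTaps * (phase + 0)];
        const float* c1 = &polyphaseFilter[numTaps * (phase + 1)];

        float acc = 0.0f;
        for (int j = 0; j < numTaps; j++) {
            float coef = c0[j] + frac * (c1[j] - c0[j]);    // interpolate coefficients
            acc += input[i + j] * coef;                     // FIR dot product
        }
        return acc;
    }

The hadd/extract/movehdup sequence at the end of each loop reduces the 8-lane accumulators to scalars. In the stereo functions, _mm256_hadd_ps(acc0, acc1) interleaves both channels, so after the 128-bit add and movehdup, lane 0 holds the channel-0 sum and lane 2 the channel-1 sum; the final _mm_shuffle_ps moves lane 2 into position for _mm_store_ss.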