From b4994a3d89a8283dcaae6a5612fc0508b512c695 Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Wed, 14 Dec 2016 15:18:41 -0800 Subject: [PATCH] Ambisonic resampler, optimized using AVX2. int16_t and float versions. Removed unused SSE2 version. Removed unused getExactInput(). Refactored to remove nested #ifdefs. --- libraries/audio/src/AudioSRC.cpp | 1321 ++++++++++++-------- libraries/audio/src/AudioSRC.h | 14 +- libraries/audio/src/Sound.cpp | 47 +- libraries/audio/src/avx2/AudioSRC_avx2.cpp | 105 ++ 4 files changed, 933 insertions(+), 554 deletions(-) diff --git a/libraries/audio/src/AudioSRC.cpp b/libraries/audio/src/AudioSRC.cpp index 22e51200d1..3cd7a53b6b 100644 --- a/libraries/audio/src/AudioSRC.cpp +++ b/libraries/audio/src/AudioSRC.cpp @@ -10,12 +10,19 @@ // #include +#include #include -#include #include "AudioSRC.h" #include "AudioSRCData.h" +#ifndef MAX +#define MAX(a,b) (((a) > (b)) ? (a) : (b)) +#endif +#ifndef MIN +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) +#endif + // high/low part of int64_t #define LO32(a) ((uint32_t)(a)) #define HI32(a) ((int32_t)((a) >> 32)) @@ -194,182 +201,224 @@ int AudioSRC::createIrrationalFilter(int upFactor, int downFactor, float gain) { } // -// on x86 architecture, assume that SSE2 is present +// scalar reference versions // +int AudioSRC::multirateFilter1_ref(const float* input0, float* output0, int inputFrames) { + int outputFrames = 0; + + if (_step == 0) { // rational + + int32_t i = HI32(_offset); + + while (i < inputFrames) { + + const float* c0 = &_polyphaseFilter[_numTaps * _phase]; + + float acc0 = 0.0f; + + for (int j = 0; j < _numTaps; j++) { + + float coef = c0[j]; + + acc0 += input0[i + j] * coef; + } + + output0[outputFrames] = acc0; + outputFrames += 1; + + i += _stepTable[_phase]; + if (++_phase == _upFactor) { + _phase = 0; + } + } + _offset = (int64_t)(i - inputFrames) << 32; + + } else { // irrational + + while (HI32(_offset) < inputFrames) { + + int32_t i = HI32(_offset); + uint32_t f = LO32(_offset); + + uint32_t phase = f >> SRC_FRACBITS; + float frac = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT; + + const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; + const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; + + float acc0 = 0.0f; + + for (int j = 0; j < _numTaps; j++) { + + float coef = c0[j] + frac * (c1[j] - c0[j]); + + acc0 += input0[i + j] * coef; + } + + output0[outputFrames] = acc0; + outputFrames += 1; + + _offset += _step; + } + _offset -= (int64_t)inputFrames << 32; + } + + return outputFrames; +} + +int AudioSRC::multirateFilter2_ref(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) { + int outputFrames = 0; + + if (_step == 0) { // rational + + int32_t i = HI32(_offset); + + while (i < inputFrames) { + + const float* c0 = &_polyphaseFilter[_numTaps * _phase]; + + float acc0 = 0.0f; + float acc1 = 0.0f; + + for (int j = 0; j < _numTaps; j++) { + + float coef = c0[j]; + + acc0 += input0[i + j] * coef; + acc1 += input1[i + j] * coef; + } + + output0[outputFrames] = acc0; + output1[outputFrames] = acc1; + outputFrames += 1; + + i += _stepTable[_phase]; + if (++_phase == _upFactor) { + _phase = 0; + } + } + _offset = (int64_t)(i - inputFrames) << 32; + + } else { // irrational + + while (HI32(_offset) < inputFrames) { + + int32_t i = HI32(_offset); + uint32_t f = LO32(_offset); + + uint32_t phase = f >> SRC_FRACBITS; + float frac = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT; + + const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; + const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; + + float acc0 = 0.0f; + float acc1 = 0.0f; + + for (int j = 0; j < _numTaps; j++) { + + float coef = c0[j] + frac * (c1[j] - c0[j]); + + acc0 += input0[i + j] * coef; + acc1 += input1[i + j] * coef; + } + + output0[outputFrames] = acc0; + output1[outputFrames] = acc1; + outputFrames += 1; + + _offset += _step; + } + _offset -= (int64_t)inputFrames << 32; + } + + return outputFrames; +} + +int AudioSRC::multirateFilter4_ref(const float* input0, const float* input1, const float* input2, const float* input3, + float* output0, float* output1, float* output2, float* output3, int inputFrames) { + int outputFrames = 0; + + if (_step == 0) { // rational + + int32_t i = HI32(_offset); + + while (i < inputFrames) { + + const float* c0 = &_polyphaseFilter[_numTaps * _phase]; + + float acc0 = 0.0f; + float acc1 = 0.0f; + float acc2 = 0.0f; + float acc3 = 0.0f; + + for (int j = 0; j < _numTaps; j++) { + + float coef = c0[j]; + + acc0 += input0[i + j] * coef; + acc1 += input1[i + j] * coef; + acc2 += input2[i + j] * coef; + acc3 += input3[i + j] * coef; + } + + output0[outputFrames] = acc0; + output1[outputFrames] = acc1; + output2[outputFrames] = acc2; + output3[outputFrames] = acc3; + outputFrames += 1; + + i += _stepTable[_phase]; + if (++_phase == _upFactor) { + _phase = 0; + } + } + _offset = (int64_t)(i - inputFrames) << 32; + + } else { // irrational + + while (HI32(_offset) < inputFrames) { + + int32_t i = HI32(_offset); + uint32_t f = LO32(_offset); + + uint32_t phase = f >> SRC_FRACBITS; + float frac = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT; + + const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; + const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; + + float acc0 = 0.0f; + float acc1 = 0.0f; + float acc2 = 0.0f; + float acc3 = 0.0f; + + for (int j = 0; j < _numTaps; j++) { + + float coef = c0[j] + frac * (c1[j] - c0[j]); + + acc0 += input0[i + j] * coef; + acc1 += input1[i + j] * coef; + acc2 += input2[i + j] * coef; + acc3 += input3[i + j] * coef; + } + + output0[outputFrames] = acc0; + output1[outputFrames] = acc1; + output2[outputFrames] = acc2; + output3[outputFrames] = acc3; + outputFrames += 1; + + _offset += _step; + } + _offset -= (int64_t)inputFrames << 32; + } + + return outputFrames; +} + #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) -#include - -int AudioSRC::multirateFilter1_SSE(const float* input0, float* output0, int inputFrames) { - int outputFrames = 0; - - assert(_numTaps % 4 == 0); // SIMD4 - - if (_step == 0) { // rational - - int32_t i = HI32(_offset); - - while (i < inputFrames) { - - const float* c0 = &_polyphaseFilter[_numTaps * _phase]; - - __m128 acc0 = _mm_setzero_ps(); - - for (int j = 0; j < _numTaps; j += 4) { - - //float coef = c0[j]; - __m128 coef0 = _mm_loadu_ps(&c0[j]); - - //acc += input[i + j] * coef; - acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input0[i + j]), coef0), acc0); - } - - // horizontal sum - acc0 = _mm_add_ps(acc0, _mm_movehl_ps(acc0, acc0)); - acc0 = _mm_add_ss(acc0, _mm_shuffle_ps(acc0, acc0, _MM_SHUFFLE(0,0,0,1))); - - _mm_store_ss(&output0[outputFrames], acc0); - outputFrames += 1; - - i += _stepTable[_phase]; - if (++_phase == _upFactor) { - _phase = 0; - } - } - _offset = (int64_t)(i - inputFrames) << 32; - - } else { // irrational - - while (HI32(_offset) < inputFrames) { - - int32_t i = HI32(_offset); - uint32_t f = LO32(_offset); - - uint32_t phase = f >> SRC_FRACBITS; - __m128 frac = _mm_set1_ps((f & SRC_FRACMASK) * QFRAC_TO_FLOAT); - - const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; - const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; - - __m128 acc0 = _mm_setzero_ps(); - - for (int j = 0; j < _numTaps; j += 4) { - - //float coef = c0[j] + frac * (c1[j] - c0[j]); - __m128 coef0 = _mm_loadu_ps(&c0[j]); - __m128 coef1 = _mm_loadu_ps(&c1[j]); - coef1 = _mm_sub_ps(coef1, coef0); - coef0 = _mm_add_ps(_mm_mul_ps(coef1, frac), coef0); - - //acc += input[i + j] * coef; - acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input0[i + j]), coef0), acc0); - } - - // horizontal sum - acc0 = _mm_add_ps(acc0, _mm_movehl_ps(acc0, acc0)); - acc0 = _mm_add_ss(acc0, _mm_shuffle_ps(acc0, acc0, _MM_SHUFFLE(0,0,0,1))); - - _mm_store_ss(&output0[outputFrames], acc0); - outputFrames += 1; - - _offset += _step; - } - _offset -= (int64_t)inputFrames << 32; - } - - return outputFrames; -} - -int AudioSRC::multirateFilter2_SSE(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) { - int outputFrames = 0; - - assert(_numTaps % 4 == 0); // SIMD4 - - if (_step == 0) { // rational - - int32_t i = HI32(_offset); - - while (i < inputFrames) { - - const float* c0 = &_polyphaseFilter[_numTaps * _phase]; - - __m128 acc0 = _mm_setzero_ps(); - __m128 acc1 = _mm_setzero_ps(); - - for (int j = 0; j < _numTaps; j += 4) { - - //float coef = c0[j]; - __m128 coef0 = _mm_loadu_ps(&c0[j]); - - //acc += input[i + j] * coef; - acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input0[i + j]), coef0), acc0); - acc1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input1[i + j]), coef0), acc1); - } - - // horizontal sum - acc0 = _mm_add_ps(acc0, _mm_movehl_ps(acc0, acc0)); - acc1 = _mm_add_ps(acc1, _mm_movehl_ps(acc1, acc1)); - acc0 = _mm_add_ss(acc0, _mm_shuffle_ps(acc0, acc0, _MM_SHUFFLE(0,0,0,1))); - acc1 = _mm_add_ss(acc1, _mm_shuffle_ps(acc1, acc1, _MM_SHUFFLE(0,0,0,1))); - - _mm_store_ss(&output0[outputFrames], acc0); - _mm_store_ss(&output1[outputFrames], acc1); - outputFrames += 1; - - i += _stepTable[_phase]; - if (++_phase == _upFactor) { - _phase = 0; - } - } - _offset = (int64_t)(i - inputFrames) << 32; - - } else { // irrational - - while (HI32(_offset) < inputFrames) { - - int32_t i = HI32(_offset); - uint32_t f = LO32(_offset); - - uint32_t phase = f >> SRC_FRACBITS; - __m128 frac = _mm_set1_ps((f & SRC_FRACMASK) * QFRAC_TO_FLOAT); - - const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; - const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; - - __m128 acc0 = _mm_setzero_ps(); - __m128 acc1 = _mm_setzero_ps(); - - for (int j = 0; j < _numTaps; j += 4) { - - //float coef = c0[j] + frac * (c1[j] - c0[j]); - __m128 coef0 = _mm_loadu_ps(&c0[j]); - __m128 coef1 = _mm_loadu_ps(&c1[j]); - coef1 = _mm_sub_ps(coef1, coef0); - coef0 = _mm_add_ps(_mm_mul_ps(coef1, frac), coef0); - - //acc += input[i + j] * coef; - acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input0[i + j]), coef0), acc0); - acc1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input1[i + j]), coef0), acc1); - } - - // horizontal sum - acc0 = _mm_add_ps(acc0, _mm_movehl_ps(acc0, acc0)); - acc1 = _mm_add_ps(acc1, _mm_movehl_ps(acc1, acc1)); - acc0 = _mm_add_ss(acc0, _mm_shuffle_ps(acc0, acc0, _MM_SHUFFLE(0,0,0,1))); - acc1 = _mm_add_ss(acc1, _mm_shuffle_ps(acc1, acc1, _MM_SHUFFLE(0,0,0,1))); - - _mm_store_ss(&output0[outputFrames], acc0); - _mm_store_ss(&output1[outputFrames], acc1); - outputFrames += 1; - - _offset += _step; - } - _offset -= (int64_t)inputFrames << 32; - } - - return outputFrames; -} - // // Runtime CPU dispatch // @@ -377,223 +426,22 @@ int AudioSRC::multirateFilter2_SSE(const float* input0, const float* input1, flo #include "CPUDetect.h" int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFrames) { - - static auto f = cpuSupportsAVX2() ? &AudioSRC::multirateFilter1_AVX2 : &AudioSRC::multirateFilter1_SSE; + static auto f = cpuSupportsAVX2() ? &AudioSRC::multirateFilter1_AVX2 : &AudioSRC::multirateFilter1_ref; return (this->*f)(input0, output0, inputFrames); // dispatch } int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) { - - static auto f = cpuSupportsAVX2() ? &AudioSRC::multirateFilter2_AVX2 : &AudioSRC::multirateFilter2_SSE; + static auto f = cpuSupportsAVX2() ? &AudioSRC::multirateFilter2_AVX2 : &AudioSRC::multirateFilter2_ref; return (this->*f)(input0, input1, output0, output1, inputFrames); // dispatch } -// convert int16_t to float, deinterleave stereo -void AudioSRC::convertInput(const int16_t* input, float** outputs, int numFrames) { - __m128 scale = _mm_set1_ps(1/32768.0f); - - if (_numChannels == 1) { - - int i = 0; - for (; i < numFrames - 3; i += 4) { - __m128i a0 = _mm_loadl_epi64((__m128i*)&input[i]); - - // sign-extend - a0 = _mm_srai_epi32(_mm_unpacklo_epi16(a0, a0), 16); - - __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); - - _mm_storeu_ps(&outputs[0][i], f0); - } - for (; i < numFrames; i++) { - __m128i a0 = _mm_insert_epi16(_mm_setzero_si128(), input[i], 0); - - // sign-extend - a0 = _mm_srai_epi32(_mm_unpacklo_epi16(a0, a0), 16); - - __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); - - _mm_store_ss(&outputs[0][i], f0); - } - - } else if (_numChannels == 2) { - - int i = 0; - for (; i < numFrames - 3; i += 4) { - __m128i a0 = _mm_loadu_si128((__m128i*)&input[2*i]); - __m128i a1 = a0; - - // deinterleave and sign-extend - a0 = _mm_madd_epi16(a0, _mm_set1_epi32(0x00000001)); - a1 = _mm_madd_epi16(a1, _mm_set1_epi32(0x00010000)); - - __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); - __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(a1), scale); - - _mm_storeu_ps(&outputs[0][i], f0); - _mm_storeu_ps(&outputs[1][i], f1); - } - for (; i < numFrames; i++) { - __m128i a0 = _mm_cvtsi32_si128(*(int32_t*)&input[2*i]); - __m128i a1 = a0; - - // deinterleave and sign-extend - a0 = _mm_madd_epi16(a0, _mm_set1_epi32(0x00000001)); - a1 = _mm_madd_epi16(a1, _mm_set1_epi32(0x00010000)); - - __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); - __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(a1), scale); - - _mm_store_ss(&outputs[0][i], f0); - _mm_store_ss(&outputs[1][i], f1); - } - } +int AudioSRC::multirateFilter4(const float* input0, const float* input1, const float* input2, const float* input3, + float* output0, float* output1, float* output2, float* output3, int inputFrames) { + static auto f = cpuSupportsAVX2() ? &AudioSRC::multirateFilter4_AVX2 : &AudioSRC::multirateFilter4_ref; + return (this->*f)(input0, input1, input2, input3, output0, output1, output2, output3, inputFrames); // dispatch } -// fast TPDF dither in [-1.0f, 1.0f] -static inline __m128 dither4() { - static __m128i rz; - - // update the 8 different maximum-length LCGs - rz = _mm_mullo_epi16(rz, _mm_set_epi16(25173, -25511, -5975, -23279, 19445, -27591, 30185, -3495)); - rz = _mm_add_epi16(rz, _mm_set_epi16(13849, -32767, 105, -19675, -7701, -32679, -13225, 28013)); - - // promote to 32-bit - __m128i r0 = _mm_unpacklo_epi16(rz, _mm_setzero_si128()); - __m128i r1 = _mm_unpackhi_epi16(rz, _mm_setzero_si128()); - - // return (r0 - r1) * (1/65536.0f); - __m128 d0 = _mm_cvtepi32_ps(_mm_sub_epi32(r0, r1)); - return _mm_mul_ps(d0, _mm_set1_ps(1/65536.0f)); -} - -// convert float to int16_t with dither, interleave stereo -void AudioSRC::convertOutput(float** inputs, int16_t* output, int numFrames) { - __m128 scale = _mm_set1_ps(32768.0f); - - if (_numChannels == 1) { - - int i = 0; - for (; i < numFrames - 3; i += 4) { - __m128 f0 = _mm_mul_ps(_mm_loadu_ps(&inputs[0][i]), scale); - - f0 = _mm_add_ps(f0, dither4()); - - // round and saturate - __m128i a0 = _mm_cvtps_epi32(f0); - a0 = _mm_packs_epi32(a0, a0); - - _mm_storel_epi64((__m128i*)&output[i], a0); - } - for (; i < numFrames; i++) { - __m128 f0 = _mm_mul_ps(_mm_load_ss(&inputs[0][i]), scale); - - f0 = _mm_add_ps(f0, dither4()); - - // round and saturate - __m128i a0 = _mm_cvtps_epi32(f0); - a0 = _mm_packs_epi32(a0, a0); - - output[i] = (int16_t)_mm_extract_epi16(a0, 0); - } - - } else if (_numChannels == 2) { - - int i = 0; - for (; i < numFrames - 3; i += 4) { - __m128 f0 = _mm_mul_ps(_mm_loadu_ps(&inputs[0][i]), scale); - __m128 f1 = _mm_mul_ps(_mm_loadu_ps(&inputs[1][i]), scale); - - __m128 d0 = dither4(); - f0 = _mm_add_ps(f0, d0); - f1 = _mm_add_ps(f1, d0); - - // round and saturate - __m128i a0 = _mm_cvtps_epi32(f0); - __m128i a1 = _mm_cvtps_epi32(f1); - a0 = _mm_packs_epi32(a0, a0); - a1 = _mm_packs_epi32(a1, a1); - - // interleave - a0 = _mm_unpacklo_epi16(a0, a1); - _mm_storeu_si128((__m128i*)&output[2*i], a0); - } - for (; i < numFrames; i++) { - __m128 f0 = _mm_mul_ps(_mm_load_ss(&inputs[0][i]), scale); - __m128 f1 = _mm_mul_ps(_mm_load_ss(&inputs[1][i]), scale); - - __m128 d0 = dither4(); - f0 = _mm_add_ps(f0, d0); - f1 = _mm_add_ps(f1, d0); - - // round and saturate - __m128i a0 = _mm_cvtps_epi32(f0); - __m128i a1 = _mm_cvtps_epi32(f1); - a0 = _mm_packs_epi32(a0, a0); - a1 = _mm_packs_epi32(a1, a1); - - // interleave - a0 = _mm_unpacklo_epi16(a0, a1); - *(int32_t*)&output[2*i] = _mm_cvtsi128_si32(a0); - } - } -} - -// deinterleave stereo -void AudioSRC::convertInput(const float* input, float** outputs, int numFrames) { - - if (_numChannels == 1) { - - memcpy(outputs[0], input, numFrames * sizeof(float)); - - } else if (_numChannels == 2) { - - int i = 0; - for (; i < numFrames - 3; i += 4) { - __m128 f0 = _mm_loadu_ps(&input[2*i + 0]); - __m128 f1 = _mm_loadu_ps(&input[2*i + 4]); - - // deinterleave - _mm_storeu_ps(&outputs[0][i], _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(2,0,2,0))); - _mm_storeu_ps(&outputs[1][i], _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(3,1,3,1))); - } - for (; i < numFrames; i++) { - // deinterleave - outputs[0][i] = input[2*i + 0]; - outputs[1][i] = input[2*i + 1]; - } - } -} - -// interleave stereo -void AudioSRC::convertOutput(float** inputs, float* output, int numFrames) { - - if (_numChannels == 1) { - - memcpy(output, inputs[0], numFrames * sizeof(float)); - - } else if (_numChannels == 2) { - - int i = 0; - for (; i < numFrames - 3; i += 4) { - __m128 f0 = _mm_loadu_ps(&inputs[0][i]); - __m128 f1 = _mm_loadu_ps(&inputs[1][i]); - - // interleave - _mm_storeu_ps(&output[2*i + 0], _mm_unpacklo_ps(f0, f1)); - _mm_storeu_ps(&output[2*i + 4], _mm_unpackhi_ps(f0, f1)); - } - for (; i < numFrames; i++) { - // interleave - output[2*i + 0] = inputs[0][i]; - output[2*i + 1] = inputs[1][i]; - } - } -} - -#else - -#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#elif defined(__ARM_NEON__) || defined(__ARM_NEON) #include @@ -714,6 +562,7 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* //acc += input[i + j] * coef; acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0); acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 0]), coef0); + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 4]), coef1); acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 4]), coef1); } @@ -766,6 +615,7 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* //acc += input[i + j] * coef; acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0); acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 0]), coef0); + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 4]), coef1); acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 4]), coef1); } @@ -787,11 +637,12 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* return outputFrames; } -#else - -int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFrames) { +int AudioSRC::multirateFilter4(const float* input0, const float* input1, const float* input2, const float* input3, + float* output0, float* output1, float* output2, float* output3, int inputFrames) { int outputFrames = 0; + assert(_numTaps % 8 == 0); // SIMD8 + if (_step == 0) { // rational int32_t i = HI32(_offset); @@ -800,16 +651,41 @@ int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFra const float* c0 = &_polyphaseFilter[_numTaps * _phase]; - float acc0 = 0.0f; + float32x4_t acc0 = vdupq_n_f32(0); + float32x4_t acc1 = vdupq_n_f32(0); + float32x4_t acc2 = vdupq_n_f32(0); + float32x4_t acc3 = vdupq_n_f32(0); - for (int j = 0; j < _numTaps; j++) { + for (int j = 0; j < _numTaps; j += 8) { - float coef = c0[j]; + //float coef = c0[j]; + float32x4_t coef0 = vld1q_f32(&c0[j + 0]); // aligned + float32x4_t coef1 = vld1q_f32(&c0[j + 4]); // aligned - acc0 += input0[i + j] * coef; + //acc += input[i + j] * coef; + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0); + acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 0]), coef0); + acc2 = vmlaq_f32(acc2, vld1q_f32(&input2[i + j + 0]), coef0); + acc3 = vmlaq_f32(acc3, vld1q_f32(&input3[i + j + 0]), coef0); + + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 4]), coef1); + acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 4]), coef1); + acc2 = vmlaq_f32(acc2, vld1q_f32(&input2[i + j + 4]), coef1); + acc3 = vmlaq_f32(acc3, vld1q_f32(&input3[i + j + 4]), coef1); } - output0[outputFrames] = acc0; + // horizontal sum + float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); + float32x2_t t1 = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)); + float32x2_t t2 = vadd_f32(vget_low_f32(acc2), vget_high_f32(acc2)); + float32x2_t t3 = vadd_f32(vget_low_f32(acc3), vget_high_f32(acc3)); + t0 = vpadd_f32(t0, t1); + t2 = vpadd_f32(t2, t3); + + vst1_lane_f32(&output0[outputFrames], t0, 0); + vst1_lane_f32(&output1[outputFrames], t0, 1); + vst1_lane_f32(&output2[outputFrames], t2, 0); + vst1_lane_f32(&output3[outputFrames], t2, 1); outputFrames += 1; i += _stepTable[_phase]; @@ -827,21 +703,53 @@ int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFra uint32_t f = LO32(_offset); uint32_t phase = f >> SRC_FRACBITS; - float frac = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT; + float32x4_t frac = vdupq_n_f32((f & SRC_FRACMASK) * QFRAC_TO_FLOAT); const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; - float acc0 = 0.0f; + float32x4_t acc0 = vdupq_n_f32(0); + float32x4_t acc1 = vdupq_n_f32(0); + float32x4_t acc2 = vdupq_n_f32(0); + float32x4_t acc3 = vdupq_n_f32(0); - for (int j = 0; j < _numTaps; j++) { + for (int j = 0; j < _numTaps; j += 8) { - float coef = c0[j] + frac * (c1[j] - c0[j]); + float32x4_t coef0 = vld1q_f32(&c0[j + 0]); // aligned + float32x4_t coef1 = vld1q_f32(&c0[j + 4]); // aligned + float32x4_t coef2 = vld1q_f32(&c1[j + 0]); // aligned + float32x4_t coef3 = vld1q_f32(&c1[j + 4]); // aligned - acc0 += input0[i + j] * coef; + //float coef = c0[j] + frac * (c1[j] - c0[j]); + coef2 = vsubq_f32(coef2, coef0); + coef3 = vsubq_f32(coef3, coef1); + coef0 = vmlaq_f32(coef0, coef2, frac); + coef1 = vmlaq_f32(coef1, coef3, frac); + + //acc += input[i + j] * coef; + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0); + acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 0]), coef0); + acc2 = vmlaq_f32(acc2, vld1q_f32(&input2[i + j + 0]), coef0); + acc3 = vmlaq_f32(acc3, vld1q_f32(&input3[i + j + 0]), coef0); + + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 4]), coef1); + acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 4]), coef1); + acc2 = vmlaq_f32(acc2, vld1q_f32(&input2[i + j + 4]), coef1); + acc3 = vmlaq_f32(acc3, vld1q_f32(&input3[i + j + 4]), coef1); } - output0[outputFrames] = acc0; + // horizontal sum + float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); + float32x2_t t1 = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)); + float32x2_t t2 = vadd_f32(vget_low_f32(acc2), vget_high_f32(acc2)); + float32x2_t t3 = vadd_f32(vget_low_f32(acc3), vget_high_f32(acc3)); + t0 = vpadd_f32(t0, t1); + t2 = vpadd_f32(t2, t3); + + vst1_lane_f32(&output0[outputFrames], t0, 0); + vst1_lane_f32(&output1[outputFrames], t0, 1); + vst1_lane_f32(&output2[outputFrames], t2, 0); + vst1_lane_f32(&output3[outputFrames], t2, 1); outputFrames += 1; _offset += _step; @@ -852,77 +760,409 @@ int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFra return outputFrames; } +#else // portable reference code + +int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFrames) { + return multirateFilter1_ref(input0, output0, inputFrames); +} + int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) { - int outputFrames = 0; + return multirateFilter2_ref(input0, input1, output0, output1, inputFrames); +} - if (_step == 0) { // rational - - int32_t i = HI32(_offset); - - while (i < inputFrames) { - - const float* c0 = &_polyphaseFilter[_numTaps * _phase]; - - float acc0 = 0.0f; - float acc1 = 0.0f; - - for (int j = 0; j < _numTaps; j++) { - - float coef = c0[j]; - - acc0 += input0[i + j] * coef; - acc1 += input1[i + j] * coef; - } - - output0[outputFrames] = acc0; - output1[outputFrames] = acc1; - outputFrames += 1; - - i += _stepTable[_phase]; - if (++_phase == _upFactor) { - _phase = 0; - } - } - _offset = (int64_t)(i - inputFrames) << 32; - - } else { // irrational - - while (HI32(_offset) < inputFrames) { - - int32_t i = HI32(_offset); - uint32_t f = LO32(_offset); - - uint32_t phase = f >> SRC_FRACBITS; - float frac = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT; - - const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; - const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; - - float acc0 = 0.0f; - float acc1 = 0.0f; - - for (int j = 0; j < _numTaps; j++) { - - float coef = c0[j] + frac * (c1[j] - c0[j]); - - acc0 += input0[i + j] * coef; - acc1 += input1[i + j] * coef; - } - - output0[outputFrames] = acc0; - output1[outputFrames] = acc1; - outputFrames += 1; - - _offset += _step; - } - _offset -= (int64_t)inputFrames << 32; - } - - return outputFrames; +int AudioSRC::multirateFilter4(const float* input0, const float* input1, const float* input2, const float* input3, + float* output0, float* output1, float* output2, float* output3, int inputFrames) { + return multirateFilter4_ref(input0, input1, input2, input3, output0, output1, output2, output3, inputFrames); } #endif +// +// on x86 architecture, assume that SSE2 is present +// +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) + +#include // SSE2 + +// convert int16_t to float, deinterleave stereo +void AudioSRC::convertInput(const int16_t* input, float** outputs, int numFrames) { + __m128 scale = _mm_set1_ps(1/32768.0f); + + if (_numChannels == 1) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128i a0 = _mm_loadl_epi64((__m128i*)&input[i]); + + // sign-extend + a0 = _mm_srai_epi32(_mm_unpacklo_epi16(a0, a0), 16); + + __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); + + _mm_storeu_ps(&outputs[0][i], f0); + } + for (; i < numFrames; i++) { + __m128i a0 = _mm_insert_epi16(_mm_setzero_si128(), input[i], 0); + + // sign-extend + a0 = _mm_srai_epi32(_mm_unpacklo_epi16(a0, a0), 16); + + __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); + + _mm_store_ss(&outputs[0][i], f0); + } + + } else if (_numChannels == 2) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128i a0 = _mm_loadu_si128((__m128i*)&input[2*i]); + __m128i a1 = a0; + + // deinterleave and sign-extend + a0 = _mm_madd_epi16(a0, _mm_set1_epi32(0x00000001)); + a1 = _mm_madd_epi16(a1, _mm_set1_epi32(0x00010000)); + + __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); + __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(a1), scale); + + _mm_storeu_ps(&outputs[0][i], f0); + _mm_storeu_ps(&outputs[1][i], f1); + } + for (; i < numFrames; i++) { + __m128i a0 = _mm_cvtsi32_si128(*(int32_t*)&input[2*i]); + __m128i a1 = a0; + + // deinterleave and sign-extend + a0 = _mm_madd_epi16(a0, _mm_set1_epi32(0x00000001)); + a1 = _mm_madd_epi16(a1, _mm_set1_epi32(0x00010000)); + + __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); + __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(a1), scale); + + _mm_store_ss(&outputs[0][i], f0); + _mm_store_ss(&outputs[1][i], f1); + } + } else if (_numChannels == 4) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128i a0 = _mm_loadu_si128((__m128i*)&input[4*i+0]); + __m128i a1 = a0; + __m128i a2 = _mm_loadu_si128((__m128i*)&input[4*i+8]); + __m128i a3 = a2; + + // deinterleave and sign-extend + a0 = _mm_madd_epi16(a0, _mm_set1_epi32(0x00000001)); + a1 = _mm_madd_epi16(a1, _mm_set1_epi32(0x00010000)); + a2 = _mm_madd_epi16(a2, _mm_set1_epi32(0x00000001)); + a3 = _mm_madd_epi16(a3, _mm_set1_epi32(0x00010000)); + + __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); + __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(a1), scale); + __m128 f2 = _mm_mul_ps(_mm_cvtepi32_ps(a2), scale); + __m128 f3 = _mm_mul_ps(_mm_cvtepi32_ps(a3), scale); + + _mm_storeu_ps(&outputs[0][i], _mm_shuffle_ps(f0, f2, _MM_SHUFFLE(2,0,2,0))); + _mm_storeu_ps(&outputs[1][i], _mm_shuffle_ps(f1, f3, _MM_SHUFFLE(2,0,2,0))); + _mm_storeu_ps(&outputs[2][i], _mm_shuffle_ps(f0, f2, _MM_SHUFFLE(3,1,3,1))); + _mm_storeu_ps(&outputs[3][i], _mm_shuffle_ps(f1, f3, _MM_SHUFFLE(3,1,3,1))); + } + for (; i < numFrames; i++) { + __m128i a0 = _mm_cvtsi32_si128(*(int32_t*)&input[4*i+0]); + __m128i a1 = a0; + __m128i a2 = _mm_cvtsi32_si128(*(int32_t*)&input[4*i+2]); + __m128i a3 = a2; + + // deinterleave and sign-extend + a0 = _mm_madd_epi16(a0, _mm_set1_epi32(0x00000001)); + a1 = _mm_madd_epi16(a1, _mm_set1_epi32(0x00010000)); + a2 = _mm_madd_epi16(a2, _mm_set1_epi32(0x00000001)); + a3 = _mm_madd_epi16(a3, _mm_set1_epi32(0x00010000)); + + __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(a0), scale); + __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(a1), scale); + __m128 f2 = _mm_mul_ps(_mm_cvtepi32_ps(a2), scale); + __m128 f3 = _mm_mul_ps(_mm_cvtepi32_ps(a3), scale); + + _mm_store_ss(&outputs[0][i], f0); + _mm_store_ss(&outputs[1][i], f1); + _mm_store_ss(&outputs[2][i], f2); + _mm_store_ss(&outputs[3][i], f3); + } + } +} + +// fast TPDF dither in [-1.0f, 1.0f] +static inline __m128 dither4() { + static __m128i rz; + + // update the 8 different maximum-length LCGs + rz = _mm_mullo_epi16(rz, _mm_set_epi16(25173, -25511, -5975, -23279, 19445, -27591, 30185, -3495)); + rz = _mm_add_epi16(rz, _mm_set_epi16(13849, -32767, 105, -19675, -7701, -32679, -13225, 28013)); + + // promote to 32-bit + __m128i r0 = _mm_unpacklo_epi16(rz, _mm_setzero_si128()); + __m128i r1 = _mm_unpackhi_epi16(rz, _mm_setzero_si128()); + + // return (r0 - r1) * (1/65536.0f); + __m128 d0 = _mm_cvtepi32_ps(_mm_sub_epi32(r0, r1)); + return _mm_mul_ps(d0, _mm_set1_ps(1/65536.0f)); +} + +// convert float to int16_t with dither, interleave stereo +void AudioSRC::convertOutput(float** inputs, int16_t* output, int numFrames) { + __m128 scale = _mm_set1_ps(32768.0f); + + if (_numChannels == 1) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128 f0 = _mm_mul_ps(_mm_loadu_ps(&inputs[0][i]), scale); + + f0 = _mm_add_ps(f0, dither4()); + + // round and saturate + __m128i a0 = _mm_cvtps_epi32(f0); + a0 = _mm_packs_epi32(a0, a0); + + _mm_storel_epi64((__m128i*)&output[i], a0); + } + for (; i < numFrames; i++) { + __m128 f0 = _mm_mul_ps(_mm_load_ss(&inputs[0][i]), scale); + + f0 = _mm_add_ps(f0, dither4()); + + // round and saturate + __m128i a0 = _mm_cvtps_epi32(f0); + a0 = _mm_packs_epi32(a0, a0); + + output[i] = (int16_t)_mm_extract_epi16(a0, 0); + } + + } else if (_numChannels == 2) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128 f0 = _mm_mul_ps(_mm_loadu_ps(&inputs[0][i]), scale); + __m128 f1 = _mm_mul_ps(_mm_loadu_ps(&inputs[1][i]), scale); + + __m128 d0 = dither4(); + f0 = _mm_add_ps(f0, d0); + f1 = _mm_add_ps(f1, d0); + + // round and saturate + __m128i a0 = _mm_cvtps_epi32(f0); + __m128i a1 = _mm_cvtps_epi32(f1); + a0 = _mm_packs_epi32(a0, a0); + a1 = _mm_packs_epi32(a1, a1); + + // interleave + a0 = _mm_unpacklo_epi16(a0, a1); + _mm_storeu_si128((__m128i*)&output[2*i], a0); + } + for (; i < numFrames; i++) { + __m128 f0 = _mm_mul_ps(_mm_load_ss(&inputs[0][i]), scale); + __m128 f1 = _mm_mul_ps(_mm_load_ss(&inputs[1][i]), scale); + + __m128 d0 = dither4(); + f0 = _mm_add_ps(f0, d0); + f1 = _mm_add_ps(f1, d0); + + // round and saturate + __m128i a0 = _mm_cvtps_epi32(f0); + __m128i a1 = _mm_cvtps_epi32(f1); + a0 = _mm_packs_epi32(a0, a0); + a1 = _mm_packs_epi32(a1, a1); + + // interleave + a0 = _mm_unpacklo_epi16(a0, a1); + *(int32_t*)&output[2*i] = _mm_cvtsi128_si32(a0); + } + + } else if (_numChannels == 4) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128 f0 = _mm_mul_ps(_mm_loadu_ps(&inputs[0][i]), scale); + __m128 f1 = _mm_mul_ps(_mm_loadu_ps(&inputs[1][i]), scale); + __m128 f2 = _mm_mul_ps(_mm_loadu_ps(&inputs[2][i]), scale); + __m128 f3 = _mm_mul_ps(_mm_loadu_ps(&inputs[3][i]), scale); + + __m128 d0 = dither4(); + f0 = _mm_add_ps(f0, d0); + f1 = _mm_add_ps(f1, d0); + f2 = _mm_add_ps(f2, d0); + f3 = _mm_add_ps(f3, d0); + + // round and saturate + __m128i a0 = _mm_cvtps_epi32(f0); + __m128i a1 = _mm_cvtps_epi32(f1); + __m128i a2 = _mm_cvtps_epi32(f2); + __m128i a3 = _mm_cvtps_epi32(f3); + a0 = _mm_packs_epi32(a0, a2); + a1 = _mm_packs_epi32(a1, a3); + + // interleave + a2 = _mm_unpacklo_epi16(a0, a1); + a3 = _mm_unpackhi_epi16(a0, a1); + a0 = _mm_unpacklo_epi32(a2, a3); + a1 = _mm_unpackhi_epi32(a2, a3); + + _mm_storeu_si128((__m128i*)&output[4*i+0], a0); + _mm_storeu_si128((__m128i*)&output[4*i+8], a1); + } + for (; i < numFrames; i++) { + __m128 f0 = _mm_mul_ps(_mm_load_ss(&inputs[0][i]), scale); + __m128 f1 = _mm_mul_ps(_mm_load_ss(&inputs[1][i]), scale); + __m128 f2 = _mm_mul_ps(_mm_load_ss(&inputs[2][i]), scale); + __m128 f3 = _mm_mul_ps(_mm_load_ss(&inputs[3][i]), scale); + + __m128 d0 = dither4(); + f0 = _mm_add_ps(f0, d0); + f1 = _mm_add_ps(f1, d0); + f2 = _mm_add_ps(f2, d0); + f3 = _mm_add_ps(f3, d0); + + // round and saturate + __m128i a0 = _mm_cvtps_epi32(f0); + __m128i a1 = _mm_cvtps_epi32(f1); + __m128i a2 = _mm_cvtps_epi32(f2); + __m128i a3 = _mm_cvtps_epi32(f3); + a0 = _mm_packs_epi32(a0, a2); + a1 = _mm_packs_epi32(a1, a3); + + // interleave + a2 = _mm_unpacklo_epi16(a0, a1); + a3 = _mm_unpackhi_epi16(a0, a1); + a0 = _mm_unpacklo_epi32(a2, a3); + a1 = _mm_unpackhi_epi32(a2, a3); + + _mm_storel_epi64((__m128i*)&output[4*i], a0); + } + } +} + +// deinterleave stereo +void AudioSRC::convertInput(const float* input, float** outputs, int numFrames) { + + if (_numChannels == 1) { + + memcpy(outputs[0], input, numFrames * sizeof(float)); + + } else if (_numChannels == 2) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128 f0 = _mm_loadu_ps(&input[2*i + 0]); + __m128 f1 = _mm_loadu_ps(&input[2*i + 4]); + + // deinterleave + _mm_storeu_ps(&outputs[0][i], _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(2,0,2,0))); + _mm_storeu_ps(&outputs[1][i], _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(3,1,3,1))); + } + for (; i < numFrames; i++) { + // deinterleave + outputs[0][i] = input[2*i + 0]; + outputs[1][i] = input[2*i + 1]; + } + + } else if (_numChannels == 4) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128 f0 = _mm_loadu_ps(&input[4*i + 0]); + __m128 f1 = _mm_loadu_ps(&input[4*i + 4]); + __m128 f2 = _mm_loadu_ps(&input[4*i + 8]); + __m128 f3 = _mm_loadu_ps(&input[4*i + 12]); + + // deinterleave + __m128 t0 = _mm_unpacklo_ps(f0, f1); + __m128 t2 = _mm_unpacklo_ps(f2, f3); + __m128 t1 = _mm_unpackhi_ps(f0, f1); + __m128 t3 = _mm_unpackhi_ps(f2, f3); + + f0 = _mm_movelh_ps(t0, t2); + f1 = _mm_movehl_ps(t2, t0); + f2 = _mm_movelh_ps(t1, t3); + f3 = _mm_movehl_ps(t3, t1); + + _mm_storeu_ps(&outputs[0][i], f0); + _mm_storeu_ps(&outputs[1][i], f1); + _mm_storeu_ps(&outputs[2][i], f2); + _mm_storeu_ps(&outputs[3][i], f3); + } + for (; i < numFrames; i++) { + // deinterleave + outputs[0][i] = input[4*i + 0]; + outputs[1][i] = input[4*i + 1]; + outputs[2][i] = input[4*i + 2]; + outputs[3][i] = input[4*i + 3]; + } + } +} + +// interleave stereo +void AudioSRC::convertOutput(float** inputs, float* output, int numFrames) { + + if (_numChannels == 1) { + + memcpy(output, inputs[0], numFrames * sizeof(float)); + + } else if (_numChannels == 2) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128 f0 = _mm_loadu_ps(&inputs[0][i]); + __m128 f1 = _mm_loadu_ps(&inputs[1][i]); + + // interleave + _mm_storeu_ps(&output[2*i + 0], _mm_unpacklo_ps(f0, f1)); + _mm_storeu_ps(&output[2*i + 4], _mm_unpackhi_ps(f0, f1)); + } + for (; i < numFrames; i++) { + // interleave + output[2*i + 0] = inputs[0][i]; + output[2*i + 1] = inputs[1][i]; + } + + } else if (_numChannels == 4) { + + int i = 0; + for (; i < numFrames - 3; i += 4) { + __m128 f0 = _mm_loadu_ps(&inputs[0][i]); + __m128 f1 = _mm_loadu_ps(&inputs[1][i]); + __m128 f2 = _mm_loadu_ps(&inputs[2][i]); + __m128 f3 = _mm_loadu_ps(&inputs[3][i]); + + // interleave + __m128 t0 = _mm_unpacklo_ps(f0, f1); + __m128 t2 = _mm_unpacklo_ps(f2, f3); + __m128 t1 = _mm_unpackhi_ps(f0, f1); + __m128 t3 = _mm_unpackhi_ps(f2, f3); + + f0 = _mm_movelh_ps(t0, t2); + f1 = _mm_movehl_ps(t2, t0); + f2 = _mm_movelh_ps(t1, t3); + f3 = _mm_movehl_ps(t3, t1); + + _mm_storeu_ps(&output[4*i + 0], f0); + _mm_storeu_ps(&output[4*i + 4], f1); + _mm_storeu_ps(&output[4*i + 8], f2); + _mm_storeu_ps(&output[4*i + 12], f3); + } + for (; i < numFrames; i++) { + // interleave + output[4*i + 0] = inputs[0][i]; + output[4*i + 1] = inputs[1][i]; + output[4*i + 2] = inputs[2][i]; + output[4*i + 3] = inputs[3][i]; + } + } +} + +#else // portable reference code + // convert int16_t to float, deinterleave stereo void AudioSRC::convertInput(const int16_t* input, float** outputs, int numFrames) { const float scale = 1/32768.0f; @@ -936,6 +1176,13 @@ void AudioSRC::convertInput(const int16_t* input, float** outputs, int numFrames outputs[0][i] = (float)input[2*i + 0] * scale; outputs[1][i] = (float)input[2*i + 1] * scale; } + } else if (_numChannels == 4) { + for (int i = 0; i < numFrames; i++) { + outputs[0][i] = (float)input[4*i + 0] * scale; + outputs[1][i] = (float)input[4*i + 1] * scale; + outputs[2][i] = (float)input[4*i + 2] * scale; + outputs[3][i] = (float)input[4*i + 3] * scale; + } } } @@ -961,7 +1208,7 @@ void AudioSRC::convertOutput(float** inputs, int16_t* output, int numFrames) { // round and saturate f += (f < 0.0f ? -0.5f : +0.5f); - f = std::max(std::min(f, 32767.0f), -32768.0f); + f = MAX(MIN(f, 32767.0f), -32768.0f); output[i] = (int16_t)f; } @@ -978,13 +1225,43 @@ void AudioSRC::convertOutput(float** inputs, int16_t* output, int numFrames) { // round and saturate f0 += (f0 < 0.0f ? -0.5f : +0.5f); f1 += (f1 < 0.0f ? -0.5f : +0.5f); - f0 = std::max(std::min(f0, 32767.0f), -32768.0f); - f1 = std::max(std::min(f1, 32767.0f), -32768.0f); + f0 = MAX(MIN(f0, 32767.0f), -32768.0f); + f1 = MAX(MIN(f1, 32767.0f), -32768.0f); // interleave output[2*i + 0] = (int16_t)f0; output[2*i + 1] = (int16_t)f1; } + } else if (_numChannels == 4) { + for (int i = 0; i < numFrames; i++) { + + float f0 = inputs[0][i] * scale; + float f1 = inputs[1][i] * scale; + float f2 = inputs[2][i] * scale; + float f3 = inputs[3][i] * scale; + + float d = dither(); + f0 += d; + f1 += d; + f2 += d; + f3 += d; + + // round and saturate + f0 += (f0 < 0.0f ? -0.5f : +0.5f); + f1 += (f1 < 0.0f ? -0.5f : +0.5f); + f2 += (f2 < 0.0f ? -0.5f : +0.5f); + f3 += (f3 < 0.0f ? -0.5f : +0.5f); + f0 = MAX(MIN(f0, 32767.0f), -32768.0f); + f1 = MAX(MIN(f1, 32767.0f), -32768.0f); + f2 = MAX(MIN(f2, 32767.0f), -32768.0f); + f3 = MAX(MIN(f3, 32767.0f), -32768.0f); + + // interleave + output[4*i + 0] = (int16_t)f0; + output[4*i + 1] = (int16_t)f1; + output[4*i + 2] = (int16_t)f2; + output[4*i + 3] = (int16_t)f3; + } } } @@ -1001,6 +1278,14 @@ void AudioSRC::convertInput(const float* input, float** outputs, int numFrames) outputs[0][i] = input[2*i + 0]; outputs[1][i] = input[2*i + 1]; } + } else if (_numChannels == 4) { + for (int i = 0; i < numFrames; i++) { + // deinterleave + outputs[0][i] = input[4*i + 0]; + outputs[1][i] = input[4*i + 1]; + outputs[2][i] = input[4*i + 2]; + outputs[3][i] = input[4*i + 3]; + } } } @@ -1017,6 +1302,14 @@ void AudioSRC::convertOutput(float** inputs, float* output, int numFrames) { output[2*i + 0] = inputs[0][i]; output[2*i + 1] = inputs[1][i]; } + } else if (_numChannels == 4) { + for (int i = 0; i < numFrames; i++) { + // interleave + output[4*i + 0] = inputs[0][i]; + output[4*i + 1] = inputs[1][i]; + output[4*i + 2] = inputs[2][i]; + output[4*i + 3] = inputs[3][i]; + } } } @@ -1025,8 +1318,8 @@ void AudioSRC::convertOutput(float** inputs, float* output, int numFrames) { int AudioSRC::render(float** inputs, float** outputs, int inputFrames) { int outputFrames = 0; - int nh = std::min(_numHistory, inputFrames); // number of frames from history buffer - int ni = inputFrames - nh; // number of frames from remaining input + int nh = MIN(_numHistory, inputFrames); // number of frames from history buffer + int ni = inputFrames - nh; // number of frames from remaining input if (_numChannels == 1) { @@ -1070,6 +1363,43 @@ int AudioSRC::render(float** inputs, float** outputs, int inputFrames) { memmove(_history[0], _history[0] + nh, _numHistory * sizeof(float)); memmove(_history[1], _history[1] + nh, _numHistory * sizeof(float)); } + + } else if (_numChannels == 4) { + + // refill history buffers + memcpy(_history[0] + _numHistory, inputs[0], nh * sizeof(float)); + memcpy(_history[1] + _numHistory, inputs[1], nh * sizeof(float)); + memcpy(_history[2] + _numHistory, inputs[2], nh * sizeof(float)); + memcpy(_history[3] + _numHistory, inputs[3], nh * sizeof(float)); + + // process history buffer + outputFrames += multirateFilter4(_history[0], _history[1], _history[2], _history[3], + outputs[0], + outputs[1], + outputs[2], + outputs[3], nh); + + // process remaining input + if (ni) { + outputFrames += multirateFilter4(inputs[0], inputs[1], inputs[2], inputs[3], + outputs[0] + outputFrames, + outputs[1] + outputFrames, + outputs[2] + outputFrames, + outputs[3] + outputFrames, ni); + } + + // shift history buffers + if (ni) { + memcpy(_history[0], inputs[0] + ni, _numHistory * sizeof(float)); + memcpy(_history[1], inputs[1] + ni, _numHistory * sizeof(float)); + memcpy(_history[2], inputs[2] + ni, _numHistory * sizeof(float)); + memcpy(_history[3], inputs[3] + ni, _numHistory * sizeof(float)); + } else { + memmove(_history[0], _history[0] + nh, _numHistory * sizeof(float)); + memmove(_history[1], _history[1] + nh, _numHistory * sizeof(float)); + memmove(_history[2], _history[2] + nh, _numHistory * sizeof(float)); + memmove(_history[3], _history[3] + nh, _numHistory * sizeof(float)); + } } return outputFrames; @@ -1110,35 +1440,40 @@ AudioSRC::AudioSRC(int inputSampleRate, int outputSampleRate, int numChannels) { //printf("up=%d down=%.3f taps=%d\n", _upFactor, _downFactor + (LO32(_step)<> 32); } } - -// the input frames that will produce exactly outputFrames -int AudioSRC::getExactInput(int outputFrames) { - // - // For upsampling, a correct implementation is more complicated - // because it requires early exit of the multirate filter. - // This is not currently supported. - // - if (_upFactor > _downFactor) { - return -1; - } - if (_step == 0) { - int64_t offset = ((int64_t)_phase * _downFactor) % _upFactor; - return (int)(((int64_t)outputFrames * _downFactor + offset) / _upFactor); - } else { - return (int)(((int64_t)outputFrames * _step + _offset) >> 32); - } -} diff --git a/libraries/audio/src/AudioSRC.h b/libraries/audio/src/AudioSRC.h index 5ae5318b5e..301786ee8f 100644 --- a/libraries/audio/src/AudioSRC.h +++ b/libraries/audio/src/AudioSRC.h @@ -14,7 +14,7 @@ #include -static const int SRC_MAX_CHANNELS = 2; +static const int SRC_MAX_CHANNELS = 4; // polyphase filter static const int SRC_PHASEBITS = 8; @@ -48,8 +48,6 @@ public: int getMinInput(int outputFrames); int getMaxInput(int outputFrames); - int getExactInput(int outputFrames); - private: float* _polyphaseFilter; int* _stepTable; @@ -77,12 +75,18 @@ private: int multirateFilter1(const float* input0, float* output0, int inputFrames); int multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames); + int multirateFilter4(const float* input0, const float* input1, const float* input2, const float* input3, + float* output0, float* output1, float* output2, float* output3, int inputFrames); - int multirateFilter1_SSE(const float* input0, float* output0, int inputFrames); - int multirateFilter2_SSE(const float* input0, const float* input1, float* output0, float* output1, int inputFrames); + int multirateFilter1_ref(const float* input0, float* output0, int inputFrames); + int multirateFilter2_ref(const float* input0, const float* input1, float* output0, float* output1, int inputFrames); + int multirateFilter4_ref(const float* input0, const float* input1, const float* input2, const float* input3, + float* output0, float* output1, float* output2, float* output3, int inputFrames); int multirateFilter1_AVX2(const float* input0, float* output0, int inputFrames); int multirateFilter2_AVX2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames); + int multirateFilter4_AVX2(const float* input0, const float* input1, const float* input2, const float* input3, + float* output0, float* output1, float* output2, float* output3, int inputFrames); void convertInput(const int16_t* input, float** outputs, int numFrames); void convertOutput(float** inputs, int16_t* output, int numFrames); diff --git a/libraries/audio/src/Sound.cpp b/libraries/audio/src/Sound.cpp index f8dd4b9e27..d6607424db 100644 --- a/libraries/audio/src/Sound.cpp +++ b/libraries/audio/src/Sound.cpp @@ -97,54 +97,9 @@ void Sound::downSample(const QByteArray& rawAudioByteArray, int sampleRate) { // no resampling needed _byteArray = rawAudioByteArray; - } else if (_isAmbisonic) { - - // FIXME: add a proper Ambisonic resampler! - int numChannels = 4; - AudioSRC resampler[4] { {sampleRate, AudioConstants::SAMPLE_RATE, 1}, - {sampleRate, AudioConstants::SAMPLE_RATE, 1}, - {sampleRate, AudioConstants::SAMPLE_RATE, 1}, - {sampleRate, AudioConstants::SAMPLE_RATE, 1} }; - - // resize to max possible output - int numSourceFrames = rawAudioByteArray.size() / (numChannels * sizeof(AudioConstants::AudioSample)); - int maxDestinationFrames = resampler[0].getMaxOutput(numSourceFrames); - int maxDestinationBytes = maxDestinationFrames * numChannels * sizeof(AudioConstants::AudioSample); - _byteArray.resize(maxDestinationBytes); - - int numDestinationFrames = 0; - - // iterate over channels - int16_t* srcBuffer = new int16_t[numSourceFrames]; - int16_t* dstBuffer = new int16_t[maxDestinationFrames]; - for (int ch = 0; ch < 4; ch++) { - - int16_t* src = (int16_t*)rawAudioByteArray.data(); - int16_t* dst = (int16_t*)_byteArray.data(); - - // deinterleave samples - for (int i = 0; i < numSourceFrames; i++) { - srcBuffer[i] = src[4*i + ch]; - } - - // resample one channel - numDestinationFrames = resampler[ch].render(srcBuffer, dstBuffer, numSourceFrames); - - // reinterleave samples - for (int i = 0; i < numDestinationFrames; i++) { - dst[4*i + ch] = dstBuffer[i]; - } - } - delete[] srcBuffer; - delete[] dstBuffer; - - // truncate to actual output - int numDestinationBytes = numDestinationFrames * numChannels * sizeof(AudioConstants::AudioSample); - _byteArray.resize(numDestinationBytes); - } else { - int numChannels = _isStereo ? 2 : 1; + int numChannels = _isAmbisonic ? AudioConstants::AMBISONIC : (_isStereo ? AudioConstants::STEREO : AudioConstants::MONO); AudioSRC resampler(sampleRate, AudioConstants::SAMPLE_RATE, numChannels); // resize to max possible output diff --git a/libraries/audio/src/avx2/AudioSRC_avx2.cpp b/libraries/audio/src/avx2/AudioSRC_avx2.cpp index e634554bfb..693bad7fc6 100644 --- a/libraries/audio/src/avx2/AudioSRC_avx2.cpp +++ b/libraries/audio/src/avx2/AudioSRC_avx2.cpp @@ -198,4 +198,109 @@ int AudioSRC::multirateFilter2_AVX2(const float* input0, const float* input1, fl return outputFrames; } +int AudioSRC::multirateFilter4_AVX2(const float* input0, const float* input1, const float* input2, const float* input3, + float* output0, float* output1, float* output2, float* output3, int inputFrames) { + int outputFrames = 0; + + assert(_numTaps % 8 == 0); // SIMD8 + + if (_step == 0) { // rational + + int32_t i = HI32(_offset); + + while (i < inputFrames) { + + const float* c0 = &_polyphaseFilter[_numTaps * _phase]; + + __m256 acc0 = _mm256_setzero_ps(); + __m256 acc1 = _mm256_setzero_ps(); + __m256 acc2 = _mm256_setzero_ps(); + __m256 acc3 = _mm256_setzero_ps(); + + for (int j = 0; j < _numTaps; j += 8) { + + //float coef = c0[j]; + __m256 coef0 = _mm256_loadu_ps(&c0[j]); + + //acc += input[i + j] * coef; + acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0); + acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1); + acc2 = _mm256_fmadd_ps(_mm256_loadu_ps(&input2[i + j]), coef0, acc2); + acc3 = _mm256_fmadd_ps(_mm256_loadu_ps(&input3[i + j]), coef0, acc3); + } + + // horizontal sum + acc0 = _mm256_hadd_ps(acc0, acc1); + acc2 = _mm256_hadd_ps(acc2, acc3); + acc0 = _mm256_hadd_ps(acc0, acc2); + __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1)); + + _mm_store_ss(&output0[outputFrames], t0); + _mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,1))); + _mm_store_ss(&output2[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2))); + _mm_store_ss(&output3[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,3))); + outputFrames += 1; + + i += _stepTable[_phase]; + if (++_phase == _upFactor) { + _phase = 0; + } + } + _offset = (int64_t)(i - inputFrames) << 32; + + } else { // irrational + + while (HI32(_offset) < inputFrames) { + + int32_t i = HI32(_offset); + uint32_t f = LO32(_offset); + + uint32_t phase = f >> SRC_FRACBITS; + float ftmp = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT; + + const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; + const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; + + __m256 acc0 = _mm256_setzero_ps(); + __m256 acc1 = _mm256_setzero_ps(); + __m256 acc2 = _mm256_setzero_ps(); + __m256 acc3 = _mm256_setzero_ps(); + __m256 frac = _mm256_broadcast_ss(&ftmp); + + for (int j = 0; j < _numTaps; j += 8) { + + //float coef = c0[j] + frac * (c1[j] - c0[j]); + __m256 coef0 = _mm256_loadu_ps(&c0[j]); + __m256 coef1 = _mm256_loadu_ps(&c1[j]); + coef1 = _mm256_sub_ps(coef1, coef0); + coef0 = _mm256_fmadd_ps(coef1, frac, coef0); + + //acc += input[i + j] * coef; + acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0); + acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1); + acc2 = _mm256_fmadd_ps(_mm256_loadu_ps(&input2[i + j]), coef0, acc2); + acc3 = _mm256_fmadd_ps(_mm256_loadu_ps(&input3[i + j]), coef0, acc3); + } + + // horizontal sum + acc0 = _mm256_hadd_ps(acc0, acc1); + acc2 = _mm256_hadd_ps(acc2, acc3); + acc0 = _mm256_hadd_ps(acc0, acc2); + __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1)); + + _mm_store_ss(&output0[outputFrames], t0); + _mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,1))); + _mm_store_ss(&output2[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2))); + _mm_store_ss(&output3[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,3))); + outputFrames += 1; + + _offset += _step; + } + _offset -= (int64_t)inputFrames << 32; + } + _mm256_zeroupper(); + + return outputFrames; +} + #endif