From b47000eab24efa08af526df503d80811fdb030ea Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Tue, 6 Sep 2016 16:53:06 -0700 Subject: [PATCH 1/3] Added ARM NEON optimized audio sample rate conversion --- libraries/audio/src/AudioSRC.cpp | 216 +++++++++++++++++++++++++++++++ 1 file changed, 216 insertions(+) diff --git a/libraries/audio/src/AudioSRC.cpp b/libraries/audio/src/AudioSRC.cpp index 5dba63b349..22e51200d1 100644 --- a/libraries/audio/src/AudioSRC.cpp +++ b/libraries/audio/src/AudioSRC.cpp @@ -593,6 +593,202 @@ void AudioSRC::convertOutput(float** inputs, float* output, int numFrames) { #else +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + +#include <arm_neon.h> + +int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFrames) { + int outputFrames = 0; + + assert(_numTaps % 8 == 0); // SIMD8 + + if (_step == 0) { // rational + + int32_t i = HI32(_offset); + + while (i < inputFrames) { + + const float* c0 = &_polyphaseFilter[_numTaps * _phase]; + + float32x4_t acc0 = vdupq_n_f32(0); + float32x4_t acc1 = vdupq_n_f32(0); + + for (int j = 0; j < _numTaps; j += 8) { + + //float coef = c0[j]; + float32x4_t coef0 = vld1q_f32(&c0[j + 0]); // aligned + float32x4_t coef1 = vld1q_f32(&c0[j + 4]); // aligned + + //acc += input[i + j] * coef; + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0); + acc1 = vmlaq_f32(acc1, vld1q_f32(&input0[i + j + 4]), coef1); + } + acc0 = vaddq_f32(acc0, acc1); + + // horizontal sum + float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); + t0 = vpadd_f32(t0, t0); + + vst1_lane_f32(&output0[outputFrames], t0, 0); + outputFrames += 1; + + i += _stepTable[_phase]; + if (++_phase == _upFactor) { + _phase = 0; + } + } + _offset = (int64_t)(i - inputFrames) << 32; + + } else { // irrational + + while (HI32(_offset) < inputFrames) { + + int32_t i = HI32(_offset); + uint32_t f = LO32(_offset); + + uint32_t phase = f >> SRC_FRACBITS; + float32x4_t frac = vdupq_n_f32((f & SRC_FRACMASK) * QFRAC_TO_FLOAT); + +
const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; + const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; + + float32x4_t acc0 = vdupq_n_f32(0); + float32x4_t acc1 = vdupq_n_f32(0); + + for (int j = 0; j < _numTaps; j += 8) { + + float32x4_t coef0 = vld1q_f32(&c0[j + 0]); // aligned + float32x4_t coef1 = vld1q_f32(&c0[j + 4]); // aligned + float32x4_t coef2 = vld1q_f32(&c1[j + 0]); // aligned + float32x4_t coef3 = vld1q_f32(&c1[j + 4]); // aligned + + //float coef = c0[j] + frac * (c1[j] - c0[j]); + coef2 = vsubq_f32(coef2, coef0); + coef3 = vsubq_f32(coef3, coef1); + coef0 = vmlaq_f32(coef0, coef2, frac); + coef1 = vmlaq_f32(coef1, coef3, frac); + + //acc += input[i + j] * coef; + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0); + acc1 = vmlaq_f32(acc1, vld1q_f32(&input0[i + j + 4]), coef1); + } + acc0 = vaddq_f32(acc0, acc1); + + // horizontal sum + float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); + t0 = vpadd_f32(t0, t0); + + vst1_lane_f32(&output0[outputFrames], t0, 0); + outputFrames += 1; + + _offset += _step; + } + _offset -= (int64_t)inputFrames << 32; + } + + return outputFrames; +} + +int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) { + int outputFrames = 0; + + assert(_numTaps % 8 == 0); // SIMD8 + + if (_step == 0) { // rational + + int32_t i = HI32(_offset); + + while (i < inputFrames) { + + const float* c0 = &_polyphaseFilter[_numTaps * _phase]; + + float32x4_t acc0 = vdupq_n_f32(0); + float32x4_t acc1 = vdupq_n_f32(0); + + for (int j = 0; j < _numTaps; j += 8) { + + //float coef = c0[j]; + float32x4_t coef0 = vld1q_f32(&c0[j + 0]); // aligned + float32x4_t coef1 = vld1q_f32(&c0[j + 4]); // aligned + + //acc += input[i + j] * coef; + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0); + acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 0]), coef0); + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 4]), coef1); + acc1 = 
vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 4]), coef1); + } + + // horizontal sum + float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); + float32x2_t t1 = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)); + t0 = vpadd_f32(t0, t1); + + vst1_lane_f32(&output0[outputFrames], t0, 0); + vst1_lane_f32(&output1[outputFrames], t0, 1); + outputFrames += 1; + + i += _stepTable[_phase]; + if (++_phase == _upFactor) { + _phase = 0; + } + } + _offset = (int64_t)(i - inputFrames) << 32; + + } else { // irrational + + while (HI32(_offset) < inputFrames) { + + int32_t i = HI32(_offset); + uint32_t f = LO32(_offset); + + uint32_t phase = f >> SRC_FRACBITS; + float32x4_t frac = vdupq_n_f32((f & SRC_FRACMASK) * QFRAC_TO_FLOAT); + + const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)]; + const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)]; + + float32x4_t acc0 = vdupq_n_f32(0); + float32x4_t acc1 = vdupq_n_f32(0); + + for (int j = 0; j < _numTaps; j += 8) { + + float32x4_t coef0 = vld1q_f32(&c0[j + 0]); // aligned + float32x4_t coef1 = vld1q_f32(&c0[j + 4]); // aligned + float32x4_t coef2 = vld1q_f32(&c1[j + 0]); // aligned + float32x4_t coef3 = vld1q_f32(&c1[j + 4]); // aligned + + //float coef = c0[j] + frac * (c1[j] - c0[j]); + coef2 = vsubq_f32(coef2, coef0); + coef3 = vsubq_f32(coef3, coef1); + coef0 = vmlaq_f32(coef0, coef2, frac); + coef1 = vmlaq_f32(coef1, coef3, frac); + + //acc += input[i + j] * coef; + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0); + acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 0]), coef0); + acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 4]), coef1); + acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 4]), coef1); + } + + // horizontal sum + float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)); + float32x2_t t1 = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)); + t0 = vpadd_f32(t0, t1); + + vst1_lane_f32(&output0[outputFrames], t0, 0); + vst1_lane_f32(&output1[outputFrames], t0, 1); + 
outputFrames += 1; + + _offset += _step; + } + _offset -= (int64_t)inputFrames << 32; + } + + return outputFrames; +} + +#else + int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFrames) { int outputFrames = 0; @@ -725,6 +921,8 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* return outputFrames; } +#endif + // convert int16_t to float, deinterleave stereo void AudioSRC::convertInput(const int16_t* input, float** outputs, int numFrames) { const float scale = 1/32768.0f; @@ -1030,3 +1228,21 @@ int AudioSRC::getMaxInput(int outputFrames) { return (int)(((int64_t)outputFrames * _step) >> 32); } } + +// the input frames that will produce exactly outputFrames +int AudioSRC::getExactInput(int outputFrames) { + // + // For upsampling, a correct implementation is more complicated + // because it requires early exit of the multirate filter. + // This is not currently supported. + // + if (_upFactor > _downFactor) { + return -1; + } + if (_step == 0) { + int64_t offset = ((int64_t)_phase * _downFactor) % _upFactor; + return (int)(((int64_t)outputFrames * _downFactor + offset) / _upFactor); + } else { + return (int)(((int64_t)outputFrames * _step + _offset) >> 32); + } +} From e7ebcc0467cce53e1caacfa0489d1892c8211e99 Mon Sep 17 00:00:00 2001 From: Atlante45 Date: Tue, 6 Sep 2016 16:09:34 -0700 Subject: [PATCH 2/3] Fix toolBar.js relative path --- scripts/developer/utilities/record/recorder.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/developer/utilities/record/recorder.js b/scripts/developer/utilities/record/recorder.js index d08cdd68f3..083037461d 100644 --- a/scripts/developer/utilities/record/recorder.js +++ b/scripts/developer/utilities/record/recorder.js @@ -10,7 +10,7 @@ // HIFI_PUBLIC_BUCKET = "http://s3.amazonaws.com/hifi-public/"; -Script.include("../../libraries/toolBars.js"); +Script.include("/~/system/libraries/toolBars.js"); var recordingFile = "recording.rec"; From 
284bbfdf0622659b37fbe7d0b52d5b29847bc6bc Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Tue, 6 Sep 2016 17:09:59 -0700 Subject: [PATCH 3/3] Remove extraneous code --- libraries/audio/src/AudioSRC.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/libraries/audio/src/AudioSRC.cpp b/libraries/audio/src/AudioSRC.cpp index 22e51200d1..32ec29cac4 100644 --- a/libraries/audio/src/AudioSRC.cpp +++ b/libraries/audio/src/AudioSRC.cpp @@ -1228,21 +1228,3 @@ int AudioSRC::getMaxInput(int outputFrames) { return (int)(((int64_t)outputFrames * _step) >> 32); } } - -// the input frames that will produce exactly outputFrames -int AudioSRC::getExactInput(int outputFrames) { - // - // For upsampling, a correct implementation is more complicated - // because it requires early exit of the multirate filter. - // This is not currently supported. - // - if (_upFactor > _downFactor) { - return -1; - } - if (_step == 0) { - int64_t offset = ((int64_t)_phase * _downFactor) % _upFactor; - return (int)(((int64_t)outputFrames * _downFactor + offset) / _upFactor); - } else { - return (int)(((int64_t)outputFrames * _step + _offset) >> 32); - } -}