From b47000eab24efa08af526df503d80811fdb030ea Mon Sep 17 00:00:00 2001
From: Ken Cooke <ken@highfidelity.io>
Date: Tue, 6 Sep 2016 16:53:06 -0700
Subject: [PATCH 1/3] Added ARM NEON optimized audio sample rate conversion

---
 libraries/audio/src/AudioSRC.cpp | 216 +++++++++++++++++++++++++++++++
 1 file changed, 216 insertions(+)

diff --git a/libraries/audio/src/AudioSRC.cpp b/libraries/audio/src/AudioSRC.cpp
index 5dba63b349..22e51200d1 100644
--- a/libraries/audio/src/AudioSRC.cpp
+++ b/libraries/audio/src/AudioSRC.cpp
@@ -593,6 +593,202 @@ void AudioSRC::convertOutput(float** inputs, float* output, int numFrames) {
 
 #else
 
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+#include <arm_neon.h>
+
+int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFrames) {
+    int outputFrames = 0;   // count of samples written to output0; this is the return value
+    // ARM NEON variant for a single channel: every output sample is a _numTaps-long dot product.
+    assert(_numTaps % 8 == 0);  // SIMD8: the inner loops consume 8 taps (two float32x4_t) per pass
+
+    if (_step == 0) {   // rational: the phase is tracked in _phase and stepped via _stepTable
+
+        int32_t i = HI32(_offset);  // current input index; written back to _offset after the loop
+
+        while (i < inputFrames) {
+
+            const float* c0 = &_polyphaseFilter[_numTaps * _phase];     // coefficient row for this phase
+
+            float32x4_t acc0 = vdupq_n_f32(0);
+            float32x4_t acc1 = vdupq_n_f32(0);
+            // two independent accumulators, one per group of 4 taps; merged after the loop
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                // scalar equivalent: float coef = c0[j];
+                float32x4_t coef0 = vld1q_f32(&c0[j + 0]);  // aligned
+                float32x4_t coef1 = vld1q_f32(&c0[j + 4]);  // aligned
+
+                // scalar equivalent: acc += input[i + j] * coef;
+                acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0);
+                acc1 = vmlaq_f32(acc1, vld1q_f32(&input0[i + j + 4]), coef1);
+            }
+            acc0 = vaddq_f32(acc0, acc1);   // merge the two partial sums
+
+            // horizontal sum: fold 4 lanes into 2, then pairwise-add so lane 0 holds the total
+            float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+            t0 = vpadd_f32(t0, t0);
+
+            vst1_lane_f32(&output0[outputFrames], t0, 0);   // store lane 0 only
+            outputFrames += 1;
+
+            i += _stepTable[_phase];    // advance the input index by this phase's step
+            if (++_phase == _upFactor) {    // wrap the phase counter after _upFactor phases
+                _phase = 0;
+            }
+        }
+        _offset = (int64_t)(i - inputFrames) << 32;     // keep the index relative to the end of this buffer
+
+    } else {    // irrational: _offset advances by _step per output sample
+
+        while (HI32(_offset) < inputFrames) {
+
+            int32_t i = HI32(_offset);      // input index of the first tap
+            uint32_t f = LO32(_offset);     // split into phase and interpolation fraction below
+
+            uint32_t phase = f >> SRC_FRACBITS;
+            float32x4_t frac = vdupq_n_f32((f & SRC_FRACMASK) * QFRAC_TO_FLOAT);    // weight broadcast to all lanes
+            // c1 reads row phase + 1; presumably the table holds one extra row for this - TODO confirm
+            const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)];
+            const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)];
+
+            float32x4_t acc0 = vdupq_n_f32(0);
+            float32x4_t acc1 = vdupq_n_f32(0);
+
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                float32x4_t coef0 = vld1q_f32(&c0[j + 0]);  // aligned
+                float32x4_t coef1 = vld1q_f32(&c0[j + 4]);  // aligned
+                float32x4_t coef2 = vld1q_f32(&c1[j + 0]);  // aligned
+                float32x4_t coef3 = vld1q_f32(&c1[j + 4]);  // aligned
+
+                // scalar equivalent: float coef = c0[j] + frac * (c1[j] - c0[j]);
+                coef2 = vsubq_f32(coef2, coef0);
+                coef3 = vsubq_f32(coef3, coef1);
+                coef0 = vmlaq_f32(coef0, coef2, frac);
+                coef1 = vmlaq_f32(coef1, coef3, frac);
+
+                // scalar equivalent: acc += input[i + j] * coef;
+                acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0);
+                acc1 = vmlaq_f32(acc1, vld1q_f32(&input0[i + j + 4]), coef1);
+            }
+            acc0 = vaddq_f32(acc0, acc1);   // merge the two partial sums
+
+            // horizontal sum: fold 4 lanes into 2, then pairwise-add so lane 0 holds the total
+            float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+            t0 = vpadd_f32(t0, t0);
+
+            vst1_lane_f32(&output0[outputFrames], t0, 0);   // store lane 0 only
+            outputFrames += 1;
+
+            _offset += _step;
+        }
+        _offset -= (int64_t)inputFrames << 32;  // rebase the index part of _offset past this buffer
+    }
+
+    return outputFrames;
+}
+
+int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) {
+    int outputFrames = 0;   // count of samples written to each output; this is the return value
+    // ARM NEON variant for two channels: both inputs are filtered with the same coefficients.
+    assert(_numTaps % 8 == 0);  // SIMD8: the inner loops consume 8 taps (two float32x4_t) per pass
+
+    if (_step == 0) {   // rational: the phase is tracked in _phase and stepped via _stepTable
+
+        int32_t i = HI32(_offset);  // current input index; written back to _offset after the loop
+
+        while (i < inputFrames) {
+
+            const float* c0 = &_polyphaseFilter[_numTaps * _phase];     // coefficient row for this phase
+
+            float32x4_t acc0 = vdupq_n_f32(0);
+            float32x4_t acc1 = vdupq_n_f32(0);
+            // acc0 accumulates input0 and acc1 accumulates input1 (one accumulator per channel)
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                // scalar equivalent: float coef = c0[j];
+                float32x4_t coef0 = vld1q_f32(&c0[j + 0]);  // aligned
+                float32x4_t coef1 = vld1q_f32(&c0[j + 4]);  // aligned
+
+                // scalar equivalent, per channel: acc += input[i + j] * coef;
+                acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0);
+                acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 0]), coef0);
+                acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 4]), coef1);
+                acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 4]), coef1);
+            }
+
+            // horizontal sum: after the pairwise add, lane 0 is the input0 total and lane 1 the input1 total
+            float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+            float32x2_t t1 = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
+            t0 = vpadd_f32(t0, t1);
+
+            vst1_lane_f32(&output0[outputFrames], t0, 0);
+            vst1_lane_f32(&output1[outputFrames], t0, 1);
+            outputFrames += 1;
+
+            i += _stepTable[_phase];    // advance the input index by this phase's step
+            if (++_phase == _upFactor) {    // wrap the phase counter after _upFactor phases
+                _phase = 0;
+            }
+        }
+        _offset = (int64_t)(i - inputFrames) << 32;     // keep the index relative to the end of this buffer
+
+    } else {    // irrational: _offset advances by _step per output sample
+
+        while (HI32(_offset) < inputFrames) {
+
+            int32_t i = HI32(_offset);      // input index of the first tap
+            uint32_t f = LO32(_offset);     // split into phase and interpolation fraction below
+
+            uint32_t phase = f >> SRC_FRACBITS;
+            float32x4_t frac = vdupq_n_f32((f & SRC_FRACMASK) * QFRAC_TO_FLOAT);    // weight broadcast to all lanes
+            // c1 reads row phase + 1; presumably the table holds one extra row for this - TODO confirm
+            const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)];
+            const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)];
+
+            float32x4_t acc0 = vdupq_n_f32(0);
+            float32x4_t acc1 = vdupq_n_f32(0);
+            // acc0 accumulates input0 and acc1 accumulates input1 (one accumulator per channel)
+            for (int j = 0; j < _numTaps; j += 8) {
+
+                float32x4_t coef0 = vld1q_f32(&c0[j + 0]);  // aligned
+                float32x4_t coef1 = vld1q_f32(&c0[j + 4]);  // aligned
+                float32x4_t coef2 = vld1q_f32(&c1[j + 0]);  // aligned
+                float32x4_t coef3 = vld1q_f32(&c1[j + 4]);  // aligned
+
+                // scalar equivalent: float coef = c0[j] + frac * (c1[j] - c0[j]);
+                coef2 = vsubq_f32(coef2, coef0);
+                coef3 = vsubq_f32(coef3, coef1);
+                coef0 = vmlaq_f32(coef0, coef2, frac);
+                coef1 = vmlaq_f32(coef1, coef3, frac);
+
+                // scalar equivalent, per channel: acc += input[i + j] * coef;
+                acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 0]), coef0);
+                acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 0]), coef0);
+                acc0 = vmlaq_f32(acc0, vld1q_f32(&input0[i + j + 4]), coef1);
+                acc1 = vmlaq_f32(acc1, vld1q_f32(&input1[i + j + 4]), coef1);
+            }
+
+            // horizontal sum: after the pairwise add, lane 0 is the input0 total and lane 1 the input1 total
+            float32x2_t t0 = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+            float32x2_t t1 = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
+            t0 = vpadd_f32(t0, t1);
+
+            vst1_lane_f32(&output0[outputFrames], t0, 0);
+            vst1_lane_f32(&output1[outputFrames], t0, 1);
+            outputFrames += 1;
+
+            _offset += _step;
+        }
+        _offset -= (int64_t)inputFrames << 32;  // rebase the index part of _offset past this buffer
+    }
+
+    return outputFrames;
+}
+
+#else
+
 int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFrames) {
     int outputFrames = 0;
 
@@ -725,6 +921,8 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float*
     return outputFrames;
 }
 
+#endif
+
 // convert int16_t to float, deinterleave stereo
 void AudioSRC::convertInput(const int16_t* input, float** outputs, int numFrames) {
     const float scale = 1/32768.0f;
@@ -1030,3 +1228,21 @@ int AudioSRC::getMaxInput(int outputFrames) {
         return (int)(((int64_t)outputFrames * _step) >> 32);
     }
 }
+
+// the input frames that will produce exactly outputFrames
+int AudioSRC::getExactInput(int outputFrames) {
+    //
+    // For upsampling, a correct implementation is more complicated
+    // because it requires early exit of the multirate filter.
+    // This is not currently supported.
+    //
+    if (_upFactor > _downFactor) {
+        return -1;
+    }
+    if (_step == 0) {
+        int64_t offset = ((int64_t)_phase * _downFactor) % _upFactor;
+        return (int)(((int64_t)outputFrames * _downFactor + offset) / _upFactor);
+    } else {
+        return (int)(((int64_t)outputFrames * _step + _offset) >> 32);
+    }
+}

From e7ebcc0467cce53e1caacfa0489d1892c8211e99 Mon Sep 17 00:00:00 2001
From: Atlante45 <clement.brisset@gmail.com>
Date: Tue, 6 Sep 2016 16:09:34 -0700
Subject: [PATCH 2/3] Fix toolBar.js relative path

---
 scripts/developer/utilities/record/recorder.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/developer/utilities/record/recorder.js b/scripts/developer/utilities/record/recorder.js
index d08cdd68f3..083037461d 100644
--- a/scripts/developer/utilities/record/recorder.js
+++ b/scripts/developer/utilities/record/recorder.js
@@ -10,7 +10,7 @@
 //
 
 HIFI_PUBLIC_BUCKET = "http://s3.amazonaws.com/hifi-public/";
-Script.include("../../libraries/toolBars.js");
+Script.include("/~/system/libraries/toolBars.js");
 
 var recordingFile = "recording.rec";
 

From 284bbfdf0622659b37fbe7d0b52d5b29847bc6bc Mon Sep 17 00:00:00 2001
From: Ken Cooke <ken@highfidelity.io>
Date: Tue, 6 Sep 2016 17:09:59 -0700
Subject: [PATCH 3/3] Remove extraneous code

---
 libraries/audio/src/AudioSRC.cpp | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/libraries/audio/src/AudioSRC.cpp b/libraries/audio/src/AudioSRC.cpp
index 22e51200d1..32ec29cac4 100644
--- a/libraries/audio/src/AudioSRC.cpp
+++ b/libraries/audio/src/AudioSRC.cpp
@@ -1228,21 +1228,3 @@ int AudioSRC::getMaxInput(int outputFrames) {
         return (int)(((int64_t)outputFrames * _step) >> 32);
     }
 }
-
-// the input frames that will produce exactly outputFrames
-int AudioSRC::getExactInput(int outputFrames) {
-    //
-    // For upsampling, a correct implementation is more complicated
-    // because it requires early exit of the multirate filter.
-    // This is not currently supported.
-    //
-    if (_upFactor > _downFactor) {
-        return -1;
-    }
-    if (_step == 0) {
-        int64_t offset = ((int64_t)_phase * _downFactor) % _upFactor;
-        return (int)(((int64_t)outputFrames * _downFactor + offset) / _upFactor);
-    } else {
-        return (int)(((int64_t)outputFrames * _step + _offset) >> 32);
-    }
-}