diff --git a/libraries/audio/src/AudioFOA.cpp b/libraries/audio/src/AudioFOA.cpp
index 30d29b72b7..ddcdf7f250 100644
--- a/libraries/audio/src/AudioFOA.cpp
+++ b/libraries/audio/src/AudioFOA.cpp
@@ -882,14 +882,16 @@ static void convertInput_ref(int16_t* src, float *dst[4], float gain, int numFra
 
 #endif
 
-// in-place rotation of the soundfield
-// crossfade between old and new rotation, to prevent artifacts
-static void rotate_3x3_ref(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) {
+// in-place rotation and scaling of the soundfield
+// crossfade between old and new matrix, to prevent artifacts
+static void rotate_4x4_ref(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) {
 
-    const float md[3][3] = { 
-        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2] },
-        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2] },
-        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2] },
+    // matrix difference
+    const float md[4][4] = { 
+        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2], m0[0][3] - m1[0][3] },
+        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2], m0[1][3] - m1[1][3] },
+        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2], m0[2][3] - m1[2][3] },
+        { m0[3][0] - m1[3][0], m0[3][1] - m1[3][1], m0[3][2] - m1[3][2], m0[3][3] - m1[3][3] },
     };
 
     for (int i = 0; i < numFrames; i++) {
@@ -898,22 +900,27 @@ static void rotate_3x3_ref(float* buf[4], const float m0[3][3], const float m1[3
 
         // interpolate the matrix
         float m00 = m1[0][0] + frac * md[0][0];
-        float m10 = m1[1][0] + frac * md[1][0];
-        float m20 = m1[2][0] + frac * md[2][0];
 
-        float m01 = m1[0][1] + frac * md[0][1];
         float m11 = m1[1][1] + frac * md[1][1];
         float m21 = m1[2][1] + frac * md[2][1];
+        float m31 = m1[3][1] + frac * md[3][1];
 
-        float m02 = m1[0][2] + frac * md[0][2];
         float m12 = m1[1][2] + frac * md[1][2];
         float m22 = m1[2][2] + frac * md[2][2];
+        float m32 = m1[3][2] + frac * md[3][2];
+
+        float m13 = m1[1][3] + frac * md[1][3];
+        float m23 = m1[2][3] + frac * md[2][3];
+        float m33 = m1[3][3] + frac * md[3][3];
 
         // matrix multiply
-        float x = m00 * buf[1][i] + m01 * buf[2][i] + m02 * buf[3][i];
-        float y = m10 * buf[1][i] + m11 * buf[2][i] + m12 * buf[3][i];
-        float z = m20 * buf[1][i] + m21 * buf[2][i] + m22 * buf[3][i];
+        float w = m00 * buf[0][i];
 
+        float x = m11 * buf[1][i] + m12 * buf[2][i] + m13 * buf[3][i];
+        float y = m21 * buf[1][i] + m22 * buf[2][i] + m23 * buf[3][i];
+        float z = m31 * buf[1][i] + m32 * buf[2][i] + m33 * buf[3][i];
+
+        buf[0][i] = w;
         buf[1][i] = x;
         buf[2][i] = y;
         buf[3][i] = z;
@@ -932,7 +939,7 @@ void rfft512_AVX2(float buf[512]);
 void rifft512_AVX2(float buf[512]);
 void rfft512_cmadd_1X2_AVX2(const float src[512], const float coef0[512], const float coef1[512], float dst0[512], float dst1[512]);
 void convertInput_AVX2(int16_t* src, float *dst[4], float gain, int numFrames);
-void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames);
+void rotate_4x4_AVX2(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames);
 
 static void rfft512(float buf[512]) {
     static auto f = cpuSupportsAVX2() ? rfft512_AVX2 : rfft512_ref;
@@ -954,8 +961,8 @@ static void convertInput(int16_t* src, float *dst[4], float gain, int numFrames)
     (*f)(src, dst, gain, numFrames);  // dispatch
 }
 
-static void rotate_3x3(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) {
-    static auto f = cpuSupportsAVX2() ? rotate_3x3_AVX2 : rotate_3x3_ref;
+static void rotate_4x4(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) {
+    static auto f = cpuSupportsAVX2() ? rotate_4x4_AVX2 : rotate_4x4_ref;
     (*f)(buf, m0, m1, win, numFrames);  // dispatch
 }
 
@@ -965,7 +972,7 @@ static auto& rfft512 = rfft512_ref;
 static auto& rifft512 = rifft512_ref;
 static auto& rfft512_cmadd_1X2 = rfft512_cmadd_1X2_ref;
 static auto& convertInput = convertInput_ref;
-static auto& rotate_3x3 = rotate_3x3_ref;
+static auto& rotate_4x4 = rotate_4x4_ref;
 
 #endif
 
@@ -1007,8 +1014,8 @@ ALIGN32 static const float crossfadeTable[FOA_BLOCK] = {
     0.0020975362f, 0.0015413331f, 0.0010705384f, 0.0006852326f, 0.0003854819f, 0.0001713375f, 0.0000428362f, 0.0000000000f, 
 };
 
-// convert quaternion to a column-major 3x3 rotation matrix
-static void quatToMatrix_3x3(float w, float x, float y, float z, float m[3][3]) {
+// convert quaternion to a column-major 4x4 rotation matrix
+static void quatToMatrix_4x4(float w, float x, float y, float z, float m[4][4]) {
 
     float xx = x * (x + x);
     float xy = x * (y + y);
@@ -1022,17 +1029,33 @@ static void quatToMatrix_3x3(float w, float x, float y, float z, float m[3][3])
     float wy = w * (y + y);
     float wz = w * (z + z);
 
-    m[0][0] = 1.0f - (yy + zz);
-    m[0][1] = xy - wz;
-    m[0][2] = xz + wy;
+    m[0][0] = 1.0f;
+    m[0][1] = 0.0f;
+    m[0][2] = 0.0f;
+    m[0][3] = 0.0f;
 
-    m[1][0] = xy + wz;
-    m[1][1] = 1.0f - (xx + zz);
-    m[1][2] = yz - wx;
+    m[1][0] = 0.0f;
+    m[1][1] = 1.0f - (yy + zz);
+    m[1][2] = xy - wz;
+    m[1][3] = xz + wy;
 
-    m[2][0] = xz - wy;
-    m[2][1] = yz + wx;
-    m[2][2] = 1.0f - (xx + yy);
+    m[2][0] = 0.0f;
+    m[2][1] = xy + wz;
+    m[2][2] = 1.0f - (xx + zz);
+    m[2][3] = yz - wx;
+
+    m[3][0] = 0.0f;
+    m[3][1] = xz - wy;
+    m[3][2] = yz + wx;
+    m[3][3] = 1.0f - (xx + yy);
+}
+
+static void scaleMatrix_4x4(float scale, float m[4][4]) {
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            m[i][j] *= scale;
+        }
+    }
 }
 
 // Ambisonic to binaural render
@@ -1047,16 +1070,19 @@ void AudioFOA::render(int16_t* input, float* output, int index, float qw, float
     ALIGN32 float inBuffer[4][FOA_BLOCK];       // deinterleaved input buffers
 
     float* in[4] = { inBuffer[0], inBuffer[1], inBuffer[2], inBuffer[3] };
-    float rotation[3][3];
+    float rotation[4][4];
 
     // convert input to deinterleaved float
-    convertInput(input, in, FOA_GAIN * gain, FOA_BLOCK);
+    convertInput(input, in, FOA_GAIN, FOA_BLOCK);
 
-    // convert quaternion to 3x3 rotation
-    quatToMatrix_3x3(qw, qx, qy, qz, rotation);
+    // convert quaternion to 4x4 rotation
+    quatToMatrix_4x4(qw, qx, qy, qz, rotation);
 
-    // rotate the soundfield
-    rotate_3x3(in, _rotationState, rotation, crossfadeTable, FOA_BLOCK);
+    // apply gain as uniform scale
+    scaleMatrix_4x4(gain, rotation);
+
+    // rotate and scale the soundfield
+    rotate_4x4(in, _rotationState, rotation, crossfadeTable, FOA_BLOCK);
 
     // rotation history update
     memcpy(_rotationState, rotation, sizeof(_rotationState));
diff --git a/libraries/audio/src/AudioFOA.h b/libraries/audio/src/AudioFOA.h
index 9eccc35bce..b8257e5190 100644
--- a/libraries/audio/src/AudioFOA.h
+++ b/libraries/audio/src/AudioFOA.h
@@ -28,12 +28,7 @@ static_assert((FOA_BLOCK + FOA_OVERLAP) == FOA_NFFT, "FFT convolution requires L
 class AudioFOA {
 
 public:
-    AudioFOA() {
-        // identity matrix
-        _rotationState[0][0] = 1.0f;
-        _rotationState[1][1] = 1.0f;
-        _rotationState[2][2] = 1.0f;
-    };
+    AudioFOA() {};
 
     //
     // input: interleaved First-Order Ambisonic source
@@ -55,8 +50,8 @@ private:
     // input history, for overlap-save
     float _fftState[4][FOA_OVERLAP] = {};
 
-    // orientation history
-    float _rotationState[3][3] = {};
+    // orientation and gain history
+    float _rotationState[4][4] = {};
 };
 
 #endif // AudioFOA_h
diff --git a/libraries/audio/src/avx2/AudioFOA_avx2.cpp b/libraries/audio/src/avx2/AudioFOA_avx2.cpp
index 70f9b0e5f6..15d37fcc3a 100644
--- a/libraries/audio/src/avx2/AudioFOA_avx2.cpp
+++ b/libraries/audio/src/avx2/AudioFOA_avx2.cpp
@@ -1289,14 +1289,16 @@ void convertInput_AVX2(int16_t* src, float *dst[4], float gain, int numFrames) {
 
 #endif
 
-// in-place rotation of the soundfield
-// crossfade between old and new rotation, to prevent artifacts
-void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) {
+// in-place rotation and scaling of the soundfield
+// crossfade between old and new matrix, to prevent artifacts
+void rotate_4x4_AVX2(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) {
 
-    const float md[3][3] = { 
-        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2] },
-        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2] },
-        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2] },
+    // matrix difference
+    const float md[4][4] = { 
+        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2], m0[0][3] - m1[0][3] },
+        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2], m0[1][3] - m1[1][3] },
+        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2], m0[2][3] - m1[2][3] },
+        { m0[3][0] - m1[3][0], m0[3][1] - m1[3][1], m0[3][2] - m1[3][2], m0[3][3] - m1[3][3] },
     };
 
     assert(numFrames % 8 == 0);
@@ -1307,30 +1309,35 @@ void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3],
 
         // interpolate the matrix
         __m256 m00 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][0]), _mm256_broadcast_ss(&m1[0][0]));
-        __m256 m10 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][0]), _mm256_broadcast_ss(&m1[1][0]));
-        __m256 m20 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][0]), _mm256_broadcast_ss(&m1[2][0]));
 
-        __m256 m01 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][1]), _mm256_broadcast_ss(&m1[0][1]));
         __m256 m11 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][1]), _mm256_broadcast_ss(&m1[1][1]));
         __m256 m21 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][1]), _mm256_broadcast_ss(&m1[2][1]));
+        __m256 m31 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][1]), _mm256_broadcast_ss(&m1[3][1]));
 
-        __m256 m02 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][2]), _mm256_broadcast_ss(&m1[0][2]));
         __m256 m12 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][2]), _mm256_broadcast_ss(&m1[1][2]));
         __m256 m22 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][2]), _mm256_broadcast_ss(&m1[2][2]));
+        __m256 m32 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][2]), _mm256_broadcast_ss(&m1[3][2]));
+
+        __m256 m13 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][3]), _mm256_broadcast_ss(&m1[1][3]));
+        __m256 m23 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][3]), _mm256_broadcast_ss(&m1[2][3]));
+        __m256 m33 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][3]), _mm256_broadcast_ss(&m1[3][3]));
 
         // matrix multiply
-        __m256 x = _mm256_mul_ps(m00, _mm256_loadu_ps(&buf[1][i]));
-        __m256 y = _mm256_mul_ps(m10, _mm256_loadu_ps(&buf[1][i]));
-        __m256 z = _mm256_mul_ps(m20, _mm256_loadu_ps(&buf[1][i]));
+        __m256 w = _mm256_mul_ps(m00, _mm256_loadu_ps(&buf[0][i]));
 
-        x = _mm256_fmadd_ps(m01, _mm256_loadu_ps(&buf[2][i]), x);
-        y = _mm256_fmadd_ps(m11, _mm256_loadu_ps(&buf[2][i]), y);
-        z = _mm256_fmadd_ps(m21, _mm256_loadu_ps(&buf[2][i]), z);
+        __m256 x = _mm256_mul_ps(m11, _mm256_loadu_ps(&buf[1][i]));
+        __m256 y = _mm256_mul_ps(m21, _mm256_loadu_ps(&buf[1][i]));
+        __m256 z = _mm256_mul_ps(m31, _mm256_loadu_ps(&buf[1][i]));
 
-        x = _mm256_fmadd_ps(m02, _mm256_loadu_ps(&buf[3][i]), x);
-        y = _mm256_fmadd_ps(m12, _mm256_loadu_ps(&buf[3][i]), y);
-        z = _mm256_fmadd_ps(m22, _mm256_loadu_ps(&buf[3][i]), z);
+        x = _mm256_fmadd_ps(m12, _mm256_loadu_ps(&buf[2][i]), x);
+        y = _mm256_fmadd_ps(m22, _mm256_loadu_ps(&buf[2][i]), y);
+        z = _mm256_fmadd_ps(m32, _mm256_loadu_ps(&buf[2][i]), z);
 
+        x = _mm256_fmadd_ps(m13, _mm256_loadu_ps(&buf[3][i]), x);
+        y = _mm256_fmadd_ps(m23, _mm256_loadu_ps(&buf[3][i]), y);
+        z = _mm256_fmadd_ps(m33, _mm256_loadu_ps(&buf[3][i]), z);
+
+        _mm256_storeu_ps(&buf[0][i], w);
         _mm256_storeu_ps(&buf[1][i], x);
         _mm256_storeu_ps(&buf[2][i], y);
         _mm256_storeu_ps(&buf[3][i], z);