diff --git a/assignment-client/src/audio/AudioMixerSlave.cpp b/assignment-client/src/audio/AudioMixerSlave.cpp
index cb90df58e5..e5e9f89984 100644
--- a/assignment-client/src/audio/AudioMixerSlave.cpp
+++ b/assignment-client/src/audio/AudioMixerSlave.cpp
@@ -549,38 +549,28 @@ void AudioMixerSlave::addStream(AudioMixerClientData::MixableStream& mixableStre
     // grab the stream from the ring buffer
     AudioRingBuffer::ConstIterator streamPopOutput = streamToAdd->getLastPopOutput();
 
-    // stereo sources are not passed through HRTF
     if (streamToAdd->isStereo()) {
 
-        // apply the avatar gain adjustment
-        gain *= mixableStream.hrtf->getGainAdjustment();
+        streamPopOutput.readSamples(_bufferSamples, AudioConstants::NETWORK_FRAME_SAMPLES_STEREO);
 
-        const float scale = 1 / 32768.0f; // int16_t to float
-
-        for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) {
-            _mixSamples[2*i+0] += (float)streamPopOutput[2*i+0] * gain * scale;
-            _mixSamples[2*i+1] += (float)streamPopOutput[2*i+1] * gain * scale;
-        }
+        // stereo sources are not passed through HRTF
+        mixableStream.hrtf->mixStereo(_bufferSamples, _mixSamples, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
 
         ++stats.manualStereoMixes;
     } else if (isEcho) {
+
+        streamPopOutput.readSamples(_bufferSamples, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
+
         // echo sources are not passed through HRTF
-
-        const float scale = 1/32768.0f; // int16_t to float
-
-        for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) {
-            float sample = (float)streamPopOutput[i] * gain * scale;
-            _mixSamples[2*i+0] += sample;
-            _mixSamples[2*i+1] += sample;
-        }
+        mixableStream.hrtf->mixMono(_bufferSamples, _mixSamples, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
 
         ++stats.manualEchoMixes;
     } else {
+
         streamPopOutput.readSamples(_bufferSamples, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
 
         mixableStream.hrtf->render(_bufferSamples, _mixSamples, HRTF_DATASET_INDEX, azimuth, distance, gain,
                                    AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
-
         ++stats.hrtfRenders;
     }
 }
diff --git a/libraries/audio-client/src/AudioClient.cpp b/libraries/audio-client/src/AudioClient.cpp
index c537fea646..4d3311b065 100644
--- a/libraries/audio-client/src/AudioClient.cpp
+++ b/libraries/audio-client/src/AudioClient.cpp
@@ -1397,7 +1397,6 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
                     // spatialize into mixBuffer
                     injector->getLocalFOA().render(_localScratchBuffer, mixBuffer, HRTF_DATASET_INDEX,
                                                    qw, qx, qy, qz, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
-
                 } else if (options.stereo) {
 
                     if (options.positionSet) {
@@ -1409,11 +1408,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
                     }
 
                     // direct mix into mixBuffer
-                    for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) {
-                        mixBuffer[2*i+0] += convertToFloat(_localScratchBuffer[2*i+0]) * gain;
-                        mixBuffer[2*i+1] += convertToFloat(_localScratchBuffer[2*i+1]) * gain;
-                    }
-
+                    injector->getLocalHRTF().mixStereo(_localScratchBuffer, mixBuffer, gain,
+                                                       AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
                 } else {    // injector is mono
 
                     if (options.positionSet) {
@@ -1431,11 +1427,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
                     } else {
 
                         // direct mix into mixBuffer
-                        for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) {
-                            float sample = convertToFloat(_localScratchBuffer[i]) * gain;
-                            mixBuffer[2*i+0] += sample;
-                            mixBuffer[2*i+1] += sample;
-                        }
+                        injector->getLocalHRTF().mixMono(_localScratchBuffer, mixBuffer, gain,
+                                                         AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
                     }
                 }
 
diff --git a/libraries/audio/src/AudioFOA.cpp b/libraries/audio/src/AudioFOA.cpp
index 30d29b72b7..0dd61fbd02 100644
--- a/libraries/audio/src/AudioFOA.cpp
+++ b/libraries/audio/src/AudioFOA.cpp
@@ -882,14 +882,16 @@ static void convertInput_ref(int16_t* src, float *dst[4], float gain, int numFra
 
 #endif
 
-// in-place rotation of the soundfield
-// crossfade between old and new rotation, to prevent artifacts
-static void rotate_3x3_ref(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) {
+// in-place rotation and scaling of the soundfield
+// crossfade between old and new matrix, to prevent artifacts
+static void rotate_4x4_ref(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) {
 
-    const float md[3][3] = { 
-        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2] },
-        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2] },
-        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2] },
+    // matrix difference
+    const float md[4][4] = { 
+        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2], m0[0][3] - m1[0][3] },
+        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2], m0[1][3] - m1[1][3] },
+        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2], m0[2][3] - m1[2][3] },
+        { m0[3][0] - m1[3][0], m0[3][1] - m1[3][1], m0[3][2] - m1[3][2], m0[3][3] - m1[3][3] },
     };
 
     for (int i = 0; i < numFrames; i++) {
@@ -898,22 +900,27 @@ static void rotate_3x3_ref(float* buf[4], const float m0[3][3], const float m1[3
 
         // interpolate the matrix
         float m00 = m1[0][0] + frac * md[0][0];
-        float m10 = m1[1][0] + frac * md[1][0];
-        float m20 = m1[2][0] + frac * md[2][0];
 
-        float m01 = m1[0][1] + frac * md[0][1];
         float m11 = m1[1][1] + frac * md[1][1];
         float m21 = m1[2][1] + frac * md[2][1];
+        float m31 = m1[3][1] + frac * md[3][1];
 
-        float m02 = m1[0][2] + frac * md[0][2];
         float m12 = m1[1][2] + frac * md[1][2];
         float m22 = m1[2][2] + frac * md[2][2];
+        float m32 = m1[3][2] + frac * md[3][2];
+
+        float m13 = m1[1][3] + frac * md[1][3];
+        float m23 = m1[2][3] + frac * md[2][3];
+        float m33 = m1[3][3] + frac * md[3][3];
 
         // matrix multiply
-        float x = m00 * buf[1][i] + m01 * buf[2][i] + m02 * buf[3][i];
-        float y = m10 * buf[1][i] + m11 * buf[2][i] + m12 * buf[3][i];
-        float z = m20 * buf[1][i] + m21 * buf[2][i] + m22 * buf[3][i];
+        float w = m00 * buf[0][i];
 
+        float x = m11 * buf[1][i] + m12 * buf[2][i] + m13 * buf[3][i];
+        float y = m21 * buf[1][i] + m22 * buf[2][i] + m23 * buf[3][i];
+        float z = m31 * buf[1][i] + m32 * buf[2][i] + m33 * buf[3][i];
+
+        buf[0][i] = w;
         buf[1][i] = x;
         buf[2][i] = y;
         buf[3][i] = z;
@@ -932,7 +939,7 @@ void rfft512_AVX2(float buf[512]);
 void rifft512_AVX2(float buf[512]);
 void rfft512_cmadd_1X2_AVX2(const float src[512], const float coef0[512], const float coef1[512], float dst0[512], float dst1[512]);
 void convertInput_AVX2(int16_t* src, float *dst[4], float gain, int numFrames);
-void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames);
+void rotate_4x4_AVX2(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames);
 
 static void rfft512(float buf[512]) {
     static auto f = cpuSupportsAVX2() ? rfft512_AVX2 : rfft512_ref;
@@ -954,8 +961,8 @@ static void convertInput(int16_t* src, float *dst[4], float gain, int numFrames)
     (*f)(src, dst, gain, numFrames);  // dispatch
 }
 
-static void rotate_3x3(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) {
-    static auto f = cpuSupportsAVX2() ? rotate_3x3_AVX2 : rotate_3x3_ref;
+static void rotate_4x4(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) {
+    static auto f = cpuSupportsAVX2() ? rotate_4x4_AVX2 : rotate_4x4_ref;
     (*f)(buf, m0, m1, win, numFrames);  // dispatch
 }
 
@@ -965,7 +972,7 @@ static auto& rfft512 = rfft512_ref;
 static auto& rifft512 = rifft512_ref;
 static auto& rfft512_cmadd_1X2 = rfft512_cmadd_1X2_ref;
 static auto& convertInput = convertInput_ref;
-static auto& rotate_3x3 = rotate_3x3_ref;
+static auto& rotate_4x4 = rotate_4x4_ref;
 
 #endif
 
@@ -1007,8 +1014,8 @@ ALIGN32 static const float crossfadeTable[FOA_BLOCK] = {
     0.0020975362f, 0.0015413331f, 0.0010705384f, 0.0006852326f, 0.0003854819f, 0.0001713375f, 0.0000428362f, 0.0000000000f, 
 };
 
-// convert quaternion to a column-major 3x3 rotation matrix
-static void quatToMatrix_3x3(float w, float x, float y, float z, float m[3][3]) {
+// convert quaternion to a column-major 4x4 rotation matrix
+static void quatToMatrix_4x4(float w, float x, float y, float z, float m[4][4]) {
 
     float xx = x * (x + x);
     float xy = x * (y + y);
@@ -1022,17 +1029,33 @@ static void quatToMatrix_3x3(float w, float x, float y, float z, float m[3][3])
     float wy = w * (y + y);
     float wz = w * (z + z);
 
-    m[0][0] = 1.0f - (yy + zz);
-    m[0][1] = xy - wz;
-    m[0][2] = xz + wy;
+    m[0][0] = 1.0f;
+    m[0][1] = 0.0f;
+    m[0][2] = 0.0f;
+    m[0][3] = 0.0f;
 
-    m[1][0] = xy + wz;
-    m[1][1] = 1.0f - (xx + zz);
-    m[1][2] = yz - wx;
+    m[1][0] = 0.0f;
+    m[1][1] = 1.0f - (yy + zz);
+    m[1][2] = xy - wz;
+    m[1][3] = xz + wy;
 
-    m[2][0] = xz - wy;
-    m[2][1] = yz + wx;
-    m[2][2] = 1.0f - (xx + yy);
+    m[2][0] = 0.0f;
+    m[2][1] = xy + wz;
+    m[2][2] = 1.0f - (xx + zz);
+    m[2][3] = yz - wx;
+
+    m[3][0] = 0.0f;
+    m[3][1] = xz - wy;
+    m[3][2] = yz + wx;
+    m[3][3] = 1.0f - (xx + yy);
+}
+
+static void scaleMatrix_4x4(float scale, float m[4][4]) {
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            m[i][j] *= scale;
+        }
+    }
 }
 
 // Ambisonic to binaural render
@@ -1047,18 +1070,26 @@ void AudioFOA::render(int16_t* input, float* output, int index, float qw, float
     ALIGN32 float inBuffer[4][FOA_BLOCK];       // deinterleaved input buffers
 
     float* in[4] = { inBuffer[0], inBuffer[1], inBuffer[2], inBuffer[3] };
-    float rotation[3][3];
+    float rotation[4][4];
 
     // convert input to deinterleaved float
-    convertInput(input, in, FOA_GAIN * gain, FOA_BLOCK);
+    convertInput(input, in, FOA_GAIN, FOA_BLOCK);
 
-    // convert quaternion to 3x3 rotation
-    quatToMatrix_3x3(qw, qx, qy, qz, rotation);
+    // convert quaternion to 4x4 rotation
+    quatToMatrix_4x4(qw, qx, qy, qz, rotation);
 
-    // rotate the soundfield
-    rotate_3x3(in, _rotationState, rotation, crossfadeTable, FOA_BLOCK);
+    // apply gain as uniform scale
+    scaleMatrix_4x4(gain, rotation);
 
-    // rotation history update
+    // disable interpolation from reset state
+    if (_resetState) {
+        memcpy(_rotationState, rotation, sizeof(_rotationState));
+    }
+
+    // rotate and scale the soundfield
+    rotate_4x4(in, _rotationState, rotation, crossfadeTable, FOA_BLOCK);
+
+    // new parameters become old
     memcpy(_rotationState, rotation, sizeof(_rotationState));
 
     //
@@ -1093,4 +1124,6 @@ void AudioFOA::render(int16_t* input, float* output, int index, float qw, float
         output[2*i+0] += accBuffer[0][i + FOA_OVERLAP];
         output[2*i+1] += accBuffer[1][i + FOA_OVERLAP];
     }
+
+    _resetState = false;
 }
diff --git a/libraries/audio/src/AudioFOA.h b/libraries/audio/src/AudioFOA.h
index 9eccc35bce..e8cacc22ab 100644
--- a/libraries/audio/src/AudioFOA.h
+++ b/libraries/audio/src/AudioFOA.h
@@ -28,12 +28,7 @@ static_assert((FOA_BLOCK + FOA_OVERLAP) == FOA_NFFT, "FFT convolution requires L
 class AudioFOA {
 
 public:
-    AudioFOA() {
-        // identity matrix
-        _rotationState[0][0] = 1.0f;
-        _rotationState[1][1] = 1.0f;
-        _rotationState[2][2] = 1.0f;
-    };
+    AudioFOA() {};
 
     //
     // input: interleaved First-Order Ambisonic source
@@ -55,8 +50,10 @@ private:
     // input history, for overlap-save
     float _fftState[4][FOA_OVERLAP] = {};
 
-    // orientation history
-    float _rotationState[3][3] = {};
+    // orientation and gain history
+    float _rotationState[4][4] = {};
+
+    bool _resetState = true;
 };
 
 #endif // AudioFOA_h
diff --git a/libraries/audio/src/AudioHRTF.cpp b/libraries/audio/src/AudioHRTF.cpp
index 9de6440d67..e5e32781b0 100644
--- a/libraries/audio/src/AudioHRTF.cpp
+++ b/libraries/audio/src/AudioHRTF.cpp
@@ -750,6 +750,43 @@ static void interpolate(const float* src0, const float* src1, float* dst, float
 
 #endif
 
+// apply gain crossfade with accumulation (interleaved)
+static void gainfade_1x2(int16_t* src, float* dst, const float* win, float gain0, float gain1, int numFrames) {
+
+    gain0 *= (1/32768.0f);  // int16_t to float
+    gain1 *= (1/32768.0f);
+
+    for (int i = 0; i < numFrames; i++) {
+
+        float frac = win[i];
+        float gain = gain1 + frac * (gain0 - gain1);
+
+        float x0 = (float)src[i] * gain;
+
+        dst[2*i+0] += x0;
+        dst[2*i+1] += x0;
+    }
+}
+
+// apply gain crossfade with accumulation (interleaved)
+static void gainfade_2x2(int16_t* src, float* dst, const float* win, float gain0, float gain1, int numFrames) {
+
+    gain0 *= (1/32768.0f);  // int16_t to float
+    gain1 *= (1/32768.0f);
+
+    for (int i = 0; i < numFrames; i++) {
+
+        float frac = win[i];
+        float gain = gain1 + frac * (gain0 - gain1);
+
+        float x0 = (float)src[2*i+0] * gain;
+        float x1 = (float)src[2*i+1] * gain;
+
+        dst[2*i+0] += x0;
+        dst[2*i+1] += x1;
+    }
+}
+
 // design a 2nd order Thiran allpass
 static void ThiranBiquad(float f, float& b0, float& b1, float& b2, float& a1, float& a2) {
 
@@ -1104,6 +1141,13 @@ void AudioHRTF::render(int16_t* input, float* output, int index, float azimuth,
     // apply global and local gain adjustment
     gain *= _gainAdjust;
 
+    // disable interpolation from reset state
+    if (_resetState) {
+        _azimuthState = azimuth;
+        _distanceState = distance;
+        _gainState = gain;
+    }
+
     // to avoid polluting the cache, old filters are recomputed instead of stored
     setFilters(firCoef, bqCoef, delay, index, _azimuthState, _distanceState, _gainState, L0);
 
@@ -1175,3 +1219,45 @@ void AudioHRTF::render(int16_t* input, float* output, int index, float azimuth,
 
     _resetState = false;
 }
+
+void AudioHRTF::mixMono(int16_t* input, float* output, float gain, int numFrames) {
+
+    assert(numFrames == HRTF_BLOCK);
+
+    // apply global and local gain adjustment
+    gain *= _gainAdjust;
+
+    // disable interpolation from reset state
+    if (_resetState) {
+        _gainState = gain;
+    }
+
+    // crossfade gain and accumulate
+    gainfade_1x2(input, output, crossfadeTable, _gainState, gain, HRTF_BLOCK);
+
+    // new parameters become old
+    _gainState = gain;
+
+    _resetState = false;
+}
+
+void AudioHRTF::mixStereo(int16_t* input, float* output, float gain, int numFrames) {
+
+    assert(numFrames == HRTF_BLOCK);
+
+    // apply global and local gain adjustment
+    gain *= _gainAdjust;
+
+    // disable interpolation from reset state
+    if (_resetState) {
+        _gainState = gain;
+    }
+
+    // crossfade gain and accumulate
+    gainfade_2x2(input, output, crossfadeTable, _gainState, gain, HRTF_BLOCK);
+
+    // new parameters become old
+    _gainState = gain;
+
+    _resetState = false;
+}
diff --git a/libraries/audio/src/AudioHRTF.h b/libraries/audio/src/AudioHRTF.h
index 7d23f4825a..436d6318a5 100644
--- a/libraries/audio/src/AudioHRTF.h
+++ b/libraries/audio/src/AudioHRTF.h
@@ -50,6 +50,12 @@ public:
     //
     void render(int16_t* input, float* output, int index, float azimuth, float distance, float gain, int numFrames);
 
+    //
+    // Non-spatialized direct mix (accumulates into existing output)
+    //
+    void mixMono(int16_t* input, float* output, float gain, int numFrames);
+    void mixStereo(int16_t* input, float* output, float gain, int numFrames);
+
     //
     // Fast path when input is known to be silent and state as been flushed
     //
diff --git a/libraries/audio/src/avx2/AudioFOA_avx2.cpp b/libraries/audio/src/avx2/AudioFOA_avx2.cpp
index 70f9b0e5f6..15d37fcc3a 100644
--- a/libraries/audio/src/avx2/AudioFOA_avx2.cpp
+++ b/libraries/audio/src/avx2/AudioFOA_avx2.cpp
@@ -1289,14 +1289,16 @@ void convertInput_AVX2(int16_t* src, float *dst[4], float gain, int numFrames) {
 
 #endif
 
-// in-place rotation of the soundfield
-// crossfade between old and new rotation, to prevent artifacts
-void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) {
+// in-place rotation and scaling of the soundfield
+// crossfade between old and new matrix, to prevent artifacts
+void rotate_4x4_AVX2(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) {
 
-    const float md[3][3] = { 
-        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2] },
-        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2] },
-        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2] },
+    // matrix difference
+    const float md[4][4] = { 
+        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2], m0[0][3] - m1[0][3] },
+        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2], m0[1][3] - m1[1][3] },
+        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2], m0[2][3] - m1[2][3] },
+        { m0[3][0] - m1[3][0], m0[3][1] - m1[3][1], m0[3][2] - m1[3][2], m0[3][3] - m1[3][3] },
     };
 
     assert(numFrames % 8 == 0);
@@ -1307,30 +1309,35 @@ void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3],
 
         // interpolate the matrix
         __m256 m00 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][0]), _mm256_broadcast_ss(&m1[0][0]));
-        __m256 m10 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][0]), _mm256_broadcast_ss(&m1[1][0]));
-        __m256 m20 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][0]), _mm256_broadcast_ss(&m1[2][0]));
 
-        __m256 m01 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][1]), _mm256_broadcast_ss(&m1[0][1]));
         __m256 m11 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][1]), _mm256_broadcast_ss(&m1[1][1]));
         __m256 m21 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][1]), _mm256_broadcast_ss(&m1[2][1]));
+        __m256 m31 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][1]), _mm256_broadcast_ss(&m1[3][1]));
 
-        __m256 m02 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][2]), _mm256_broadcast_ss(&m1[0][2]));
         __m256 m12 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][2]), _mm256_broadcast_ss(&m1[1][2]));
         __m256 m22 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][2]), _mm256_broadcast_ss(&m1[2][2]));
+        __m256 m32 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][2]), _mm256_broadcast_ss(&m1[3][2]));
+
+        __m256 m13 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][3]), _mm256_broadcast_ss(&m1[1][3]));
+        __m256 m23 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][3]), _mm256_broadcast_ss(&m1[2][3]));
+        __m256 m33 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][3]), _mm256_broadcast_ss(&m1[3][3]));
 
         // matrix multiply
-        __m256 x = _mm256_mul_ps(m00, _mm256_loadu_ps(&buf[1][i]));
-        __m256 y = _mm256_mul_ps(m10, _mm256_loadu_ps(&buf[1][i]));
-        __m256 z = _mm256_mul_ps(m20, _mm256_loadu_ps(&buf[1][i]));
+        __m256 w = _mm256_mul_ps(m00, _mm256_loadu_ps(&buf[0][i]));
 
-        x = _mm256_fmadd_ps(m01, _mm256_loadu_ps(&buf[2][i]), x);
-        y = _mm256_fmadd_ps(m11, _mm256_loadu_ps(&buf[2][i]), y);
-        z = _mm256_fmadd_ps(m21, _mm256_loadu_ps(&buf[2][i]), z);
+        __m256 x = _mm256_mul_ps(m11, _mm256_loadu_ps(&buf[1][i]));
+        __m256 y = _mm256_mul_ps(m21, _mm256_loadu_ps(&buf[1][i]));
+        __m256 z = _mm256_mul_ps(m31, _mm256_loadu_ps(&buf[1][i]));
 
-        x = _mm256_fmadd_ps(m02, _mm256_loadu_ps(&buf[3][i]), x);
-        y = _mm256_fmadd_ps(m12, _mm256_loadu_ps(&buf[3][i]), y);
-        z = _mm256_fmadd_ps(m22, _mm256_loadu_ps(&buf[3][i]), z);
+        x = _mm256_fmadd_ps(m12, _mm256_loadu_ps(&buf[2][i]), x);
+        y = _mm256_fmadd_ps(m22, _mm256_loadu_ps(&buf[2][i]), y);
+        z = _mm256_fmadd_ps(m32, _mm256_loadu_ps(&buf[2][i]), z);
 
+        x = _mm256_fmadd_ps(m13, _mm256_loadu_ps(&buf[3][i]), x);
+        y = _mm256_fmadd_ps(m23, _mm256_loadu_ps(&buf[3][i]), y);
+        z = _mm256_fmadd_ps(m33, _mm256_loadu_ps(&buf[3][i]), z);
+
+        _mm256_storeu_ps(&buf[0][i], w);
         _mm256_storeu_ps(&buf[1][i], x);
         _mm256_storeu_ps(&buf[2][i], y);
         _mm256_storeu_ps(&buf[3][i], z);