Merge pull request #15307 from kencooke/audio-gain-interpolation

Case 22019: Audio clicks/pops when gain is rapidly changing
2025-04-07 10:02:24 +02:00 · 2019-04-10 12:26:59 -07:00 · 2019-04-10 12:26:59 -07:00 · 6f4f7335dc
commit 6f4f7335dc
parent feb2757990 85368e6836
7 changed files with 205 additions and 93 deletions
--- a/assignment-client/src/audio/AudioMixerSlave.cpp
+++ b/assignment-client/src/audio/AudioMixerSlave.cpp
@ -549,38 +549,28 @@ void AudioMixerSlave::addStream(AudioMixerClientData::MixableStream& mixableStre
    // grab the stream from the ring buffer
    AudioRingBuffer::ConstIterator streamPopOutput = streamToAdd->getLastPopOutput();

-    // stereo sources are not passed through HRTF
    if (streamToAdd->isStereo()) {

-        // apply the avatar gain adjustment
-        gain *= mixableStream.hrtf->getGainAdjustment();
+        streamPopOutput.readSamples(_bufferSamples, AudioConstants::NETWORK_FRAME_SAMPLES_STEREO);

-        const float scale = 1 / 32768.0f; // int16_t to float
-
-        for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) {
-            _mixSamples[2*i+0] += (float)streamPopOutput[2*i+0] * gain * scale;
-            _mixSamples[2*i+1] += (float)streamPopOutput[2*i+1] * gain * scale;
-        }
+        // stereo sources are not passed through HRTF
+        mixableStream.hrtf->mixStereo(_bufferSamples, _mixSamples, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);

        ++stats.manualStereoMixes;
    } else if (isEcho) {
+
+        streamPopOutput.readSamples(_bufferSamples, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
+
        // echo sources are not passed through HRTF
-
-        const float scale = 1/32768.0f; // int16_t to float
-
-        for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) {
-            float sample = (float)streamPopOutput[i] * gain * scale;
-            _mixSamples[2*i+0] += sample;
-            _mixSamples[2*i+1] += sample;
-        }
+        mixableStream.hrtf->mixMono(_bufferSamples, _mixSamples, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);

        ++stats.manualEchoMixes;
    } else {
+
        streamPopOutput.readSamples(_bufferSamples, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);

        mixableStream.hrtf->render(_bufferSamples, _mixSamples, HRTF_DATASET_INDEX, azimuth, distance, gain,
                                   AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
-
        ++stats.hrtfRenders;
    }
 }
--- a/libraries/audio-client/src/AudioClient.cpp
+++ b/libraries/audio-client/src/AudioClient.cpp
@ -1397,7 +1397,6 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
                    // spatialize into mixBuffer
                    injector->getLocalFOA().render(_localScratchBuffer, mixBuffer, HRTF_DATASET_INDEX,
                                                   qw, qx, qy, qz, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
-
                } else if (options.stereo) {

                    if (options.positionSet) {
@ -1409,11 +1408,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
                    }

                    // direct mix into mixBuffer
-                    for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) {
-                        mixBuffer[2*i+0] += convertToFloat(_localScratchBuffer[2*i+0]) * gain;
-                        mixBuffer[2*i+1] += convertToFloat(_localScratchBuffer[2*i+1]) * gain;
-                    }
-
+                    injector->getLocalHRTF().mixStereo(_localScratchBuffer, mixBuffer, gain,
+                                                       AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
                } else {    // injector is mono

                    if (options.positionSet) {
@ -1431,11 +1427,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
                    } else {

                        // direct mix into mixBuffer
-                        for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) {
-                            float sample = convertToFloat(_localScratchBuffer[i]) * gain;
-                            mixBuffer[2*i+0] += sample;
-                            mixBuffer[2*i+1] += sample;
-                        }
+                        injector->getLocalHRTF().mixMono(_localScratchBuffer, mixBuffer, gain,
+                                                         AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
                    }
                }

--- a/libraries/audio/src/AudioFOA.cpp
+++ b/libraries/audio/src/AudioFOA.cpp
@ -882,14 +882,16 @@ static void convertInput_ref(int16_t* src, float *dst[4], float gain, int numFra

 #endif

-// in-place rotation of the soundfield
-// crossfade between old and new rotation, to prevent artifacts
-static void rotate_3x3_ref(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) {
+// in-place rotation and scaling of the soundfield
+// crossfade between old and new matrix, to prevent artifacts
+static void rotate_4x4_ref(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) {

-    const float md[3][3] = { 
-        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2] },
-        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2] },
-        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2] },
+    // matrix difference
+    const float md[4][4] = { 
+        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2], m0[0][3] - m1[0][3] },
+        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2], m0[1][3] - m1[1][3] },
+        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2], m0[2][3] - m1[2][3] },
+        { m0[3][0] - m1[3][0], m0[3][1] - m1[3][1], m0[3][2] - m1[3][2], m0[3][3] - m1[3][3] },
    };

    for (int i = 0; i < numFrames; i++) {
@ -898,22 +900,27 @@ static void rotate_3x3_ref(float* buf[4], const float m0[3][3], const float m1[3

        // interpolate the matrix
        float m00 = m1[0][0] + frac * md[0][0];
-        float m10 = m1[1][0] + frac * md[1][0];
-        float m20 = m1[2][0] + frac * md[2][0];

-        float m01 = m1[0][1] + frac * md[0][1];
        float m11 = m1[1][1] + frac * md[1][1];
        float m21 = m1[2][1] + frac * md[2][1];
+        float m31 = m1[3][1] + frac * md[3][1];

-        float m02 = m1[0][2] + frac * md[0][2];
        float m12 = m1[1][2] + frac * md[1][2];
        float m22 = m1[2][2] + frac * md[2][2];
+        float m32 = m1[3][2] + frac * md[3][2];
+
+        float m13 = m1[1][3] + frac * md[1][3];
+        float m23 = m1[2][3] + frac * md[2][3];
+        float m33 = m1[3][3] + frac * md[3][3];

        // matrix multiply
-        float x = m00 * buf[1][i] + m01 * buf[2][i] + m02 * buf[3][i];
-        float y = m10 * buf[1][i] + m11 * buf[2][i] + m12 * buf[3][i];
-        float z = m20 * buf[1][i] + m21 * buf[2][i] + m22 * buf[3][i];
+        float w = m00 * buf[0][i];

+        float x = m11 * buf[1][i] + m12 * buf[2][i] + m13 * buf[3][i];
+        float y = m21 * buf[1][i] + m22 * buf[2][i] + m23 * buf[3][i];
+        float z = m31 * buf[1][i] + m32 * buf[2][i] + m33 * buf[3][i];
+
+        buf[0][i] = w;
        buf[1][i] = x;
        buf[2][i] = y;
        buf[3][i] = z;
@ -932,7 +939,7 @@ void rfft512_AVX2(float buf[512]);
 void rifft512_AVX2(float buf[512]);
 void rfft512_cmadd_1X2_AVX2(const float src[512], const float coef0[512], const float coef1[512], float dst0[512], float dst1[512]);
 void convertInput_AVX2(int16_t* src, float *dst[4], float gain, int numFrames);
-void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames);
+void rotate_4x4_AVX2(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames);

 static void rfft512(float buf[512]) {
    static auto f = cpuSupportsAVX2() ? rfft512_AVX2 : rfft512_ref;
@ -954,8 +961,8 @@ static void convertInput(int16_t* src, float *dst[4], float gain, int numFrames)
    (*f)(src, dst, gain, numFrames);  // dispatch
 }

-static void rotate_3x3(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) {
-    static auto f = cpuSupportsAVX2() ? rotate_3x3_AVX2 : rotate_3x3_ref;
+static void rotate_4x4(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) {
+    static auto f = cpuSupportsAVX2() ? rotate_4x4_AVX2 : rotate_4x4_ref;
    (*f)(buf, m0, m1, win, numFrames);  // dispatch
 }

@ -965,7 +972,7 @@ static auto& rfft512 = rfft512_ref;
 static auto& rifft512 = rifft512_ref;
 static auto& rfft512_cmadd_1X2 = rfft512_cmadd_1X2_ref;
 static auto& convertInput = convertInput_ref;
-static auto& rotate_3x3 = rotate_3x3_ref;
+static auto& rotate_4x4 = rotate_4x4_ref;

 #endif

@ -1007,8 +1014,8 @@ ALIGN32 static const float crossfadeTable[FOA_BLOCK] = {
    0.0020975362f, 0.0015413331f, 0.0010705384f, 0.0006852326f, 0.0003854819f, 0.0001713375f, 0.0000428362f, 0.0000000000f, 
 };

-// convert quaternion to a column-major 3x3 rotation matrix
-static void quatToMatrix_3x3(float w, float x, float y, float z, float m[3][3]) {
+// convert quaternion to a column-major 4x4 rotation matrix
+static void quatToMatrix_4x4(float w, float x, float y, float z, float m[4][4]) {

    float xx = x * (x + x);
    float xy = x * (y + y);
@ -1022,17 +1029,33 @@ static void quatToMatrix_3x3(float w, float x, float y, float z, float m[3][3])
    float wy = w * (y + y);
    float wz = w * (z + z);

-    m[0][0] = 1.0f - (yy + zz);
-    m[0][1] = xy - wz;
-    m[0][2] = xz + wy;
+    m[0][0] = 1.0f;
+    m[0][1] = 0.0f;
+    m[0][2] = 0.0f;
+    m[0][3] = 0.0f;

-    m[1][0] = xy + wz;
-    m[1][1] = 1.0f - (xx + zz);
-    m[1][2] = yz - wx;
+    m[1][0] = 0.0f;
+    m[1][1] = 1.0f - (yy + zz);
+    m[1][2] = xy - wz;
+    m[1][3] = xz + wy;

-    m[2][0] = xz - wy;
-    m[2][1] = yz + wx;
-    m[2][2] = 1.0f - (xx + yy);
+    m[2][0] = 0.0f;
+    m[2][1] = xy + wz;
+    m[2][2] = 1.0f - (xx + zz);
+    m[2][3] = yz - wx;
+
+    m[3][0] = 0.0f;
+    m[3][1] = xz - wy;
+    m[3][2] = yz + wx;
+    m[3][3] = 1.0f - (xx + yy);
+}
+
+static void scaleMatrix_4x4(float scale, float m[4][4]) {
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            m[i][j] *= scale;
+        }
+    }
 }

 // Ambisonic to binaural render
@ -1047,18 +1070,26 @@ void AudioFOA::render(int16_t* input, float* output, int index, float qw, float
    ALIGN32 float inBuffer[4][FOA_BLOCK];       // deinterleaved input buffers

    float* in[4] = { inBuffer[0], inBuffer[1], inBuffer[2], inBuffer[3] };
-    float rotation[3][3];
+    float rotation[4][4];

    // convert input to deinterleaved float
-    convertInput(input, in, FOA_GAIN * gain, FOA_BLOCK);
+    convertInput(input, in, FOA_GAIN, FOA_BLOCK);

-    // convert quaternion to 3x3 rotation
-    quatToMatrix_3x3(qw, qx, qy, qz, rotation);
+    // convert quaternion to 4x4 rotation
+    quatToMatrix_4x4(qw, qx, qy, qz, rotation);

-    // rotate the soundfield
-    rotate_3x3(in, _rotationState, rotation, crossfadeTable, FOA_BLOCK);
+    // apply gain as uniform scale
+    scaleMatrix_4x4(gain, rotation);

-    // rotation history update
+    // disable interpolation from reset state
+    if (_resetState) {
+        memcpy(_rotationState, rotation, sizeof(_rotationState));
+    }
+
+    // rotate and scale the soundfield
+    rotate_4x4(in, _rotationState, rotation, crossfadeTable, FOA_BLOCK);
+
+    // new parameters become old
    memcpy(_rotationState, rotation, sizeof(_rotationState));

    //
@ -1093,4 +1124,6 @@ void AudioFOA::render(int16_t* input, float* output, int index, float qw, float
        output[2*i+0] += accBuffer[0][i + FOA_OVERLAP];
        output[2*i+1] += accBuffer[1][i + FOA_OVERLAP];
    }
+
+    _resetState = false;
 }
--- a/libraries/audio/src/AudioFOA.h
+++ b/libraries/audio/src/AudioFOA.h
@ -28,12 +28,7 @@ static_assert((FOA_BLOCK + FOA_OVERLAP) == FOA_NFFT, "FFT convolution requires L
 class AudioFOA {

 public:
-    AudioFOA() {
-        // identity matrix
-        _rotationState[0][0] = 1.0f;
-        _rotationState[1][1] = 1.0f;
-        _rotationState[2][2] = 1.0f;
-    };
+    AudioFOA() {};

    //
    // input: interleaved First-Order Ambisonic source
@ -55,8 +50,10 @@ private:
    // input history, for overlap-save
    float _fftState[4][FOA_OVERLAP] = {};

-    // orientation history
-    float _rotationState[3][3] = {};
+    // orientation and gain history
+    float _rotationState[4][4] = {};
+
+    bool _resetState = true;
 };

 #endif // AudioFOA_h
--- a/libraries/audio/src/AudioHRTF.cpp
+++ b/libraries/audio/src/AudioHRTF.cpp
@ -750,6 +750,43 @@ static void interpolate(const float* src0, const float* src1, float* dst, float

 #endif

+// apply gain crossfade with accumulation (interleaved)
+static void gainfade_1x2(int16_t* src, float* dst, const float* win, float gain0, float gain1, int numFrames) {
+
+    gain0 *= (1/32768.0f);  // int16_t to float
+    gain1 *= (1/32768.0f);
+
+    for (int i = 0; i < numFrames; i++) {
+
+        float frac = win[i];
+        float gain = gain1 + frac * (gain0 - gain1);
+
+        float x0 = (float)src[i] * gain;
+
+        dst[2*i+0] += x0;
+        dst[2*i+1] += x0;
+    }
+}
+
+// apply gain crossfade with accumulation (interleaved)
+static void gainfade_2x2(int16_t* src, float* dst, const float* win, float gain0, float gain1, int numFrames) {
+
+    gain0 *= (1/32768.0f);  // int16_t to float
+    gain1 *= (1/32768.0f);
+
+    for (int i = 0; i < numFrames; i++) {
+
+        float frac = win[i];
+        float gain = gain1 + frac * (gain0 - gain1);
+
+        float x0 = (float)src[2*i+0] * gain;
+        float x1 = (float)src[2*i+1] * gain;
+
+        dst[2*i+0] += x0;
+        dst[2*i+1] += x1;
+    }
+}
+
 // design a 2nd order Thiran allpass
 static void ThiranBiquad(float f, float& b0, float& b1, float& b2, float& a1, float& a2) {

@ -1104,6 +1141,13 @@ void AudioHRTF::render(int16_t* input, float* output, int index, float azimuth,
    // apply global and local gain adjustment
    gain *= _gainAdjust;

+    // disable interpolation from reset state
+    if (_resetState) {
+        _azimuthState = azimuth;
+        _distanceState = distance;
+        _gainState = gain;
+    }
+
    // to avoid polluting the cache, old filters are recomputed instead of stored
    setFilters(firCoef, bqCoef, delay, index, _azimuthState, _distanceState, _gainState, L0);

@ -1175,3 +1219,45 @@ void AudioHRTF::render(int16_t* input, float* output, int index, float azimuth,

    _resetState = false;
 }
+
+void AudioHRTF::mixMono(int16_t* input, float* output, float gain, int numFrames) {
+
+    assert(numFrames == HRTF_BLOCK);
+
+    // apply global and local gain adjustment
+    gain *= _gainAdjust;
+
+    // disable interpolation from reset state
+    if (_resetState) {
+        _gainState = gain;
+    }
+
+    // crossfade gain and accumulate
+    gainfade_1x2(input, output, crossfadeTable, _gainState, gain, HRTF_BLOCK);
+
+    // new parameters become old
+    _gainState = gain;
+
+    _resetState = false;
+}
+
+void AudioHRTF::mixStereo(int16_t* input, float* output, float gain, int numFrames) {
+
+    assert(numFrames == HRTF_BLOCK);
+
+    // apply global and local gain adjustment
+    gain *= _gainAdjust;
+
+    // disable interpolation from reset state
+    if (_resetState) {
+        _gainState = gain;
+    }
+
+    // crossfade gain and accumulate
+    gainfade_2x2(input, output, crossfadeTable, _gainState, gain, HRTF_BLOCK);
+
+    // new parameters become old
+    _gainState = gain;
+
+    _resetState = false;
+}
--- a/libraries/audio/src/AudioHRTF.h
+++ b/libraries/audio/src/AudioHRTF.h
@ -50,6 +50,12 @@ public:
    //
    void render(int16_t* input, float* output, int index, float azimuth, float distance, float gain, int numFrames);

+    //
+    // Non-spatialized direct mix (accumulates into existing output)
+    //
+    void mixMono(int16_t* input, float* output, float gain, int numFrames);
+    void mixStereo(int16_t* input, float* output, float gain, int numFrames);
+
    //
    // Fast path when input is known to be silent and state as been flushed
    //
--- a/libraries/audio/src/avx2/AudioFOA_avx2.cpp
+++ b/libraries/audio/src/avx2/AudioFOA_avx2.cpp
@ -1289,14 +1289,16 @@ void convertInput_AVX2(int16_t* src, float *dst[4], float gain, int numFrames) {

 #endif

-// in-place rotation of the soundfield
-// crossfade between old and new rotation, to prevent artifacts
-void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) {
+// in-place rotation and scaling of the soundfield
+// crossfade between old and new matrix, to prevent artifacts
+void rotate_4x4_AVX2(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) {

-    const float md[3][3] = { 
-        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2] },
-        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2] },
-        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2] },
+    // matrix difference
+    const float md[4][4] = { 
+        { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2], m0[0][3] - m1[0][3] },
+        { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2], m0[1][3] - m1[1][3] },
+        { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2], m0[2][3] - m1[2][3] },
+        { m0[3][0] - m1[3][0], m0[3][1] - m1[3][1], m0[3][2] - m1[3][2], m0[3][3] - m1[3][3] },
    };

    assert(numFrames % 8 == 0);
@ -1307,30 +1309,35 @@ void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3],

        // interpolate the matrix
        __m256 m00 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][0]), _mm256_broadcast_ss(&m1[0][0]));
-        __m256 m10 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][0]), _mm256_broadcast_ss(&m1[1][0]));
-        __m256 m20 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][0]), _mm256_broadcast_ss(&m1[2][0]));

-        __m256 m01 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][1]), _mm256_broadcast_ss(&m1[0][1]));
        __m256 m11 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][1]), _mm256_broadcast_ss(&m1[1][1]));
        __m256 m21 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][1]), _mm256_broadcast_ss(&m1[2][1]));
+        __m256 m31 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][1]), _mm256_broadcast_ss(&m1[3][1]));

-        __m256 m02 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][2]), _mm256_broadcast_ss(&m1[0][2]));
        __m256 m12 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][2]), _mm256_broadcast_ss(&m1[1][2]));
        __m256 m22 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][2]), _mm256_broadcast_ss(&m1[2][2]));
+        __m256 m32 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][2]), _mm256_broadcast_ss(&m1[3][2]));
+
+        __m256 m13 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][3]), _mm256_broadcast_ss(&m1[1][3]));
+        __m256 m23 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][3]), _mm256_broadcast_ss(&m1[2][3]));
+        __m256 m33 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][3]), _mm256_broadcast_ss(&m1[3][3]));

        // matrix multiply
-        __m256 x = _mm256_mul_ps(m00, _mm256_loadu_ps(&buf[1][i]));
-        __m256 y = _mm256_mul_ps(m10, _mm256_loadu_ps(&buf[1][i]));
-        __m256 z = _mm256_mul_ps(m20, _mm256_loadu_ps(&buf[1][i]));
+        __m256 w = _mm256_mul_ps(m00, _mm256_loadu_ps(&buf[0][i]));

-        x = _mm256_fmadd_ps(m01, _mm256_loadu_ps(&buf[2][i]), x);
-        y = _mm256_fmadd_ps(m11, _mm256_loadu_ps(&buf[2][i]), y);
-        z = _mm256_fmadd_ps(m21, _mm256_loadu_ps(&buf[2][i]), z);
+        __m256 x = _mm256_mul_ps(m11, _mm256_loadu_ps(&buf[1][i]));
+        __m256 y = _mm256_mul_ps(m21, _mm256_loadu_ps(&buf[1][i]));
+        __m256 z = _mm256_mul_ps(m31, _mm256_loadu_ps(&buf[1][i]));

-        x = _mm256_fmadd_ps(m02, _mm256_loadu_ps(&buf[3][i]), x);
-        y = _mm256_fmadd_ps(m12, _mm256_loadu_ps(&buf[3][i]), y);
-        z = _mm256_fmadd_ps(m22, _mm256_loadu_ps(&buf[3][i]), z);
+        x = _mm256_fmadd_ps(m12, _mm256_loadu_ps(&buf[2][i]), x);
+        y = _mm256_fmadd_ps(m22, _mm256_loadu_ps(&buf[2][i]), y);
+        z = _mm256_fmadd_ps(m32, _mm256_loadu_ps(&buf[2][i]), z);

+        x = _mm256_fmadd_ps(m13, _mm256_loadu_ps(&buf[3][i]), x);
+        y = _mm256_fmadd_ps(m23, _mm256_loadu_ps(&buf[3][i]), y);
+        z = _mm256_fmadd_ps(m33, _mm256_loadu_ps(&buf[3][i]), z);
+
+        _mm256_storeu_ps(&buf[0][i], w);
        _mm256_storeu_ps(&buf[1][i], x);
        _mm256_storeu_ps(&buf[2][i], y);
        _mm256_storeu_ps(&buf[3][i], z);