diff --git a/assignment-client/src/audio/AudioMixerSlave.cpp b/assignment-client/src/audio/AudioMixerSlave.cpp index cb90df58e5..e5e9f89984 100644 --- a/assignment-client/src/audio/AudioMixerSlave.cpp +++ b/assignment-client/src/audio/AudioMixerSlave.cpp @@ -549,38 +549,28 @@ void AudioMixerSlave::addStream(AudioMixerClientData::MixableStream& mixableStre // grab the stream from the ring buffer AudioRingBuffer::ConstIterator streamPopOutput = streamToAdd->getLastPopOutput(); - // stereo sources are not passed through HRTF if (streamToAdd->isStereo()) { - // apply the avatar gain adjustment - gain *= mixableStream.hrtf->getGainAdjustment(); + streamPopOutput.readSamples(_bufferSamples, AudioConstants::NETWORK_FRAME_SAMPLES_STEREO); - const float scale = 1 / 32768.0f; // int16_t to float - - for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) { - _mixSamples[2*i+0] += (float)streamPopOutput[2*i+0] * gain * scale; - _mixSamples[2*i+1] += (float)streamPopOutput[2*i+1] * gain * scale; - } + // stereo sources are not passed through HRTF + mixableStream.hrtf->mixStereo(_bufferSamples, _mixSamples, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL); ++stats.manualStereoMixes; } else if (isEcho) { + + streamPopOutput.readSamples(_bufferSamples, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL); + // echo sources are not passed through HRTF - - const float scale = 1/32768.0f; // int16_t to float - - for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) { - float sample = (float)streamPopOutput[i] * gain * scale; - _mixSamples[2*i+0] += sample; - _mixSamples[2*i+1] += sample; - } + mixableStream.hrtf->mixMono(_bufferSamples, _mixSamples, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL); ++stats.manualEchoMixes; } else { + streamPopOutput.readSamples(_bufferSamples, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL); mixableStream.hrtf->render(_bufferSamples, _mixSamples, HRTF_DATASET_INDEX, azimuth, distance, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL); - ++stats.hrtfRenders; } } diff --git a/libraries/audio-client/src/AudioClient.cpp b/libraries/audio-client/src/AudioClient.cpp index c537fea646..4d3311b065 100644 --- a/libraries/audio-client/src/AudioClient.cpp +++ b/libraries/audio-client/src/AudioClient.cpp @@ -1397,7 +1397,6 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) { // spatialize into mixBuffer injector->getLocalFOA().render(_localScratchBuffer, mixBuffer, HRTF_DATASET_INDEX, qw, qx, qy, qz, gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL); - } else if (options.stereo) { if (options.positionSet) { @@ -1409,11 +1408,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) { } // direct mix into mixBuffer - for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) { - mixBuffer[2*i+0] += convertToFloat(_localScratchBuffer[2*i+0]) * gain; - mixBuffer[2*i+1] += convertToFloat(_localScratchBuffer[2*i+1]) * gain; - } - + injector->getLocalHRTF().mixStereo(_localScratchBuffer, mixBuffer, gain, + AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL); } else { // injector is mono if (options.positionSet) { @@ -1431,11 +1427,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) { } else { // direct mix into mixBuffer - for (int i = 0; i < AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL; i++) { - float sample = convertToFloat(_localScratchBuffer[i]) * gain; - mixBuffer[2*i+0] += sample; - mixBuffer[2*i+1] += sample; - } + injector->getLocalHRTF().mixMono(_localScratchBuffer, mixBuffer, gain, + AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL); } } diff --git a/libraries/audio/src/AudioFOA.cpp b/libraries/audio/src/AudioFOA.cpp index 30d29b72b7..0dd61fbd02 100644 --- a/libraries/audio/src/AudioFOA.cpp +++ b/libraries/audio/src/AudioFOA.cpp @@ -882,14 +882,16 @@ static void convertInput_ref(int16_t* src, float *dst[4], float gain, int numFra #endif -// in-place rotation of the soundfield -// crossfade between old and new rotation, to prevent artifacts -static void rotate_3x3_ref(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) { +// in-place rotation and scaling of the soundfield +// crossfade between old and new matrix, to prevent artifacts +static void rotate_4x4_ref(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) { - const float md[3][3] = { - { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2] }, - { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2] }, - { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2] }, + // matrix difference + const float md[4][4] = { + { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2], m0[0][3] - m1[0][3] }, + { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2], m0[1][3] - m1[1][3] }, + { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2], m0[2][3] - m1[2][3] }, + { m0[3][0] - m1[3][0], m0[3][1] - m1[3][1], m0[3][2] - m1[3][2], m0[3][3] - m1[3][3] }, }; for (int i = 0; i < numFrames; i++) { @@ -898,22 +900,27 @@ static void rotate_3x3_ref(float* buf[4], const float m0[3][3], const float m1[3 // interpolate the matrix float m00 = m1[0][0] + frac * md[0][0]; - float m10 = m1[1][0] + frac * md[1][0]; - float m20 = m1[2][0] + frac * md[2][0]; - float m01 = m1[0][1] + frac * md[0][1]; float m11 = m1[1][1] + frac * md[1][1]; float m21 = m1[2][1] + frac * md[2][1]; + float m31 = m1[3][1] + frac * md[3][1]; - float m02 = m1[0][2] + frac * md[0][2]; float m12 = m1[1][2] + frac * md[1][2]; float m22 = m1[2][2] + frac * md[2][2]; + float m32 = m1[3][2] + frac * md[3][2]; + + float m13 = m1[1][3] + frac * md[1][3]; + float m23 = m1[2][3] + frac * md[2][3]; + float m33 = m1[3][3] + frac * md[3][3]; // matrix multiply - float x = m00 * buf[1][i] + m01 * buf[2][i] + m02 * buf[3][i]; - float y = m10 * buf[1][i] + m11 * buf[2][i] + m12 * buf[3][i]; - float z = m20 * buf[1][i] + m21 * buf[2][i] + m22 * buf[3][i]; + float w = m00 * buf[0][i]; + float x = m11 * buf[1][i] + m12 * buf[2][i] + m13 * buf[3][i]; + float y = m21 * buf[1][i] + m22 * buf[2][i] + m23 * buf[3][i]; + float z = m31 * buf[1][i] + m32 * buf[2][i] + m33 * buf[3][i]; + + buf[0][i] = w; buf[1][i] = x; buf[2][i] = y; buf[3][i] = z; @@ -932,7 +939,7 @@ void rfft512_AVX2(float buf[512]); void rifft512_AVX2(float buf[512]); void rfft512_cmadd_1X2_AVX2(const float src[512], const float coef0[512], const float coef1[512], float dst0[512], float dst1[512]); void convertInput_AVX2(int16_t* src, float *dst[4], float gain, int numFrames); -void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames); +void rotate_4x4_AVX2(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames); static void rfft512(float buf[512]) { static auto f = cpuSupportsAVX2() ? rfft512_AVX2 : rfft512_ref; @@ -954,8 +961,8 @@ static void convertInput(int16_t* src, float *dst[4], float gain, int numFrames) (*f)(src, dst, gain, numFrames); // dispatch } -static void rotate_3x3(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) { - static auto f = cpuSupportsAVX2() ? rotate_3x3_AVX2 : rotate_3x3_ref; +static void rotate_4x4(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) { + static auto f = cpuSupportsAVX2() ? rotate_4x4_AVX2 : rotate_4x4_ref; (*f)(buf, m0, m1, win, numFrames); // dispatch } @@ -965,7 +972,7 @@ static auto& rfft512 = rfft512_ref; static auto& rifft512 = rifft512_ref; static auto& rfft512_cmadd_1X2 = rfft512_cmadd_1X2_ref; static auto& convertInput = convertInput_ref; -static auto& rotate_3x3 = rotate_3x3_ref; +static auto& rotate_4x4 = rotate_4x4_ref; #endif @@ -1007,8 +1014,8 @@ ALIGN32 static const float crossfadeTable[FOA_BLOCK] = { 0.0020975362f, 0.0015413331f, 0.0010705384f, 0.0006852326f, 0.0003854819f, 0.0001713375f, 0.0000428362f, 0.0000000000f, }; -// convert quaternion to a column-major 3x3 rotation matrix -static void quatToMatrix_3x3(float w, float x, float y, float z, float m[3][3]) { +// convert quaternion to a column-major 4x4 rotation matrix +static void quatToMatrix_4x4(float w, float x, float y, float z, float m[4][4]) { float xx = x * (x + x); float xy = x * (y + y); @@ -1022,17 +1029,33 @@ static void quatToMatrix_3x3(float w, float x, float y, float z, float m[3][3]) float wy = w * (y + y); float wz = w * (z + z); - m[0][0] = 1.0f - (yy + zz); - m[0][1] = xy - wz; - m[0][2] = xz + wy; + m[0][0] = 1.0f; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; - m[1][0] = xy + wz; - m[1][1] = 1.0f - (xx + zz); - m[1][2] = yz - wx; + m[1][0] = 0.0f; + m[1][1] = 1.0f - (yy + zz); + m[1][2] = xy - wz; + m[1][3] = xz + wy; - m[2][0] = xz - wy; - m[2][1] = yz + wx; - m[2][2] = 1.0f - (xx + yy); + m[2][0] = 0.0f; + m[2][1] = xy + wz; + m[2][2] = 1.0f - (xx + zz); + m[2][3] = yz - wx; + + m[3][0] = 0.0f; + m[3][1] = xz - wy; + m[3][2] = yz + wx; + m[3][3] = 1.0f - (xx + yy); +} + +static void scaleMatrix_4x4(float scale, float m[4][4]) { + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + m[i][j] *= scale; + } + } } // Ambisonic to binaural render @@ -1047,18 +1070,26 @@ void AudioFOA::render(int16_t* input, float* output, int index, float qw, float ALIGN32 float inBuffer[4][FOA_BLOCK]; // deinterleaved input buffers float* in[4] = { inBuffer[0], inBuffer[1], inBuffer[2], inBuffer[3] }; - float rotation[3][3]; + float rotation[4][4]; // convert input to deinterleaved float - convertInput(input, in, FOA_GAIN * gain, FOA_BLOCK); + convertInput(input, in, FOA_GAIN, FOA_BLOCK); - // convert quaternion to 3x3 rotation - quatToMatrix_3x3(qw, qx, qy, qz, rotation); + // convert quaternion to 4x4 rotation + quatToMatrix_4x4(qw, qx, qy, qz, rotation); - // rotate the soundfield - rotate_3x3(in, _rotationState, rotation, crossfadeTable, FOA_BLOCK); + // apply gain as uniform scale + scaleMatrix_4x4(gain, rotation); - // rotation history update + // disable interpolation from reset state + if (_resetState) { + memcpy(_rotationState, rotation, sizeof(_rotationState)); + } + + // rotate and scale the soundfield + rotate_4x4(in, _rotationState, rotation, crossfadeTable, FOA_BLOCK); + + // new parameters become old memcpy(_rotationState, rotation, sizeof(_rotationState)); // @@ -1093,4 +1124,6 @@ void AudioFOA::render(int16_t* input, float* output, int index, float qw, float output[2*i+0] += accBuffer[0][i + FOA_OVERLAP]; output[2*i+1] += accBuffer[1][i + FOA_OVERLAP]; } + + _resetState = false; } diff --git a/libraries/audio/src/AudioFOA.h b/libraries/audio/src/AudioFOA.h index 9eccc35bce..e8cacc22ab 100644 --- a/libraries/audio/src/AudioFOA.h +++ b/libraries/audio/src/AudioFOA.h @@ -28,12 +28,7 @@ static_assert((FOA_BLOCK + FOA_OVERLAP) == FOA_NFFT, "FFT convolution requires L class AudioFOA { public: - AudioFOA() { - // identity matrix - _rotationState[0][0] = 1.0f; - _rotationState[1][1] = 1.0f; - _rotationState[2][2] = 1.0f; - }; + AudioFOA() {}; // // input: interleaved First-Order Ambisonic source @@ -55,8 +50,10 @@ private: // input history, for overlap-save float _fftState[4][FOA_OVERLAP] = {}; - // orientation history - float _rotationState[3][3] = {}; + // orientation and gain history + float _rotationState[4][4] = {}; + + bool _resetState = true; }; #endif // AudioFOA_h diff --git a/libraries/audio/src/AudioHRTF.cpp b/libraries/audio/src/AudioHRTF.cpp index 9de6440d67..e5e32781b0 100644 --- a/libraries/audio/src/AudioHRTF.cpp +++ b/libraries/audio/src/AudioHRTF.cpp @@ -750,6 +750,43 @@ static void interpolate(const float* src0, const float* src1, float* dst, float #endif +// apply gain crossfade with accumulation (interleaved) +static void gainfade_1x2(int16_t* src, float* dst, const float* win, float gain0, float gain1, int numFrames) { + + gain0 *= (1/32768.0f); // int16_t to float + gain1 *= (1/32768.0f); + + for (int i = 0; i < numFrames; i++) { + + float frac = win[i]; + float gain = gain1 + frac * (gain0 - gain1); + + float x0 = (float)src[i] * gain; + + dst[2*i+0] += x0; + dst[2*i+1] += x0; + } +} + +// apply gain crossfade with accumulation (interleaved) +static void gainfade_2x2(int16_t* src, float* dst, const float* win, float gain0, float gain1, int numFrames) { + + gain0 *= (1/32768.0f); // int16_t to float + gain1 *= (1/32768.0f); + + for (int i = 0; i < numFrames; i++) { + + float frac = win[i]; + float gain = gain1 + frac * (gain0 - gain1); + + float x0 = (float)src[2*i+0] * gain; + float x1 = (float)src[2*i+1] * gain; + + dst[2*i+0] += x0; + dst[2*i+1] += x1; + } +} + // design a 2nd order Thiran allpass static void ThiranBiquad(float f, float& b0, float& b1, float& b2, float& a1, float& a2) { @@ -1104,6 +1141,13 @@ void AudioHRTF::render(int16_t* input, float* output, int index, float azimuth, // apply global and local gain adjustment gain *= _gainAdjust; + // disable interpolation from reset state + if (_resetState) { + _azimuthState = azimuth; + _distanceState = distance; + _gainState = gain; + } + // to avoid polluting the cache, old filters are recomputed instead of stored setFilters(firCoef, bqCoef, delay, index, _azimuthState, _distanceState, _gainState, L0); @@ -1175,3 +1219,45 @@ void AudioHRTF::render(int16_t* input, float* output, int index, float azimuth, _resetState = false; } + +void AudioHRTF::mixMono(int16_t* input, float* output, float gain, int numFrames) { + + assert(numFrames == HRTF_BLOCK); + + // apply global and local gain adjustment + gain *= _gainAdjust; + + // disable interpolation from reset state + if (_resetState) { + _gainState = gain; + } + + // crossfade gain and accumulate + gainfade_1x2(input, output, crossfadeTable, _gainState, gain, HRTF_BLOCK); + + // new parameters become old + _gainState = gain; + + _resetState = false; +} + +void AudioHRTF::mixStereo(int16_t* input, float* output, float gain, int numFrames) { + + assert(numFrames == HRTF_BLOCK); + + // apply global and local gain adjustment + gain *= _gainAdjust; + + // disable interpolation from reset state + if (_resetState) { + _gainState = gain; + } + + // crossfade gain and accumulate + gainfade_2x2(input, output, crossfadeTable, _gainState, gain, HRTF_BLOCK); + + // new parameters become old + _gainState = gain; + + _resetState = false; +} diff --git a/libraries/audio/src/AudioHRTF.h b/libraries/audio/src/AudioHRTF.h index 7d23f4825a..436d6318a5 100644 --- a/libraries/audio/src/AudioHRTF.h +++ b/libraries/audio/src/AudioHRTF.h @@ -50,6 +50,12 @@ public: // void render(int16_t* input, float* output, int index, float azimuth, float distance, float gain, int numFrames); + // + // Non-spatialized direct mix (accumulates into existing output) + // + void mixMono(int16_t* input, float* output, float gain, int numFrames); + void mixStereo(int16_t* input, float* output, float gain, int numFrames); + // // Fast path when input is known to be silent and state as been flushed // diff --git a/libraries/audio/src/avx2/AudioFOA_avx2.cpp b/libraries/audio/src/avx2/AudioFOA_avx2.cpp index 70f9b0e5f6..15d37fcc3a 100644 --- a/libraries/audio/src/avx2/AudioFOA_avx2.cpp +++ b/libraries/audio/src/avx2/AudioFOA_avx2.cpp @@ -1289,14 +1289,16 @@ void convertInput_AVX2(int16_t* src, float *dst[4], float gain, int numFrames) { #endif -// in-place rotation of the soundfield -// crossfade between old and new rotation, to prevent artifacts -void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3], const float* win, int numFrames) { +// in-place rotation and scaling of the soundfield +// crossfade between old and new matrix, to prevent artifacts +void rotate_4x4_AVX2(float* buf[4], const float m0[4][4], const float m1[4][4], const float* win, int numFrames) { - const float md[3][3] = { - { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2] }, - { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2] }, - { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2] }, + // matrix difference + const float md[4][4] = { + { m0[0][0] - m1[0][0], m0[0][1] - m1[0][1], m0[0][2] - m1[0][2], m0[0][3] - m1[0][3] }, + { m0[1][0] - m1[1][0], m0[1][1] - m1[1][1], m0[1][2] - m1[1][2], m0[1][3] - m1[1][3] }, + { m0[2][0] - m1[2][0], m0[2][1] - m1[2][1], m0[2][2] - m1[2][2], m0[2][3] - m1[2][3] }, + { m0[3][0] - m1[3][0], m0[3][1] - m1[3][1], m0[3][2] - m1[3][2], m0[3][3] - m1[3][3] }, }; assert(numFrames % 8 == 0); @@ -1307,30 +1309,35 @@ void rotate_3x3_AVX2(float* buf[4], const float m0[3][3], const float m1[3][3], // interpolate the matrix __m256 m00 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][0]), _mm256_broadcast_ss(&m1[0][0])); - __m256 m10 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][0]), _mm256_broadcast_ss(&m1[1][0])); - __m256 m20 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][0]), _mm256_broadcast_ss(&m1[2][0])); - __m256 m01 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][1]), _mm256_broadcast_ss(&m1[0][1])); __m256 m11 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][1]), _mm256_broadcast_ss(&m1[1][1])); __m256 m21 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][1]), _mm256_broadcast_ss(&m1[2][1])); + __m256 m31 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][1]), _mm256_broadcast_ss(&m1[3][1])); - __m256 m02 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[0][2]), _mm256_broadcast_ss(&m1[0][2])); __m256 m12 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][2]), _mm256_broadcast_ss(&m1[1][2])); __m256 m22 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][2]), _mm256_broadcast_ss(&m1[2][2])); + __m256 m32 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][2]), _mm256_broadcast_ss(&m1[3][2])); + + __m256 m13 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[1][3]), _mm256_broadcast_ss(&m1[1][3])); + __m256 m23 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[2][3]), _mm256_broadcast_ss(&m1[2][3])); + __m256 m33 = _mm256_fmadd_ps(frac, _mm256_broadcast_ss(&md[3][3]), _mm256_broadcast_ss(&m1[3][3])); // matrix multiply - __m256 x = _mm256_mul_ps(m00, _mm256_loadu_ps(&buf[1][i])); - __m256 y = _mm256_mul_ps(m10, _mm256_loadu_ps(&buf[1][i])); - __m256 z = _mm256_mul_ps(m20, _mm256_loadu_ps(&buf[1][i])); + __m256 w = _mm256_mul_ps(m00, _mm256_loadu_ps(&buf[0][i])); - x = _mm256_fmadd_ps(m01, _mm256_loadu_ps(&buf[2][i]), x); - y = _mm256_fmadd_ps(m11, _mm256_loadu_ps(&buf[2][i]), y); - z = _mm256_fmadd_ps(m21, _mm256_loadu_ps(&buf[2][i]), z); + __m256 x = _mm256_mul_ps(m11, _mm256_loadu_ps(&buf[1][i])); + __m256 y = _mm256_mul_ps(m21, _mm256_loadu_ps(&buf[1][i])); + __m256 z = _mm256_mul_ps(m31, _mm256_loadu_ps(&buf[1][i])); - x = _mm256_fmadd_ps(m02, _mm256_loadu_ps(&buf[3][i]), x); - y = _mm256_fmadd_ps(m12, _mm256_loadu_ps(&buf[3][i]), y); - z = _mm256_fmadd_ps(m22, _mm256_loadu_ps(&buf[3][i]), z); + x = _mm256_fmadd_ps(m12, _mm256_loadu_ps(&buf[2][i]), x); + y = _mm256_fmadd_ps(m22, _mm256_loadu_ps(&buf[2][i]), y); + z = _mm256_fmadd_ps(m32, _mm256_loadu_ps(&buf[2][i]), z); + x = _mm256_fmadd_ps(m13, _mm256_loadu_ps(&buf[3][i]), x); + y = _mm256_fmadd_ps(m23, _mm256_loadu_ps(&buf[3][i]), y); + z = _mm256_fmadd_ps(m33, _mm256_loadu_ps(&buf[3][i]), z); + + _mm256_storeu_ps(&buf[0][i], w); _mm256_storeu_ps(&buf[1][i], x); _mm256_storeu_ps(&buf[2][i], y); _mm256_storeu_ps(&buf[3][i], z);