Mirror of https://github.com/overte-org/overte.git

Refactor AVX code for separate compilation and runtime dispatch
commit 3d684dd0e7 (parent 08e299f724)

2 changed files with 140 additions and 96 deletions
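Summary: the AVX FIR kernel, previously parked behind #if 0 in AudioHRTF.cpp, moves into its own translation unit at libraries/audio/src/avx/AudioHRTF_avx.cpp, which can be compiled with VEX encoding enabled (/arch:AVX or -mavx) without affecting the rest of the library. AudioHRTF.cpp gains real CPU feature detection (cpuSupportsAVX / cpuSupportsAVX2) and a dispatcher that binds FIR_1x4 to either FIR_1x4_AVX or FIR_1x4_SSE on the first call.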
libraries/audio/src/AudioHRTF.cpp:

@@ -64,82 +64,7 @@ static const float crossfadeTable[HRTF_BLOCK] = {
 //
 #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
 
-#include <immintrin.h> // AVX
+#include <emmintrin.h>
 
-#if 0 // AVX disabled for now..
-
-// 1 channel input, 4 channel output
-static void FIR_1x4_AVX(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
-
-    float* coef0 = coef[0] + HRTF_TAPS - 1;    // process backwards
-    float* coef1 = coef[1] + HRTF_TAPS - 1;
-    float* coef2 = coef[2] + HRTF_TAPS - 1;
-    float* coef3 = coef[3] + HRTF_TAPS - 1;
-
-    assert(numFrames % 8 == 0);
-
-    for (int i = 0; i < numFrames; i += 8) {
-
-        __m256 acc0 = _mm256_setzero_ps();
-        __m256 acc1 = _mm256_setzero_ps();
-        __m256 acc2 = _mm256_setzero_ps();
-        __m256 acc3 = _mm256_setzero_ps();
-
-        float* ps = &src[i - HRTF_TAPS + 1];    // process forwards
-
-        assert(HRTF_TAPS % 8 == 0);
-
-        for (int k = 0; k < HRTF_TAPS; k += 8) {
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-0]), _mm256_loadu_ps(&ps[k+0])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-0]), _mm256_loadu_ps(&ps[k+0])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-0]), _mm256_loadu_ps(&ps[k+0])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-0]), _mm256_loadu_ps(&ps[k+0])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-1]), _mm256_loadu_ps(&ps[k+1])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-1]), _mm256_loadu_ps(&ps[k+1])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-1]), _mm256_loadu_ps(&ps[k+1])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-1]), _mm256_loadu_ps(&ps[k+1])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-2]), _mm256_loadu_ps(&ps[k+2])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-2]), _mm256_loadu_ps(&ps[k+2])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-2]), _mm256_loadu_ps(&ps[k+2])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-2]), _mm256_loadu_ps(&ps[k+2])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-3]), _mm256_loadu_ps(&ps[k+3])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-3]), _mm256_loadu_ps(&ps[k+3])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-3]), _mm256_loadu_ps(&ps[k+3])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-3]), _mm256_loadu_ps(&ps[k+3])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-4]), _mm256_loadu_ps(&ps[k+4])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-4]), _mm256_loadu_ps(&ps[k+4])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-4]), _mm256_loadu_ps(&ps[k+4])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-4]), _mm256_loadu_ps(&ps[k+4])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-5]), _mm256_loadu_ps(&ps[k+5])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-5]), _mm256_loadu_ps(&ps[k+5])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-5]), _mm256_loadu_ps(&ps[k+5])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-5]), _mm256_loadu_ps(&ps[k+5])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-6]), _mm256_loadu_ps(&ps[k+6])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-6]), _mm256_loadu_ps(&ps[k+6])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-6]), _mm256_loadu_ps(&ps[k+6])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-6]), _mm256_loadu_ps(&ps[k+6])));
-
-            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-7]), _mm256_loadu_ps(&ps[k+7])));
-            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-7]), _mm256_loadu_ps(&ps[k+7])));
-            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-7]), _mm256_loadu_ps(&ps[k+7])));
-            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-7]), _mm256_loadu_ps(&ps[k+7])));
-        }
-
-        _mm256_storeu_ps(&dst0[i], acc0);
-        _mm256_storeu_ps(&dst1[i], acc1);
-        _mm256_storeu_ps(&dst2[i], acc2);
-        _mm256_storeu_ps(&dst3[i], acc3);
-    }
-
-    _mm256_zeroupper();
-}
-#endif
-
 // 1 channel input, 4 channel output
 static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
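With the AVX kernel removed from this file, the include drops from <immintrin.h> (which pulls in AVX intrinsics) to <emmintrin.h> (SSE2 only), keeping the headers in line with this translation unit's baseline codegen: only the new avx/ source pulls in AVX intrinsics.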
@@ -193,23 +118,22 @@ static void FIR_1x4_SSE(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
 }
 
 //
-// Runtime CPU dispatch
+// Detect AVX/AVX2 support
 //
 
-#if 0
-//#if defined(_MSC_VER)
+#if defined(_MSC_VER)
 
 #include <intrin.h>
 
-// detect AVX support
 static bool cpuSupportsAVX() {
     int info[4];
-    int mask = (1<<27) | (1<<28);    // OSXSAVE and AVX
+    int mask = (1 << 27) | (1 << 28);    // OSXSAVE and AVX
 
     __cpuidex(info, 0x1, 0);
 
     bool result = false;
     if ((info[2] & mask) == mask) {
 
         if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
             result = true;
         }
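The MSVC path checks CPUID leaf 1: ECX bit 27 (OSXSAVE) says the OS uses XSAVE, and bit 28 says the CPU supports AVX. Both must be set before _xgetbv can be trusted; XCR0 bits 1 and 2 (mask 0x6) then confirm the OS actually saves XMM and YMM state across context switches. A standalone sketch of the same check (MSVC intrinsics, x86/x64 only; main() and the printout are illustrative, not part of the commit):

#include <stdio.h>
#include <intrin.h>

int main() {
    int info[4];
    int mask = (1 << 27) | (1 << 28);    // CPUID.1:ECX -- OSXSAVE and AVX

    __cpuidex(info, 0x1, 0);

    bool avx = false;
    if ((info[2] & mask) == mask) {
        // XCR0 bit 1 (XMM state) and bit 2 (YMM state) must both be OS-enabled
        if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
            avx = true;
        }
    }
    printf("AVX usable: %s\n", avx ? "yes" : "no");
    return 0;
}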
@@ -217,33 +141,57 @@ static bool cpuSupportsAVX() {
     return result;
 }
 
-typedef void (*t_FIR_1x4)(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
-
-// dispatch stub
-static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
-    static t_FIR_1x4 f = cpuSupportsAVX() ? FIR_1x4_AVX : FIR_1x4_SSE;    // init on first call
-    (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames);    // dispatch
-}
+static bool cpuSupportsAVX2() {
+    int info[4];
+    int mask = (1 << 5);    // AVX2
+
+    bool result = false;
+    if (cpuSupportsAVX()) {
+
+        __cpuidex(info, 0x7, 0);
+
+        if ((info[1] & mask) == mask) {
+            result = true;
+        }
+    }
+    return result;
+}
 
-//#elif defined(__GNU__)
+#elif defined(__GNU__) || defined(__clang__)
 
-typedef void (*t_FIR_1x4)(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
-
-// dispatch stub
-static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
-    static t_FIR_1x4 f = __builtin_cpu_supports("avx") ? FIR_1x4_AVX : FIR_1x4_SSE;    // init on first call
-    (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames);    // dispatch
-}
+static bool cpuSupportsAVX() {
+    return __builtin_cpu_supports("avx");
+}
+
+static bool cpuSupportsAVX2() {
+    return __builtin_cpu_supports("avx2");
+}
 
 #else
 
-// always use SSE version
-static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
-    FIR_1x4_SSE(src, dst0, dst1, dst2, dst3, coef, numFrames);
-}
+static bool cpuSupportsAVX() {
+    return false;
+}
+
+static bool cpuSupportsAVX2() {
+    return false;
+}
 
 #endif
 
+//
+// Runtime CPU dispatch
+//
+
+typedef void FIR_1x4_t(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames);
+FIR_1x4_t FIR_1x4_AVX;    // separate compilation with VEX-encoding enabled
+
+static void FIR_1x4(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
+
+    static FIR_1x4_t* f = cpuSupportsAVX() ? FIR_1x4_AVX : FIR_1x4_SSE;    // init on first call
+    (*f)(src, dst0, dst1, dst2, dst3, coef, numFrames);    // dispatch
+}
+
 // 4 channel planar to interleaved
 static void interleave_4x4(float* src0, float* src1, float* src2, float* src3, float* dst, int numFrames) {
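Here FIR_1x4_t is a function type, not a pointer type, so the line "FIR_1x4_t FIR_1x4_AVX;" declares the AVX kernel whose definition lives in the separately compiled avx/AudioHRTF_avx.cpp. The function-local static pointer is initialized exactly once, on the first call (a thread-safe "magic static" since C++11), so the detection cost is paid once. One caveat worth flagging: GCC predefines __GNUC__, not __GNU__ (the latter is only defined when targeting GNU/Hurd), so the #elif branch as written likely fires only under Clang. A self-contained illustration of the dispatch idiom (hypothetical kernel names, not from the commit; any C++11 compiler):

#include <cstdio>

typedef void Kernel_t(const float* src, float* dst, int n);    // function type, like FIR_1x4_t

Kernel_t kernel_avx;    // declarations only -- the definitions could live in other
Kernel_t kernel_sse;    // translation units built with different codegen flags

static bool haveAVX() { return false; }    // stand-in for cpuSupportsAVX()

static void kernel(const float* src, float* dst, int n) {
    static Kernel_t* f = haveAVX() ? kernel_avx : kernel_sse;    // bound once, on first call
    (*f)(src, dst, n);    // dispatch
}

void kernel_avx(const float* src, float* dst, int n) { for (int i = 0; i < n; i++) dst[i] = 2.0f * src[i]; }
void kernel_sse(const float* src, float* dst, int n) { for (int i = 0; i < n; i++) dst[i] = src[i] + src[i]; }

int main() {
    float src[4] = { 1.0f, 2.0f, 3.0f, 4.0f }, dst[4];
    kernel(src, dst, 4);
    printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
    return 0;
}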
libraries/audio/src/avx/AudioHRTF_avx.cpp (new file, 96 lines):

@@ -0,0 +1,96 @@
+//
+//  AudioHRTF_avx.cpp
+//  libraries/audio/src/avx
+//
+//  Created by Ken Cooke on 1/17/16.
+//  Copyright 2016 High Fidelity, Inc.
+//
+//  Distributed under the Apache License, Version 2.0.
+//  See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
+//
+
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "../AudioHRTF.h"
+
+#ifndef __AVX__
+#error Must be compiled with /arch:AVX or -mavx.
+#endif
+
+// 1 channel input, 4 channel output
+void FIR_1x4_AVX(float* src, float* dst0, float* dst1, float* dst2, float* dst3, float coef[4][HRTF_TAPS], int numFrames) {
+
+    float* coef0 = coef[0] + HRTF_TAPS - 1;    // process backwards
+    float* coef1 = coef[1] + HRTF_TAPS - 1;
+    float* coef2 = coef[2] + HRTF_TAPS - 1;
+    float* coef3 = coef[3] + HRTF_TAPS - 1;
+
+    assert(numFrames % 8 == 0);
+
+    for (int i = 0; i < numFrames; i += 8) {
+
+        __m256 acc0 = _mm256_setzero_ps();
+        __m256 acc1 = _mm256_setzero_ps();
+        __m256 acc2 = _mm256_setzero_ps();
+        __m256 acc3 = _mm256_setzero_ps();
+
+        float* ps = &src[i - HRTF_TAPS + 1];    // process forwards
+
+        assert(HRTF_TAPS % 8 == 0);
+
+        for (int k = 0; k < HRTF_TAPS; k += 8) {
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-0]), _mm256_loadu_ps(&ps[k+0])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-0]), _mm256_loadu_ps(&ps[k+0])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-0]), _mm256_loadu_ps(&ps[k+0])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-0]), _mm256_loadu_ps(&ps[k+0])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-1]), _mm256_loadu_ps(&ps[k+1])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-1]), _mm256_loadu_ps(&ps[k+1])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-1]), _mm256_loadu_ps(&ps[k+1])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-1]), _mm256_loadu_ps(&ps[k+1])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-2]), _mm256_loadu_ps(&ps[k+2])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-2]), _mm256_loadu_ps(&ps[k+2])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-2]), _mm256_loadu_ps(&ps[k+2])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-2]), _mm256_loadu_ps(&ps[k+2])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-3]), _mm256_loadu_ps(&ps[k+3])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-3]), _mm256_loadu_ps(&ps[k+3])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-3]), _mm256_loadu_ps(&ps[k+3])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-3]), _mm256_loadu_ps(&ps[k+3])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-4]), _mm256_loadu_ps(&ps[k+4])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-4]), _mm256_loadu_ps(&ps[k+4])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-4]), _mm256_loadu_ps(&ps[k+4])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-4]), _mm256_loadu_ps(&ps[k+4])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-5]), _mm256_loadu_ps(&ps[k+5])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-5]), _mm256_loadu_ps(&ps[k+5])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-5]), _mm256_loadu_ps(&ps[k+5])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-5]), _mm256_loadu_ps(&ps[k+5])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-6]), _mm256_loadu_ps(&ps[k+6])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-6]), _mm256_loadu_ps(&ps[k+6])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-6]), _mm256_loadu_ps(&ps[k+6])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-6]), _mm256_loadu_ps(&ps[k+6])));
+
+            acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_broadcast_ss(&coef0[-k-7]), _mm256_loadu_ps(&ps[k+7])));
+            acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_broadcast_ss(&coef1[-k-7]), _mm256_loadu_ps(&ps[k+7])));
+            acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_broadcast_ss(&coef2[-k-7]), _mm256_loadu_ps(&ps[k+7])));
+            acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_broadcast_ss(&coef3[-k-7]), _mm256_loadu_ps(&ps[k+7])));
+        }
+
+        _mm256_storeu_ps(&dst0[i], acc0);
+        _mm256_storeu_ps(&dst1[i], acc1);
+        _mm256_storeu_ps(&dst2[i], acc2);
+        _mm256_storeu_ps(&dst3[i], acc3);
+    }
+
+    _mm256_zeroupper();
+}
+
+#endif
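The #ifndef __AVX__ guard makes the new translation unit refuse to build unless the compiler was invoked with /arch:AVX or -mavx; the corresponding build-system change is not visible in this diff. The kernel produces 8 output frames per channel per iteration: for each tap it broadcasts one coefficient across a 256-bit register and multiplies it by 8 consecutive input samples, and _mm256_zeroupper() at the end clears the upper YMM halves to avoid AVX-to-SSE transition penalties in the caller. For reference, a scalar equivalent of the convolution (a sketch for clarity, not part of the commit; HRTF_TAPS comes from AudioHRTF.h):

// With m = HRTF_TAPS - 1 - k this is the standard FIR convolution
// dst[n] = sum_m coef[m] * src[n - m], reading src history at negative offsets.
static void FIR_1x4_scalar(float* src, float* dst0, float* dst1, float* dst2, float* dst3,
                           float coef[4][HRTF_TAPS], int numFrames) {
    float* dst[4] = { dst0, dst1, dst2, dst3 };
    for (int ch = 0; ch < 4; ch++) {
        for (int n = 0; n < numFrames; n++) {
            float acc = 0.0f;
            for (int k = 0; k < HRTF_TAPS; k++) {
                acc += coef[ch][HRTF_TAPS - 1 - k] * src[n - HRTF_TAPS + 1 + k];
            }
            dst[ch][n] = acc;
        }
    }
}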