AVX2 optimized audio resampler

This commit is contained in:
Ken Cooke 2016-06-06 11:54:18 -07:00
parent f2bedc546f
commit 34f46b860b
2 changed files with 163 additions and 60 deletions

View file

@ -544,22 +544,9 @@ static const float prototypeFilter[PROTOTYPE_TAPS * PROTOTYPE_PHASES] = {
8.99882100e-06f, 7.61267073e-06f, 6.57702907e-06f, 5.59829210e-06f, 4.27698546e-06f, 1.03248674e-05f,
};
//
// polyphase filter
//
static const int SRC_PHASEBITS = 8;
static const int SRC_PHASES = (1 << SRC_PHASEBITS);
static const int SRC_FRACBITS = 32 - SRC_PHASEBITS;
static const uint32_t SRC_FRACMASK = (1 << SRC_FRACBITS) - 1;
static const float QFRAC_TO_FLOAT = 1.0f / (1 << SRC_FRACBITS);
static const float Q32_TO_FLOAT = 1.0f / (1ULL << 32);
// blocking size in frames, chosen so block processing fits in L1 cache
static const int SRC_BLOCK = 1024;
#define lo32(a) ((uint32_t)(a))
#define hi32(a) ((int32_t)((a) >> 32))
// high/low part of int64_t
#define LO32(a) ((uint32_t)(a))
#define HI32(a) ((int32_t)((a) >> 32))
//
// Portable aligned malloc/free
@ -610,8 +597,8 @@ static void cubicInterpolation(const float* input, float* output, int inputSize,
// Lagrange interpolation using Farrow structure
for (int j = 0; j < outputSize; j++) {
int32_t i = hi32(offset);
uint32_t f = lo32(offset);
int32_t i = HI32(offset);
uint32_t f = LO32(offset);
// values outside the window are zero
float x0 = (i - 1 < 0) ? 0.0f : input[i - 1];
@ -649,7 +636,7 @@ int AudioSRC::createRationalFilter(int upFactor, int downFactor, float gain) {
numTaps = (numCoefs + upFactor - 1) / upFactor;
gain *= (float)oldCoefs / numCoefs;
}
numTaps = (numTaps + 3) & ~3; // SIMD4
numTaps = (numTaps + 7) & ~7; // SIMD8
// interpolate the coefficients of the prototype filter
float* tempFilter = new float[numTaps * numPhases];
@ -658,7 +645,7 @@ int AudioSRC::createRationalFilter(int upFactor, int downFactor, float gain) {
cubicInterpolation(prototypeFilter, tempFilter, prototypeCoefs, numCoefs, gain);
// create the polyphase filter
_polyphaseFilter = (float*)aligned_malloc(numTaps * numPhases * sizeof(float), 16); // SIMD4
_polyphaseFilter = (float*)aligned_malloc(numTaps * numPhases * sizeof(float), 32); // SIMD8
// rearrange into polyphase form, ordered by use
for (int i = 0; i < numPhases; i++) {
@ -699,7 +686,7 @@ int AudioSRC::createIrrationalFilter(int upFactor, int downFactor, float gain) {
numTaps = (numCoefs + upFactor - 1) / upFactor;
gain *= (float)oldCoefs / numCoefs;
}
numTaps = (numTaps + 3) & ~3; // SIMD4
numTaps = (numTaps + 7) & ~7; // SIMD8
// interpolate the coefficients of the prototype filter
float* tempFilter = new float[numTaps * numPhases];
@ -708,7 +695,7 @@ int AudioSRC::createIrrationalFilter(int upFactor, int downFactor, float gain) {
cubicInterpolation(prototypeFilter, tempFilter, prototypeCoefs, numCoefs, gain);
// create the polyphase filter, with extra phase at the end to simplify coef interpolation
_polyphaseFilter = (float*)aligned_malloc(numTaps * (numPhases + 1) * sizeof(float), 16); // SIMD4
_polyphaseFilter = (float*)aligned_malloc(numTaps * (numPhases + 1) * sizeof(float), 32); // SIMD8
// rearrange into polyphase form, ordered by fractional delay
for (int phase = 0; phase < numPhases; phase++) {
@ -741,14 +728,14 @@ int AudioSRC::createIrrationalFilter(int upFactor, int downFactor, float gain) {
#include <emmintrin.h>
int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFrames) {
int AudioSRC::multirateFilter1_SSE(const float* input0, float* output0, int inputFrames) {
int outputFrames = 0;
assert((_numTaps & 0x3) == 0); // SIMD4
assert(_numTaps % 4 == 0); // SIMD4
if (_step == 0) { // rational
int32_t i = hi32(_offset);
int32_t i = HI32(_offset);
while (i < inputFrames) {
@ -761,7 +748,7 @@ int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFra
//float coef = c0[j];
__m128 coef0 = _mm_loadu_ps(&c0[j]);
//acc0 += input0[i + j] * coef;
//acc += input[i + j] * coef;
acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input0[i + j]), coef0), acc0);
}
@ -781,10 +768,10 @@ int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFra
} else { // irrational
while (hi32(_offset) < inputFrames) {
while (HI32(_offset) < inputFrames) {
int32_t i = hi32(_offset);
uint32_t f = lo32(_offset);
int32_t i = HI32(_offset);
uint32_t f = LO32(_offset);
uint32_t phase = f >> SRC_FRACBITS;
__m128 frac = _mm_set1_ps((f & SRC_FRACMASK) * QFRAC_TO_FLOAT);
@ -802,7 +789,7 @@ int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFra
coef1 = _mm_sub_ps(coef1, coef0);
coef0 = _mm_add_ps(_mm_mul_ps(coef1, frac), coef0);
//acc0 += input0[i + j] * coef;
//acc += input[i + j] * coef;
acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input0[i + j]), coef0), acc0);
}
@ -821,14 +808,14 @@ int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFra
return outputFrames;
}
int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) {
int AudioSRC::multirateFilter2_SSE(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) {
int outputFrames = 0;
assert((_numTaps & 0x3) == 0); // SIMD4
assert(_numTaps % 4 == 0); // SIMD4
if (_step == 0) { // rational
int32_t i = hi32(_offset);
int32_t i = HI32(_offset);
while (i < inputFrames) {
@ -842,7 +829,7 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float*
//float coef = c0[j];
__m128 coef0 = _mm_loadu_ps(&c0[j]);
//acc0 += input0[i + j] * coef;
//acc += input[i + j] * coef;
acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input0[i + j]), coef0), acc0);
acc1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input1[i + j]), coef0), acc1);
}
@ -866,10 +853,10 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float*
} else { // irrational
while (hi32(_offset) < inputFrames) {
while (HI32(_offset) < inputFrames) {
int32_t i = hi32(_offset);
uint32_t f = lo32(_offset);
int32_t i = HI32(_offset);
uint32_t f = LO32(_offset);
uint32_t phase = f >> SRC_FRACBITS;
__m128 frac = _mm_set1_ps((f & SRC_FRACMASK) * QFRAC_TO_FLOAT);
@ -888,7 +875,7 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float*
coef1 = _mm_sub_ps(coef1, coef0);
coef0 = _mm_add_ps(_mm_mul_ps(coef1, frac), coef0);
//acc0 += input0[i + j] * coef;
//acc += input[i + j] * coef;
acc0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input0[i + j]), coef0), acc0);
acc1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(&input1[i + j]), coef0), acc1);
}
@ -911,6 +898,107 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float*
return outputFrames;
}
//
// Detect AVX/AVX2 support
//
#if defined(_MSC_VER)
#include <intrin.h>
static bool cpuSupportsAVX() {
int info[4];
int mask = (1 << 27) | (1 << 28); // OSXSAVE and AVX
__cpuidex(info, 0x1, 0);
bool result = false;
if ((info[2] & mask) == mask) {
if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
result = true;
}
}
return result;
}
static bool cpuSupportsAVX2() {
int info[4];
int mask = (1 << 5); // AVX2
bool result = false;
if (cpuSupportsAVX()) {
__cpuidex(info, 0x7, 0);
if ((info[1] & mask) == mask) {
result = true;
}
}
return result;
}
#elif defined(__GNUC__)
#include <cpuid.h>
static bool cpuSupportsAVX() {
unsigned int eax, ebx, ecx, edx;
unsigned int mask = (1 << 27) | (1 << 28); // OSXSAVE and AVX
bool result = false;
if (__get_cpuid(0x1, &eax, &ebx, &ecx, &edx) && ((ecx & mask) == mask)) {
__asm__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
if ((eax & 0x6) == 0x6) {
result = true;
}
}
return result;
}
static bool cpuSupportsAVX2() {
unsigned int eax, ebx, ecx, edx;
unsigned mask = (1 << 5); // AVX2
bool result = false;
if (cpuSupportsAVX()) {
if (__get_cpuid(0x7, &eax, &ebx, &ecx, &edx) && ((ebx & mask) == mask)) {
result = true;
}
}
return result;
}
#else
static bool cpuSupportsAVX() {
return false;
}
static bool cpuSupportsAVX2() {
return false;
}
#endif
//
// Runtime CPU dispatch
//
int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFrames) {
static auto f = cpuSupportsAVX2() ? &AudioSRC::multirateFilter1_AVX2 : &AudioSRC::multirateFilter1_SSE;
return (this->*f)(input0, output0, inputFrames); // dispatch
}
int AudioSRC::multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames) {
static auto f = cpuSupportsAVX2() ? &AudioSRC::multirateFilter2_AVX2 : &AudioSRC::multirateFilter2_SSE;
return (this->*f)(input0, input1, output0, output1, inputFrames); // dispatch
}
// convert int16_t to float, deinterleave stereo
void AudioSRC::convertInputFromInt16(const int16_t* input, float** outputs, int numFrames) {
__m128 scale = _mm_set1_ps(1/32768.0f);
@ -1069,7 +1157,7 @@ int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFra
if (_step == 0) { // rational
int32_t i = hi32(_offset);
int32_t i = HI32(_offset);
while (i < inputFrames) {
@ -1096,10 +1184,10 @@ int AudioSRC::multirateFilter1(const float* input0, float* output0, int inputFra
} else { // irrational
while (hi32(_offset) < inputFrames) {
while (HI32(_offset) < inputFrames) {
int32_t i = hi32(_offset);
uint32_t f = lo32(_offset);
int32_t i = HI32(_offset);
uint32_t f = LO32(_offset);
uint32_t phase = f >> SRC_FRACBITS;
float frac = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT;
@ -1132,7 +1220,7 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float*
if (_step == 0) { // rational
int32_t i = hi32(_offset);
int32_t i = HI32(_offset);
while (i < inputFrames) {
@ -1162,10 +1250,10 @@ int AudioSRC::multirateFilter2(const float* input0, const float* input1, float*
} else { // irrational
while (hi32(_offset) < inputFrames) {
while (HI32(_offset) < inputFrames) {
int32_t i = hi32(_offset);
uint32_t f = lo32(_offset);
int32_t i = HI32(_offset);
uint32_t f = LO32(_offset);
uint32_t phase = f >> SRC_FRACBITS;
float frac = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT;
@ -1320,7 +1408,7 @@ AudioSRC::AudioSRC(int inputSampleRate, int outputSampleRate, int numChannels) {
assert(inputSampleRate > 0);
assert(outputSampleRate > 0);
assert(numChannels > 0);
assert(numChannels <= MAX_CHANNELS);
assert(numChannels <= SRC_MAX_CHANNELS);
_inputSampleRate = inputSampleRate;
_outputSampleRate = outputSampleRate;
@ -1349,7 +1437,7 @@ AudioSRC::AudioSRC(int inputSampleRate, int outputSampleRate, int numChannels) {
_numTaps = createIrrationalFilter(_upFactor, _downFactor, 1.0f);
}
//printf("up=%d down=%.3f taps=%d\n", _upFactor, _downFactor + (lo32(_step)<<SRC_PHASEBITS) * Q32_TO_FLOAT, _numTaps);
//printf("up=%d down=%.3f taps=%d\n", _upFactor, _downFactor + (LO32(_step)<<SRC_PHASEBITS) * Q32_TO_FLOAT, _numTaps);
// filter history buffers
_numHistory = _numTaps - 1;
@ -1357,10 +1445,10 @@ AudioSRC::AudioSRC(int inputSampleRate, int outputSampleRate, int numChannels) {
_history[1] = new float[2 * _numHistory];
// format conversion buffers
_inputs[0] = (float*)aligned_malloc(SRC_BLOCK * sizeof(float), 16); // SIMD4
_inputs[1] = (float*)aligned_malloc(SRC_BLOCK * sizeof(float), 16);
_outputs[0] = (float*)aligned_malloc(SRC_BLOCK * sizeof(float), 16);
_outputs[1] = (float*)aligned_malloc(SRC_BLOCK * sizeof(float), 16);
_inputs[0] = (float*)aligned_malloc(4*SRC_BLOCK * sizeof(float), 16); // SIMD4
_inputs[1] = _inputs[0] + 1*SRC_BLOCK;
_outputs[0] = _inputs[0] + 2*SRC_BLOCK;
_outputs[1] = _inputs[0] + 3*SRC_BLOCK;
// input blocking size, such that input and output are both guaranteed not to exceed SRC_BLOCK frames
_inputBlock = std::min(SRC_BLOCK, getMaxInput(SRC_BLOCK));
@ -1380,9 +1468,6 @@ AudioSRC::~AudioSRC() {
delete[] _history[1];
aligned_free(_inputs[0]);
aligned_free(_inputs[1]);
aligned_free(_outputs[0]);
aligned_free(_outputs[1]);
}
//

View file

@ -14,11 +14,23 @@
#include <stdint.h>
static const int SRC_MAX_CHANNELS = 2;
// polyphase filter
static const int SRC_PHASEBITS = 8;
static const int SRC_PHASES = (1 << SRC_PHASEBITS);
static const int SRC_FRACBITS = 32 - SRC_PHASEBITS;
static const uint32_t SRC_FRACMASK = (1 << SRC_FRACBITS) - 1;
static const float QFRAC_TO_FLOAT = 1.0f / (1 << SRC_FRACBITS);
static const float Q32_TO_FLOAT = 1.0f / (1ULL << 32);
// blocking size in frames, chosen so block processing fits in L1 cache
static const int SRC_BLOCK = 256;
class AudioSRC {
public:
static const int MAX_CHANNELS = 2;
AudioSRC(int inputSampleRate, int outputSampleRate, int numChannels);
~AudioSRC();
@ -33,9 +45,9 @@ private:
float* _polyphaseFilter;
int* _stepTable;
float* _history[MAX_CHANNELS];
float* _inputs[MAX_CHANNELS];
float* _outputs[MAX_CHANNELS];
float* _history[SRC_MAX_CHANNELS];
float* _inputs[SRC_MAX_CHANNELS];
float* _outputs[SRC_MAX_CHANNELS];
int _inputSampleRate;
int _outputSampleRate;
@ -57,6 +69,12 @@ private:
int multirateFilter1(const float* input0, float* output0, int inputFrames);
int multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
int multirateFilter1_SSE(const float* input0, float* output0, int inputFrames);
int multirateFilter2_SSE(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
int multirateFilter1_AVX2(const float* input0, float* output0, int inputFrames);
int multirateFilter2_AVX2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
void convertInputFromInt16(const int16_t* input, float** outputs, int numFrames);
void convertOutputToInt16(float** inputs, int16_t* output, int numFrames);