Ambisonic resampler, optimized using AVX2.

int16_t and float versions.
Removed unused SSE2 version.
Removed unused getExactInput().
Refactored to remove nested #ifdefs.
This commit is contained in:
Ken Cooke 2016-12-14 15:18:41 -08:00
parent 1aec8c5a65
commit b4994a3d89
4 changed files with 933 additions and 554 deletions

File diff suppressed because it is too large Load diff

View file

@ -14,7 +14,7 @@
#include <stdint.h>
static const int SRC_MAX_CHANNELS = 2;
static const int SRC_MAX_CHANNELS = 4;
// polyphase filter
static const int SRC_PHASEBITS = 8;
@ -48,8 +48,6 @@ public:
int getMinInput(int outputFrames);
int getMaxInput(int outputFrames);
int getExactInput(int outputFrames);
private:
float* _polyphaseFilter;
int* _stepTable;
@ -77,12 +75,18 @@ private:
int multirateFilter1(const float* input0, float* output0, int inputFrames);
int multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
int multirateFilter4(const float* input0, const float* input1, const float* input2, const float* input3,
float* output0, float* output1, float* output2, float* output3, int inputFrames);
int multirateFilter1_SSE(const float* input0, float* output0, int inputFrames);
int multirateFilter2_SSE(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
int multirateFilter1_ref(const float* input0, float* output0, int inputFrames);
int multirateFilter2_ref(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
int multirateFilter4_ref(const float* input0, const float* input1, const float* input2, const float* input3,
float* output0, float* output1, float* output2, float* output3, int inputFrames);
int multirateFilter1_AVX2(const float* input0, float* output0, int inputFrames);
int multirateFilter2_AVX2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
int multirateFilter4_AVX2(const float* input0, const float* input1, const float* input2, const float* input3,
float* output0, float* output1, float* output2, float* output3, int inputFrames);
void convertInput(const int16_t* input, float** outputs, int numFrames);
void convertOutput(float** inputs, int16_t* output, int numFrames);

View file

@ -97,54 +97,9 @@ void Sound::downSample(const QByteArray& rawAudioByteArray, int sampleRate) {
// no resampling needed
_byteArray = rawAudioByteArray;
} else if (_isAmbisonic) {
// FIXME: add a proper Ambisonic resampler!
int numChannels = 4;
AudioSRC resampler[4] { {sampleRate, AudioConstants::SAMPLE_RATE, 1},
{sampleRate, AudioConstants::SAMPLE_RATE, 1},
{sampleRate, AudioConstants::SAMPLE_RATE, 1},
{sampleRate, AudioConstants::SAMPLE_RATE, 1} };
// resize to max possible output
int numSourceFrames = rawAudioByteArray.size() / (numChannels * sizeof(AudioConstants::AudioSample));
int maxDestinationFrames = resampler[0].getMaxOutput(numSourceFrames);
int maxDestinationBytes = maxDestinationFrames * numChannels * sizeof(AudioConstants::AudioSample);
_byteArray.resize(maxDestinationBytes);
int numDestinationFrames = 0;
// iterate over channels
int16_t* srcBuffer = new int16_t[numSourceFrames];
int16_t* dstBuffer = new int16_t[maxDestinationFrames];
for (int ch = 0; ch < 4; ch++) {
int16_t* src = (int16_t*)rawAudioByteArray.data();
int16_t* dst = (int16_t*)_byteArray.data();
// deinterleave samples
for (int i = 0; i < numSourceFrames; i++) {
srcBuffer[i] = src[4*i + ch];
}
// resample one channel
numDestinationFrames = resampler[ch].render(srcBuffer, dstBuffer, numSourceFrames);
// reinterleave samples
for (int i = 0; i < numDestinationFrames; i++) {
dst[4*i + ch] = dstBuffer[i];
}
}
delete[] srcBuffer;
delete[] dstBuffer;
// truncate to actual output
int numDestinationBytes = numDestinationFrames * numChannels * sizeof(AudioConstants::AudioSample);
_byteArray.resize(numDestinationBytes);
} else {
int numChannels = _isStereo ? 2 : 1;
int numChannels = _isAmbisonic ? AudioConstants::AMBISONIC : (_isStereo ? AudioConstants::STEREO : AudioConstants::MONO);
AudioSRC resampler(sampleRate, AudioConstants::SAMPLE_RATE, numChannels);
// resize to max possible output

View file

@ -198,4 +198,109 @@ int AudioSRC::multirateFilter2_AVX2(const float* input0, const float* input1, fl
return outputFrames;
}
int AudioSRC::multirateFilter4_AVX2(const float* input0, const float* input1, const float* input2, const float* input3,
float* output0, float* output1, float* output2, float* output3, int inputFrames) {
int outputFrames = 0;
assert(_numTaps % 8 == 0); // SIMD8
if (_step == 0) { // rational
int32_t i = HI32(_offset);
while (i < inputFrames) {
const float* c0 = &_polyphaseFilter[_numTaps * _phase];
__m256 acc0 = _mm256_setzero_ps();
__m256 acc1 = _mm256_setzero_ps();
__m256 acc2 = _mm256_setzero_ps();
__m256 acc3 = _mm256_setzero_ps();
for (int j = 0; j < _numTaps; j += 8) {
//float coef = c0[j];
__m256 coef0 = _mm256_loadu_ps(&c0[j]);
//acc += input[i + j] * coef;
acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1);
acc2 = _mm256_fmadd_ps(_mm256_loadu_ps(&input2[i + j]), coef0, acc2);
acc3 = _mm256_fmadd_ps(_mm256_loadu_ps(&input3[i + j]), coef0, acc3);
}
// horizontal sum
acc0 = _mm256_hadd_ps(acc0, acc1);
acc2 = _mm256_hadd_ps(acc2, acc3);
acc0 = _mm256_hadd_ps(acc0, acc2);
__m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
_mm_store_ss(&output0[outputFrames], t0);
_mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,1)));
_mm_store_ss(&output2[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2)));
_mm_store_ss(&output3[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,3)));
outputFrames += 1;
i += _stepTable[_phase];
if (++_phase == _upFactor) {
_phase = 0;
}
}
_offset = (int64_t)(i - inputFrames) << 32;
} else { // irrational
while (HI32(_offset) < inputFrames) {
int32_t i = HI32(_offset);
uint32_t f = LO32(_offset);
uint32_t phase = f >> SRC_FRACBITS;
float ftmp = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT;
const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)];
const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)];
__m256 acc0 = _mm256_setzero_ps();
__m256 acc1 = _mm256_setzero_ps();
__m256 acc2 = _mm256_setzero_ps();
__m256 acc3 = _mm256_setzero_ps();
__m256 frac = _mm256_broadcast_ss(&ftmp);
for (int j = 0; j < _numTaps; j += 8) {
//float coef = c0[j] + frac * (c1[j] - c0[j]);
__m256 coef0 = _mm256_loadu_ps(&c0[j]);
__m256 coef1 = _mm256_loadu_ps(&c1[j]);
coef1 = _mm256_sub_ps(coef1, coef0);
coef0 = _mm256_fmadd_ps(coef1, frac, coef0);
//acc += input[i + j] * coef;
acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1);
acc2 = _mm256_fmadd_ps(_mm256_loadu_ps(&input2[i + j]), coef0, acc2);
acc3 = _mm256_fmadd_ps(_mm256_loadu_ps(&input3[i + j]), coef0, acc3);
}
// horizontal sum
acc0 = _mm256_hadd_ps(acc0, acc1);
acc2 = _mm256_hadd_ps(acc2, acc3);
acc0 = _mm256_hadd_ps(acc0, acc2);
__m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
_mm_store_ss(&output0[outputFrames], t0);
_mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,1)));
_mm_store_ss(&output2[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2)));
_mm_store_ss(&output3[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,3)));
outputFrames += 1;
_offset += _step;
}
_offset -= (int64_t)inputFrames << 32;
}
_mm256_zeroupper();
return outputFrames;
}
#endif