mirror of
https://github.com/JulianGro/overte.git
synced 2025-04-25 17:35:08 +02:00
Ambisonic resampler, optimized using AVX2.
int16_t and float versions. Removed unused SSE2 version. Removed unused getExactInput(). Refactored to remove nested #ifdefs.
This commit is contained in:
parent
1aec8c5a65
commit
b4994a3d89
4 changed files with 933 additions and 554 deletions
File diff suppressed because it is too large
Load diff
|
@ -14,7 +14,7 @@
|
|||
|
||||
#include <stdint.h>
|
||||
|
||||
static const int SRC_MAX_CHANNELS = 2;
|
||||
static const int SRC_MAX_CHANNELS = 4;
|
||||
|
||||
// polyphase filter
|
||||
static const int SRC_PHASEBITS = 8;
|
||||
|
@ -48,8 +48,6 @@ public:
|
|||
int getMinInput(int outputFrames);
|
||||
int getMaxInput(int outputFrames);
|
||||
|
||||
int getExactInput(int outputFrames);
|
||||
|
||||
private:
|
||||
float* _polyphaseFilter;
|
||||
int* _stepTable;
|
||||
|
@ -77,12 +75,18 @@ private:
|
|||
|
||||
int multirateFilter1(const float* input0, float* output0, int inputFrames);
|
||||
int multirateFilter2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
|
||||
int multirateFilter4(const float* input0, const float* input1, const float* input2, const float* input3,
|
||||
float* output0, float* output1, float* output2, float* output3, int inputFrames);
|
||||
|
||||
int multirateFilter1_SSE(const float* input0, float* output0, int inputFrames);
|
||||
int multirateFilter2_SSE(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
|
||||
int multirateFilter1_ref(const float* input0, float* output0, int inputFrames);
|
||||
int multirateFilter2_ref(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
|
||||
int multirateFilter4_ref(const float* input0, const float* input1, const float* input2, const float* input3,
|
||||
float* output0, float* output1, float* output2, float* output3, int inputFrames);
|
||||
|
||||
int multirateFilter1_AVX2(const float* input0, float* output0, int inputFrames);
|
||||
int multirateFilter2_AVX2(const float* input0, const float* input1, float* output0, float* output1, int inputFrames);
|
||||
int multirateFilter4_AVX2(const float* input0, const float* input1, const float* input2, const float* input3,
|
||||
float* output0, float* output1, float* output2, float* output3, int inputFrames);
|
||||
|
||||
void convertInput(const int16_t* input, float** outputs, int numFrames);
|
||||
void convertOutput(float** inputs, int16_t* output, int numFrames);
|
||||
|
|
|
@ -97,54 +97,9 @@ void Sound::downSample(const QByteArray& rawAudioByteArray, int sampleRate) {
|
|||
// no resampling needed
|
||||
_byteArray = rawAudioByteArray;
|
||||
|
||||
} else if (_isAmbisonic) {
|
||||
|
||||
// FIXME: add a proper Ambisonic resampler!
|
||||
int numChannels = 4;
|
||||
AudioSRC resampler[4] { {sampleRate, AudioConstants::SAMPLE_RATE, 1},
|
||||
{sampleRate, AudioConstants::SAMPLE_RATE, 1},
|
||||
{sampleRate, AudioConstants::SAMPLE_RATE, 1},
|
||||
{sampleRate, AudioConstants::SAMPLE_RATE, 1} };
|
||||
|
||||
// resize to max possible output
|
||||
int numSourceFrames = rawAudioByteArray.size() / (numChannels * sizeof(AudioConstants::AudioSample));
|
||||
int maxDestinationFrames = resampler[0].getMaxOutput(numSourceFrames);
|
||||
int maxDestinationBytes = maxDestinationFrames * numChannels * sizeof(AudioConstants::AudioSample);
|
||||
_byteArray.resize(maxDestinationBytes);
|
||||
|
||||
int numDestinationFrames = 0;
|
||||
|
||||
// iterate over channels
|
||||
int16_t* srcBuffer = new int16_t[numSourceFrames];
|
||||
int16_t* dstBuffer = new int16_t[maxDestinationFrames];
|
||||
for (int ch = 0; ch < 4; ch++) {
|
||||
|
||||
int16_t* src = (int16_t*)rawAudioByteArray.data();
|
||||
int16_t* dst = (int16_t*)_byteArray.data();
|
||||
|
||||
// deinterleave samples
|
||||
for (int i = 0; i < numSourceFrames; i++) {
|
||||
srcBuffer[i] = src[4*i + ch];
|
||||
}
|
||||
|
||||
// resample one channel
|
||||
numDestinationFrames = resampler[ch].render(srcBuffer, dstBuffer, numSourceFrames);
|
||||
|
||||
// reinterleave samples
|
||||
for (int i = 0; i < numDestinationFrames; i++) {
|
||||
dst[4*i + ch] = dstBuffer[i];
|
||||
}
|
||||
}
|
||||
delete[] srcBuffer;
|
||||
delete[] dstBuffer;
|
||||
|
||||
// truncate to actual output
|
||||
int numDestinationBytes = numDestinationFrames * numChannels * sizeof(AudioConstants::AudioSample);
|
||||
_byteArray.resize(numDestinationBytes);
|
||||
|
||||
} else {
|
||||
|
||||
int numChannels = _isStereo ? 2 : 1;
|
||||
int numChannels = _isAmbisonic ? AudioConstants::AMBISONIC : (_isStereo ? AudioConstants::STEREO : AudioConstants::MONO);
|
||||
AudioSRC resampler(sampleRate, AudioConstants::SAMPLE_RATE, numChannels);
|
||||
|
||||
// resize to max possible output
|
||||
|
|
|
@ -198,4 +198,109 @@ int AudioSRC::multirateFilter2_AVX2(const float* input0, const float* input1, fl
|
|||
return outputFrames;
|
||||
}
|
||||
|
||||
int AudioSRC::multirateFilter4_AVX2(const float* input0, const float* input1, const float* input2, const float* input3,
|
||||
float* output0, float* output1, float* output2, float* output3, int inputFrames) {
|
||||
int outputFrames = 0;
|
||||
|
||||
assert(_numTaps % 8 == 0); // SIMD8
|
||||
|
||||
if (_step == 0) { // rational
|
||||
|
||||
int32_t i = HI32(_offset);
|
||||
|
||||
while (i < inputFrames) {
|
||||
|
||||
const float* c0 = &_polyphaseFilter[_numTaps * _phase];
|
||||
|
||||
__m256 acc0 = _mm256_setzero_ps();
|
||||
__m256 acc1 = _mm256_setzero_ps();
|
||||
__m256 acc2 = _mm256_setzero_ps();
|
||||
__m256 acc3 = _mm256_setzero_ps();
|
||||
|
||||
for (int j = 0; j < _numTaps; j += 8) {
|
||||
|
||||
//float coef = c0[j];
|
||||
__m256 coef0 = _mm256_loadu_ps(&c0[j]);
|
||||
|
||||
//acc += input[i + j] * coef;
|
||||
acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
|
||||
acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1);
|
||||
acc2 = _mm256_fmadd_ps(_mm256_loadu_ps(&input2[i + j]), coef0, acc2);
|
||||
acc3 = _mm256_fmadd_ps(_mm256_loadu_ps(&input3[i + j]), coef0, acc3);
|
||||
}
|
||||
|
||||
// horizontal sum
|
||||
acc0 = _mm256_hadd_ps(acc0, acc1);
|
||||
acc2 = _mm256_hadd_ps(acc2, acc3);
|
||||
acc0 = _mm256_hadd_ps(acc0, acc2);
|
||||
__m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
|
||||
|
||||
_mm_store_ss(&output0[outputFrames], t0);
|
||||
_mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,1)));
|
||||
_mm_store_ss(&output2[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2)));
|
||||
_mm_store_ss(&output3[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,3)));
|
||||
outputFrames += 1;
|
||||
|
||||
i += _stepTable[_phase];
|
||||
if (++_phase == _upFactor) {
|
||||
_phase = 0;
|
||||
}
|
||||
}
|
||||
_offset = (int64_t)(i - inputFrames) << 32;
|
||||
|
||||
} else { // irrational
|
||||
|
||||
while (HI32(_offset) < inputFrames) {
|
||||
|
||||
int32_t i = HI32(_offset);
|
||||
uint32_t f = LO32(_offset);
|
||||
|
||||
uint32_t phase = f >> SRC_FRACBITS;
|
||||
float ftmp = (f & SRC_FRACMASK) * QFRAC_TO_FLOAT;
|
||||
|
||||
const float* c0 = &_polyphaseFilter[_numTaps * (phase + 0)];
|
||||
const float* c1 = &_polyphaseFilter[_numTaps * (phase + 1)];
|
||||
|
||||
__m256 acc0 = _mm256_setzero_ps();
|
||||
__m256 acc1 = _mm256_setzero_ps();
|
||||
__m256 acc2 = _mm256_setzero_ps();
|
||||
__m256 acc3 = _mm256_setzero_ps();
|
||||
__m256 frac = _mm256_broadcast_ss(&ftmp);
|
||||
|
||||
for (int j = 0; j < _numTaps; j += 8) {
|
||||
|
||||
//float coef = c0[j] + frac * (c1[j] - c0[j]);
|
||||
__m256 coef0 = _mm256_loadu_ps(&c0[j]);
|
||||
__m256 coef1 = _mm256_loadu_ps(&c1[j]);
|
||||
coef1 = _mm256_sub_ps(coef1, coef0);
|
||||
coef0 = _mm256_fmadd_ps(coef1, frac, coef0);
|
||||
|
||||
//acc += input[i + j] * coef;
|
||||
acc0 = _mm256_fmadd_ps(_mm256_loadu_ps(&input0[i + j]), coef0, acc0);
|
||||
acc1 = _mm256_fmadd_ps(_mm256_loadu_ps(&input1[i + j]), coef0, acc1);
|
||||
acc2 = _mm256_fmadd_ps(_mm256_loadu_ps(&input2[i + j]), coef0, acc2);
|
||||
acc3 = _mm256_fmadd_ps(_mm256_loadu_ps(&input3[i + j]), coef0, acc3);
|
||||
}
|
||||
|
||||
// horizontal sum
|
||||
acc0 = _mm256_hadd_ps(acc0, acc1);
|
||||
acc2 = _mm256_hadd_ps(acc2, acc3);
|
||||
acc0 = _mm256_hadd_ps(acc0, acc2);
|
||||
__m128 t0 = _mm_add_ps(_mm256_castps256_ps128(acc0), _mm256_extractf128_ps(acc0, 1));
|
||||
|
||||
_mm_store_ss(&output0[outputFrames], t0);
|
||||
_mm_store_ss(&output1[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,1)));
|
||||
_mm_store_ss(&output2[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,2)));
|
||||
_mm_store_ss(&output3[outputFrames], _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0,0,0,3)));
|
||||
outputFrames += 1;
|
||||
|
||||
_offset += _step;
|
||||
}
|
||||
_offset -= (int64_t)inputFrames << 32;
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
|
||||
return outputFrames;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Reference in a new issue