mirror of
https://github.com/overte-org/overte.git
synced 2025-04-20 18:44:01 +02:00
Merge pull request #3055 from wangyix/master
Audio jitter buffer resizes depending on max inter-frame time gaps of incoming audio packets
This commit is contained in:
commit
18cd794542
10 changed files with 188 additions and 48 deletions
|
@ -54,9 +54,6 @@
|
|||
|
||||
#include "AudioMixer.h"
|
||||
|
||||
const short JITTER_BUFFER_MSECS = 12;
|
||||
const short JITTER_BUFFER_SAMPLES = JITTER_BUFFER_MSECS * (SAMPLE_RATE / 1000.0);
|
||||
|
||||
const float LOUDNESS_TO_DISTANCE_RATIO = 0.00001f;
|
||||
|
||||
const QString AUDIO_MIXER_LOGGING_TARGET_NAME = "audio-mixer";
|
||||
|
@ -487,8 +484,7 @@ void AudioMixer::run() {
|
|||
|
||||
foreach (const SharedNodePointer& node, nodeList->getNodeHash()) {
|
||||
if (node->getLinkedData()) {
|
||||
((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(JITTER_BUFFER_SAMPLES,
|
||||
_sourceUnattenuatedZone,
|
||||
((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(_sourceUnattenuatedZone,
|
||||
_listenerUnattenuatedZone);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -98,10 +98,9 @@ int AudioMixerClientData::parseData(const QByteArray& packet) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
|
||||
AABox* checkSourceZone, AABox* listenerZone) {
|
||||
void AudioMixerClientData::checkBuffersBeforeFrameSend(AABox* checkSourceZone, AABox* listenerZone) {
|
||||
for (int i = 0; i < _ringBuffers.size(); i++) {
|
||||
if (_ringBuffers[i]->shouldBeAddedToMix(jitterBufferLengthSamples)) {
|
||||
if (_ringBuffers[i]->shouldBeAddedToMix()) {
|
||||
// this is a ring buffer that is ready to go
|
||||
// set its flag so we know to push its buffer when all is said and done
|
||||
_ringBuffers[i]->setWillBeAddedToMix(true);
|
||||
|
@ -120,20 +119,22 @@ void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSam
|
|||
}
|
||||
|
||||
void AudioMixerClientData::pushBuffersAfterFrameSend() {
|
||||
for (int i = 0; i < _ringBuffers.size(); i++) {
|
||||
|
||||
QList<PositionalAudioRingBuffer*>::iterator i = _ringBuffers.begin();
|
||||
while (i != _ringBuffers.end()) {
|
||||
// this was a used buffer, push the output pointer forwards
|
||||
PositionalAudioRingBuffer* audioBuffer = _ringBuffers[i];
|
||||
PositionalAudioRingBuffer* audioBuffer = *i;
|
||||
|
||||
if (audioBuffer->willBeAddedToMix()) {
|
||||
audioBuffer->shiftReadPosition(audioBuffer->isStereo()
|
||||
? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL);
|
||||
|
||||
audioBuffer->shiftReadPosition(audioBuffer->getSamplesPerFrame());
|
||||
audioBuffer->setWillBeAddedToMix(false);
|
||||
} else if (audioBuffer->getType() == PositionalAudioRingBuffer::Injector
|
||||
&& audioBuffer->hasStarted() && audioBuffer->isStarved()) {
|
||||
// this is an empty audio buffer that has starved, safe to delete
|
||||
delete audioBuffer;
|
||||
_ringBuffers.erase(_ringBuffers.begin() + i);
|
||||
i = _ringBuffers.erase(i);
|
||||
continue;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,8 +27,7 @@ public:
|
|||
AvatarAudioRingBuffer* getAvatarAudioRingBuffer() const;
|
||||
|
||||
int parseData(const QByteArray& packet);
|
||||
void checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
|
||||
AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
|
||||
void checkBuffersBeforeFrameSend(AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
|
||||
void pushBuffersAfterFrameSend();
|
||||
private:
|
||||
QList<PositionalAudioRingBuffer*> _ringBuffers;
|
||||
|
|
|
@ -19,6 +19,9 @@ AvatarAudioRingBuffer::AvatarAudioRingBuffer(bool isStereo) :
|
|||
}
|
||||
|
||||
// Handles an incoming avatar audio packet: records its arrival time in the inter-frame
// gap statistics, refreshes the desired jitter buffer length from those statistics,
// notes whether the sender asked for loopback, and then delegates the actual parsing to
// PositionalAudioRingBuffer::parseData(), whose return value is returned unchanged.
int AvatarAudioRingBuffer::parseData(const QByteArray& packet) {
|
||||
// note the arrival of this frame before anything else so the gap stats see every packet
_interframeTimeGapStats.frameReceived();
|
||||
updateDesiredJitterBufferFrames();
|
||||
|
||||
// loopback is requested by the packet type (PacketTypeMicrophoneAudioWithEcho)
_shouldLoopbackForNode = (packetTypeForPacket(packet) == PacketTypeMicrophoneAudioWithEcho);
|
||||
return PositionalAudioRingBuffer::parseData(packet);
|
||||
}
|
||||
|
|
|
@ -461,8 +461,8 @@ void Audio::handleAudioInput() {
|
|||
int16_t* inputAudioSamples = new int16_t[inputSamplesRequired];
|
||||
_inputRingBuffer.readSamples(inputAudioSamples, inputSamplesRequired);
|
||||
|
||||
int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
|
||||
int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
|
||||
const int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
|
||||
const int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
|
||||
|
||||
// zero out the monoAudioSamples array and the locally injected audio
|
||||
memset(networkAudioSamples, 0, numNetworkBytes);
|
||||
|
@ -634,12 +634,10 @@ void Audio::handleAudioInput() {
|
|||
packetType = PacketTypeSilentAudioFrame;
|
||||
|
||||
// we need to indicate how many silent samples this is to the audio mixer
|
||||
audioDataPacket[0] = _isStereoInput
|
||||
? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO
|
||||
: NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
|
||||
networkAudioSamples[0] = numNetworkSamples;
|
||||
numAudioBytes = sizeof(int16_t);
|
||||
} else {
|
||||
numAudioBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
|
||||
numAudioBytes = numNetworkBytes;
|
||||
|
||||
if (Menu::getInstance()->isOptionChecked(MenuOption::EchoServerAudio)) {
|
||||
packetType = PacketTypeMicrophoneAudioWithEcho;
|
||||
|
|
|
@ -124,17 +124,15 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
|
|||
|
||||
std::less<int16_t*> less;
|
||||
std::less_equal<int16_t*> lessEqual;
|
||||
|
||||
if (_hasStarted
|
||||
&& (less(_endOfLastWrite, _nextOutput)
|
||||
&& lessEqual(_nextOutput, shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy)))) {
|
||||
|
||||
if (_hasStarted && samplesToCopy > _sampleCapacity - samplesAvailable()) {
|
||||
// this write would overrun the unread samples (cross the next output), so call us starved and reset the buffer
|
||||
qDebug() << "Filled the ring buffer. Resetting.";
|
||||
_endOfLastWrite = _buffer;
|
||||
_nextOutput = _buffer;
|
||||
_isStarved = true;
|
||||
}
|
||||
|
||||
|
||||
if (_endOfLastWrite + samplesToCopy <= _buffer + _sampleCapacity) {
|
||||
memcpy(_endOfLastWrite, data, samplesToCopy * sizeof(int16_t));
|
||||
} else {
|
||||
|
@ -144,7 +142,7 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
|
|||
}
|
||||
|
||||
_endOfLastWrite = shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy);
|
||||
|
||||
|
||||
return samplesToCopy * sizeof(int16_t);
|
||||
}
|
||||
|
||||
|
|
|
@ -31,6 +31,9 @@ InjectedAudioRingBuffer::InjectedAudioRingBuffer(const QUuid& streamIdentifier)
|
|||
const uchar MAX_INJECTOR_VOLUME = 255;
|
||||
|
||||
int InjectedAudioRingBuffer::parseData(const QByteArray& packet) {
|
||||
_interframeTimeGapStats.frameReceived();
|
||||
updateDesiredJitterBufferFrames();
|
||||
|
||||
// setup a data stream to read from this packet
|
||||
QDataStream packetStream(packet);
|
||||
packetStream.skipRawData(numBytesForPacketHeader(packet));
|
||||
|
|
|
@ -19,6 +19,71 @@
|
|||
#include <UUID.h>
|
||||
|
||||
#include "PositionalAudioRingBuffer.h"
|
||||
#include "SharedUtil.h"
|
||||
|
||||
// Constructs a tracker with no history: every timestamp, counter and max gap starts at
// zero and no new window max is flagged as available.
InterframeTimeGapStats::InterframeTimeGapStats()
|
||||
: _lastFrameReceivedTime(0),
|
||||
_numSamplesInCurrentInterval(0),
|
||||
_currentIntervalMaxGap(0),
|
||||
_newestIntervalMaxGapAt(0),
|
||||
_windowMaxGap(0),
|
||||
_newWindowMaxGapAvailable(false)
|
||||
{
|
||||
// zero every per-interval max slot in the window
memset(_intervalMaxGaps, 0, TIME_GAP_NUM_INTERVALS_IN_WINDOW * sizeof(quint64));
|
||||
}
|
||||
|
||||
// Records the arrival of one frame. The gap since the previous call (difference of
// usecTimestampNow() values) is folded into the current interval's max and, if larger,
// into the window max. Once TIME_GAP_NUM_SAMPLES_IN_INTERVAL gaps have been counted, the
// interval's max is stored in the cyclic _intervalMaxGaps array, the window max is
// recomputed from all stored intervals, and a new interval is started.
// The very first call only stores the timestamp, since there is no earlier frame to
// measure a gap against.
void InterframeTimeGapStats::frameReceived() {
|
||||
quint64 now = usecTimestampNow();
|
||||
|
||||
// make sure this isn't the first time frameReceived() is called so we can actually calculate a gap.
|
||||
if (_lastFrameReceivedTime != 0) {
|
||||
quint64 gap = now - _lastFrameReceivedTime;
|
||||
|
||||
// update the current interval max
|
||||
if (gap > _currentIntervalMaxGap) {
|
||||
_currentIntervalMaxGap = gap;
|
||||
|
||||
// keep the window max gap at least as large as the current interval max
|
||||
// this allows the window max gap to respond immediately to a sudden spike in gap times
|
||||
// also, this prevents the window max gap from staying at 0 until the first interval of samples has filled up
|
||||
if (_currentIntervalMaxGap > _windowMaxGap) {
|
||||
_windowMaxGap = _currentIntervalMaxGap;
|
||||
|
||||
_newWindowMaxGapAvailable = true;
|
||||
}
|
||||
}
|
||||
_numSamplesInCurrentInterval++;
|
||||
|
||||
// if the current interval of samples is now full, record it in our interval maxes
|
||||
if (_numSamplesInCurrentInterval == TIME_GAP_NUM_SAMPLES_IN_INTERVAL) {
|
||||
|
||||
// find location to insert this interval's max (increment index cyclically)
|
||||
_newestIntervalMaxGapAt = _newestIntervalMaxGapAt == TIME_GAP_NUM_INTERVALS_IN_WINDOW - 1 ? 0 : _newestIntervalMaxGapAt + 1;
|
||||
|
||||
// record the current interval's max gap as the newest
|
||||
_intervalMaxGaps[_newestIntervalMaxGapAt] = _currentIntervalMaxGap;
|
||||
|
||||
// update the window max gap, which is the max out of all the past intervals' max gaps
|
||||
// (recomputed from scratch, so the window max can shrink once an old spike's slot is overwritten)
_windowMaxGap = 0;
|
||||
for (int i = 0; i < TIME_GAP_NUM_INTERVALS_IN_WINDOW; i++) {
|
||||
if (_intervalMaxGaps[i] > _windowMaxGap) {
|
||||
_windowMaxGap = _intervalMaxGaps[i];
|
||||
}
|
||||
}
|
||||
_newWindowMaxGapAvailable = true;
|
||||
|
||||
// reset the current interval
|
||||
_numSamplesInCurrentInterval = 0;
|
||||
_currentIntervalMaxGap = 0;
|
||||
}
|
||||
}
|
||||
// remember this arrival so the next call can measure a gap against it
_lastFrameReceivedTime = now;
|
||||
}
|
||||
|
||||
// Returns the current window max gap and marks it as consumed by clearing the
// "new window max available" flag. Not const for that reason; peekWindowMaxGap()
// reads the same value without touching the flag.
quint64 InterframeTimeGapStats::getWindowMaxGap() {
|
||||
_newWindowMaxGapAvailable = false;
|
||||
return _windowMaxGap;
|
||||
}
|
||||
|
||||
|
||||
PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::Type type, bool isStereo) :
|
||||
AudioRingBuffer(isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL),
|
||||
|
@ -29,9 +94,10 @@ PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::
|
|||
_shouldLoopbackForNode(false),
|
||||
_shouldOutputStarveDebug(true),
|
||||
_isStereo(isStereo),
|
||||
_listenerUnattenuatedZone(NULL)
|
||||
_listenerUnattenuatedZone(NULL),
|
||||
_desiredJitterBufferFrames(1),
|
||||
_currentJitterBufferFrames(0)
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
|
||||
|
@ -54,13 +120,31 @@ int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
|
|||
readBytes += sizeof(int16_t);
|
||||
|
||||
if (numSilentSamples > 0) {
|
||||
addSilentFrame(numSilentSamples);
|
||||
if (_currentJitterBufferFrames > _desiredJitterBufferFrames) {
|
||||
// our current jitter buffer size exceeds its desired value, so ignore some silent
|
||||
// frames to get that size as close to desired as possible
|
||||
int samplesPerFrame = getSamplesPerFrame();
|
||||
int numSilentFrames = numSilentSamples / samplesPerFrame;
|
||||
int numFramesToDropDesired = _currentJitterBufferFrames - _desiredJitterBufferFrames;
|
||||
|
||||
if (numSilentFrames > numFramesToDropDesired) {
|
||||
// we have more than enough frames to drop to get the jitter buffer to its desired length
|
||||
int numSilentFramesToAdd = numSilentFrames - numFramesToDropDesired;
|
||||
addSilentFrame(numSilentFramesToAdd * samplesPerFrame);
|
||||
_currentJitterBufferFrames = _desiredJitterBufferFrames;
|
||||
|
||||
} else {
|
||||
// we need to drop all frames to get the jitter buffer as close as possible to its desired length
|
||||
_currentJitterBufferFrames -= numSilentFrames;
|
||||
}
|
||||
} else {
|
||||
addSilentFrame(numSilentSamples);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// there is audio data to read
|
||||
readBytes += writeData(packet.data() + readBytes, packet.size() - readBytes);
|
||||
}
|
||||
|
||||
return readBytes;
|
||||
}
|
||||
|
||||
|
@ -106,29 +190,54 @@ void PositionalAudioRingBuffer::updateNextOutputTrailingLoudness() {
|
|||
}
|
||||
}
|
||||
|
||||
bool PositionalAudioRingBuffer::shouldBeAddedToMix(int numJitterBufferSamples) {
|
||||
if (!isNotStarvedOrHasMinimumSamples(NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL + numJitterBufferSamples)) {
|
||||
bool PositionalAudioRingBuffer::shouldBeAddedToMix() {
|
||||
int samplesPerFrame = getSamplesPerFrame();
|
||||
int desiredJitterBufferSamples = _desiredJitterBufferFrames * samplesPerFrame;
|
||||
|
||||
if (!isNotStarvedOrHasMinimumSamples(samplesPerFrame + desiredJitterBufferSamples)) {
|
||||
// if the buffer was starved, allow it to accrue at least the desired number of
|
||||
// jitter buffer frames before we start taking frames from it for mixing
|
||||
|
||||
if (_shouldOutputStarveDebug) {
|
||||
_shouldOutputStarveDebug = false;
|
||||
}
|
||||
|
||||
return false;
|
||||
} else if (samplesAvailable() < NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL) {
|
||||
|
||||
return false;
|
||||
} else if (samplesAvailable() < samplesPerFrame) {
|
||||
// if the buffer doesn't have a full frame of samples to take for mixing, it is starved
|
||||
_isStarved = true;
|
||||
|
||||
// set to 0 to indicate the jitter buffer is starved
|
||||
_currentJitterBufferFrames = 0;
|
||||
|
||||
// reset our _shouldOutputStarveDebug to true so the next is printed
|
||||
_shouldOutputStarveDebug = true;
|
||||
|
||||
|
||||
return false;
|
||||
} else {
|
||||
// good buffer, add this to the mix
|
||||
}
|
||||
|
||||
// good buffer, add this to the mix
|
||||
if (_isStarved) {
|
||||
// if this buffer has just finished replenishing after being starved, the number of frames in it now
|
||||
// minus one (since a frame will be read immediately after this) is the length of the jitter buffer
|
||||
_currentJitterBufferFrames = samplesAvailable() / samplesPerFrame - 1;
|
||||
_isStarved = false;
|
||||
|
||||
// since we've read data from ring buffer at least once - we've started
|
||||
_hasStarted = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
// since we've read data from ring buffer at least once - we've started
|
||||
_hasStarted = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Recomputes _desiredJitterBufferFrames when the inter-frame gap statistics report a new
// window max gap: the gap is divided by the duration of one frame and rounded up, with a
// floor of one frame. Does nothing if no new window max is available.
void PositionalAudioRingBuffer::updateDesiredJitterBufferFrames() {
|
||||
|
||||
// duration of one network frame, derived from the per-channel frame length and the sample rate
const float USECS_PER_FRAME = NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL * USECS_PER_SECOND / (float)SAMPLE_RATE;
|
||||
|
||||
if (_interframeTimeGapStats.hasNewWindowMaxGapAvailable()) {
|
||||
// getWindowMaxGap() also clears the "new value available" flag
_desiredJitterBufferFrames = ceilf((float)_interframeTimeGapStats.getWindowMaxGap() / USECS_PER_FRAME);
|
||||
// never ask for less than one frame of jitter buffer
if (_desiredJitterBufferFrames < 1) {
|
||||
_desiredJitterBufferFrames = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,6 +18,31 @@
|
|||
|
||||
#include "AudioRingBuffer.h"
|
||||
|
||||
// this means that every 500 samples, the max for the past 10*500 samples will be calculated
|
||||
const int TIME_GAP_NUM_SAMPLES_IN_INTERVAL = 500;
|
||||
const int TIME_GAP_NUM_INTERVALS_IN_WINDOW = 10;
|
||||
|
||||
// class used to track time between incoming frames for the purpose of varying the jitter buffer length
|
||||
// Tracks the time gaps between successive received frames. Gaps are grouped into
// intervals of TIME_GAP_NUM_SAMPLES_IN_INTERVAL samples; the max gap of each of the last
// TIME_GAP_NUM_INTERVALS_IN_WINDOW intervals is kept, and the "window max gap" is the
// largest of those (raised immediately if the in-progress interval exceeds it).
class InterframeTimeGapStats {
|
||||
public:
|
||||
InterframeTimeGapStats();
|
||||
|
||||
// call once per received frame to record its arrival time
void frameReceived();
|
||||
// true if the window max gap has been updated since the last getWindowMaxGap() call
bool hasNewWindowMaxGapAvailable() const { return _newWindowMaxGapAvailable; }
|
||||
// returns the window max gap without clearing the "new value available" flag
quint64 peekWindowMaxGap() const { return _windowMaxGap; }
|
||||
// returns the window max gap and clears the "new value available" flag
quint64 getWindowMaxGap();
|
||||
|
||||
private:
|
||||
// timestamp of the previous frameReceived() call; 0 means no frame has been received yet
quint64 _lastFrameReceivedTime;
|
||||
|
||||
// number of gaps counted so far in the in-progress interval
int _numSamplesInCurrentInterval;
|
||||
// largest gap seen so far in the in-progress interval
quint64 _currentIntervalMaxGap;
|
||||
// cyclic array holding the max gap of each completed interval in the window
quint64 _intervalMaxGaps[TIME_GAP_NUM_INTERVALS_IN_WINDOW];
|
||||
// index in _intervalMaxGaps of the most recently recorded interval max
int _newestIntervalMaxGapAt;
|
||||
// largest gap across the window (and the in-progress interval, if it is larger)
quint64 _windowMaxGap;
|
||||
// set whenever _windowMaxGap is updated; cleared by getWindowMaxGap()
bool _newWindowMaxGapAvailable;
|
||||
};
|
||||
|
||||
class PositionalAudioRingBuffer : public AudioRingBuffer {
|
||||
public:
|
||||
enum Type {
|
||||
|
@ -34,7 +59,7 @@ public:
|
|||
void updateNextOutputTrailingLoudness();
|
||||
float getNextOutputTrailingLoudness() const { return _nextOutputTrailingLoudness; }
|
||||
|
||||
bool shouldBeAddedToMix(int numJitterBufferSamples);
|
||||
bool shouldBeAddedToMix();
|
||||
|
||||
bool willBeAddedToMix() const { return _willBeAddedToMix; }
|
||||
void setWillBeAddedToMix(bool willBeAddedToMix) { _willBeAddedToMix = willBeAddedToMix; }
|
||||
|
@ -50,10 +75,14 @@ public:
|
|||
AABox* getListenerUnattenuatedZone() const { return _listenerUnattenuatedZone; }
|
||||
void setListenerUnattenuatedZone(AABox* listenerUnattenuatedZone) { _listenerUnattenuatedZone = listenerUnattenuatedZone; }
|
||||
|
||||
int getSamplesPerFrame() const { return _isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL; }
|
||||
|
||||
protected:
|
||||
// disallow copying of PositionalAudioRingBuffer objects
|
||||
PositionalAudioRingBuffer(const PositionalAudioRingBuffer&);
|
||||
PositionalAudioRingBuffer& operator= (const PositionalAudioRingBuffer&);
|
||||
|
||||
void updateDesiredJitterBufferFrames();
|
||||
|
||||
PositionalAudioRingBuffer::Type _type;
|
||||
glm::vec3 _position;
|
||||
|
@ -65,6 +94,10 @@ protected:
|
|||
|
||||
float _nextOutputTrailingLoudness;
|
||||
AABox* _listenerUnattenuatedZone;
|
||||
|
||||
InterframeTimeGapStats _interframeTimeGapStats;
|
||||
int _desiredJitterBufferFrames;
|
||||
int _currentJitterBufferFrames;
|
||||
};
|
||||
|
||||
#endif // hifi_PositionalAudioRingBuffer_h
|
||||
|
|
|
@ -354,7 +354,7 @@ void OctreeEditPacketSender::processNackPacket(const QByteArray& packet) {
|
|||
// read number of sequence numbers
|
||||
uint16_t numSequenceNumbers = (*(uint16_t*)dataAt);
|
||||
dataAt += sizeof(uint16_t);
|
||||
|
||||
|
||||
// read sequence numbers and queue packets for resend
|
||||
for (int i = 0; i < numSequenceNumbers; i++) {
|
||||
unsigned short int sequenceNumber = (*(unsigned short int*)dataAt);
|
||||
|
|
Loading…
Reference in a new issue