Merge pull request #3055 from wangyix/master

Audio jitter buffer now resizes based on the maximum inter-frame time gap of incoming audio packets.

Brad Hefta-Gaub committed on 2014-06-23 12:45:04 -07:00
commit 18cd794542

10 changed files with 188 additions and 48 deletions

File: AudioMixer.cpp

@@ -54,9 +54,6 @@
 #include "AudioMixer.h"
 
-const short JITTER_BUFFER_MSECS = 12;
-const short JITTER_BUFFER_SAMPLES = JITTER_BUFFER_MSECS * (SAMPLE_RATE / 1000.0);
-
 const float LOUDNESS_TO_DISTANCE_RATIO = 0.00001f;
 const QString AUDIO_MIXER_LOGGING_TARGET_NAME = "audio-mixer";

@@ -487,8 +484,7 @@ void AudioMixer::run() {
         foreach (const SharedNodePointer& node, nodeList->getNodeHash()) {
             if (node->getLinkedData()) {
-                ((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(JITTER_BUFFER_SAMPLES,
-                                                                                             _sourceUnattenuatedZone,
+                ((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(_sourceUnattenuatedZone,
                                                                                              _listenerUnattenuatedZone);
             }
         }
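For context on what was deleted above: the old constants pinned every stream's jitter buffer to one fixed length. A back-of-envelope sketch of that policy, assuming the 24 kHz network sample rate this codebase used at the time:

    // the removed fixed-size policy, with SAMPLE_RATE assumed to be 24000 Hz
    const short JITTER_BUFFER_MSECS = 12;
    const short JITTER_BUFFER_SAMPLES = JITTER_BUFFER_MSECS * (24000 / 1000);  // = 288 samples
    // every stream carried ~12 ms of fixed latency, whether its network
    // jitter needed more headroom or could have tolerated less

After this commit, each PositionalAudioRingBuffer computes its own desired length from the packet gaps it actually observes.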

File: AudioMixerClientData.cpp

@@ -98,10 +98,9 @@ int AudioMixerClientData::parseData(const QByteArray& packet) {
     return 0;
 }
 
-void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
-                                                       AABox* checkSourceZone, AABox* listenerZone) {
+void AudioMixerClientData::checkBuffersBeforeFrameSend(AABox* checkSourceZone, AABox* listenerZone) {
     for (int i = 0; i < _ringBuffers.size(); i++) {
-        if (_ringBuffers[i]->shouldBeAddedToMix(jitterBufferLengthSamples)) {
+        if (_ringBuffers[i]->shouldBeAddedToMix()) {
             // this is a ring buffer that is ready to go
             // set its flag so we know to push its buffer when all is said and done
             _ringBuffers[i]->setWillBeAddedToMix(true);

@@ -120,20 +119,22 @@ void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSam
     }
 }
 
 void AudioMixerClientData::pushBuffersAfterFrameSend() {
-    for (int i = 0; i < _ringBuffers.size(); i++) {
+    QList<PositionalAudioRingBuffer*>::iterator i = _ringBuffers.begin();
+    while (i != _ringBuffers.end()) {
         // this was a used buffer, push the output pointer forwards
-        PositionalAudioRingBuffer* audioBuffer = _ringBuffers[i];
+        PositionalAudioRingBuffer* audioBuffer = *i;
 
         if (audioBuffer->willBeAddedToMix()) {
-            audioBuffer->shiftReadPosition(audioBuffer->isStereo()
-                ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL);
+            audioBuffer->shiftReadPosition(audioBuffer->getSamplesPerFrame());
             audioBuffer->setWillBeAddedToMix(false);
         } else if (audioBuffer->getType() == PositionalAudioRingBuffer::Injector
                    && audioBuffer->hasStarted() && audioBuffer->isStarved()) {
             // this is an empty audio buffer that has starved, safe to delete
             delete audioBuffer;
-            _ringBuffers.erase(_ringBuffers.begin() + i);
+            i = _ringBuffers.erase(i);
+            continue;
         }
+        i++;
     }
 }
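Two things change in pushBuffersAfterFrameSend(): the frame size now comes from getSamplesPerFrame() instead of an inline stereo ternary, and the loop switches to the iterator-erase idiom. The old index loop had a subtle flaw: after erasing element i and then running i++, the element that slid into position i was skipped. QList::erase() returns an iterator to the next element, which is what the continue preserves. A minimal standalone sketch of the idiom (Widget is a hypothetical type):

    #include <QList>

    struct Widget { bool dead; };

    void pruneDead(QList<Widget*>& widgets) {
        QList<Widget*>::iterator i = widgets.begin();
        while (i != widgets.end()) {
            if ((*i)->dead) {
                delete *i;
                i = widgets.erase(i);  // erase() returns the next valid iterator
                continue;              // don't advance past it a second time
            }
            i++;
        }
    }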

File: AudioMixerClientData.h

@@ -27,8 +27,7 @@ public:
     AvatarAudioRingBuffer* getAvatarAudioRingBuffer() const;
 
     int parseData(const QByteArray& packet);
-    void checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
-                                     AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
+    void checkBuffersBeforeFrameSend(AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
     void pushBuffersAfterFrameSend();
 private:
     QList<PositionalAudioRingBuffer*> _ringBuffers;

File: AvatarAudioRingBuffer.cpp

@@ -19,6 +19,9 @@ AvatarAudioRingBuffer::AvatarAudioRingBuffer(bool isStereo) :
 }
 
 int AvatarAudioRingBuffer::parseData(const QByteArray& packet) {
+    _interframeTimeGapStats.frameReceived();
+    updateDesiredJitterBufferFrames();
+
     _shouldLoopbackForNode = (packetTypeForPacket(packet) == PacketTypeMicrophoneAudioWithEcho);
     return PositionalAudioRingBuffer::parseData(packet);
 }
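InjectedAudioRingBuffer::parseData() below gains the identical two lines, so avatar microphone streams and injected sound streams each maintain their own per-stream gap statistics. A minimal sketch of the shared shape (SomeAudioRingBuffer and its stub members are hypothetical stand-ins for the real classes):

    #include <QByteArray>

    class SomeAudioRingBuffer {
    public:
        int parseData(const QByteArray& packet) {
            _interframeTimeGapStats.frameReceived();  // measure the gap since the previous packet
            updateDesiredJitterBufferFrames();        // no-op unless a new window max is ready
            // ... type-specific parsing of the packet payload would follow ...
            return packet.size();
        }
    private:
        void updateDesiredJitterBufferFrames() { /* recompute the target length */ }
        struct GapStatsStub { void frameReceived() { /* record arrival time gap */ } };
        GapStatsStub _interframeTimeGapStats;
    };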

File: Audio.cpp

@@ -461,8 +461,8 @@ void Audio::handleAudioInput() {
         int16_t* inputAudioSamples = new int16_t[inputSamplesRequired];
         _inputRingBuffer.readSamples(inputAudioSamples, inputSamplesRequired);
 
-        int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
-        int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
+        const int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
+        const int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
 
         // zero out the monoAudioSamples array and the locally injected audio
         memset(networkAudioSamples, 0, numNetworkBytes);

@@ -634,12 +634,10 @@ void Audio::handleAudioInput() {
             packetType = PacketTypeSilentAudioFrame;
 
             // we need to indicate how many silent samples this is to the audio mixer
-            audioDataPacket[0] = _isStereoInput
-                ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO
-                : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
+            networkAudioSamples[0] = numNetworkSamples;
             numAudioBytes = sizeof(int16_t);
         } else {
-            numAudioBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
+            numAudioBytes = numNetworkBytes;
 
             if (Menu::getInstance()->isOptionChecked(MenuOption::EchoServerAudio)) {
                 packetType = PacketTypeMicrophoneAudioWithEcho;
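A silent frame therefore travels as a single int16_t whose value is the full frame's sample count; the mixer reads that count back in PositionalAudioRingBuffer::parseData() (shown later in this commit). The old code appears to have written the count through audioDataPacket, a byte pointer, where values like 512 cannot fit. A hedged round-trip sketch, assuming 1024-sample stereo frames:

    #include <cstdint>
    #include <cstring>

    int main() {
        // sender side: the first sample slot of the otherwise-empty frame carries the count
        int16_t networkAudioSamples[1024] = {};      // assumed stereo frame of 1024 samples
        int16_t numNetworkSamples = 1024;
        networkAudioSamples[0] = numNetworkSamples;  // the payload sent is just this one int16_t

        // receiver side, as in PositionalAudioRingBuffer::parseData(): read the count back
        int16_t numSilentSamples;
        std::memcpy(&numSilentSamples, networkAudioSamples, sizeof(int16_t));
        return numSilentSamples == 1024 ? 0 : 1;     // 0: the round trip preserved the count
    }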

File: AudioRingBuffer.cpp

@@ -124,17 +124,15 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
     std::less<int16_t*> less;
     std::less_equal<int16_t*> lessEqual;
 
-    if (_hasStarted
-        && (less(_endOfLastWrite, _nextOutput)
-            && lessEqual(_nextOutput, shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy)))) {
+    if (_hasStarted && samplesToCopy > _sampleCapacity - samplesAvailable()) {
         // this read will cross the next output, so call us starved and reset the buffer
         qDebug() << "Filled the ring buffer. Resetting.";
         _endOfLastWrite = _buffer;
         _nextOutput = _buffer;
         _isStarved = true;
     }
 
     if (_endOfLastWrite + samplesToCopy <= _buffer + _sampleCapacity) {
         memcpy(_endOfLastWrite, data, samplesToCopy * sizeof(int16_t));
     } else {

@@ -144,7 +142,7 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
     }
 
     _endOfLastWrite = shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy);
 
     return samplesToCopy * sizeof(int16_t);
 }
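The rewritten guard drops the wrapped-pointer comparisons in favor of plain capacity arithmetic: a write clobbers unread samples exactly when it needs more room than the buffer has free. A standalone sketch with made-up numbers:

    #include <iostream>

    int main() {
        // assumed numbers, for illustration only
        int sampleCapacity = 5120;      // e.g. ten mono frames of 512 samples
        int samplesAvailable = 4608;    // unread samples currently in the buffer
        int samplesToCopy = 1024;       // an incoming stereo frame

        // the write would clobber unread data iff it needs more than the free space
        bool wouldOverwriteUnread = samplesToCopy > sampleCapacity - samplesAvailable;
        std::cout << std::boolalpha << wouldOverwriteUnread << "\n";  // true: reset and starve
        return 0;
    }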

File: InjectedAudioRingBuffer.cpp

@@ -31,6 +31,9 @@ InjectedAudioRingBuffer::InjectedAudioRingBuffer(const QUuid& streamIdentifier)
 const uchar MAX_INJECTOR_VOLUME = 255;
 
 int InjectedAudioRingBuffer::parseData(const QByteArray& packet) {
+    _interframeTimeGapStats.frameReceived();
+    updateDesiredJitterBufferFrames();
+
     // setup a data stream to read from this packet
     QDataStream packetStream(packet);
     packetStream.skipRawData(numBytesForPacketHeader(packet));

File: PositionalAudioRingBuffer.cpp

@@ -19,6 +19,71 @@
 #include <UUID.h>
 
 #include "PositionalAudioRingBuffer.h"
+#include "SharedUtil.h"
+
+InterframeTimeGapStats::InterframeTimeGapStats()
+    : _lastFrameReceivedTime(0),
+    _numSamplesInCurrentInterval(0),
+    _currentIntervalMaxGap(0),
+    _newestIntervalMaxGapAt(0),
+    _windowMaxGap(0),
+    _newWindowMaxGapAvailable(false)
+{
+    memset(_intervalMaxGaps, 0, TIME_GAP_NUM_INTERVALS_IN_WINDOW * sizeof(quint64));
+}
+
+void InterframeTimeGapStats::frameReceived() {
+    quint64 now = usecTimestampNow();
+
+    // make sure this isn't the first time frameReceived() is called, so we can actually calculate a gap.
+    if (_lastFrameReceivedTime != 0) {
+        quint64 gap = now - _lastFrameReceivedTime;
+
+        // update the current interval max
+        if (gap > _currentIntervalMaxGap) {
+            _currentIntervalMaxGap = gap;
+
+            // keep the window max gap at least as large as the current interval max;
+            // this allows the window max gap to respond immediately to a sudden spike in gap times
+            // and also prevents the window max gap from staying at 0 until the first interval of samples fills up
+            if (_currentIntervalMaxGap > _windowMaxGap) {
+                _windowMaxGap = _currentIntervalMaxGap;
+                _newWindowMaxGapAvailable = true;
+            }
+        }
+        _numSamplesInCurrentInterval++;
+
+        // if the current interval of samples is now full, record it in our interval maxes
+        if (_numSamplesInCurrentInterval == TIME_GAP_NUM_SAMPLES_IN_INTERVAL) {
+
+            // find the location to insert this interval's max (increment index cyclically)
+            _newestIntervalMaxGapAt = _newestIntervalMaxGapAt == TIME_GAP_NUM_INTERVALS_IN_WINDOW - 1 ? 0 : _newestIntervalMaxGapAt + 1;
+
+            // record the current interval's max gap as the newest
+            _intervalMaxGaps[_newestIntervalMaxGapAt] = _currentIntervalMaxGap;
+
+            // update the window max gap, which is the max out of all the past intervals' max gaps
+            _windowMaxGap = 0;
+            for (int i = 0; i < TIME_GAP_NUM_INTERVALS_IN_WINDOW; i++) {
+                if (_intervalMaxGaps[i] > _windowMaxGap) {
+                    _windowMaxGap = _intervalMaxGaps[i];
+                }
+            }
+            _newWindowMaxGapAvailable = true;
+
+            // reset the current interval
+            _numSamplesInCurrentInterval = 0;
+            _currentIntervalMaxGap = 0;
+        }
+    }
+    _lastFrameReceivedTime = now;
+}
+
+quint64 InterframeTimeGapStats::getWindowMaxGap() {
+    _newWindowMaxGapAvailable = false;
+    return _windowMaxGap;
+}
+
 PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::Type type, bool isStereo) :
     AudioRingBuffer(isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL),

@@ -29,9 +94,10 @@ PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::
     _shouldLoopbackForNode(false),
     _shouldOutputStarveDebug(true),
     _isStereo(isStereo),
-    _listenerUnattenuatedZone(NULL)
+    _listenerUnattenuatedZone(NULL),
+    _desiredJitterBufferFrames(1),
+    _currentJitterBufferFrames(0)
 {
 }
 
 int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
@@ -54,13 +120,31 @@ int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
         readBytes += sizeof(int16_t);
 
         if (numSilentSamples > 0) {
-            addSilentFrame(numSilentSamples);
+            if (_currentJitterBufferFrames > _desiredJitterBufferFrames) {
+                // our current jitter buffer size exceeds its desired value, so ignore some silent
+                // frames to get that size as close to desired as possible
+                int samplesPerFrame = getSamplesPerFrame();
+                int numSilentFrames = numSilentSamples / samplesPerFrame;
+                int numFramesToDropDesired = _currentJitterBufferFrames - _desiredJitterBufferFrames;
+
+                if (numSilentFrames > numFramesToDropDesired) {
+                    // we have more than enough frames to drop to get the jitter buffer to its desired length
+                    int numSilentFramesToAdd = numSilentFrames - numFramesToDropDesired;
+                    addSilentFrame(numSilentFramesToAdd * samplesPerFrame);
+                    _currentJitterBufferFrames = _desiredJitterBufferFrames;
+                } else {
+                    // we need to drop all of these frames to get the jitter buffer as close as possible to its desired length
+                    _currentJitterBufferFrames -= numSilentFrames;
+                }
+            } else {
+                addSilentFrame(numSilentSamples);
+            }
         }
     } else {
         // there is audio data to read
         readBytes += writeData(packet.data() + readBytes, packet.size() - readBytes);
     }
     return readBytes;
 }
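Silent frames are the only material the mixer can shorten without losing audio, so they are where an over-long jitter buffer gets bled back down. A worked example with hypothetical numbers:

    int main() {
        // hypothetical state: the jitter buffer holds 5 frames, the target is 2, and a
        // silent packet arrives representing 4 frames (2048 samples at 512 per frame)
        int samplesPerFrame = 512;
        int numSilentSamples = 2048;
        int currentJitterBufferFrames = 5;
        int desiredJitterBufferFrames = 2;

        int numSilentFrames = numSilentSamples / samplesPerFrame;                           // = 4
        int numFramesToDropDesired = currentJitterBufferFrames - desiredJitterBufferFrames; // = 3

        // 4 > 3: buffer only the surplus frame and land exactly on the target length
        int numSilentFramesToAdd = numSilentFrames - numFramesToDropDesired;                // = 1
        return numSilentFramesToAdd;
    }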
@@ -106,29 +190,54 @@ void PositionalAudioRingBuffer::updateNextOutputTrailingLoudness() {
         }
     }
 }
 
-bool PositionalAudioRingBuffer::shouldBeAddedToMix(int numJitterBufferSamples) {
-    if (!isNotStarvedOrHasMinimumSamples(NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL + numJitterBufferSamples)) {
+bool PositionalAudioRingBuffer::shouldBeAddedToMix() {
+    int samplesPerFrame = getSamplesPerFrame();
+    int desiredJitterBufferSamples = _desiredJitterBufferFrames * samplesPerFrame;
+
+    if (!isNotStarvedOrHasMinimumSamples(samplesPerFrame + desiredJitterBufferSamples)) {
+        // if the buffer was starved, allow it to accrue at least the desired number of
+        // jitter buffer frames before we start taking frames from it for mixing
         if (_shouldOutputStarveDebug) {
             _shouldOutputStarveDebug = false;
         }
 
         return false;
-    } else if (samplesAvailable() < NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL) {
+    } else if (samplesAvailable() < samplesPerFrame) {
+        // if the buffer doesn't have a full frame of samples to take for mixing, it is starved
         _isStarved = true;
 
+        // set to 0 to indicate the jitter buffer is starved
+        _currentJitterBufferFrames = 0;
 
         // reset our _shouldOutputStarveDebug to true so the next is printed
         _shouldOutputStarveDebug = true;
 
         return false;
-    } else {
-        // good buffer, add this to the mix
+    }
+
+    // good buffer, add this to the mix
+    if (_isStarved) {
+        // if this buffer has just finished replenishing after being starved, the number of frames in it now
+        // minus one (since a frame will be read immediately after this) is the length of the jitter buffer
+        _currentJitterBufferFrames = samplesAvailable() / samplesPerFrame - 1;
         _isStarved = false;
-
-        // since we've read data from ring buffer at least once - we've started
-        _hasStarted = true;
-
-        return true;
     }
-    return false;
+
+    // since we've read data from ring buffer at least once - we've started
+    _hasStarted = true;
+    return true;
+}
+
+void PositionalAudioRingBuffer::updateDesiredJitterBufferFrames() {
+    const float USECS_PER_FRAME = NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL * USECS_PER_SECOND / (float)SAMPLE_RATE;
+
+    if (_interframeTimeGapStats.hasNewWindowMaxGapAvailable()) {
+        _desiredJitterBufferFrames = ceilf((float)_interframeTimeGapStats.getWindowMaxGap() / USECS_PER_FRAME);
+        if (_desiredJitterBufferFrames < 1) {
+            _desiredJitterBufferFrames = 1;
+        }
+    }
 }
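The desired length is simply the window max gap expressed in whole frames, rounded up and clamped to at least one. A standalone sketch of the arithmetic, assuming the 24 kHz sample rate and 512-sample mono frames this codebase used elsewhere at the time:

    #include <cmath>
    #include <cstdio>

    int main() {
        // assumed constants: 512 samples per mono frame at 24000 Hz
        const float USECS_PER_FRAME = 512.0f * 1000000.0f / 24000.0f;  // ~21333.3 us

        unsigned long long windowMaxGap = 50000;  // worst gap seen in the window: 50 ms
        int desiredFrames = (int)ceilf((float)windowMaxGap / USECS_PER_FRAME);
        if (desiredFrames < 1) {
            desiredFrames = 1;  // never shrink below one frame
        }
        printf("desired jitter buffer frames: %d\n", desiredFrames);  // prints 3
        return 0;
    }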

File: PositionalAudioRingBuffer.h

@@ -18,6 +18,31 @@
 
 #include "AudioRingBuffer.h"
 
+// this means that every 500 samples, the max for the past 10*500 samples will be calculated
+const int TIME_GAP_NUM_SAMPLES_IN_INTERVAL = 500;
+const int TIME_GAP_NUM_INTERVALS_IN_WINDOW = 10;
+
+// class used to track time between incoming frames for the purpose of varying the jitter buffer length
+class InterframeTimeGapStats {
+public:
+    InterframeTimeGapStats();
+
+    void frameReceived();
+    bool hasNewWindowMaxGapAvailable() const { return _newWindowMaxGapAvailable; }
+    quint64 peekWindowMaxGap() const { return _windowMaxGap; }
+    quint64 getWindowMaxGap();
+
+private:
+    quint64 _lastFrameReceivedTime;
+
+    int _numSamplesInCurrentInterval;
+    quint64 _currentIntervalMaxGap;
+    quint64 _intervalMaxGaps[TIME_GAP_NUM_INTERVALS_IN_WINDOW];
+    int _newestIntervalMaxGapAt;
+
+    quint64 _windowMaxGap;
+    bool _newWindowMaxGapAvailable;
+};
+
 class PositionalAudioRingBuffer : public AudioRingBuffer {
 public:
     enum Type {

@@ -34,7 +59,7 @@ public:
     void updateNextOutputTrailingLoudness();
     float getNextOutputTrailingLoudness() const { return _nextOutputTrailingLoudness; }
 
-    bool shouldBeAddedToMix(int numJitterBufferSamples);
+    bool shouldBeAddedToMix();
 
     bool willBeAddedToMix() const { return _willBeAddedToMix; }
     void setWillBeAddedToMix(bool willBeAddedToMix) { _willBeAddedToMix = willBeAddedToMix; }

@@ -50,10 +75,14 @@ public:
     AABox* getListenerUnattenuatedZone() const { return _listenerUnattenuatedZone; }
     void setListenerUnattenuatedZone(AABox* listenerUnattenuatedZone) { _listenerUnattenuatedZone = listenerUnattenuatedZone; }
 
+    int getSamplesPerFrame() const { return _isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL; }
+
 protected:
     // disallow copying of PositionalAudioRingBuffer objects
     PositionalAudioRingBuffer(const PositionalAudioRingBuffer&);
     PositionalAudioRingBuffer& operator= (const PositionalAudioRingBuffer&);
 
+    void updateDesiredJitterBufferFrames();
+
     PositionalAudioRingBuffer::Type _type;
     glm::vec3 _position;

@@ -65,6 +94,10 @@ protected:
     float _nextOutputTrailingLoudness;
     AABox* _listenerUnattenuatedZone;
 
+    InterframeTimeGapStats _interframeTimeGapStats;
+    int _desiredJitterBufferFrames;
+    int _currentJitterBufferFrames;
 };
 
 #endif // hifi_PositionalAudioRingBuffer_h
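One subtlety in this API: peekWindowMaxGap() is a const inspector, while getWindowMaxGap() also clears the new-value flag, so a caller polling hasNewWindowMaxGapAvailable() acts exactly once per window update. A sketch of the intended polling pattern, mirroring updateDesiredJitterBufferFrames() (onPacketArrived() is a hypothetical call site):

    #include <QtGlobal>

    InterframeTimeGapStats stats;

    void onPacketArrived() {
        stats.frameReceived();                      // once per incoming packet
        if (stats.hasNewWindowMaxGapAvailable()) {
            quint64 gap = stats.getWindowMaxGap();  // read the max and clear the flag
            // ... recompute the desired jitter buffer length from gap ...
        }
    }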

File: OctreeEditPacketSender.cpp

@@ -354,7 +354,7 @@ void OctreeEditPacketSender::processNackPacket(const QByteArray& packet) {
     // read number of sequence numbers
     uint16_t numSequenceNumbers = (*(uint16_t*)dataAt);
     dataAt += sizeof(uint16_t);
 
     // read sequence numbers and queue packets for resend
     for (int i = 0; i < numSequenceNumbers; i++) {
         unsigned short int sequenceNumber = (*(unsigned short int*)dataAt);