Merge pull request #3055 from wangyix/master

Audio jitter buffer resizes depending on max inter-frame time gaps of incoming audio packets
This commit is contained in:
Brad Hefta-Gaub 2014-06-23 12:45:04 -07:00
commit 18cd794542
10 changed files with 188 additions and 48 deletions

View file

@ -54,9 +54,6 @@
#include "AudioMixer.h"
const short JITTER_BUFFER_MSECS = 12;
const short JITTER_BUFFER_SAMPLES = JITTER_BUFFER_MSECS * (SAMPLE_RATE / 1000.0);
const float LOUDNESS_TO_DISTANCE_RATIO = 0.00001f;
const QString AUDIO_MIXER_LOGGING_TARGET_NAME = "audio-mixer";
@ -487,8 +484,7 @@ void AudioMixer::run() {
foreach (const SharedNodePointer& node, nodeList->getNodeHash()) {
if (node->getLinkedData()) {
((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(JITTER_BUFFER_SAMPLES,
_sourceUnattenuatedZone,
((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(_sourceUnattenuatedZone,
_listenerUnattenuatedZone);
}
}

View file

@ -98,10 +98,9 @@ int AudioMixerClientData::parseData(const QByteArray& packet) {
return 0;
}
void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
AABox* checkSourceZone, AABox* listenerZone) {
void AudioMixerClientData::checkBuffersBeforeFrameSend(AABox* checkSourceZone, AABox* listenerZone) {
for (int i = 0; i < _ringBuffers.size(); i++) {
if (_ringBuffers[i]->shouldBeAddedToMix(jitterBufferLengthSamples)) {
if (_ringBuffers[i]->shouldBeAddedToMix()) {
// this is a ring buffer that is ready to go
// set its flag so we know to push its buffer when all is said and done
_ringBuffers[i]->setWillBeAddedToMix(true);
@ -120,20 +119,22 @@ void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSam
}
void AudioMixerClientData::pushBuffersAfterFrameSend() {
for (int i = 0; i < _ringBuffers.size(); i++) {
QList<PositionalAudioRingBuffer*>::iterator i = _ringBuffers.begin();
while (i != _ringBuffers.end()) {
// this was a used buffer, push the output pointer forwards
PositionalAudioRingBuffer* audioBuffer = _ringBuffers[i];
PositionalAudioRingBuffer* audioBuffer = *i;
if (audioBuffer->willBeAddedToMix()) {
audioBuffer->shiftReadPosition(audioBuffer->isStereo()
? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL);
audioBuffer->shiftReadPosition(audioBuffer->getSamplesPerFrame());
audioBuffer->setWillBeAddedToMix(false);
} else if (audioBuffer->getType() == PositionalAudioRingBuffer::Injector
&& audioBuffer->hasStarted() && audioBuffer->isStarved()) {
// this is an empty audio buffer that has starved, safe to delete
delete audioBuffer;
_ringBuffers.erase(_ringBuffers.begin() + i);
i = _ringBuffers.erase(i);
continue;
}
i++;
}
}

View file

@ -27,8 +27,7 @@ public:
AvatarAudioRingBuffer* getAvatarAudioRingBuffer() const;
int parseData(const QByteArray& packet);
void checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
void checkBuffersBeforeFrameSend(AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
void pushBuffersAfterFrameSend();
private:
QList<PositionalAudioRingBuffer*> _ringBuffers;

View file

@ -19,6 +19,9 @@ AvatarAudioRingBuffer::AvatarAudioRingBuffer(bool isStereo) :
}
int AvatarAudioRingBuffer::parseData(const QByteArray& packet) {
_interframeTimeGapStats.frameReceived();
updateDesiredJitterBufferFrames();
_shouldLoopbackForNode = (packetTypeForPacket(packet) == PacketTypeMicrophoneAudioWithEcho);
return PositionalAudioRingBuffer::parseData(packet);
}

View file

@ -461,8 +461,8 @@ void Audio::handleAudioInput() {
int16_t* inputAudioSamples = new int16_t[inputSamplesRequired];
_inputRingBuffer.readSamples(inputAudioSamples, inputSamplesRequired);
int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
const int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
const int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
// zero out the monoAudioSamples array and the locally injected audio
memset(networkAudioSamples, 0, numNetworkBytes);
@ -634,12 +634,10 @@ void Audio::handleAudioInput() {
packetType = PacketTypeSilentAudioFrame;
// we need to indicate how many silent samples this is to the audio mixer
audioDataPacket[0] = _isStereoInput
? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO
: NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
networkAudioSamples[0] = numNetworkSamples;
numAudioBytes = sizeof(int16_t);
} else {
numAudioBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
numAudioBytes = numNetworkBytes;
if (Menu::getInstance()->isOptionChecked(MenuOption::EchoServerAudio)) {
packetType = PacketTypeMicrophoneAudioWithEcho;

View file

@ -124,17 +124,15 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
std::less<int16_t*> less;
std::less_equal<int16_t*> lessEqual;
if (_hasStarted
&& (less(_endOfLastWrite, _nextOutput)
&& lessEqual(_nextOutput, shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy)))) {
if (_hasStarted && samplesToCopy > _sampleCapacity - samplesAvailable()) {
// this read will cross the next output, so call us starved and reset the buffer
qDebug() << "Filled the ring buffer. Resetting.";
_endOfLastWrite = _buffer;
_nextOutput = _buffer;
_isStarved = true;
}
if (_endOfLastWrite + samplesToCopy <= _buffer + _sampleCapacity) {
memcpy(_endOfLastWrite, data, samplesToCopy * sizeof(int16_t));
} else {
@ -144,7 +142,7 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
}
_endOfLastWrite = shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy);
return samplesToCopy * sizeof(int16_t);
}

View file

@ -31,6 +31,9 @@ InjectedAudioRingBuffer::InjectedAudioRingBuffer(const QUuid& streamIdentifier)
const uchar MAX_INJECTOR_VOLUME = 255;
int InjectedAudioRingBuffer::parseData(const QByteArray& packet) {
_interframeTimeGapStats.frameReceived();
updateDesiredJitterBufferFrames();
// setup a data stream to read from this packet
QDataStream packetStream(packet);
packetStream.skipRawData(numBytesForPacketHeader(packet));

View file

@ -19,6 +19,71 @@
#include <UUID.h>
#include "PositionalAudioRingBuffer.h"
#include "SharedUtil.h"
InterframeTimeGapStats::InterframeTimeGapStats()
: _lastFrameReceivedTime(0),
_numSamplesInCurrentInterval(0),
_currentIntervalMaxGap(0),
_newestIntervalMaxGapAt(0),
_windowMaxGap(0),
_newWindowMaxGapAvailable(false)
{
memset(_intervalMaxGaps, 0, TIME_GAP_NUM_INTERVALS_IN_WINDOW * sizeof(quint64));
}
void InterframeTimeGapStats::frameReceived() {
quint64 now = usecTimestampNow();
// make sure this isn't the first time frameReceived() is called so can actually calculate a gap.
if (_lastFrameReceivedTime != 0) {
quint64 gap = now - _lastFrameReceivedTime;
// update the current interval max
if (gap > _currentIntervalMaxGap) {
_currentIntervalMaxGap = gap;
// keep the window max gap at least as large as the current interval max
// this allows the window max gap to respond immediately to a sudden spike in gap times
// also, this prevents the window max gap from staying at 0 until the first interval of samples filled up
if (_currentIntervalMaxGap > _windowMaxGap) {
_windowMaxGap = _currentIntervalMaxGap;
_newWindowMaxGapAvailable = true;
}
}
_numSamplesInCurrentInterval++;
// if the current interval of samples is now full, record it in our interval maxes
if (_numSamplesInCurrentInterval == TIME_GAP_NUM_SAMPLES_IN_INTERVAL) {
// find location to insert this interval's max (increment index cyclically)
_newestIntervalMaxGapAt = _newestIntervalMaxGapAt == TIME_GAP_NUM_INTERVALS_IN_WINDOW - 1 ? 0 : _newestIntervalMaxGapAt + 1;
// record the current interval's max gap as the newest
_intervalMaxGaps[_newestIntervalMaxGapAt] = _currentIntervalMaxGap;
// update the window max gap, which is the max out of all the past intervals' max gaps
_windowMaxGap = 0;
for (int i = 0; i < TIME_GAP_NUM_INTERVALS_IN_WINDOW; i++) {
if (_intervalMaxGaps[i] > _windowMaxGap) {
_windowMaxGap = _intervalMaxGaps[i];
}
}
_newWindowMaxGapAvailable = true;
// reset the current interval
_numSamplesInCurrentInterval = 0;
_currentIntervalMaxGap = 0;
}
}
_lastFrameReceivedTime = now;
}
quint64 InterframeTimeGapStats::getWindowMaxGap() {
_newWindowMaxGapAvailable = false;
return _windowMaxGap;
}
PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::Type type, bool isStereo) :
AudioRingBuffer(isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL),
@ -29,9 +94,10 @@ PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::
_shouldLoopbackForNode(false),
_shouldOutputStarveDebug(true),
_isStereo(isStereo),
_listenerUnattenuatedZone(NULL)
_listenerUnattenuatedZone(NULL),
_desiredJitterBufferFrames(1),
_currentJitterBufferFrames(0)
{
}
int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
@ -54,13 +120,31 @@ int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
readBytes += sizeof(int16_t);
if (numSilentSamples > 0) {
addSilentFrame(numSilentSamples);
if (_currentJitterBufferFrames > _desiredJitterBufferFrames) {
// our current jitter buffer size exceeds its desired value, so ignore some silent
// frames to get that size as close to desired as possible
int samplesPerFrame = getSamplesPerFrame();
int numSilentFrames = numSilentSamples / samplesPerFrame;
int numFramesToDropDesired = _currentJitterBufferFrames - _desiredJitterBufferFrames;
if (numSilentFrames > numFramesToDropDesired) {
// we have more than enough frames to drop to get the jitter buffer to its desired length
int numSilentFramesToAdd = numSilentFrames - numFramesToDropDesired;
addSilentFrame(numSilentFramesToAdd * samplesPerFrame);
_currentJitterBufferFrames = _desiredJitterBufferFrames;
} else {
// we need to drop all frames to get the jitter buffer close as possible to its desired length
_currentJitterBufferFrames -= numSilentFrames;
}
} else {
addSilentFrame(numSilentSamples);
}
}
} else {
// there is audio data to read
readBytes += writeData(packet.data() + readBytes, packet.size() - readBytes);
}
return readBytes;
}
@ -106,29 +190,54 @@ void PositionalAudioRingBuffer::updateNextOutputTrailingLoudness() {
}
}
bool PositionalAudioRingBuffer::shouldBeAddedToMix(int numJitterBufferSamples) {
if (!isNotStarvedOrHasMinimumSamples(NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL + numJitterBufferSamples)) {
bool PositionalAudioRingBuffer::shouldBeAddedToMix() {
int samplesPerFrame = getSamplesPerFrame();
int desiredJitterBufferSamples = _desiredJitterBufferFrames * samplesPerFrame;
if (!isNotStarvedOrHasMinimumSamples(samplesPerFrame + desiredJitterBufferSamples)) {
// if the buffer was starved, allow it to accrue at least the desired number of
// jitter buffer frames before we start taking frames from it for mixing
if (_shouldOutputStarveDebug) {
_shouldOutputStarveDebug = false;
}
return false;
} else if (samplesAvailable() < NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL) {
return false;
} else if (samplesAvailable() < samplesPerFrame) {
// if the buffer doesn't have a full frame of samples to take for mixing, it is starved
_isStarved = true;
// set to 0 to indicate the jitter buffer is starved
_currentJitterBufferFrames = 0;
// reset our _shouldOutputStarveDebug to true so the next is printed
_shouldOutputStarveDebug = true;
return false;
} else {
// good buffer, add this to the mix
}
// good buffer, add this to the mix
if (_isStarved) {
// if this buffer has just finished replenishing after being starved, the number of frames in it now
// minus one (since a frame will be read immediately after this) is the length of the jitter buffer
_currentJitterBufferFrames = samplesAvailable() / samplesPerFrame - 1;
_isStarved = false;
// since we've read data from ring buffer at least once - we've started
_hasStarted = true;
return true;
}
return false;
// since we've read data from ring buffer at least once - we've started
_hasStarted = true;
return true;
}
void PositionalAudioRingBuffer::updateDesiredJitterBufferFrames() {
const float USECS_PER_FRAME = NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL * USECS_PER_SECOND / (float)SAMPLE_RATE;
if (_interframeTimeGapStats.hasNewWindowMaxGapAvailable()) {
_desiredJitterBufferFrames = ceilf((float)_interframeTimeGapStats.getWindowMaxGap() / USECS_PER_FRAME);
if (_desiredJitterBufferFrames < 1) {
_desiredJitterBufferFrames = 1;
}
}
}

View file

@ -18,6 +18,31 @@
#include "AudioRingBuffer.h"
// this means that every 500 samples, the max for the past 10*500 samples will be calculated
const int TIME_GAP_NUM_SAMPLES_IN_INTERVAL = 500;
const int TIME_GAP_NUM_INTERVALS_IN_WINDOW = 10;
// class used to track time between incoming frames for the purpose of varying the jitter buffer length
class InterframeTimeGapStats {
public:
InterframeTimeGapStats();
void frameReceived();
bool hasNewWindowMaxGapAvailable() const { return _newWindowMaxGapAvailable; }
quint64 peekWindowMaxGap() const { return _windowMaxGap; }
quint64 getWindowMaxGap();
private:
quint64 _lastFrameReceivedTime;
int _numSamplesInCurrentInterval;
quint64 _currentIntervalMaxGap;
quint64 _intervalMaxGaps[TIME_GAP_NUM_INTERVALS_IN_WINDOW];
int _newestIntervalMaxGapAt;
quint64 _windowMaxGap;
bool _newWindowMaxGapAvailable;
};
class PositionalAudioRingBuffer : public AudioRingBuffer {
public:
enum Type {
@ -34,7 +59,7 @@ public:
void updateNextOutputTrailingLoudness();
float getNextOutputTrailingLoudness() const { return _nextOutputTrailingLoudness; }
bool shouldBeAddedToMix(int numJitterBufferSamples);
bool shouldBeAddedToMix();
bool willBeAddedToMix() const { return _willBeAddedToMix; }
void setWillBeAddedToMix(bool willBeAddedToMix) { _willBeAddedToMix = willBeAddedToMix; }
@ -50,10 +75,14 @@ public:
AABox* getListenerUnattenuatedZone() const { return _listenerUnattenuatedZone; }
void setListenerUnattenuatedZone(AABox* listenerUnattenuatedZone) { _listenerUnattenuatedZone = listenerUnattenuatedZone; }
int getSamplesPerFrame() const { return _isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL; }
protected:
// disallow copying of PositionalAudioRingBuffer objects
PositionalAudioRingBuffer(const PositionalAudioRingBuffer&);
PositionalAudioRingBuffer& operator= (const PositionalAudioRingBuffer&);
void updateDesiredJitterBufferFrames();
PositionalAudioRingBuffer::Type _type;
glm::vec3 _position;
@ -65,6 +94,10 @@ protected:
float _nextOutputTrailingLoudness;
AABox* _listenerUnattenuatedZone;
InterframeTimeGapStats _interframeTimeGapStats;
int _desiredJitterBufferFrames;
int _currentJitterBufferFrames;
};
#endif // hifi_PositionalAudioRingBuffer_h

View file

@ -354,7 +354,7 @@ void OctreeEditPacketSender::processNackPacket(const QByteArray& packet) {
// read number of sequence numbers
uint16_t numSequenceNumbers = (*(uint16_t*)dataAt);
dataAt += sizeof(uint16_t);
// read sequence numbers and queue packets for resend
for (int i = 0; i < numSequenceNumbers; i++) {
unsigned short int sequenceNumber = (*(unsigned short int*)dataAt);