Merge pull request #3055 from wangyix/master

Audio jitter buffer resizes depending on max inter-frame time gaps of incoming audio packets
2025-04-20 18:44:01 +02:00 · 2014-06-23 12:45:04 -07:00 · 2014-06-23 12:45:04 -07:00 · 18cd794542
commit 18cd794542
parent 29a8307825 fbdca59d37
10 changed files with 188 additions and 48 deletions
--- a/assignment-client/src/audio/AudioMixer.cpp
+++ b/assignment-client/src/audio/AudioMixer.cpp
@ -54,9 +54,6 @@

 #include "AudioMixer.h"

-const short JITTER_BUFFER_MSECS = 12;
-const short JITTER_BUFFER_SAMPLES = JITTER_BUFFER_MSECS * (SAMPLE_RATE / 1000.0);
-
 const float LOUDNESS_TO_DISTANCE_RATIO = 0.00001f;

 const QString AUDIO_MIXER_LOGGING_TARGET_NAME = "audio-mixer";
@ -487,8 +484,7 @@ void AudioMixer::run() {
        
        foreach (const SharedNodePointer& node, nodeList->getNodeHash()) {
            if (node->getLinkedData()) {
-                ((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(JITTER_BUFFER_SAMPLES,
-                                                                                             _sourceUnattenuatedZone,
+                ((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(_sourceUnattenuatedZone,
                                                                                             _listenerUnattenuatedZone);
            }
        }
--- a/assignment-client/src/audio/AudioMixerClientData.cpp
+++ b/assignment-client/src/audio/AudioMixerClientData.cpp
@ -98,10 +98,9 @@ int AudioMixerClientData::parseData(const QByteArray& packet) {
    return 0;
 }

-void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
-                                                       AABox* checkSourceZone, AABox* listenerZone) {
+void AudioMixerClientData::checkBuffersBeforeFrameSend(AABox* checkSourceZone, AABox* listenerZone) {
    for (int i = 0; i < _ringBuffers.size(); i++) {
-        if (_ringBuffers[i]->shouldBeAddedToMix(jitterBufferLengthSamples)) {
+        if (_ringBuffers[i]->shouldBeAddedToMix()) {
            // this is a ring buffer that is ready to go
            // set its flag so we know to push its buffer when all is said and done
            _ringBuffers[i]->setWillBeAddedToMix(true);
@ -120,20 +119,22 @@ void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSam
 }

 void AudioMixerClientData::pushBuffersAfterFrameSend() {
-    for (int i = 0; i < _ringBuffers.size(); i++) {
+
+    QList<PositionalAudioRingBuffer*>::iterator i = _ringBuffers.begin();  // walk by iterator so entries can be erased mid-loop
+    while (i != _ringBuffers.end()) {
        // this was a used buffer, push the output pointer forwards
-        PositionalAudioRingBuffer* audioBuffer = _ringBuffers[i];
+        PositionalAudioRingBuffer* audioBuffer = *i;

        if (audioBuffer->willBeAddedToMix()) {
-            audioBuffer->shiftReadPosition(audioBuffer->isStereo()
-                                           ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL);
-
+            audioBuffer->shiftReadPosition(audioBuffer->getSamplesPerFrame());  // advance by one frame's worth of samples (stereo or mono)
            audioBuffer->setWillBeAddedToMix(false);
        } else if (audioBuffer->getType() == PositionalAudioRingBuffer::Injector
                   && audioBuffer->hasStarted() && audioBuffer->isStarved()) {
            // this is an empty audio buffer that has starved, safe to delete
            delete audioBuffer;
-            _ringBuffers.erase(_ringBuffers.begin() + i);
+            i = _ringBuffers.erase(i);  // erase() hands back the iterator to the entry after the removed one
+            continue;  // already positioned on the next entry, so skip the i++ below
        }
+        i++;  // only reached when nothing was erased in this pass
    }
 }
--- a/assignment-client/src/audio/AudioMixerClientData.h
+++ b/assignment-client/src/audio/AudioMixerClientData.h
@ -27,8 +27,7 @@ public:
    AvatarAudioRingBuffer* getAvatarAudioRingBuffer() const;
    
    int parseData(const QByteArray& packet);
-    void checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
-                                     AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
+    void checkBuffersBeforeFrameSend(AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
    void pushBuffersAfterFrameSend();
 private:
    QList<PositionalAudioRingBuffer*> _ringBuffers;
--- a/assignment-client/src/audio/AvatarAudioRingBuffer.cpp
+++ b/assignment-client/src/audio/AvatarAudioRingBuffer.cpp
@ -19,6 +19,9 @@ AvatarAudioRingBuffer::AvatarAudioRingBuffer(bool isStereo) :
 }

 int AvatarAudioRingBuffer::parseData(const QByteArray& packet) {
+    _interframeTimeGapStats.frameReceived();  // count this packet as one frame arrival in the inter-frame gap statistics
+    updateDesiredJitterBufferFrames();  // then refresh the desired jitter buffer length from those statistics
+
    _shouldLoopbackForNode = (packetTypeForPacket(packet) == PacketTypeMicrophoneAudioWithEcho);
    return PositionalAudioRingBuffer::parseData(packet);
 }
--- a/interface/src/Audio.cpp
+++ b/interface/src/Audio.cpp
@ -461,8 +461,8 @@ void Audio::handleAudioInput() {
        int16_t* inputAudioSamples = new int16_t[inputSamplesRequired];
        _inputRingBuffer.readSamples(inputAudioSamples, inputSamplesRequired);
        
-        int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
-        int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
+        const int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
+        const int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;

        // zero out the monoAudioSamples array and the locally injected audio
        memset(networkAudioSamples, 0, numNetworkBytes);
@ -634,12 +634,10 @@ void Audio::handleAudioInput() {
                packetType = PacketTypeSilentAudioFrame;
                
                // we need to indicate how many silent samples this is to the audio mixer
-                audioDataPacket[0] = _isStereoInput
-                    ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO
-                    : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
+                networkAudioSamples[0] = numNetworkSamples;
                numAudioBytes = sizeof(int16_t);
            } else {
-                numAudioBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
+                numAudioBytes = numNetworkBytes;
                
                if (Menu::getInstance()->isOptionChecked(MenuOption::EchoServerAudio)) {
                    packetType = PacketTypeMicrophoneAudioWithEcho;
--- a/libraries/audio/src/AudioRingBuffer.cpp
+++ b/libraries/audio/src/AudioRingBuffer.cpp
@ -124,17 +124,15 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {

    std::less<int16_t*> less;
    std::less_equal<int16_t*> lessEqual;
-
-    if (_hasStarted
-        && (less(_endOfLastWrite, _nextOutput)
-            && lessEqual(_nextOutput, shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy)))) {
+    
+    if (_hasStarted && samplesToCopy > _sampleCapacity - samplesAvailable()) {
        // this read will cross the next output, so call us starved and reset the buffer
        qDebug() << "Filled the ring buffer. Resetting.";
        _endOfLastWrite = _buffer;
        _nextOutput = _buffer;
        _isStarved = true;
    }
-
+    
    if (_endOfLastWrite + samplesToCopy <= _buffer + _sampleCapacity) {
        memcpy(_endOfLastWrite, data, samplesToCopy * sizeof(int16_t));
    } else {
@ -144,7 +142,7 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
    }

    _endOfLastWrite = shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy);
-
+    
    return samplesToCopy * sizeof(int16_t);
 }

--- a/libraries/audio/src/InjectedAudioRingBuffer.cpp
+++ b/libraries/audio/src/InjectedAudioRingBuffer.cpp
@ -31,6 +31,9 @@ InjectedAudioRingBuffer::InjectedAudioRingBuffer(const QUuid& streamIdentifier)
 const uchar MAX_INJECTOR_VOLUME = 255;

 int InjectedAudioRingBuffer::parseData(const QByteArray& packet) {
+    _interframeTimeGapStats.frameReceived();
+    updateDesiredJitterBufferFrames();
+
    // setup a data stream to read from this packet
    QDataStream packetStream(packet);
    packetStream.skipRawData(numBytesForPacketHeader(packet));
--- a/libraries/audio/src/PositionalAudioRingBuffer.cpp
+++ b/libraries/audio/src/PositionalAudioRingBuffer.cpp
@ -19,6 +19,71 @@
 #include <UUID.h>

 #include "PositionalAudioRingBuffer.h"
+#include "SharedUtil.h"
+
+InterframeTimeGapStats::InterframeTimeGapStats()  // starts with no frame seen and every gap statistic zeroed
+    : _lastFrameReceivedTime(0),  // 0 doubles as "no frame received yet" (checked in frameReceived())
+    _numSamplesInCurrentInterval(0),
+    _currentIntervalMaxGap(0),
+    _newestIntervalMaxGapAt(0),
+    _windowMaxGap(0),
+    _newWindowMaxGapAvailable(false)
+{
+    memset(_intervalMaxGaps, 0, TIME_GAP_NUM_INTERVALS_IN_WINDOW * sizeof(quint64));  // clear every per-interval max slot
+}
+
+void InterframeTimeGapStats::frameReceived() {  // call once per received frame; updates the inter-frame gap statistics
+    quint64 now = usecTimestampNow();
+
+    // a gap needs a previous timestamp, so the very first call only records the time (0 means "no frame yet")
+    if (_lastFrameReceivedTime != 0) {
+        quint64 gap = now - _lastFrameReceivedTime;
+
+        // track the largest gap seen in the interval that is currently being filled
+        if (gap > _currentIntervalMaxGap) {
+            _currentIntervalMaxGap = gap;
+
+            // keep the window max gap at least as large as the current interval max
+            // this allows the window max gap to respond immediately to a sudden spike in gap times
+            // also, this prevents the window max gap from staying at 0 until the first interval of samples has filled up
+            if (_currentIntervalMaxGap > _windowMaxGap) {
+                _windowMaxGap = _currentIntervalMaxGap;
+                _newWindowMaxGapAvailable = true;
+            }
+        }
+        _numSamplesInCurrentInterval++;  // one more gap recorded in the current interval
+
+        // if the current interval of samples is now full, record it in our interval maxes
+        if (_numSamplesInCurrentInterval == TIME_GAP_NUM_SAMPLES_IN_INTERVAL) {
+
+            // advance the slot index cyclically (wrapping to 0 after the last slot); the slot it lands on is overwritten
+            _newestIntervalMaxGapAt = _newestIntervalMaxGapAt == TIME_GAP_NUM_INTERVALS_IN_WINDOW - 1 ? 0 : _newestIntervalMaxGapAt + 1;
+
+            // record the current interval's max gap as the newest
+            _intervalMaxGaps[_newestIntervalMaxGapAt] = _currentIntervalMaxGap;
+
+            // recompute the window max from all stored interval maxes, so it can also shrink once a large gap has been overwritten
+            _windowMaxGap = 0;
+            for (int i = 0; i < TIME_GAP_NUM_INTERVALS_IN_WINDOW; i++) {
+                if (_intervalMaxGaps[i] > _windowMaxGap) {
+                    _windowMaxGap = _intervalMaxGaps[i];
+                }
+            }
+            _newWindowMaxGapAvailable = true;  // set unconditionally, even when the recomputed value is unchanged
+
+            // start a fresh interval: no gaps recorded yet and no max seen
+            _numSamplesInCurrentInterval = 0;
+            _currentIntervalMaxGap = 0;
+        }
+    }
+    _lastFrameReceivedTime = now;  // remembered for the next call's gap calculation
+}
+
+quint64 InterframeTimeGapStats::getWindowMaxGap() {  // returns the window max gap and marks it as consumed
+    _newWindowMaxGapAvailable = false;  // hasNewWindowMaxGapAvailable() now reports false until frameReceived() sets the flag again
+    return _windowMaxGap;
+}
+

 PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::Type type, bool isStereo) :
    AudioRingBuffer(isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL),
@ -29,9 +94,10 @@ PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::
    _shouldLoopbackForNode(false),
    _shouldOutputStarveDebug(true),
    _isStereo(isStereo),
-    _listenerUnattenuatedZone(NULL)
+    _listenerUnattenuatedZone(NULL),
+    _desiredJitterBufferFrames(1),
+    _currentJitterBufferFrames(0)
 {
-
 }

 int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
@ -54,13 +120,31 @@ int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
        readBytes += sizeof(int16_t);
        
        if (numSilentSamples > 0) {
-            addSilentFrame(numSilentSamples);
+            if (_currentJitterBufferFrames > _desiredJitterBufferFrames) {
+                // our current jitter buffer size exceeds its desired value, so ignore some silent
+                // frames to get that size as close to desired as possible
+                int samplesPerFrame = getSamplesPerFrame();
+                int numSilentFrames = numSilentSamples / samplesPerFrame;
+                int numFramesToDropDesired = _currentJitterBufferFrames - _desiredJitterBufferFrames;
+
+                if (numSilentFrames > numFramesToDropDesired) {
+                    // we have more than enough frames to drop to get the jitter buffer to its desired length
+                    int numSilentFramesToAdd = numSilentFrames - numFramesToDropDesired;
+                    addSilentFrame(numSilentFramesToAdd * samplesPerFrame);
+                    _currentJitterBufferFrames = _desiredJitterBufferFrames;
+
+                } else {
+                    // we need to drop all frames to get the jitter buffer as close as possible to its desired length
+                    _currentJitterBufferFrames -= numSilentFrames;
+                }
+            } else {
+                addSilentFrame(numSilentSamples);
+            }
        }
    } else {
        // there is audio data to read
        readBytes += writeData(packet.data() + readBytes, packet.size() - readBytes);
    }
-    
    return readBytes;
 }

@ -106,29 +190,54 @@ void PositionalAudioRingBuffer::updateNextOutputTrailingLoudness() {
    }
 }

-bool PositionalAudioRingBuffer::shouldBeAddedToMix(int numJitterBufferSamples) {
-    if (!isNotStarvedOrHasMinimumSamples(NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL + numJitterBufferSamples)) {
+bool PositionalAudioRingBuffer::shouldBeAddedToMix() {  // true when a frame may be taken from this buffer for mixing; also updates starve/jitter state
+    int samplesPerFrame = getSamplesPerFrame();
+    int desiredJitterBufferSamples = _desiredJitterBufferFrames * samplesPerFrame;  // desired jitter depth expressed in samples
+    
+    if (!isNotStarvedOrHasMinimumSamples(samplesPerFrame + desiredJitterBufferSamples)) {
+        // if the buffer was starved, allow it to accrue at least the desired number of
+        // jitter buffer frames before we start taking frames from it for mixing
+
         if (_shouldOutputStarveDebug) {
             _shouldOutputStarveDebug = false;
         }
-        
-        return false;
-    } else if (samplesAvailable() < NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL) {
+
+        return  false;
+    } else if (samplesAvailable() < samplesPerFrame) { 
+        // if the buffer doesn't have a full frame of samples to take for mixing, it is starved
         _isStarved = true;
         
+        // set to 0 to indicate the jitter buffer is starved
+        _currentJitterBufferFrames = 0;
+        
         // reset our _shouldOutputStarveDebug to true so the next is printed
         _shouldOutputStarveDebug = true;
-        
+
         return false;
-    } else {
-        // good buffer, add this to the mix
+    }
+    
+    // good buffer, add this to the mix
+    if (_isStarved) {
+        // if this buffer has just finished replenishing after being starved, the number of frames in it now
+        // minus one (since a frame will be read immediately after this) is the length of the jitter buffer
+        _currentJitterBufferFrames = samplesAvailable() / samplesPerFrame - 1;  // integer division: partial frames are not counted
         _isStarved = false;
-
-        // since we've read data from ring buffer at least once - we've started
-        _hasStarted = true;
-
-        return true;
     }
 
-    return false;
+    // since we've read data from ring buffer at least once - we've started
+    _hasStarted = true;
+
+    return true;
+}
+
+void PositionalAudioRingBuffer::updateDesiredJitterBufferFrames() {  // re-derives _desiredJitterBufferFrames from the latest window max gap
+
+    const float USECS_PER_FRAME = NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL * USECS_PER_SECOND / (float)SAMPLE_RATE;  // duration of one per-channel frame in usecs
+
+    if (_interframeTimeGapStats.hasNewWindowMaxGapAvailable()) {  // only recompute when the stats report a fresh value
+        _desiredJitterBufferFrames = ceilf((float)_interframeTimeGapStats.getWindowMaxGap() / USECS_PER_FRAME);  // frames needed to span the largest recent gap, rounded up
+        if (_desiredJitterBufferFrames < 1) {  // never ask for less than one frame of jitter buffer
+            _desiredJitterBufferFrames = 1;
+        }  // NOTE(review): only a lower bound is enforced here; no upper limit is applied to the desired frame count
+    }
 }
--- a/libraries/audio/src/PositionalAudioRingBuffer.h
+++ b/libraries/audio/src/PositionalAudioRingBuffer.h
@ -18,6 +18,31 @@

 #include "AudioRingBuffer.h"

+// with these values the window max is recomputed after every 500 recorded gaps and covers the 10 most recent 500-gap intervals
+const int TIME_GAP_NUM_SAMPLES_IN_INTERVAL = 500;  // inter-frame gaps recorded per interval
+const int TIME_GAP_NUM_INTERVALS_IN_WINDOW = 10;  // interval maxes retained for the window
+
+// tracks the time gaps between incoming frames so that the jitter buffer length can be varied to match them
+class InterframeTimeGapStats {
+public:
+    InterframeTimeGapStats();
+
+    void frameReceived();  // call once for every frame that arrives
+    bool hasNewWindowMaxGapAvailable() const { return _newWindowMaxGapAvailable; }
+    quint64 peekWindowMaxGap() const { return _windowMaxGap; }  // reads the window max without clearing the "new value" flag
+    quint64 getWindowMaxGap();  // reads the window max and clears the "new value" flag
+
+private:
+    quint64 _lastFrameReceivedTime;  // timestamp of the previous frame; 0 means none received yet
+
+    int _numSamplesInCurrentInterval;  // gaps recorded so far in the interval being filled
+    quint64 _currentIntervalMaxGap;  // largest gap seen so far in the interval being filled
+    quint64 _intervalMaxGaps[TIME_GAP_NUM_INTERVALS_IN_WINDOW];  // cyclic store of completed intervals' max gaps
+    int _newestIntervalMaxGapAt;  // index in _intervalMaxGaps of the most recently written slot
+    quint64 _windowMaxGap;  // current max gap reported for the window
+    bool _newWindowMaxGapAvailable;  // set when _windowMaxGap is updated, cleared by getWindowMaxGap()
+};
+
 class PositionalAudioRingBuffer : public AudioRingBuffer {
 public:
    enum Type {
@ -34,7 +59,7 @@ public:
    void updateNextOutputTrailingLoudness();
    float getNextOutputTrailingLoudness() const { return _nextOutputTrailingLoudness; }
    
-    bool shouldBeAddedToMix(int numJitterBufferSamples);
+    bool shouldBeAddedToMix();
    
    bool willBeAddedToMix() const { return _willBeAddedToMix; }
    void setWillBeAddedToMix(bool willBeAddedToMix) { _willBeAddedToMix = willBeAddedToMix; }
@ -50,10 +75,14 @@ public:
    AABox* getListenerUnattenuatedZone() const { return _listenerUnattenuatedZone; }
    void setListenerUnattenuatedZone(AABox* listenerUnattenuatedZone) { _listenerUnattenuatedZone = listenerUnattenuatedZone; }
    
+    int getSamplesPerFrame() const { return _isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL; }
+
 protected:
    // disallow copying of PositionalAudioRingBuffer objects
    PositionalAudioRingBuffer(const PositionalAudioRingBuffer&);
    PositionalAudioRingBuffer& operator= (const PositionalAudioRingBuffer&);
+
+    void updateDesiredJitterBufferFrames();
    
    PositionalAudioRingBuffer::Type _type;
    glm::vec3 _position;
@ -65,6 +94,10 @@ protected:
    
    float _nextOutputTrailingLoudness;
    AABox* _listenerUnattenuatedZone;
+
+    InterframeTimeGapStats _interframeTimeGapStats;
+    int _desiredJitterBufferFrames;
+    int _currentJitterBufferFrames;
 };

 #endif // hifi_PositionalAudioRingBuffer_h
--- a/libraries/octree/src/OctreeEditPacketSender.cpp
+++ b/libraries/octree/src/OctreeEditPacketSender.cpp
@ -354,7 +354,7 @@ void OctreeEditPacketSender::processNackPacket(const QByteArray& packet) {
    // read number of sequence numbers
    uint16_t numSequenceNumbers = (*(uint16_t*)dataAt);
    dataAt += sizeof(uint16_t);
-
+    
    // read sequence numbers and queue packets for resend
    for (int i = 0; i < numSequenceNumbers; i++) {
        unsigned short int sequenceNumber = (*(unsigned short int*)dataAt);