Merge pull request #3055 from wangyix/master

Audio jitter buffer now resizes based on the maximum inter-frame time gap of incoming audio packets.

Brad Hefta-Gaub committed on 2014-06-23 12:45:04 -07:00
commit 18cd794542

10 changed files with 188 additions and 48 deletions

File: AudioMixer.cpp

@@ -54,9 +54,6 @@
 #include "AudioMixer.h"
 
-const short JITTER_BUFFER_MSECS = 12;
-const short JITTER_BUFFER_SAMPLES = JITTER_BUFFER_MSECS * (SAMPLE_RATE / 1000.0);
-
 const float LOUDNESS_TO_DISTANCE_RATIO = 0.00001f;
 const QString AUDIO_MIXER_LOGGING_TARGET_NAME = "audio-mixer";

@@ -487,8 +484,7 @@ void AudioMixer::run() {
         foreach (const SharedNodePointer& node, nodeList->getNodeHash()) {
             if (node->getLinkedData()) {
-                ((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(JITTER_BUFFER_SAMPLES,
-                                                                                             _sourceUnattenuatedZone,
+                ((AudioMixerClientData*) node->getLinkedData())->checkBuffersBeforeFrameSend(_sourceUnattenuatedZone,
                                                                                              _listenerUnattenuatedZone);
             }
         }
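For context on what was deleted above: the old constants pinned every stream's jitter buffer to one fixed length. A back-of-envelope sketch of that policy, assuming the 24 kHz network sample rate this codebase used at the time:

    // the removed fixed-size policy, with SAMPLE_RATE assumed to be 24000 Hz
    const short JITTER_BUFFER_MSECS = 12;
    const short JITTER_BUFFER_SAMPLES = JITTER_BUFFER_MSECS * (24000 / 1000);  // = 288 samples
    // every stream carried ~12 ms of fixed latency, whether its network
    // jitter needed more headroom or could have tolerated less

After this commit, each PositionalAudioRingBuffer computes its own desired length from the packet gaps it actually observes.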

File: AudioMixerClientData.cpp

@@ -98,10 +98,9 @@ int AudioMixerClientData::parseData(const QByteArray& packet) {
     return 0;
 }
 
-void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
-                                                       AABox* checkSourceZone, AABox* listenerZone) {
+void AudioMixerClientData::checkBuffersBeforeFrameSend(AABox* checkSourceZone, AABox* listenerZone) {
     for (int i = 0; i < _ringBuffers.size(); i++) {
-        if (_ringBuffers[i]->shouldBeAddedToMix(jitterBufferLengthSamples)) {
+        if (_ringBuffers[i]->shouldBeAddedToMix()) {
             // this is a ring buffer that is ready to go
             // set its flag so we know to push its buffer when all is said and done
             _ringBuffers[i]->setWillBeAddedToMix(true);

@@ -120,20 +119,22 @@ void AudioMixerClientData::checkBuffersBeforeFrameSend(int jitterBufferLengthSam
     }
 }
 
 void AudioMixerClientData::pushBuffersAfterFrameSend() {
-    for (int i = 0; i < _ringBuffers.size(); i++) {
+    QList<PositionalAudioRingBuffer*>::iterator i = _ringBuffers.begin();
+    while (i != _ringBuffers.end()) {
         // this was a used buffer, push the output pointer forwards
-        PositionalAudioRingBuffer* audioBuffer = _ringBuffers[i];
+        PositionalAudioRingBuffer* audioBuffer = *i;
 
         if (audioBuffer->willBeAddedToMix()) {
-            audioBuffer->shiftReadPosition(audioBuffer->isStereo()
-                ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL);
+            audioBuffer->shiftReadPosition(audioBuffer->getSamplesPerFrame());
             audioBuffer->setWillBeAddedToMix(false);
         } else if (audioBuffer->getType() == PositionalAudioRingBuffer::Injector
                    && audioBuffer->hasStarted() && audioBuffer->isStarved()) {
             // this is an empty audio buffer that has starved, safe to delete
             delete audioBuffer;
-            _ringBuffers.erase(_ringBuffers.begin() + i);
+            i = _ringBuffers.erase(i);
+            continue;
         }
+        i++;
     }
 }
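Two things change in pushBuffersAfterFrameSend(): the frame size now comes from getSamplesPerFrame() instead of an inline stereo ternary, and the loop switches to the iterator-erase idiom. The old index loop had a subtle flaw: after erasing element i and then running i++, the element that slid into position i was skipped. QList::erase() returns an iterator to the next element, which is what the continue preserves. A minimal standalone sketch of the idiom (Widget is a hypothetical type):

    #include <QList>

    struct Widget { bool dead; };

    void pruneDead(QList<Widget*>& widgets) {
        QList<Widget*>::iterator i = widgets.begin();
        while (i != widgets.end()) {
            if ((*i)->dead) {
                delete *i;
                i = widgets.erase(i);  // erase() returns the next valid iterator
                continue;              // don't advance past it a second time
            }
            i++;
        }
    }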

File: AudioMixerClientData.h

@@ -27,8 +27,7 @@ public:
     AvatarAudioRingBuffer* getAvatarAudioRingBuffer() const;
 
     int parseData(const QByteArray& packet);
-    void checkBuffersBeforeFrameSend(int jitterBufferLengthSamples,
-                                     AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
+    void checkBuffersBeforeFrameSend(AABox* checkSourceZone = NULL, AABox* listenerZone = NULL);
     void pushBuffersAfterFrameSend();
 private:
     QList<PositionalAudioRingBuffer*> _ringBuffers;

File: AvatarAudioRingBuffer.cpp

@@ -19,6 +19,9 @@ AvatarAudioRingBuffer::AvatarAudioRingBuffer(bool isStereo) :
 }
 
 int AvatarAudioRingBuffer::parseData(const QByteArray& packet) {
+    _interframeTimeGapStats.frameReceived();
+    updateDesiredJitterBufferFrames();
+
     _shouldLoopbackForNode = (packetTypeForPacket(packet) == PacketTypeMicrophoneAudioWithEcho);
     return PositionalAudioRingBuffer::parseData(packet);
 }
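InjectedAudioRingBuffer::parseData() below gains the identical two lines, so avatar microphone streams and injected sound streams each maintain their own per-stream gap statistics. A minimal sketch of the shared shape (SomeAudioRingBuffer and its stub members are hypothetical stand-ins for the real classes):

    #include <QByteArray>

    class SomeAudioRingBuffer {
    public:
        int parseData(const QByteArray& packet) {
            _interframeTimeGapStats.frameReceived();  // measure the gap since the previous packet
            updateDesiredJitterBufferFrames();        // no-op unless a new window max is ready
            // ... type-specific parsing of the packet payload would follow ...
            return packet.size();
        }
    private:
        void updateDesiredJitterBufferFrames() { /* recompute the target length */ }
        struct GapStatsStub { void frameReceived() { /* record arrival time gap */ } };
        GapStatsStub _interframeTimeGapStats;
    };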

File: Audio.cpp

@@ -461,8 +461,8 @@ void Audio::handleAudioInput() {
         int16_t* inputAudioSamples = new int16_t[inputSamplesRequired];
         _inputRingBuffer.readSamples(inputAudioSamples, inputSamplesRequired);
 
-        int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
-        int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
+        const int numNetworkBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
+        const int numNetworkSamples = _isStereoInput ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
 
         // zero out the monoAudioSamples array and the locally injected audio
         memset(networkAudioSamples, 0, numNetworkBytes);

@@ -634,12 +634,10 @@ void Audio::handleAudioInput() {
             packetType = PacketTypeSilentAudioFrame;
 
             // we need to indicate how many silent samples this is to the audio mixer
-            audioDataPacket[0] = _isStereoInput
-                ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO
-                : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL;
+            networkAudioSamples[0] = numNetworkSamples;
             numAudioBytes = sizeof(int16_t);
         } else {
-            numAudioBytes = _isStereoInput ? NETWORK_BUFFER_LENGTH_BYTES_STEREO : NETWORK_BUFFER_LENGTH_BYTES_PER_CHANNEL;
+            numAudioBytes = numNetworkBytes;
 
             if (Menu::getInstance()->isOptionChecked(MenuOption::EchoServerAudio)) {
                 packetType = PacketTypeMicrophoneAudioWithEcho;
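A silent frame therefore travels as a single int16_t whose value is the full frame's sample count; the mixer reads that count back in PositionalAudioRingBuffer::parseData() (shown later in this commit). The old code appears to have written the count through audioDataPacket, a byte pointer, where values like 512 cannot fit. A hedged round-trip sketch, assuming 1024-sample stereo frames:

    #include <cstdint>
    #include <cstring>

    int main() {
        // sender side: the first sample slot of the otherwise-empty frame carries the count
        int16_t networkAudioSamples[1024] = {};      // assumed stereo frame of 1024 samples
        int16_t numNetworkSamples = 1024;
        networkAudioSamples[0] = numNetworkSamples;  // the payload sent is just this one int16_t

        // receiver side, as in PositionalAudioRingBuffer::parseData(): read the count back
        int16_t numSilentSamples;
        std::memcpy(&numSilentSamples, networkAudioSamples, sizeof(int16_t));
        return numSilentSamples == 1024 ? 0 : 1;     // 0: the round trip preserved the count
    }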

File: AudioRingBuffer.cpp

@@ -124,17 +124,15 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
     std::less<int16_t*> less;
     std::less_equal<int16_t*> lessEqual;
 
-    if (_hasStarted
-        && (less(_endOfLastWrite, _nextOutput)
-            && lessEqual(_nextOutput, shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy)))) {
+    if (_hasStarted && samplesToCopy > _sampleCapacity - samplesAvailable()) {
         // this read will cross the next output, so call us starved and reset the buffer
         qDebug() << "Filled the ring buffer. Resetting.";
         _endOfLastWrite = _buffer;
         _nextOutput = _buffer;
         _isStarved = true;
     }
 
     if (_endOfLastWrite + samplesToCopy <= _buffer + _sampleCapacity) {
         memcpy(_endOfLastWrite, data, samplesToCopy * sizeof(int16_t));
     } else {

@@ -144,7 +142,7 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
     }
 
     _endOfLastWrite = shiftedPositionAccomodatingWrap(_endOfLastWrite, samplesToCopy);
 
     return samplesToCopy * sizeof(int16_t);
 }
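The rewritten guard drops the wrapped-pointer comparisons in favor of plain capacity arithmetic: a write clobbers unread samples exactly when it needs more room than the buffer has free. A standalone sketch with made-up numbers:

    #include <iostream>

    int main() {
        // assumed numbers, for illustration only
        int sampleCapacity = 5120;      // e.g. ten mono frames of 512 samples
        int samplesAvailable = 4608;    // unread samples currently in the buffer
        int samplesToCopy = 1024;       // an incoming stereo frame

        // the write would clobber unread data iff it needs more than the free space
        bool wouldOverwriteUnread = samplesToCopy > sampleCapacity - samplesAvailable;
        std::cout << std::boolalpha << wouldOverwriteUnread << "\n";  // true: reset and starve
        return 0;
    }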

File: InjectedAudioRingBuffer.cpp

@@ -31,6 +31,9 @@ InjectedAudioRingBuffer::InjectedAudioRingBuffer(const QUuid& streamIdentifier)
 const uchar MAX_INJECTOR_VOLUME = 255;
 
 int InjectedAudioRingBuffer::parseData(const QByteArray& packet) {
+    _interframeTimeGapStats.frameReceived();
+    updateDesiredJitterBufferFrames();
+
     // setup a data stream to read from this packet
     QDataStream packetStream(packet);
     packetStream.skipRawData(numBytesForPacketHeader(packet));

File: PositionalAudioRingBuffer.cpp

@@ -19,6 +19,71 @@
 #include <UUID.h>
 
 #include "PositionalAudioRingBuffer.h"
+#include "SharedUtil.h"
+
+InterframeTimeGapStats::InterframeTimeGapStats()
+    : _lastFrameReceivedTime(0),
+    _numSamplesInCurrentInterval(0),
+    _currentIntervalMaxGap(0),
+    _newestIntervalMaxGapAt(0),
+    _windowMaxGap(0),
+    _newWindowMaxGapAvailable(false)
+{
+    memset(_intervalMaxGaps, 0, TIME_GAP_NUM_INTERVALS_IN_WINDOW * sizeof(quint64));
+}
+
+void InterframeTimeGapStats::frameReceived() {
+    quint64 now = usecTimestampNow();
+
+    // make sure this isn't the first time frameReceived() is called, so we can actually calculate a gap.
+    if (_lastFrameReceivedTime != 0) {
+        quint64 gap = now - _lastFrameReceivedTime;
+
+        // update the current interval max
+        if (gap > _currentIntervalMaxGap) {
+            _currentIntervalMaxGap = gap;
+
+            // keep the window max gap at least as large as the current interval max;
+            // this allows the window max gap to respond immediately to a sudden spike in gap times
+            // and also prevents the window max gap from staying at 0 until the first interval of samples fills up
+            if (_currentIntervalMaxGap > _windowMaxGap) {
+                _windowMaxGap = _currentIntervalMaxGap;
+                _newWindowMaxGapAvailable = true;
+            }
+        }
+        _numSamplesInCurrentInterval++;
+
+        // if the current interval of samples is now full, record it in our interval maxes
+        if (_numSamplesInCurrentInterval == TIME_GAP_NUM_SAMPLES_IN_INTERVAL) {
+
+            // find the location to insert this interval's max (increment index cyclically)
+            _newestIntervalMaxGapAt = _newestIntervalMaxGapAt == TIME_GAP_NUM_INTERVALS_IN_WINDOW - 1 ? 0 : _newestIntervalMaxGapAt + 1;
+
+            // record the current interval's max gap as the newest
+            _intervalMaxGaps[_newestIntervalMaxGapAt] = _currentIntervalMaxGap;
+
+            // update the window max gap, which is the max out of all the past intervals' max gaps
+            _windowMaxGap = 0;
+            for (int i = 0; i < TIME_GAP_NUM_INTERVALS_IN_WINDOW; i++) {
+                if (_intervalMaxGaps[i] > _windowMaxGap) {
+                    _windowMaxGap = _intervalMaxGaps[i];
+                }
+            }
+            _newWindowMaxGapAvailable = true;
+
+            // reset the current interval
+            _numSamplesInCurrentInterval = 0;
+            _currentIntervalMaxGap = 0;
+        }
+    }
+    _lastFrameReceivedTime = now;
+}
+
+quint64 InterframeTimeGapStats::getWindowMaxGap() {
+    _newWindowMaxGapAvailable = false;
+    return _windowMaxGap;
+}
+
 PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::Type type, bool isStereo) :
     AudioRingBuffer(isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL),

@@ -29,9 +94,10 @@ PositionalAudioRingBuffer::PositionalAudioRingBuffer(PositionalAudioRingBuffer::
     _shouldLoopbackForNode(false),
     _shouldOutputStarveDebug(true),
     _isStereo(isStereo),
-    _listenerUnattenuatedZone(NULL)
+    _listenerUnattenuatedZone(NULL),
+    _desiredJitterBufferFrames(1),
+    _currentJitterBufferFrames(0)
 {
 }
 
 int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
@@ -54,13 +120,31 @@ int PositionalAudioRingBuffer::parseData(const QByteArray& packet) {
         readBytes += sizeof(int16_t);
 
         if (numSilentSamples > 0) {
-            addSilentFrame(numSilentSamples);
+            if (_currentJitterBufferFrames > _desiredJitterBufferFrames) {
+                // our current jitter buffer size exceeds its desired value, so ignore some silent
+                // frames to get that size as close to desired as possible
+                int samplesPerFrame = getSamplesPerFrame();
+                int numSilentFrames = numSilentSamples / samplesPerFrame;
+                int numFramesToDropDesired = _currentJitterBufferFrames - _desiredJitterBufferFrames;
+
+                if (numSilentFrames > numFramesToDropDesired) {
+                    // we have more than enough frames to drop to get the jitter buffer to its desired length
+                    int numSilentFramesToAdd = numSilentFrames - numFramesToDropDesired;
+                    addSilentFrame(numSilentFramesToAdd * samplesPerFrame);
+                    _currentJitterBufferFrames = _desiredJitterBufferFrames;
+                } else {
+                    // we need to drop all of these frames to get the jitter buffer as close as possible to its desired length
+                    _currentJitterBufferFrames -= numSilentFrames;
+                }
+            } else {
+                addSilentFrame(numSilentSamples);
+            }
         }
     } else {
         // there is audio data to read
         readBytes += writeData(packet.data() + readBytes, packet.size() - readBytes);
     }
     return readBytes;
 }
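Silent frames are the only material the mixer can shorten without losing audio, so they are where an over-long jitter buffer gets bled back down. A worked example with hypothetical numbers:

    int main() {
        // hypothetical state: the jitter buffer holds 5 frames, the target is 2, and a
        // silent packet arrives representing 4 frames (2048 samples at 512 per frame)
        int samplesPerFrame = 512;
        int numSilentSamples = 2048;
        int currentJitterBufferFrames = 5;
        int desiredJitterBufferFrames = 2;

        int numSilentFrames = numSilentSamples / samplesPerFrame;                           // = 4
        int numFramesToDropDesired = currentJitterBufferFrames - desiredJitterBufferFrames; // = 3

        // 4 > 3: buffer only the surplus frame and land exactly on the target length
        int numSilentFramesToAdd = numSilentFrames - numFramesToDropDesired;                // = 1
        return numSilentFramesToAdd;
    }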
@@ -106,29 +190,54 @@ void PositionalAudioRingBuffer::updateNextOutputTrailingLoudness() {
         }
     }
 }
 
-bool PositionalAudioRingBuffer::shouldBeAddedToMix(int numJitterBufferSamples) {
-    if (!isNotStarvedOrHasMinimumSamples(NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL + numJitterBufferSamples)) {
+bool PositionalAudioRingBuffer::shouldBeAddedToMix() {
+    int samplesPerFrame = getSamplesPerFrame();
+    int desiredJitterBufferSamples = _desiredJitterBufferFrames * samplesPerFrame;
+
+    if (!isNotStarvedOrHasMinimumSamples(samplesPerFrame + desiredJitterBufferSamples)) {
+        // if the buffer was starved, allow it to accrue at least the desired number of
+        // jitter buffer frames before we start taking frames from it for mixing
         if (_shouldOutputStarveDebug) {
             _shouldOutputStarveDebug = false;
         }
 
         return false;
-    } else if (samplesAvailable() < NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL) {
+    } else if (samplesAvailable() < samplesPerFrame) {
+        // if the buffer doesn't have a full frame of samples to take for mixing, it is starved
         _isStarved = true;
 
+        // set to 0 to indicate the jitter buffer is starved
+        _currentJitterBufferFrames = 0;
 
         // reset our _shouldOutputStarveDebug to true so the next is printed
         _shouldOutputStarveDebug = true;
 
         return false;
-    } else {
-        // good buffer, add this to the mix
+    }
+
+    // good buffer, add this to the mix
+    if (_isStarved) {
+        // if this buffer has just finished replenishing after being starved, the number of frames in it now
+        // minus one (since a frame will be read immediately after this) is the length of the jitter buffer
+        _currentJitterBufferFrames = samplesAvailable() / samplesPerFrame - 1;
         _isStarved = false;
-
-        // since we've read data from ring buffer at least once - we've started
-        _hasStarted = true;
-
-        return true;
     }
-    return false;
+
+    // since we've read data from ring buffer at least once - we've started
+    _hasStarted = true;
+    return true;
+}
+
+void PositionalAudioRingBuffer::updateDesiredJitterBufferFrames() {
+    const float USECS_PER_FRAME = NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL * USECS_PER_SECOND / (float)SAMPLE_RATE;
+
+    if (_interframeTimeGapStats.hasNewWindowMaxGapAvailable()) {
+        _desiredJitterBufferFrames = ceilf((float)_interframeTimeGapStats.getWindowMaxGap() / USECS_PER_FRAME);
+        if (_desiredJitterBufferFrames < 1) {
+            _desiredJitterBufferFrames = 1;
+        }
+    }
 }
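The desired length is simply the window max gap expressed in whole frames, rounded up and clamped to at least one. A standalone sketch of the arithmetic, assuming the 24 kHz sample rate and 512-sample mono frames this codebase used elsewhere at the time:

    #include <cmath>
    #include <cstdio>

    int main() {
        // assumed constants: 512 samples per mono frame at 24000 Hz
        const float USECS_PER_FRAME = 512.0f * 1000000.0f / 24000.0f;  // ~21333.3 us

        unsigned long long windowMaxGap = 50000;  // worst gap seen in the window: 50 ms
        int desiredFrames = (int)ceilf((float)windowMaxGap / USECS_PER_FRAME);
        if (desiredFrames < 1) {
            desiredFrames = 1;  // never shrink below one frame
        }
        printf("desired jitter buffer frames: %d\n", desiredFrames);  // prints 3
        return 0;
    }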

File: PositionalAudioRingBuffer.h

@@ -18,6 +18,31 @@
 
 #include "AudioRingBuffer.h"
 
+// this means that every 500 samples, the max for the past 10*500 samples will be calculated
+const int TIME_GAP_NUM_SAMPLES_IN_INTERVAL = 500;
+const int TIME_GAP_NUM_INTERVALS_IN_WINDOW = 10;
+
+// class used to track time between incoming frames for the purpose of varying the jitter buffer length
+class InterframeTimeGapStats {
+public:
+    InterframeTimeGapStats();
+
+    void frameReceived();
+    bool hasNewWindowMaxGapAvailable() const { return _newWindowMaxGapAvailable; }
+    quint64 peekWindowMaxGap() const { return _windowMaxGap; }
+    quint64 getWindowMaxGap();
+
+private:
+    quint64 _lastFrameReceivedTime;
+
+    int _numSamplesInCurrentInterval;
+    quint64 _currentIntervalMaxGap;
+    quint64 _intervalMaxGaps[TIME_GAP_NUM_INTERVALS_IN_WINDOW];
+    int _newestIntervalMaxGapAt;
+
+    quint64 _windowMaxGap;
+    bool _newWindowMaxGapAvailable;
+};
+
 class PositionalAudioRingBuffer : public AudioRingBuffer {
 public:
     enum Type {

@@ -34,7 +59,7 @@ public:
     void updateNextOutputTrailingLoudness();
     float getNextOutputTrailingLoudness() const { return _nextOutputTrailingLoudness; }
 
-    bool shouldBeAddedToMix(int numJitterBufferSamples);
+    bool shouldBeAddedToMix();
 
     bool willBeAddedToMix() const { return _willBeAddedToMix; }
     void setWillBeAddedToMix(bool willBeAddedToMix) { _willBeAddedToMix = willBeAddedToMix; }

@@ -50,10 +75,14 @@ public:
     AABox* getListenerUnattenuatedZone() const { return _listenerUnattenuatedZone; }
     void setListenerUnattenuatedZone(AABox* listenerUnattenuatedZone) { _listenerUnattenuatedZone = listenerUnattenuatedZone; }
 
+    int getSamplesPerFrame() const { return _isStereo ? NETWORK_BUFFER_LENGTH_SAMPLES_STEREO : NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL; }
+
 protected:
     // disallow copying of PositionalAudioRingBuffer objects
     PositionalAudioRingBuffer(const PositionalAudioRingBuffer&);
     PositionalAudioRingBuffer& operator= (const PositionalAudioRingBuffer&);
 
+    void updateDesiredJitterBufferFrames();
+
     PositionalAudioRingBuffer::Type _type;
     glm::vec3 _position;

@@ -65,6 +94,10 @@ protected:
     float _nextOutputTrailingLoudness;
     AABox* _listenerUnattenuatedZone;
 
+    InterframeTimeGapStats _interframeTimeGapStats;
+    int _desiredJitterBufferFrames;
+    int _currentJitterBufferFrames;
 };
 
 #endif // hifi_PositionalAudioRingBuffer_h
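One subtlety in this API: peekWindowMaxGap() is a const inspector, while getWindowMaxGap() also clears the new-value flag, so a caller polling hasNewWindowMaxGapAvailable() acts exactly once per window update. A sketch of the intended polling pattern, mirroring updateDesiredJitterBufferFrames() (onPacketArrived() is a hypothetical call site):

    #include <QtGlobal>

    InterframeTimeGapStats stats;

    void onPacketArrived() {
        stats.frameReceived();                      // once per incoming packet
        if (stats.hasNewWindowMaxGapAvailable()) {
            quint64 gap = stats.getWindowMaxGap();  // read the max and clear the flag
            // ... recompute the desired jitter buffer length from gap ...
        }
    }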

File: OctreeEditPacketSender.cpp

@@ -354,7 +354,7 @@ void OctreeEditPacketSender::processNackPacket(const QByteArray& packet) {
     // read number of sequence numbers
     uint16_t numSequenceNumbers = (*(uint16_t*)dataAt);
     dataAt += sizeof(uint16_t);
 
     // read sequence numbers and queue packets for resend
     for (int i = 0; i < numSequenceNumbers; i++) {
         unsigned short int sequenceNumber = (*(unsigned short int*)dataAt);