Merge pull request #9842 from ZappoMan/tellCodecAboutSilence

improved noise gate
Brad Hefta-Gaub 2017-03-09 11:04:22 -08:00 committed by GitHub
commit 45efb235e3
4 changed files with 42 additions and 9 deletions

AudioClient.cpp

@@ -1052,7 +1052,12 @@ void AudioClient::handleAudioInput() {
auto packetType = _shouldEchoToServer ?
PacketType::MicrophoneAudioWithEcho : PacketType::MicrophoneAudioNoEcho;
if (_lastInputLoudness == 0) {
// if the _inputGate closed in this last frame, then we don't actually want
// to send a silent packet; instead, we want to go ahead and encode and send
// the output from the input gate (eventually, this could be crossfaded)
// and allow the codec to properly encode down to silent/zero. If we still
// have a _lastInputLoudness of 0 in our NEXT frame, we will send a silent packet.
if (_lastInputLoudness == 0 && !_inputGate.closedInLastFrame()) {
packetType = PacketType::SilentAudioFrame;
}
Transform audioTransform;
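
In effect, the sender now waits one extra frame before switching to silent packets. A minimal sketch of that decision as a free function (the free-function shape is hypothetical; the real logic is inline in AudioClient::handleAudioInput(), and the names come from the diff):

```cpp
// Sketch only: mirrors the packet-type decision above.
PacketType choosePacketType(bool shouldEchoToServer, float lastInputLoudness,
                            const AudioNoiseGate& inputGate) {
    PacketType packetType = shouldEchoToServer
        ? PacketType::MicrophoneAudioWithEcho
        : PacketType::MicrophoneAudioNoEcho;
    // On the frame in which the gate closes, still send encoded audio so the
    // codec can ramp its internal state down to silence; only frames after
    // that are flagged as silent.
    if (lastInputLoudness == 0 && !inputGate.closedInLastFrame()) {
        packetType = PacketType::SilentAudioFrame;
    }
    return packetType;
}
```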

AudioNoiseGate.cpp

@@ -58,7 +58,6 @@ void AudioNoiseGate::removeDCOffset(int16_t* samples, int numSamples) {
}
}
void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) {
//
// Impose Noise Gate
@@ -77,8 +76,7 @@ void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) {
// NOISE_GATE_FRAMES_TO_AVERAGE: How many audio frames should we average together to compute noise floor.
// More means better rejection but also can reject continuous things like singing.
// NUMBER_OF_NOISE_SAMPLE_FRAMES: How often should we re-evaluate the noise floor?
float loudness = 0;
int thisSample = 0;
int samplesOverNoiseGate = 0;
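
For context, the noise floor these constants describe comes from averaging the quietest of the recently sampled frames. A simplified sketch of that idea (an approximation of the averaging described above, not the verbatim implementation):

```cpp
#include <algorithm>
#include <vector>

// Approximate the noise floor as the average of the quietest recent frames,
// so sustained loud input (e.g. singing) does not drag the floor upward.
float estimateNoiseFloor(const std::vector<float>& frameLoudness, int framesToAverage) {
    std::vector<float> sorted(frameLoudness);
    std::sort(sorted.begin(), sorted.end());
    int count = std::min<int>(framesToAverage, static_cast<int>(sorted.size()));
    float sum = 0.0f;
    for (int i = 0; i < count; i++) {
        sum += sorted[i];   // accumulate only the quietest frames
    }
    return count > 0 ? sum / count : 0.0f;
}
```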
@@ -142,11 +140,13 @@ void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) {
_sampleCounter = 0;
}
if (samplesOverNoiseGate > NOISE_GATE_WIDTH) {
_isOpen = true;
_framesToClose = NOISE_GATE_CLOSE_FRAME_DELAY;
} else {
if (--_framesToClose == 0) {
_closedInLastFrame = !_isOpen;
_isOpen = false;
}
}
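
The open/close bookkeeping uses hysteresis: the gate opens as soon as enough samples exceed the floor, but closes only after NOISE_GATE_CLOSE_FRAME_DELAY quiet frames. A condensed sketch of the behavior the new accessor's name implies, assuming gateSamples() runs once per frame (member names match the class; the edge-latching expression here follows the accessor's intent rather than quoting the diff line verbatim):

```cpp
// Sketch of the per-frame gate state update. _closedInLastFrame latches the
// open -> closed transition so AudioClient can observe it for exactly one frame.
void updateGate(bool samplesOverGate) {
    _closedInLastFrame = false;
    if (samplesOverGate) {
        _isOpen = true;
        _framesToClose = NOISE_GATE_CLOSE_FRAME_DELAY;   // re-arm the close delay
    } else if (--_framesToClose == 0) {
        _closedInLastFrame = _isOpen;   // true only if the gate was open until now
        _isOpen = false;
    }
}
```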

AudioNoiseGate.h

@@ -24,6 +24,7 @@ public:
void removeDCOffset(int16_t* samples, int numSamples);
bool clippedInLastFrame() const { return _didClipInLastFrame; }
bool closedInLastFrame() const { return _closedInLastFrame; }
float getMeasuredFloor() const { return _measuredFloor; }
float getLastLoudness() const { return _lastLoudness; }
@@ -40,6 +41,7 @@ private:
float _sampleFrames[NUMBER_OF_NOISE_SAMPLE_FRAMES];
int _sampleCounter;
bool _isOpen;
bool _closedInLastFrame { false };
int _framesToClose;
};

InboundAudioStream.cpp

@@ -136,9 +136,10 @@ int InboundAudioStream::parseData(ReceivedMessage& message) {
break;
}
case SequenceNumberStats::Early: {
// Packet is early; write droppable silent samples for each of the skipped packets.
// NOTE: we assume that each dropped packet contains the same number of samples
// as the packet we just received.
// Packet is early. Treat it as if all the packets between the last
// on-time packet and this one were lost. If we're using a codec, this
// also lets the codec interpolate the lost data. Then fall through to
// the "on time" logic to actually handle this packet.
int packetsDropped = arrivalInfo._seqDiffFromExpected;
lostAudioData(packetsDropped);
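
The branch above leans on lostAudioData(), added elsewhere in this file, to feed the gap to the codec. A hedged sketch of what that helper plausibly does based on the comments in this diff (the return type, AudioConstants name, and ring-buffer write are assumptions):

```cpp
// Sketch: synthesize each missing network frame. With a codec, ask the decoder
// to interpolate (fading toward silence); without one, queue true PCM silence.
void InboundAudioStream::lostAudioData(int numPackets) {
    QByteArray decodedBuffer;
    while (numPackets--) {
        if (_decoder) {
            _decoder->lostFrame(decodedBuffer);    // codec fills in the gap
        } else {
            decodedBuffer.resize(AudioConstants::NETWORK_FRAME_BYTES_STEREO);
            decodedBuffer.fill(0);                 // raw PCM: exact silence
        }
        _ringBuffer.writeData(decodedBuffer.data(), decodedBuffer.size());
    }
}
```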
@@ -147,7 +148,8 @@
case SequenceNumberStats::OnTime: {
// Packet is on time; parse its data to the ringbuffer
if (message.getType() == PacketType::SilentAudioFrame) {
// FIXME - Some codecs need to know about these silent frames... and can produce better output
// If we received a SilentAudioFrame from our sender, we might want to drop
// some of the samples in order to catch up to our desired jitter buffer size.
writeDroppableSilentFrames(networkFrames);
} else {
// note: PCM and no codec are identical
@@ -158,7 +160,12 @@
parseAudioData(message.getType(), afterProperties);
} else {
qDebug(audio) << "Codec mismatch: expected" << _selectedCodecName << "got" << codecInPacket << "writing silence";
writeDroppableSilentFrames(networkFrames);
// Since the data in the stream uses a codec that we aren't prepared for,
// we need to let the codec know that we don't have data for it; this will
// allow the codec to interpolate the missing data and produce a fade to silence.
lostAudioData(1);
// inform others of the mismatch
auto sendingNode = DependencyManager::get<NodeList>()->nodeWithUUID(message.getSourceID());
emit mismatchedAudioCodec(sendingNode, _selectedCodecName, codecInPacket);
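
For completeness, a hypothetical listener for the mismatch signal emitted above; the connection site and slot body are illustrative only:

```cpp
// Hypothetical hookup: whoever owns the stream can react to the mismatch,
// e.g. by logging it or re-negotiating a codec with the sending node.
QObject::connect(stream, &InboundAudioStream::mismatchedAudioCodec,
                 [](SharedNodePointer sendingNode, const QString& currentCodec,
                    const QString& receivedCodec) {
    qDebug() << "Codec mismatch from node" << sendingNode->getUUID()
             << ": have" << currentCodec << ", packet used" << receivedCodec;
});
```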
@@ -240,6 +247,25 @@ int InboundAudioStream::parseAudioData(PacketType type, const QByteArray& packet
int InboundAudioStream::writeDroppableSilentFrames(int silentFrames) {
// We can't guarantee that all clients have faded the stream down
// to silence and encoded that silence before sending us a
// SilentAudioFrame. If the encoder has truncated the stream, it will
// leave the decoder holding some unknown loud state. To handle this
// case we will call the decoder's lostFrame() method, which indicates
// that it should interpolate from its last known state down toward
// silence.
if (_decoder) {
// FIXME - We could potentially use the output from the codec, in which
// case we might get a cleaner fade toward silence. NOTE: The below logic
// attempts to catch up in the event that the jitter buffers have grown.
// The better long-term fix is to use the output from the decoder, detect
// when it actually reaches silence, and then delete the silent portions
// of the jitter buffers. Or potentially do a crossfade from the decoder
// output to silence.
QByteArray decodedBuffer;
_decoder->lostFrame(decodedBuffer);
}
// calculate how many silent frames we should drop.
int silentSamples = silentFrames * _numChannels;
int samplesPerFrame = _ringBuffer.getNumFrameSamples();
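
The excerpt cuts off here, but the values just computed set up the catch-up step the earlier comment describes: when the jitter buffer has grown past its target, some of the incoming silence is dropped rather than queued. A hedged sketch of that step with illustrative names (not the verbatim continuation of the function):

```cpp
#include <algorithm>

// Sketch: decide how many of the received silent frames to actually queue.
// Dropping the excess shrinks an over-grown jitter buffer without audible
// artifacts, since the dropped content is silence anyway.
int silentFramesToWrite(int framesReceived, int currentJitterFrames,
                        int desiredJitterFrames) {
    int excess = std::max(0, currentJitterFrames - desiredJitterFrames);
    int framesToDrop = std::min(framesReceived, excess);
    return framesReceived - framesToDrop;
}
```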