From 1bdad89cbe2bd95b1af910ee6d4c10d232c86012 Mon Sep 17 00:00:00 2001 From: Brad Hefta-Gaub Date: Wed, 8 Mar 2017 17:27:46 -0800 Subject: [PATCH 1/5] properly handle silent packet transitions --- libraries/audio-client/src/AudioClient.cpp | 7 +++- libraries/audio-client/src/AudioNoiseGate.cpp | 7 +++- libraries/audio-client/src/AudioNoiseGate.h | 5 +++ libraries/audio/src/InboundAudioStream.cpp | 34 ++++++++++++++++--- 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/libraries/audio-client/src/AudioClient.cpp b/libraries/audio-client/src/AudioClient.cpp index bd141cfb12..9cd87d2e70 100644 --- a/libraries/audio-client/src/AudioClient.cpp +++ b/libraries/audio-client/src/AudioClient.cpp @@ -1052,7 +1052,12 @@ void AudioClient::handleAudioInput() { auto packetType = _shouldEchoToServer ? PacketType::MicrophoneAudioWithEcho : PacketType::MicrophoneAudioNoEcho; - if (_lastInputLoudness == 0) { + // if the _inputGate closed in this last frame, then we don't actually want + // to send a silent packet, instead, we want to go ahead and encode and send + // the output from the input gate (eventually, this could be crossfaded) + // and allow the codec to properly encode down to silent/zero. If we still + // have _lastInputLoudness of 0 in our NEXT frame, we will send a silent packet + if (_lastInputLoudness == 0 && !_inputGate.closedInLastFrame()) { packetType = PacketType::SilentAudioFrame; } Transform audioTransform; diff --git a/libraries/audio-client/src/AudioNoiseGate.cpp b/libraries/audio-client/src/AudioNoiseGate.cpp index 8766a20cdf..9458f47d8c 100644 --- a/libraries/audio-client/src/AudioNoiseGate.cpp +++ b/libraries/audio-client/src/AudioNoiseGate.cpp @@ -58,6 +58,7 @@ void AudioNoiseGate::removeDCOffset(int16_t* samples, int numSamples) { } } +#include void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) { // @@ -77,7 +78,8 @@ void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) { // NOISE_GATE_FRAMES_TO_AVERAGE: How many audio frames should we average together to compute noise floor. // More means better rejection but also can reject continuous things like singing. // NUMBER_OF_NOISE_SAMPLE_FRAMES: How often should we re-evaluate the noise floor? - + + _closedInLastFrame = false; float loudness = 0; int thisSample = 0; @@ -147,6 +149,9 @@ void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) { _framesToClose = NOISE_GATE_CLOSE_FRAME_DELAY; } else { if (--_framesToClose == 0) { + if (_isOpen) { + _closedInLastFrame = true; + } _isOpen = false; } } diff --git a/libraries/audio-client/src/AudioNoiseGate.h b/libraries/audio-client/src/AudioNoiseGate.h index 8cb1155938..18b5a77056 100644 --- a/libraries/audio-client/src/AudioNoiseGate.h +++ b/libraries/audio-client/src/AudioNoiseGate.h @@ -24,6 +24,10 @@ public: void removeDCOffset(int16_t* samples, int numSamples); bool clippedInLastFrame() const { return _didClipInLastFrame; } + + bool closedInLastFrame() const { return _closedInLastFrame; } + + bool isOpen() const { return _isOpen; } float getMeasuredFloor() const { return _measuredFloor; } float getLastLoudness() const { return _lastLoudness; } @@ -40,6 +44,7 @@ private: float _sampleFrames[NUMBER_OF_NOISE_SAMPLE_FRAMES]; int _sampleCounter; bool _isOpen; + bool _closedInLastFrame { false }; int _framesToClose; }; diff --git a/libraries/audio/src/InboundAudioStream.cpp b/libraries/audio/src/InboundAudioStream.cpp index 57c344adaf..12181cb8e2 100644 --- a/libraries/audio/src/InboundAudioStream.cpp +++ b/libraries/audio/src/InboundAudioStream.cpp @@ -136,9 +136,10 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { break; } case SequenceNumberStats::Early: { - // Packet is early; write droppable silent samples for each of the skipped packets. - // NOTE: we assume that each dropped packet contains the same number of samples - // as the packet we just received. + // Packet is early treat the packets as if all the packets between the last + // OnTime packet and this packet was lost. If we're using a codec this will + // also result in allowing the codec to flush its internal state. Then + // fall through to the "on time" logic to actually handle this packet int packetsDropped = arrivalInfo._seqDiffFromExpected; lostAudioData(packetsDropped); @@ -147,7 +148,10 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { case SequenceNumberStats::OnTime: { // Packet is on time; parse its data to the ringbuffer if (message.getType() == PacketType::SilentAudioFrame) { - // FIXME - Some codecs need to know about these silent frames... and can produce better output + // If we recieved a SilentAudioFrame from our sender, we might want to drop + // some of the samples in order to catch up to our desired jitter buffer size. + // NOTE: If we're using a codec we will be calling the codec's lostFrame() + // method to allow the codec to flush its internal state. writeDroppableSilentFrames(networkFrames); } else { // note: PCM and no codec are identical @@ -158,7 +162,12 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { parseAudioData(message.getType(), afterProperties); } else { qDebug(audio) << "Codec mismatch: expected" << _selectedCodecName << "got" << codecInPacket << "writing silence"; - writeDroppableSilentFrames(networkFrames); + + // Since the data in the stream is using a codec that we're not prepared for, + // we need to let the codec know that we don't have data for it, this will + // flush any internal codec state and produce fade to silence. + lostAudioData(1); + // inform others of the mismatch auto sendingNode = DependencyManager::get()->nodeWithUUID(message.getSourceID()); emit mismatchedAudioCodec(sendingNode, _selectedCodecName, codecInPacket); @@ -240,6 +249,21 @@ int InboundAudioStream::parseAudioData(PacketType type, const QByteArray& packet int InboundAudioStream::writeDroppableSilentFrames(int silentFrames) { + // if we have a decoder, we still want to tell the decoder about our + // lost frame. this will flush the internal state of the decoder + // we can safely ignore the output of the codec in this case, because + // we've enforced that on the sending side, the encoder ran at least + // one frame of truly silent audio before we sent the "droppable" silent + // frame. Technically we could leave this out, if we know for certain + // that the sender has really sent us an encoded packet of zeros, but + // since we can trust all encoders to always encode at least one silent + // frame (open source, someone code modify it), we will go ahead and + // tell our decoder about the lost frame. + if (_decoder) { + QByteArray decodedBuffer; + _decoder->lostFrame(decodedBuffer); + } + // calculate how many silent frames we should drop. int silentSamples = silentFrames * _numChannels; int samplesPerFrame = _ringBuffer.getNumFrameSamples(); From 60daaba52511e79eac826685df77893c50c1eb2d Mon Sep 17 00:00:00 2001 From: Brad Hefta-Gaub Date: Wed, 8 Mar 2017 17:30:44 -0800 Subject: [PATCH 2/5] small cleanup --- libraries/audio-client/src/AudioNoiseGate.cpp | 2 -- libraries/audio-client/src/AudioNoiseGate.h | 3 --- 2 files changed, 5 deletions(-) diff --git a/libraries/audio-client/src/AudioNoiseGate.cpp b/libraries/audio-client/src/AudioNoiseGate.cpp index 9458f47d8c..ac45f98f0b 100644 --- a/libraries/audio-client/src/AudioNoiseGate.cpp +++ b/libraries/audio-client/src/AudioNoiseGate.cpp @@ -58,8 +58,6 @@ void AudioNoiseGate::removeDCOffset(int16_t* samples, int numSamples) { } } -#include - void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) { // // Impose Noise Gate diff --git a/libraries/audio-client/src/AudioNoiseGate.h b/libraries/audio-client/src/AudioNoiseGate.h index 18b5a77056..774a4157bb 100644 --- a/libraries/audio-client/src/AudioNoiseGate.h +++ b/libraries/audio-client/src/AudioNoiseGate.h @@ -24,10 +24,7 @@ public: void removeDCOffset(int16_t* samples, int numSamples); bool clippedInLastFrame() const { return _didClipInLastFrame; } - bool closedInLastFrame() const { return _closedInLastFrame; } - - bool isOpen() const { return _isOpen; } float getMeasuredFloor() const { return _measuredFloor; } float getLastLoudness() const { return _lastLoudness; } From 2d595c1f879ccc75709b164567b7e4c4c00c7010 Mon Sep 17 00:00:00 2001 From: Brad Hefta-Gaub Date: Wed, 8 Mar 2017 20:29:32 -0800 Subject: [PATCH 3/5] CR feedback --- libraries/audio-client/src/AudioNoiseGate.cpp | 7 ++--- libraries/audio/src/InboundAudioStream.cpp | 30 ++++++++++--------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/libraries/audio-client/src/AudioNoiseGate.cpp b/libraries/audio-client/src/AudioNoiseGate.cpp index ac45f98f0b..98ce8cc9e8 100644 --- a/libraries/audio-client/src/AudioNoiseGate.cpp +++ b/libraries/audio-client/src/AudioNoiseGate.cpp @@ -77,8 +77,6 @@ void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) { // More means better rejection but also can reject continuous things like singing. // NUMBER_OF_NOISE_SAMPLE_FRAMES: How often should we re-evaluate the noise floor? - _closedInLastFrame = false; - float loudness = 0; int thisSample = 0; int samplesOverNoiseGate = 0; @@ -142,14 +140,13 @@ void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) { _sampleCounter = 0; } + if (samplesOverNoiseGate > NOISE_GATE_WIDTH) { _isOpen = true; _framesToClose = NOISE_GATE_CLOSE_FRAME_DELAY; } else { if (--_framesToClose == 0) { - if (_isOpen) { - _closedInLastFrame = true; - } + _closedInLastFrame = !_isOpen; _isOpen = false; } } diff --git a/libraries/audio/src/InboundAudioStream.cpp b/libraries/audio/src/InboundAudioStream.cpp index 12181cb8e2..3a5829e8fb 100644 --- a/libraries/audio/src/InboundAudioStream.cpp +++ b/libraries/audio/src/InboundAudioStream.cpp @@ -136,9 +136,9 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { break; } case SequenceNumberStats::Early: { - // Packet is early treat the packets as if all the packets between the last + // Packet is early. Treat the packets as if all the packets between the last // OnTime packet and this packet was lost. If we're using a codec this will - // also result in allowing the codec to flush its internal state. Then + // also result in allowing the codec to interpolate lost data. Then // fall through to the "on time" logic to actually handle this packet int packetsDropped = arrivalInfo._seqDiffFromExpected; lostAudioData(packetsDropped); @@ -150,8 +150,6 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { if (message.getType() == PacketType::SilentAudioFrame) { // If we recieved a SilentAudioFrame from our sender, we might want to drop // some of the samples in order to catch up to our desired jitter buffer size. - // NOTE: If we're using a codec we will be calling the codec's lostFrame() - // method to allow the codec to flush its internal state. writeDroppableSilentFrames(networkFrames); } else { // note: PCM and no codec are identical @@ -249,17 +247,21 @@ int InboundAudioStream::parseAudioData(PacketType type, const QByteArray& packet int InboundAudioStream::writeDroppableSilentFrames(int silentFrames) { - // if we have a decoder, we still want to tell the decoder about our - // lost frame. this will flush the internal state of the decoder - // we can safely ignore the output of the codec in this case, because - // we've enforced that on the sending side, the encoder ran at least - // one frame of truly silent audio before we sent the "droppable" silent - // frame. Technically we could leave this out, if we know for certain - // that the sender has really sent us an encoded packet of zeros, but - // since we can trust all encoders to always encode at least one silent - // frame (open source, someone code modify it), we will go ahead and - // tell our decoder about the lost frame. + // We can't guarentee that all clients have faded the stream down + // to silence and encode that silence before sending us a + // SilentAudioFrame. The encoder may have truncated the stream and + // left the decoder holding some loud state. To handle this case + // we will call the decoder's lostFrame() method, which indicates + // that it should interpolate from it's last known state (which may + // be some unknown loud state) down toward silence. if (_decoder) { + // FIXME - We could potentially use the output from the codec, in which + // case we might get a cleaner fade toward silence. NOTE: The below logic + // attempts to catch up in the event that the jitter buffers have grown. + // The better long term fix is to use the output from the decode, detect + // when it actually reaches silence, and then delete the silent portions + // of the jitter buffers. Or petentially do a cross fade from the decode + // output to silence. QByteArray decodedBuffer; _decoder->lostFrame(decodedBuffer); } From 79d8d90581103152926775f5cdd312bc1e2d0ecd Mon Sep 17 00:00:00 2001 From: Brad Hefta-Gaub Date: Wed, 8 Mar 2017 20:35:24 -0800 Subject: [PATCH 4/5] CR feedback --- libraries/audio/src/InboundAudioStream.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/libraries/audio/src/InboundAudioStream.cpp b/libraries/audio/src/InboundAudioStream.cpp index 3a5829e8fb..70884c9fd1 100644 --- a/libraries/audio/src/InboundAudioStream.cpp +++ b/libraries/audio/src/InboundAudioStream.cpp @@ -137,7 +137,7 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { } case SequenceNumberStats::Early: { // Packet is early. Treat the packets as if all the packets between the last - // OnTime packet and this packet was lost. If we're using a codec this will + // OnTime packet and this packet were lost. If we're using a codec this will // also result in allowing the codec to interpolate lost data. Then // fall through to the "on time" logic to actually handle this packet int packetsDropped = arrivalInfo._seqDiffFromExpected; @@ -161,9 +161,9 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { } else { qDebug(audio) << "Codec mismatch: expected" << _selectedCodecName << "got" << codecInPacket << "writing silence"; - // Since the data in the stream is using a codec that we're not prepared for, + // Since the data in the stream is using a codec that we aren't prepared for, // we need to let the codec know that we don't have data for it, this will - // flush any internal codec state and produce fade to silence. + // allowe the codec to interpolate missing data and produce a fade to silence. lostAudioData(1); // inform others of the mismatch @@ -248,12 +248,12 @@ int InboundAudioStream::parseAudioData(PacketType type, const QByteArray& packet int InboundAudioStream::writeDroppableSilentFrames(int silentFrames) { // We can't guarentee that all clients have faded the stream down - // to silence and encode that silence before sending us a - // SilentAudioFrame. The encoder may have truncated the stream and - // left the decoder holding some loud state. To handle this case - // we will call the decoder's lostFrame() method, which indicates - // that it should interpolate from it's last known state (which may - // be some unknown loud state) down toward silence. + // to silence and encoding that silence before sending us a + // SilentAudioFrame. If the encoder has truncated the stream it will + // leave the decoder holding some unknown loud state. To handle this + // case we will call the decoder's lostFrame() method, which indicates + // that it should interpolate from its last known state down toward + // silence. if (_decoder) { // FIXME - We could potentially use the output from the codec, in which // case we might get a cleaner fade toward silence. NOTE: The below logic From bdab24d9bfc731b81db02ffdf409a7768881adb7 Mon Sep 17 00:00:00 2001 From: Brad Hefta-Gaub Date: Thu, 9 Mar 2017 08:25:56 -0800 Subject: [PATCH 5/5] fix spelling --- libraries/audio/src/InboundAudioStream.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/audio/src/InboundAudioStream.cpp b/libraries/audio/src/InboundAudioStream.cpp index 70884c9fd1..88ec7e7bc0 100644 --- a/libraries/audio/src/InboundAudioStream.cpp +++ b/libraries/audio/src/InboundAudioStream.cpp @@ -163,7 +163,7 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { // Since the data in the stream is using a codec that we aren't prepared for, // we need to let the codec know that we don't have data for it, this will - // allowe the codec to interpolate missing data and produce a fade to silence. + // allow the codec to interpolate missing data and produce a fade to silence. lostAudioData(1); // inform others of the mismatch @@ -248,7 +248,7 @@ int InboundAudioStream::parseAudioData(PacketType type, const QByteArray& packet int InboundAudioStream::writeDroppableSilentFrames(int silentFrames) { // We can't guarentee that all clients have faded the stream down - // to silence and encoding that silence before sending us a + // to silence and encoded that silence before sending us a // SilentAudioFrame. If the encoder has truncated the stream it will // leave the decoder holding some unknown loud state. To handle this // case we will call the decoder's lostFrame() method, which indicates