From 1bdad89cbe2bd95b1af910ee6d4c10d232c86012 Mon Sep 17 00:00:00 2001 From: Brad Hefta-Gaub Date: Wed, 8 Mar 2017 17:27:46 -0800 Subject: [PATCH] properly handle silent packet transitions --- libraries/audio-client/src/AudioClient.cpp | 7 +++- libraries/audio-client/src/AudioNoiseGate.cpp | 7 +++- libraries/audio-client/src/AudioNoiseGate.h | 5 +++ libraries/audio/src/InboundAudioStream.cpp | 34 ++++++++++++++++--- 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/libraries/audio-client/src/AudioClient.cpp b/libraries/audio-client/src/AudioClient.cpp index bd141cfb12..9cd87d2e70 100644 --- a/libraries/audio-client/src/AudioClient.cpp +++ b/libraries/audio-client/src/AudioClient.cpp @@ -1052,7 +1052,12 @@ void AudioClient::handleAudioInput() { auto packetType = _shouldEchoToServer ? PacketType::MicrophoneAudioWithEcho : PacketType::MicrophoneAudioNoEcho; - if (_lastInputLoudness == 0) { + // if the _inputGate closed in this last frame, then we don't actually want + // to send a silent packet, instead, we want to go ahead and encode and send + // the output from the input gate (eventually, this could be crossfaded) + // and allow the codec to properly encode down to silent/zero. 
If we still + // have _lastInputLoudness of 0 in our NEXT frame, we will send a silent packet + if (_lastInputLoudness == 0 && !_inputGate.closedInLastFrame()) { packetType = PacketType::SilentAudioFrame; } Transform audioTransform; diff --git a/libraries/audio-client/src/AudioNoiseGate.cpp b/libraries/audio-client/src/AudioNoiseGate.cpp index 8766a20cdf..9458f47d8c 100644 --- a/libraries/audio-client/src/AudioNoiseGate.cpp +++ b/libraries/audio-client/src/AudioNoiseGate.cpp @@ -58,6 +58,7 @@ void AudioNoiseGate::removeDCOffset(int16_t* samples, int numSamples) { } } +#include void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) { // @@ -77,7 +78,8 @@ void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) { // NOISE_GATE_FRAMES_TO_AVERAGE: How many audio frames should we average together to compute noise floor. // More means better rejection but also can reject continuous things like singing. // NUMBER_OF_NOISE_SAMPLE_FRAMES: How often should we re-evaluate the noise floor? 
- + + _closedInLastFrame = false; float loudness = 0; int thisSample = 0; @@ -147,6 +149,9 @@ void AudioNoiseGate::gateSamples(int16_t* samples, int numSamples) { _framesToClose = NOISE_GATE_CLOSE_FRAME_DELAY; } else { if (--_framesToClose == 0) { + if (_isOpen) { + _closedInLastFrame = true; + } _isOpen = false; } } diff --git a/libraries/audio-client/src/AudioNoiseGate.h b/libraries/audio-client/src/AudioNoiseGate.h index 8cb1155938..18b5a77056 100644 --- a/libraries/audio-client/src/AudioNoiseGate.h +++ b/libraries/audio-client/src/AudioNoiseGate.h @@ -24,6 +24,10 @@ public: void removeDCOffset(int16_t* samples, int numSamples); bool clippedInLastFrame() const { return _didClipInLastFrame; } + + bool closedInLastFrame() const { return _closedInLastFrame; } + + bool isOpen() const { return _isOpen; } float getMeasuredFloor() const { return _measuredFloor; } float getLastLoudness() const { return _lastLoudness; } @@ -40,6 +44,7 @@ private: float _sampleFrames[NUMBER_OF_NOISE_SAMPLE_FRAMES]; int _sampleCounter; bool _isOpen; + bool _closedInLastFrame { false }; int _framesToClose; }; diff --git a/libraries/audio/src/InboundAudioStream.cpp b/libraries/audio/src/InboundAudioStream.cpp index 57c344adaf..12181cb8e2 100644 --- a/libraries/audio/src/InboundAudioStream.cpp +++ b/libraries/audio/src/InboundAudioStream.cpp @@ -136,9 +136,10 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { break; } case SequenceNumberStats::Early: { - // Packet is early; write droppable silent samples for each of the skipped packets. - // NOTE: we assume that each dropped packet contains the same number of samples - // as the packet we just received. + // Packet is early; treat the packets as if all the packets between the last + // OnTime packet and this packet were lost. If we're using a codec this will + // also result in allowing the codec to flush its internal state. 
Then + // fall through to the "on time" logic to actually handle this packet int packetsDropped = arrivalInfo._seqDiffFromExpected; lostAudioData(packetsDropped); @@ -147,7 +148,10 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { case SequenceNumberStats::OnTime: { // Packet is on time; parse its data to the ringbuffer if (message.getType() == PacketType::SilentAudioFrame) { - // FIXME - Some codecs need to know about these silent frames... and can produce better output + // If we received a SilentAudioFrame from our sender, we might want to drop + // some of the samples in order to catch up to our desired jitter buffer size. + // NOTE: If we're using a codec we will be calling the codec's lostFrame() + // method to allow the codec to flush its internal state. writeDroppableSilentFrames(networkFrames); } else { // note: PCM and no codec are identical @@ -158,7 +162,12 @@ int InboundAudioStream::parseData(ReceivedMessage& message) { parseAudioData(message.getType(), afterProperties); } else { qDebug(audio) << "Codec mismatch: expected" << _selectedCodecName << "got" << codecInPacket << "writing silence"; - writeDroppableSilentFrames(networkFrames); + + // Since the data in the stream is using a codec that we're not prepared for, + // we need to let the codec know that we don't have data for it; this will + // flush any internal codec state and produce a fade to silence. + lostAudioData(1); + // inform others of the mismatch auto sendingNode = DependencyManager::get()->nodeWithUUID(message.getSourceID()); emit mismatchedAudioCodec(sendingNode, _selectedCodecName, codecInPacket); @@ -240,6 +249,21 @@ int InboundAudioStream::parseAudioData(PacketType type, const QByteArray& packet int InboundAudioStream::writeDroppableSilentFrames(int silentFrames) { + // if we have a decoder, we still want to tell the decoder about our + // lost frame. 
this will flush the internal state of the decoder + // we can safely ignore the output of the codec in this case, because + // we've enforced that on the sending side, the encoder ran at least + // one frame of truly silent audio before we sent the "droppable" silent + // frame. Technically we could leave this out, if we know for certain + // that the sender has really sent us an encoded packet of zeros, but + // since we can't trust all encoders to always encode at least one silent + // frame (open source, someone could modify it), we will go ahead and + // tell our decoder about the lost frame. + if (_decoder) { + QByteArray decodedBuffer; + _decoder->lostFrame(decodedBuffer); + } + // calculate how many silent frames we should drop. int silentSamples = silentFrames * _numChannels; int samplesPerFrame = _ringBuffer.getNumFrameSamples();