Some experimentation yields promising results...

This commit is contained in:
Zach Fox 2018-10-16 17:34:48 -07:00
parent 79f30eb05d
commit 26e388b139
5 changed files with 844 additions and 811 deletions

View file

@ -1182,6 +1182,7 @@ Application::Application(int& argc, char** argv, QElapsedTimer& startupTimer, bo
auto TTS = DependencyManager::get<TTSScriptingInterface>().data();
connect(TTS, &TTSScriptingInterface::ttsSampleCreated, audioIO, &AudioClient::handleTTSAudioInput);
connect(TTS, &TTSScriptingInterface::clearTTSBuffer, audioIO, &AudioClient::clearTTSBuffer);
connect(audioIO, &AudioClient::inputReceived, [](const QByteArray& audio) {
static auto recorder = DependencyManager::get<recording::Recorder>();

View file

@ -65,7 +65,7 @@ void TTSScriptingInterface::testTone(const bool& alsoInject) {
int16_t temp = (glm::sin(glm::radians((float)a))) * 32768;
samples[a] = temp;
}
emit ttsSampleCreated(_lastSoundByteArray);
emit ttsSampleCreated(_lastSoundByteArray, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * 50, 96);
if (alsoInject) {
AudioInjectorOptions options;
@ -75,11 +75,16 @@ void TTSScriptingInterface::testTone(const bool& alsoInject) {
}
}
void TTSScriptingInterface::speakText(const QString& textToSpeak, const bool& alsoInject) {
void TTSScriptingInterface::speakText(const QString& textToSpeak,
const int& newChunkSize,
const int& timerInterval,
const int& sampleRate,
const int& bitsPerSample,
const bool& alsoInject) {
WAVEFORMATEX fmt;
fmt.wFormatTag = WAVE_FORMAT_PCM;
fmt.nSamplesPerSec = 24000;
fmt.wBitsPerSample = 16;
fmt.nSamplesPerSec = sampleRate;
fmt.wBitsPerSample = bitsPerSample;
fmt.nChannels = 1;
fmt.nBlockAlign = fmt.nChannels * fmt.wBitsPerSample / 8;
fmt.nAvgBytesPerSec = fmt.nSamplesPerSec * fmt.nBlockAlign;
@ -146,7 +151,7 @@ void TTSScriptingInterface::speakText(const QString& textToSpeak, const bool& al
_lastSoundByteArray.resize(0);
_lastSoundByteArray.append(buf1, dwSize);
emit ttsSampleCreated(_lastSoundByteArray);
emit ttsSampleCreated(_lastSoundByteArray, newChunkSize, timerInterval);
if (alsoInject) {
AudioInjectorOptions options;
@ -160,4 +165,6 @@ void TTSScriptingInterface::stopLastSpeech() {
if (_lastSoundAudioInjector) {
_lastSoundAudioInjector->stop();
}
emit clearTTSBuffer();
}

View file

@ -19,6 +19,7 @@
#include <sapi.h> // SAPI
#include <sphelper.h> // SAPI Helper
#include <AudioInjector.h>
#include <AudioConstants.h>
class TTSScriptingInterface : public QObject, public Dependency {
Q_OBJECT
@ -28,11 +29,17 @@ public:
~TTSScriptingInterface();
Q_INVOKABLE void testTone(const bool& alsoInject = false);
Q_INVOKABLE void speakText(const QString& textToSpeak, const bool& alsoInject = false);
Q_INVOKABLE void speakText(const QString& textToSpeak,
const int& newChunkSize = (AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * 50),
const int& timerInterval = 96,
const int& sampleRate = 24000,
const int& bitsPerSample = 16,
const bool& alsoInject = false);
Q_INVOKABLE void stopLastSpeech();
signals:
void ttsSampleCreated(QByteArray outputArray);
void ttsSampleCreated(QByteArray outputArray, const int& newChunkSize, const int& timerInterval);
void clearTTSBuffer();
private:
class CComAutoInit {

View file

@ -186,7 +186,7 @@ AudioClient::AudioClient() :
_networkToOutputResampler(NULL), _localToOutputResampler(NULL),
_audioLimiter(AudioConstants::SAMPLE_RATE, OUTPUT_CHANNEL_COUNT), _outgoingAvatarAudioSequenceNumber(0),
_audioOutputIODevice(_localInjectorsStream, _receivedAudioStream, this), _stats(&_receivedAudioStream),
_positionGetter(DEFAULT_POSITION_GETTER),
_positionGetter(DEFAULT_POSITION_GETTER), _TTSTimer(this),
#if defined(Q_OS_ANDROID)
_checkInputTimer(this), _isHeadsetPluggedIn(false),
#endif
@ -245,6 +245,8 @@ AudioClient::AudioClient() :
packetReceiver.registerListener(PacketType::NoisyMute, this, "handleNoisyMutePacket");
packetReceiver.registerListener(PacketType::MuteEnvironment, this, "handleMuteEnvironmentPacket");
packetReceiver.registerListener(PacketType::SelectedAudioFormat, this, "handleSelectedAudioFormat");
connect(&_TTSTimer, &QTimer::timeout, this, &AudioClient::processTTSBuffer);
}
AudioClient::~AudioClient() {
@ -939,7 +941,7 @@ void AudioClient::setReverbOptions(const AudioEffectOptions* options) {
}
}
void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray) {
void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray, const int& sampleRate, const int& channelCount) {
// If there is server echo, reverb will be applied to the received audio stream so no need to have it here.
bool hasReverb = _reverb || _receivedAudioStream.hasReverb();
if (_muted || !_audioOutput || (!_shouldEchoLocally && !hasReverb)) {
@ -949,7 +951,7 @@ void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray) {
// NOTE: we assume the inputFormat and the outputFormat are the same, since on any modern
// multimedia OS they should be. If there is a device that this is not true for, we can
// add back support to do resampling.
if (_inputFormat.sampleRate() != _outputFormat.sampleRate()) {
if (sampleRate != _outputFormat.sampleRate()) {
return;
}
@ -972,7 +974,7 @@ void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray) {
static QByteArray loopBackByteArray;
int numInputSamples = inputByteArray.size() / AudioConstants::SAMPLE_SIZE;
int numLoopbackSamples = (numInputSamples * OUTPUT_CHANNEL_COUNT) / _inputFormat.channelCount();
int numLoopbackSamples = (numInputSamples * OUTPUT_CHANNEL_COUNT) / channelCount;
loopBackByteArray.resize(numLoopbackSamples * AudioConstants::SAMPLE_SIZE);
@ -980,7 +982,7 @@ void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray) {
int16_t* loopbackSamples = reinterpret_cast<int16_t*>(loopBackByteArray.data());
// upmix mono to stereo
if (!sampleChannelConversion(inputSamples, loopbackSamples, numInputSamples, _inputFormat.channelCount(),
if (!sampleChannelConversion(inputSamples, loopbackSamples, numInputSamples, channelCount,
OUTPUT_CHANNEL_COUNT)) {
// no conversion, just copy the samples
memcpy(loopbackSamples, inputSamples, numInputSamples * AudioConstants::SAMPLE_SIZE);
@ -1093,23 +1095,29 @@ void AudioClient::handleAudioInput(QByteArray& audioBuffer) {
}
}
void AudioClient::processAudioAndAddToRingBuffer(QByteArray& inputByteArray,
const uchar& channelCount,
const qint32& bytesForDuration,
QByteArray& rollingBuffer) {
void AudioClient::handleMicAudioInput() {
if (!_inputDevice || _isPlayingBackRecording) {
return;
}
#if defined(Q_OS_ANDROID)
_inputReadsSinceLastCheck++;
#endif
// input samples required to produce exactly NETWORK_FRAME_SAMPLES of output
const int inputSamplesRequired =
(_inputToNetworkResampler ? _inputToNetworkResampler->getMinInput(AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL)
: AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL) *
channelCount;
_inputFormat.channelCount();
const auto inputAudioSamples = std::unique_ptr<int16_t[]>(new int16_t[inputSamplesRequired]);
QByteArray inputByteArray = _inputDevice->readAll();
handleLocalEchoAndReverb(inputByteArray);
handleLocalEchoAndReverb(inputByteArray, _inputFormat.sampleRate(), _inputFormat.channelCount());
_inputRingBuffer.writeData(inputByteArray.data(), inputByteArray.size());
float audioInputMsecsRead = inputByteArray.size() / (float)(bytesForDuration);
float audioInputMsecsRead = inputByteArray.size() / (float)(_inputFormat.bytesForDuration(USECS_PER_MSEC));
_stats.updateInputMsRead(audioInputMsecsRead);
const int numNetworkBytes =
@ -1125,33 +1133,17 @@ void AudioClient::processAudioAndAddToRingBuffer(QByteArray& inputByteArray,
} else {
_inputRingBuffer.readSamples(inputAudioSamples.get(), inputSamplesRequired);
possibleResampling(_inputToNetworkResampler, inputAudioSamples.get(), networkAudioSamples, inputSamplesRequired,
numNetworkSamples, channelCount, _desiredInputFormat.channelCount());
numNetworkSamples, _inputFormat.channelCount(), _desiredInputFormat.channelCount());
}
int bytesInInputRingBuffer = _inputRingBuffer.samplesAvailable() * AudioConstants::SAMPLE_SIZE;
float msecsInInputRingBuffer = bytesInInputRingBuffer / (float)(bytesForDuration);
float msecsInInputRingBuffer = bytesInInputRingBuffer / (float)(_inputFormat.bytesForDuration(USECS_PER_MSEC));
_stats.updateInputMsUnplayed(msecsInInputRingBuffer);
QByteArray audioBuffer(reinterpret_cast<char*>(networkAudioSamples), numNetworkBytes);
rollingBuffer.append(audioBuffer);
handleAudioInput(audioBuffer);
}
}
void AudioClient::handleMicAudioInput() {
if (!_inputDevice || _isPlayingBackRecording) {
return;
}
#if defined(Q_OS_ANDROID)
_inputReadsSinceLastCheck++;
#endif
QByteArray temp;
processAudioAndAddToRingBuffer(_inputDevice->readAll(), _inputFormat.channelCount(),
_inputFormat.bytesForDuration(USECS_PER_MSEC), temp);
}
void AudioClient::handleDummyAudioInput() {
const int numNetworkBytes =
_isStereoInput ? AudioConstants::NETWORK_FRAME_BYTES_STEREO : AudioConstants::NETWORK_FRAME_BYTES_PER_CHANNEL;
@ -1210,25 +1202,43 @@ int rawToWav(const char* rawData, const int& rawLength, const char* wavfn, long
return 0;
}
void AudioClient::handleTTSAudioInput(const QByteArray& audio) {
QByteArray audioBuffer(audio);
QString filename = QString::number(usecTimestampNow());
QString path = PathUtils::getAppDataPath() + "Audio/" + filename + "-before.wav";
rawToWav(audioBuffer.data(), audioBuffer.size(), path.toLocal8Bit(), 24000, 1);
QByteArray temp;
while (audioBuffer.size() > 0) {
void AudioClient::processTTSBuffer() {
Lock lock(_TTSMutex);
if (_TTSAudioBuffer.size() > 0) {
QByteArray part;
part.append(audioBuffer.data(), AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
audioBuffer.remove(0, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
processAudioAndAddToRingBuffer(part, 1, 48, temp);
part.append(_TTSAudioBuffer.data(), _TTSChunkSize);
_TTSAudioBuffer.remove(0, _TTSChunkSize);
handleAudioInput(part);
} else {
_isProcessingTTS = false;
_TTSTimer.stop();
}
}
filename = QString::number(usecTimestampNow());
path = PathUtils::getAppDataPath() + "Audio/" + filename + "-after.wav";
rawToWav(temp.data(), temp.size(), path.toLocal8Bit(), 12000, 1);
void AudioClient::handleTTSAudioInput(const QByteArray& audio, const int& newChunkSize, const int& timerInterval) {
_TTSChunkSize = newChunkSize;
_TTSAudioBuffer.append(audio);
handleLocalEchoAndReverb(_TTSAudioBuffer, 48000, 1);
//QString filename = QString::number(usecTimestampNow());
//QString path = PathUtils::getAppDataPath() + "Audio/" + filename + "-before.wav";
//rawToWav(_TTSAudioBuffer.data(), _TTSAudioBuffer.size(), path.toLocal8Bit(), 24000, 1);
//QByteArray temp;
_isProcessingTTS = true;
_TTSTimer.start(timerInterval);
//filename = QString::number(usecTimestampNow());
//path = PathUtils::getAppDataPath() + "Audio/" + filename + "-after.wav";
//rawToWav(temp.data(), temp.size(), path.toLocal8Bit(), 12000, 1);
}
void AudioClient::clearTTSBuffer() {
_TTSAudioBuffer.resize(0);
_isProcessingTTS = false;
_TTSTimer.stop();
}
void AudioClient::prepareLocalAudioInjectors(std::unique_ptr<Lock> localAudioLock) {
@ -1313,7 +1323,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
if (injectorBuffer) {
static const int HRTF_DATASET_INDEX = 1;
int numChannels = injector->isAmbisonic() ? AudioConstants::AMBISONIC
int numChannels = injector->isAmbisonic()
? AudioConstants::AMBISONIC
: (injector->isStereo() ? AudioConstants::STEREO : AudioConstants::MONO);
size_t bytesToRead = numChannels * AudioConstants::NETWORK_FRAME_BYTES_PER_CHANNEL;
@ -1355,8 +1366,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
float azimuth = azimuthForSource(relativePosition);
// mono gets spatialized into mixBuffer
injector->getLocalHRTF().render(_localScratchBuffer, mixBuffer, HRTF_DATASET_INDEX, azimuth, distance, gain,
AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
injector->getLocalHRTF().render(_localScratchBuffer, mixBuffer, HRTF_DATASET_INDEX, azimuth, distance,
gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
}
} else {
@ -1509,7 +1520,8 @@ bool AudioClient::outputLocalInjector(const AudioInjectorPointer& injector) {
}
void AudioClient::outputFormatChanged() {
_outputFrameSize = (AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * OUTPUT_CHANNEL_COUNT * _outputFormat.sampleRate()) /
_outputFrameSize =
(AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * OUTPUT_CHANNEL_COUNT * _outputFormat.sampleRate()) /
_desiredOutputFormat.sampleRate();
_receivedAudioStream.outputFormatChanged(_outputFormat.sampleRate(), OUTPUT_CHANNEL_COUNT);
}
@ -1725,8 +1737,8 @@ void AudioClient::outputNotify() {
int newOutputBufferSizeFrames = setOutputBufferSize(oldOutputBufferSizeFrames + 1, false);
if (newOutputBufferSizeFrames > oldOutputBufferSizeFrames) {
qCDebug(audioclient, "Starve threshold surpassed (%d starves in %d ms)", _outputStarveDetectionCount,
dt);
qCDebug(audioclient, "Starve threshold surpassed (%d starves in %d ms)",
_outputStarveDetectionCount, dt);
}
_outputStarveDetectionStartTimeMsec = now;
@ -2054,7 +2066,8 @@ qint64 AudioClient::AudioOutputIODevice::readData(char* data, qint64 maxSize) {
}
int bytesAudioOutputUnplayed = _audio->_audioOutput->bufferSize() - _audio->_audioOutput->bytesFree();
float msecsAudioOutputUnplayed = bytesAudioOutputUnplayed / (float)_audio->_outputFormat.bytesForDuration(USECS_PER_MSEC);
float msecsAudioOutputUnplayed =
bytesAudioOutputUnplayed / (float)_audio->_outputFormat.bytesForDuration(USECS_PER_MSEC);
_audio->_stats.updateOutputMsUnplayed(msecsAudioOutputUnplayed);
if (bytesAudioOutputUnplayed == 0) {

View file

@ -197,7 +197,11 @@ public slots:
void checkInputTimeout();
void handleDummyAudioInput();
void handleRecordedAudioInput(const QByteArray& audio);
void handleTTSAudioInput(const QByteArray& audio);
void handleTTSAudioInput(const QByteArray& audio,
const int& newChunkSize,
const int& timerInterval);
void clearTTSBuffer();
void processTTSBuffer();
void reset();
void audioMixerKilled();
@ -290,10 +294,11 @@ private:
float azimuthForSource(const glm::vec3& relativePosition);
float gainForSource(float distance, float volume);
void processAudioAndAddToRingBuffer(QByteArray& inputByteArray,
const uchar& channelCount,
const qint32& bytesForDuration,
QByteArray& rollingBuffer);
Mutex _TTSMutex;
QTimer _TTSTimer;
bool _isProcessingTTS {false};
QByteArray _TTSAudioBuffer;
int _TTSChunkSize = AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * 50;
#ifdef Q_OS_ANDROID
QTimer _checkInputTimer;
@ -401,7 +406,7 @@ private:
void configureReverb();
void updateReverbOptions();
void handleLocalEchoAndReverb(QByteArray& inputByteArray);
void handleLocalEchoAndReverb(QByteArray& inputByteArray, const int& sampleRate, const int& channelCount);
bool switchInputToAudioDevice(const QAudioDeviceInfo inputDeviceInfo, bool isShutdownRequest = false);
bool switchOutputToAudioDevice(const QAudioDeviceInfo outputDeviceInfo, bool isShutdownRequest = false);