From daeedc6ef1ee64472d66ef52decf4d6380c210f9 Mon Sep 17 00:00:00 2001
From: Zach Fox
Date: Thu, 11 Oct 2018 17:10:14 -0700
Subject: [PATCH] Lots of progress today

---
 interface/src/Application.cpp                 | 11 ++-
 ...nterface.cpp => TTSScriptingInterface.cpp} | 58 ++++++++----
 ...ingInterface.h => TTSScriptingInterface.h} | 19 ++--
 libraries/audio-client/src/AudioClient.cpp    | 88 ++++++++++---------
 libraries/audio-client/src/AudioClient.h      |  3 +
 5 files changed, 111 insertions(+), 68 deletions(-)
 rename interface/src/scripting/{SpeechScriptingInterface.cpp => TTSScriptingInterface.cpp} (64%)
 rename interface/src/scripting/{SpeechScriptingInterface.h => TTSScriptingInterface.h} (79%)

diff --git a/interface/src/Application.cpp b/interface/src/Application.cpp
index 74532ef53a..728fea8c10 100644
--- a/interface/src/Application.cpp
+++ b/interface/src/Application.cpp
@@ -182,7 +182,7 @@
 #include "scripting/RatesScriptingInterface.h"
 #include "scripting/SelectionScriptingInterface.h"
 #include "scripting/WalletScriptingInterface.h"
-#include "scripting/SpeechScriptingInterface.h"
+#include "scripting/TTSScriptingInterface.h"
 #if defined(Q_OS_MAC) || defined(Q_OS_WIN)
 #include "SpeechRecognizer.h"
 #endif
@@ -944,7 +944,7 @@ bool setupEssentials(int& argc, char** argv, bool runningMarkerExisted) {
     DependencyManager::set();
     DependencyManager::set();
     DependencyManager::set();
-    DependencyManager::set();
+    DependencyManager::set();
 
     DependencyManager::set();
 
@@ -1179,6 +1179,9 @@ Application::Application(int& argc, char** argv, QElapsedTimer& startupTimer, bo
     recording::Frame::registerFrameHandler(AudioConstants::getAudioFrameName(), [&audioIO](recording::Frame::ConstPointer frame) {
         audioIO->handleRecordedAudioInput(frame->data);
     });
+
+    auto TTS = DependencyManager::get().data();
+    connect(TTS, &TTSScriptingInterface::ttsSampleCreated, audioIO, &AudioClient::handleTTSAudioInput);
 
     connect(audioIO, &AudioClient::inputReceived, [](const QByteArray& audio) {
         static auto recorder = DependencyManager::get();
@@ -3129,7 +3132,7 @@ void Application::onDesktopRootContextCreated(QQmlContext* surfaceContext) {
     surfaceContext->setContextProperty("ContextOverlay", DependencyManager::get().data());
     surfaceContext->setContextProperty("Wallet", DependencyManager::get().data());
     surfaceContext->setContextProperty("HiFiAbout", AboutUtil::getInstance());
-    surfaceContext->setContextProperty("Speech", DependencyManager::get().data());
+    surfaceContext->setContextProperty("TextToSpeech", DependencyManager::get().data());
 
     if (auto steamClient = PluginManager::getInstance()->getSteamClientPlugin()) {
         surfaceContext->setContextProperty("Steam", new SteamScriptingInterface(engine, steamClient.get()));
@@ -6800,7 +6803,7 @@ void Application::registerScriptEngineWithApplicationServices(ScriptEnginePointe
     scriptEngine->registerGlobalObject("Wallet", DependencyManager::get().data());
     scriptEngine->registerGlobalObject("AddressManager", DependencyManager::get().data());
     scriptEngine->registerGlobalObject("HifiAbout", AboutUtil::getInstance());
-    scriptEngine->registerGlobalObject("Speech", DependencyManager::get().data());
+    scriptEngine->registerGlobalObject("TextToSpeech", DependencyManager::get().data());
 
     qScriptRegisterMetaType(scriptEngine.data(), OverlayIDtoScriptValue, OverlayIDfromScriptValue);
 
diff --git a/interface/src/scripting/SpeechScriptingInterface.cpp b/interface/src/scripting/TTSScriptingInterface.cpp
similarity index 64%
rename from interface/src/scripting/SpeechScriptingInterface.cpp
rename to interface/src/scripting/TTSScriptingInterface.cpp
index b8e0f5c3e8..fdbb37e586 100644
--- a/interface/src/scripting/SpeechScriptingInterface.cpp
+++ b/interface/src/scripting/TTSScriptingInterface.cpp
@@ -1,6 +1,6 @@
 //
-// SpeechScriptingInterface.cpp
-// interface/src/scripting
+// TTSScriptingInterface.cpp
+// interface/src/scripting
 //
 // Created by Zach Fox on 2018-10-10.
 // Copyright 2018 High Fidelity, Inc.
@@ -9,10 +9,10 @@
 // See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
 //
 
-#include "SpeechScriptingInterface.h"
+#include "TTSScriptingInterface.h"
 #include "avatar/AvatarManager.h"
 
-SpeechScriptingInterface::SpeechScriptingInterface() {
+TTSScriptingInterface::TTSScriptingInterface() {
     //
     // Create text to speech engine
     //
@@ -38,8 +38,7 @@ SpeechScriptingInterface::SpeechScriptingInterface() {
     }
 }
 
-SpeechScriptingInterface::~SpeechScriptingInterface() {
-
+TTSScriptingInterface::~TTSScriptingInterface() {
 }
 
 class ReleaseOnExit {
@@ -55,7 +54,28 @@ private:
     IUnknown* m_p;
 };
 
-void SpeechScriptingInterface::speakText(const QString& textToSpeak) {
+void TTSScriptingInterface::testTone(const bool& alsoInject) {
+    // Build the tone directly in _lastSoundByteArray, the buffer that is emitted and
+    // injected below; fill() resizes to 480000 bytes (240000 int16 samples) and zeroes it.
+    _lastSoundByteArray.fill('\0', 480000);
+    int16_t* samples = reinterpret_cast<int16_t*>(_lastSoundByteArray.data());
+
+    // Scale by 32767 rather than 32768: sin() reaches 1.0 and 32768 overflows int16_t.
+    for (int32_t a = 0; a < 240000; a++) {
+        samples[a] = (int16_t)(glm::sin(glm::radians((float)a)) * 32767.0f);
+    }
+
+    emit ttsSampleCreated(_lastSoundByteArray);
+
+    if (alsoInject) {
+        AudioInjectorOptions options;
+        options.position = DependencyManager::get()->getMyAvatarPosition();
+
+        _lastSoundAudioInjector = AudioInjector::playSound(_lastSoundByteArray, options);
+    }
+}
+
+void TTSScriptingInterface::speakText(const QString& textToSpeak, const bool& alsoInject) {
     WAVEFORMATEX fmt;
     fmt.wFormatTag = WAVE_FORMAT_PCM;
     fmt.nSamplesPerSec = 24000;
@@ -92,9 +112,8 @@ void SpeechScriptingInterface::speakText(const QString& textToSpeak) {
     ReleaseOnExit rStream(pStream);
 
     ULONG streamNumber;
-    hr = m_tts->Speak(reinterpret_cast(textToSpeak.utf16()),
-        SPF_IS_XML | SPF_ASYNC | SPF_PURGEBEFORESPEAK,
-        &streamNumber);
+    hr = m_tts->Speak(reinterpret_cast(textToSpeak.utf16()), SPF_IS_XML | SPF_ASYNC | SPF_PURGEBEFORESPEAK,
+                      &streamNumber);
     if (FAILED(hr)) {
         qDebug() << "Speak failed.";
     }
@@ -124,14 +143,21 @@ void SpeechScriptingInterface::speakText(const QString& textToSpeak) {
         qDebug() << "Couldn't read from stream.";
     }
 
-    QByteArray byteArray = QByteArray::QByteArray(buf1, dwSize);
+    _lastSoundByteArray.resize(0);
+    _lastSoundByteArray.append(buf1, dwSize);
 
-    AudioInjectorOptions options;
-    options.position = DependencyManager::get()->getMyAvatarPosition();
+    emit ttsSampleCreated(_lastSoundByteArray);
 
-    lastSound = AudioInjector::playSound(byteArray, options);
+    if (alsoInject) {
+        AudioInjectorOptions options;
+        options.position = DependencyManager::get()->getMyAvatarPosition();
+
+        _lastSoundAudioInjector = AudioInjector::playSound(_lastSoundByteArray, options);
+    }
 }
 
-void SpeechScriptingInterface::stopLastSpeech() {
-    lastSound->stop();
+void TTSScriptingInterface::stopLastSpeech() {
+    if (_lastSoundAudioInjector) {
+        _lastSoundAudioInjector->stop();
+    }
 }
diff --git a/interface/src/scripting/SpeechScriptingInterface.h b/interface/src/scripting/TTSScriptingInterface.h
similarity index 79%
rename from interface/src/scripting/SpeechScriptingInterface.h
rename to interface/src/scripting/TTSScriptingInterface.h
index c683a1a3c6..cb9c6c8c3e 100644
--- a/interface/src/scripting/SpeechScriptingInterface.h
+++ b/interface/src/scripting/TTSScriptingInterface.h
@@ -1,5 +1,5 @@
-// SpeechScriptingInterface.h
-// interface/src/scripting
+// TTSScriptingInterface.h
+// interface/src/scripting
 //
 // Created by Zach Fox on 2018-10-10.
 // Copyright 2018 High Fidelity, Inc.
@@ -20,16 +20,20 @@
 #include // SAPI Helper
 #include
 
-class SpeechScriptingInterface : public QObject, public Dependency {
+class TTSScriptingInterface : public QObject, public Dependency {
     Q_OBJECT
 
 public:
-    SpeechScriptingInterface();
-    ~SpeechScriptingInterface();
+    TTSScriptingInterface();
+    ~TTSScriptingInterface();
 
-    Q_INVOKABLE void speakText(const QString& textToSpeak);
+    Q_INVOKABLE void testTone(const bool& alsoInject = false);
+    Q_INVOKABLE void speakText(const QString& textToSpeak, const bool& alsoInject = false);
     Q_INVOKABLE void stopLastSpeech();
 
+signals:
+    void ttsSampleCreated(QByteArray outputArray);
+
 private:
     class CComAutoInit {
     public:
@@ -72,7 +76,8 @@ private:
     // Default voice token
     CComPtr m_voiceToken;
 
-    AudioInjectorPointer lastSound;
+    QByteArray _lastSoundByteArray;
+    AudioInjectorPointer _lastSoundAudioInjector;
 };
 
 #endif // hifi_SpeechScriptingInterface_h
diff --git a/libraries/audio-client/src/AudioClient.cpp b/libraries/audio-client/src/AudioClient.cpp
index d00bc29054..96f1c97878 100644
--- a/libraries/audio-client/src/AudioClient.cpp
+++ b/libraries/audio-client/src/AudioClient.cpp
@@ -1135,6 +1135,46 @@ void AudioClient::handleAudioInput(QByteArray& audioBuffer) {
     }
 }
 
+void AudioClient::processAudioAndAddToRingBuffer(QByteArray& inputByteArray, const uchar& channelCount, const qint32& bytesForDuration) {
+    // input samples required to produce exactly NETWORK_FRAME_SAMPLES of output
+    const int inputSamplesRequired =
+        (_inputToNetworkResampler ? _inputToNetworkResampler->getMinInput(AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL)
+                                  : AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL) *
+        channelCount;
+
+    const auto inputAudioSamples = std::unique_ptr(new int16_t[inputSamplesRequired]);
+
+    handleLocalEchoAndReverb(inputByteArray);
+
+    _inputRingBuffer.writeData(inputByteArray.data(), inputByteArray.size());
+
+    float audioInputMsecsRead = inputByteArray.size() / (float)(bytesForDuration);
+    _stats.updateInputMsRead(audioInputMsecsRead);
+
+    const int numNetworkBytes =
+        _isStereoInput ? AudioConstants::NETWORK_FRAME_BYTES_STEREO : AudioConstants::NETWORK_FRAME_BYTES_PER_CHANNEL;
+    const int numNetworkSamples =
+        _isStereoInput ? AudioConstants::NETWORK_FRAME_SAMPLES_STEREO : AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL;
+
+    static int16_t networkAudioSamples[AudioConstants::NETWORK_FRAME_SAMPLES_STEREO];
+
+    while (_inputRingBuffer.samplesAvailable() >= inputSamplesRequired) {
+        if (_muted) {
+            _inputRingBuffer.shiftReadPosition(inputSamplesRequired);
+        } else {
+            _inputRingBuffer.readSamples(inputAudioSamples.get(), inputSamplesRequired);
+            possibleResampling(_inputToNetworkResampler, inputAudioSamples.get(), networkAudioSamples, inputSamplesRequired,
+                               numNetworkSamples, channelCount, _desiredInputFormat.channelCount());
+        }
+        int bytesInInputRingBuffer = _inputRingBuffer.samplesAvailable() * AudioConstants::SAMPLE_SIZE;
+        float msecsInInputRingBuffer = bytesInInputRingBuffer / (float)(bytesForDuration);  // same bytes-per-ms as the "read" stat above
+        _stats.updateInputMsUnplayed(msecsInInputRingBuffer);
+
+        QByteArray audioBuffer(reinterpret_cast(networkAudioSamples), numNetworkBytes);
+        handleAudioInput(audioBuffer);
+    }
+}
+
 void AudioClient::handleMicAudioInput() {
     if (!_inputDevice || _isPlayingBackRecording) {
         return;
@@ -1144,47 +1184,8 @@ void AudioClient::handleMicAudioInput() {
     _inputReadsSinceLastCheck++;
 #endif
 
-    // input samples required to produce exactly NETWORK_FRAME_SAMPLES of output
-    const int inputSamplesRequired = (_inputToNetworkResampler ?
-                                      _inputToNetworkResampler->getMinInput(AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL) :
-                                      AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL) * _inputFormat.channelCount();
-
-    const auto inputAudioSamples = std::unique_ptr(new int16_t[inputSamplesRequired]);
-    QByteArray inputByteArray = _inputDevice->readAll();
-
-    handleLocalEchoAndReverb(inputByteArray);
-
-    _inputRingBuffer.writeData(inputByteArray.data(), inputByteArray.size());
-
-    float audioInputMsecsRead = inputByteArray.size() / (float)(_inputFormat.bytesForDuration(USECS_PER_MSEC));
-    _stats.updateInputMsRead(audioInputMsecsRead);
-
-    const int numNetworkBytes = _isStereoInput
-        ? AudioConstants::NETWORK_FRAME_BYTES_STEREO
-        : AudioConstants::NETWORK_FRAME_BYTES_PER_CHANNEL;
-    const int numNetworkSamples = _isStereoInput
-        ? AudioConstants::NETWORK_FRAME_SAMPLES_STEREO
-        : AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL;
-
-    static int16_t networkAudioSamples[AudioConstants::NETWORK_FRAME_SAMPLES_STEREO];
-
-    while (_inputRingBuffer.samplesAvailable() >= inputSamplesRequired) {
-        if (_muted) {
-            _inputRingBuffer.shiftReadPosition(inputSamplesRequired);
-        } else {
-            _inputRingBuffer.readSamples(inputAudioSamples.get(), inputSamplesRequired);
-            possibleResampling(_inputToNetworkResampler,
-                inputAudioSamples.get(), networkAudioSamples,
-                inputSamplesRequired, numNetworkSamples,
-                _inputFormat.channelCount(), _desiredInputFormat.channelCount());
-        }
-        int bytesInInputRingBuffer = _inputRingBuffer.samplesAvailable() * AudioConstants::SAMPLE_SIZE;
-        float msecsInInputRingBuffer = bytesInInputRingBuffer / (float)(_inputFormat.bytesForDuration(USECS_PER_MSEC));
-        _stats.updateInputMsUnplayed(msecsInInputRingBuffer);
-
-        QByteArray audioBuffer(reinterpret_cast(networkAudioSamples), numNetworkBytes);
-        handleAudioInput(audioBuffer);
-    }
+    QByteArray inputByteArray = _inputDevice->readAll();  // named local: a temporary cannot bind to the QByteArray& parameter
+    processAudioAndAddToRingBuffer(inputByteArray, _inputFormat.channelCount(), _inputFormat.bytesForDuration(USECS_PER_MSEC));
 }
 
 void AudioClient::handleDummyAudioInput() {
@@ -1201,6 +1202,11 @@ void AudioClient::handleRecordedAudioInput(const QByteArray& audio) {
     handleAudioInput(audioBuffer);
 }
 
+void AudioClient::handleTTSAudioInput(const QByteArray& audio) {
+    QByteArray audioBuffer(audio);  // mutable copy: the helper takes a non-const QByteArray&
+    processAudioAndAddToRingBuffer(audioBuffer, 1, 48);  // 1 channel; 48 bytes/ms, presumably 24000 Hz * 2 bytes (TODO confirm)
+}
+
 void AudioClient::prepareLocalAudioInjectors(std::unique_ptr localAudioLock) {
     bool doSynchronously = localAudioLock.operator bool();
     if (!localAudioLock) {
diff --git a/libraries/audio-client/src/AudioClient.h b/libraries/audio-client/src/AudioClient.h
index 5e7f1fb8a0..170a355abe 100644
--- a/libraries/audio-client/src/AudioClient.h
+++ b/libraries/audio-client/src/AudioClient.h
@@ -197,6 +197,7 @@ public slots:
     void checkInputTimeout();
     void handleDummyAudioInput();
     void handleRecordedAudioInput(const QByteArray& audio);
+    void handleTTSAudioInput(const QByteArray& audio);
     void reset();
     void audioMixerKilled();
 
@@ -289,6 +290,8 @@ private:
     float azimuthForSource(const glm::vec3& relativePosition);
     float gainForSource(float distance, float volume);
 
+    void processAudioAndAddToRingBuffer(QByteArray& inputByteArray, const uchar& channelCount, const qint32& bytesForDuration);
+
 #ifdef Q_OS_ANDROID
     QTimer _checkInputTimer;
     long _inputReadsSinceLastCheck = 0l;