From daeedc6ef1ee64472d66ef52decf4d6380c210f9 Mon Sep 17 00:00:00 2001
From: Zach Fox <fox@highfidelity.io>
Date: Thu, 11 Oct 2018 17:10:14 -0700
Subject: [PATCH] Rename SpeechScriptingInterface to TTSScriptingInterface; feed TTS audio into AudioClient input

---
 interface/src/Application.cpp                 | 11 ++-
 ...nterface.cpp => TTSScriptingInterface.cpp} | 58 ++++++++----
 ...ingInterface.h => TTSScriptingInterface.h} | 19 ++--
 libraries/audio-client/src/AudioClient.cpp    | 88 ++++++++++---------
 libraries/audio-client/src/AudioClient.h      |  3 +
 5 files changed, 111 insertions(+), 68 deletions(-)
 rename interface/src/scripting/{SpeechScriptingInterface.cpp => TTSScriptingInterface.cpp} (64%)
 rename interface/src/scripting/{SpeechScriptingInterface.h => TTSScriptingInterface.h} (79%)
diff --git a/interface/src/Application.cpp b/interface/src/Application.cpp
index 74532ef53a..728fea8c10 100644
--- a/interface/src/Application.cpp
+++ b/interface/src/Application.cpp
@@ -182,7 +182,7 @@
 #include "scripting/RatesScriptingInterface.h"
 #include "scripting/SelectionScriptingInterface.h"
 #include "scripting/WalletScriptingInterface.h"
-#include "scripting/SpeechScriptingInterface.h"
+#include "scripting/TTSScriptingInterface.h"
 #if defined(Q_OS_MAC) || defined(Q_OS_WIN)
 #include "SpeechRecognizer.h"
 #endif
@@ -944,7 +944,7 @@ bool setupEssentials(int& argc, char** argv, bool runningMarkerExisted) {
     DependencyManager::set<Ledger>();
     DependencyManager::set<Wallet>();
     DependencyManager::set<WalletScriptingInterface>();
-    DependencyManager::set<SpeechScriptingInterface>();
+    DependencyManager::set<TTSScriptingInterface>();
 
     DependencyManager::set<FadeEffect>();
 
@@ -1179,6 +1179,9 @@ Application::Application(int& argc, char** argv, QElapsedTimer& startupTimer, bo
         recording::Frame::registerFrameHandler(AudioConstants::getAudioFrameName(), [&audioIO](recording::Frame::ConstPointer frame) {
             audioIO->handleRecordedAudioInput(frame->data);
         });
+        
+        auto TTS = DependencyManager::get<TTSScriptingInterface>().data();
+        connect(TTS, &TTSScriptingInterface::ttsSampleCreated, audioIO, &AudioClient::handleTTSAudioInput);
 
         connect(audioIO, &AudioClient::inputReceived, [](const QByteArray& audio) {
             static auto recorder = DependencyManager::get<recording::Recorder>();
@@ -3129,7 +3132,7 @@ void Application::onDesktopRootContextCreated(QQmlContext* surfaceContext) {
     surfaceContext->setContextProperty("ContextOverlay", DependencyManager::get<ContextOverlayInterface>().data());
     surfaceContext->setContextProperty("Wallet", DependencyManager::get<WalletScriptingInterface>().data());
     surfaceContext->setContextProperty("HiFiAbout", AboutUtil::getInstance());
-    surfaceContext->setContextProperty("Speech", DependencyManager::get<SpeechScriptingInterface>().data());
+    surfaceContext->setContextProperty("TextToSpeech", DependencyManager::get<TTSScriptingInterface>().data());
 
     if (auto steamClient = PluginManager::getInstance()->getSteamClientPlugin()) {
         surfaceContext->setContextProperty("Steam", new SteamScriptingInterface(engine, steamClient.get()));
@@ -6800,7 +6803,7 @@ void Application::registerScriptEngineWithApplicationServices(ScriptEnginePointe
     scriptEngine->registerGlobalObject("Wallet", DependencyManager::get<WalletScriptingInterface>().data());
     scriptEngine->registerGlobalObject("AddressManager", DependencyManager::get<AddressManager>().data());
     scriptEngine->registerGlobalObject("HifiAbout", AboutUtil::getInstance());
-    scriptEngine->registerGlobalObject("Speech", DependencyManager::get<SpeechScriptingInterface>().data());
+    scriptEngine->registerGlobalObject("TextToSpeech", DependencyManager::get<TTSScriptingInterface>().data());
 
     qScriptRegisterMetaType(scriptEngine.data(), OverlayIDtoScriptValue, OverlayIDfromScriptValue);
 
diff --git a/interface/src/scripting/SpeechScriptingInterface.cpp b/interface/src/scripting/TTSScriptingInterface.cpp
similarity index 64%
rename from interface/src/scripting/SpeechScriptingInterface.cpp
rename to interface/src/scripting/TTSScriptingInterface.cpp
index b8e0f5c3e8..fdbb37e586 100644
--- a/interface/src/scripting/SpeechScriptingInterface.cpp
+++ b/interface/src/scripting/TTSScriptingInterface.cpp
@@ -1,6 +1,6 @@
 ﻿//
-//  SpeechScriptingInterface.cpp
-//  interface/src/scripting
+//  TTSScriptingInterface.cpp
+//  interface/src/scripting
 //
 //  Created by Zach Fox on 2018-10-10.
 //  Copyright 2018 High Fidelity, Inc.
@@ -9,10 +9,10 @@
 //  See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
 //
 
-#include "SpeechScriptingInterface.h"
+#include "TTSScriptingInterface.h"
 #include "avatar/AvatarManager.h"
 
-SpeechScriptingInterface::SpeechScriptingInterface() {
+TTSScriptingInterface::TTSScriptingInterface() {
     //
     // Create text to speech engine
     //
@@ -38,8 +38,7 @@ SpeechScriptingInterface::SpeechScriptingInterface() {
     }
 }
 
-SpeechScriptingInterface::~SpeechScriptingInterface() {
-
+TTSScriptingInterface::~TTSScriptingInterface() {
 }
 
 class ReleaseOnExit {
@@ -55,7 +54,28 @@ private:
     IUnknown* m_p;
 };
 
-void SpeechScriptingInterface::speakText(const QString& textToSpeak) {
+void TTSScriptingInterface::testTone(const bool& alsoInject) {
+    // Debug helper: fills _lastSoundByteArray with 240000 int16 samples of a sine tone (one cycle per 360 samples),
+    // emits it via ttsSampleCreated and, when alsoInject is true, also injects it at the avatar's position.
+    _lastSoundByteArray.resize(0);
+    _lastSoundByteArray.resize(480000);
+    // Write into the member buffer itself: it is the buffer that gets emitted and injected below.
+    int16_t* samples = reinterpret_cast<int16_t*>(_lastSoundByteArray.data());
+    for (int32_t a = 0; a < 240000; a++) {
+        // Scale by 32767 (INT16_MAX): 32768 does not fit in int16_t when the sine reaches 1.0.
+        samples[a] = (int16_t)(glm::sin(glm::radians((float)a)) * 32767);
+    }
+    emit ttsSampleCreated(_lastSoundByteArray);
+
+    if (alsoInject) {
+        AudioInjectorOptions options;
+        options.position = DependencyManager::get<AvatarManager>()->getMyAvatarPosition();
+
+        _lastSoundAudioInjector = AudioInjector::playSound(_lastSoundByteArray, options);
+    }
+}
+
+void TTSScriptingInterface::speakText(const QString& textToSpeak, const bool& alsoInject) {
     WAVEFORMATEX fmt;
     fmt.wFormatTag = WAVE_FORMAT_PCM;
     fmt.nSamplesPerSec = 24000;
@@ -92,9 +112,8 @@ void SpeechScriptingInterface::speakText(const QString& textToSpeak) {
     ReleaseOnExit rStream(pStream);
 
     ULONG streamNumber;
-    hr = m_tts->Speak(reinterpret_cast<LPCWSTR>(textToSpeak.utf16()),
-        SPF_IS_XML | SPF_ASYNC | SPF_PURGEBEFORESPEAK,
-        &streamNumber);
+    hr = m_tts->Speak(reinterpret_cast<LPCWSTR>(textToSpeak.utf16()), SPF_IS_XML | SPF_ASYNC | SPF_PURGEBEFORESPEAK,
+                      &streamNumber);
     if (FAILED(hr)) {
         qDebug() << "Speak failed.";
     }
@@ -124,14 +143,21 @@ void SpeechScriptingInterface::speakText(const QString& textToSpeak) {
         qDebug() << "Couldn't read from stream.";
     }
 
-    QByteArray byteArray = QByteArray::QByteArray(buf1, dwSize);
+    _lastSoundByteArray.resize(0);
+    _lastSoundByteArray.append(buf1, dwSize);
 
-    AudioInjectorOptions options;
-    options.position = DependencyManager::get<AvatarManager>()->getMyAvatarPosition();
+    emit ttsSampleCreated(_lastSoundByteArray);
 
-    lastSound = AudioInjector::playSound(byteArray, options);
+    if (alsoInject) {
+        AudioInjectorOptions options;
+        options.position = DependencyManager::get<AvatarManager>()->getMyAvatarPosition();
+
+        _lastSoundAudioInjector = AudioInjector::playSound(_lastSoundByteArray, options);
+    }
 }
 
-void SpeechScriptingInterface::stopLastSpeech() {
-    lastSound->stop();
+void TTSScriptingInterface::stopLastSpeech() {
+    if (_lastSoundAudioInjector) {  // only assigned when testTone/speakText ran with alsoInject == true
+        _lastSoundAudioInjector->stop();
+    }
+}
diff --git a/interface/src/scripting/SpeechScriptingInterface.h b/interface/src/scripting/TTSScriptingInterface.h
similarity index 79%
rename from interface/src/scripting/SpeechScriptingInterface.h
rename to interface/src/scripting/TTSScriptingInterface.h
index c683a1a3c6..cb9c6c8c3e 100644
--- a/interface/src/scripting/SpeechScriptingInterface.h
+++ b/interface/src/scripting/TTSScriptingInterface.h
@@ -1,5 +1,5 @@
-//  SpeechScriptingInterface.h
-//  interface/src/scripting
+//  TTSScriptingInterface.h
+//  interface/src/scripting
 //
 //  Created by Zach Fox on 2018-10-10.
 //  Copyright 2018 High Fidelity, Inc.
@@ -20,16 +20,20 @@
 #include <sphelper.h>  // SAPI Helper
 #include <AudioInjector.h>
 
-class SpeechScriptingInterface : public QObject, public Dependency {
+class TTSScriptingInterface : public QObject, public Dependency {
     Q_OBJECT
 
 public:
-    SpeechScriptingInterface();
-    ~SpeechScriptingInterface();
+    TTSScriptingInterface();
+    ~TTSScriptingInterface();
 
-    Q_INVOKABLE void speakText(const QString& textToSpeak);
+    Q_INVOKABLE void testTone(const bool& alsoInject = false);
+    Q_INVOKABLE void speakText(const QString& textToSpeak, const bool& alsoInject = false);
     Q_INVOKABLE void stopLastSpeech();
 
+signals:
+    void ttsSampleCreated(QByteArray outputArray);
+
 private:
     class CComAutoInit {
     public:
@@ -72,7 +76,8 @@ private:
     // Default voice token
     CComPtr<ISpObjectToken> m_voiceToken;
 
-    AudioInjectorPointer lastSound;
+    QByteArray _lastSoundByteArray;  // most recently generated audio bytes (speakText output or testTone)
+    AudioInjectorPointer _lastSoundAudioInjector;  // injector for that audio; set only when alsoInject was true
 };
 
 #endif // hifi_SpeechScriptingInterface_h
diff --git a/libraries/audio-client/src/AudioClient.cpp b/libraries/audio-client/src/AudioClient.cpp
index d00bc29054..96f1c97878 100644
--- a/libraries/audio-client/src/AudioClient.cpp
+++ b/libraries/audio-client/src/AudioClient.cpp
@@ -1135,6 +1135,46 @@ void AudioClient::handleAudioInput(QByteArray& audioBuffer) {
     }
 }
 
+void AudioClient::processAudioAndAddToRingBuffer(QByteArray& inputByteArray, const uchar& channelCount, const qint32& bytesForDuration) {
+    // Appends inputByteArray to _inputRingBuffer, then calls handleAudioInput once per network frame's worth of samples available.
+    const int inputSamplesRequired =
+        (_inputToNetworkResampler ? _inputToNetworkResampler->getMinInput(AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL)
+                                  : AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL) *
+        channelCount;  // input samples required to produce exactly NETWORK_FRAME_SAMPLES of output
+    // scratch buffer that receives inputSamplesRequired samples read back from the ring buffer on each loop iteration
+    const auto inputAudioSamples = std::unique_ptr<int16_t[]>(new int16_t[inputSamplesRequired]);
+
+    handleLocalEchoAndReverb(inputByteArray);
+
+    _inputRingBuffer.writeData(inputByteArray.data(), inputByteArray.size());
+    // bytesForDuration: bytes per msec of the incoming audio (callers pass _inputFormat.bytesForDuration(USECS_PER_MSEC) or 48)
+    float audioInputMsecsRead = inputByteArray.size() / (float)(bytesForDuration);
+    _stats.updateInputMsRead(audioInputMsecsRead);
+
+    const int numNetworkBytes =
+        _isStereoInput ? AudioConstants::NETWORK_FRAME_BYTES_STEREO : AudioConstants::NETWORK_FRAME_BYTES_PER_CHANNEL;
+    const int numNetworkSamples =
+        _isStereoInput ? AudioConstants::NETWORK_FRAME_SAMPLES_STEREO : AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL;
+
+    static int16_t networkAudioSamples[AudioConstants::NETWORK_FRAME_SAMPLES_STEREO];
+    // drain the ring buffer one frame at a time; every iteration ends by passing networkAudioSamples to handleAudioInput
+    while (_inputRingBuffer.samplesAvailable() >= inputSamplesRequired) {
+        if (_muted) {
+            _inputRingBuffer.shiftReadPosition(inputSamplesRequired);
+        } else {
+            _inputRingBuffer.readSamples(inputAudioSamples.get(), inputSamplesRequired);
+            possibleResampling(_inputToNetworkResampler, inputAudioSamples.get(), networkAudioSamples, inputSamplesRequired,
+                               numNetworkSamples, channelCount, _desiredInputFormat.channelCount());
+        }
+        int bytesInInputRingBuffer = _inputRingBuffer.samplesAvailable() * AudioConstants::SAMPLE_SIZE;
+        float msecsInInputRingBuffer = bytesInInputRingBuffer / (float)(_inputFormat.bytesForDuration(USECS_PER_MSEC));  // NOTE(review): uses _inputFormat, not the bytesForDuration parameter — confirm intended
+        _stats.updateInputMsUnplayed(msecsInInputRingBuffer);
+        // NOTE: in the _muted branch networkAudioSamples is not rewritten, so the static buffer still holds its earlier contents
+        QByteArray audioBuffer(reinterpret_cast<char*>(networkAudioSamples), numNetworkBytes);
+        handleAudioInput(audioBuffer);
+    }
+}
+
 void AudioClient::handleMicAudioInput() {
     if (!_inputDevice || _isPlayingBackRecording) {
         return;
@@ -1144,47 +1184,8 @@ void AudioClient::handleMicAudioInput() {
     _inputReadsSinceLastCheck++;
 #endif
 
-    // input samples required to produce exactly NETWORK_FRAME_SAMPLES of output
-    const int inputSamplesRequired = (_inputToNetworkResampler ?
-                                      _inputToNetworkResampler->getMinInput(AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL) :
-                                      AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL) * _inputFormat.channelCount();
-
-    const auto inputAudioSamples = std::unique_ptr<int16_t[]>(new int16_t[inputSamplesRequired]);
-    QByteArray inputByteArray = _inputDevice->readAll();
-
-    handleLocalEchoAndReverb(inputByteArray);
-
-    _inputRingBuffer.writeData(inputByteArray.data(), inputByteArray.size());
-
-    float audioInputMsecsRead = inputByteArray.size() / (float)(_inputFormat.bytesForDuration(USECS_PER_MSEC));
-    _stats.updateInputMsRead(audioInputMsecsRead);
-
-    const int numNetworkBytes = _isStereoInput
-        ? AudioConstants::NETWORK_FRAME_BYTES_STEREO
-        : AudioConstants::NETWORK_FRAME_BYTES_PER_CHANNEL;
-    const int numNetworkSamples = _isStereoInput
-        ? AudioConstants::NETWORK_FRAME_SAMPLES_STEREO
-        : AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL;
-
-    static int16_t networkAudioSamples[AudioConstants::NETWORK_FRAME_SAMPLES_STEREO];
-
-    while (_inputRingBuffer.samplesAvailable() >= inputSamplesRequired) {
-        if (_muted) {
-            _inputRingBuffer.shiftReadPosition(inputSamplesRequired);
-        } else {
-            _inputRingBuffer.readSamples(inputAudioSamples.get(), inputSamplesRequired);
-            possibleResampling(_inputToNetworkResampler,
-                inputAudioSamples.get(), networkAudioSamples,
-                inputSamplesRequired, numNetworkSamples,
-                _inputFormat.channelCount(), _desiredInputFormat.channelCount());
-        }
-        int bytesInInputRingBuffer = _inputRingBuffer.samplesAvailable() * AudioConstants::SAMPLE_SIZE;
-        float msecsInInputRingBuffer = bytesInInputRingBuffer / (float)(_inputFormat.bytesForDuration(USECS_PER_MSEC));
-        _stats.updateInputMsUnplayed(msecsInInputRingBuffer);
-
-        QByteArray audioBuffer(reinterpret_cast<char*>(networkAudioSamples), numNetworkBytes);
-        handleAudioInput(audioBuffer);
-    }
+    QByteArray inputByteArray = _inputDevice->readAll();  // named lvalue: a temporary cannot bind to the helper's QByteArray& parameter
+    processAudioAndAddToRingBuffer(inputByteArray, _inputFormat.channelCount(), _inputFormat.bytesForDuration(USECS_PER_MSEC));
 }
 
 void AudioClient::handleDummyAudioInput() {
@@ -1201,6 +1202,11 @@ void AudioClient::handleRecordedAudioInput(const QByteArray& audio) {
     handleAudioInput(audioBuffer);
 }
 
+void AudioClient::handleTTSAudioInput(const QByteArray& audio) {
+    QByteArray audioBuffer(audio);  // non-const copy: processAudioAndAddToRingBuffer takes a QByteArray&
+    processAudioAndAddToRingBuffer(audioBuffer, 1, 48);  // channelCount 1; 48 bytes per msec — presumably 24000 Hz 16-bit mono TTS output, TODO confirm
+}
+
 void AudioClient::prepareLocalAudioInjectors(std::unique_ptr<Lock> localAudioLock) {
     bool doSynchronously = localAudioLock.operator bool();
     if (!localAudioLock) {
diff --git a/libraries/audio-client/src/AudioClient.h b/libraries/audio-client/src/AudioClient.h
index 5e7f1fb8a0..170a355abe 100644
--- a/libraries/audio-client/src/AudioClient.h
+++ b/libraries/audio-client/src/AudioClient.h
@@ -197,6 +197,7 @@ public slots:
     void checkInputTimeout();
     void handleDummyAudioInput();
     void handleRecordedAudioInput(const QByteArray& audio);
+    void handleTTSAudioInput(const QByteArray& audio);
     void reset();
     void audioMixerKilled();
 
@@ -289,6 +290,8 @@ private:
     float azimuthForSource(const glm::vec3& relativePosition);
     float gainForSource(float distance, float volume);
 
+    void processAudioAndAddToRingBuffer(QByteArray& inputByteArray, const uchar& channelCount, const qint32& bytesForDuration);
+
 #ifdef Q_OS_ANDROID
     QTimer _checkInputTimer;
     long _inputReadsSinceLastCheck = 0l;