// // TTSScriptingInterface.cpp // libraries/audio-client/src/scripting // // Created by Zach Fox on 2018-10-10. // Copyright 2018 High Fidelity, Inc. // // Distributed under the Apache License, Version 2.0. // See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html // #include "TTSScriptingInterface.h" #include "avatar/AvatarManager.h" TTSScriptingInterface::TTSScriptingInterface() { // // Create text to speech engine // HRESULT hr = m_tts.CoCreateInstance(CLSID_SpVoice); if (FAILED(hr)) { qDebug() << "Text-to-speech engine creation failed."; } // // Get token corresponding to default voice // hr = SpGetDefaultTokenFromCategoryId(SPCAT_VOICES, &m_voiceToken, FALSE); if (FAILED(hr)) { qDebug() << "Can't get default voice token."; } // // Set default voice // hr = m_tts->SetVoice(m_voiceToken); if (FAILED(hr)) { qDebug() << "Can't set default voice."; } } TTSScriptingInterface::~TTSScriptingInterface() { } class ReleaseOnExit { public: ReleaseOnExit(IUnknown* p) : m_p(p) {} ~ReleaseOnExit() { if (m_p) { m_p->Release(); } } private: IUnknown* m_p; }; void TTSScriptingInterface::testTone(const bool& alsoInject) { QByteArray byteArray(480000, 0); _lastSoundByteArray.resize(0); _lastSoundByteArray.resize(480000); int32_t a = 0; int16_t* samples = reinterpret_cast(byteArray.data()); for (a = 0; a < 240000; a++) { int16_t temp = (glm::sin(glm::radians((float)a))) * 32768; samples[a] = temp; } emit ttsSampleCreated(_lastSoundByteArray, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * 50, 96); if (alsoInject) { AudioInjectorOptions options; options.position = DependencyManager::get()->getMyAvatarPosition(); _lastSoundAudioInjector = AudioInjector::playSound(_lastSoundByteArray, options); } } void TTSScriptingInterface::speakText(const QString& textToSpeak, const int& newChunkSize, const int& timerInterval, const int& sampleRate, const int& bitsPerSample, const bool& alsoInject) { WAVEFORMATEX fmt; fmt.wFormatTag = WAVE_FORMAT_PCM; fmt.nSamplesPerSec = sampleRate; fmt.wBitsPerSample = bitsPerSample; fmt.nChannels = 1; fmt.nBlockAlign = fmt.nChannels * fmt.wBitsPerSample / 8; fmt.nAvgBytesPerSec = fmt.nSamplesPerSec * fmt.nBlockAlign; fmt.cbSize = 0; IStream* pStream = NULL; ISpStream* pSpStream = nullptr; HRESULT hr = CoCreateInstance(CLSID_SpStream, nullptr, CLSCTX_ALL, __uuidof(ISpStream), (void**)&pSpStream); if (FAILED(hr)) { qDebug() << "CoCreateInstance failed."; } ReleaseOnExit rSpStream(pSpStream); pStream = SHCreateMemStream(NULL, 0); if (nullptr == pStream) { qDebug() << "SHCreateMemStream failed."; } hr = pSpStream->SetBaseStream(pStream, SPDFID_WaveFormatEx, &fmt); if (FAILED(hr)) { qDebug() << "Can't set base stream."; } hr = m_tts->SetOutput(pSpStream, true); if (FAILED(hr)) { qDebug() << "Can't set output stream."; } ReleaseOnExit rStream(pStream); ULONG streamNumber; hr = m_tts->Speak(reinterpret_cast(textToSpeak.utf16()), SPF_IS_XML | SPF_ASYNC | SPF_PURGEBEFORESPEAK, &streamNumber); if (FAILED(hr)) { qDebug() << "Speak failed."; } m_tts->WaitUntilDone(-1); hr = pSpStream->GetBaseStream(&pStream); if (FAILED(hr)) { qDebug() << "Couldn't get base stream."; } hr = IStream_Reset(pStream); if (FAILED(hr)) { qDebug() << "Couldn't reset stream."; } ULARGE_INTEGER StreamSize; StreamSize.LowPart = 0; hr = IStream_Size(pStream, &StreamSize); DWORD dwSize = StreamSize.QuadPart; char* buf1 = new char[dwSize + 1]; memset(buf1, 0, dwSize + 1); hr = IStream_Read(pStream, buf1, dwSize); if (FAILED(hr)) { qDebug() << "Couldn't read from stream."; } _lastSoundByteArray.resize(0); _lastSoundByteArray.append(buf1, dwSize); emit ttsSampleCreated(_lastSoundByteArray, newChunkSize, timerInterval); if (alsoInject) { AudioInjectorOptions options; options.position = DependencyManager::get()->getMyAvatarPosition(); _lastSoundAudioInjector = AudioInjector::playSound(_lastSoundByteArray, options); } } void TTSScriptingInterface::stopLastSpeech() { if (_lastSoundAudioInjector) { _lastSoundAudioInjector->stop(); } emit clearTTSBuffer(); }