diff --git a/interface/src/scripting/SpeechScriptingInterface.cpp b/interface/src/scripting/SpeechScriptingInterface.cpp
index a38e1aa824..b9c7718075 100644
--- a/interface/src/scripting/SpeechScriptingInterface.cpp
+++ b/interface/src/scripting/SpeechScriptingInterface.cpp
@@ -19,8 +19,7 @@ SpeechScriptingInterface::SpeechScriptingInterface() {
     //
     HRESULT hr = m_tts.CoCreateInstance(CLSID_SpVoice);
     if (FAILED(hr)) {
-        ATLTRACE(TEXT("Text-to-speech creation failed.\n"));
-        AtlThrow(hr);
+        qDebug() << "Text-to-speech engine creation failed.";
     }
 
     //
@@ -28,8 +27,7 @@ SpeechScriptingInterface::SpeechScriptingInterface() {
     //
     hr = SpGetDefaultTokenFromCategoryId(SPCAT_VOICES, &m_voiceToken, FALSE);
     if (FAILED(hr)) {
-        ATLTRACE(TEXT("Can't get default voice token.\n"));
-        AtlThrow(hr);
+        qDebug() << "Can't get default voice token.";
     }
 
     //
@@ -37,28 +35,7 @@ SpeechScriptingInterface::SpeechScriptingInterface() {
     //
     hr = m_tts->SetVoice(m_voiceToken);
     if (FAILED(hr)) {
-        ATLTRACE(TEXT("Can't set default voice.\n"));
-        AtlThrow(hr);
-    }
-
-    WAVEFORMATEX fmt;
-    fmt.wFormatTag = WAVE_FORMAT_PCM;
-    fmt.nSamplesPerSec = 48000;
-    fmt.wBitsPerSample = 16;
-    fmt.nChannels = 1;
-    fmt.nBlockAlign = fmt.nChannels * fmt.wBitsPerSample / 8;
-    fmt.nAvgBytesPerSec = fmt.nSamplesPerSec * fmt.nBlockAlign;
-    fmt.cbSize = 0;
-
-    BYTE* pcontent = new BYTE[1024 * 1000];
-
-    cpIStream = SHCreateMemStream(NULL, 0);
-    hr = outputStream->SetBaseStream(cpIStream, SPDFID_WaveFormatEx, &fmt);
-
-    hr = m_tts->SetOutput(outputStream, true);
-    if (FAILED(hr)) {
-        ATLTRACE(TEXT("Can't set output stream.\n"));
-        AtlThrow(hr);
+        qDebug() << "Can't set default voice.";
     }
 }
 
@@ -66,30 +43,91 @@ SpeechScriptingInterface::~SpeechScriptingInterface() {
 
 }
 
+class ReleaseOnExit {
+public:
+    ReleaseOnExit(IUnknown* p) : m_p(p) {}
+    ~ReleaseOnExit() {
+        if (m_p) {
+            m_p->Release();
+        }
+    }
+
+private:
+    IUnknown* m_p;
+};
+
 void SpeechScriptingInterface::speakText(const QString& textToSpeak) {
+    WAVEFORMATEX fmt;
+    fmt.wFormatTag = WAVE_FORMAT_PCM;
+    fmt.nSamplesPerSec = 44100;
+    fmt.wBitsPerSample = 16;
+    fmt.nChannels = 1;
+    fmt.nBlockAlign = fmt.nChannels * fmt.wBitsPerSample / 8;
+    fmt.nAvgBytesPerSec = fmt.nSamplesPerSec * fmt.nBlockAlign;
+    fmt.cbSize = 0;
+
+    IStream* pStream = NULL;
+
+    ISpStream* pSpStream = nullptr;
+    HRESULT hr = CoCreateInstance(CLSID_SpStream, nullptr, CLSCTX_ALL, __uuidof(ISpStream), (void**)&pSpStream);
+    if (FAILED(hr)) {
+        qDebug() << "CoCreateInstance failed.";
+    }
+    ReleaseOnExit rSpStream(pSpStream);
+
+    pStream = SHCreateMemStream(NULL, 0);
+    if (nullptr == pStream) {
+        qDebug() << "SHCreateMemStream failed.";
+    }
+
+    hr = pSpStream->SetBaseStream(pStream, SPDFID_WaveFormatEx, &fmt);
+    if (FAILED(hr)) {
+        qDebug() << "Can't set base stream.";
+    }
+
+    hr = m_tts->SetOutput(pSpStream, true);
+    if (FAILED(hr)) {
+        qDebug() << "Can't set output stream.";
+    }
+
+    ReleaseOnExit rStream(pStream);
+
     ULONG streamNumber;
-    HRESULT hr = m_tts->Speak(reinterpret_cast<LPCWSTR>(textToSpeak.utf16()),
+    hr = m_tts->Speak(reinterpret_cast<LPCWSTR>(textToSpeak.utf16()),
         SPF_IS_NOT_XML | SPF_ASYNC | SPF_PURGEBEFORESPEAK,
         &streamNumber);
     if (FAILED(hr)) {
-        ATLTRACE(TEXT("Speak failed.\n"));
-        AtlThrow(hr);
+        qDebug() << "Speak failed.";
     }
 
     m_tts->WaitUntilDone(-1);
 
-    outputStream->GetBaseStream(&cpIStream);
+    hr = pSpStream->GetBaseStream(&pStream);
+    if (FAILED(hr)) {
+        qDebug() << "Couldn't get base stream.";
+    }
+
+    hr = IStream_Reset(pStream);
+    if (FAILED(hr)) {
+        qDebug() << "Couldn't reset stream.";
+    }
+
     ULARGE_INTEGER StreamSize;
     StreamSize.LowPart = 0;
-    hr = IStream_Size(cpIStream, &StreamSize);
+    hr = IStream_Size(pStream, &StreamSize);
 
     DWORD dwSize = StreamSize.QuadPart;
     char* buf1 = new char[dwSize + 1];
-    hr = IStream_Read(cpIStream, buf1, dwSize);
+    memset(buf1, 0, dwSize + 1);
+
+    hr = IStream_Read(pStream, buf1, dwSize);
+    if (FAILED(hr)) {
+        qDebug() << "Couldn't read from stream.";
+    }
+
+    QByteArray byteArray = QByteArray::QByteArray(buf1, dwSize);
 
-    QByteArray byteArray = QByteArray::QByteArray(buf1, (int)dwSize);
     AudioInjectorOptions options;
-
     options.position = DependencyManager::get<AvatarManager>()->getMyAvatarPosition();
 
     AudioInjector::playSound(byteArray, options);
diff --git a/interface/src/scripting/SpeechScriptingInterface.h b/interface/src/scripting/SpeechScriptingInterface.h
index 311bd80605..ad6777e339 100644
--- a/interface/src/scripting/SpeechScriptingInterface.h
+++ b/interface/src/scripting/SpeechScriptingInterface.h
@@ -13,7 +13,9 @@
 
 #include <QtCore/QObject>
 #include <DependencyManager.h>
-
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
 #include <sapi.h>      // SAPI
 #include <sphelper.h>  // SAPI Helper
 
@@ -68,9 +70,6 @@ private:
 
     // Default voice token
     CComPtr<ISpObjectToken> m_voiceToken;
-
-    CComPtr<ISpStream> outputStream;
-    CComPtr<IStream> cpIStream;
 };
 
 #endif // hifi_SpeechScriptingInterface_h