Merge pull request #3652 from ctrlaltdavid/19976

CR for Job #19976 - Microsoft Speech API: allow me to control my avatar with voice
2025-04-14 05:26:16 +02:00 · 2014-10-29 20:46:12 -07:00 · 2014-10-29 20:46:12 -07:00 · e314fa1fc3
commit e314fa1fc3
parent 6b4b0bc8a4 627dad88cb
6 changed files with 291 additions and 29 deletions
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@ -30,10 +30,10 @@ elseif (UNIX)
  # include the right GL headers for UNIX
  set(GL_HEADERS "#include <GL/gl.h>\n#include <GL/glut.h>\n#include <GL/glext.h>")
 elseif (WIN32)
-    add_definitions(-D_USE_MATH_DEFINES) # apparently needed to get M_PI and other defines from cmath/math.h
-    add_definitions(-DWINDOWS_LEAN_AND_MEAN) # needed to make sure windows doesn't go to crazy with its defines
+  add_definitions(-D_USE_MATH_DEFINES) # apparently needed to get M_PI and other defines from cmath/math.h
+  add_definitions(-DWINDOWS_LEAN_AND_MEAN) # needed to make sure windows doesn't go too crazy with its defines

-    set(GL_HEADERS "#include <windowshacks.h>\n#include <GL/glew.h>\n#include <GL/glut.h>\n#include <GL/wglew.h>")
+  set(GL_HEADERS "#include <windowshacks.h>\n#include <GL/glew.h>\n#include <GL/glut.h>\n#include <GL/wglew.h>")
 endif ()

 # set up the external glm library
@ -46,13 +46,19 @@ configure_file(InterfaceVersion.h.in "${PROJECT_BINARY_DIR}/includes/InterfaceVe
 # grab the implementation and header files from src dirs
 file(GLOB_RECURSE INTERFACE_SRCS "src/*.cpp" "src/*.h")

-# Add SpeechRecognizer if on OS X, otherwise remove
-if (APPLE)
-    file(GLOB INTERFACE_OBJCPP_SRCS "src/SpeechRecognizer.mm")
-    set(INTERFACE_SRCS ${INTERFACE_SRCS} ${INTERFACE_OBJCPP_SRCS})
+# Add SpeechRecognizer if on Windows or OS X, otherwise remove
+if (WIN32)
+  # Use the globbed SpeechRecognizer.cpp and SpeechRecognizer.h files as is.
+elseif (APPLE)
+  file(GLOB INTERFACE_OBJCPP_SRCS "src/SpeechRecognizer.mm")
+  set(INTERFACE_SRCS ${INTERFACE_SRCS} ${INTERFACE_OBJCPP_SRCS})
+  get_filename_component(SPEECHRECOGNIZER_CPP "src/SpeechRecognizer.cpp" ABSOLUTE)
+  list(REMOVE_ITEM INTERFACE_SRCS ${SPEECHRECOGNIZER_CPP})
 else ()
-    get_filename_component(SPEECHRECOGNIZER_H "src/SpeechRecognizer.h" ABSOLUTE)
-    list(REMOVE_ITEM INTERFACE_SRCS ${SPEECHRECOGNIZER_H})
+  get_filename_component(SPEECHRECOGNIZER_H "src/SpeechRecognizer.h" ABSOLUTE)
+  list(REMOVE_ITEM INTERFACE_SRCS ${SPEECHRECOGNIZER_H})
+  get_filename_component(SPEECHRECOGNIZER_CPP "src/SpeechRecognizer.cpp" ABSOLUTE)
+  list(REMOVE_ITEM INTERFACE_SRCS ${SPEECHRECOGNIZER_CPP})
 endif ()

 find_package(Qt5 COMPONENTS Gui Multimedia Network OpenGL Script Svg WebKitWidgets)
@ -185,20 +191,20 @@ target_link_libraries(
 add_definitions(-DQT_NO_BEARERMANAGEMENT)

 if (APPLE)
-    # link in required OS X frameworks and include the right GL headers
-    find_library(CoreAudio CoreAudio)
-    find_library(CoreFoundation CoreFoundation)
-    find_library(GLUT GLUT)
-    find_library(OpenGL OpenGL)
-    find_library(AppKit AppKit)
+  # link in required OS X frameworks and include the right GL headers
+  find_library(CoreAudio CoreAudio)
+  find_library(CoreFoundation CoreFoundation)
+  find_library(GLUT GLUT)
+  find_library(OpenGL OpenGL)
+  find_library(AppKit AppKit)

-    target_link_libraries(${TARGET_NAME} ${CoreAudio} ${CoreFoundation} ${GLUT} ${OpenGL} ${AppKit})
+  target_link_libraries(${TARGET_NAME} ${CoreAudio} ${CoreFoundation} ${GLUT} ${OpenGL} ${AppKit})
    
-    # install command for OS X bundle
-    INSTALL(TARGETS ${TARGET_NAME}
-        BUNDLE DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/install" COMPONENT Runtime
-        RUNTIME DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/install" COMPONENT Runtime
-    )
+  # install command for OS X bundle
+  INSTALL(TARGETS ${TARGET_NAME}
+    BUNDLE DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/install" COMPONENT Runtime
+    RUNTIME DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/install" COMPONENT Runtime
+  )
 else (APPLE)
  # copy the resources files beside the executable
  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
--- a/interface/src/Application.cpp
+++ b/interface/src/Application.cpp
@ -3864,7 +3864,7 @@ ScriptEngine* Application::loadScript(const QString& scriptFilename, bool isUser
    scriptEngine->registerGlobalObject("Camera", cameraScriptable);
    connect(scriptEngine, SIGNAL(finished(const QString&)), cameraScriptable, SLOT(deleteLater()));

-#ifdef Q_OS_MAC
+#if defined(Q_OS_MAC) || defined(Q_OS_WIN)
    scriptEngine->registerGlobalObject("SpeechRecognizer", Menu::getInstance()->getSpeechRecognizer());
 #endif

--- a/interface/src/Menu.cpp
+++ b/interface/src/Menu.cpp
@ -99,7 +99,7 @@ Menu::Menu() :
    _lodToolsDialog(NULL),
    _newLocationDialog(NULL),
    _userLocationsDialog(NULL),
-#ifdef Q_OS_MAC
+#if defined(Q_OS_MAC) || defined(Q_OS_WIN)
    _speechRecognizer(),
 #endif
    _maxVoxels(DEFAULT_MAX_VOXELS_PER_SYSTEM),
@ -225,7 +225,7 @@ Menu::Menu() :
    addActionToQMenuAndActionHash(toolsMenu, MenuOption::MetavoxelEditor, 0, this, SLOT(showMetavoxelEditor()));
    addActionToQMenuAndActionHash(toolsMenu, MenuOption::ScriptEditor,  Qt::ALT | Qt::Key_S, this, SLOT(showScriptEditor()));

-#ifdef Q_OS_MAC
+#if defined(Q_OS_MAC) || defined(Q_OS_WIN)
    QAction* speechRecognizerAction = addCheckableActionToQMenuAndActionHash(toolsMenu, MenuOption::ControlWithSpeech,
            Qt::CTRL | Qt::SHIFT | Qt::Key_C, _speechRecognizer.getEnabled(), &_speechRecognizer, SLOT(setEnabled(bool)));
    connect(&_speechRecognizer, SIGNAL(enabledUpdated(bool)), speechRecognizerAction, SLOT(setChecked(bool)));
@ -769,7 +769,7 @@ void Menu::loadSettings(QSettings* settings) {
                                         QStandardPaths::writableLocation(QStandardPaths::DesktopLocation)).toString();
    setScriptsLocation(settings->value("scriptsLocation", QString()).toString());

-#ifdef Q_OS_MAC
+#if defined(Q_OS_MAC) || defined(Q_OS_WIN)
    _speechRecognizer.setEnabled(settings->value("speechRecognitionEnabled", false).toBool());
 #endif

@ -835,7 +835,7 @@ void Menu::saveSettings(QSettings* settings) {
    settings->setValue("boundaryLevelAdjust", _boundaryLevelAdjust);
    settings->setValue("snapshotsLocation", _snapshotsLocation);
    settings->setValue("scriptsLocation", _scriptsLocation);
-#ifdef Q_OS_MAC
+#if defined(Q_OS_MAC) || defined(Q_OS_WIN)
    settings->setValue("speechRecognitionEnabled", _speechRecognizer.getEnabled());
 #endif
    settings->beginGroup("View Frustum Offset Camera");
--- a/interface/src/Menu.h
+++ b/interface/src/Menu.h
@ -23,7 +23,7 @@
 #include <MenuItemProperties.h>
 #include <OctreeConstants.h>

-#ifdef Q_OS_MAC
+#if defined(Q_OS_MAC) || defined(Q_OS_WIN)
 #include "SpeechRecognizer.h"
 #endif

@ -148,7 +148,7 @@ public:

    bool shouldRenderMesh(float largestDimension, float distanceToCamera);

-#ifdef Q_OS_MAC
+#if defined(Q_OS_MAC) || defined(Q_OS_WIN)
    SpeechRecognizer* getSpeechRecognizer() { return &_speechRecognizer; }
 #endif

@ -288,7 +288,7 @@ private:
    LodToolsDialog* _lodToolsDialog;
    QPointer<DataWebDialog> _newLocationDialog;
    QPointer<DataWebDialog> _userLocationsDialog;
-#ifdef Q_OS_MAC
+#if defined(Q_OS_MAC) || defined(Q_OS_WIN)
    SpeechRecognizer _speechRecognizer;
 #endif
    int _maxVoxels;
--- a/interface/src/SpeechRecognizer.cpp
+++ b/interface/src/SpeechRecognizer.cpp
@ -0,0 +1,237 @@
+//
+//  SpeechRecognizer.cpp
+//  interface/src
+//
+//  Created by David Rowe on 10/20/2014.
+//  Copyright 2014 High Fidelity, Inc.
+//
+//  Distributed under the Apache License, Version 2.0.
+//  See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
+//
+
+#include <QtGlobal>
+#include <QDebug>
+
+#include "SpeechRecognizer.h"
+
+#if defined(Q_OS_WIN)
+
+#include <sapi.h>
+
+// Constructs a disabled recognizer: initializes COM and creates the notifier whose activated()
+// signal is routed to notifyCommandRecognized().
+SpeechRecognizer::SpeechRecognizer() :
+    QObject(),
+    _enabled(false),
+    _commands(),
+    _comInitialized(false),
+    _speechRecognizer(NULL),
+    _speechRecognizerContext(NULL),
+    _speechRecognizerGrammar(NULL),
+    _commandRecognizedEvent(NULL),
+    _commandRecognizedNotifier(new QWinEventNotifier()) {
+
+    // Remember whether COM initialization succeeded so that the destructor can balance it with CoUninitialize().
+    _comInitialized = SUCCEEDED(::CoInitialize(NULL));
+
+    connect(_commandRecognizedNotifier, &QWinEventNotifier::activated, this, &SpeechRecognizer::notifyCommandRecognized);
+}
+
+// Stops event notification, releases the SAPI interfaces that are still held, frees the notifier
+// and finally uninitializes COM if the constructor initialized it.
+SpeechRecognizer::~SpeechRecognizer() {
+    // Stop watching the event handle before the recognition context is released, then free the
+    // notifier: it is allocated with new in the constructor and was previously never deleted.
+    if (_commandRecognizedNotifier) {
+        _commandRecognizedNotifier->setEnabled(false);
+        delete _commandRecognizedNotifier;
+        _commandRecognizedNotifier = NULL;
+    }
+
+    if (_speechRecognizerGrammar) {
+        static_cast<ISpRecoGrammar*>(_speechRecognizerGrammar)->Release();
+        _speechRecognizerGrammar = NULL;
+    }
+
+    // The context and recognizer are only held while enabled; setEnabled(false) releases them itself.
+    if (_enabled) {
+        static_cast<ISpRecoContext*>(_speechRecognizerContext)->Release();
+        _speechRecognizerContext = NULL;
+        static_cast<ISpRecognizer*>(_speechRecognizer)->Release();
+        _speechRecognizer = NULL;
+    }
+
+    // Uninitialize COM last, after every COM interface has been released.
+    if (_comInitialized) {
+        ::CoUninitialize();
+    }
+}
+
+// Converts the recognized command text to a QString and emits commandRecognized() with it.
+void SpeechRecognizer::handleCommandRecognized(const char* command) {
+    const QString recognizedCommand(command);
+    emit commandRecognized(recognizedCommand);
+}
+
+// Releases the COM interface held in a void* member, if any, and resets the member to NULL so that
+// it cannot be released a second time.
+template <typename Interface>
+static void releaseInterface(void*& pointer) {
+    if (pointer) {
+        static_cast<Interface*>(pointer)->Release();
+        pointer = NULL;
+    }
+}
+
+// Enables or disables speech recognition.
+//
+// Enabling creates a dedicated in-process recognizer fed from the default audio input, a recognition
+// context with Win32 event notification, and a grammar loaded with the current commands. If any step
+// fails, everything created so far is released and the recognizer stays disabled. Disabling stops
+// notification and releases the grammar, context and recognizer.
+//
+// Does nothing if the requested state is already current or COM could not be initialized. Otherwise
+// emits enabledUpdated() with the resulting state, which is false when enabling failed.
+void SpeechRecognizer::setEnabled(bool enabled) {
+    if (enabled == _enabled || !_comInitialized) {
+        return;
+    }
+
+    _enabled = enabled;
+
+    if (_enabled) {
+
+        // Set up dedicated recognizer instead of using shared Windows recognizer.
+        // - By default, shared recognizer's commands like "move left" override any added here.
+        // - Unless do SetGrammarState(SPGS_EXCLUSIVE) on shared recognizer but then non-Interface commands don't work at all.
+        // - With dedicated recognizer, user can choose whether to have Windows recognizer running in addition to Interface's.
+        HRESULT hr = CoCreateInstance(CLSID_SpInprocRecognizer, NULL, CLSCTX_ALL, IID_ISpRecognizer, &_speechRecognizer);
+
+        // Use the default audio input device as the recognizer's input.
+        ISpObjectTokenCategory* audioTokenCategory = NULL;
+        ISpObjectToken* audioToken = NULL;
+        if (SUCCEEDED(hr)) {
+            hr = CoCreateInstance(CLSID_SpObjectTokenCategory, NULL, CLSCTX_ALL, IID_ISpObjectTokenCategory,
+                reinterpret_cast<void**>(&audioTokenCategory));
+        }
+        if (SUCCEEDED(hr)) {
+            hr = audioTokenCategory->SetId(SPCAT_AUDIOIN, TRUE);
+        }
+        if (SUCCEEDED(hr)) {
+            WCHAR* tokenID = NULL;
+            hr = audioTokenCategory->GetDefaultTokenId(&tokenID);
+            if (SUCCEEDED(hr)) {
+                hr = CoCreateInstance(CLSID_SpObjectToken, NULL, CLSCTX_ALL, IID_ISpObjectToken,
+                    reinterpret_cast<void**>(&audioToken));
+                if (SUCCEEDED(hr)) {
+                    hr = audioToken->SetId(NULL, tokenID, FALSE);
+                }
+                ::CoTaskMemFree(tokenID);
+            }
+        }
+        if (SUCCEEDED(hr)) {
+            hr = static_cast<ISpRecognizer*>(_speechRecognizer)->SetInput(audioToken, TRUE);
+        }
+        // The local token references are no longer needed whether or not set-up succeeded.
+        if (audioToken) {
+            audioToken->Release();
+        }
+        if (audioTokenCategory) {
+            audioTokenCategory->Release();
+        }
+
+        if (SUCCEEDED(hr)) {
+            hr = static_cast<ISpRecognizer*>(_speechRecognizer)
+                ->CreateRecoContext(reinterpret_cast<ISpRecoContext**>(&_speechRecognizerContext));
+        }
+
+        // Set up event notification mechanism.
+        if (SUCCEEDED(hr)) {
+            hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)->SetNotifyWin32Event();
+        }
+        if (SUCCEEDED(hr)) {
+            _commandRecognizedEvent = static_cast<ISpRecoContext*>(_speechRecognizerContext)->GetNotifyEventHandle();
+            if (_commandRecognizedEvent) {
+                _commandRecognizedNotifier->setHandle(_commandRecognizedEvent);
+                _commandRecognizedNotifier->setEnabled(true);
+            } else {
+                // S_FALSE is a success code and would pass SUCCEEDED(); report a real failure.
+                hr = E_FAIL;
+            }
+        }
+
+        // Set which events to be notified of.
+        if (SUCCEEDED(hr)) {
+            hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)
+                ->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION));
+        }
+
+        // Create grammar and load commands.
+        if (SUCCEEDED(hr)) {
+            hr = static_cast<ISpRecoContext*>(_speechRecognizerContext)
+                ->CreateGrammar(NULL, reinterpret_cast<ISpRecoGrammar**>(&_speechRecognizerGrammar));
+        }
+        if (SUCCEEDED(hr)) {
+            reloadCommands();
+        }
+
+        // On failure, undo whatever part of the set-up succeeded so that nothing leaks or dangles.
+        if (FAILED(hr)) {
+            _commandRecognizedNotifier->setEnabled(false);
+            _commandRecognizedEvent = NULL;
+            releaseInterface<ISpRecoGrammar>(_speechRecognizerGrammar);
+            releaseInterface<ISpRecoContext>(_speechRecognizerContext);
+            releaseInterface<ISpRecognizer>(_speechRecognizer);
+        }
+
+        _enabled = SUCCEEDED(hr);
+
+        qDebug() << "Speech recognition" << (_enabled ? "enabled" : "enable failed");
+
+    } else {
+        _commandRecognizedNotifier->setEnabled(false);
+        _commandRecognizedEvent = NULL;
+        // Release the grammar too; enabling again creates a new one and would otherwise leak this one.
+        releaseInterface<ISpRecoGrammar>(_speechRecognizerGrammar);
+        releaseInterface<ISpRecoContext>(_speechRecognizerContext);
+        releaseInterface<ISpRecognizer>(_speechRecognizer);
+        qDebug() << "Speech recognition disabled";
+    }
+
+    emit enabledUpdated(_enabled);
+}
+
+// Adds a command to the set of recognized commands, then reloads the commands.
+void SpeechRecognizer::addCommand(const QString& command) {
+    _commands << command;
+    reloadCommands();
+}
+
+// Removes a command from the set of recognized commands, then reloads the commands.
+void SpeechRecognizer::removeCommand(const QString& command) {
+    _commands -= command;
+    reloadCommands();
+}
+
+// Rebuilds the recognition grammar from the current command set.
+//
+// Does nothing while recognition is disabled. Otherwise the context is paused, the grammar is reset,
+// one top-level dynamic rule is added per command, the grammar is committed, and the context is
+// resumed. An empty command set still resets and commits the grammar so that a command that has just
+// been removed stops being recognized. Failures are logged; the function reports nothing to its caller.
+void SpeechRecognizer::reloadCommands() {
+    if (!_enabled) {
+        return;
+    }
+
+    ISpRecoContext* context = static_cast<ISpRecoContext*>(_speechRecognizerContext);
+    ISpRecoGrammar* grammar = static_cast<ISpRecoGrammar*>(_speechRecognizerGrammar);
+    if (!context || !grammar) {
+        return;
+    }
+
+    HRESULT hr = context->Pause(NULL);
+    const bool paused = SUCCEEDED(hr);
+
+    if (SUCCEEDED(hr)) {
+        WORD langId = MAKELANGID(LANG_NEUTRAL, SUBLANG_NEUTRAL);
+        hr = grammar->ResetGrammar(langId);
+    }
+
+    // Each command gets its own rule, identified by a 1-based rule ID. Stop at the first failure.
+    DWORD ruleID = 0;
+    for (QSet<QString>::const_iterator iter = _commands.constBegin();
+            SUCCEEDED(hr) && iter != _commands.constEnd(); ++iter) {
+        ruleID += 1;
+
+        SPSTATEHANDLE initialState = NULL;
+        hr = grammar->GetRule(NULL, ruleID, SPRAF_TopLevel | SPRAF_Active | SPRAF_Dynamic, TRUE, &initialState);
+
+        if (SUCCEEDED(hr)) {
+            const std::wstring command = (*iter).toStdWString();
+            hr = grammar->AddWordTransition(initialState, NULL, command.c_str(), L" ", SPWT_LEXICAL, 1.0, NULL);
+        }
+    }
+
+    if (SUCCEEDED(hr)) {
+        hr = grammar->Commit(NULL);
+    }
+
+    // Always undo a successful Pause(), even if a later step failed, so that the context is not left
+    // paused. Keep the first failure code for reporting.
+    if (paused) {
+        const HRESULT resumeResult = context->Resume(NULL);
+        if (SUCCEEDED(hr)) {
+            hr = resumeResult;
+        }
+    }
+
+    // There are no rules to activate when the command set is empty.
+    if (SUCCEEDED(hr) && !_commands.isEmpty()) {
+        hr = grammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
+    }
+
+    if (FAILED(hr)) {
+        qDebug() << "ERROR: Didn't successfully reload speech commands";
+    }
+}
+
+// Slot connected to the notifier's activated() signal. Drains the recognition context's event queue
+// and passes the text of every recognition event to handleCommandRecognized().
+//
+// handle: the value delivered with the notifier's signal; not used here.
+void SpeechRecognizer::notifyCommandRecognized(void* handle) {
+    SPEVENT eventItem;
+    memset(&eventItem, 0, sizeof(SPEVENT));
+
+    // Fetch events one at a time until the queue is empty; GetEvents() returns S_OK only when it
+    // actually fetched the requested event. Several events may be queued behind one notification.
+    // _enabled and the context are re-checked on every pass because handling a command emits a
+    // signal, and a receiver could disable recognition (releasing the context) in response.
+    while (_enabled && _speechRecognizerContext
+            && static_cast<ISpRecoContext*>(_speechRecognizerContext)->GetEvents(1, &eventItem, NULL) == S_OK) {
+
+        if (eventItem.eEventId == SPEI_RECOGNITION && eventItem.elParamType == SPET_LPARAM_IS_OBJECT
+                && eventItem.lParam) {
+            ISpRecoResult* recognitionResult = reinterpret_cast<ISpRecoResult*>(eventItem.lParam);
+            wchar_t* pText = NULL;
+
+            HRESULT hr = recognitionResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE, &pText, NULL);
+
+            if (SUCCEEDED(hr)) {
+                QString text = QString::fromWCharArray(pText);
+                // Free the SAPI-allocated text before emitting so it is released whatever the receivers do.
+                ::CoTaskMemFree(pText);
+                handleCommandRecognized(text.toStdString().c_str());
+            }
+
+            recognitionResult->Release();
+        }
+
+        memset(&eventItem, 0, sizeof(SPEVENT));
+    }
+}
+
+#endif // defined(Q_OS_WIN)
--- a/interface/src/SpeechRecognizer.h
+++ b/interface/src/SpeechRecognizer.h
@ -16,6 +16,10 @@
 #include <QSet>
 #include <QString>

+#if defined(Q_OS_WIN)
+#include <QWinEventNotifier>
+#endif
+
 class SpeechRecognizer : public QObject {
    Q_OBJECT
 public:
@ -40,8 +44,23 @@ protected:
 private:
    bool _enabled;
    QSet<QString> _commands;
+#if defined(Q_OS_MAC)
    void* _speechRecognizerDelegate;
    void* _speechRecognizer;
+#elif defined(Q_OS_WIN)
+    bool _comInitialized;
+    // Use void* instead of ATL CComPtr<> for speech recognizer in order to avoid linker errors with Visual Studio Express.
+    void* _speechRecognizer;
+    void* _speechRecognizerContext;
+    void* _speechRecognizerGrammar;
+    void* _commandRecognizedEvent;
+    QWinEventNotifier* _commandRecognizedNotifier;
+#endif
+
+#if defined(Q_OS_WIN)
+private slots:
+    void notifyCommandRecognized(void* handle);
+#endif
 };

 #endif // hifi_SpeechRecognizer_h