mmir-plugin-speech-io

Plugin for the MMIR framework that adds state-machines for managing speech input/output states

NOTE: currently this plugin requires a webpack build process (see mmir-webpack).

Configuration

For including the plugin in the mmir webpack build:

//...
const mmirAppConfig = {
  includePlugins: [
    {id: 'mmir-plugin-speech-io', config: {
      //optional configuration for the plugin:
      alternativeResults: 5,
      longPause: true,
      command: {
        languageModel: 'dictation',
        alternativeResults: 2
      }
    }}
  ],
  //...
};

//apply the plugin configuration in the mmir webpack build (e.g. in webpack.config.js):
const webpack = require('webpack');
module.exports = function(webpackConfig, _options){
  try{
    require('mmir-webpack')(webpack, webpackConfig, mmirAppConfig);
  } catch(err){
    console.log(err);
    throw err;
  }
  return webpackConfig;
}

Configuration values (excerpt of the plugin's TypeScript configuration interface):


export interface SpeechIoPluginConfigurationEntry {
  /**
   * disable stopping TTS output when starting ASR (microphone) input?
   *
   * (by default: do stop TTS before starting ASR/microphone input)
   *
   * @default false
   */
  disableCancelPrompt?: boolean;
  /**
   * number of (max.) alternative ASR results
   * @default 1
   */
  alternativeResults?: number;
  /**
   * speech mode / language model that should be used for ASR
   *
   * @default speechMode === 'dictation'? 'dictation' : 'search'
   */
  languageModel?: 'dictation' | 'search';
  /**
   * EOS (End Of Speech) detection pause:
   * require a long pause (instead of only a short pause) to detect end-of-speech?
   *
   * @default false
   */
  longPause?: boolean;

  /**
   * disable improved ASR feedback (see documentation of {@link mmir.MediaManager.startRecord})
   *
   * (NOTE: will be ignored if not supported by active ASR engine)
   *
   * @default speechInputMode === 'command'
   */
  disableImprovedFeedback?: boolean;

  /**
   * enable/disable receiving interim ASR results depending on speech mode
   *
   * @default speechInputMode === 'dictation'
   */
  enableInterimResults?: boolean;

  /**
   * Flag that indicates if end-of-speech (EOS) detection will be used
   * for speech recognition.
   *
   * If enabled, the recognition will be stopped after EOS was detected
   * (e.g. upon a pause after dictating a sentence).
   *
   * @default false
   */
  eos?: boolean;

  /**
   * specific configuration values for the active speech-mode ('dictation' or 'command'):
   * override general configuration values for the plugin (see configuration options above)
   */
  dictation?: {...};
  command?: {...};

  /**
   * The execution context/name (see {@link mmir.MediaManager.setDefaultCtx}) for the
   * recognition functions (i.e. `mmir.media.recognize()` | `mmir.media.startRecord()` | `mmir.media.stopRecord()` | `mmir.media.cancelRecognition()`).
   *
   * @default undefined (i.e. use default context)
   */
  asrEngine?: string;

  /**
   * The execution context/name (see {@link mmir.MediaManager.setDefaultCtx}) for the
   * speech synthesis functions (i.e. `mmir.media.tts()` | `mmir.media.cancelSpeech()`).
   *
   * @default undefined (i.e. use default context)
   */
  ttsEngine?: string;

  /**
   * Custom / default options for ASR (speech recognition):
   * note that configuration values of [[SpeechIoPluginConfigurationEntry]] supersede these default options.
   *
   * NOTE: if specified, the same default options are used regardless of the current language setting (see {@link mmir.LanguageManager#getLanguage})
   *
   * @default undefined
   */
  asrDefaultOptions?: Partial<ASROptions>;
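
  // Illustrative sketch (not part of the interface): language-independent
  // default ASR options; the option name/value below is an assumption and
  // depends on the active recognition engine:
  //
  //   asrDefaultOptions: {results: 3}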

  /**
   * Custom / default options for TTS (speech synthesis):
   * note: the `language` or `voice` options should not be set with this!
   *
   * (i.e. this should only be used for (custom) options that are independent of the language setting)
   *
   * NOTE: if **not** specified per language, the default options should not contain any language-dependent settings (e.g. like `voice`)
   *
   * @default undefined
   */
  ttsDefaultOptions?: Partial<TTSOptions> | {[languageCode: string]: Partial<TTSOptions>};
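
  // Illustrative sketch (not part of the interface): either one shared options
  // object, or a map from language code to options; the option names/values
  // below are assumptions and depend on the active TTS engine:
  //
  //   ttsDefaultOptions: {pauseDuration: 500}
  //   // or per language (language-dependent settings like `voice` should
  //   // only be used in the per-language variant):
  //   ttsDefaultOptions: {
  //     en: {voice: 'some-english-voice'},
  //     de: {voice: 'some-german-voice'}
  //   }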

  /**
   * During active speech-input in 'dictation' mode:
   * if the stop word is detected as a single input/sentence, speech-input for the input-control will be stopped.
   *
   * The canceling will only be applied if it matches the whole input/sentence, i.e.:
   * <pre>
   * isStopCommand("some sentence <stop word>") -> false
   * isStopCommand(" <stop word> ") -> true
   * </pre>
   *
   * Can be either set with a string, or an object/dictionary that maps a
   * language ID to the stop-command.
   * <pre>
   * var stopCmd = {
   *   de: 'anhalten',
   *   en: 'stop'
   * }
   * </pre>
   *
   * NOTE: can be set at runtime with:
   * <pre>
   * voiceUiService.ctrl.speechIn.setDictationCommand(dictStopWord, dictAbortWord);
   * </pre>
   * @default ""
   */
  dictStopWord?: string | {[languageId: string]: string};

  /**
   * Will only work, if `dictStopWord` is also set!
   *
   * During active speech-input in 'dictation' mode:
   * if the abort word is detected as a single input/sentence, speech-input for the
   * input-control will be aborted and its text reverted to its previous state
   * (i.e. before dictation was started for the input-control).
   *
   * The aborting will only be applied if it matches the whole input/sentence, i.e.:
   * <pre>
   * isAbortCommand("some sentence <abort word>") -> false
   * isAbortCommand(" <abort word> ") -> true
   * </pre>
   *
   * Can be either set with a string, or an object/dictionary that maps a
   * language ID to the abort-command.
   * <pre>
   * var abortCmd = {
   *   de: 'rückgängig',
   *   en: 'undo'
   * }
   * </pre>
   *
   * NOTE: can be set at runtime with:
   * <pre>
   * voiceUiService.ctrl.speechIn.setDictationCommand(dictStopWord, dictAbortWord);
   * </pre>
   *
   * NOTE IMPORTANT:
   * currently this feature requires that the original text, which will be reverted
   * to, is set manually on the text-element's `dataset` under the key `original-text`
   * (e.g. when starting dictation)!
   *
   * @default ""
   */
  dictAbortWord?: string | {[languageId: string]: string};
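
  // Illustrative sketch (not part of the interface): storing the revert-text for
  // `dictAbortWord` on a (hypothetical) input element when dictation is started;
  // the DOM property `dataset.originalText` corresponds to the dataset key `original-text`:
  //
  //   inputElement.dataset.originalText = inputElement.value;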

  /**
   * disable visual feedback for unstable dictation input in "pure text" input controls
   *
   * @default false
   */
  disableUnstableFeedback?: boolean;

  /**
   * enable/disable sound feedback for click/touch interactions
   *
   * @default true
   */
  soundFeedbackEnabled?: boolean;

  /**
   * enable/disable haptic feedback for click/touch interactions
   *
   * NOTE: haptic feedback (vibration) may not be supported for all execution
   *       environments/devices (will be ignored, if not supported)
   *
   * @default true
   */
  hapticFeedbackEnabled?: boolean;

  /**
   * print additional debug (console) output for speech I/O state-machine
   * @default false
   */
  showVuiDebugOutput?: boolean;

  /**
   * enable "barge in" during speech prompt (TTS / speech synthesis output):
   *
   * by default, speech input (i.e. recognition) will be disabled during speech synthesis
   * in order to prevent recording the synthesized speech.
   *
   * If echo-cancellation is used when recording the speech, or the synthesis is output through headphones
   * or similar, "barge in" can be enabled:
   * in this case the microphone stays open (i.e. speech recognition stays active) during speech synthesis.
   *
   * @default false
   */
  enableBargeIn?: boolean;

  /**
   * if enabled:
   * when speech-mode 'command' and 'guided-input' are active and a new view is entered
   * -> start input for the first "input-control", if auto-proceed is active
   *
   * NOTE 'guided-input' mode is not implemented yet
   *
   * @default false
   */
  inputCtrlAutoProceed?: boolean;

  /**
   * if enabled:
   * the `mmir-service` will not raise an 'init' event upon initialization on the `mmir.dialog` instance.
   *
   * Otherwise, the `mmir-service` will raise an 'init' event with event data:
   * ```
   * {
   *  appConfig: IAppSettings,
   *  mmir: ExtMmirModule<CmdImpl>,
   *  emma: EmmaUtil<CmdImpl>
   * }
   * ```
   * This event and its data can be used in the `dialog.xml` state definition's initial state by
   * defining a transition for the event `init` (see example).
   *
   * @default false
   * @example
   * <scxml xmlns="http://www.w3.org/2005/07/scxml" version="1.0"
   *        profile="ecmascript" id="scxmlRoot" initial="AppStart">
   * <state id="AppStart">
   *   <!-- transition for init-event, which in this example will trigger state-change to "MainApp" -->
   *   <transition event="init" target="MainApp">
   *    <script>
   *      // get event data contents:
   *      var appConfig = _event.data.appConfig;
   *      var mmir = _event.data.mmir;
   *      var emmaUtil = _event.data.emma;
   *      //... use them somehow (e.g. could be stored in data model variable)
   *    </script>
   *   </transition>
   * </state>
   * </scxml>
   */
  preventDialogManagerInit?: boolean;

  /**
   * if a prompt is active (i.e. TTS is currently playing) when a new one is requested:
   * cancel the current/active prompt (and read the new one)?
   *
   * If `false`, the new prompt may be discarded, or may cancel/replace the active one,
   * depending on the `ReadOptions` of the new prompt.
   *
   * @default true
   */
  cancelOnNewPrompt?: boolean;
}
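
For illustration, a configuration sketch that combines several of the options described above (the values are examples, not recommendations):

//...
const mmirAppConfig = {
  includePlugins: [
    {id: 'mmir-plugin-speech-io', config: {
      alternativeResults: 5,
      //stop/abort commands for dictation mode, per language:
      dictStopWord: {de: 'anhalten', en: 'stop'},
      dictAbortWord: {de: 'rückgängig', en: 'undo'},
      //cancel an active prompt when a new one is requested:
      cancelOnNewPrompt: true,
      //mode-specific overrides for 'command' mode:
      command: {
        languageModel: 'search',
        alternativeResults: 2
      }
    }}
  ],
  //...
};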

Example Usage

speechCommand

speechCommand will be triggered if speech-input is in command mode and a (stable) ASR result becomes available

  1. register for speechCommand

    ...
    vuiCtrl.ctrl.enterView(mmirService.speechEvents.speechCommand.subscribe(result => this.evalSemantics(result)));
    
  2. parse the ASR result in speechCommand, create an appropriate interpretation, and trigger commandAction (consuming commandAction is sketched in step 3 below)

    public evalSemantics(emma: RecognitionEmma){
    
      const asrResult = this.mmir.emma._extractAsrData(emma);
      const text = asrResult.text;
    
      this.mmir.semantic.interpret(text, null, result => {
    
        let semantic: any;
        if(result.semantic != null) {
          semantic = result.semantic;
          semantic.phrase = text;
          if(this._debugMsg) console.log("semantic: ", result.semantic);//DEBUG
        }
        else {
    
          //create "no-match" semantic-object:
          semantic = {
            "NoMatch": {
              "phrase": text
            }
          };
        }
    
        this.mmir.emma.setSpeechUnderstanding(emma, semantic);
    
        // will trigger/emit commandAction:
        this.mmir.speechioInput.raise("speech",  semantic);
    
      });
    
    }
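
  3. the raise("speech", ...) call will emit the commandAction speech event; a minimal sketch for consuming it, assuming it can be subscribed to in the same way as speechCommand:

    mmirService.speechEvents.commandAction.subscribe(action => {
      //dispatch the interpreted command to the application logic
      //(handleCommandAction is a hypothetical application function):
      handleCommandAction(action);
    });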