Skip to main content

sapi_lite/stt/
mod.rs

1//! Speech recognition API.
2//!
3//! ## Recognizer
4//!
5//! The entry point for speech recognition is the [`Recognizer`], which encapsulates an in-process
6//! speech recognition engine. You generally won't need more than one instance of the recognizer.
7//!
8//! ## Context
9//!
10//! The recognizer can have one or more recognition contexts. This module provides two variants of
11//! contexts:
12//! * [`SyncContext`] will block the current thread until the engine recognizes a phrase, or until
13//! the given timeout.
14//! * [`EventfulContext`] will call the supplied event handler whenever the engine recognizes a
15//! phrase.
16//!
17//! For asynchronous recognition, see the [`tokio`](crate::tokio) module.
18//!
19//! ## Grammar
20//!
21//! Each context can have one or more grammars loaded into it. A grammar consists of one or more
22//! rules that define what phrases the engine can recognize. You can enable or disable the whole
23//! grammar, or individual rules in it by their name.
24
25use std::sync::{Arc, Mutex};
26
27use windows as Windows;
28use Windows::core::IUnknown;
29use Windows::Win32::Media::Speech::{
30    ISpRecognizer, SpInprocRecognizer, SPRECOSTATE, SPRST_ACTIVE, SPRST_INACTIVE,
31};
32use Windows::Win32::System::Com::{CoCreateInstance, CLSCTX_ALL};
33
34use crate::audio::AudioStream;
35use crate::com_util::Intf;
36use crate::token::Category;
37use crate::Result;
38
39mod context;
40mod grammar;
41mod phrase;
42mod semantics;
43
44pub use context::{Context, EventHandler, EventfulContext, SyncContext};
45pub use grammar::{Grammar, GrammarBuilder, RepeatRange, Rule, RuleArena};
46pub use phrase::Phrase;
47pub use semantics::{SemanticString, SemanticTree, SemanticValue};
48
/// Specifies where the input for speech recognition should come from.
///
/// Pass a value of this type to [`Recognizer::set_input`] to select the audio source.
pub enum RecognitionInput {
    /// Listen to the default recording device on the system.
    ///
    /// The device is looked up as the default token of the
    /// `HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\AudioInput` category.
    Default,
    /// Read audio from the given stream.
    Stream(AudioStream),
}
56
57impl RecognitionInput {
58    fn to_sapi(self) -> Result<IUnknown> {
59        Ok(match self {
60            Self::Default => {
61                Category::new(r"HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech\AudioInput")?
62                    .default_token()?
63                    .to_sapi()
64                    .0
65            }
66            Self::Stream(stream) => stream.to_sapi().0,
67        })
68    }
69}
70
/// The in-process speech recognition engine.
///
/// Create one with [`Recognizer::new`]; recognition can then be switched on and off with
/// [`Recognizer::set_enabled`] and pointed at another audio source with
/// [`Recognizer::set_input`].
pub struct Recognizer {
    /// The underlying SAPI recognizer interface.
    intf: Intf<ISpRecognizer>,
    /// Hands out scoped pauses of the engine; built from a clone of `intf`.
    pauser: RecognitionPauser,
    /// Pause guard taken by `set_enabled(false)`. `None` means recognition has not been
    /// disabled through `set_enabled`.
    global_pause: Mutex<Option<ScopedPause>>,
}
77
78impl Recognizer {
79    /// Creates a new recognition engine, configured to listen to the default recording device.
80    pub fn new() -> Result<Self> {
81        let intf: ISpRecognizer =
82            unsafe { CoCreateInstance(&SpInprocRecognizer, None, CLSCTX_ALL) }?;
83        unsafe { intf.SetInput(RecognitionInput::Default.to_sapi()?, false) }?;
84        Ok(Self {
85            pauser: RecognitionPauser::new(intf.clone()),
86            intf: Intf(intf),
87            global_pause: Mutex::new(None),
88        })
89    }
90
91    /// Configures the recognizer to listen to the given input.
92    pub fn set_input(&self, input: RecognitionInput, allow_fmt_changes: bool) -> Result<()> {
93        unsafe { self.intf.SetInput(input.to_sapi()?, allow_fmt_changes) }
94    }
95
96    /// Enables or disables recognition.
97    pub fn set_enabled(&self, enabled: bool) -> Result<()> {
98        let mut global_pause = self.global_pause.lock().unwrap();
99        if global_pause.is_none() != enabled {
100            if enabled {
101                *global_pause = None;
102            } else {
103                *global_pause = Some(self.pauser.pause()?);
104            }
105        }
106        Ok(())
107    }
108}
109
110fn reco_state(enabled: bool) -> SPRECOSTATE {
111    if enabled {
112        SPRST_ACTIVE
113    } else {
114        SPRST_INACTIVE
115    }
116}
117
/// Pause bookkeeping for one recognizer, shared behind a mutex by the pauser and its guards.
struct PauserState {
    /// Recognizer whose state is switched via `SetRecoState`.
    intf: Intf<ISpRecognizer>,
    /// Number of pauses currently outstanding. The engine is deactivated when the first pause
    /// is registered and reactivated when the last one is released.
    pause_count: usize,
}
122
123impl PauserState {
124    fn pause(&mut self) -> Result<()> {
125        if self.pause_count == 0 {
126            unsafe { self.intf.SetRecoState(reco_state(false)) }?;
127        }
128        self.pause_count += 1;
129        Ok(())
130    }
131
132    fn resume(&mut self) -> Result<()> {
133        if self.pause_count == 1 {
134            unsafe { self.intf.SetRecoState(reco_state(true)) }?;
135        }
136        self.pause_count -= 1;
137        Ok(())
138    }
139}
140
/// Cloneable handle used to pause recognition.
///
/// Clones share one `PauserState` through an `Arc<Mutex<_>>`, so pauses requested through any
/// clone are counted together.
#[derive(Clone)]
struct RecognitionPauser {
    /// Pause bookkeeping shared with every clone and with each `ScopedPause` this handle
    /// creates.
    state: Arc<Mutex<PauserState>>,
}
145
146impl RecognitionPauser {
147    fn new(intf: ISpRecognizer) -> Self {
148        Self {
149            state: Arc::new(Mutex::new(PauserState {
150                intf: Intf(intf),
151                pause_count: 0,
152            })),
153        }
154    }
155
156    fn pause(&self) -> Result<ScopedPause> {
157        ScopedPause::new(self.state.clone())
158    }
159}
160
/// Guard representing one outstanding pause of the recognizer.
///
/// Creating the guard registers a pause with the shared `PauserState`; dropping it releases
/// that pause again.
struct ScopedPause {
    /// The shared pause bookkeeping this guard registered itself with.
    state: Arc<Mutex<PauserState>>,
}
164
165impl ScopedPause {
166    fn new(state: Arc<Mutex<PauserState>>) -> Result<Self> {
167        {
168            state.lock().unwrap().pause()?;
169        }
170        Ok(Self { state })
171    }
172}
173
174impl Drop for ScopedPause {
175    fn drop(&mut self) {
176        // The following call is expected to succeed, but failure shouldn't cause panic
177        let _ = self.state.lock().unwrap().resume();
178    }
179}