use super::{
result_from_json_c_str, AcceptWaveformError, CompleteResult, DecodingState, PartialResult,
};
use crate::models::{Model, SpeakerModel};
use std::{ffi::CString, os::raw::c_int, ptr::NonNull};
use vosk_sys::*;
/// The main object which processes data.
/// It takes audio as input and returns decoded information as words, confidences, times, and other metadata.
pub struct Recognizer(NonNull<VoskRecognizer>);
impl Recognizer {
    /// Creates the recognizer object. Returns [`None`] if a problem occurred.
    ///
    /// Recognizers process speech and return text using shared model data.
    ///
    /// * `model` - [`Model`] containing static data for the recognizer. The model can be shared
    /// across recognizers, even ones running in different threads.
    ///
    /// * `sample_rate` - The sample rate of the audio you are going to feed into the recognizer.
    /// Make sure this rate matches the audio content; a mismatch is a common cause of accuracy problems.
///
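    /// # Examples
    ///
    /// A minimal sketch; the model path below is a placeholder:
    ///
    /// ```no_run
    /// # use vosk::{Model, Recognizer};
    /// #
    /// let model = Model::new("/path/to/model").expect("Could not create a model");
    /// let recognizer = Recognizer::new(&model, 16000.0).expect("Could not create a recognizer");
    /// ```
    ///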
/// [`Model`]: crate::Model
#[must_use]
pub fn new(model: &Model, sample_rate: f32) -> Option<Self> {
let recognizer_ptr = unsafe { vosk_recognizer_new(model.0.as_ptr(), sample_rate) };
Some(Self(NonNull::new(recognizer_ptr)?))
}
    /// Creates the recognizer object with speaker recognition. Returns [`None`] if a problem occurred.
    ///
    /// In speaker recognition mode, the recognizer not only recognizes text but also
    /// returns speaker vectors that can be used for speaker identification.
    ///
    /// * `model` - [`Model`] containing the data for the recognizer. The model can be
    /// shared across recognizers, even ones running in different threads.
    ///
    /// * `sample_rate` - The sample rate of the audio you are going to feed into the recognizer.
    /// Make sure this rate matches the audio content; a mismatch is a common
    /// cause of accuracy problems.
///
    /// * `speaker_model` - The [`SpeakerModel`] used for speaker identification.
///
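    /// # Examples
    ///
    /// A minimal sketch; both model paths are placeholders:
    ///
    /// ```no_run
    /// # use vosk::{Model, Recognizer, SpeakerModel};
    /// #
    /// let model = Model::new("/path/to/model").expect("Could not create a model");
    /// let speaker_model =
    ///     SpeakerModel::new("/path/to/speaker-model").expect("Could not create a speaker model");
    /// let recognizer = Recognizer::new_with_speaker(&model, 16000.0, &speaker_model)
    ///     .expect("Could not create a recognizer");
    /// ```
    ///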
/// [`Model`]: crate::Model
#[must_use]
pub fn new_with_speaker(
model: &Model,
sample_rate: f32,
speaker_model: &SpeakerModel,
) -> Option<Self> {
let recognizer_ptr = unsafe {
vosk_recognizer_new_spk(model.0.as_ptr(), sample_rate, speaker_model.0.as_ptr())
};
Some(Self(NonNull::new(recognizer_ptr)?))
}
    /// Creates a recognizer object that only recognizes certain words.
    /// Returns [`None`] if a problem occurred.
    ///
    /// When you do not need to recognize a large vocabulary, you can specify a list of phrases
    /// to recognize instead. This improves the recognizer's speed and accuracy, but it might
    /// return \[unk\] if the user said something different.
///
/// Only recognizers with lookahead models support this type of quick configuration.
/// Precompiled HCLG graph models are not supported.
///
    /// * `model` - [`Model`] containing the data for the recognizer. The model can be shared
    /// across recognizers, even ones running in different threads.
    ///
    /// * `sample_rate` - The sample rate of the audio you are going to feed into the recognizer.
    /// Make sure this rate matches the audio content; a mismatch is a common cause of accuracy problems.
///
/// * `grammar` - The list of phrases to recognize.
///
/// # Examples
///
/// ```no_run
/// # use vosk::{Model, Recognizer};
/// #
/// let model = Model::new("/path/to/model").expect("Could not create a model");
/// let recognizer = Recognizer::new_with_grammar(
/// &model,
/// 16000.0,
/// &["one two three four five", "[unk]"],
/// )
/// .expect("Could not create a recognizer");
/// ```
///
/// [`Model`]: crate::Model
#[must_use]
pub fn new_with_grammar(
model: &Model,
sample_rate: f32,
grammar: &[impl AsRef<str>],
) -> Option<Self> {
let grammar_c = CString::new(format!(
"[{}]",
grammar
.iter()
.map(|phrase| format!("\"{}\"", phrase.as_ref()))
.collect::<Vec<_>>()
.join(", ")
))
.ok()?;
let recognizer_ptr =
unsafe { vosk_recognizer_new_grm(model.0.as_ptr(), sample_rate, grammar_c.as_ptr()) };
Some(Self(NonNull::new(recognizer_ptr)?))
}
    /// Adds a speaker model to an already initialized recognizer.
    ///
    /// Helps to initialize speaker recognition for a grammar-based recognizer.
pub fn set_speaker_model(&mut self, speaker_model: &SpeakerModel) {
unsafe { vosk_recognizer_set_spk_model(self.0.as_ptr(), speaker_model.0.as_ptr()) }
}
    /// Configures the recognizer to output n-best results in [`result`] and [`final_result`].
///
/// The returned value from those methods will be a [`CompleteResult::Single`]
/// if `max_alternatives` is 0, and [`CompleteResult::Multiple`] otherwise.
///
    /// * `max_alternatives` - The maximum number of alternatives to return; fewer may be returned (default: 0).
///
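    /// # Examples
    ///
    /// A sketch of how the returned variant changes; the model path is a placeholder:
    ///
    /// ```no_run
    /// # use vosk::{CompleteResult, Model, Recognizer};
    /// #
    /// let model = Model::new("/path/to/model").expect("Could not create a model");
    /// let mut recognizer = Recognizer::new(&model, 16000.0).expect("Could not create a recognizer");
    ///
    /// recognizer.set_max_alternatives(3);
    /// match recognizer.final_result() {
    ///     // With a non-zero maximum, results carry an n-best list
    ///     CompleteResult::Multiple(_alternatives) => {}
    ///     // set_max_alternatives(3) means the result will not be CompleteResult::Single
    ///     _ => unreachable!(),
    /// }
    /// ```
    ///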
/// [`result`]: Self::result
/// [`final_result`]: Self::final_result
/// [`CompleteResult::Single`]: crate::CompleteResult::Single
/// [`CompleteResult::Multiple`]: crate::CompleteResult::Multiple
pub fn set_max_alternatives(&mut self, max_alternatives: u16) {
unsafe { vosk_recognizer_set_max_alternatives(self.0.as_ptr(), max_alternatives as c_int) }
}
/// Enables or disables words with metadata in the output, represented as:
///
/// * [`WordInAlternative`] in a [`CompleteResult::Multiple`]
///
/// * [`Word`] in a [`CompleteResult::Single`]
///
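    /// # Examples
    ///
    /// A sketch; the model path is a placeholder:
    ///
    /// ```no_run
    /// # use vosk::{Model, Recognizer};
    /// #
    /// let model = Model::new("/path/to/model").expect("Could not create a model");
    /// let mut recognizer = Recognizer::new(&model, 16000.0).expect("Could not create a recognizer");
    ///
    /// // Each word in subsequent results will carry its metadata
    /// recognizer.set_words(true);
    /// ```
    ///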
/// [`WordInAlternative`]: crate::WordInAlternative
/// [`Word`]: crate::Word
/// [`CompleteResult::Multiple`]: crate::CompleteResult::Multiple
/// [`CompleteResult::Single`]: crate::CompleteResult::Single
pub fn set_words(&mut self, enable: bool) {
unsafe { vosk_recognizer_set_words(self.0.as_ptr(), i32::from(enable)) }
}
/// Like [`set_words`], but for [`PartialResult`].
///
    /// Words will always be represented as [`Word`].
///
/// [`set_words`]: Self::set_words
/// [`PartialResult`]: crate::PartialResult
/// [`Word`]: crate::Word
pub fn set_partial_words(&mut self, enable: bool) {
unsafe { vosk_recognizer_set_partial_words(self.0.as_ptr(), i32::from(enable)) }
}
/// Enables or disables Natural Language Semantics Markup Language (NLSML) in the output
pub fn set_nlsml(&mut self, enable: bool) {
unsafe { vosk_recognizer_set_nlsml(self.0.as_ptr(), i32::from(enable)) }
}
    /// Accepts and processes a new chunk of voice data.
    ///
    /// * `data` - Audio data in PCM 16-bit mono format.
    ///
    /// Returns a [`DecodingState`], which represents the state of the decoding
    /// after this chunk of data has been processed.
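    ///
    /// # Examples
    ///
    /// A minimal processing loop; the model path and the audio source are placeholders:
    ///
    /// ```no_run
    /// # use vosk::{DecodingState, Model, Recognizer};
    /// #
    /// let model = Model::new("/path/to/model").expect("Could not create a model");
    /// let mut recognizer = Recognizer::new(&model, 16000.0).expect("Could not create a recognizer");
    ///
    /// let samples: Vec<i16> = Vec::new(); // PCM 16-bit mono audio from any source
    /// for chunk in samples.chunks(1600) {
    ///     match recognizer.accept_waveform(chunk).expect("chunk too long") {
    ///         DecodingState::Finalized => {
    ///             // Silence was detected: a finalized utterance is available
    ///             let _utterance = recognizer.result();
    ///         }
    ///         _ => {
    ///             let _partial = recognizer.partial_result();
    ///         }
    ///     }
    /// }
    /// // Flush whatever is left at the end of the stream
    /// let _remaining = recognizer.final_result();
    /// ```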
pub fn accept_waveform(&mut self, data: &[i16]) -> Result<DecodingState, AcceptWaveformError> {
        // vosk_recognizer_accept_waveform and vosk_recognizer_accept_waveform_f are meant
        // to be used by languages that do not have an i16 type (those functions also take PCM 16-bit audio,
        // but represented as an f32 or i8). Those extra functions aren't needed in Rust, so they
        // will not be wrapped.
let len = data.len();
let decoding_state = unsafe {
vosk_recognizer_accept_waveform_s(
self.0.as_ptr(),
data.as_ptr(),
i32::try_from(len).map_err(|_| AcceptWaveformError::BufferTooLong(len))?,
)
};
Ok(DecodingState::from_c_int(decoding_state))
}
    /// Returns the speech recognition result, waiting for silence (see [`DecodingState::Finalized`]) to give a result.
    ///
    /// The returned value will be a [`CompleteResult::Single`]
    /// if [`set_max_alternatives`] was passed 0 (the default), and
    /// [`CompleteResult::Multiple`] otherwise.
    ///
    /// If words are enabled (see [`set_words`]), it also returns metadata about the words.
///
/// [`set_max_alternatives`]: Self::set_max_alternatives
/// [`set_words`]: Self::set_words
/// [`CompleteResult::Multiple`]: crate::CompleteResult::Multiple
/// [`CompleteResult::Single`]: crate::CompleteResult::Single
#[must_use]
pub fn result(&mut self) -> CompleteResult {
unsafe { result_from_json_c_str(vosk_recognizer_result(self.0.as_ptr())) }
}
    /// Returns a partial speech recognition result, which is not yet finalized and may change
    /// after more data is processed.
    ///
    /// If words are enabled (see [`set_partial_words`]), it also returns metadata about the words.
///
/// [`set_partial_words`]: Self::set_partial_words
#[must_use]
pub fn partial_result(&mut self) -> PartialResult {
unsafe { result_from_json_c_str(vosk_recognizer_partial_result(self.0.as_ptr())) }
}
    /// Returns the speech recognition result. Like [`result`], but it does not
    /// wait for silence; it flushes the remaining data so everything is processed.
///
/// [`result`]: Self::result
#[must_use]
pub fn final_result(&mut self) -> CompleteResult {
unsafe { result_from_json_c_str(vosk_recognizer_final_result(self.0.as_ptr())) }
}
    /// Resets the current results and buffered data so that recognition can continue from scratch.
pub fn reset(&mut self) {
unsafe {
vosk_recognizer_reset(self.0.as_ptr());
}
}
}
// SAFETY: Recognizer shares no state, so it is Send
unsafe impl Send for Recognizer {}
// SAFETY: All methods that mutate Recognizer require a &mut to it,
// which ensures exclusive access, so it is Sync
unsafe impl Sync for Recognizer {}
impl Drop for Recognizer {
fn drop(&mut self) {
unsafe { vosk_recognizer_free(self.0.as_ptr()) }
}
}