Skip to main content

tauri_plugin_stt/
models.rs

1use serde::{Deserialize, Serialize};
2
/// Identifier of a speech recognition language, carried as a plain string
/// such as `"en-US"`, `"pt-BR"` or `"ja-JP"`.
pub type LanguageCode = String;
5
6/// Configuration for starting speech recognition
7#[derive(Debug, Clone, Default, Deserialize, Serialize)]
8#[serde(rename_all = "camelCase")]
9pub struct ListenConfig {
10    /// Language code for recognition (e.g., "en-US", "pt-BR")
11    /// If not specified, uses device default language
12    #[serde(default)]
13    pub language: Option<LanguageCode>,
14
15    /// Whether to return interim (partial) results
16    #[serde(default, rename = "interimResults")]
17    pub interim_results: bool,
18
19    /// Whether to continue listening after getting a result
20    /// If false, stops after first final result
21    #[serde(default)]
22    pub continuous: bool,
23
24    /// Maximum duration to listen in milliseconds (0 = no limit)
25    #[serde(default, rename = "maxDuration")]
26    pub max_duration: u32,
27
28    /// Maximum number of alternative transcriptions
29    #[serde(default, rename = "maxAlternatives")]
30    pub max_alternatives: Option<u32>,
31
32    /// Use on-device recognition only (iOS 13+, no network required)
33    /// When true, recognition works offline but may be less accurate
34    #[serde(default, rename = "onDevice")]
35    pub on_device: bool,
36}
37
38/// Recognition state
39#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
40#[serde(rename_all = "lowercase")]
41#[derive(Default)]
42pub enum RecognitionState {
43    /// Not currently listening
44    #[default]
45    Idle,
46    /// Actively listening for speech
47    Listening,
48    /// Processing audio (may briefly occur between utterances)
49    Processing,
50}
51
52/// A speech recognition result
53#[derive(Debug, Clone, Serialize, Deserialize)]
54#[serde(rename_all = "camelCase")]
55pub struct RecognitionResult {
56    /// The recognized text
57    pub transcript: String,
58
59    /// Whether this is a final result (vs interim/partial)
60    pub is_final: bool,
61
62    /// Confidence score (0.0 to 1.0), if available
63    #[serde(default)]
64    pub confidence: Option<f32>,
65
66    /// Base64-encoded WAV audio of the utterance that produced this
67    /// result (desktop only). `None` on mobile / when unavailable.
68    #[serde(default, skip_serializing_if = "Option::is_none")]
69    pub audio_data: Option<String>,
70}
71
72/// Current status of speech recognition
73#[derive(Debug, Clone, Serialize, Deserialize)]
74#[serde(rename_all = "camelCase")]
75pub struct RecognitionStatus {
76    /// Current state
77    pub state: RecognitionState,
78
79    /// Whether STT is available on this device
80    pub is_available: bool,
81
82    /// Current language being used
83    #[serde(default)]
84    pub language: Option<LanguageCode>,
85}
86
87/// Supported language information
88#[derive(Debug, Clone, Serialize, Deserialize)]
89#[serde(rename_all = "camelCase")]
90pub struct SupportedLanguage {
91    /// Language code (e.g., "en-US")
92    pub code: LanguageCode,
93
94    /// Human-readable name (e.g., "English (United States)")
95    pub name: String,
96
97    /// Whether the model for this language is installed locally (desktop only)
98    #[serde(default)]
99    pub installed: Option<bool>,
100}
101
102/// Permission status
103#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
104#[serde(rename_all = "lowercase")]
105pub enum PermissionStatus {
106    /// Permission has been granted
107    Granted,
108    /// Permission has been denied
109    Denied,
110    /// Permission hasn't been requested yet
111    Unknown,
112}
113
114/// Response for permission check
115#[derive(Debug, Clone, Serialize, Deserialize)]
116#[serde(rename_all = "camelCase")]
117pub struct PermissionResponse {
118    /// Microphone permission status
119    pub microphone: PermissionStatus,
120
121    /// Speech recognition permission status (iOS/macOS specific)
122    pub speech_recognition: PermissionStatus,
123}
124
125/// Response for availability check
126#[derive(Debug, Clone, Serialize, Deserialize)]
127#[serde(rename_all = "camelCase")]
128pub struct AvailabilityResponse {
129    /// Whether STT is available
130    pub available: bool,
131
132    /// Reason if not available
133    #[serde(default)]
134    pub reason: Option<String>,
135}
136
137/// Response for supported languages
138#[derive(Debug, Clone, Serialize, Deserialize)]
139#[serde(rename_all = "camelCase")]
140pub struct SupportedLanguagesResponse {
141    /// List of supported languages
142    pub languages: Vec<SupportedLanguage>,
143}
144
145/// Unified error codes for cross-platform consistency
146#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
147#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
148#[derive(Default)]
149pub enum SttErrorCode {
150    /// No error
151    #[default]
152    None,
153    /// Speech recognition service not available
154    NotAvailable,
155    /// Microphone permission denied
156    PermissionDenied,
157    /// Speech recognition permission denied (iOS)
158    SpeechPermissionDenied,
159    /// Network error (server-based recognition)
160    NetworkError,
161    /// Audio recording error
162    AudioError,
163    /// Recognition timed out (maxDuration reached)
164    Timeout,
165    /// No speech detected
166    NoSpeech,
167    /// Language not supported
168    LanguageNotSupported,
169    /// Recognition was cancelled
170    Cancelled,
171    /// Already listening
172    AlreadyListening,
173    /// Not currently listening
174    NotListening,
175    /// Service busy
176    Busy,
177    /// No Whisper model has been downloaded yet
178    ModelNotInstalled,
179    /// Unknown error
180    Unknown,
181}
182
183impl SttErrorCode {
184    /// Get a human-readable description of the error
185    pub fn description(&self) -> &'static str {
186        match self {
187            Self::None => "No error",
188            Self::NotAvailable => "Speech recognition is not available on this device",
189            Self::PermissionDenied => "Microphone permission was denied",
190            Self::SpeechPermissionDenied => "Speech recognition permission was denied",
191            Self::NetworkError => "Network error during recognition",
192            Self::AudioError => "Error accessing audio input",
193            Self::Timeout => "Recognition timed out",
194            Self::NoSpeech => "No speech was detected",
195            Self::LanguageNotSupported => "The requested language is not supported",
196            Self::Cancelled => "Recognition was cancelled",
197            Self::AlreadyListening => "Already listening for speech",
198            Self::NotListening => "Not currently listening",
199            Self::Busy => "Speech recognition service is busy",
200            Self::ModelNotInstalled => "No speech recognition model has been downloaded",
201            Self::Unknown => "An unknown error occurred",
202        }
203    }
204
205    /// Get the numeric code for this error
206    pub fn code(&self) -> i32 {
207        match self {
208            Self::None => 0,
209            Self::NotAvailable => -1,
210            Self::PermissionDenied => -2,
211            Self::SpeechPermissionDenied => -3,
212            Self::NetworkError => -4,
213            Self::AudioError => -5,
214            Self::Timeout => -6,
215            Self::NoSpeech => -7,
216            Self::LanguageNotSupported => -8,
217            Self::Cancelled => -9,
218            Self::AlreadyListening => -10,
219            Self::NotListening => -11,
220            Self::Busy => -12,
221            Self::ModelNotInstalled => -13,
222            Self::Unknown => -99,
223        }
224    }
225}
226
227/// Structured error event for frontend consumption
228#[derive(Debug, Clone, Serialize, Deserialize)]
229#[serde(rename_all = "camelCase")]
230pub struct SttError {
231    /// Error code for programmatic handling
232    pub code: SttErrorCode,
233    /// Human-readable error message
234    pub message: String,
235    /// Platform-specific error details (optional)
236    #[serde(default)]
237    pub details: Option<String>,
238}
239
240#[derive(Debug, Clone, Serialize, Deserialize)]
241#[serde(rename_all = "camelCase")]
242pub struct WhisperModelInfo {
243    /// Stable identifier (`tiny`, `tiny.en`, `base`, `base.en`,
244    /// `small`, `small.en`, `medium`, `medium.en`, `large-v3`).
245    pub id: String,
246    /// Human-readable name shown in the model manager.
247    pub display_name: String,
248    /// Approximate on-disk size in megabytes — used for confirmation
249    /// dialogs ("Download 142 MB?") and for the disk-usage summary.
250    pub size_mb: u32,
251    /// Approximate working-set memory in megabytes (whisper.cpp's
252    /// published "required memory" — covers RAM on CPU, VRAM on GPU).
253    /// Drives the "your device has only X MB" gate so we never let
254    /// a user download a model their machine can't actually run.
255    pub required_memory_mb: u32,
256    /// Whether the binary is currently present in `app_data_dir`.
257    pub installed: bool,
258    /// Whether this model is the one `start_listening` will load.
259    pub active: bool,
260    /// Marks the suggested default for first-time users. Exactly one
261    /// model in the catalogue carries this flag.
262    pub recommended: bool,
263    /// Short qualitative label for the speed ↔ accuracy trade-off,
264    /// e.g. `"fastest"`, `"balanced"`, `"most accurate"`. Lets the UI
265    /// stay in sync with the catalogue without owning copy.
266    pub tier: String,
267    /// `Some("en")` for English-optimised variants (`*.en`), `None`
268    /// for the multilingual default models. The frontend prefers an
269    /// `.en` variant when the course's declared language is English.
270    #[serde(default)]
271    pub language: Option<String>,
272    /// `false` when the local machine doesn't have enough RAM/VRAM to
273    /// load this model. Drives the install button's disabled state and
274    /// the "Not enough memory" hint in the UI.
275    pub fits_in_memory: bool,
276    /// Power-user model (currently the `large` family). Hidden from
277    /// the default catalogue listing — surfaces only when the caller
278    /// explicitly asks for advanced models.
279    pub advanced: bool,
280}
281
282#[derive(Debug, Clone, Serialize, Deserialize)]
283#[serde(rename_all = "camelCase")]
284pub struct WhisperModelsResponse {
285    /// Catalogue ordered from smallest to largest.
286    pub models: Vec<WhisperModelInfo>,
287    /// Currently active model id (`None` if none installed yet).
288    #[serde(default)]
289    pub active: Option<String>,
290    /// Total bytes occupied by every installed model. Drives the
291    /// "Disk usage" line in the settings page.
292    pub total_disk_bytes: u64,
293    /// Total physical RAM (in MB) the host machine reports. The UI
294    /// shows this next to each model's `requiredMemoryMb` so the user
295    /// understands *why* a model is greyed out.
296    pub system_memory_mb: u32,
297}
298
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty JSON object must yield the default for every field.
    #[test]
    fn test_listen_config_defaults() {
        let config: ListenConfig = serde_json::from_str("{}").unwrap();
        assert!(config.language.is_none());
        assert!(!config.interim_results);
        assert!(!config.continuous);
        assert_eq!(config.max_duration, 0);
        assert!(config.max_alternatives.is_none());
        assert!(!config.on_device);
    }

    /// Every camelCase key must be routed to its snake_case field.
    #[test]
    fn test_listen_config_full() {
        let json = r#"{
            "language": "pt-BR",
            "interimResults": true,
            "continuous": true,
            "maxDuration": 30,
            "maxAlternatives": 3,
            "onDevice": true
        }"#;
        let config: ListenConfig = serde_json::from_str(json).unwrap();
        assert_eq!(config.language.as_deref(), Some("pt-BR"));
        assert!(config.interim_results);
        assert!(config.continuous);
        assert_eq!(config.max_duration, 30);
        assert_eq!(config.max_alternatives, Some(3));
        assert!(config.on_device);
    }

    /// States serialize as lowercase variant names; `Idle` is the default.
    #[test]
    fn test_recognition_state_serialization() {
        assert_eq!(
            serde_json::to_string(&RecognitionState::Idle).unwrap(),
            "\"idle\""
        );
        assert_eq!(
            serde_json::to_string(&RecognitionState::Listening).unwrap(),
            "\"listening\""
        );
        assert_eq!(
            serde_json::to_string(&RecognitionState::Processing).unwrap(),
            "\"processing\""
        );
        assert_eq!(RecognitionState::default(), RecognitionState::Idle);
    }

    /// Result fields use camelCase keys, and an absent `audio_data` is
    /// omitted from the output rather than written as `null`.
    #[test]
    fn test_recognition_result() {
        let result = RecognitionResult {
            transcript: "Hello world".to_string(),
            is_final: true,
            confidence: Some(0.95),
            audio_data: None,
        };
        let json = serde_json::to_string(&result).unwrap();
        assert!(json.contains("\"transcript\":\"Hello world\""));
        assert!(json.contains("\"isFinal\":true"));
        assert!(json.contains("\"confidence\":0.95"));
        assert!(!json.contains("audioData"));
    }

    /// A present `audio_data` is emitted under the `audioData` key.
    #[test]
    fn test_recognition_result_with_audio() {
        let result = RecognitionResult {
            transcript: "Hi".to_string(),
            is_final: false,
            confidence: None,
            audio_data: Some("UklGRg==".to_string()),
        };
        let json = serde_json::to_string(&result).unwrap();
        assert!(json.contains("\"isFinal\":false"));
        assert!(json.contains("\"audioData\":\"UklGRg==\""));
    }

    /// Permission statuses serialize as lowercase variant names.
    #[test]
    fn test_permission_status_serialization() {
        assert_eq!(
            serde_json::to_string(&PermissionStatus::Granted).unwrap(),
            "\"granted\""
        );
        assert_eq!(
            serde_json::to_string(&PermissionStatus::Denied).unwrap(),
            "\"denied\""
        );
        assert_eq!(
            serde_json::to_string(&PermissionStatus::Unknown).unwrap(),
            "\"unknown\""
        );
    }

    /// Error codes travel as SCREAMING_SNAKE_CASE strings in both directions.
    #[test]
    fn test_stt_error_code_serialization() {
        assert_eq!(
            serde_json::to_string(&SttErrorCode::None).unwrap(),
            "\"NONE\""
        );
        assert_eq!(
            serde_json::to_string(&SttErrorCode::PermissionDenied).unwrap(),
            "\"PERMISSION_DENIED\""
        );
        assert_eq!(
            serde_json::to_string(&SttErrorCode::ModelNotInstalled).unwrap(),
            "\"MODEL_NOT_INSTALLED\""
        );
        let parsed: SttErrorCode = serde_json::from_str("\"NO_SPEECH\"").unwrap();
        assert_eq!(parsed, SttErrorCode::NoSpeech);
    }

    /// The default code is `None`, and the numeric mapping keeps its anchors.
    #[test]
    fn test_stt_error_code_numeric_codes() {
        assert_eq!(SttErrorCode::default(), SttErrorCode::None);
        assert_eq!(SttErrorCode::None.code(), 0);
        assert_eq!(SttErrorCode::NotAvailable.code(), -1);
        assert_eq!(SttErrorCode::ModelNotInstalled.code(), -13);
        assert_eq!(SttErrorCode::Unknown.code(), -99);
        assert_eq!(SttErrorCode::Timeout.description(), "Recognition timed out");
    }
}