// rust_tts_wrapper/types.rs

//! Shared types used across the crate.
2
use std::collections::HashMap;
use std::fmt;
use std::os::raw::c_char;
6
/// Voice gender, matching Swift's `UnifiedVoice.Gender`.
///
/// [`Gender::Unknown`] is the value [`normalize_gender`] falls back to when a
/// label is not recognized.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Gender {
    /// A male voice.
    Male,
    /// A female voice.
    Female,
    /// Gender was not reported or could not be recognized.
    Unknown,
}

impl fmt::Display for Gender {
    /// Writes the capitalized variant name: `"Male"`, `"Female"` or `"Unknown"`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Self::Male => "Male",
            Self::Female => "Female",
            Self::Unknown => "Unknown",
        };
        // `pad` (unlike `write!` with a literal) honours width, fill and
        // alignment flags such as `{:>8}`.
        f.pad(label)
    }
}

/// Normalize a raw gender string to a typed [`Gender`].
///
/// The comparison ignores ASCII case, so `"Female"`, `"FEMALE"` and `"female"`
/// all map to [`Gender::Female`]. The input is not trimmed; anything that is
/// not exactly `male` or `female` (ignoring case) yields [`Gender::Unknown`].
#[must_use]
pub fn normalize_gender(value: &str) -> Gender {
    // Compare in place instead of allocating a lowercased copy of the input.
    if value.eq_ignore_ascii_case("female") {
        Gender::Female
    } else if value.eq_ignore_ascii_case("male") {
        Gender::Male
    } else {
        Gender::Unknown
    }
}
34
/// A language code entry with BCP-47, ISO 639-3, and display name.
///
/// All three fields are plain owned strings; nothing here validates that they
/// are well-formed codes.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct LanguageCode {
    /// BCP-47 language tag (e.g. `"en-US"`).
    pub bcp47: String,
    /// ISO 639-3 language code (e.g. `"eng"`).
    pub iso639_3: String,
    /// Human-readable language name (e.g. `"English (United States)"`).
    pub display: String,
}
45
46/// A single voice offered by an engine, unified across all providers.
47/// Mirrors Swift's `UnifiedVoice`.
48#[derive(Debug, Clone)]
49pub struct Voice {
50    /// Unique voice identifier within the engine.
51    pub id: String,
52    /// Human-readable voice name.
53    pub name: String,
54    /// Gender of the voice.
55    pub gender: Gender,
56    /// The engine/provider that provides this voice (e.g. `"azure"`, `"google"`).
57    pub provider: String,
58    /// Language codes supported by this voice.
59    pub language_codes: Vec<LanguageCode>,
60}
61
62impl Voice {
63    /// Convenience: return the primary (first) BCP-47 language code, or empty string.
64    #[must_use]
65    pub fn primary_language(&self) -> &str {
66        self.language_codes.first().map_or("", |l| l.bcp47.as_str())
67    }
68}
69
/// Audio output format, matching Swift's `AudioFormat`.
///
/// The [`fmt::Display`] implementation renders each variant as its lowercase
/// name (`"mp3"`, `"wav"`, ...).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AudioFormat {
    /// Displayed as `"mp3"`.
    Mp3,
    /// Displayed as `"wav"`.
    Wav,
    /// Displayed as `"ogg"`.
    Ogg,
    /// Displayed as `"opus"`.
    Opus,
    /// Displayed as `"aac"`.
    Aac,
    /// Displayed as `"flac"`.
    Flac,
    /// Displayed as `"pcm"`.
    Pcm,
}

impl fmt::Display for AudioFormat {
    /// Writes the lowercase format name.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::Mp3 => "mp3",
            Self::Wav => "wav",
            Self::Ogg => "ogg",
            Self::Opus => "opus",
            Self::Aac => "aac",
            Self::Flac => "flac",
            Self::Pcm => "pcm",
        };
        // `pad` keeps width/fill/alignment flags working, which `write!` with
        // a literal silently ignores.
        f.pad(name)
    }
}
95
/// Named speech rate presets, matching Swift's `SpeechRate`.
///
/// Each preset maps to a fixed multiplier; see [`SpeechRate::rate_value`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SpeechRate {
    /// Multiplier `0.5`.
    XSlow,
    /// Multiplier `0.75`.
    Slow,
    /// Multiplier `1.0` (normal).
    Medium,
    /// Multiplier `1.25`.
    Fast,
    /// Multiplier `1.5`.
    XFast,
}

impl SpeechRate {
    /// Convert to a float multiplier (1.0 = normal).
    ///
    /// The presets step in quarters from `0.5` ([`SpeechRate::XSlow`]) to
    /// `1.5` ([`SpeechRate::XFast`]).
    #[must_use]
    pub fn rate_value(self) -> f32 {
        match self {
            Self::XSlow => 0.5,
            Self::Slow => 0.75,
            Self::Medium => 1.0,
            Self::Fast => 1.25,
            Self::XFast => 1.5,
        }
    }
}
119
/// Named speech pitch presets, matching Swift's `SpeechPitch`.
///
/// Each preset maps to a fixed multiplier; see [`SpeechPitch::pitch_value`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SpeechPitch {
    /// Multiplier `0.5`.
    XLow,
    /// Multiplier `0.75`.
    Low,
    /// Multiplier `1.0` (normal).
    Medium,
    /// Multiplier `1.25`.
    High,
    /// Multiplier `1.5`.
    XHigh,
}

impl SpeechPitch {
    /// Convert to a float multiplier (1.0 = normal).
    ///
    /// The presets step in quarters from `0.5` ([`SpeechPitch::XLow`]) to
    /// `1.5` ([`SpeechPitch::XHigh`]).
    #[must_use]
    pub fn pitch_value(self) -> f32 {
        match self {
            Self::XLow => 0.5,
            Self::Low => 0.75,
            Self::Medium => 1.0,
            Self::High => 1.25,
            Self::XHigh => 1.5,
        }
    }
}
143
144/// Options for speak/synth calls, matching Swift's `SpeakOptions`.
145#[derive(Debug, Clone, Default)]
146pub struct SpeakOptions {
147    /// Speech rate as a float multiplier (1.0 = normal). Overrides `speech_rate`.
148    pub rate: Option<f32>,
149    /// Speech rate as a named preset.
150    pub speech_rate: Option<SpeechRate>,
151    /// Speech pitch as a float multiplier (1.0 = normal). Overrides `speech_pitch`.
152    pub pitch: Option<f32>,
153    /// Speech pitch as a named preset.
154    pub speech_pitch: Option<SpeechPitch>,
155    /// Volume (0.0–1.0).
156    pub volume: Option<f32>,
157    /// Voice identifier.
158    pub voice: Option<String>,
159    /// Desired audio output format.
160    pub format: Option<AudioFormat>,
161    /// Whether to preprocess SpeechMarkdown to SSML.
162    pub use_speech_markdown: bool,
163    /// Whether to request real word boundary events from the API.
164    pub use_word_boundary: bool,
165    /// If true, pass SSML directly to the engine without wrapping.
166    pub raw_ssml: bool,
167    /// Engine-specific extra options.
168    pub extra: HashMap<String, String>,
169}
170
171impl SpeakOptions {
172    /// Resolve the effective rate value.
173    #[must_use]
174    pub fn effective_rate(&self) -> f32 {
175        self.rate
176            .or_else(|| self.speech_rate.map(SpeechRate::rate_value))
177            .unwrap_or(1.0)
178    }
179
180    /// Resolve the effective pitch value.
181    #[must_use]
182    pub fn effective_pitch(&self) -> f32 {
183        self.pitch
184            .or_else(|| self.speech_pitch.map(SpeechPitch::pitch_value))
185            .unwrap_or(1.0)
186    }
187
188    /// Resolve the effective volume value.
189    #[must_use]
190    pub fn effective_volume(&self) -> f32 {
191        self.volume.unwrap_or(1.0)
192    }
193}
194
/// A word boundary event with timing information.
/// Mirrors Swift's `WordBoundary`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct WordBoundary {
    /// The spoken word text.
    pub text: String,
    /// Offset from start of audio in milliseconds.
    pub offset: u64,
    /// Duration of the word in milliseconds.
    pub duration: u64,
}
206
/// Describes a registered engine for introspection.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EngineDescriptor {
    /// Unique engine identifier.
    pub id: String,
    /// Human-readable engine name.
    pub name: String,
    /// Whether this engine requires API credentials.
    pub needs_credentials: bool,
    /// JSON array of credential key names, e.g. `r#"["apiKey"]"#`.
    ///
    /// Stored as already-serialized text; this type does not parse or
    /// validate it.
    pub credential_keys_json: String,
}
219
/// Metadata for a Sherpa-ONNX model from the registry.
///
/// Only `PartialEq` (not `Eq`) is derived because `filesize_mb` is a float.
#[derive(Debug, Clone, PartialEq)]
pub struct SherpaModelInfo {
    /// Model identifier (e.g. `"kokoro-en-en-19"`).
    pub id: String,
    /// Model type (e.g. `"kokoro"`, `"vits"`).
    pub model_type: String,
    /// Human-readable model name.
    pub name: String,
    /// Languages supported by this model.
    pub language: Vec<SherpaLanguage>,
    /// Sample rate in Hz.
    pub sample_rate: u32,
    /// Number of speakers (for multi-speaker models).
    pub num_speakers: u32,
    /// Download URL for the model archive.
    pub url: String,
    /// Whether the archive is compressed.
    pub compression: bool,
    /// Approximate download size in megabytes.
    pub filesize_mb: f64,
}

/// A language entry within a Sherpa-ONNX model.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SherpaLanguage {
    /// ISO 639 language code.
    pub lang_code: String,
    /// Full language name.
    pub language_name: String,
    /// Country code.
    pub country: String,
}
253
/// C-compatible voice descriptor returned by [`tts_get_voices`](crate::tts_get_voices).
///
/// Every field is an owned C string pointer. `Clone`/`Copy` are deliberately
/// not derived: duplicating the struct would alias those owned pointers.
// NOTE(review): the function responsible for freeing these strings is not
// visible in this file — confirm against the FFI surface.
// The lowercase name mirrors the symbol exposed to C callers, so the usual
// Rust type-naming lint is silenced for this item only.
#[allow(non_camel_case_types)]
#[derive(Debug)]
#[repr(C)]
pub struct tts_voice {
    /// Voice identifier (owned C string).
    pub id: *mut c_char,
    /// Voice name (owned C string).
    pub name: *mut c_char,
    /// Language tag (owned C string).
    pub language: *mut c_char,
    /// Gender (owned C string).
    pub gender: *mut c_char,
    /// Engine identifier (owned C string).
    pub engine: *mut c_char,
}
268
/// C-compatible engine descriptor returned by [`tts_get_engines`](crate::tts_get_engines).
///
/// The pointer fields are owned C strings. `Clone`/`Copy` are deliberately not
/// derived: duplicating the struct would alias those owned pointers.
// NOTE(review): the function responsible for freeing these strings is not
// visible in this file — confirm against the FFI surface.
// The lowercase name mirrors the symbol exposed to C callers, so the usual
// Rust type-naming lint is silenced for this item only.
#[allow(non_camel_case_types)]
#[derive(Debug)]
#[repr(C)]
pub struct tts_engine_info {
    /// Engine identifier (owned C string).
    pub id: *mut c_char,
    /// Engine name (owned C string).
    pub name: *mut c_char,
    /// Whether credentials are required.
    pub needs_credentials: bool,
    /// JSON array of credential key names (owned C string).
    pub credential_keys_json: *mut c_char,
}
281
/// Error type for TTS operations.
///
/// A thin newtype around the human-readable error message; the message is the
/// entire payload and is what [`fmt::Display`] prints.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TtsError(pub String);

impl fmt::Display for TtsError {
    /// Writes the wrapped message.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Delegate to the string's own `Display` so formatter flags (width,
        // fill, alignment) are forwarded instead of being dropped by `write!`.
        fmt::Display::fmt(self.0.as_str(), f)
    }
}

impl std::error::Error for TtsError {}
293
294impl From<anyhow::Error> for TtsError {
295    fn from(e: anyhow::Error) -> Self {
296        TtsError(e.to_string())
297    }
298}
299
300/// Result alias using [`TtsError`].
301pub type TtsResult<T> = Result<T, TtsError>;