Skip to main content

rust_tts_wrapper/
engine.rs

1//! Core TTS engine trait.
2
3use crate::types::{SpeakOptions, TtsResult, Voice, WordBoundary};
4use std::fmt;
5
/// Callback for streaming audio chunks.
///
/// A mutable borrow of a closure that is handed each chunk as a byte slice.
/// NOTE(review): the encoding of the bytes is not defined here — presumably
/// engine-specific; confirm with the implementing engine.
pub type OnAudioCallback<'a> = &'a mut dyn FnMut(&[u8]);

/// Callback for word boundary events.
///
/// A mutable borrow of a closure that receives a string slice and two `f32`
/// values.
/// NOTE(review): presumably the word text followed by its start and end
/// positions — confirm the meaning and units of both floats with implementors.
pub type OnBoundaryCallback<'a> = &'a mut dyn FnMut(&str, f32, f32);

/// Callback for speech-started events.
///
/// A mutable borrow of a closure that takes no arguments.
pub type OnStartCallback<'a> = &'a mut dyn FnMut();

/// Callback for speech-finished events.
///
/// A mutable borrow of a closure that takes no arguments.
pub type OnEndCallback<'a> = &'a mut dyn FnMut();

/// Callback for error events.
///
/// A mutable borrow of a closure that receives a string slice
/// (presumably a human-readable error description — confirm with callers).
pub type OnErrorCallback<'a> = &'a mut dyn FnMut(&str);
20
/// Translate speech markdown into SSML when the input looks like speech markdown.
///
/// Returns `(processed_text, is_ssml)`:
/// - `(ssml, true)` when `text` is detected as speech markdown and converts cleanly;
/// - `(text, false)` (the input, unchanged) when it is not speech markdown or the
///   conversion fails.
///
/// `platform` selects the SSML dialect: `"azure"` and `"google"` pick their
/// respective platforms; every other value falls back to Amazon Alexa.
#[cfg(feature = "cloud")]
#[must_use]
pub fn preprocess_speech_markdown(text: &str, platform: &str) -> (String, bool) {
    use speechmarkdown_rust::{Platform, SpeechMarkdownParser};

    // Shared "leave the text alone" result for both fallback paths.
    let passthrough = || (text.to_string(), false);

    if !SpeechMarkdownParser::is_speech_markdown(text) {
        return passthrough();
    }

    let target = match platform {
        "azure" => Platform::MicrosoftAzure,
        "google" => Platform::GoogleAssistant,
        _ => Platform::AmazonAlexa,
    };

    // Conversion is best-effort: a parser error degrades to plain text rather
    // than surfacing to the caller.
    SpeechMarkdownParser::to_ssml(text, target)
        .map_or_else(|_| passthrough(), |ssml| (ssml, true))
}
43
/// Fallback used when the `cloud` feature is disabled.
///
/// No speech-markdown processing is available in this configuration, so the
/// input is returned unchanged and flagged as not being SSML. `_platform` is
/// accepted only to keep the signature identical to the `cloud` variant.
#[cfg(not(feature = "cloud"))]
#[must_use]
pub fn preprocess_speech_markdown(text: &str, _platform: &str) -> (String, bool) {
    let unchanged = String::from(text);
    (unchanged, false)
}
49
50/// Trait that every TTS engine must implement.
51///
52/// Mirrors Swift's `TTSClient` protocol.
53#[allow(clippy::missing_errors_doc)]
54pub trait TtsEngine: Send + Sync + fmt::Debug {
55    /// Start speaking `text` asynchronously.
56    #[allow(clippy::too_many_arguments)]
57    fn speak(
58        &self,
59        text: &str,
60        voice: Option<&str>,
61        rate: f32,
62        pitch: f32,
63        volume: f32,
64        on_audio: Option<OnAudioCallback>,
65        on_boundary: Option<OnBoundaryCallback>,
66    ) -> TtsResult<()>;
67
68    /// Speak with full [`SpeakOptions`], matching Swift's `speak(_:options:)`.
69    fn speak_with_options(
70        &self,
71        text: &str,
72        options: Option<&SpeakOptions>,
73        on_audio: Option<OnAudioCallback>,
74        on_boundary: Option<OnBoundaryCallback>,
75    ) -> TtsResult<()> {
76        let opts = options.cloned().unwrap_or_default();
77        self.speak(
78            text,
79            opts.voice.as_deref(),
80            opts.effective_rate(),
81            opts.effective_pitch(),
82            opts.effective_volume(),
83            on_audio,
84            on_boundary,
85        )
86    }
87
88    /// Speak `text` synchronously, blocking until synthesis completes.
89    #[allow(clippy::too_many_arguments)]
90    fn speak_sync(
91        &self,
92        text: &str,
93        voice: Option<&str>,
94        rate: f32,
95        pitch: f32,
96        volume: f32,
97        on_audio: Option<OnAudioCallback>,
98        on_boundary: Option<OnBoundaryCallback>,
99    ) -> TtsResult<()>;
100
101    /// Stop any in-progress speech.
102    fn stop(&self) -> TtsResult<()>;
103
104    /// Pause speech (default: no-op, engines may override).
105    fn pause(&self) -> TtsResult<()> {
106        Ok(())
107    }
108
109    /// Resume speech (default: no-op, engines may override).
110    fn resume(&self) -> TtsResult<()> {
111        Ok(())
112    }
113
114    /// List available voices for this engine.
115    fn get_voices(&self) -> TtsResult<Vec<Voice>>;
116
117    /// Return the unique identifier of this engine (e.g. `"system"`, `"sherpaonnx"`).
118    fn engine_id(&self) -> &'static str;
119
120    /// Check whether the configured credentials are valid.
121    /// Default: attempt to fetch voices as a validation.
122    fn check_credentials(&self) -> TtsResult<bool> {
123        match self.get_voices() {
124            Ok(_) => Ok(true),
125            Err(_) => Ok(false),
126        }
127    }
128
129    /// Synthesize text to audio bytes (full buffer, no playback).
130    /// Mirrors Swift's `synthToBytes(_:options:)`.
131    fn synth_to_bytes(
132        &self,
133        text: &str,
134        voice: Option<&str>,
135        rate: f32,
136        pitch: f32,
137        volume: f32,
138    ) -> TtsResult<Vec<u8>> {
139        let mut buf = Vec::new();
140        self.speak(
141            text,
142            voice,
143            rate,
144            pitch,
145            volume,
146            Some(&mut |chunk: &[u8]| {
147                buf.extend_from_slice(chunk);
148            }),
149            None,
150        )?;
151        Ok(buf)
152    }
153
154    /// Synthesize with [`SpeakOptions`].
155    fn synth_to_bytes_with_options(
156        &self,
157        text: &str,
158        options: Option<&SpeakOptions>,
159    ) -> TtsResult<Vec<u8>> {
160        let opts = options.cloned().unwrap_or_default();
161        self.synth_to_bytes(
162            text,
163            opts.voice.as_deref(),
164            opts.effective_rate(),
165            opts.effective_pitch(),
166            opts.effective_volume(),
167        )
168    }
169
170    /// Synthesize text and return word boundary information.
171    fn synth_with_boundaries(
172        &self,
173        text: &str,
174        voice: Option<&str>,
175        rate: f32,
176        pitch: f32,
177        volume: f32,
178    ) -> TtsResult<(Vec<u8>, Vec<WordBoundary>)> {
179        let audio = self.synth_to_bytes(text, voice, rate, pitch, volume)?;
180        let boundaries = estimate_word_boundaries(text);
181        Ok((audio, boundaries))
182    }
183}
184
185/// Estimate word boundaries using word-length-adjusted timing.
186/// Mirrors Swift's `WordTimingEstimator.estimate(text:wordsPerMinute:)`.
187#[must_use]
188#[allow(clippy::cast_precision_loss)]
189pub fn estimate_word_boundaries(text: &str) -> Vec<WordBoundary> {
190    estimate_word_boundaries_with_wpm(text, 150.0)
191}
192
193/// Estimate word boundaries with configurable words per minute.
194/// Matches Swift's `WordTimingEstimator.estimate(text:wordsPerMinute:)`.
195#[must_use]
196#[allow(clippy::cast_precision_loss)]
197pub fn estimate_word_boundaries_with_wpm(text: &str, words_per_minute: f64) -> Vec<WordBoundary> {
198    let words: Vec<&str> = text.split_whitespace().filter(|w| !w.is_empty()).collect();
199    if words.is_empty() {
200        return Vec::new();
201    }
202
203    let ms_per_word = 60_000.0 / words_per_minute;
204
205    let mut boundaries = Vec::with_capacity(words.len());
206    let mut current_ms: u64 = 0;
207
208    for word in &words {
209        let length_factor = (word.len() as f64 / 5.0).clamp(0.5, 2.0);
210        let duration = (ms_per_word * length_factor) as u64;
211        let duration = duration.max(1);
212
213        boundaries.push(WordBoundary {
214            text: (*word).to_string(),
215            offset: current_ms,
216            duration,
217        });
218        current_ms += duration;
219    }
220
221    boundaries
222}