active_call/synthesis/
supertonic.rs

1use crate::offline::get_offline_models;
2use crate::synthesis::{SynthesisClient, SynthesisEvent, SynthesisOption, SynthesisType};
3use anyhow::{Result, anyhow};
4use async_trait::async_trait;
5use audio_codec::Resampler;
6use bytes::Bytes;
7use futures::stream::BoxStream;
8use tokio::sync::mpsc;
9use tokio_stream::wrappers::UnboundedReceiverStream;
10use tokio_util::sync::CancellationToken;
11use tracing::{debug, warn};
12
13pub struct SupertonicTtsClient {
14    voice_style: String,
15    speed: f32,
16    target_rate: i32,
17    tx: Option<mpsc::UnboundedSender<(Option<usize>, Result<SynthesisEvent>)>>,
18    token: CancellationToken,
19}
20
21impl SupertonicTtsClient {
22    pub fn create(_streaming: bool, option: &SynthesisOption) -> Result<Box<dyn SynthesisClient>> {
23        let voice_style = option.speaker.clone().unwrap_or_else(|| "M1".to_string());
24        let speed = option.speed.unwrap_or(1.0);
25        let target_rate = option.samplerate.unwrap_or(16000);
26
27        Ok(Box::new(Self {
28            voice_style,
29            speed,
30            target_rate,
31            tx: None,
32            token: CancellationToken::new(),
33        }))
34    }
35
36    fn ensure_models_initialized() -> Result<()> {
37        if get_offline_models().is_none() {
38            anyhow::bail!(
39                "Offline models not initialized. Please call init_offline_models() first."
40            );
41        }
42        Ok(())
43    }
44
45    async fn synthesize_text(&self, text: String, cmd_seq: Option<usize>) -> Result<()> {
46        let Some(tx) = self.tx.as_ref() else {
47            return Ok(());
48        };
49
50        let models =
51            get_offline_models().ok_or_else(|| anyhow!("offline models not initialized"))?;
52
53        let voice_style = self.voice_style.clone();
54        let speed = self.speed;
55        let target_rate = self.target_rate;
56        let tx_clone = tx.clone();
57        // Supertonic: en is hardcoded for now, or detect from text?
58        let language = "en".to_string();
59
60        let tts_arc = models.get_supertonic().await?;
61
62        // Run synthesis in blocking task
63        tokio::task::spawn_blocking(move || {
64            // Need write lock to use synthesize if it takes &mut self
65            // But if synthesize takes &mut self, blocking_write is needed.
66            let mut guard = tts_arc.blocking_write();
67
68            if let Some(tts) = guard.as_mut() {
69                debug!(
70                    text = %text,
71                    voice = %voice_style,
72                    speed = speed,
73                    target_rate = target_rate,
74                    "Calling Supertonic TTS synthesis"
75                );
76
77                match tts.synthesize(&text, &language, Some(&voice_style), Some(speed)) {
78                    Ok(samples) => {
79                        if !samples.is_empty() {
80                            let mut samples_i16: Vec<i16> = samples
81                                .iter()
82                                .map(|&s| (s * 32768.0).max(-32768.0).min(32767.0) as i16)
83                                .collect();
84
85                            // Resample if needed
86                            if tts.sample_rate() != target_rate {
87                                let mut resampler = Resampler::new(
88                                    tts.sample_rate() as usize,
89                                    target_rate as usize,
90                                );
91                                samples_i16 = resampler.resample(&samples_i16);
92                            }
93
94                            // Convert i16 samples to PCM bytes
95                            let mut bytes = Vec::with_capacity(samples_i16.len() * 2);
96                            for s in samples_i16 {
97                                bytes.extend_from_slice(&s.to_le_bytes());
98                            }
99
100                            // Send AudioChunk
101                            let _ = tx_clone.send((
102                                cmd_seq,
103                                Ok(SynthesisEvent::AudioChunk(Bytes::from(bytes))),
104                            ));
105
106                            // Send Finished
107                            let _ = tx_clone.send((cmd_seq, Ok(SynthesisEvent::Finished)));
108                        } else {
109                            warn!("Supertonic produced empty audio");
110                            let _ = tx_clone.send((cmd_seq, Ok(SynthesisEvent::Finished)));
111                        }
112                    }
113                    Err(e) => {
114                        warn!(error = %e, "Supertonic inference failed");
115                        let _ = tx_clone.send((cmd_seq, Err(anyhow!("Synthesis failed: {}", e))));
116                    }
117                }
118            } else {
119                warn!("Supertonic TTS not initialized");
120                let _ = tx_clone.send((cmd_seq, Err(anyhow!("TTS not initialized"))));
121            }
122        })
123        .await
124        .map_err(|e| anyhow!("task join error: {}", e))?;
125
126        Ok(())
127    }
128}
129
130#[async_trait]
131impl SynthesisClient for SupertonicTtsClient {
132    fn provider(&self) -> SynthesisType {
133        SynthesisType::Supertonic
134    }
135
136    async fn start(
137        &mut self,
138    ) -> Result<BoxStream<'static, (Option<usize>, Result<SynthesisEvent>)>> {
139        Self::ensure_models_initialized()?;
140
141        let (tx, rx) = mpsc::unbounded_channel();
142        self.tx = Some(tx);
143
144        // Initialize TTS if needed
145        let models =
146            get_offline_models().ok_or_else(|| anyhow!("offline models not initialized"))?;
147        models.init_supertonic().await?;
148
149        debug!(
150            "SupertonicTtsClient started with voice: {}",
151            self.voice_style
152        );
153
154        Ok(Box::pin(UnboundedReceiverStream::new(rx)))
155    }
156
157    async fn synthesize(
158        &mut self,
159        text: &str,
160        cmd_seq: Option<usize>,
161        _option: Option<SynthesisOption>,
162    ) -> Result<()> {
163        self.synthesize_text(text.to_string(), cmd_seq).await
164    }
165
166    async fn stop(&mut self) -> Result<()> {
167        self.token.cancel();
168        self.tx = None;
169        Ok(())
170    }
171}