1use crate::agent::inference::InferenceEvent;
2#[cfg(feature = "embedded-voice-assets")]
3use kokoros::tts::koko::TTSKoko;
4#[cfg(feature = "embedded-voice-assets")]
5use rodio::OutputStream;
6use rodio::Sink;
7use std::sync::atomic::{AtomicBool, Ordering};
8use std::sync::mpsc;
9use std::sync::Arc;
10use tokio::sync::mpsc as tokio_mpsc;
11
12pub struct VoiceManager {
15 sender: mpsc::SyncSender<String>,
16 enabled: Arc<AtomicBool>,
17 available: Arc<AtomicBool>,
18 cancelled: Arc<AtomicBool>, sink: Arc<tokio::sync::Mutex<Option<Sink>>>,
20 current_voice: Arc<std::sync::Mutex<String>>,
22 current_speed: Arc<std::sync::Mutex<f32>>,
24 current_volume: Arc<std::sync::Mutex<f32>>,
26}
27
28impl VoiceManager {
29 pub fn new(event_tx: tokio_mpsc::Sender<InferenceEvent>) -> Self {
30 let cfg = crate::agent::config::load_config();
31 let initial_voice = crate::agent::config::effective_voice(&cfg);
32 let initial_speed = crate::agent::config::effective_voice_speed(&cfg);
33 let initial_volume = crate::agent::config::effective_voice_volume(&cfg);
34 let (tx, rx) = mpsc::sync_channel::<String>(1024);
36 let enabled = Arc::new(AtomicBool::new(true));
37 let available = Arc::new(AtomicBool::new(cfg!(feature = "embedded-voice-assets")));
38 let cancelled = Arc::new(AtomicBool::new(false));
39 let enabled_ctx = enabled.clone();
40 #[cfg(not(feature = "embedded-voice-assets"))]
41 let available_ctx = available.clone();
42 let _cancelled_ctx = cancelled.clone();
43 let sink_shared = Arc::new(tokio::sync::Mutex::new(None));
44 let current_voice = Arc::new(std::sync::Mutex::new(initial_voice));
45 let current_speed = Arc::new(std::sync::Mutex::new(initial_speed));
46 let current_volume = Arc::new(std::sync::Mutex::new(initial_volume));
47 let _voice_synth = Arc::clone(¤t_voice);
48 let _speed_synth = Arc::clone(¤t_speed);
49 let _volume_synth = Arc::clone(¤t_volume);
50 let sink_manager_clone = Arc::clone(&sink_shared);
51
52 let _ = std::thread::Builder::new()
55 .name("VoiceManager".into())
56 .stack_size(32 * 1024 * 1024) .spawn(move || {
58 #[cfg(not(feature = "embedded-voice-assets"))]
59 {
60 enabled_ctx.store(false, Ordering::SeqCst);
61 available_ctx.store(false, Ordering::SeqCst);
62 let _ = event_tx.blocking_send(InferenceEvent::VoiceStatus(
63 "Voice Engine: Disabled in crates.io/source build (use packaged releases for baked-in voice).".into(),
64 ));
65 while rx.recv().is_ok() {}
66 return;
67 }
68
69 #[cfg(feature = "embedded-voice-assets")]
70 {
71 let mut _stream: Option<OutputStream> = None;
72
73 let _ = event_tx.blocking_send(InferenceEvent::VoiceStatus(
74 "Voice Engine: Initializing Audio Pipeline...".into(),
75 ));
76 let _ = event_tx.blocking_send(InferenceEvent::VoiceStatus(
77 "Voice Engine: Activating Baked-In Weights...".into(),
78 ));
79
80 const MODEL_BYTES: &[u8] =
82 include_bytes!("../../.hematite/assets/voice/kokoro-v1.0.onnx");
83 const VOICES_BYTES: &[u8] =
84 include_bytes!("../../.hematite/assets/voice/voices.bin");
85
86 let _ = event_tx.blocking_send(InferenceEvent::VoiceStatus(
87 "Voice Engine: Loading voice model...".into(),
88 ));
89
90 let tts_result = std::panic::catch_unwind(|| {
92 TTSKoko::new_from_memory(MODEL_BYTES, VOICES_BYTES)
93 });
94
95 let tts = match tts_result {
96 Ok(Ok(engine)) => {
97 enabled_ctx.store(true, Ordering::SeqCst);
98 if let Ok((s, handle)) = OutputStream::try_default() {
99 _stream = Some(s);
100 if let Ok(new_sink) = Sink::try_new(&handle) {
101 let mut lock = sink_shared.blocking_lock();
102 *lock = Some(new_sink);
103 }
104 let _ = event_tx.blocking_send(InferenceEvent::VoiceStatus(
105 "Voice Engine: Vibrant & Ready ✅".into(),
106 ));
107 } else {
108 let _ = event_tx.blocking_send(InferenceEvent::VoiceStatus(
109 "Voice Engine: ERROR - No audio device found ❌".into(),
110 ));
111 }
112 Some(engine)
113 }
114 Ok(Err(e)) => {
115 let _ = event_tx.blocking_send(InferenceEvent::VoiceStatus(format!(
116 "Voice Engine: ERROR - {} ❌",
117 e
118 )));
119 None
120 }
121 Err(panic_val) => {
122 let msg = panic_val
123 .downcast_ref::<String>()
124 .map(|s| s.as_str())
125 .or_else(|| panic_val.downcast_ref::<&str>().copied())
126 .unwrap_or("unknown panic");
127 let _ = event_tx.blocking_send(InferenceEvent::VoiceStatus(format!(
128 "Voice Engine: CRASH - {} ❌",
129 msg
130 )));
131 None
132 }
133 };
134
135 let (synth_tx, mut synth_rx) = tokio_mpsc::channel::<String>(64);
137 let tts_shared = Arc::new(tokio::sync::Mutex::new(tts));
138 let tts_synth_clone = Arc::clone(&tts_shared);
139 let sink_synth_clone = Arc::clone(&sink_shared);
140 let event_tx_synth = event_tx.clone();
141
142 std::thread::spawn(move || {
143 let rt = tokio::runtime::Builder::new_current_thread()
144 .enable_all()
145 .build()
146 .unwrap();
147
148 rt.block_on(async {
149 while let Some(to_speak) = synth_rx.recv().await {
150 let mut engine_opt = tts_synth_clone.lock().await;
151 if let Some(ref mut engine) = *engine_opt {
152 let voice_id = _voice_synth
153 .lock()
154 .map(|v| v.clone())
155 .unwrap_or_else(|_| "af_sky".to_string());
156 let speed = _speed_synth.lock().map(|v| *v).unwrap_or(1.0);
157 let volume = _volume_synth.lock().map(|v| *v).unwrap_or(1.0);
158 let res = engine.tts_raw_audio_streaming(
159 &to_speak,
160 "en-us",
161 &voice_id,
162 speed,
163 None,
164 None,
165 None,
166 None,
167 |chunk| {
168 if _cancelled_ctx.load(Ordering::SeqCst) {
169 return Err(Box::new(std::io::Error::new(
170 std::io::ErrorKind::Interrupted,
171 "Silenced",
172 )));
173 }
174 if !chunk.is_empty() {
175 if let Ok(mut snk_opt) = sink_synth_clone.try_lock() {
176 if let Some(ref mut snk) = *snk_opt {
177 snk.set_volume(volume);
178 let source = rodio::buffer::SamplesBuffer::new(
179 1, 24000, chunk,
180 );
181 snk.append(source);
182 snk.play();
183 }
184 }
185 }
186 Ok(())
187 },
188 );
189 if let Err(e) = res {
190 if e.to_string() != "Silenced" {
191 let _ = event_tx_synth
192 .send(InferenceEvent::VoiceStatus(format!(
193 "Audio Pipeline: Synthesis Error - {}",
194 e
195 )))
196 .await;
197 }
198 }
199 }
200 drop(engine_opt);
201 }
202 });
203 });
204
205 let mut sentence_buffer = String::new();
208 let mut last_activity = std::time::Instant::now();
209
210 loop {
211 let timeout = std::time::Duration::from_millis(150);
212 let result = rx.recv_timeout(timeout);
213
214 let token = match result {
215 Ok(t) => {
216 last_activity = std::time::Instant::now();
217 Some(t)
218 }
219 Err(mpsc::RecvTimeoutError::Timeout) => {
220 if !sentence_buffer.is_empty() && last_activity.elapsed() > timeout {
221 None
222 } else {
223 continue;
224 }
225 }
226 Err(mpsc::RecvTimeoutError::Disconnected) => break,
227 };
228
229 if let Some(ref text) = token {
230 if !enabled_ctx.load(Ordering::Relaxed) || text == "\x03" {
231 sentence_buffer.clear();
232 continue;
233 }
234 if text == "\x04" {
235 if !sentence_buffer.is_empty() {
236 let to_speak = sentence_buffer.trim().to_string();
237 sentence_buffer.clear();
238 let _ = synth_tx.blocking_send(to_speak);
239 }
240 continue;
241 }
242 sentence_buffer.push_str(text);
243 }
244
245 let to_speak = sentence_buffer.trim().to_string();
246 let has_punctuation = to_speak.ends_with('.')
247 || to_speak.ends_with('!')
248 || to_speak.ends_with('?')
249 || to_speak.ends_with(':')
250 || to_speak.ends_with('\n');
251
252 let is_word_boundary = token
253 .as_ref()
254 .map(|t| t.starts_with(' ') || t.starts_with('\n') || t.starts_with('\t'))
255 .unwrap_or(true);
256
257 let is_done = token.is_none();
258
259 if (!to_speak.is_empty() && has_punctuation && is_word_boundary)
260 || (is_done && !to_speak.is_empty())
261 {
262 sentence_buffer.clear();
263 let _ = synth_tx.blocking_send(to_speak);
264 }
265 }
266 }
267 });
268
269 Self {
270 sender: tx,
271 enabled,
272 available,
273 cancelled,
274 sink: sink_manager_clone,
275 current_voice,
276 current_speed,
277 current_volume,
278 }
279 }
280
281 pub fn speak(&self, text: String) {
282 if self.enabled.load(Ordering::Relaxed) {
283 self.cancelled.store(false, Ordering::SeqCst);
285 let _ = self.sender.try_send(text);
286 }
287 }
288
289 pub fn stop(&self) {
291 self.cancelled.store(true, Ordering::SeqCst);
292 let _ = self.sender.try_send("\x03".to_string());
293 if let Ok(mut lock) = self.sink.try_lock() {
294 if let Some(sink) = lock.as_mut() {
295 sink.stop();
296 sink.pause();
297 sink.play();
298 }
299 }
300 }
301
302 pub fn flush(&self) {
303 if self.enabled.load(Ordering::Relaxed) {
304 let _ = self.sender.try_send("\x04".to_string());
305 }
306 }
307
308 pub fn toggle(&self) -> bool {
309 if !self.available.load(Ordering::Relaxed) {
310 self.enabled.store(false, Ordering::Relaxed);
311 return false;
312 }
313 let current = self.enabled.load(Ordering::Relaxed);
314 let next = !current;
315 self.enabled.store(next, Ordering::Relaxed);
316 next
317 }
318
319 pub fn is_enabled(&self) -> bool {
320 self.available.load(Ordering::Relaxed) && self.enabled.load(Ordering::Relaxed)
321 }
322
323 pub fn is_available(&self) -> bool {
324 self.available.load(Ordering::Relaxed)
325 }
326
327 pub fn set_voice(&self, voice_id: &str) {
329 if let Ok(mut v) = self.current_voice.lock() {
330 *v = voice_id.to_string();
331 }
332 }
333
334 pub fn current_voice_id(&self) -> String {
335 self.current_voice
336 .lock()
337 .map(|v| v.clone())
338 .unwrap_or_else(|_| "af_sky".to_string())
339 }
340
341 pub fn set_speed(&self, speed: f32) {
342 if let Ok(mut v) = self.current_speed.lock() {
343 *v = speed.clamp(0.5, 2.0);
344 }
345 }
346
347 pub fn set_volume(&self, volume: f32) {
348 if let Ok(mut v) = self.current_volume.lock() {
349 *v = volume.clamp(0.0, 3.0);
350 }
351 }
352}
353
/// Selectable voices as `(voice id, display label)` pairs.
///
/// In every entry the first letter of the id matches the accent/language in
/// the label and the second letter matches the gender (`f`/`m`). `af_sky` is
/// the entry labelled as default and is also the fallback id used elsewhere
/// in this file.
/// NOTE(review): the ⭐ marker presumably flags recommended voices — confirm.
pub const VOICE_LIST: &[(&str, &str)] = &[
    // American English, female
    ("af_alloy", "American Female — Alloy"),
    ("af_aoede", "American Female — Aoede"),
    ("af_bella", "American Female — Bella ⭐"),
    ("af_heart", "American Female — Heart ⭐"),
    ("af_jessica", "American Female — Jessica"),
    ("af_kore", "American Female — Kore"),
    ("af_nicole", "American Female — Nicole"),
    ("af_nova", "American Female — Nova"),
    ("af_river", "American Female — River"),
    ("af_sarah", "American Female — Sarah"),
    ("af_sky", "American Female — Sky (default)"),
    // American English, male
    ("am_adam", "American Male — Adam"),
    ("am_echo", "American Male — Echo"),
    ("am_eric", "American Male — Eric"),
    ("am_fenrir", "American Male — Fenrir"),
    ("am_liam", "American Male — Liam"),
    ("am_michael", "American Male — Michael ⭐"),
    ("am_onyx", "American Male — Onyx"),
    ("am_puck", "American Male — Puck"),
    // British English, female
    ("bf_alice", "British Female — Alice"),
    ("bf_emma", "British Female — Emma ⭐"),
    ("bf_isabella", "British Female — Isabella"),
    ("bf_lily", "British Female — Lily"),
    // British English, male
    ("bm_daniel", "British Male — Daniel"),
    ("bm_fable", "British Male — Fable ⭐"),
    ("bm_george", "British Male — George ⭐"),
    ("bm_lewis", "British Male — Lewis"),
    // Spanish
    ("ef_dora", "Spanish Female — Dora"),
    ("em_alex", "Spanish Male — Alex"),
    // French
    ("ff_siwis", "French Female — Siwis"),
    // Hindi
    ("hf_alpha", "Hindi Female — Alpha"),
    ("hf_beta", "Hindi Female — Beta"),
    ("hm_omega", "Hindi Male — Omega"),
    ("hm_psi", "Hindi Male — Psi"),
    // Italian
    ("if_sara", "Italian Female — Sara"),
    ("im_nicola", "Italian Male — Nicola"),
    // Japanese
    ("jf_alpha", "Japanese Female — Alpha"),
    ("jf_gongitsune", "Japanese Female — Gongitsune"),
    ("jf_nezumi", "Japanese Female — Nezumi"),
    ("jf_tebukuro", "Japanese Female — Tebukuro"),
    ("jm_kumo", "Japanese Male — Kumo"),
    // Chinese
    ("zf_xiaobei", "Chinese Female — Xiaobei"),
    ("zf_xiaoni", "Chinese Female — Xiaoni"),
    ("zf_xiaoxiao", "Chinese Female — Xiaoxiao"),
    ("zf_xiaoyi", "Chinese Female — Xiaoyi"),
    ("zm_yunjian", "Chinese Male — Yunjian"),
    ("zm_yunxi", "Chinese Male — Yunxi"),
    ("zm_yunxia", "Chinese Male — Yunxia"),
    ("zm_yunyang", "Chinese Male — Yunyang"),
];