use crate::tts::{AudioFormat, Speaker, SynthesizedAudio};
use crate::{Result, VoiceConfig, VoiceError};
use async_trait::async_trait;
use car_inference::backend::mlx_kokoro::KokoroBackend;
use car_inference::backend_cache::{estimate_model_size, BackendCache, CachedBackend};
use std::path::PathBuf;
use std::sync::{Arc, OnceLock};
/// Returns the process-wide cache of loaded Kokoro backends.
///
/// The cache lives in a function-local `static` and is built on the first
/// call via `BackendCache::from_env`; every later call returns the same
/// instance.
fn shared_cache() -> &'static BackendCache<KokoroBackend> {
    static KOKORO_BACKENDS: OnceLock<BackendCache<KokoroBackend>> = OnceLock::new();
    KOKORO_BACKENDS.get_or_init(BackendCache::<KokoroBackend>::from_env)
}
/// Text-to-speech speaker that synthesizes audio through a `KokoroBackend`
/// obtained from the shared backend cache.
pub struct KokoroSpeaker {
/// Handle to the cached backend; cloned and locked for each synthesis call.
backend: CachedBackend<KokoroBackend>,
/// Voice identifier passed to the backend on every synthesis call.
voice: String,
/// Directory in which the intermediate WAV file for each call is created.
tmp_dir: PathBuf,
}
/// Manual `Debug` that prints only the voice and temp directory; the backend
/// handle is deliberately left out of the output.
impl std::fmt::Debug for KokoroSpeaker {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Destructure so that adding a field forces a decision about whether
        // it belongs in the debug output.
        let Self {
            backend: _,
            voice,
            tmp_dir,
        } = self;

        let mut repr = f.debug_struct("KokoroSpeaker");
        repr.field("voice", voice);
        repr.field("tmp_dir", tmp_dir);
        repr.finish()
    }
}
impl KokoroSpeaker {
    /// Builds a speaker from `config`.
    ///
    /// The model named by `config.local_tts_model` is located on disk with
    /// `resolve_kokoro_model_dir`, then fetched from (or loaded into) the
    /// shared backend cache under that model name. The voice comes from
    /// `config.local_tts_voice`, and intermediate WAV files go to the
    /// system temp directory.
    ///
    /// # Errors
    ///
    /// Returns `VoiceError::Config` when the model directory cannot be
    /// resolved or when loading the backend fails.
    pub fn from_config(config: &VoiceConfig) -> Result<Self> {
        let model_name = &config.local_tts_model;

        let model_dir = match resolve_kokoro_model_dir(model_name) {
            Some(dir) => dir,
            None => {
                return Err(VoiceError::Config(format!(
                    "Kokoro TTS: model '{}' not found in HuggingFace cache — \
                     run `car models pull {}` first",
                    model_name, model_name,
                )));
            }
        };

        // The size estimate is handed to the cache together with the loader.
        let estimated_size = estimate_model_size(&model_dir);
        let loaded = shared_cache().get_or_load(model_name, estimated_size, || {
            KokoroBackend::load(&model_dir)
        });
        let backend = loaded.map_err(|e: car_inference::InferenceError| {
            VoiceError::Config(format!("load Kokoro model: {e}"))
        })?;

        let voice = config.local_tts_voice.clone();
        let tmp_dir = std::env::temp_dir();
        Ok(Self {
            backend,
            voice,
            tmp_dir,
        })
    }
}
#[async_trait]
impl Speaker for KokoroSpeaker {
    /// Synthesizes `text` with the configured voice and returns the audio as
    /// WAV bytes.
    ///
    /// The backend writes to a uniquely named file in `tmp_dir`; that file is
    /// read back into memory and then deleted. Synthesis runs on the blocking
    /// thread pool because it holds a synchronous lock on the backend.
    ///
    /// # Errors
    ///
    /// Returns `VoiceError::Config` if the backend mutex is poisoned, the
    /// backend reports a synthesis error, the blocking task fails to join,
    /// or the produced WAV file cannot be read. The temporary file is removed
    /// (best effort) on every one of these paths.
    async fn synth(&self, text: &str) -> Result<SynthesizedAudio> {
        let requested_path = self
            .tmp_dir
            .join(format!("car-kokoro-{}.wav", uuid::Uuid::new_v4()));
        let voice = self.voice.clone();
        let text = text.to_string();
        let backend = Arc::clone(&self.backend);
        let target = requested_path.clone();

        let joined = tokio::task::spawn_blocking(move || -> Result<PathBuf> {
            let mut guard = backend
                .lock()
                .map_err(|_| VoiceError::Config("Kokoro backend mutex poisoned".into()))?;
            guard
                .synthesize(&text, Some(&voice), &target)
                .map_err(|e| VoiceError::Config(format!("Kokoro synth: {e}")))
        })
        .await;

        let synth_result: Result<PathBuf> = match joined {
            Ok(inner) => inner,
            Err(e) => Err(VoiceError::Config(format!("Kokoro synth task: {e}"))),
        };

        let wav_path = match synth_result {
            Ok(path) => path,
            Err(err) => {
                // A failed synthesis may have left a partial file behind at
                // the path we asked for; removing a file that was never
                // created just yields an error we ignore.
                let _ = tokio::fs::remove_file(&requested_path).await;
                return Err(err);
            }
        };

        // Delete the temp file *before* propagating any read error, so a
        // failed read does not leak the file.
        let read_result = tokio::fs::read(&wav_path).await;
        let _ = tokio::fs::remove_file(&wav_path).await;
        let bytes =
            read_result.map_err(|e| VoiceError::Config(format!("read Kokoro wav: {e}")))?;

        Ok(SynthesizedAudio {
            bytes,
            format: AudioFormat::Wav,
        })
    }
}
/// Locates the most recently modified snapshot directory for `repo` in the
/// local HuggingFace hub cache.
///
/// `repo` is a hub repository id such as `org/name`; slashes are replaced by
/// `--` to form the on-disk directory name `models--org--name`.
///
/// Hub cache roots are tried in this order, and the first one that contains
/// a snapshot for `repo` wins:
///
/// 1. `$HF_HUB_CACHE`
/// 2. `$HF_HOME/hub`
/// 3. `~/.cache/huggingface/hub` (the only location consulted when neither
///    variable is set)
///
/// Empty environment values are treated as unset.
///
/// Returns `None` when no candidate root holds a readable `snapshots`
/// directory with at least one sub-directory whose modification time can be
/// read.
fn resolve_kokoro_model_dir(repo: &str) -> Option<PathBuf> {
    /// Picks the sub-directory of `snapshots` with the latest mtime.
    fn newest_snapshot_dir(snapshots: &std::path::Path) -> Option<PathBuf> {
        std::fs::read_dir(snapshots)
            .ok()?
            // Entries that fail to enumerate are skipped rather than aborting.
            .flatten()
            .filter_map(|entry| {
                let meta = entry.metadata().ok()?;
                if !meta.is_dir() {
                    return None;
                }
                let mtime = meta.modified().ok()?;
                Some((mtime, entry.path()))
            })
            .max_by_key(|(mtime, _)| *mtime)
            .map(|(_, path)| path)
    }

    let env_dir = |name: &str| -> Option<PathBuf> {
        std::env::var_os(name)
            .filter(|value| !value.is_empty())
            .map(PathBuf::from)
    };

    let repo_dir = format!("models--{}", repo.replace('/', "--"));

    let hub_roots = [
        env_dir("HF_HUB_CACHE"),
        env_dir("HF_HOME").map(|hf_home| hf_home.join("hub")),
        dirs::home_dir().map(|home| home.join(".cache/huggingface/hub")),
    ];

    hub_roots
        .iter()
        .flatten()
        .find_map(|hub_root| newest_snapshot_dir(&hub_root.join(&repo_dir).join("snapshots")))
}