car-voice 0.13.0

Voice I/O capability for CAR — mic capture, VAD, listener/speaker traits.

Module documentation:
//! In-process Kokoro-82M TTS provider.
//!
//! Wraps `car_inference::backend::mlx_kokoro::KokoroBackend` so voice
//! synthesis runs entirely on-device via MLX/Metal — no HTTP server, no
//! subprocess, no Python. Replaces the `LocalTtsSpeaker` path for users
//! who don't want to keep an `mlx-audio` (or Piper-shim) Python server
//! running alongside CAR.
//!
//! Only compiled on Apple Silicon macOS because that's the only platform
//! where the MLX backend builds. Other targets should use
//! `TtsProvider::Elevenlabs` or `TtsProvider::Local`.

use crate::tts::{AudioFormat, Speaker, SynthesizedAudio};
use crate::{Result, VoiceConfig, VoiceError};
use async_trait::async_trait;
use car_inference::backend::mlx_kokoro::KokoroBackend;
use car_inference::backend_cache::{estimate_model_size, BackendCache, CachedBackend};
use std::path::PathBuf;
use std::sync::{Arc, OnceLock};

/// Returns the process-wide Kokoro backend cache.
///
/// A single shared `BackendCache` means every `KokoroSpeaker` —
/// whether rebuilt on a config reload or running in a parallel voice
/// session — reuses the same loaded backend instead of paying the
/// model-load cost again. The LRU byte budget is read from
/// `CAR_INFERENCE_MODEL_CACHE_MB`, the same knob the inference
/// engine's caches use, so the two coexist under one limit.
fn shared_cache() -> &'static BackendCache<KokoroBackend> {
    static INSTANCE: OnceLock<BackendCache<KokoroBackend>> = OnceLock::new();
    INSTANCE.get_or_init(|| BackendCache::from_env())
}

/// In-process Kokoro TTS.
///
/// Holds an `Arc<Mutex<KokoroBackend>>` handle out of the shared
/// process-wide cache. Concurrent `synth` calls on the same `voice`
/// serialize on the mutex (MLX ops are not `Sync`); two different
/// voices would hit the same backend anyway since Kokoro voices are
/// style vectors, not separate models — only one backend gets loaded.
pub struct KokoroSpeaker {
    // Cached backend handle from `shared_cache()`; cloning is a cheap
    // `Arc` refcount bump, synthesis serializes on the inner mutex.
    backend: CachedBackend<KokoroBackend>,
    // Kokoro voice id passed to `synthesize` (from `local_tts_voice`).
    voice: String,
    // Directory for per-call scratch WAV files (`std::env::temp_dir()`).
    tmp_dir: PathBuf,
}

impl std::fmt::Debug for KokoroSpeaker {
    /// Manual `Debug`: the `backend` field is intentionally skipped
    /// (presumably the cached MLX handle has no useful `Debug` form —
    /// confirm against `CachedBackend`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("KokoroSpeaker")
            .field("voice", &self.voice)
            .field("tmp_dir", &self.tmp_dir)
            // `finish_non_exhaustive` prints a trailing `..` so readers
            // can tell a field was deliberately omitted, not forgotten.
            .finish_non_exhaustive()
    }
}

impl KokoroSpeaker {
    /// Construct from a [`VoiceConfig`].
    ///
    /// Loads the Kokoro backend through the shared process-wide cache:
    /// the first construction pays the ~1 s model-load cost, every
    /// later construction (and every `synth` call) is cheap.
    ///
    /// # Errors
    /// Returns `VoiceError::Config` when the model directory cannot be
    /// found in the HuggingFace cache or the backend fails to load.
    pub fn from_config(config: &VoiceConfig) -> Result<Self> {
        let model_name = &config.local_tts_model;
        let Some(model_dir) = resolve_kokoro_model_dir(model_name) else {
            return Err(VoiceError::Config(format!(
                "Kokoro TTS: model '{model_name}' not found in HuggingFace cache — \
                 run `car models pull {model_name}` first"
            )));
        };
        let cache_key = model_name.clone();
        let estimated_bytes = estimate_model_size(&model_dir);
        let backend = shared_cache()
            .get_or_load(&cache_key, estimated_bytes, || {
                KokoroBackend::load(&model_dir)
            })
            .map_err(|err: car_inference::InferenceError| {
                VoiceError::Config(format!("load Kokoro model: {err}"))
            })?;
        Ok(Self {
            backend,
            voice: config.local_tts_voice.clone(),
            tmp_dir: std::env::temp_dir(),
        })
    }
}

#[async_trait]
impl Speaker for KokoroSpeaker {
    /// Synthesize `text` to in-memory WAV bytes via the in-process
    /// Kokoro backend.
    ///
    /// # Errors
    /// Returns `VoiceError::Config` if the backend mutex is poisoned,
    /// synthesis fails, the blocking task is cancelled/panics, or the
    /// temporary WAV file cannot be read back.
    async fn synth(&self, text: &str) -> Result<SynthesizedAudio> {
        // Write to a per-call temp WAV, read it back into memory, and
        // return the bytes. The backend API is path-based; the disk
        // round-trip is negligible vs synthesis time.
        let wav_path = self
            .tmp_dir
            .join(format!("car-kokoro-{}.wav", uuid::Uuid::new_v4()));
        let voice = self.voice.clone();
        let text = text.to_string();
        let backend = Arc::clone(&self.backend);
        let wav_path_clone = wav_path.clone();

        // `synthesize` is synchronous and CPU/GPU-bound. Hold the
        // sync Mutex only on the blocking worker thread to keep the
        // tokio runtime moving.
        let wav_path = tokio::task::spawn_blocking(move || -> Result<PathBuf> {
            let mut guard = backend
                .lock()
                .map_err(|_| VoiceError::Config("Kokoro backend mutex poisoned".into()))?;
            guard
                .synthesize(&text, Some(&voice), &wav_path_clone)
                .map_err(|e| VoiceError::Config(format!("Kokoro synth: {e}")))
        })
        .await
        .map_err(|e| VoiceError::Config(format!("Kokoro synth task: {e}")))??;

        // Read the WAV back, then delete the temp file *before*
        // propagating any read error — otherwise a failed read would
        // return early via `?` and leak the file in the temp dir.
        let read_result = tokio::fs::read(&wav_path).await;
        let _ = tokio::fs::remove_file(&wav_path).await;
        let bytes =
            read_result.map_err(|e| VoiceError::Config(format!("read Kokoro wav: {e}")))?;

        Ok(SynthesizedAudio {
            bytes,
            format: AudioFormat::Wav,
        })
    }
}

/// Locate the Kokoro model dir in the HuggingFace cache.
///
/// Resolves the hub cache root in the same precedence order `hf_hub`
/// uses: `HF_HUB_CACHE`, then `HF_HOME` (+ `/hub`), then the default
/// `~/.cache/huggingface/hub`. Within it, matches
/// `models--{org}--{repo}/snapshots/*/` — the same layout
/// `hf_hub::api` writes and that our `car_inference` loaders read.
///
/// Returns `None` when the cache root can't be determined, the repo
/// dir doesn't exist, or it contains no snapshot directories.
fn resolve_kokoro_model_dir(repo: &str) -> Option<PathBuf> {
    // Honor the standard HF cache overrides before falling back to the
    // default location under the home directory.
    let hub_dir = if let Some(cache) = std::env::var_os("HF_HUB_CACHE") {
        PathBuf::from(cache)
    } else if let Some(hf_home) = std::env::var_os("HF_HOME") {
        PathBuf::from(hf_home).join("hub")
    } else {
        // This module only builds on macOS (see module docs), where
        // `$HOME` is set for any user session.
        PathBuf::from(std::env::var_os("HOME")?).join(".cache/huggingface/hub")
    };
    let normalized = repo.replace('/', "--");
    let snapshots = hub_dir
        .join(format!("models--{normalized}"))
        .join("snapshots");
    let entries = std::fs::read_dir(&snapshots).ok()?;
    // Pick the newest snapshot dir (there's typically one; take the most
    // recent mtime to be resilient to partial pulls).
    entries
        .flatten()
        .filter_map(|e| {
            let p = e.path();
            let meta = e.metadata().ok()?;
            if !meta.is_dir() {
                return None;
            }
            let mtime = meta.modified().ok()?;
            Some((mtime, p))
        })
        .max_by_key(|(m, _)| *m)
        .map(|(_, p)| p)
}