car-voice 0.14.0

Voice I/O capability for CAR — mic capture, VAD, listener/speaker traits
//! In-process Whisper speech-to-text via `whisper-rs` (FFI to
//! whisper.cpp).
//!
//! Replaces the ElevenLabs Scribe cloud path and the earlier Python
//! HTTP shim. Runs entirely inside the Tokhn process — model is loaded
//! once into memory, inference uses whisper.cpp's Metal backend on
//! Apple Silicon, and captured samples are handed to the FFI directly
//! with no HTTP, no subprocess, no Python.
//!
//! Hallucination resistance comes from three things whisper.cpp already
//! does well (the corresponding `FullParams` calls are sketched after
//! this list):
//! - The decoder's `no_speech_probability` output — segments where the
//!   model is confident the audio is non-speech get dropped before
//!   they reach the user.
//! - Suppressing the "non-speech token" class at decode time so
//!   `[BLANK_AUDIO]` / `[MUSIC]` style captions don't come through as
//!   real text.
//! - Not conditioning on previous text, which is the source of the
//!   "Thank you for watching." YouTube-caption hallucination Whisper
//!   learned from training data.
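//!
//! In `FullParams` terms, as set below in `run_transcription`:
//!
//! ```ignore
//! params.set_no_context(true);     // no conditioning on prior text
//! params.set_suppress_nst(true);   // suppress [BLANK_AUDIO]-style tokens
//! params.set_no_speech_thold(0.6); // drop likely non-speech segments
//! ```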
//!
//! Model files are downloaded once from the `ggerganov/whisper.cpp`
//! model repository on Hugging Face and cached at
//! `~/.tokhn/whisper/ggml-<model>.bin`. First-run startup pauses for
//! the download (~600 MB for the default quantized turbo model);
//! subsequent launches load from disk in a couple of seconds.
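//!
//! # Example
//!
//! A minimal call-pattern sketch. `VoiceConfig` construction is elided
//! (its fields live elsewhere in the crate), and the import paths are
//! assumptions; adjust to where this module actually sits:
//!
//! ```ignore
//! use car_voice::stt::SttProvider;
//! use car_voice::{Result, VoiceConfig};
//!
//! async fn transcribe_chunk(config: &VoiceConfig, samples: &[f32]) -> Result<String> {
//!     // Loads the configured GGML model, downloading it on first run.
//!     let stt = WhisperCppSttProvider::from_config(config)?;
//!     // Any capture rate works; `transcribe` resamples to 16 kHz itself.
//!     stt.transcribe(samples, 48_000).await
//! }
//! ```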

use std::path::{Path, PathBuf};
use std::sync::Arc;

use async_trait::async_trait;
use rubato::{FftFixedIn, Resampler};
use whisper_rs::{FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters};

use crate::stt::SttProvider;
use crate::{Result, VoiceConfig, VoiceError};

/// Sample rate whisper.cpp expects. Non-negotiable.
const WHISPER_SAMPLE_RATE: u32 = 16_000;

/// Base URL where `ggerganov/whisper.cpp` hosts the GGML model bundles.
const WHISPER_MODEL_BASE_URL: &str = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main";

/// In-process Whisper STT.
pub struct WhisperCppSttProvider {
    ctx: Arc<WhisperContext>,
    language: String,
}

impl std::fmt::Debug for WhisperCppSttProvider {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("WhisperCppSttProvider")
            .field("language", &self.language)
            .finish()
    }
}

impl WhisperCppSttProvider {
    /// Build a provider by resolving the configured model file and
    /// loading it. If the file isn't present at the expected cache
    /// path yet, this downloads it synchronously — first-run cost is
    /// the model download (~600 MB for the default turbo model, more
    /// for larger variants).
    pub fn from_config(config: &VoiceConfig) -> Result<Self> {
        let model_id = &config.whisper_cpp_model;
        let model_path = resolve_model_path(model_id)?;
        if !model_path.exists() {
            tracing::info!(
                "[whisper-cpp] model file missing at {}, downloading from {}",
                model_path.display(),
                WHISPER_MODEL_BASE_URL,
            );
            download_model(model_id, &model_path)?;
        }
        tracing::info!(
            "[whisper-cpp] loading {} ({} MB on disk)",
            model_path.display(),
            std::fs::metadata(&model_path)
                .map(|m| m.len() / (1024 * 1024))
                .unwrap_or(0),
        );
        let ctx = WhisperContext::new_with_params(
            model_path
                .to_str()
                .ok_or_else(|| VoiceError::Stt("model path is not utf-8".into()))?,
            WhisperContextParameters::default(),
        )
        .map_err(|e| VoiceError::Stt(format!("whisper ctx init: {e}")))?;
        Ok(Self {
            ctx: Arc::new(ctx),
            language: config.language.clone(),
        })
    }
}

#[async_trait]
impl SttProvider for WhisperCppSttProvider {
    async fn transcribe(&self, samples: &[f32], sample_rate: u32) -> Result<String> {
        // Whisper wants 16 kHz mono f32 in [-1, 1]. We take responsibility
        // for the resample here; callers can hand us whatever VPIO
        // chose for its capture bus.
        let samples_16k = if sample_rate == WHISPER_SAMPLE_RATE {
            samples.to_vec()
        } else {
            resample_to_16k(samples, sample_rate)?
        };

        let ctx = Arc::clone(&self.ctx);
        let lang = self.language.clone();

        // whisper.cpp inference is CPU/GPU-bound (Metal on Apple
        // Silicon) but is synchronous — move it off the tokio runtime
        // onto a blocking pool so we don't stall other async tasks.
        tokio::task::spawn_blocking(move || run_transcription(&ctx, &samples_16k, &lang))
            .await
            .map_err(|e| VoiceError::Stt(format!("whisper join: {e}")))?
    }
}

fn run_transcription(ctx: &WhisperContext, samples: &[f32], language: &str) -> Result<String> {
    let mut state = ctx
        .create_state()
        .map_err(|e| VoiceError::Stt(format!("whisper state: {e}")))?;

    let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 });
    params.set_language(Some(language));
    // Don't carry tokens from one segment to the next — this is the
    // single biggest source of Whisper's "Thank you for watching" style
    // hallucinations (the decoder runs away on a self-similar prefix
    // it learned from YouTube captions).
    params.set_no_context(true);
    // Suppress bracketed non-speech tokens ([BLANK_AUDIO], [MUSIC], ...)
    // so they never surface as fake transcripts.
    params.set_suppress_nst(true);
    // Drop segments where the model's no-speech probability is high —
    // these are the segments most prone to hallucinated text.
    params.set_no_speech_thold(0.6);
    // Keep log output quiet — whisper.cpp logs progress by default.
    params.set_print_progress(false);
    params.set_print_realtime(false);
    params.set_print_special(false);
    params.set_print_timestamps(false);
    // Single-threaded is actually faster on Metal (the GPU path
    // doesn't benefit from host threads) and avoids stealing cycles
    // from the VPIO realtime audio thread, so the default stays 1.
    //
    // CPU-only Linux containers running q5_0-turbo (meet-bot under
    // linux/arm64, GPU-less Cloud Run / ECS workers) need multiple
    // threads to stay under realtime: at 1 thread, each 6 s segment
    // took ~40 s wallclock; at 6 threads, ~9 s — under realtime with
    // zero backlog drops (#144). Override via `WHISPER_N_THREADS=<n>`;
    // unset, empty, unparseable, or sub-1 values fall back to 1.
    let n_threads = std::env::var("WHISPER_N_THREADS")
        .ok()
        .and_then(|s| s.trim().parse::<i32>().ok())
        .filter(|&n| n >= 1)
        .unwrap_or(1);
    params.set_n_threads(n_threads);

    state
        .full(params, samples)
        .map_err(|e| VoiceError::Stt(format!("whisper full: {e}")))?;

    let n = state
        .full_n_segments()
        .map_err(|e| VoiceError::Stt(format!("whisper n_segments: {e}")))?;
    let mut text = String::new();
    for i in 0..n {
        let seg = state
            .full_get_segment_text(i)
            .map_err(|e| VoiceError::Stt(format!("whisper get_segment_text: {e}")))?;
        text.push_str(&seg);
    }
    Ok(text.trim().to_string())
}

fn resolve_model_path(model_id: &str) -> Result<PathBuf> {
    let cache_root = dirs::home_dir()
        .ok_or_else(|| VoiceError::Stt("no home dir".into()))?
        .join(".tokhn")
        .join("whisper");
    std::fs::create_dir_all(&cache_root)
        .map_err(|e| VoiceError::Stt(format!("create cache dir: {e}")))?;
    Ok(cache_root.join(format!("ggml-{model_id}.bin")))
}

fn download_model(model_id: &str, dest: &Path) -> Result<()> {
    let file_name = format!("ggml-{model_id}.bin");
    let url = format!("{WHISPER_MODEL_BASE_URL}/{file_name}");
    tracing::info!("[whisper-cpp] downloading {url}");

    // Blocking reqwest call — we're in `from_config` which is sync.
    // This pauses startup for the size of the model (600 MB–3 GB
    // depending on variant). Happens exactly once per model, then the
    // file is cached.
    let client = reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(60 * 10))
        .build()
        .map_err(|e| VoiceError::Stt(format!("http client: {e}")))?;
    let mut resp = client
        .get(&url)
        .send()
        .map_err(|e| VoiceError::Stt(format!("download {url}: {e}")))?;
    if !resp.status().is_success() {
        return Err(VoiceError::Stt(format!(
            "download {url} returned {}",
            resp.status()
        )));
    }

    // Write to a tempfile and rename atomically so an interrupted
    // download doesn't leave a truncated file at the cache path.
    let tmp = dest.with_extension("bin.partial");
    {
        let mut f =
            std::fs::File::create(&tmp).map_err(|e| VoiceError::Stt(format!("create tmp: {e}")))?;
        resp.copy_to(&mut f)
            .map_err(|e| VoiceError::Stt(format!("write: {e}")))?;
    }
    std::fs::rename(&tmp, dest).map_err(|e| VoiceError::Stt(format!("rename: {e}")))?;

    tracing::info!(
        "[whisper-cpp] saved to {} ({} MB)",
        dest.display(),
        std::fs::metadata(dest)
            .map(|m| m.len() / (1024 * 1024))
            .unwrap_or(0),
    );
    Ok(())
}

/// Resample arbitrary-rate f32 mono audio down to whisper.cpp's 16 kHz
/// input requirement. Uses a fixed-size FFT resampler; accuracy is
/// plenty for speech content at these ratios.
fn resample_to_16k(samples: &[f32], source_rate: u32) -> Result<Vec<f32>> {
    if samples.is_empty() {
        return Ok(Vec::new());
    }
    // `FftFixedIn` works in fixed-size chunks — whisper models swallow
    // up to 30 seconds at once and our segments are rarely >10 s, so
    // a 1024-sample frame gives plenty of granularity.
    let chunk_size = 1024;
    let mut resampler = FftFixedIn::<f32>::new(
        source_rate as usize,
        WHISPER_SAMPLE_RATE as usize,
        chunk_size,
        1,
        1,
    )
    .map_err(|e| VoiceError::Stt(format!("resampler init: {e}")))?;

    let mut out = Vec::with_capacity(
        samples.len() * WHISPER_SAMPLE_RATE as usize / source_rate as usize + chunk_size,
    );
    let mut cursor = 0usize;
    while cursor + chunk_size <= samples.len() {
        let input = vec![samples[cursor..cursor + chunk_size].to_vec()];
        let output = resampler
            .process(&input, None)
            .map_err(|e| VoiceError::Stt(format!("resampler: {e}")))?;
        out.extend_from_slice(&output[0]);
        cursor += chunk_size;
    }
    // Handle the tail (< chunk_size samples) by zero-padding it to a
    // full chunk before calling `process`. The padding appends <64 ms
    // of silence at the end, acceptable for speech.
    if cursor < samples.len() {
        let tail = samples[cursor..].to_vec();
        let mut padded = tail;
        padded.resize(chunk_size, 0.0);
        let input = vec![padded];
        let output = resampler
            .process(&input, None)
            .map_err(|e| VoiceError::Stt(format!("resampler tail: {e}")))?;
        out.extend_from_slice(&output[0]);
    }
    Ok(out)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn resolve_model_path_uses_home_cache() {
        let p = resolve_model_path("tiny").unwrap();
        assert!(p.ends_with("ggml-tiny.bin"));
        assert!(p.to_string_lossy().contains(".tokhn/whisper"));
    }

    #[test]
    fn resample_empty_is_empty() {
        let out = resample_to_16k(&[], 44_100).unwrap();
        assert!(out.is_empty());
    }

    #[test]
    fn resample_keeps_rate_when_equal() {
        // 16 kHz → 16 kHz shouldn't invoke the resampler (same-rate
        // fast path in the caller), but the helper should still
        // behave sensibly when called directly.
        let input: Vec<f32> = (0..2048).map(|i| (i as f32 / 100.0).sin()).collect();
        let out = resample_to_16k(&input, 16_000).unwrap();
        // Ratio is 1.0, so the length should stay within roughly one
        // chunk of the input.
        let diff = (out.len() as i64 - input.len() as i64).abs();
        assert!(
            diff < 1200,
            "out.len={} input.len={}",
            out.len(),
            input.len()
        );
    }
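
    // Sketch of an added check: downsampling 48 kHz input should yield
    // roughly a third as many samples, with slack for the resampler's
    // internal delay and the zero-padded tail chunk.
    #[test]
    fn resample_downsamples_48k_to_16k() {
        let input: Vec<f32> = (0..4800).map(|i| (i as f32 / 50.0).sin()).collect();
        let out = resample_to_16k(&input, 48_000).unwrap();
        let expected = input.len() / 3;
        let diff = (out.len() as i64 - expected as i64).abs();
        assert!(diff < 1200, "out.len={} expected~{}", out.len(), expected);
    }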
}