car-voice 0.13.0

Voice I/O capability for CAR — mic capture, VAD, listener/speaker traits
Documentation
//! Voice subsystem configuration.

use serde::{Deserialize, Serialize};
use std::sync::Arc;
use tokio::sync::watch;

/// Default voice-context prompt overlay. Prepended to the system prompt
/// of voice-invoked inference calls so the model emits short,
/// voice-appropriate responses without further coaxing. Verbose by
/// design — quip's production canaries showed this exact shape works
/// where shorter prompts didn't.
///
/// The literal below is sent to the model verbatim (see
/// [`compose_voice_context`]), so edit it with the same care as code.
///
/// Override with [`VoiceConfig::voice_prompt_overlay`]; an explicit
/// empty string disables the overlay entirely. See
/// `docs/proposals/voice-sidecar-orchestration.md` §"Voice-context
/// prompt overlay".
pub const DEFAULT_VOICE_PROMPT_OVERLAY: &str =
    "[VOICE CONTEXT: This is a real-time voice call. Speed is critical — act immediately.
When checking email, get ALL emails in the inbox (not just unread — set unreadOnly to false).
The most important emails are ones that were read but never replied to or acted on.
Skip newsletters, marketing emails, and automated notifications.
Return ONLY subject lines and sender names. Do NOT include email bodies.
Focus on personal/work emails from real people that likely need a response.
When checking calendar, limit to the next 2 weeks and list only time, title, and attendees.
Do NOT ask clarifying questions — use sensible defaults and act.
Keep responses under 500 characters. The user is waiting on a live call.]";

/// How the listener decides when to record speech.
///
/// Serialized in `snake_case`: `"auto"`, `"push_to_talk"`, `"wake_word"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ListenerMode {
    /// Always-on listening with VAD-driven turn detection.
    /// Voice-first GUIs default to this.
    Auto,
    /// Manual control: caller drives `start_segment` / `end_segment`.
    PushToTalk,
    /// Always-on but only emits transcripts after a wake word is heard.
    WakeWord,
}

/// Which TTS backend to use.
///
/// Serialized in `snake_case`: `"elevenlabs"`, `"local"`, `"kokoro"`,
/// `"apple_speech"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TtsProvider {
    /// ElevenLabs cloud TTS (requires API key).
    Elevenlabs,
    /// Local OpenAI-compatible TTS server (e.g. mlx-audio, Piper). HTTP
    /// client — requires a separate server (often Python) running.
    Local,
    /// In-process Kokoro-82M TTS via MLX/Metal. No HTTP, no server, no
    /// Python. macOS-only. Model is pulled to the HuggingFace cache on
    /// first use.
    Kokoro,
    /// macOS AVSpeechSynthesizer — Apple's built-in TTS. Free, on-device,
    /// no model download, no MLX dependency. macOS only; non-macOS
    /// targets reject this variant in `provider::build_tts_speaker`.
    AppleSpeech,
}

impl Default for TtsProvider {
    fn default() -> Self {
        // On macOS, prefer Apple's built-in AVSpeechSynthesizer — free,
        // on-device, no model download, no Kokoro/MLX setup. Kokoro is
        // still available as an explicit opt-in for users who prefer its
        // voice character. Non-macOS targets fall back to ElevenLabs;
        // none of the local TTS paths run cross-platform without setup.
        #[cfg(target_os = "macos")]
        {
            Self::AppleSpeech
        }
        #[cfg(not(target_os = "macos"))]
        {
            Self::Elevenlabs
        }
    }
}

/// Which STT backend to use.
///
/// Serialized in `snake_case`: `"elevenlabs"`, `"whisper_cpp"`,
/// `"parakeet"`, `"apple_speech"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SttProvider {
    /// ElevenLabs Scribe cloud STT (requires API key).
    Elevenlabs,
    /// In-process Whisper via whisper.cpp. Runs entirely on-device
    /// with Metal acceleration on Apple Silicon. No HTTP, no server,
    /// no Python — model file is downloaded once and cached at
    /// `~/.tokhn/whisper/`.
    WhisperCpp,
    /// In-process Parakeet TDT via ONNX Runtime. Faster than whisper
    /// on Apple Silicon and emits per-token timestamps natively for
    /// cleaner streaming UX. Requires the `parakeet` cargo feature
    /// (pulls in `ort` + `ndarray`); model files (~600 MB) downloaded
    /// once to `~/.car/models/parakeet-tdt-0.6b-v2-int8/`.
    Parakeet,
    /// macOS SFSpeechRecognizer — Apple's built-in STT. On-device, free,
    /// multilingual, no model download. Requires Speech Recognition
    /// permission (host calls SFSpeechRecognizer.requestAuthorization at
    /// startup). macOS only; non-macOS targets reject this variant.
    AppleSpeech,
}

impl Default for SttProvider {
    fn default() -> Self {
        // On macOS, prefer Apple's built-in SFSpeechRecognizer — free,
        // on-device, no model download. whisper.cpp stays as an explicit
        // opt-in (and remains the cross-platform default elsewhere).
        #[cfg(target_os = "macos")]
        {
            Self::AppleSpeech
        }
        #[cfg(not(target_os = "macos"))]
        {
            Self::WhisperCpp
        }
    }
}

impl Default for ListenerMode {
    fn default() -> Self {
        Self::Auto
    }
}

/// Configuration for the voice subsystem.
///
/// Loaded from `tokhn-config` (or env vars / defaults). Channels do not
/// re-derive any of this — they pass it to [`crate::Listener::start`].
///
/// Every field carries a `#[serde(default)]`, so a partial config file
/// deserializes cleanly with the remaining fields filled from the
/// `default_*` helpers below.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct VoiceConfig {
    // ─── Provider selection ────────────────────────────────────────────────
    /// STT provider.
    #[serde(default)]
    pub stt_provider: SttProvider,

    /// TTS provider.
    #[serde(default)]
    pub tts_provider: TtsProvider,

    /// ElevenLabs API key. Resolved from `ELEVENLABS_API_KEY` env var if not
    /// set explicitly.
    #[serde(default)]
    pub elevenlabs_api_key: Option<String>,

    /// ElevenLabs voice ID for narration. Defaults to the TARS-style narrator
    /// the Tauri app was using.
    #[serde(default = "default_voice_id")]
    pub elevenlabs_voice_id: String,

    /// ElevenLabs TTS model.
    #[serde(default = "default_tts_model")]
    pub elevenlabs_tts_model: String,

    // ─── Local provider config ────────────────────────────────────────────
    /// Base URL for the local OpenAI-compatible TTS server.
    #[serde(default = "default_local_tts_url")]
    pub local_tts_url: String,

    /// Model name for local TTS (e.g. `"mlx-community/Kokoro-82M-bf16"`).
    #[serde(default = "default_local_tts_model")]
    pub local_tts_model: String,

    /// Whisper model identifier for the in-process whisper.cpp STT
    /// provider. Matches the suffix used by `ggerganov/whisper.cpp`
    /// on Hugging Face — e.g. `"large-v3-turbo-q5_0"`, `"medium-q5_0"`,
    /// `"tiny-q5_0"`. First-run launch downloads
    /// `ggml-<model>.bin` from there and caches it at
    /// `~/.tokhn/whisper/ggml-<model>.bin`.
    #[serde(default = "default_whisper_cpp_model")]
    pub whisper_cpp_model: String,

    /// Voice name for local TTS (provider-specific, e.g. `"af_heart"`).
    #[serde(default = "default_local_tts_voice")]
    pub local_tts_voice: String,

    /// Playback speed multiplier for local TTS.
    #[serde(default = "default_local_tts_speed")]
    pub local_tts_speed: f32,

    /// Sampling temperature for local TTS (Qwen3-TTS).
    #[serde(default = "default_local_tts_temperature")]
    pub local_tts_temperature: f32,

    /// Reference audio path for voice cloning (Qwen3-TTS-Base).
    #[serde(default)]
    pub local_tts_ref_audio: Option<String>,

    /// Reference text for voice cloning (Qwen3-TTS-Base).
    #[serde(default)]
    pub local_tts_ref_text: Option<String>,

    /// Natural language voice description for voice design (Qwen3-TTS-VoiceDesign).
    #[serde(default)]
    pub local_tts_instruct: Option<String>,

    // ─── Audio device ──────────────────────────────────────────────────────
    /// Optional input device name. `None` means use the OS default.
    #[serde(default)]
    pub input_device: Option<String>,

    /// Capture sample rate in Hz. Common values: 16000, 44100, 48000.
    /// 16 kHz mono is what ElevenLabs STT prefers.
    #[serde(default = "default_sample_rate")]
    pub sample_rate: u32,

    /// Language code for STT (e.g. `"en"`, `"es"`).
    #[serde(default = "default_language")]
    pub language: String,

    // ─── Listener UX policy ────────────────────────────────────────────────
    /// Listening mode (auto / push-to-talk / wake-word).
    #[serde(default)]
    pub mode: ListenerMode,

    /// Wake word(s) recognized in `WakeWord` mode.
    #[serde(default = "default_wake_words")]
    pub wake_words: Vec<String>,

    // ─── VAD tuning ────────────────────────────────────────────────────────
    /// VAD energy threshold above the noise floor, in dB. Raised from 9.0
    /// to 15.0 in commit `7033ca3` so the audio bed stops bleeding into the
    /// AirPods microphone.
    #[serde(default = "default_vad_threshold_db")]
    pub vad_threshold_db: f32,

    /// Minimum continuous voiced energy before SpeechStart fires (ms).
    #[serde(default = "default_speech_onset_ms")]
    pub speech_onset_ms: u32,

    /// Silence duration that ends a turn (ms).
    #[serde(default = "default_turn_end_ms")]
    pub turn_end_ms: u32,

    /// IIR smoothing factor for energy estimates (0.0–1.0). Higher = smoother.
    #[serde(default = "default_smoothing_factor")]
    pub smoothing_factor: f32,

    /// Hysteresis margin in dB to prevent rapid speech/silence flapping.
    #[serde(default = "default_hysteresis_db")]
    pub hysteresis_db: f32,

    // ─── Barge-in / capture-loop tuning ───────────────────────────────────
    /// While Tokhn is playing TTS, the VAD threshold is raised by this
    /// many dB so only loud user speech (a barge-in) registers. Lower
    /// = more sensitive to interruption; higher = more echo tolerance.
    #[serde(default = "default_barge_in_boost_db")]
    pub barge_in_boost_db: f32,

    /// Milliseconds to keep the threshold boost active *after* TTS ends,
    /// to absorb speaker tail / room reverb.
    #[serde(default = "default_boost_tail_ms")]
    pub boost_tail_ms: u64,

    /// Hard cap on segment length. Anything longer is force-finalized —
    /// VAD has been seen to lock onto background noise and produce
    /// 17-second "speech" segments that never naturally end.
    #[serde(default = "default_max_segment_ms")]
    pub max_segment_ms: u64,

    /// Reject finalized segments whose average RMS isn't this many dB
    /// above the calibrated noise floor. Drops low-energy noise before
    /// it reaches STT (which then hallucinates content from it).
    #[serde(default = "default_segment_min_snr_db")]
    pub segment_min_snr_db: f32,

    // ─── Voice orchestration ──────────────────────────────────────────────
    /// Voice-context prompt overlay prepended to system prompts on the
    /// voice-invoked inference path. `None` uses
    /// [`DEFAULT_VOICE_PROMPT_OVERLAY`]. An explicit `Some("".into())`
    /// disables the overlay (e.g. for callers who already supply their
    /// own voice-tuned system prompt).
    #[serde(default)]
    pub voice_prompt_overlay: Option<String>,

    /// Progress-phrase interval in seconds. While the sidecar is
    /// still running, play a short "still working on that" phrase
    /// every `progress_interval_secs`. `None` defaults to 8 (the
    /// fallback is applied by the consumer, not here).
    #[serde(default)]
    pub progress_interval_secs: Option<u64>,

    /// Maximum number of progress-phrase attempts before giving up
    /// on the sidecar. `None` defaults to 4 — combined with the
    /// 8-second interval that's a 32-second cap, lower than the
    /// default `sidecar_timeout` so progress is the dominant
    /// abandonment signal. (Fallback applied by the consumer.)
    #[serde(default)]
    pub max_progress_attempts: Option<u32>,
}

impl VoiceConfig {
    /// Construct a `VoiceConfig` by starting from [`Self::default`] and
    /// applying overrides read from environment variables. Supported
    /// vars (all optional; empty values for the URL/model vars are
    /// ignored):
    ///
    /// - `TOKHN_STT_PROVIDER` = `"whisper-cpp"` (aliases `"whispercpp"`,
    ///   `"whisper_cpp"`, `"local"`) | `"elevenlabs"` | `"parakeet"` |
    ///   `"apple_speech"` (aliases `"apple"`, `"sfspeech"`).
    /// - `TOKHN_STT_MODEL` — whisper.cpp model id for the in-process
    ///   STT provider (see [`VoiceConfig::whisper_cpp_model`]).
    /// - `TOKHN_TTS_PROVIDER` = `"local"` | `"kokoro"` | `"elevenlabs"`
    ///   | `"apple_speech"` (aliases `"apple"`, `"avspeech"`).
    /// - `TOKHN_TTS_URL` — local TTS server base URL.
    /// - `TOKHN_TTS_MODEL` — local TTS model id.
    ///
    /// Unknown values for the `*_PROVIDER` vars are ignored (default
    /// stays in effect); callers should verify a known provider is
    /// reachable before relying on it.
    pub fn from_env() -> Self {
        let mut cfg = Self::default();
        if let Ok(v) = std::env::var("TOKHN_STT_PROVIDER") {
            match v.to_lowercase().as_str() {
                // Accept `whisper-cpp`, `whispercpp`, and `local` as
                // aliases — `local` is a historical name that's still
                // in docs/shells, but the provider is in-process
                // whisper.cpp regardless.
                "whisper-cpp" | "whispercpp" | "whisper_cpp" | "local" => {
                    cfg.stt_provider = SttProvider::WhisperCpp;
                }
                "elevenlabs" | "eleven_labs" | "eleven-labs" => {
                    cfg.stt_provider = SttProvider::Elevenlabs;
                }
                "parakeet" | "parakeet-tdt" | "parakeet_tdt" => {
                    cfg.stt_provider = SttProvider::Parakeet;
                }
                "apple_speech" | "apple-speech" | "apple" | "sfspeech" => {
                    cfg.stt_provider = SttProvider::AppleSpeech;
                }
                _ => {}
            }
        }
        if let Ok(v) = std::env::var("TOKHN_STT_MODEL") {
            if !v.is_empty() {
                cfg.whisper_cpp_model = v;
            }
        }
        if let Ok(v) = std::env::var("TOKHN_TTS_PROVIDER") {
            match v.to_lowercase().as_str() {
                "local" => cfg.tts_provider = TtsProvider::Local,
                "kokoro" | "kokoro_native" | "kokoro-native" => {
                    cfg.tts_provider = TtsProvider::Kokoro;
                }
                "elevenlabs" | "eleven_labs" | "eleven-labs" => {
                    cfg.tts_provider = TtsProvider::Elevenlabs;
                }
                "apple_speech" | "apple-speech" | "apple" | "avspeech" => {
                    cfg.tts_provider = TtsProvider::AppleSpeech;
                }
                _ => {}
            }
        }
        if let Ok(v) = std::env::var("TOKHN_TTS_URL") {
            if !v.is_empty() {
                cfg.local_tts_url = v;
            }
        }
        if let Ok(v) = std::env::var("TOKHN_TTS_MODEL") {
            if !v.is_empty() {
                cfg.local_tts_model = v;
            }
        }
        cfg
    }
}

impl Default for VoiceConfig {
    /// Mirrors the per-field `#[serde(default)]` helpers, so a
    /// default-constructed config is identical to one deserialized
    /// from an empty document.
    fn default() -> Self {
        Self {
            stt_provider: SttProvider::default(),
            tts_provider: TtsProvider::default(),
            elevenlabs_api_key: None,
            elevenlabs_voice_id: default_voice_id(),
            elevenlabs_tts_model: default_tts_model(),
            local_tts_url: default_local_tts_url(),
            local_tts_model: default_local_tts_model(),
            whisper_cpp_model: default_whisper_cpp_model(),
            local_tts_voice: default_local_tts_voice(),
            local_tts_speed: default_local_tts_speed(),
            local_tts_temperature: default_local_tts_temperature(),
            local_tts_ref_audio: None,
            local_tts_ref_text: None,
            local_tts_instruct: None,
            input_device: None,
            sample_rate: default_sample_rate(),
            language: default_language(),
            mode: ListenerMode::default(),
            wake_words: default_wake_words(),
            vad_threshold_db: default_vad_threshold_db(),
            speech_onset_ms: default_speech_onset_ms(),
            turn_end_ms: default_turn_end_ms(),
            smoothing_factor: default_smoothing_factor(),
            hysteresis_db: default_hysteresis_db(),
            barge_in_boost_db: default_barge_in_boost_db(),
            boost_tail_ms: default_boost_tail_ms(),
            max_segment_ms: default_max_segment_ms(),
            segment_min_snr_db: default_segment_min_snr_db(),
            voice_prompt_overlay: None,
            progress_interval_secs: None,
            max_progress_attempts: None,
        }
    }
}

/// Compose the voice-context overlay with an optional caller system
/// prompt. Returns the combined string suitable for
/// `GenerateRequest.context`.
///
/// Behaviour:
/// - `config.voice_prompt_overlay = None` → use [`DEFAULT_VOICE_PROMPT_OVERLAY`].
/// - `config.voice_prompt_overlay = Some("")` → disable overlay
///   (caller's prompt is returned unchanged; `None` if no caller prompt).
/// - Both overlay and caller prompt present → overlay first, blank line,
///   then caller prompt.
pub fn compose_voice_context(config: &VoiceConfig, caller_context: Option<&str>) -> Option<String> {
    // An unset overlay means "use the built-in default"; only an
    // explicit empty string disables it.
    let overlay = match config.voice_prompt_overlay.as_deref() {
        Some(custom) => custom,
        None => DEFAULT_VOICE_PROMPT_OVERLAY,
    };
    if overlay.is_empty() {
        // Overlay disabled: pass the caller's prompt through untouched.
        return caller_context.map(str::to_string);
    }
    match caller_context {
        Some(ctx) => Some(format!("{overlay}\n\n{ctx}")),
        None => Some(overlay.to_string()),
    }
}

// ─────────────────────────────────────────────────────────────────────
// Hot-reloadable config
// ─────────────────────────────────────────────────────────────────────

/// Sender half of a hot-reloadable voice config.
///
/// The config owner (e.g. a file watcher or UI settings panel) calls
/// [`VoiceConfigSender::update`] when the config changes. All holders
/// of a [`VoiceConfigHandle`] see the new values immediately.
///
/// The `watch::Sender` is wrapped in an `Arc` so the sender itself can
/// be cloned and shared across owners.
#[derive(Debug, Clone)]
pub struct VoiceConfigSender {
    tx: Arc<watch::Sender<VoiceConfig>>,
}

/// Read-only handle to the current voice config. Cheap to clone —
/// listeners, VAD, and speakers each hold one.
///
/// `watch::Receiver` is already clonable, so no `Arc` is needed here.
#[derive(Debug, Clone)]
pub struct VoiceConfigHandle {
    rx: watch::Receiver<VoiceConfig>,
}

/// Create a sender/handle pair seeded with the initial config.
pub fn voice_config_watch(initial: VoiceConfig) -> (VoiceConfigSender, VoiceConfigHandle) {
    let (sender, receiver) = watch::channel(initial);
    let shared_tx = Arc::new(sender);
    let handle = VoiceConfigHandle { rx: receiver };
    (VoiceConfigSender { tx: shared_tx }, handle)
}

impl VoiceConfigSender {
    /// Push a new config to all handles. Only notifies if the config
    /// actually changed (avoids spurious wake-ups).
    pub fn update(&self, config: VoiceConfig) {
        // `send_if_modified` lets us compare and replace under the
        // watch lock, returning whether receivers should be notified.
        self.tx.send_if_modified(|current| {
            if *current == config {
                false
            } else {
                *current = config;
                true
            }
        });
    }

    /// Read the current config.
    pub fn current(&self) -> VoiceConfig {
        self.tx.borrow().clone()
    }
}

impl VoiceConfigHandle {
    /// Snapshot the current config.
    pub fn current(&self) -> VoiceConfig {
        self.rx.borrow().clone()
    }

    /// Wait for the config to change, returning the new value. Returns
    /// `None` if the sender is dropped.
    pub async fn changed(&mut self) -> Option<VoiceConfig> {
        match self.rx.changed().await {
            // `borrow_and_update` marks the value seen so the next
            // `changed()` call waits for a genuinely new value.
            Ok(()) => Some(self.rx.borrow_and_update().clone()),
            Err(_) => None,
        }
    }
}

/// Default ElevenLabs voice id.
fn default_voice_id() -> String {
    // TARS-style narrator voice from app/src/lib/voiceEngine.ts
    String::from("UznIBkKIQe3ZG2tGydre")
}

/// Default ElevenLabs TTS model.
fn default_tts_model() -> String {
    String::from("eleven_turbo_v2_5")
}

/// Default capture sample rate (Hz).
fn default_sample_rate() -> u32 {
    16_000
}

/// Default STT language code.
fn default_language() -> String {
    String::from("en")
}

/// Default wake words for `ListenerMode::WakeWord`.
fn default_wake_words() -> Vec<String> {
    ["tokhn", "token", "talking"].map(String::from).to_vec()
}

/// VAD threshold above noise floor (dB). Raised from 9.0 to 15.0 in
/// commit `7033ca3` — see [`VoiceConfig::vad_threshold_db`].
fn default_vad_threshold_db() -> f32 {
    15.0
}

/// Minimum continuous voiced energy before SpeechStart fires (ms).
fn default_speech_onset_ms() -> u32 {
    100
}

/// Silence duration that ends a turn (ms).
fn default_turn_end_ms() -> u32 {
    1400
}

/// IIR smoothing factor for energy estimates (0.0–1.0).
fn default_smoothing_factor() -> f32 {
    0.3
}

/// Hysteresis margin (dB) against speech/silence flapping.
fn default_hysteresis_db() -> f32 {
    3.0
}

/// Threshold boost (dB) applied while TTS is playing, so only a loud
/// barge-in registers over Tokhn's own audio.
fn default_barge_in_boost_db() -> f32 {
    18.0
}

/// How long (ms) the barge-in boost stays active after TTS stops.
fn default_boost_tail_ms() -> u64 {
    // 500ms is the sweet spot empirically: covers most macOS speaker
    // tails + near-field room reverb for Tokhn's own speech without
    // making real user responses feel lagged. At 350ms, late echo
    // after longer sentences occasionally slipped through the boosted
    // VAD threshold and started a spurious recording.
    500
}

/// Hard cap on segment length (ms) — force-finalizes runaway segments.
fn default_max_segment_ms() -> u64 {
    6_000
}

/// Minimum average segment SNR (dB over noise floor) to reach STT.
fn default_segment_min_snr_db() -> f32 {
    8.0
}

/// Default base URL of the local OpenAI-compatible TTS server.
fn default_local_tts_url() -> String {
    String::from("http://127.0.0.1:19280/v1")
}

/// Default local TTS model id.
fn default_local_tts_model() -> String {
    String::from("mlx-community/Kokoro-82M-bf16")
}

/// Default whisper.cpp model id. `large-v3-turbo-q5_0` is the
/// quantized turbo variant (~600 MB file, near-best accuracy, runs
/// well above real-time on M-series). Downloads from
/// `https://huggingface.co/ggerganov/whisper.cpp`.
fn default_whisper_cpp_model() -> String {
    String::from("large-v3-turbo-q5_0")
}

/// Default local TTS voice name.
fn default_local_tts_voice() -> String {
    String::from("af_heart")
}

/// Default local TTS playback speed multiplier.
fn default_local_tts_speed() -> f32 {
    1.0
}

/// Default local TTS sampling temperature.
fn default_local_tts_temperature() -> f32 {
    0.7
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Build a default config with the given overlay override.
    fn cfg_with_overlay(overlay: Option<&str>) -> VoiceConfig {
        VoiceConfig {
            voice_prompt_overlay: overlay.map(str::to_string),
            ..VoiceConfig::default()
        }
    }

    #[test]
    fn compose_voice_context_uses_default_when_unset() {
        let out = compose_voice_context(&cfg_with_overlay(None), None)
            .expect("overlay present by default");
        assert!(out.starts_with("[VOICE CONTEXT:"));
    }

    #[test]
    fn compose_voice_context_empty_overlay_disables() {
        let cfg = cfg_with_overlay(Some(""));
        assert_eq!(compose_voice_context(&cfg, None), None);
        assert_eq!(
            compose_voice_context(&cfg, Some("hi")),
            Some("hi".to_string())
        );
    }

    #[test]
    fn compose_voice_context_concatenates_with_caller_prompt() {
        let cfg = cfg_with_overlay(Some("OVERLAY"));
        assert_eq!(
            compose_voice_context(&cfg, Some("CALLER")),
            Some("OVERLAY\n\nCALLER".to_string())
        );
    }

    #[test]
    fn compose_voice_context_overlay_only_when_no_caller_prompt() {
        let cfg = cfg_with_overlay(Some("OVERLAY"));
        assert_eq!(
            compose_voice_context(&cfg, None),
            Some("OVERLAY".to_string())
        );
    }
}