car-voice 0.15.1

Voice I/O capability for CAR — mic capture, VAD, listener/speaker traits
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
//! Voice turn orchestrator — drives `dispatch_voice_turn` into TTS.
//!
//! Sits between the listener (which produces finalized utterances) and
//! the speaker (which plays audio). Implements the two-track pattern
//! from `docs/proposals/voice-sidecar-orchestration.md`:
//!
//! 1. Classify the utterance — tool-likely (email/calendar/search) or
//!    conversational.
//! 2. Tool-likely → play a hardcoded bridge phrase first; only the
//!    sidecar runs an LLM. Prevents the fast model from hallucinating
//!    tool data it doesn't have ("STRUCTURAL HALLUCINATION FIX").
//! 3. Conversational → fast track streams a quick reply through TTS
//!    while the sidecar runs in parallel for the substantive answer.
//! 4. The pipeline lock releases between fast turn and sidecar so the
//!    user can speak again as soon as the fast TTS ends.
//! 5. A new utterance arriving cancels the in-flight turn (turn_id
//!    bump + cancellation token); a stale sidecar result that arrives
//!    after a barge-in is dropped at the gate.
//!
//! See `docs/proposals/voice-sidecar-orchestration-plan.md` §6 for
//! the design behind the cancellation, mixer integration, and prewarm
//! decisions.

use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Duration;

use car_engine::{
    dispatch_voice_turn_sidecar_only_with_classifier,
    dispatch_voice_turn_sidecar_only_with_telemetry, dispatch_voice_turn_with_telemetry,
    DirectDataFetcher, SidecarResult, VoiceTelemetry, VoiceTurnControl, VoiceTurnError,
    VoiceTurnHandle,
};
use car_inference::{
    intent::IntentHint, GenerateParams, GenerateRequest, InferenceEngine, StreamEvent,
};
use tokio::sync::{Mutex, Semaphore};

use crate::{
    compose_voice_context,
    utterance::{ToolKind, UtteranceClass},
    voice_audio_mixer::VoiceMixerHandle,
    Result, Speaker, VoiceConfig, VoiceError,
};
// Re-exports so the `crate::utterance::*` types and helpers are
// reachable through `car_voice::orchestrator::*` for callers that
// followed the old layout. The canonical home is `crate::utterance`.
pub use crate::utterance::{bridge_phrase, classify_utterance, format_for_voice};

/// Stable string tag for a `ToolKind`, used in telemetry payloads
/// (e.g. the `kind` field of `VoiceBridgePlayed`).
fn tool_kind_str(kind: ToolKind) -> &'static str {
    match kind {
        ToolKind::Unknown => "unknown",
        ToolKind::Search => "search",
        ToolKind::Calendar => "calendar",
        ToolKind::Email => "email",
    }
}

/// Byte index (inclusive) of the last sentence-ending punctuation
/// (`.`, `!`, `?`) in `s`, or `None` when no terminator is present.
/// Callers drain `..=index` to flush complete sentences into TTS as
/// text streams in; the terminators are all single-byte ASCII, so
/// the inclusive drain always lands on a char boundary.
fn find_sentence_end(s: &str) -> Option<usize> {
    s.char_indices()
        .rev()
        .find(|&(_, c)| c == '.' || c == '!' || c == '?')
        .map(|(idx, _)| idx)
}

/// Default sidecar wait timeout. After this, drop the continuation
/// even if the sidecar is still running — the user hears nothing
/// more from this turn. Overridable via `with_sidecar_timeout`.
const DEFAULT_SIDECAR_TIMEOUT: Duration = Duration::from_secs(30);

/// Default interval between "still working on that" progress phrases.
/// Used when `VoiceConfig::progress_interval_secs` is `None`.
const DEFAULT_PROGRESS_INTERVAL: Duration = Duration::from_secs(8);

/// Default cap on progress-phrase attempts (8s × 4 = 32s).
/// Used when `VoiceConfig::max_progress_attempts` is `None`.
const DEFAULT_MAX_PROGRESS_ATTEMPTS: u32 = 4;

/// Hard ceiling on tokens per voice reply. 200 tokens ≈ ~30 seconds
/// of TTS at typical speaking rates — already long for a voice turn.
/// The voice prompt overlay asks the model to be much shorter; this
/// cap prevents a chatty model from rambling past the user's
/// patience even when the prompt is ignored.
const DEFAULT_VOICE_MAX_TOKENS: usize = 200;

/// Generation parameters shared by every voice dispatch: all
/// defaults except the token ceiling (`DEFAULT_VOICE_MAX_TOKENS`).
fn voice_params() -> GenerateParams {
    let mut params = GenerateParams::default();
    params.max_tokens = DEFAULT_VOICE_MAX_TOKENS;
    params
}

/// Progress phrase played periodically while the sidecar is still
/// running. Hardcoded for now; pool variety is a follow-up like
/// bridge phrases.
const PROGRESS_PHRASE: &str = "Still working on that.";

/// Orchestrates one voice session — listener output → engine dispatch
/// → TTS playback. Hold one per concurrent voice session (typically
/// one per app instance for a single-user mic).
///
/// Construct with [`VoiceOrchestrator::new`] or `with_mixer`, then
/// chain the `with_*` builders for telemetry, pinned models, etc.
pub struct VoiceOrchestrator {
    engine: Arc<InferenceEngine>,
    speaker: Arc<dyn Speaker>,
    /// Optional mixer handle. When `Some`, TTS routes through
    /// `queue_tts`/`stop_tts` for cancellable playback. When `None`,
    /// falls back to the speaker's non-cancellable `speak` (barge-in
    /// still bumps `current_turn_id` to drop stale results, but the
    /// active TTS clip plays to its end).
    mixer: Option<VoiceMixerHandle>,
    /// Optional telemetry sink. When `Some`, the orchestrator emits
    /// voice eventlog kinds (fast/sidecar lifecycle, bridge played,
    /// turn cancelled) for observability. Independent of the engine's
    /// own telemetry: the orchestrator passes the same sink through
    /// to dispatch so engine-side and caller-side events land in the
    /// same log.
    telemetry: Option<VoiceTelemetry>,
    /// Optional direct data fetcher (Phase C). When attached and the
    /// utterance classifies as tool-likely, the sidecar consults the
    /// fetcher first; on hit the LLM is skipped entirely.
    direct_fetcher: Option<Arc<dyn DirectDataFetcher>>,
    /// Optional explicit model id for the fast track (overrides the
    /// adaptive router). Use when you know which model you want
    /// (e.g. `apple/foundation:default` on macOS 26+ for sub-500ms
    /// TTFT) and the router's scoring isn't cooperating, or when
    /// you've registered a delegated model whose hint you want to
    /// pin. `None` keeps adaptive routing.
    fast_model: Option<String>,
    /// Optional explicit model id for the sidecar track. Same
    /// semantics as `fast_model` — `None` keeps adaptive routing.
    sidecar_model: Option<String>,
    /// When true, conversational utterances skip the fast track and
    /// go straight to a single sidecar dispatch — one TTS clip per
    /// turn instead of two. Useful when fast and sidecar would
    /// resolve to the same model (so the second clip just paraphrases
    /// the first), or when you want a simpler single-reply UX. Tool-
    /// likely utterances still play their bridge phrase. Default
    /// `false` preserves the two-track behavior described in the
    /// proposal.
    skip_fast_track: bool,
    config: VoiceConfig,
    /// Serializes fast + sidecar TTS playback. Permit count = 1, so
    /// `acquire_owned` acts as an async mutex over the speaker path.
    /// Released between turns so the user can barge in cleanly.
    pipeline_lock: Arc<Semaphore>,
    /// Most recent turn id minted. Sidecar results with a stale id
    /// are dropped at the gate (race-recovery).
    current_turn_id: Arc<AtomicU64>,
    /// Control surface for the in-flight turn, if any. Stored
    /// separately from the receivers (which stay with the driving
    /// task) so `cancel_current_turn` can reach the cancellation
    /// token from a different task. See plan §6.8.
    current_handle: Arc<Mutex<Option<VoiceTurnControl>>>,
    sidecar_timeout: Duration,
}

impl VoiceOrchestrator {
    /// Construct an orchestrator that uses the speaker's
    /// non-cancellable `speak` path — suitable where no mixer is
    /// available or barge-in mid-TTS isn't required. Equivalent to
    /// `with_mixer(engine, speaker, config, None)`.
    pub fn new(
        engine: Arc<InferenceEngine>,
        speaker: Arc<dyn Speaker>,
        config: VoiceConfig,
    ) -> Self {
        Self::with_mixer(engine, speaker, config, None)
    }

    /// Construct an orchestrator, optionally routing TTS through a
    /// mixer. Pass `Some(mixer)` for cancellable playback (barge-in
    /// invokes the mixer's `stop_tts`); `None` falls back to the
    /// speaker's blocking `speak`.
    pub fn with_mixer(
        engine: Arc<InferenceEngine>,
        speaker: Arc<dyn Speaker>,
        config: VoiceConfig,
        mixer: Option<VoiceMixerHandle>,
    ) -> Self {
        Self {
            engine,
            speaker,
            mixer,
            config,
            telemetry: None,
            direct_fetcher: None,
            fast_model: None,
            sidecar_model: None,
            skip_fast_track: false,
            sidecar_timeout: DEFAULT_SIDECAR_TIMEOUT,
            pipeline_lock: Arc::new(Semaphore::new(1)),
            current_turn_id: Arc::new(AtomicU64::new(0)),
            current_handle: Arc::new(Mutex::new(None)),
        }
    }

    /// Builder-style override for how long `handle_utterance` waits
    /// on the sidecar before dropping the continuation.
    pub fn with_sidecar_timeout(mut self, timeout: Duration) -> Self {
        self.sidecar_timeout = timeout;
        self
    }

    /// Attach a telemetry sink. Builder-style. Voice eventlog kinds
    /// (`VoiceFastTurnStarted`/`Ended`, `VoiceSidecarResolved`/
    /// `Failed`/`TimedOut`, `VoiceBridgePlayed`, `VoiceTurnCancelled`)
    /// are emitted here as turns progress, and the same sink is
    /// handed through to `car-engine`'s dispatch so engine-side and
    /// caller-side events land in one shared log.
    pub fn with_telemetry(mut self, telemetry: VoiceTelemetry) -> Self {
        self.telemetry = Some(telemetry);
        self
    }

    /// Pin explicit model ids for the fast and/or sidecar tracks.
    /// Each argument is a full model id (e.g.
    /// `apple/foundation:default`); `None` leaves that track on the
    /// adaptive router. Useful when the router's scoring picks an
    /// unavailable model (e.g. an MLX model whose weight shards
    /// aren't downloaded) or a specific delegated runner must be
    /// forced. Builder-style.
    pub fn with_models(
        mut self,
        fast_model: Option<String>,
        sidecar_model: Option<String>,
    ) -> Self {
        self.fast_model = fast_model;
        self.sidecar_model = sidecar_model;
        self
    }

    /// Builder-style toggle. When `skip` is true, conversational
    /// utterances bypass the fast track and get a single sidecar
    /// reply — one TTS clip per turn instead of two. Tool-likely
    /// utterances still play their bridge phrase either way.
    pub fn with_skip_fast_track(mut self, skip: bool) -> Self {
        self.skip_fast_track = skip;
        self
    }

    /// Attach a direct data fetcher (Phase C). Builder-style. On a
    /// tool-likely utterance the sidecar consults the fetcher first;
    /// a hit skips the LLM entirely, saving roughly one LLM
    /// round-trip per applicable turn (1-3 seconds on observed quip
    /// workloads). A miss falls through to the LLM path
    /// transparently.
    pub fn with_direct_fetcher(mut self, fetcher: Arc<dyn DirectDataFetcher>) -> Self {
        self.direct_fetcher = Some(fetcher);
        self
    }

    /// Fire a 1-token probe so the fast model is resident before the
    /// first real turn. Apple's FoundationModels has a non-trivial
    /// cold start; prewarming at construction keeps first-turn TTFT
    /// under the <500ms target. Best-effort and idempotent — errors
    /// are logged, never propagated.
    pub async fn prewarm(&self) {
        let params = GenerateParams {
            max_tokens: 1,
            ..Default::default()
        };
        let intent = IntentHint {
            prefer_fast: true,
            ..IntentHint::default()
        };
        let req = GenerateRequest {
            prompt: ".".to_string(),
            // Honor a pinned fast model when set — otherwise prewarm
            // routes adaptively and may resolve to a different model
            // than the dispatch path actually uses.
            model: self.fast_model.clone(),
            params,
            intent: Some(intent),
            ..Default::default()
        };
        if let Err(e) = self.engine.generate(req).await {
            tracing::warn!(
                error = %e,
                "fast model prewarm failed; first turn will be slower"
            );
        } else {
            tracing::debug!("voice fast model prewarmed");
        }
    }

    /// Barge-in entry point — the listener calls this on
    /// `VoiceEvent::BargeIn` (or when a new utterance arrives during
    /// an in-flight turn). Halts active mixer TTS, bumps the current
    /// turn id so any in-flight sidecar result is rejected at its
    /// arrival gate, and cancels the in-flight turn's tasks via the
    /// stored control.
    pub async fn cancel_current_turn(&self) {
        if let Some(mixer) = &self.mixer {
            mixer.stop_tts();
        }
        self.current_turn_id.fetch_add(1, Ordering::AcqRel);
        let mut slot = self.current_handle.lock().await;
        // `take` both cancels and clears the slot in one step.
        if let Some(control) = slot.take() {
            control.cancel();
        }
    }

    /// Drive one finalized utterance through the two-track pipeline.
    ///
    /// Flow: cancel any in-flight turn → classify → dispatch
    /// (bridge / sidecar-only / two-track) → drain the fast stream
    /// through TTS under the pipeline lock → release the lock
    /// (barge-in window) → wait on the sidecar with progress phrases
    /// → play the sidecar answer if this turn is still current.
    pub async fn handle_utterance(&self, utterance: String) -> Result<()> {
        // Cancel any in-flight turn before minting a new one. New
        // utterance arriving means the previous one is implicitly
        // superseded — drop its sidecar result if it arrives later.
        self.cancel_current_turn().await;

        let class = classify_utterance(&utterance);
        let handle = match class {
            UtteranceClass::ToolLikely(kind) => self.dispatch_with_bridge(utterance, kind).await?,
            UtteranceClass::Conversational if self.skip_fast_track => {
                self.dispatch_sidecar_only(utterance)
            }
            UtteranceClass::Conversational => self.dispatch_two_track(utterance),
        };

        // Publish this turn as current: the id gates stale sidecar
        // results; the stored control lets cancel_current_turn reach
        // this turn's cancellation token from another task.
        let our_turn = handle.turn_id();
        self.current_turn_id.store(our_turn, Ordering::Release);
        {
            let mut g = self.current_handle.lock().await;
            *g = Some(handle.control.clone());
        }

        // Fast track: bridge-only paths have a pre-closed channel and
        // exit the drain immediately. Conversational paths drain the
        // streamed deltas through TTS as sentences land.
        let _permit = self
            .pipeline_lock
            .clone()
            .acquire_owned()
            .await
            .map_err(|_| VoiceError::Playback("pipeline closed".into()))?;
        self.drain_fast_to_tts(handle.fast).await?;
        drop(_permit); // release between turns; user can barge in here.

        // Sidecar wait — capped by sidecar_timeout. Plays
        // periodic progress phrases while the user is waiting.
        // Race-recovery via turn_id gate: if the result's turn_id
        // doesn't match the *current* turn id, drop it.
        match self.wait_with_progress(handle.sidecar, our_turn).await {
            Some(Ok(result)) => self.maybe_play_sidecar(our_turn, result).await?,
            Some(Err(VoiceTurnError::Cancelled)) => {
                tracing::info!(turn_id = our_turn, "sidecar cancelled");
            }
            Some(Err(VoiceTurnError::Inference(e))) => {
                tracing::warn!(turn_id = our_turn, error = %e, "sidecar inference failed");
            }
            // `None` covers both the wall-clock timeout and the
            // progress-attempt cap inside wait_with_progress.
            None => {
                if let Some(t) = &self.telemetry {
                    t.emit(
                        car_eventlog::EventKind::VoiceSidecarTimedOut,
                        our_turn,
                        vec![(
                            "timeout_ms",
                            (self.sidecar_timeout.as_millis() as u64).into(),
                        )],
                    );
                }
            }
        }

        // Clear our slot so cancel_current_turn doesn't fire on a
        // stale handle.
        let mut g = self.current_handle.lock().await;
        if g.as_ref().map(|c| c.turn_id) == Some(our_turn) {
            *g = None;
        }
        Ok(())
    }

    /// Handle a tool-likely utterance: speak the hardcoded bridge
    /// phrase up front — under the pipeline lock, so it can't
    /// interleave with whatever the previous turn was finishing —
    /// then return a sidecar-only handle for the substantive answer.
    async fn dispatch_with_bridge(
        &self,
        utterance: String,
        kind: ToolKind,
    ) -> Result<VoiceTurnHandle> {
        let phrase = bridge_phrase(kind);
        let permit = self
            .pipeline_lock
            .clone()
            .acquire_owned()
            .await
            .map_err(|_| VoiceError::Playback("pipeline closed".into()))?;
        self.speak(phrase).await?;
        drop(permit);
        // The sidecar still runs to produce the substantive answer.
        let handle = self.dispatch_sidecar_only(utterance);
        if let Some(telemetry) = &self.telemetry {
            telemetry.emit(
                car_eventlog::EventKind::VoiceBridgePlayed,
                handle.turn_id(),
                vec![
                    ("kind", serde_json::Value::from(tool_kind_str(kind))),
                    ("phrase", serde_json::Value::from(phrase)),
                ],
            );
        }
        Ok(handle)
    }

    /// Dispatch a conversational utterance on both tracks: a
    /// prefer-fast request for the quick streamed reply and a
    /// sidecar request for the substantive answer, sharing one voice
    /// context.
    fn dispatch_two_track(&self, utterance: String) -> VoiceTurnHandle {
        let context = compose_voice_context(&self.config, None);
        let sidecar_req = GenerateRequest {
            prompt: utterance.clone(),
            model: self.sidecar_model.clone(),
            params: voice_params(),
            context: context.clone(),
            ..Default::default()
        };
        let fast_req = GenerateRequest {
            prompt: utterance.clone(),
            model: self.fast_model.clone(),
            params: voice_params(),
            context,
            intent: Some(IntentHint {
                prefer_fast: true,
                ..IntentHint::default()
            }),
            ..Default::default()
        };
        dispatch_voice_turn_with_telemetry(
            self.engine.clone(),
            utterance,
            fast_req,
            sidecar_req,
            self.telemetry.clone(),
        )
    }

    /// Dispatch a sidecar-only turn. When a direct fetcher is
    /// attached, the classifier dispatch consults it before the LLM;
    /// otherwise the plain telemetry dispatch runs.
    fn dispatch_sidecar_only(&self, utterance: String) -> VoiceTurnHandle {
        let sidecar_req = GenerateRequest {
            prompt: utterance.clone(),
            model: self.sidecar_model.clone(),
            params: voice_params(),
            context: compose_voice_context(&self.config, None),
            ..Default::default()
        };
        if let Some(fetcher) = &self.direct_fetcher {
            dispatch_voice_turn_sidecar_only_with_classifier(
                self.engine.clone(),
                utterance,
                sidecar_req,
                Some(fetcher.clone()),
                self.telemetry.clone(),
            )
        } else {
            dispatch_voice_turn_sidecar_only_with_telemetry(
                self.engine.clone(),
                utterance,
                sidecar_req,
                self.telemetry.clone(),
            )
        }
    }

    /// Pull streamed events off the fast track and speak complete
    /// sentences as they land; whatever text remains when the stream
    /// ends is flushed as a final clip.
    async fn drain_fast_to_tts(
        &self,
        mut rx: tokio::sync::mpsc::Receiver<StreamEvent>,
    ) -> Result<()> {
        let mut pending = String::new();
        while let Some(event) = rx.recv().await {
            match event {
                StreamEvent::Done { .. } => break,
                StreamEvent::TextDelta(delta) => {
                    pending.push_str(&delta);
                    let Some(end) = find_sentence_end(&pending) else {
                        continue;
                    };
                    // Flush everything through the last terminator;
                    // the tail stays buffered for the next delta.
                    let complete: String = pending.drain(..=end).collect();
                    let spoken = complete.trim();
                    if !spoken.is_empty() {
                        self.speak(spoken).await?;
                    }
                }
                _ => {}
            }
        }
        let leftover = pending.trim();
        if !leftover.is_empty() {
            self.speak(leftover).await?;
        }
        Ok(())
    }

    /// Wait for the sidecar's oneshot to resolve, playing a short
    /// progress phrase every `progress_interval_secs` while the user
    /// would otherwise be staring at silence. Returns:
    /// - `Some(Ok(result))` on a sidecar hit within the cap
    /// - `Some(Err(error))` when the sidecar resolved with an error
    ///   (cancellation, inference failure)
    /// - `None` when the cap (the lesser of `sidecar_timeout` and
    ///   `max_progress_attempts × progress_interval_secs`) elapsed
    ///   without resolution
    async fn wait_with_progress(
        &self,
        sidecar: tokio::sync::oneshot::Receiver<std::result::Result<SidecarResult, VoiceTurnError>>,
        our_turn: u64,
    ) -> Option<std::result::Result<SidecarResult, VoiceTurnError>> {
        let interval = Duration::from_secs(
            self.config
                .progress_interval_secs
                .unwrap_or(DEFAULT_PROGRESS_INTERVAL.as_secs()),
        );
        let max_attempts = self
            .config
            .max_progress_attempts
            .unwrap_or(DEFAULT_MAX_PROGRESS_ATTEMPTS);
        let progress_cap = interval.saturating_mul(max_attempts);
        let total_cap = std::cmp::min(progress_cap, self.sidecar_timeout);
        let deadline = tokio::time::Instant::now() + total_cap;

        tokio::pin!(sidecar);
        let mut attempts: u32 = 0;
        loop {
            // Sleep up to `interval`, but never past the overall deadline.
            let now = tokio::time::Instant::now();
            if now >= deadline {
                return None;
            }
            let remaining = deadline.saturating_duration_since(now);
            let sleep_for = std::cmp::min(interval, remaining);

            tokio::select! {
                // `biased` so a resolved sidecar always wins over a
                // simultaneously-expired sleep — never play a progress
                // phrase when the real answer is already available.
                biased;
                outcome = &mut sidecar => {
                    // A recv error means the sender was dropped (the
                    // turn's tasks went away); treat as cancellation.
                    return Some(outcome.unwrap_or(Err(VoiceTurnError::Cancelled)));
                }
                _ = tokio::time::sleep(sleep_for) => {
                    if tokio::time::Instant::now() >= deadline {
                        return None;
                    }
                    attempts += 1;
                    if attempts > max_attempts {
                        return None;
                    }
                    // Bail if we've been superseded — no point playing a
                    // progress phrase for a turn that's no longer current.
                    if self.current_turn_id.load(Ordering::Acquire) != our_turn {
                        // Wait briefly to see if the sidecar resolves anyway.
                        return match tokio::time::timeout(
                            Duration::from_millis(50),
                            &mut sidecar,
                        )
                        .await
                        {
                            Ok(Ok(r)) => Some(r),
                            _ => Some(Err(VoiceTurnError::Cancelled)),
                        };
                    }
                    // The progress phrase plays under the pipeline lock
                    // so it can't talk over other TTS playback.
                    let _permit = match self
                        .pipeline_lock
                        .clone()
                        .acquire_owned()
                        .await
                    {
                        Ok(p) => p,
                        Err(_) => return None,
                    };
                    if let Err(e) = self.speak(PROGRESS_PHRASE).await {
                        tracing::warn!(turn_id = our_turn, error=%e, "progress phrase playback failed");
                    }
                    drop(_permit);
                }
            }
        }
    }

    /// Play the sidecar's answer unless this turn has been
    /// superseded. The staleness gate runs twice: once on entry, and
    /// again AFTER the pipeline permit is acquired — waiting for the
    /// permit can take arbitrarily long, and a barge-in during that
    /// wait bumps `current_turn_id`; without the re-check the stale
    /// answer would play over the new turn.
    async fn maybe_play_sidecar(&self, our_turn: u64, result: SidecarResult) -> Result<()> {
        let now = self.current_turn_id.load(Ordering::Acquire);
        if result.turn_id != now {
            tracing::info!(
                stale = result.turn_id,
                current = now,
                "dropped stale sidecar result"
            );
            return Ok(());
        }
        let _permit = self
            .pipeline_lock
            .clone()
            .acquire_owned()
            .await
            .map_err(|_| VoiceError::Playback("pipeline closed".into()))?;
        // Re-check after the permit wait: a barge-in while we were
        // queued means this result is no longer wanted.
        let now = self.current_turn_id.load(Ordering::Acquire);
        if result.turn_id != now {
            tracing::info!(
                stale = result.turn_id,
                current = now,
                "dropped stale sidecar result after permit wait"
            );
            return Ok(());
        }
        self.speak(&result.text).await?;
        let _ = our_turn; // logged via tracing field elsewhere; keep param so
                          // future telemetry can correlate without churn.
        Ok(())
    }

    /// Speak `text`, routing through the cancellable mixer path when
    /// one is attached, or the speaker's `speak` fallback otherwise.
    /// Empty/whitespace input is a no-op.
    async fn speak(&self, text: &str) -> Result<()> {
        if text.trim().is_empty() {
            return Ok(());
        }
        let Some(mixer) = &self.mixer else {
            return self.speaker.speak(text).await;
        };
        let audio = self.speaker.synth(text).await?;
        mixer.queue_tts(audio.bytes);
        // Poll the mixer's speaking flag until playback drains.
        // NOTE(review): this assumes `queue_tts` raises the flag
        // before the first load below — if the mixer sets it
        // asynchronously, the loop could exit before playback starts.
        // Confirm against the mixer implementation.
        let speaking = mixer.speaking_flag();
        while speaking.load(Ordering::Relaxed) {
            tokio::time::sleep(Duration::from_millis(20)).await;
        }
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Pure-function tests for classify_utterance / bridge_phrase /
    // format_for_voice live alongside their definitions in
    // `crate::utterance`. The orchestrator's unit tests cover the
    // pieces that depend on the macOS-only mixer/dispatch machinery.

    #[test]
    fn find_sentence_end_basic() {
        // No terminator anywhere → None.
        assert_eq!(find_sentence_end(""), None);
        assert_eq!(find_sentence_end("hello world"), None);
        // One or more sentences → byte index of the LAST terminator.
        assert_eq!(find_sentence_end("Hello."), Some(5));
        assert_eq!(find_sentence_end("Hi! Bye."), Some(7));
        assert_eq!(find_sentence_end("Hi! And then..."), Some(14));
    }

    #[test]
    fn voice_config_progress_defaults_round_trip() {
        // The progress-tuning fields default to None (use built-ins)
        // and must survive a JSON round-trip for hot-reload paths.
        let cfg = VoiceConfig::default();
        assert!(cfg.progress_interval_secs.is_none());
        assert!(cfg.max_progress_attempts.is_none());
        let round_tripped: VoiceConfig =
            serde_json::from_str(&serde_json::to_string(&cfg).unwrap()).unwrap();
        assert_eq!(round_tripped.progress_interval_secs, None);
        assert_eq!(round_tripped.max_progress_attempts, None);
    }
}
}