// car-voice 0.13.0

//! Events emitted by the voice listener.

use serde::{Deserialize, Serialize};
use std::path::PathBuf;

use crate::enrollment::TranscriptRole;

/// Notifications a [`crate::Listener`] publishes while audio moves through
/// capture, VAD, and STT.
///
/// Channels subscribe to the stream and decide for themselves what to do
/// with each event: show a transcript, forward it to a session, drive a
/// waveform animation, and so on.
///
/// # Wire format
///
/// The enum is internally tagged: every serialised event carries a `type`
/// field holding the variant name in `snake_case` (for example
/// `speech_start` or `enrollment_captured`) alongside the variant's own
/// fields.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum VoiceEvent {
    /// The VAD saw a speech segment begin.
    SpeechStart,

    /// The VAD saw a speech segment finish.
    SpeechEnd,

    /// The finalized transcript of the most recent speech segment.
    Transcript {
        /// Transcribed text for the segment.
        text: String,
        /// Length of the underlying audio, in milliseconds.
        duration_ms: u64,
        /// Speaker attribution. `Unknown` means either no speaker pipeline
        /// is attached or the classifier could not decide.
        ///
        /// Marked `serde(default)` so older serialised events that lack
        /// this field still load.
        #[serde(default)]
        role: TranscriptRole,
    },

    /// An in-progress transcript produced during streaming transcription.
    ///
    /// Only listeners backed by a streaming-capable STT engine emit this
    /// (Parakeet TDT today; future native-streaming backends are expected
    /// to reuse the variant). Partials *monotonically extend*: text that
    /// has been emitted is never revised. `Transcript` remains the
    /// canonical "this segment is done" signal — `Partial` exists purely
    /// for live transcript UX.
    Partial {
        /// Everything decoded so far, accumulated.
        text: String,
        /// Milliseconds of audio covered so far; handy for UI progress
        /// indicators.
        duration_ms: u64,
    },

    /// A chunk of raw PCM captured from the microphone.
    ///
    /// Intended for consumers that render waveforms or route audio
    /// elsewhere. Optional — most consumers only care about transcripts.
    AudioChunk {
        /// Mono, 16-bit signed PCM samples.
        samples: Vec<i16>,
        /// Sample rate, in Hz.
        sample_rate: u32,
    },

    /// The user began talking *while Tokhn was speaking*.
    ///
    /// Fired when the listener's (boosted) VAD detects user voice loud
    /// enough to override the bed / TTS leakage. Channels should stop the
    /// active narration immediately so the user can be heard.
    BargeIn,

    /// A previously-armed enrollment capture succeeded.
    ///
    /// The voiceprint has been persisted at `save_path`. Downstream callers
    /// typically switch onboarding state and reload their
    /// [`crate::enrollment::SpeakerPipeline`] to pick up the new enrollment.
    EnrollmentCaptured {
        /// Label attached to the captured enrollment.
        label: String,
        /// Location where the voiceprint was persisted.
        save_path: PathBuf,
    },

    /// An enrollment capture attempt did not succeed.
    ///
    /// The caller decides whether to re-arm with a different sentence /
    /// threshold or to surface the error.
    EnrollmentFailed {
        /// Human-readable explanation of the failure.
        reason: String,
    },
}