car-voice 0.13.0

Voice I/O capability for CAR — mic capture, VAD, listener/speaker traits
Documentation
//! Voice Activity Detection — energy-based with adaptive noise floor.
//!
//! Ported from `app/src-tauri/src/microphone/vad.rs`, which itself was
//! ported from VoiceRail's `EnergyDetector`. Algorithm:
//!
//! 1. Per-chunk RMS in dB.
//! 2. IIR-smoothed instantaneous level.
//! 3. Noise floor: bottom percentile of a warmup window, then slow drift.
//! 4. Hysteresis-gated speech state with minimum onset / silence durations.
//!
//! Tuning lives in [`crate::VoiceConfig`] so channels can override per
//! environment (the prod default of 15 dB is from commit `7033ca3`, raised
//! from 9 dB to keep the audio bed out of the AirPods microphone).

use crate::VoiceConfig;
use std::time::Instant;

/// Starting value (dB) for both the smoothed level and the noise floor;
/// it stays in effect as the floor until warmup calibration replaces it.
const NOISE_FLOOR_DEFAULT_DB: f32 = -50.0;
/// Fraction of the quietest warmup samples that are averaged to produce
/// the calibrated noise floor (at least one sample is always used).
const NOISE_FLOOR_PERCENTILE: f32 = 0.20;
/// Per-chunk fraction of the gap that is closed when the smoothed level
/// drops below the current noise floor. The floor only drifts downward.
const NOISE_FLOOR_ADAPT_RATE: f32 = 0.05;
/// Level (dB) reported for a chunk whose RMS, measured on a 16-bit sample
/// scale, is below 1.0 — avoids taking the logarithm of a near-zero value.
const SILENCE_FLOOR_DB: f32 = -96.0;
/// Chunks arriving within this many milliseconds of `VadState` creation
/// only feed the smoothing filter; they are used neither for calibration
/// nor for speech detection.
const WARMUP_SKIP_MS: u64 = 800;
/// Milliseconds after `VadState` creation at which noise-floor sample
/// collection stops. Samples are gathered between `WARMUP_SKIP_MS` and
/// this point.
const WARMUP_CALIBRATE_MS: u64 = 1500;

/// State of the energy-based voice activity detector.
///
/// Holds the running level estimate, the adaptive noise floor, the
/// hysteresis-gated speech flag with its transition timestamps, and the
/// tuning values copied from the voice configuration at construction.
#[derive(Debug)]
pub struct VadState {
    /// IIR-smoothed chunk level in dB.
    smoothed_rms_db: f32,
    /// Current noise-floor estimate in dB.
    noise_floor_db: f32,
    /// Hysteresis-gated "level is above the speech threshold" flag.
    is_speaking: bool,
    /// Set once the warmup noise-floor calibration has completed.
    calibrated: bool,
    /// Smoothed levels collected during the warmup calibration window.
    calibration_samples: Vec<f32>,
    /// When the most recent silence → speech transition happened.
    speech_onset_at: Option<Instant>,
    /// When the most recent speech → silence transition happened; cleared
    /// again as soon as speech resumes.
    silence_onset_at: Option<Instant>,
    /// Creation time; the warmup windows are measured from here.
    start_time: Instant,
    // Stored at construction but not read by the detector itself, hence
    // the lint exemption.
    #[allow(dead_code)]
    sample_rate: u32,

    /// Required margin (dB) above the noise floor to enter the speech state.
    threshold_db: f32,
    /// Weight of the newest chunk in the IIR level filter.
    smoothing_factor: f32,
    /// How far (dB) below the threshold the level must fall to leave the
    /// speech state.
    hysteresis_db: f32,
    /// Minimum time in the speech state before speech counts as confirmed.
    speech_onset_ms: u64,
    /// Silence duration after speech that marks the end of a turn.
    turn_end_ms: u64,

    /// Temporary additive offset on top of `threshold_db`. The capture
    /// loop bumps this while Tokhn is speaking so only louder-than-echo
    /// user voice can register — that's barge-in. Set back to 0 once
    /// playback ends.
    threshold_boost_db: f32,
}

impl VadState {
    /// Create a VAD state from a [`VoiceConfig`]. Pulls all tuning knobs
    /// (threshold, smoothing, hysteresis, onset/turn timings) from config so
    /// callers don't need to know the constants.
    ///
    /// The warmup clock starts at the moment this is called, not when the
    /// first chunk arrives.
    pub fn from_config(sample_rate: u32, config: &VoiceConfig) -> Self {
        Self {
            smoothed_rms_db: NOISE_FLOOR_DEFAULT_DB,
            noise_floor_db: NOISE_FLOOR_DEFAULT_DB,
            is_speaking: false,
            calibrated: false,
            calibration_samples: Vec::with_capacity(128),
            speech_onset_at: None,
            silence_onset_at: None,
            start_time: Instant::now(),
            sample_rate,
            threshold_db: config.vad_threshold_db,
            smoothing_factor: config.smoothing_factor,
            hysteresis_db: config.hysteresis_db,
            speech_onset_ms: config.speech_onset_ms as u64,
            turn_end_ms: config.turn_end_ms as u64,
            threshold_boost_db: 0.0,
        }
    }

    /// Add a temporary offset on top of the configured threshold.
    /// The capture loop bumps this by ~18 dB while Tokhn is speaking
    /// so only loud user speech (barge-in) registers; sets it back
    /// to 0 once playback ends.
    pub fn set_threshold_boost(&mut self, db: f32) {
        self.threshold_boost_db = db;
    }

    /// The currently calibrated noise floor in dB. The capture loop
    /// reads this to compute a finalized segment's SNR before
    /// deciding whether to send it to STT.
    pub fn noise_floor_db(&self) -> f32 {
        self.noise_floor_db
    }

    /// Has the noise-floor warmup finished?
    pub fn is_calibrated(&self) -> bool {
        self.calibrated
    }

    /// Process a chunk of f32 samples in `[-1.0, 1.0]`.
    ///
    /// Updates the smoothed level, runs (or finishes) noise-floor
    /// calibration, and advances the hysteresis-gated speech state.
    ///
    /// Chunks that are empty or whose level is not finite (NaN/inf samples)
    /// are ignored entirely: letting them through would put NaN into the
    /// IIR filter, from which it never recovers.
    pub fn process_samples(&mut self, samples: &[f32]) {
        let instant_db = match Self::chunk_level_db(samples) {
            Some(level) => level,
            None => return,
        };

        // Truncating u128 -> u64 is fine: u64 milliseconds cover far more
        // than any process lifetime.
        let elapsed_ms = self.start_time.elapsed().as_millis() as u64;

        // IIR smoothing
        self.smoothed_rms_db = self.smoothing_factor * instant_db
            + (1.0 - self.smoothing_factor) * self.smoothed_rms_db;

        // Let the filter settle before anything is sampled or detected.
        if elapsed_ms < WARMUP_SKIP_MS {
            return;
        }

        if !self.calibrated {
            if elapsed_ms < WARMUP_CALIBRATE_MS {
                self.calibration_samples.push(self.smoothed_rms_db);
                return;
            }
            // Window is over: calibrate now, even if no chunk arrived during
            // it — otherwise the detector would stay uncalibrated (and
            // therefore silent) forever.
            self.finish_calibration();
        } else if self.smoothed_rms_db < self.noise_floor_db {
            // Slow downward drift toward quieter ambient levels.
            self.noise_floor_db +=
                (self.smoothed_rms_db - self.noise_floor_db) * NOISE_FLOOR_ADAPT_RATE;
        }

        // Threshold with hysteresis. The temporary boost is added on
        // top of the configured threshold so we can require louder
        // speech during TTS playback (barge-in) without permanently
        // raising the bar.
        let threshold_db = self.noise_floor_db + self.threshold_db + self.threshold_boost_db;
        let was_speaking = self.is_speaking;

        if self.is_speaking {
            if self.smoothed_rms_db < threshold_db - self.hysteresis_db {
                self.is_speaking = false;
            }
        } else if self.smoothed_rms_db > threshold_db {
            self.is_speaking = true;
        }

        // Timestamp the transitions so onset / turn-end durations can be
        // measured by the query methods.
        let now = Instant::now();
        if self.is_speaking && !was_speaking {
            self.speech_onset_at = Some(now);
            self.silence_onset_at = None;
        } else if !self.is_speaking && was_speaking {
            self.silence_onset_at = Some(now);
        }
    }

    /// RMS level of one chunk in dB relative to full scale.
    ///
    /// Returns `None` for an empty chunk or when the result is not finite,
    /// so the caller can skip the chunk instead of corrupting its state.
    fn chunk_level_db(samples: &[f32]) -> Option<f32> {
        // Samples are scaled to the 16-bit range so the "below one LSB"
        // silence test matches the integer-PCM origin of the algorithm.
        const FULL_SCALE: f32 = 32768.0;

        if samples.is_empty() {
            return None;
        }

        let sum_sq = samples.iter().fold(0.0_f32, |acc, &s| {
            let scaled = s * FULL_SCALE;
            acc + scaled * scaled
        });
        let rms = (sum_sq / samples.len() as f32).sqrt();
        let level_db = if rms < 1.0 {
            SILENCE_FLOOR_DB
        } else {
            20.0 * (rms / FULL_SCALE).log10()
        };

        if level_db.is_finite() {
            Some(level_db)
        } else {
            None
        }
    }

    /// Derive the noise floor from the warmup samples and mark the detector
    /// calibrated.
    ///
    /// The floor is the mean of the quietest `NOISE_FLOOR_PERCENTILE` of the
    /// samples. When no samples were collected, the current (default) floor
    /// is kept. The warmup buffer is released either way.
    fn finish_calibration(&mut self) {
        let mut sorted = std::mem::take(&mut self.calibration_samples);
        let sample_count = sorted.len();

        if sample_count > 0 {
            // `total_cmp` is a total order, so sorting is well-defined for
            // every possible f32 value.
            sorted.sort_unstable_by(f32::total_cmp);
            // 1 <= cutoff <= sample_count, so the slice below is in bounds.
            let cutoff = (sample_count as f32 * NOISE_FLOOR_PERCENTILE).max(1.0) as usize;
            let sum: f32 = sorted[..cutoff].iter().sum();
            self.noise_floor_db = sum / cutoff as f32;
        }

        self.calibrated = true;
        tracing::info!(
            "[vad] noise floor calibrated: {:.1}dB from {} samples",
            self.noise_floor_db,
            sample_count
        );
    }

    /// Energy is currently above the speech threshold.
    ///
    /// Always `false` until calibration has completed.
    pub fn is_speech_active(&self) -> bool {
        self.calibrated && self.is_speaking
    }

    /// Speech has been confirmed (above threshold for the onset duration).
    #[allow(dead_code)]
    pub fn is_speech_confirmed(&self) -> bool {
        if !self.calibrated || !self.is_speaking {
            return false;
        }
        match self.speech_onset_at {
            Some(onset) => onset.elapsed().as_millis() as u64 >= self.speech_onset_ms,
            None => false,
        }
    }

    /// The turn has ended (silence persisted for `turn_end_ms` after speech).
    ///
    /// Stays `true` until speech resumes or [`Self::reset_speech_state`] is
    /// called, since both clear the silence timestamp.
    pub fn turn_ended(&self) -> bool {
        match self.silence_onset_at {
            Some(silence_at) => silence_at.elapsed().as_millis() as u64 >= self.turn_end_ms,
            None => false,
        }
    }

    /// Clear the speech-state machine without re-running calibration.
    /// Used by the capture loop after Tokhn finishes speaking — the
    /// brief period during own playback can otherwise leave the VAD
    /// in a "speaking" state that immediately produces a false-positive
    /// segment from the trailing room reverb.
    ///
    /// The smoothed level is snapped back to the noise floor so residual
    /// energy in the filter cannot re-trigger speech on the next chunk.
    pub fn reset_speech_state(&mut self) {
        self.is_speaking = false;
        self.speech_onset_at = None;
        self.silence_onset_at = None;
        self.smoothed_rms_db = self.noise_floor_db;
    }
}