use std::sync::{Mutex, OnceLock};
use voice_activity_detector::VoiceActivityDetector;
/// Number of samples per detector window: passed to the detector builder as its chunk size and
/// used as the stride when slicing audio for prediction.
const CHUNK_SIZE: usize = 512;
/// Minimum predicted probability for a single window to be counted as speech.
const SPEECH_THRESHOLD: f32 = 0.5;
/// Minimum fraction of windows that must be counted as speech for a buffer to be reported as
/// containing speech.
const MIN_SPEECH_RATIO: f32 = 0.05;
/// RMS amplitude below which a buffer is treated as silence and speech detection is skipped.
const SILENCE_RMS_FLOOR: f32 = 0.005;
/// Process-wide detector instance, populated on first use by `get_vad` and guarded by a mutex.
static VAD: OnceLock<Mutex<VoiceActivityDetector>> = OnceLock::new();
/// Returns the shared, lazily constructed voice-activity detector.
///
/// The detector is built on the first call with a 16 kHz sample rate and `CHUNK_SIZE`-sample
/// windows, then cached in `VAD` for every later call.
///
/// # Errors
///
/// Returns the builder's error when the detector cannot be constructed. Nothing is cached in
/// that case, so a later call will attempt construction again.
fn get_vad() -> Result<&'static Mutex<VoiceActivityDetector>, voice_activity_detector::Error> {
    match VAD.get() {
        Some(shared) => Ok(shared),
        None => {
            let detector = VoiceActivityDetector::builder()
                .sample_rate(16000)
                .chunk_size(CHUNK_SIZE)
                .build()?;
            // If another thread stored a detector in the meantime, `get_or_init` returns that
            // one and the instance built here is simply dropped.
            Ok(VAD.get_or_init(|| Mutex::new(detector)))
        }
    }
}
/// Reports whether `samples` appears to contain speech.
///
/// The check runs in two stages: a cheap RMS gate that rejects buffers quieter than
/// `SILENCE_RMS_FLOOR`, followed by model-based detection via `detect_speech`.
///
/// Returns `false` for an empty buffer, for audio below the noise floor, when the detector
/// finds too little speech, and also when detection itself fails (the failure is logged as a
/// warning rather than propagated).
pub fn contains_speech(samples: &[f32]) -> bool {
    if samples.is_empty() {
        return false;
    }

    let rms = audio_rms(samples);
    if rms < SILENCE_RMS_FLOOR {
        log::debug!("VAD: audio below noise floor (RMS={rms:.6}), skipping");
        return false;
    }
    log::debug!("VAD: audio RMS={rms:.4}, running speech detection");

    // Handle the failure case up front so the success path reads straight through.
    let has_speech = match detect_speech(samples) {
        Ok(verdict) => verdict,
        Err(e) => {
            log::warn!("VAD inference failed, conservatively skipping transcription: {e}");
            return false;
        }
    };

    if !has_speech {
        log::info!("VAD: no speech detected (RMS={rms:.4})");
    }
    has_speech
}
/// Runs the shared detector over `samples` and decides whether enough of it is speech.
///
/// The audio is split into consecutive windows of `CHUNK_SIZE` samples (the last window may be
/// shorter). A window counts as speech when its predicted probability is at least
/// `SPEECH_THRESHOLD`. The function returns `Ok(true)` as soon as the number of speech windows
/// reaches `ceil(MIN_SPEECH_RATIO * total_windows)`; otherwise it compares the final ratio of
/// speech windows against `MIN_SPEECH_RATIO`. An empty buffer yields `Ok(false)`.
///
/// # Errors
///
/// Propagates the error from `get_vad` when the detector cannot be constructed.
fn detect_speech(samples: &[f32]) -> Result<bool, voice_activity_detector::Error> {
    // A poisoned mutex is recovered instead of propagated, so an earlier panic while holding
    // the lock does not disable detection for the rest of the process.
    let mut detector = get_vad()?
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    // Reset before predicting; presumably this clears state carried over from a previous
    // buffer — confirm against the detector crate's documentation.
    detector.reset();

    let total_chunks = samples.len().div_ceil(CHUNK_SIZE);
    if total_chunks == 0 {
        return Ok(false);
    }

    // Smallest number of speech windows that satisfies the ratio; enables the early exit.
    let needed = (MIN_SPEECH_RATIO * total_chunks as f32).ceil() as usize;
    let mut speech_chunks: usize = 0;

    for window in samples.chunks(CHUNK_SIZE) {
        let score = detector.predict(window.iter().copied());
        if score >= SPEECH_THRESHOLD {
            speech_chunks += 1;
            if speech_chunks >= needed {
                log::debug!(
                    "VAD: early exit — {speech_chunks}/{total_chunks} chunks contain speech"
                );
                return Ok(true);
            }
        }
    }

    let speech_ratio = speech_chunks as f32 / total_chunks as f32;
    log::debug!(
        "VAD: {speech_chunks}/{total_chunks} chunks contain speech (ratio={speech_ratio:.2})"
    );
    Ok(speech_ratio >= MIN_SPEECH_RATIO)
}
/// Returns `true` when `samples` is empty or its RMS amplitude is below `SILENCE_RMS_FLOOR`.
// Outside of test builds the dead-code lint is silenced for this helper.
#[cfg_attr(not(test), allow(dead_code))]
fn is_below_noise_floor(samples: &[f32]) -> bool {
    samples.is_empty() || audio_rms(samples) < SILENCE_RMS_FLOOR
}
/// Computes the root-mean-square amplitude of `samples`.
///
/// Returns `0.0` for an empty slice.
///
/// The squares are accumulated in `f64`. A sequential `f32` sum loses low-order bits once the
/// running total dwarfs each addend, which skews the result for long buffers (on the order of
/// a percent for a million samples); `f64` keeps the error negligible at any realistic length.
fn audio_rms(samples: &[f32]) -> f32 {
    if samples.is_empty() {
        return 0.0;
    }

    let sum_of_squares: f64 = samples
        .iter()
        .map(|&sample| {
            let wide = f64::from(sample);
            wide * wide
        })
        .sum();

    // Narrowing to f32 is intentional: callers work in f32, and the RMS of f32 samples always
    // fits in the f32 range.
    (sum_of_squares / samples.len() as f64).sqrt() as f32
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty buffer is rejected immediately.
    #[test]
    fn test_empty_audio_has_no_speech() {
        assert!(!contains_speech(&[]));
    }

    /// 16 000 all-zero samples have an RMS of zero and never reach the detector.
    #[test]
    fn test_silence_has_no_speech() {
        let samples = vec![0.0_f32; 16_000];
        assert!(!contains_speech(&samples));
    }

    /// A constant 0.001 signal has an RMS of 0.001, below the noise floor.
    #[test]
    fn test_low_noise_has_no_speech() {
        let samples = vec![0.001_f32; 16_000];
        assert!(!contains_speech(&samples));
    }

    /// Exercises the RMS gate on either side of the floor, plus the empty case.
    #[test]
    fn test_noise_floor_check() {
        assert!(is_below_noise_floor(&[0.0; 1000]));
        assert!(is_below_noise_floor(&[0.001; 1000]));
        assert!(!is_below_noise_floor(&[0.1; 1000]));
        assert!(is_below_noise_floor(&[]));
    }

    /// The detector can be constructed in the test environment.
    #[test]
    fn test_vad_model_loads() {
        let result = get_vad();
        assert!(result.is_ok(), "VAD model failed to load: {result:?}");
    }

    /// Calls the detector directly, bypassing the RMS gate. Despite the test name, the input
    /// is a constant 0.01 signal (above the noise floor) rather than zeros; it must still not
    /// be classified as speech.
    #[test]
    fn test_detect_speech_on_silence() {
        let samples = vec![0.01_f32; 16_000];
        let result = detect_speech(&samples);
        assert!(result.is_ok());
        assert!(!result.unwrap());
    }
}