#![forbid(unsafe_code)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_lossless)]
/// Number of entries in `FrameFeatures::subband_energy`; also the modulus used by
/// `compute_subband_energies` to assign samples to bands.
const NUM_SUBBANDS: usize = 4;
/// Default for `VadConfig::energy_threshold_db`: the margin (converted to a linear
/// ratio by `db_to_linear_energy`) that frame energy must exceed over the noise floor.
const DEFAULT_ENERGY_THRESHOLD_DB: f32 = 10.0;
/// Default for `VadConfig::hangover_frames`: how many non-voice frames are still
/// reported as `Voice` after the last raw voice frame.
const DEFAULT_HANGOVER_FRAMES: u32 = 8;
/// Smoothing coefficient applied to the noise floor when the frame energy is not
/// above the current floor (weight kept on the previous estimate).
const NOISE_TRACK_ALPHA: f32 = 0.995;
/// Smoothing coefficient applied to the noise floor when the frame energy is above
/// the current floor; smaller than `NOISE_TRACK_ALPHA`, so the floor moves faster.
const NOISE_TRACK_ALPHA_RISE: f32 = 0.90;
/// Default for `VadConfig::spectral_flatness_threshold`: flatness strictly below this
/// value counts as a speech cue.
const SPECTRAL_FLATNESS_THRESHOLD: f32 = 0.70;
/// Default for `VadConfig::zcr_high_threshold`: a zero-crossing rate (crossings scaled
/// to a 160-sample frame) strictly above this value counts as a noise cue.
const ZCR_HIGH_THRESHOLD: f32 = 60.0;
/// Outcome of classifying a single audio frame.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VadDecision {
    /// The frame is reported as containing voice.
    Voice,
    /// The frame is reported as not containing voice.
    Silence,
}

impl VadDecision {
    /// Returns `true` for [`VadDecision::Voice`] and `false` for
    /// [`VadDecision::Silence`].
    #[must_use]
    pub fn is_voice(self) -> bool {
        matches!(self, Self::Voice)
    }
}
/// Per-frame measurements consumed by the voice activity detector.
///
/// The field descriptions below state what [`FrameFeatures::from_pcm_i16`] stores;
/// because the fields are public, callers may also fill them in themselves.
#[derive(Debug, Clone)]
pub struct FrameFeatures {
    /// Sum of squared sample values over the frame (not divided by the frame length).
    pub energy: f32,
    /// Geometric-to-arithmetic mean ratio of `subband_energy`, capped at 1.0.
    pub spectral_flatness: f32,
    /// Number of sign changes between consecutive samples, rescaled to a
    /// 160-sample frame.
    pub zcr: f32,
    /// Per-band values returned by `compute_subband_energies`.
    pub subband_energy: [f32; NUM_SUBBANDS],
}

impl FrameFeatures {
    /// Extracts features from signed 16-bit PCM samples.
    ///
    /// An empty slice yields zero energy, zero ZCR, all-zero sub-bands and a
    /// spectral flatness of 1.0. `sample_rate` is only forwarded to
    /// `compute_subband_energies`.
    #[must_use]
    pub fn from_pcm_i16(samples: &[i16], sample_rate: u32) -> Self {
        // The ZCR is expressed as "crossings per 160 samples", whatever the real
        // frame length is.
        const ZCR_REFERENCE_LEN: f32 = 160.0;

        if samples.is_empty() {
            return Self::zeroed();
        }

        let energy = samples
            .iter()
            .map(|&sample| {
                let value = f32::from(sample);
                value * value
            })
            .sum::<f32>();

        // A crossing is any adjacent pair where exactly one sample is negative
        // (zero counts as non-negative).
        let crossings: u32 = samples
            .windows(2)
            .map(|pair| u32::from((pair[0] < 0) != (pair[1] < 0)))
            .sum();
        let zcr = crossings as f32 * (ZCR_REFERENCE_LEN / samples.len() as f32);

        let subband_energy = compute_subband_energies(samples, sample_rate);
        let spectral_flatness = spectral_flatness_from_bands(&subband_energy);

        Self {
            energy,
            spectral_flatness,
            zcr,
            subband_energy,
        }
    }

    /// Extracts features from floating-point PCM samples.
    ///
    /// Each sample is clamped to `[-1.0, 1.0]`, multiplied by 32767 and converted
    /// with an `as i16` cast (fractional part dropped) before being handed to
    /// [`FrameFeatures::from_pcm_i16`].
    #[must_use]
    pub fn from_pcm_f32(samples: &[f32], sample_rate: u32) -> Self {
        let mut quantised: Vec<i16> = Vec::with_capacity(samples.len());
        for &sample in samples {
            let scaled = sample.clamp(-1.0, 1.0) * 32767.0;
            quantised.push(scaled as i16);
        }
        Self::from_pcm_i16(&quantised, sample_rate)
    }

    /// Features reported for an empty frame: no energy, no crossings, and a
    /// flatness of 1.0.
    fn zeroed() -> Self {
        Self {
            subband_energy: [0.0; NUM_SUBBANDS],
            spectral_flatness: 1.0,
            energy: 0.0,
            zcr: 0.0,
        }
    }
}
/// Computes the mean squared sample value of each of the `NUM_SUBBANDS` bands.
///
/// Samples are assigned to bands by position: sample `i` contributes to band
/// `i % NUM_SUBBANDS`. No frequency-domain filtering is performed and
/// `_sample_rate` is currently unused.
///
/// Each band is averaged over the number of samples it actually received, so
/// frames whose length is not a multiple of `NUM_SUBBANDS` are not biased towards
/// the lower-indexed bands. Bands that received no samples (including every band
/// of an empty frame) are reported as `0.0`.
fn compute_subband_energies(samples: &[i16], _sample_rate: u32) -> [f32; NUM_SUBBANDS] {
    let mut bands = [0.0f32; NUM_SUBBANDS];

    // Walking the frame in groups of NUM_SUBBANDS lines each sample up with its
    // band; the final group may be shorter and then only feeds the leading bands.
    for group in samples.chunks(NUM_SUBBANDS) {
        for (band, &sample) in bands.iter_mut().zip(group) {
            let value = f32::from(sample);
            *band += value * value;
        }
    }

    let full_groups = samples.len() / NUM_SUBBANDS;
    let leftover = samples.len() % NUM_SUBBANDS;
    for (index, band) in bands.iter_mut().enumerate() {
        // The first `leftover` bands received one extra sample from the tail.
        let count = full_groups + usize::from(index < leftover);
        if count > 0 {
            *band /= count as f32;
        }
    }

    bands
}
/// Returns the ratio of the geometric mean to the arithmetic mean of the band
/// energies, capped at 1.0.
///
/// A small constant is added to every band before either mean is taken so that
/// an all-zero band never reaches `ln(0)`. If the arithmetic mean is not
/// strictly positive the function returns 1.0.
fn spectral_flatness_from_bands(bands: &[f32; NUM_SUBBANDS]) -> f32 {
    const ENERGY_OFFSET: f32 = 1e-6;
    let band_count = NUM_SUBBANDS as f32;

    let mut linear_total = 0.0_f32;
    let mut log_total = 0.0_f32;
    for &band in bands {
        let lifted = band + ENERGY_OFFSET;
        linear_total += lifted;
        log_total += lifted.ln();
    }

    let arithmetic = linear_total / band_count;
    // Geometric mean computed in the log domain: exp(mean(ln(x))).
    let geometric = (log_total / band_count).exp();

    if arithmetic > 0.0 {
        f32::min(geometric / arithmetic, 1.0)
    } else {
        1.0
    }
}
/// Tunable parameters of [`VoiceActivityDetector`].
#[derive(Debug, Clone)]
pub struct VadConfig {
    /// Margin in dB: the energy cue fires when frame energy exceeds the tracked
    /// noise floor multiplied by `10^(energy_threshold_db / 10)`.
    pub energy_threshold_db: f32,
    /// Number of non-voice frames still reported as `Voice` after a voice frame.
    pub hangover_frames: u32,
    /// The spectral cue fires when the frame's flatness is strictly below this value.
    pub spectral_flatness_threshold: f32,
    /// The ZCR cue fires when the frame's zero-crossing rate is *not* strictly
    /// above this value.
    pub zcr_high_threshold: f32,
    /// Weight of the energy cue in the speech score; the remainder
    /// (`1.0 - energy_weight`) is split evenly between the spectral and ZCR cues.
    pub energy_weight: f32,
}

impl Default for VadConfig {
    /// Builds a configuration from the file-level default constants and an
    /// energy weight of 0.6.
    fn default() -> Self {
        const DEFAULT_ENERGY_WEIGHT: f32 = 0.6;

        Self {
            energy_weight: DEFAULT_ENERGY_WEIGHT,
            energy_threshold_db: DEFAULT_ENERGY_THRESHOLD_DB,
            spectral_flatness_threshold: SPECTRAL_FLATNESS_THRESHOLD,
            zcr_high_threshold: ZCR_HIGH_THRESHOLD,
            hangover_frames: DEFAULT_HANGOVER_FRAMES,
        }
    }
}
/// Stateful, frame-by-frame voice activity detector.
///
/// Every frame is scored from three binary cues (energy above the tracked noise
/// floor, low spectral flatness, zero-crossing rate not above its threshold). A
/// score of at least 0.5 marks the frame as voice. After a voice frame, up to
/// `hangover_frames` following non-voice frames are still reported as
/// [`VadDecision::Voice`]. The noise floor is only adapted on frames whose raw
/// score is non-voice.
#[derive(Debug, Clone)]
pub struct VoiceActivityDetector {
    /// Thresholds and weights supplied at construction; not modified afterwards.
    config: VadConfig,
    /// Running estimate of the background energy; starts at 1.0.
    noise_floor: f32,
    /// Remaining non-voice frames that will still be reported as `Voice`.
    hangover: u32,
    /// Frames processed since construction or the last `reset`.
    frame_count: u64,
    /// Exponentially smoothed frame energy. It is maintained on every frame but
    /// is not read by the decision logic.
    smoothed_energy: f32,
}

impl VoiceActivityDetector {
    /// Creates a detector with a noise floor of 1.0 and no hangover pending.
    #[must_use]
    pub fn new(config: VadConfig) -> Self {
        Self {
            config,
            noise_floor: 1.0,
            hangover: 0,
            frame_count: 0,
            smoothed_energy: 0.0,
        }
    }

    /// Extracts features from 16-bit PCM samples and classifies the frame.
    pub fn process_i16(&mut self, samples: &[i16], sample_rate: u32) -> VadDecision {
        let features = FrameFeatures::from_pcm_i16(samples, sample_rate);
        self.process_features(&features)
    }

    /// Extracts features from floating-point PCM samples and classifies the frame.
    pub fn process_f32(&mut self, samples: &[f32], sample_rate: u32) -> VadDecision {
        let features = FrameFeatures::from_pcm_f32(samples, sample_rate);
        self.process_features(&features)
    }

    /// Classifies one frame from pre-computed features and updates the detector
    /// state (frame counter, smoothed energy, noise floor and hangover).
    pub fn process_features(&mut self, features: &FrameFeatures) -> VadDecision {
        self.frame_count += 1;
        self.smoothed_energy = 0.9 * self.smoothed_energy + 0.1 * features.energy;

        let raw_voice = self.speech_score(features) >= 0.5;

        if !raw_voice {
            // The floor follows increases with the faster coefficient and
            // decreases with the slower one; the tracked energy is never taken
            // below 1.0.
            let alpha = if features.energy > self.noise_floor {
                NOISE_TRACK_ALPHA_RISE
            } else {
                NOISE_TRACK_ALPHA
            };
            self.noise_floor =
                alpha * self.noise_floor + (1.0 - alpha) * features.energy.max(1.0);
        }

        if raw_voice {
            // Re-arm the hangover on every voice frame.
            self.hangover = self.config.hangover_frames;
            VadDecision::Voice
        } else if self.hangover > 0 {
            self.hangover -= 1;
            VadDecision::Voice
        } else {
            VadDecision::Silence
        }
    }

    /// Weighted sum of the three binary cues, evaluated against the noise floor
    /// as it stands before this frame's adaptation.
    fn speech_score(&self, features: &FrameFeatures) -> f32 {
        let cfg = &self.config;

        let energy_gate = self.noise_floor * db_to_linear_energy(cfg.energy_threshold_db);
        let energy_above_noise = features.energy > energy_gate;
        let spectral_speech = features.spectral_flatness < cfg.spectral_flatness_threshold;
        let zcr_noise = features.zcr > cfg.zcr_high_threshold;

        // Each cue contributes either 0.0 or its full weight.
        let vote = |cue: bool| f32::from(u8::from(cue));
        let energy_weight = cfg.energy_weight;
        let secondary_weight = (1.0 - energy_weight) * 0.5;

        energy_weight * vote(energy_above_noise)
            + secondary_weight * vote(spectral_speech)
            + secondary_weight * vote(!zcr_noise)
    }

    /// Restores the initial state (noise floor 1.0, no hangover, zero frame
    /// count and smoothed energy). The configuration is kept.
    pub fn reset(&mut self) {
        self.noise_floor = 1.0;
        self.hangover = 0;
        self.frame_count = 0;
        self.smoothed_energy = 0.0;
    }

    /// Current noise-floor estimate.
    #[must_use]
    pub fn noise_floor(&self) -> f32 {
        self.noise_floor
    }

    /// Number of frames processed since construction or the last `reset`.
    #[must_use]
    pub fn frame_count(&self) -> u64 {
        self.frame_count
    }
}
/// Converts a level in decibels to a linear energy ratio, `10^(db / 10)`.
#[inline]
fn db_to_linear_energy(db: f32) -> f32 {
    let exponent = db / 10.0;
    f32::powf(10.0, exponent)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::f32::consts::PI;

    /// A frame of `len` zero-valued samples.
    fn silence_frame(len: usize) -> Vec<i16> {
        vec![0i16; len]
    }

    /// A frame that flips between `+amplitude` and `-amplitude` on every sample,
    /// i.e. the signal with the highest possible zero-crossing count.
    fn alternating_frame(len: usize, amplitude: i16) -> Vec<i16> {
        (0..len)
            .map(|i| if i % 2 == 0 { amplitude } else { -amplitude })
            .collect()
    }

    /// A 200 Hz sine with a peak of 26000, generated for a 16 kHz sample rate.
    fn loud_sine(len: usize) -> Vec<i16> {
        let freq = 200.0f32;
        let sr = 16000.0f32;
        (0..len)
            .map(|i| {
                let t = i as f32 / sr;
                ((2.0 * PI * freq * t).sin() * 26000.0) as i16
            })
            .collect()
    }

    #[test]
    fn test_silence_classified_as_silence() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        for _ in 0..30 {
            vad.process_i16(&silence_frame(160), 16000);
        }
        let decision = vad.process_i16(&silence_frame(160), 16000);
        assert_eq!(decision, VadDecision::Silence);
    }

    #[test]
    fn test_loud_speech_classified_as_voice() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        for _ in 0..10 {
            vad.process_i16(&silence_frame(160), 16000);
        }
        let frame = loud_sine(160);
        let decision = vad.process_i16(&frame, 16000);
        assert_eq!(decision, VadDecision::Voice);
    }

    #[test]
    fn test_hangover_extends_voice() {
        let cfg = VadConfig {
            hangover_frames: 5,
            ..Default::default()
        };
        let mut vad = VoiceActivityDetector::new(cfg);
        for _ in 0..10 {
            vad.process_i16(&silence_frame(160), 16000);
        }
        vad.process_i16(&loud_sine(160), 16000);
        let d = vad.process_i16(&silence_frame(160), 16000);
        assert_eq!(
            d,
            VadDecision::Voice,
            "hangover should keep decision as Voice"
        );
    }

    #[test]
    fn test_frame_count_increments() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        assert_eq!(vad.frame_count(), 0);
        vad.process_i16(&silence_frame(160), 16000);
        vad.process_i16(&silence_frame(160), 16000);
        assert_eq!(vad.frame_count(), 2);
    }

    #[test]
    fn test_reset_clears_state() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        for _ in 0..20 {
            vad.process_i16(&loud_sine(160), 16000);
        }
        vad.reset();
        assert_eq!(vad.frame_count(), 0);
        // `reset` assigns the literal 1.0, so an exact comparison is intended.
        assert_eq!(vad.noise_floor(), 1.0);
    }

    #[test]
    fn test_f32_processing() {
        let mut vad = VoiceActivityDetector::new(VadConfig::default());
        for _ in 0..10 {
            // A borrowed array avoids a heap allocation per warm-up frame.
            vad.process_f32(&[0.0f32; 160], 16000);
        }
        let loud: Vec<f32> = (0..160)
            .map(|i| {
                let t = i as f32 / 16000.0;
                (2.0 * PI * 200.0 * t).sin() * 0.8
            })
            .collect();
        let d = vad.process_f32(&loud, 16000);
        assert_eq!(d, VadDecision::Voice);
    }

    #[test]
    fn test_alternating_frame_zero_crossings() {
        // Every adjacent pair changes sign: 159 crossings in a 160-sample frame,
        // and the 160-sample normalisation leaves that count unchanged.
        let feats = FrameFeatures::from_pcm_i16(&alternating_frame(160, 1000), 16000);
        assert_eq!(feats.zcr, 159.0);
        assert!(
            (feats.energy - 1.6e8).abs() < 1.0,
            "expected 160 * 1000^2, got {}",
            feats.energy
        );
    }

    #[test]
    fn test_spectral_flatness_flat_is_close_to_one() {
        let bands = [1000.0f32, 1100.0, 950.0, 1050.0];
        let sf = spectral_flatness_from_bands(&bands);
        assert!(
            sf > 0.90,
            "flat-spectrum flatness should be > 0.90, got {sf}"
        );
    }

    #[test]
    fn test_spectral_flatness_peaky_is_low() {
        let bands = [10000.0f32, 10.0, 10.0, 10.0];
        let sf = spectral_flatness_from_bands(&bands);
        assert!(
            sf < 0.50,
            "peaky-spectrum flatness should be < 0.50, got {sf}"
        );
    }

    #[test]
    fn test_frame_features_zero_input() {
        let feats = FrameFeatures::from_pcm_i16(&[], 16000);
        assert_eq!(feats.energy, 0.0);
        assert_eq!(feats.zcr, 0.0);
        assert_eq!(feats.spectral_flatness, 1.0);
        assert_eq!(feats.subband_energy, [0.0; NUM_SUBBANDS]);
    }

    #[test]
    fn test_db_to_linear_energy_10db() {
        let ratio = db_to_linear_energy(10.0);
        assert!((ratio - 10.0).abs() < 0.01, "expected ~10.0 got {ratio}");
    }
}