#[cfg(feature = "vad-silero")]
mod silero;
#[cfg(feature = "vad-silero")]
pub use silero::SileroVad;
use std::collections::VecDeque;
use crate::TranscribeError;
/// A voice-activity detector that classifies audio one frame at a time.
///
/// Implementors must be [`Send`] so a detector can be moved to another thread.
pub trait Vad: Send {
    /// Frame length, in samples, that this detector works with.
    fn frame_size(&self) -> usize;

    /// Classifies a single frame of `f32` samples.
    ///
    /// Returns `Ok(true)` when the detector considers the frame speech and
    /// `Ok(false)` otherwise.
    ///
    /// # Errors
    ///
    /// Returns a [`TranscribeError`] when the implementation cannot process
    /// the frame.
    fn is_speech(&mut self, frame: &[f32]) -> Result<bool, TranscribeError>;

    /// Hands back any audio the detector buffered ahead of a speech onset.
    ///
    /// The default implementation keeps no such buffer and therefore always
    /// returns an empty vector.
    fn drain_prefill(&mut self) -> Vec<f32> {
        Vec::default()
    }

    /// Returns the detector to its initial state.
    fn reset(&mut self);
}
/// Stateless voice-activity detector that thresholds the RMS level of a frame.
///
/// A frame counts as speech when its root-mean-square amplitude is strictly
/// greater than `threshold_rms`.
#[derive(Debug, Clone)]
pub struct EnergyVad {
    /// Exact number of samples every frame handed to the detector must contain.
    frame_size: usize,
    /// RMS level a frame has to exceed to be classified as speech.
    threshold_rms: f32,
}
impl EnergyVad {
pub fn new(frame_size: usize, threshold_rms: f32) -> Self {
Self {
frame_size,
threshold_rms,
}
}
}
impl Vad for EnergyVad {
    /// Number of samples every frame must contain.
    fn frame_size(&self) -> usize {
        self.frame_size
    }

    /// Reports whether the RMS level of `frame` is strictly above the threshold.
    ///
    /// # Errors
    ///
    /// Returns [`TranscribeError::Audio`] when `frame` does not hold exactly
    /// `frame_size` samples.
    fn is_speech(&mut self, frame: &[f32]) -> Result<bool, TranscribeError> {
        let received = frame.len();
        if received != self.frame_size {
            let message = format!(
                "expected {} samples, got {}",
                self.frame_size, received
            );
            return Err(TranscribeError::Audio(message));
        }

        // Root-mean-square: square every sample, average, then take the root.
        let energy: f32 = frame.iter().map(|&sample| sample * sample).sum();
        let rms = (energy / received as f32).sqrt();
        Ok(rms > self.threshold_rms)
    }

    /// No-op: the detector carries no state between frames.
    fn reset(&mut self) {}
}
/// Wraps another [`Vad`] and smooths its per-frame decisions.
///
/// Speech is only reported after `onset_frames` consecutive positive frames,
/// keeps being reported for up to `hangover_frames` negative frames afterwards,
/// and the most recent frames are retained so audio preceding the onset can be
/// recovered.
pub struct SmoothedVad {
    /// Detector producing the raw per-frame decision.
    inner: Box<dyn Vad>,

    // --- configuration -------------------------------------------------
    /// How many earlier frames to retain; `0` disables buffering.
    prefill_frames: usize,
    /// Negative frames tolerated before speech is considered finished.
    hangover_frames: usize,
    /// Consecutive positive frames required before speech is reported.
    onset_frames: usize,

    // --- running state -------------------------------------------------
    /// Rolling window of the most recent frames (at most `prefill_frames + 1`).
    frame_buffer: VecDeque<Vec<f32>>,
    /// Positive frames seen in a row while not yet in speech.
    onset_counter: usize,
    /// Negative frames still tolerated before leaving speech.
    hangover_counter: usize,
    /// Whether the smoothed state is currently "speech".
    in_speech: bool,
    /// Set when speech has just started and the prefill has not been drained.
    at_onset: bool,
}
impl SmoothedVad {
pub fn new(
inner: Box<dyn Vad>,
prefill_frames: usize,
hangover_frames: usize,
onset_frames: usize,
) -> Self {
Self {
inner,
onset_frames,
hangover_frames,
prefill_frames,
frame_buffer: VecDeque::new(),
hangover_counter: 0,
onset_counter: 0,
in_speech: false,
at_onset: false,
}
}
pub fn in_speech(&self) -> bool {
self.in_speech
}
pub fn frame_buffer(&self) -> &VecDeque<Vec<f32>> {
&self.frame_buffer
}
}
impl Vad for SmoothedVad {
    /// Frame length of the wrapped detector.
    fn frame_size(&self) -> usize {
        self.inner.frame_size()
    }

    /// Feeds one frame through the wrapped detector and returns the smoothed
    /// decision.
    ///
    /// The frame is copied into the rolling buffer *before* the wrapped
    /// detector runs, so it stays buffered even when that detector fails.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the wrapped detector.
    fn is_speech(&mut self, frame: &[f32]) -> Result<bool, TranscribeError> {
        if self.prefill_frames > 0 {
            self.frame_buffer.push_back(frame.to_vec());
            // One extra slot: the newest entry is the frame being classified.
            let limit = self.prefill_frames + 1;
            while self.frame_buffer.len() > limit {
                self.frame_buffer.pop_front();
            }
        }

        let raw = self.inner.is_speech(frame)?;

        let smoothed = if raw {
            if self.in_speech {
                // Ongoing speech: every positive frame re-arms the hangover.
                self.hangover_counter = self.hangover_frames;
                true
            } else {
                // Candidate onset: count consecutive positive frames.
                self.onset_counter += 1;
                let confirmed = self.onset_counter >= self.onset_frames;
                if confirmed {
                    self.in_speech = true;
                    self.at_onset = true;
                    self.hangover_counter = self.hangover_frames;
                    self.onset_counter = 0;
                }
                confirmed
            }
        } else if self.in_speech {
            // Negative frame during speech: spend hangover budget, or stop.
            let holding = self.hangover_counter > 0;
            if holding {
                self.hangover_counter -= 1;
            } else {
                self.in_speech = false;
            }
            holding
        } else {
            // A negative frame breaks any run of positive frames.
            self.onset_counter = 0;
            false
        };

        Ok(smoothed)
    }

    /// Returns the buffered samples that preceded the most recent onset.
    ///
    /// Yields data only once per onset; otherwise the result is empty. The
    /// newest buffered frame is dropped first (presumably because the caller
    /// already holds it), and the remaining frames are concatenated oldest
    /// first, leaving the buffer empty.
    fn drain_prefill(&mut self) -> Vec<f32> {
        if self.at_onset {
            self.at_onset = false;
            let _ = self.frame_buffer.pop_back();

            let sample_count = self.frame_buffer.len() * self.inner.frame_size();
            let mut prefill = Vec::with_capacity(sample_count);
            while let Some(chunk) = self.frame_buffer.pop_front() {
                prefill.extend(chunk);
            }
            prefill
        } else {
            Vec::new()
        }
    }

    /// Clears buffered audio and counters, then resets the wrapped detector.
    fn reset(&mut self) {
        self.in_speech = false;
        self.at_onset = false;
        self.onset_counter = 0;
        self.hangover_counter = 0;
        self.frame_buffer.clear();
        self.inner.reset();
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Frame length shared by most fixtures in this module.
    const FRAME_LEN: usize = 480;

    /// Full-scale frame; its RMS (1.0) is far above the 0.01 test threshold.
    fn loud() -> Vec<f32> {
        vec![1.0f32; FRAME_LEN]
    }

    /// All-zero frame; its RMS (0.0) is below any positive threshold.
    fn quiet() -> Vec<f32> {
        vec![0.0f32; FRAME_LEN]
    }

    /// Builds a `SmoothedVad` around an `EnergyVad` (480 samples, threshold 0.01).
    ///
    /// The argument order here differs from `SmoothedVad::new`.
    fn make_smoothed(onset: usize, hangover: usize, prefill: usize) -> SmoothedVad {
        let inner = EnergyVad::new(FRAME_LEN, 0.01);
        SmoothedVad::new(Box::new(inner), prefill, hangover, onset)
    }

    // ---------------------------------------------------------------------
    // EnergyVad
    // ---------------------------------------------------------------------

    #[test]
    fn energy_vad_silence_below_threshold() {
        let mut vad = EnergyVad::new(FRAME_LEN, 0.01);
        assert!(!vad.is_speech(&quiet()).unwrap());
    }

    #[test]
    fn energy_vad_loud_signal_above_threshold() {
        let mut vad = EnergyVad::new(FRAME_LEN, 0.01);
        assert!(vad.is_speech(&loud()).unwrap());
    }

    #[test]
    fn energy_vad_wrong_frame_size() {
        let mut vad = EnergyVad::new(FRAME_LEN, 0.01);
        let truncated = vec![0.0f32; 100];
        assert!(vad.is_speech(&truncated).is_err());
    }

    #[test]
    fn energy_vad_threshold_boundary() {
        let mut vad = EnergyVad::new(FRAME_LEN, 0.5);

        let below = vec![0.1f32; FRAME_LEN];
        assert!(!vad.is_speech(&below).unwrap());

        let above = vec![0.9f32; FRAME_LEN];
        assert!(vad.is_speech(&above).unwrap());
    }

    #[test]
    fn energy_vad_frame_size_getter() {
        let vad = EnergyVad::new(320, 0.01);
        assert_eq!(vad.frame_size(), 320);
    }

    // ---------------------------------------------------------------------
    // SmoothedVad
    // ---------------------------------------------------------------------

    #[test]
    fn smoothed_onset_requires_n_frames() {
        let mut vad = make_smoothed(3, 5, 0);
        let speech = loud();

        // The first two positive frames are not enough to trigger speech.
        for _ in 0..2 {
            assert!(!vad.is_speech(&speech).unwrap());
            assert!(!vad.in_speech());
        }

        assert!(vad.is_speech(&speech).unwrap());
        assert!(vad.in_speech());
    }

    #[test]
    fn smoothed_hangover_extends_speech() {
        let mut vad = make_smoothed(1, 3, 0);
        let speech = loud();
        let silence = quiet();

        assert!(vad.is_speech(&speech).unwrap());
        assert!(vad.in_speech());

        // Three silent frames are absorbed by the hangover.
        for _ in 0..3 {
            assert!(vad.is_speech(&silence).unwrap());
            assert!(vad.in_speech());
        }

        // The fourth one ends the speech segment.
        assert!(!vad.is_speech(&silence).unwrap());
        assert!(!vad.in_speech());
    }

    #[test]
    fn smoothed_speech_resets_hangover() {
        let mut vad = make_smoothed(1, 2, 0);
        let speech = loud();
        let silence = quiet();

        assert!(vad.is_speech(&speech).unwrap());
        assert!(vad.is_speech(&silence).unwrap());
        // A positive frame restores the full hangover budget.
        assert!(vad.is_speech(&speech).unwrap());

        assert!(vad.is_speech(&silence).unwrap());
        assert!(vad.is_speech(&silence).unwrap());
        assert!(!vad.is_speech(&silence).unwrap());
    }

    #[test]
    fn smoothed_onset_counter_resets_on_silence() {
        let mut vad = make_smoothed(3, 0, 0);
        let speech = loud();
        let silence = quiet();

        assert!(!vad.is_speech(&speech).unwrap());
        assert!(!vad.is_speech(&speech).unwrap());
        // Silence interrupts the run, so counting starts over.
        assert!(!vad.is_speech(&silence).unwrap());

        assert!(!vad.is_speech(&speech).unwrap());
        assert!(!vad.is_speech(&speech).unwrap());
        assert!(!vad.in_speech());

        assert!(vad.is_speech(&speech).unwrap());
        assert!(vad.in_speech());
    }

    #[test]
    fn smoothed_reset_clears_state() {
        let mut vad = make_smoothed(1, 5, 10);
        let speech = loud();

        assert!(vad.is_speech(&speech).unwrap());
        assert!(vad.is_speech(&speech).unwrap());
        assert!(vad.in_speech());
        assert!(!vad.frame_buffer().is_empty());

        vad.reset();

        assert!(!vad.in_speech());
        assert!(vad.frame_buffer().is_empty());
    }

    #[test]
    fn smoothed_prefill_buffer_size() {
        let mut vad = make_smoothed(1, 0, 5);
        let speech = loud();

        for _ in 0..10 {
            let _ = vad.is_speech(&speech).unwrap();
        }

        // prefill_frames + 1: the newest slot holds the current frame.
        assert_eq!(vad.frame_buffer().len(), 6);
    }

    #[test]
    fn smoothed_frame_size_delegates() {
        let vad = make_smoothed(1, 0, 0);
        assert_eq!(vad.frame_size(), FRAME_LEN);
    }

    #[test]
    fn smoothed_drain_prefill_returns_pre_onset_frames() {
        let mut vad = make_smoothed(2, 0, 5);
        let speech = loud();
        let silence = quiet();

        for _ in 0..3 {
            assert!(!vad.is_speech(&silence).unwrap());
        }
        assert!(!vad.is_speech(&speech).unwrap());
        assert!(vad.is_speech(&speech).unwrap());

        // Five frames were buffered; the newest one is dropped on drain.
        let prefill = vad.drain_prefill();
        assert_eq!(prefill.len(), 4 * FRAME_LEN);
    }

    #[test]
    fn smoothed_drain_prefill_empty_without_prefill() {
        let mut vad = make_smoothed(1, 0, 0);
        let speech = loud();

        assert!(vad.is_speech(&speech).unwrap());

        let prefill = vad.drain_prefill();
        assert!(prefill.is_empty());
    }

    #[test]
    fn smoothed_no_buffering_when_prefill_zero() {
        let mut vad = make_smoothed(1, 0, 0);
        let speech = loud();

        for _ in 0..10 {
            let _ = vad.is_speech(&speech).unwrap();
        }

        assert!(vad.frame_buffer().is_empty());
    }
}