mod silero;
use crate::error::*;
use crate::vad::silero::{Silero, SileroConfig};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use tokio::sync::mpsc::{Receiver, Sender};
use tracing::debug;
/// Tunable parameters for [`VoiceActivityDetection`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceActivityDetectionConfig {
/// Number of consecutive below-threshold chunks that closes an active speech
/// segment. While no speech is active, the detector also trims its buffer so
/// that roughly this many silent chunks remain ahead of the next segment.
pub padding_size: usize,
/// A chunk whose predicted probability is greater than or equal to this value
/// is treated as speech.
pub threshold: f32,
/// Configuration handed to `Silero::init` when the detector is created.
pub silero_config: SileroConfig,
}
/// Streaming voice-activity detector: consumes single samples from a channel,
/// scores them chunk by chunk with Silero, and emits buffered speech segments
/// on another channel.
pub struct VoiceActivityDetection {
// Model wrapper used to score each chunk; its `config.chunk_size` sets the chunk length.
silero: Silero,
// Threshold and padding settings supplied at construction.
config: VoiceActivityDetectionConfig,
// Incoming audio, one `f32` sample per message.
sample_receiver: Receiver<f32>,
// Outgoing speech segments, each a `Vec<f32>` of buffered samples.
speech_sender: Sender<Vec<f32>>,
// Samples accumulated since the last segment was sent or trimmed.
buffer: VecDeque<f32>,
// Count of consecutive chunks scored below the threshold.
silence_count: usize,
// Set when a chunk meets the threshold; cleared when the segment is sent.
is_speech_active: bool,
// Samples received since the last chunk was scored.
cumulative_sample_count: usize,
}
impl VoiceActivityDetection {
    /// Creates a detector wired to the given sample and speech channels.
    ///
    /// The Silero model is initialised from `config.silero_config`, and the
    /// sample buffer is pre-sized to `chunk_size * (padding_size + 2)` samples;
    /// it grows on demand when a speech segment is longer than that.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `Silero::init`.
    pub fn init(
        config: VoiceActivityDetectionConfig,
        sample_receiver: Receiver<f32>,
        speech_sender: Sender<Vec<f32>>,
    ) -> Result<Self> {
        debug!("Initializing NihilityAsrVad with config {:?}", &config);
        let silero = Silero::init(config.silero_config.clone())?;
        let buffer = VecDeque::with_capacity(silero.config.chunk_size * (config.padding_size + 2));
        Ok(Self {
            silero,
            config,
            sample_receiver,
            speech_sender,
            buffer,
            silence_count: 0,
            is_speech_active: false,
            cumulative_sample_count: 0,
        })
    }

    /// Consumes samples until the sample channel closes, emitting one
    /// `Vec<f32>` per detected speech segment on the speech channel.
    ///
    /// Every `chunk_size` samples the most recent chunk is scored:
    ///
    /// * A chunk at or above `threshold` marks speech as active and resets the
    ///   silence counter.
    /// * Once `padding_size` consecutive chunks fall below the threshold while
    ///   speech is active, the whole buffer is sent as one segment.
    /// * While no speech is active, the oldest chunk is dropped each time the
    ///   silence counter would exceed `padding_size`, so the buffer keeps only
    ///   a bounded run of leading silence.
    ///
    /// When the sample channel closes while a segment is still in progress,
    /// the buffered samples are flushed as a final segment instead of being
    /// discarded.
    ///
    /// # Errors
    ///
    /// Returns an error if prediction fails or if a segment cannot be sent
    /// while samples are still arriving.
    pub async fn run(&mut self) -> Result<()> {
        let chunk_size = self.silero.config.chunk_size;
        // Scratch space reused for every scored chunk, avoiding a fresh allocation each time.
        let mut chunk: Vec<f32> = Vec::with_capacity(chunk_size);

        while let Some(sample) = self.sample_receiver.recv().await {
            self.buffer.push_back(sample);
            self.cumulative_sample_count += 1;
            if self.cumulative_sample_count != chunk_size {
                continue;
            }
            self.cumulative_sample_count = 0;

            // Score only the newest chunk; older samples are kept as context/padding.
            let start = self.buffer.len() - chunk_size;
            chunk.clear();
            chunk.extend(self.buffer.range(start..).copied());
            let probability = self.silero.predict(&chunk)?;
            debug!("Probability: {}", probability);

            if probability >= self.config.threshold {
                self.is_speech_active = true;
                self.silence_count = 0;
                continue;
            }

            self.silence_count += 1;
            if self.silence_count < self.config.padding_size {
                continue;
            }

            if self.is_speech_active {
                // Enough trailing silence: the segment is complete.
                self.is_speech_active = false;
                self.silence_count = 0;
                self.speech_sender
                    .send(self.buffer.drain(..).collect())
                    .await?;
            } else if self.silence_count != self.config.padding_size {
                // Idle and past the padding window: slide it forward by one chunk.
                self.silence_count -= 1;
                self.buffer.drain(..chunk_size);
            }
        }

        // The input ended mid-segment: hand over what was captured rather than
        // dropping it. This is best-effort because a closed speech channel at
        // shutdown only means nobody is left to receive the segment.
        if self.is_speech_active {
            self.is_speech_active = false;
            self.silence_count = 0;
            self.cumulative_sample_count = 0;
            let tail: Vec<f32> = self.buffer.drain(..).collect();
            if self.speech_sender.send(tail).await.is_err() {
                debug!("Speech receiver closed; dropping final buffered segment");
            }
        }
        Ok(())
    }
}
impl Default for VoiceActivityDetectionConfig {
    /// Default settings: a padding of 4 chunks, a speech threshold of 0.001,
    /// and the default Silero configuration.
    fn default() -> Self {
        let silero_config = SileroConfig::default();
        Self {
            padding_size: 4,
            threshold: 0.001,
            silero_config,
        }
    }
}