mod silero;
use crate::error::*;
use crate::vad::silero::{Silero, SileroConfig};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use tokio::sync::broadcast::{channel, Receiver, Sender};
use tracing::{debug, error};
/// Settings for [`VoiceActivityDetection`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceActivityDetectionConfig {
/// Capacity passed to the broadcast channel that carries detected speech segments.
pub speech_channel_size: usize,
/// Capacity of the broadcast channel that carries per-chunk probabilities.
/// When this is zero no probability channel is created.
pub probability_channel_size: usize,
/// Number of consecutive below-threshold chunks after which an active speech
/// segment is emitted. While no speech is active, older chunks are dropped from
/// the buffer once the silence run exceeds this count. Also used to size the
/// initial buffer capacity.
pub padding_size: usize,
/// A chunk whose predicted probability is greater than or equal to this value
/// is treated as speech.
pub threshold: f32,
/// Configuration handed to `Silero::init`.
pub silero_config: SileroConfig,
}
/// Consumes individual samples from a broadcast channel, scores them chunk by
/// chunk with the Silero model, and broadcasts buffered speech segments.
pub struct VoiceActivityDetection {
/// Model wrapper used to predict a probability for each chunk.
silero: Silero,
/// Settings supplied to `init`.
config: VoiceActivityDetectionConfig,
/// Source of incoming samples, received one `f32` at a time.
sample_receiver: Receiver<f32>,
/// Broadcasts the buffered samples of each finished speech segment.
speech_sender: Sender<Vec<f32>>,
/// Broadcasts each chunk's probability; `None` when the configured
/// probability channel size is zero.
probability_sender: Option<Sender<f32>>,
/// Samples accumulated since the last emitted segment (minus dropped silence).
buffer: VecDeque<f32>,
/// Count of consecutive chunks that scored below the threshold.
silence_count: usize,
/// Set once a chunk meets the threshold; cleared when the segment is emitted.
is_speech_active: bool,
/// Samples received since the last chunk was evaluated.
cumulative_sample_count: usize,
}
impl VoiceActivityDetection {
pub fn init(
config: VoiceActivityDetectionConfig,
sample_receiver: Receiver<f32>,
) -> Result<Self> {
debug!("Initializing NihilityAsrVad with config {:?}", &config);
let silero = Silero::init(config.silero_config.clone())?;
let buffer = VecDeque::with_capacity(silero.config.chunk_size * (config.padding_size + 2));
let (speech_sender, _) = channel(config.speech_channel_size);
let probability_sender = if config.probability_channel_size > 0 {
let (probability_sender, _) = channel(config.probability_channel_size);
Some(probability_sender)
} else {
None
};
Ok(Self {
silero,
config,
sample_receiver,
speech_sender,
probability_sender,
buffer,
silence_count: 0,
is_speech_active: false,
cumulative_sample_count: 0,
})
}
pub fn get_probability_receiver(&self) -> Result<Receiver<f32>> {
self.probability_sender
.as_ref()
.ok_or(NihilityListenerError::Init(
"probability channel size must greater than zero".to_string(),
))
.map(|sender| sender.subscribe())
}
pub fn get_speech_receiver(&self) -> Receiver<Vec<f32>> {
self.speech_sender.subscribe()
}
pub async fn run(&mut self) -> Result<()> {
while let Ok(sample) = self.sample_receiver.recv().await {
self.buffer.push_back(sample);
self.cumulative_sample_count += 1;
if self.cumulative_sample_count == self.silero.config.chunk_size {
self.cumulative_sample_count = 0;
let probability = self.silero.predict(
&self
.buffer
.range((self.buffer.len() - self.silero.config.chunk_size)..)
.copied()
.collect::<Vec<f32>>(),
)?;
if let Some(probability_sender) = self.probability_sender.as_ref()
&& let Err(e) = probability_sender.send(probability)
{
error!("Error sending probability: {:?}", e);
}
if probability >= self.config.threshold {
self.is_speech_active = true;
self.silence_count = 0;
} else {
self.silence_count += 1;
if self.silence_count >= self.config.padding_size {
if self.is_speech_active {
self.is_speech_active = false;
self.silence_count = 0;
self.speech_sender.send(self.buffer.drain(..).collect())?;
} else {
if self.silence_count != self.config.padding_size {
self.silence_count -= 1;
self.buffer.drain(..self.silero.config.chunk_size);
}
}
}
}
}
}
Ok(())
}
}
impl Default for VoiceActivityDetectionConfig {
    /// Returns the default settings: a speech channel capacity of 5, the
    /// probability channel disabled (size 0), a padding of 4 chunks, a speech
    /// threshold of 0.01, and the default Silero configuration.
    fn default() -> Self {
        let silero_config = SileroConfig::default();
        Self {
            speech_channel_size: 5,
            probability_channel_size: 0,
            padding_size: 4,
            threshold: 0.01,
            silero_config,
        }
    }
}