1mod silero;
2
3use crate::error::*;
4use crate::vad::silero::{Silero, SileroConfig};
5use serde::{Deserialize, Serialize};
6use std::collections::VecDeque;
7use tokio::sync::broadcast::{channel, Receiver, Sender};
8use tracing::{debug, error};
9
10#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct VoiceActivityDetectionConfig {
13 pub speech_channel_size: usize,
14 pub probability_channel_size: usize,
15 pub padding_size: usize,
16 pub threshold: f32,
17 pub silero_config: SileroConfig,
18}
19
20pub struct VoiceActivityDetection {
22 silero: Silero,
23 config: VoiceActivityDetectionConfig,
24 sample_receiver: Receiver<f32>,
25 speech_sender: Sender<Vec<f32>>,
26 probability_sender: Option<Sender<f32>>,
27 buffer: VecDeque<f32>,
28 silence_count: usize,
29 is_speech_active: bool,
30 cumulative_sample_count: usize,
31}
32
33impl VoiceActivityDetection {
34 pub fn init(
36 config: VoiceActivityDetectionConfig,
37 sample_receiver: Receiver<f32>,
38 ) -> Result<Self> {
39 debug!("Initializing NihilityAsrVad with config {:?}", &config);
40 let silero = Silero::init(config.silero_config.clone())?;
41 let buffer = VecDeque::with_capacity(silero.config.chunk_size * (config.padding_size + 2));
42 let (speech_sender, _) = channel(config.speech_channel_size);
43 let probability_sender = if config.probability_channel_size > 0 {
44 let (probability_sender, _) = channel(config.probability_channel_size);
45 Some(probability_sender)
46 } else {
47 None
48 };
49 Ok(Self {
50 silero,
51 config,
52 sample_receiver,
53 speech_sender,
54 probability_sender,
55 buffer,
56 silence_count: 0,
57 is_speech_active: false,
58 cumulative_sample_count: 0,
59 })
60 }
61
62 pub fn get_probability_receiver(&self) -> Result<Receiver<f32>> {
63 self.probability_sender
64 .as_ref()
65 .ok_or(NihilityListenerError::Init(
66 "probability channel size must greater than zero".to_string(),
67 ))
68 .map(|sender| sender.subscribe())
69 }
70
71 pub fn get_speech_receiver(&self) -> Receiver<Vec<f32>> {
72 self.speech_sender.subscribe()
73 }
74
75 pub async fn run(&mut self) -> Result<()> {
77 while let Ok(sample) = self.sample_receiver.recv().await {
78 self.buffer.push_back(sample);
79 self.cumulative_sample_count += 1;
80 if self.cumulative_sample_count == self.silero.config.chunk_size {
82 self.cumulative_sample_count = 0;
84 let probability = self.silero.predict(
85 &self
86 .buffer
87 .range((self.buffer.len() - self.silero.config.chunk_size)..)
88 .copied()
89 .collect::<Vec<f32>>(),
90 )?;
91 if let Some(probability_sender) = self.probability_sender.as_ref()
92 && let Err(e) = probability_sender.send(probability)
93 {
94 error!("Error sending probability: {:?}", e);
95 }
96 if probability >= self.config.threshold {
98 self.is_speech_active = true;
99 self.silence_count = 0;
100 } else {
101 self.silence_count += 1;
102 if self.silence_count >= self.config.padding_size {
103 if self.is_speech_active {
104 self.is_speech_active = false;
106 self.silence_count = 0;
107 self.speech_sender.send(self.buffer.drain(..).collect())?;
108 } else {
109 if self.silence_count != self.config.padding_size {
111 self.silence_count -= 1;
112 self.buffer.drain(..self.silero.config.chunk_size);
113 }
114 }
115 }
116 }
117 }
118 }
119 Ok(())
120 }
121}
122
123impl Default for VoiceActivityDetectionConfig {
124 fn default() -> Self {
125 Self {
126 speech_channel_size: 5,
127 silero_config: SileroConfig::default(),
128 padding_size: 4,
129 threshold: 0.01,
130 probability_channel_size: 0,
131 }
132 }
133}