1mod silero;
2
3use crate::error::*;
4use crate::vad::silero::{Silero, SileroConfig};
5use serde::{Deserialize, Serialize};
6use std::collections::VecDeque;
7use tokio::sync::broadcast;
8use tokio::sync::broadcast::{Receiver, Sender};
9use tracing::{debug, error};
10
/// Settings for [`VoiceActivityDetection`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceActivityDetectionConfig {
    /// Capacity of the broadcast channel on which detected speech segments are sent.
    pub speech_channel_size: usize,
    /// Capacity of the broadcast channel on which per-chunk probabilities are sent.
    /// A value of `0` means no probability channel is created.
    pub probability_channel_size: usize,
    /// Number of consecutive below-threshold chunks after which an active speech
    /// segment is emitted; while no speech is active, the buffer is trimmed so it
    /// does not keep more than this many trailing chunks.
    pub padding_size: usize,
    /// A chunk whose predicted probability is greater than or equal to this value
    /// is treated as speech.
    pub threshold: f32,
    /// Configuration handed to the Silero model when the detector is initialized.
    pub silero_config: SileroConfig,
}
20
/// Voice activity detector: reads individual samples from a broadcast channel,
/// scores them chunk by chunk with a Silero model, and broadcasts the buffered
/// samples of each detected speech segment.
pub struct VoiceActivityDetection {
    /// Model used to predict a probability for each complete chunk.
    silero: Silero,
    /// Detector settings (threshold, padding, channel sizes).
    config: VoiceActivityDetectionConfig,
    /// Source of incoming samples, one `f32` per message.
    sample_receiver: Receiver<f32>,
    /// Sender on which each finished speech segment is broadcast.
    speech_sender: Sender<Vec<f32>>,
    /// Sender for per-chunk probabilities; `None` when the configured
    /// probability channel size is zero.
    probability_sender: Option<Sender<f32>>,
    /// Samples received so far that have not been emitted or discarded.
    buffer: VecDeque<f32>,
    /// Count of consecutive chunks that scored below the threshold.
    silence_count: usize,
    /// Set when a chunk scores at or above the threshold; cleared when the
    /// segment is emitted.
    is_speech_active: bool,
    /// Samples received since the last prediction; reset to zero each time it
    /// reaches the model's chunk size.
    cumulative_sample_count: usize,
}
33
34impl VoiceActivityDetection {
35 pub fn init(
37 config: VoiceActivityDetectionConfig,
38 sample_receiver: Receiver<f32>,
39 ) -> Result<Self> {
40 debug!("Initializing NihilityAsrVad with config {:?}", &config);
41 let silero = Silero::init(config.silero_config.clone())?;
42 let buffer = VecDeque::with_capacity(silero.config.chunk_size * (config.padding_size + 2));
43 let (speech_sender, _) = broadcast::channel(config.speech_channel_size);
44 let probability_sender = if config.probability_channel_size > 0 {
45 let (probability_sender, _) = broadcast::channel(config.probability_channel_size);
46 Some(probability_sender)
47 } else {
48 None
49 };
50 Ok(Self {
51 silero,
52 config,
53 sample_receiver,
54 speech_sender,
55 probability_sender,
56 buffer,
57 silence_count: 0,
58 is_speech_active: false,
59 cumulative_sample_count: 0,
60 })
61 }
62
63 pub fn get_probability_receiver(&self) -> Result<Receiver<f32>> {
64 self.probability_sender
65 .as_ref()
66 .ok_or(NihilityAsrError::Init(
67 "probability channel size must greater than zero".to_string(),
68 ))
69 .map(|sender| sender.subscribe())
70 }
71
72 pub fn get_speech_receiver(&self) -> Receiver<Vec<f32>> {
73 self.speech_sender.subscribe()
74 }
75
76 pub async fn run(&mut self) -> Result<()> {
78 while let Ok(sample) = self.sample_receiver.recv().await {
79 self.buffer.push_back(sample);
80 self.cumulative_sample_count += 1;
81 if self.cumulative_sample_count == self.silero.config.chunk_size {
83 self.cumulative_sample_count = 0;
85 let probability = self.silero.predict(
86 &self
87 .buffer
88 .range((self.buffer.len() - self.silero.config.chunk_size)..)
89 .copied()
90 .collect::<Vec<f32>>(),
91 )?;
92 if let Some(probability_sender) = self.probability_sender.as_ref()
93 && let Err(e) = probability_sender.send(probability)
94 {
95 error!("Error sending probability: {:?}", e);
96 }
97 if probability >= self.config.threshold {
99 self.is_speech_active = true;
100 self.silence_count = 0;
101 } else {
102 self.silence_count += 1;
103 if self.silence_count >= self.config.padding_size {
104 if self.is_speech_active {
105 self.is_speech_active = false;
107 self.silence_count = 0;
108 self.speech_sender.send(self.buffer.drain(..).collect())?;
109 } else {
110 if self.silence_count != self.config.padding_size {
112 self.silence_count -= 1;
113 self.buffer.drain(..self.silero.config.chunk_size);
114 }
115 }
116 }
117 }
118 }
119 }
120 Ok(())
121 }
122}
123
124impl Default for VoiceActivityDetectionConfig {
125 fn default() -> Self {
126 Self {
127 speech_channel_size: 5,
128 silero_config: SileroConfig::default(),
129 padding_size: 4,
130 threshold: 0.001,
131 probability_channel_size: 0,
132 }
133 }
134}