Skip to main content

nihility_listener/
vad.rs

1mod silero;
2
3use crate::error::*;
4use crate::vad::silero::{Silero, SileroConfig};
5use serde::{Deserialize, Serialize};
6use std::collections::VecDeque;
7use tokio::sync::broadcast::{channel, Receiver, Sender};
8use tracing::{debug, error};
9
/// Configuration for voice activity detection.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceActivityDetectionConfig {
    /// Capacity of the broadcast channel that carries detected speech segments.
    pub speech_channel_size: usize,
    /// Capacity of the broadcast channel that carries per-chunk probabilities.
    /// A value of `0` means no probability channel is created.
    pub probability_channel_size: usize,
    /// Number of chunks used as padding: how many consecutive below-threshold
    /// chunks end an active speech segment, and how many idle chunks are kept
    /// in the buffer ahead of the next segment.
    pub padding_size: usize,
    /// A chunk whose predicted probability is greater than or equal to this
    /// value is treated as speech.
    pub threshold: f32,
    /// Configuration handed to `Silero::init`.
    pub silero_config: SileroConfig,
}
19
/// Voice activity detector.
///
/// Consumes single audio samples from a broadcast channel, evaluates them one
/// chunk at a time, and broadcasts each detected speech segment.
pub struct VoiceActivityDetection {
    /// Predictor that returns a probability for one chunk of samples.
    silero: Silero,
    /// Detector configuration (threshold, padding, channel sizes).
    config: VoiceActivityDetectionConfig,
    /// Source of incoming audio samples, one `f32` per message.
    sample_receiver: Receiver<f32>,
    /// Broadcasts every finished speech segment as a vector of samples.
    speech_sender: Sender<Vec<f32>>,
    /// Broadcasts the probability of every evaluated chunk; `None` when the
    /// configured probability channel size is zero.
    probability_sender: Option<Sender<f32>>,
    /// Samples retained since the last emitted segment (padding plus any
    /// speech currently in progress).
    buffer: VecDeque<f32>,
    /// Number of consecutive chunks that scored below the threshold.
    silence_count: usize,
    /// Whether a chunk at or above the threshold has been seen since the last
    /// segment was emitted.
    is_speech_active: bool,
    /// Number of samples received since the last chunk was evaluated.
    cumulative_sample_count: usize,
}
32
33impl VoiceActivityDetection {
34    /// 初始化语音活动检测实例,传入音频采样接收实例以及语音活动音频数据发送实例
35    pub fn init(
36        config: VoiceActivityDetectionConfig,
37        sample_receiver: Receiver<f32>,
38    ) -> Result<Self> {
39        debug!("Initializing NihilityAsrVad with config {:?}", &config);
40        let silero = Silero::init(config.silero_config.clone())?;
41        let buffer = VecDeque::with_capacity(silero.config.chunk_size * (config.padding_size + 2));
42        let (speech_sender, _) = channel(config.speech_channel_size);
43        let probability_sender = if config.probability_channel_size > 0 {
44            let (probability_sender, _) = channel(config.probability_channel_size);
45            Some(probability_sender)
46        } else {
47            None
48        };
49        Ok(Self {
50            silero,
51            config,
52            sample_receiver,
53            speech_sender,
54            probability_sender,
55            buffer,
56            silence_count: 0,
57            is_speech_active: false,
58            cumulative_sample_count: 0,
59        })
60    }
61
62    pub fn get_probability_receiver(&self) -> Result<Receiver<f32>> {
63        self.probability_sender
64            .as_ref()
65            .ok_or(NihilityListenerError::Init(
66                "probability channel size must greater than zero".to_string(),
67            ))
68            .map(|sender| sender.subscribe())
69    }
70
71    pub fn get_speech_receiver(&self) -> Receiver<Vec<f32>> {
72        self.speech_sender.subscribe()
73    }
74
75    /// 运行语音活动检测线程
76    pub async fn run(&mut self) -> Result<()> {
77        while let Ok(sample) = self.sample_receiver.recv().await {
78            self.buffer.push_back(sample);
79            self.cumulative_sample_count += 1;
80            // 当新样本数量积累到一个块大小时进行识别
81            if self.cumulative_sample_count == self.silero.config.chunk_size {
82                // 重置新样本数量计数
83                self.cumulative_sample_count = 0;
84                let probability = self.silero.predict(
85                    &self
86                        .buffer
87                        .range((self.buffer.len() - self.silero.config.chunk_size)..)
88                        .copied()
89                        .collect::<Vec<f32>>(),
90                )?;
91                if let Some(probability_sender) = self.probability_sender.as_ref()
92                    && let Err(e) = probability_sender.send(probability)
93                {
94                    error!("Error sending probability: {:?}", e);
95                }
96                // 当预测结果大于等于设定阈值
97                if probability >= self.config.threshold {
98                    self.is_speech_active = true;
99                    self.silence_count = 0;
100                } else {
101                    self.silence_count += 1;
102                    if self.silence_count >= self.config.padding_size {
103                        if self.is_speech_active {
104                            // 如果当前处于活动状态,标志活动语言结束,发送当前缓冲区内所有数据,并且重置缓冲区
105                            self.is_speech_active = false;
106                            self.silence_count = 0;
107                            self.speech_sender.send(self.buffer.drain(..).collect())?;
108                        } else {
109                            // 当静音块数量超过设置边界数量,移除最早的音频数据块
110                            if self.silence_count != self.config.padding_size {
111                                self.silence_count -= 1;
112                                self.buffer.drain(..self.silero.config.chunk_size);
113                            }
114                        }
115                    }
116                }
117            }
118        }
119        Ok(())
120    }
121}
122
123impl Default for VoiceActivityDetectionConfig {
124    fn default() -> Self {
125        Self {
126            speech_channel_size: 5,
127            silero_config: SileroConfig::default(),
128            padding_size: 4,
129            threshold: 0.01,
130            probability_channel_size: 0,
131        }
132    }
133}