nihility_listener/
vad.rs

1mod silero;
2
3use crate::error::*;
4use crate::vad::silero::{Silero, SileroConfig};
5use serde::{Deserialize, Serialize};
6use std::collections::VecDeque;
7use tokio::sync::broadcast;
8use tokio::sync::broadcast::{Receiver, Sender};
9use tracing::{debug, error};
10
11/// 语音活动检测配置
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct VoiceActivityDetectionConfig {
14    pub speech_channel_size: usize,
15    pub probability_channel_size: usize,
16    pub padding_size: usize,
17    pub threshold: f32,
18    pub silero_config: SileroConfig,
19}
20
21/// 语音活动检测
22pub struct VoiceActivityDetection {
23    silero: Silero,
24    config: VoiceActivityDetectionConfig,
25    sample_receiver: Receiver<f32>,
26    speech_sender: Sender<Vec<f32>>,
27    probability_sender: Option<Sender<f32>>,
28    buffer: VecDeque<f32>,
29    silence_count: usize,
30    is_speech_active: bool,
31    cumulative_sample_count: usize,
32}
33
34impl VoiceActivityDetection {
35    /// 初始化语音活动检测实例,传入音频采样接收实例以及语音活动音频数据发送实例
36    pub fn init(
37        config: VoiceActivityDetectionConfig,
38        sample_receiver: Receiver<f32>,
39    ) -> Result<Self> {
40        debug!("Initializing NihilityAsrVad with config {:?}", &config);
41        let silero = Silero::init(config.silero_config.clone())?;
42        let buffer = VecDeque::with_capacity(silero.config.chunk_size * (config.padding_size + 2));
43        let (speech_sender, _) = broadcast::channel(config.speech_channel_size);
44        let probability_sender = if config.probability_channel_size > 0 {
45            let (probability_sender, _) = broadcast::channel(config.probability_channel_size);
46            Some(probability_sender)
47        } else {
48            None
49        };
50        Ok(Self {
51            silero,
52            config,
53            sample_receiver,
54            speech_sender,
55            probability_sender,
56            buffer,
57            silence_count: 0,
58            is_speech_active: false,
59            cumulative_sample_count: 0,
60        })
61    }
62
63    pub fn get_probability_receiver(&self) -> Result<Receiver<f32>> {
64        self.probability_sender
65            .as_ref()
66            .ok_or(NihilityAsrError::Init(
67                "probability channel size must greater than zero".to_string(),
68            ))
69            .map(|sender| sender.subscribe())
70    }
71
72    pub fn get_speech_receiver(&self) -> Receiver<Vec<f32>> {
73        self.speech_sender.subscribe()
74    }
75
76    /// 运行语音活动检测线程
77    pub async fn run(&mut self) -> Result<()> {
78        while let Ok(sample) = self.sample_receiver.recv().await {
79            self.buffer.push_back(sample);
80            self.cumulative_sample_count += 1;
81            // 当新样本数量积累到一个块大小时进行识别
82            if self.cumulative_sample_count == self.silero.config.chunk_size {
83                // 重置新样本数量计数
84                self.cumulative_sample_count = 0;
85                let probability = self.silero.predict(
86                    &self
87                        .buffer
88                        .range((self.buffer.len() - self.silero.config.chunk_size)..)
89                        .copied()
90                        .collect::<Vec<f32>>(),
91                )?;
92                if let Some(probability_sender) = self.probability_sender.as_ref()
93                    && let Err(e) = probability_sender.send(probability)
94                {
95                    error!("Error sending probability: {:?}", e);
96                }
97                // 当预测结果大于等于设定阈值
98                if probability >= self.config.threshold {
99                    self.is_speech_active = true;
100                    self.silence_count = 0;
101                } else {
102                    self.silence_count += 1;
103                    if self.silence_count >= self.config.padding_size {
104                        if self.is_speech_active {
105                            // 如果当前处于活动状态,标志活动语言结束,发送当前缓冲区内所有数据,并且重置缓冲区
106                            self.is_speech_active = false;
107                            self.silence_count = 0;
108                            self.speech_sender.send(self.buffer.drain(..).collect())?;
109                        } else {
110                            // 当静音块数量超过设置边界数量,移除最早的音频数据块
111                            if self.silence_count != self.config.padding_size {
112                                self.silence_count -= 1;
113                                self.buffer.drain(..self.silero.config.chunk_size);
114                            }
115                        }
116                    }
117                }
118            }
119        }
120        Ok(())
121    }
122}
123
124impl Default for VoiceActivityDetectionConfig {
125    fn default() -> Self {
126        Self {
127            speech_channel_size: 5,
128            silero_config: SileroConfig::default(),
129            padding_size: 4,
130            threshold: 0.001,
131            probability_channel_size: 0,
132        }
133    }
134}