// voice_engine/media/vad/mod.rs

1use crate::event::{EventSender, SessionEvent};
2use crate::media::processor::Processor;
3use crate::media::{AudioFrame, PcmBuf, Samples};
4use anyhow::Result;
5use serde::{Deserialize, Serialize};
6use serde_with::skip_serializing_none;
7use std::any::Any;
8use std::cell::RefCell;
9use tokio_util::sync::CancellationToken;
10
11pub(crate) mod simd;
12pub mod tiny_silero;
13pub mod tiny_ten;
14pub(crate) mod utils;
15pub use tiny_silero::TinySilero;
16pub use tiny_ten::TinyTen;
17
18#[cfg(test)]
19mod benchmark_all;
20#[cfg(test)]
21mod tests;
22
/// Configuration for the voice-activity-detection (VAD) processor.
///
/// Serialized as camelCase JSON; `None` fields are omitted on output
/// (`skip_serializing_none`) and missing fields fall back to the
/// `Default` impl (`serde(default)`).
#[skip_serializing_none]
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
#[serde(default)]
pub struct VADOption {
    // Which engine implementation to instantiate (silero / ten / other).
    pub r#type: VadType,
    // Input sample rate in Hz (default 16000).
    pub samplerate: u32,
    /// Padding before speech detection (in ms)
    pub speech_padding: u64,
    /// Padding after silence detection (in ms)
    pub silence_padding: u64,
    // Engine-specific ratio — semantics depend on the engine. TODO confirm.
    pub ratio: f32,
    // Probability threshold above which a frame counts as voiced —
    // presumably consumed by the engine implementations; verify there.
    pub voice_threshold: f32,
    // Upper bound for buffered audio, in seconds. NOTE(review): not used
    // in this file — confirm the engines enforce it.
    pub max_buffer_duration_secs: u64,
    /// Timeout duration for silence (in ms), None means disable this feature
    pub silence_timeout: Option<u64>,
    // Remote endpoint / credentials — presumably for cloud-backed
    // `VadType::Other` engines; unused in this file.
    pub endpoint: Option<String>,
    pub secret_key: Option<String>,
    pub secret_id: Option<String>,
}
43
44impl Default for VADOption {
45    fn default() -> Self {
46        Self {
47            r#type: VadType::Silero,
48            samplerate: 16000,
49            // Python defaults: min_speech_duration_ms=250, min_silence_duration_ms=100, speech_pad_ms=30
50            speech_padding: 250,  // min_speech_duration_ms
51            silence_padding: 100, // min_silence_duration_ms
52            ratio: 0.5,
53            voice_threshold: 0.5,
54            max_buffer_duration_secs: 50,
55            silence_timeout: None,
56            endpoint: None,
57            secret_key: None,
58            secret_id: None,
59        }
60    }
61}
62
63#[derive(Clone, Debug, Serialize, Eq, Hash, PartialEq)]
64#[serde(rename_all = "lowercase")]
65pub enum VadType {
66    Silero,
67    Ten,
68    Other(String),
69}
70
71impl<'de> Deserialize<'de> for VadType {
72    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
73    where
74        D: serde::Deserializer<'de>,
75    {
76        let value = String::deserialize(deserializer)?;
77        match value.as_str() {
78            "silero" => Ok(VadType::Silero),
79            "ten" => Ok(VadType::Ten),
80            _ => Ok(VadType::Other(value)),
81        }
82    }
83}
84
85impl std::fmt::Display for VadType {
86    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
87        match self {
88            VadType::Silero => write!(f, "silero"),
89            VadType::Ten => write!(f, "ten"),
90            VadType::Other(provider) => write!(f, "{}", provider),
91        }
92    }
93}
94
95impl TryFrom<&String> for VadType {
96    type Error = String;
97
98    fn try_from(value: &String) -> std::result::Result<Self, Self::Error> {
99        match value.as_str() {
100            "silero" => Ok(VadType::Silero),
101            "ten" => Ok(VadType::Ten),
102            other => Ok(VadType::Other(other.to_string())),
103        }
104    }
105}
// One buffered window of PCM samples plus the timestamp reported by the
// engine for that frame (compared against the ms-denominated paddings,
// so presumably ms — confirm against the engines).
struct SpeechBuf {
    samples: PcmBuf,
    timestamp: u64,
}
110
/// Mutable state of the VAD state machine: the engine, buffered speech
/// windows, and speech/silence trigger bookkeeping.
struct VadProcessorInner {
    vad: Box<dyn VadEngine>,
    // Emits `Speaking` / `Silence` session events.
    event_sender: EventSender,
    option: VADOption,
    // Recently seen frames, kept while speech is (potentially) active so
    // the eventual `Silence` event can carry the captured samples.
    window_bufs: Vec<SpeechBuf>,
    // True between speech onset and confirmed end-of-speech.
    triggered: bool,
    // Timestamp of the current speech segment's first frame.
    current_speech_start: Option<u64>,
    // Candidate end-of-speech timestamp (start of the current silence run).
    temp_end: Option<u64>,
}
/// `Processor` adapter that owns the VAD state behind a `RefCell` so the
/// `&self` `process_frame` can mutate it.
pub struct VadProcessor {
    inner: RefCell<VadProcessorInner>,
}
// SAFETY: `RefCell` is neither `Send` nor `Sync`-safe under concurrent
// access; these impls assert that each `VadProcessor` is only ever used
// from one thread at a time. NOTE(review): that invariant is not
// enforced anywhere visible in this file — if the media pipeline can
// call `process_frame` concurrently, this is a data race (or a
// `borrow_mut` panic). Consider `Mutex<VadProcessorInner>` instead;
// confirm the threading model with the pipeline owner.
unsafe impl Send for VadProcessor {}
unsafe impl Sync for VadProcessor {}
125
/// A pluggable voice-activity detector.
pub trait VadEngine: Send + Sync + Any {
    /// Feed one audio frame; returns `(is_speaking, timestamp)` for the
    /// frame, or `None` when the engine has no verdict for it.
    fn process(&mut self, frame: &mut AudioFrame) -> Option<(bool, u64)>;
}
129
impl VadProcessorInner {
    /// Run one audio frame through the engine and the speech/silence
    /// state machine. Non-PCM frames are ignored.
    pub fn process_frame(&mut self, frame: &mut AudioFrame) -> Result<()> {
        let samples = match &frame.samples {
            Samples::PCM { samples } => samples,
            _ => return Ok(()),
        };

        // Copy the samples first: the engine takes `&mut frame` next and
        // may rewrite it.
        let samples = samples.to_owned();
        let result = self.vad.process(frame);
        if let Some((is_speaking, timestamp)) = result {
            // Buffer audio while speaking, or while still inside a
            // triggered segment (trailing silence before confirmation).
            if is_speaking || self.triggered {
                let current_buf = SpeechBuf { samples, timestamp };
                self.window_bufs.push(current_buf);
            }
            self.process_vad_logic(is_speaking, timestamp, &frame.track_id)?;

            // Clean up old buffers periodically
            if self.window_bufs.len() > 1000 || !self.triggered {
                let cutoff = if self.triggered {
                    // Overflow mid-speech: keep only the most recent 5 s.
                    timestamp.saturating_sub(5000)
                } else {
                    // Idle: keep just enough history to cover the
                    // silence-padding window.
                    timestamp.saturating_sub(self.option.silence_padding)
                };
                self.window_bufs.retain(|buf| buf.timestamp > cutoff);
            }
        }

        Ok(())
    }

    /// Speech/silence state machine.
    ///
    /// Emits `Speaking` on speech onset; emits `Silence` (with the
    /// captured samples) once silence has lasted `silence_padding` ms
    /// and the speech segment was at least `speech_padding` ms long;
    /// additionally emits periodic sample-less `Silence` events every
    /// `silence_timeout` ms when that option is set. Timestamps are in
    /// the engine's clock (ms granularity implied by the option fields).
    fn process_vad_logic(
        &mut self,
        is_speaking: bool,
        timestamp: u64,
        track_id: &str,
    ) -> Result<()> {
        if is_speaking && !self.triggered {
            // Speech onset: remember the start and announce it.
            self.triggered = true;
            self.current_speech_start = Some(timestamp);
            let event = SessionEvent::Speaking {
                track_id: track_id.to_string(),
                timestamp: crate::media::get_timestamp(),
                start_time: timestamp,
            };
            // Event delivery is best-effort; a closed receiver is ignored.
            self.event_sender.send(event).ok();
        } else if !is_speaking {
            // Mark the start of this silence run (if not already marked).
            if self.temp_end.is_none() {
                self.temp_end = Some(timestamp);
            }

            if let Some(temp_end) = self.temp_end {
                // Use saturating_sub to handle timestamp wrapping or out-of-order frames
                let silence_duration = timestamp.saturating_sub(temp_end);

                // Process regular silence detection for speech segments
                if self.triggered && silence_duration >= self.option.silence_padding {
                    if let Some(start_time) = self.current_speech_start {
                        // Use safe duration calculation
                        let duration = temp_end.saturating_sub(start_time);
                        // Segments shorter than speech_padding are
                        // discarded without an event.
                        if duration >= self.option.speech_padding {
                            // Gather the buffered samples spanning
                            // [start_time, temp_end].
                            let samples_vec = self
                                .window_bufs
                                .iter()
                                .filter(|buf| {
                                    buf.timestamp >= start_time && buf.timestamp <= temp_end
                                })
                                .flat_map(|buf| buf.samples.iter())
                                .cloned()
                                .collect();
                            self.window_bufs.clear();

                            let event = SessionEvent::Silence {
                                track_id: track_id.to_string(),
                                timestamp: crate::media::get_timestamp(),
                                start_time,
                                duration,
                                samples: Some(samples_vec),
                            };
                            self.event_sender.send(event).ok();
                        }
                    }
                    // Reset to idle; keep temp_end so silence-timeout
                    // tracking continues from here.
                    self.triggered = false;
                    self.current_speech_start = None;
                    self.temp_end = Some(timestamp); // Update temp_end for silence timeout tracking
                }

                // Process silence timeout if configured
                if let Some(timeout) = self.option.silence_timeout {
                    // Use same safe calculation for silence timeout
                    let timeout_duration = timestamp.saturating_sub(temp_end);

                    if timeout_duration >= timeout {
                        // Sample-less notification that silence has
                        // persisted for `timeout` ms.
                        let event = SessionEvent::Silence {
                            track_id: track_id.to_string(),
                            timestamp: crate::media::get_timestamp(),
                            start_time: temp_end,
                            duration: timeout_duration,
                            samples: None,
                        };
                        self.event_sender.send(event).ok();
                        // Restart the timeout window.
                        self.temp_end = Some(timestamp);
                    }
                }
            }
        } else if is_speaking && self.temp_end.is_some() {
            // Speech resumed before the silence confirmed: cancel the
            // pending end-of-speech candidate.
            self.temp_end = None;
        }

        Ok(())
    }
}
241
242impl VadProcessor {
243    pub fn create(
244        _token: CancellationToken,
245        event_sender: EventSender,
246        option: VADOption,
247    ) -> Result<Box<dyn Processor>> {
248        let vad: Box<dyn VadEngine> = match option.r#type {
249            VadType::Silero => Box::new(tiny_silero::TinySilero::new(option.clone())?),
250            VadType::Ten => Box::new(tiny_ten::TinyTen::new(option.clone())?),
251            _ => Box::new(NopVad::new()?),
252        };
253        Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
254    }
255
256    pub fn create_nop(
257        _token: CancellationToken,
258        event_sender: EventSender,
259        option: VADOption,
260    ) -> Result<Box<dyn Processor>> {
261        let vad: Box<dyn VadEngine> = match option.r#type {
262            _ => Box::new(NopVad::new()?),
263        };
264        Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
265    }
266
267    pub fn new(
268        engine: Box<dyn VadEngine>,
269        event_sender: EventSender,
270        option: VADOption,
271    ) -> Result<Self> {
272        let inner = VadProcessorInner {
273            vad: engine,
274            event_sender,
275            option,
276            window_bufs: Vec::new(),
277            triggered: false,
278            current_speech_start: None,
279            temp_end: None,
280        };
281        Ok(Self {
282            inner: RefCell::new(inner),
283        })
284    }
285}
286
impl Processor for VadProcessor {
    /// Delegate to the inner state machine.
    /// NOTE(review): `borrow_mut` panics on re-entrant or concurrent
    /// calls; this relies on the single-threaded-use assumption behind
    /// the `unsafe impl Send/Sync` above.
    fn process_frame(&self, frame: &mut AudioFrame) -> Result<()> {
        self.inner.borrow_mut().process_frame(frame)
    }
}
292
293struct NopVad {}
294
295impl NopVad {
296    pub fn new() -> Result<Self> {
297        Ok(Self {})
298    }
299}
300
301impl VadEngine for NopVad {
302    fn process(&mut self, frame: &mut AudioFrame) -> Option<(bool, u64)> {
303        let samples = match &frame.samples {
304            Samples::PCM { samples } => samples,
305            _ => return Some((false, frame.timestamp)),
306        };
307        // Check if there are any non-zero samples
308        let has_speech = samples.iter().any(|&x| x != 0);
309        Some((has_speech, frame.timestamp))
310    }
311}