// voice_engine/media/vad/mod.rs

1use crate::event::{EventSender, SessionEvent};
2use crate::media::processor::Processor;
3use crate::media::{AudioFrame, PcmBuf, Samples};
4use anyhow::Result;
5use serde::{Deserialize, Serialize};
6use serde_with::skip_serializing_none;
7use std::any::Any;
8use std::cell::RefCell;
9use tokio_util::sync::CancellationToken;
10
11pub(crate) mod simd;
12pub(crate) mod tiny_silero;
13pub(crate) mod tiny_ten;
14pub(crate) mod utils;
15
16#[cfg(test)]
17mod benchmark_all;
18#[cfg(test)]
19mod tests;
20
/// Configuration for the VAD processor and its underlying engine.
///
/// Serialized in camelCase; `None` optional fields are omitted and missing
/// fields fall back to `Default` (via `#[serde(default)]`).
#[skip_serializing_none]
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
#[serde(default)]
pub struct VADOption {
    // Which engine to run (silero / ten / other — see `VadType`).
    pub r#type: VadType,
    // Input sample rate in Hz (default 16000).
    pub samplerate: u32,
    /// Padding before speech detection (in ms)
    pub speech_padding: u64,
    /// Padding after silence detection (in ms)
    pub silence_padding: u64,
    // NOTE(review): ratio is not used in this module — presumably an
    // engine-side tuning knob; confirm against tiny_silero / tiny_ten.
    pub ratio: f32,
    // Probability threshold above which a frame counts as speech —
    // presumably consumed by the engines; TODO confirm.
    pub voice_threshold: f32,
    // NOTE(review): not referenced in this file; likely bounds engine-side
    // buffering — verify.
    pub max_buffer_duration_secs: u64,
    /// Timeout duration for silence (in ms), None means disable this feature
    pub silence_timeout: Option<u64>,
    // Remote-provider connection settings — presumably only meaningful for
    // `VadType::Other` backends; TODO confirm.
    pub endpoint: Option<String>,
    pub secret_key: Option<String>,
    pub secret_id: Option<String>,
}
41
42impl Default for VADOption {
43    fn default() -> Self {
44        Self {
45            r#type: VadType::Silero,
46            samplerate: 16000,
47            // Python defaults: min_speech_duration_ms=250, min_silence_duration_ms=100, speech_pad_ms=30
48            speech_padding: 250,  // min_speech_duration_ms
49            silence_padding: 100, // min_silence_duration_ms
50            ratio: 0.5,
51            voice_threshold: 0.5,
52            max_buffer_duration_secs: 50,
53            silence_timeout: None,
54            endpoint: None,
55            secret_key: None,
56            secret_id: None,
57        }
58    }
59}
60
/// Identifies which VAD engine implementation to use.
#[derive(Clone, Debug, Serialize, Eq, Hash, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum VadType {
    /// Silero-based engine (see `tiny_silero`).
    Silero,
    /// TEN-based engine (see `tiny_ten`).
    Ten,
    /// Any other provider name, preserved verbatim.
    Other(String),
}
68
69impl<'de> Deserialize<'de> for VadType {
70    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
71    where
72        D: serde::Deserializer<'de>,
73    {
74        let value = String::deserialize(deserializer)?;
75        match value.as_str() {
76            "silero" => Ok(VadType::Silero),
77            "ten" => Ok(VadType::Ten),
78            _ => Ok(VadType::Other(value)),
79        }
80    }
81}
82
83impl std::fmt::Display for VadType {
84    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
85        match self {
86            VadType::Silero => write!(f, "silero"),
87            VadType::Ten => write!(f, "ten"),
88            VadType::Other(provider) => write!(f, "{}", provider),
89        }
90    }
91}
92
93impl TryFrom<&String> for VadType {
94    type Error = String;
95
96    fn try_from(value: &String) -> std::result::Result<Self, Self::Error> {
97        match value.as_str() {
98            "silero" => Ok(VadType::Silero),
99            "ten" => Ok(VadType::Ten),
100            other => Ok(VadType::Other(other.to_string())),
101        }
102    }
103}
/// One buffered audio window retained while a speech segment is (possibly)
/// in progress; used to attach the utterance audio to the Silence event.
struct SpeechBuf {
    // Raw PCM samples of this window.
    samples: PcmBuf,
    // Engine-reported timestamp of the window (compared against ms-based
    // paddings below, so presumably milliseconds — TODO confirm).
    timestamp: u64,
}
108
/// Mutable state behind `VadProcessor`: the engine plus the
/// speech/silence detection state machine.
struct VadProcessorInner {
    // Underlying detection engine (silero / ten / nop).
    vad: Box<dyn VadEngine>,
    // Sink for `Speaking` / `Silence` session events.
    event_sender: EventSender,
    option: VADOption,
    // Audio windows buffered during (potential) speech segments.
    window_bufs: Vec<SpeechBuf>,
    // True while inside a speech segment.
    triggered: bool,
    // Engine timestamp at which the current speech segment started.
    current_speech_start: Option<u64>,
    // Candidate end-of-speech timestamp (first non-speaking frame seen).
    temp_end: Option<u64>,
}
/// `Processor` facade over `VadProcessorInner`, using `RefCell` for
/// interior mutability behind the `&self` `process_frame` signature.
pub struct VadProcessor {
    inner: RefCell<VadProcessorInner>,
}
// SAFETY: `RefCell` is neither `Send` nor `Sync`, so these impls assert —
// without compiler help — that frames for a given processor are only ever
// processed from one thread at a time. NOTE(review): if the pipeline can
// call `process_frame` concurrently this is a data race on the borrow
// flag; a `Mutex<VadProcessorInner>` would make this sound. Verify the
// pipeline's threading contract.
unsafe impl Send for VadProcessor {}
unsafe impl Sync for VadProcessor {}
123
/// Abstraction over a concrete VAD implementation.
pub trait VadEngine: Send + Sync + Any {
    /// Scores one frame. Returns `(is_speaking, timestamp)`, or `None`
    /// when the engine produced no decision for this frame.
    fn process(&mut self, frame: &mut AudioFrame) -> Option<(bool, u64)>;
}
127
impl VadProcessorInner {
    /// Feeds one frame to the VAD engine and advances the speech/silence
    /// state machine, buffering audio windows and pruning stale ones.
    ///
    /// Non-PCM frames are ignored. Always returns `Ok(())`; the `Result`
    /// is kept for signature symmetry with `Processor::process_frame`.
    pub fn process_frame(&mut self, frame: &mut AudioFrame) -> Result<()> {
        let samples = match &frame.samples {
            Samples::PCM { samples } => samples,
            // Not raw PCM — nothing for the engine to score.
            _ => return Ok(()),
        };

        // Copy the samples before handing the frame to the engine, which
        // takes `&mut AudioFrame` and may modify it.
        let samples = samples.to_owned();
        let result = self.vad.process(frame);
        if let Some((is_speaking, timestamp)) = result {
            // Buffer windows while speaking (or still inside a triggered
            // segment) so the Silence event can carry the utterance audio.
            if is_speaking || self.triggered {
                let current_buf = SpeechBuf { samples, timestamp };
                self.window_bufs.push(current_buf);
            }
            self.process_vad_logic(is_speaking, timestamp, &frame.track_id)?;

            // Clean up old buffers periodically
            if self.window_bufs.len() > 1000 || !self.triggered {
                let cutoff = if self.triggered {
                    // Mid-speech: keep only the trailing 5 s of windows.
                    timestamp.saturating_sub(5000)
                } else {
                    // Idle: keep just enough to cover the silence padding.
                    timestamp.saturating_sub(self.option.silence_padding)
                };
                self.window_bufs.retain(|buf| buf.timestamp > cutoff);
            }
        }

        Ok(())
    }

    /// State-machine transitions for a single engine decision.
    ///
    /// - silence → speech: emits `SessionEvent::Speaking`.
    /// - speech → sustained silence (≥ `silence_padding`): emits
    ///   `SessionEvent::Silence` with the buffered samples, provided the
    ///   segment lasted at least `speech_padding`.
    /// - optional `silence_timeout`: emits a samples-less `Silence` event
    ///   each time `timeout` ms of continuous silence elapse.
    fn process_vad_logic(
        &mut self,
        is_speaking: bool,
        timestamp: u64,
        track_id: &str,
    ) -> Result<()> {
        if is_speaking && !self.triggered {
            // Rising edge: speech just started.
            self.triggered = true;
            self.current_speech_start = Some(timestamp);
            let event = SessionEvent::Speaking {
                track_id: track_id.to_string(),
                timestamp: crate::media::get_timestamp(),
                start_time: timestamp,
            };
            // Event delivery is best-effort; a closed receiver is ignored.
            self.event_sender.send(event).ok();
        } else if !is_speaking {
            // Remember the first non-speaking frame as the candidate
            // end-of-speech / start-of-silence marker.
            if self.temp_end.is_none() {
                self.temp_end = Some(timestamp);
            }

            if let Some(temp_end) = self.temp_end {
                // Use saturating_sub to handle timestamp wrapping or out-of-order frames
                let silence_duration = timestamp.saturating_sub(temp_end);

                // Process regular silence detection for speech segments
                if self.triggered && silence_duration >= self.option.silence_padding {
                    if let Some(start_time) = self.current_speech_start {
                        // Use safe duration calculation
                        let duration = temp_end.saturating_sub(start_time);
                        // Segments shorter than speech_padding are dropped
                        // without any event (treated as spurious triggers).
                        if duration >= self.option.speech_padding {
                            let samples_vec = self
                                .window_bufs
                                .iter()
                                .filter(|buf| {
                                    buf.timestamp >= start_time && buf.timestamp <= temp_end
                                })
                                .flat_map(|buf| buf.samples.iter())
                                .cloned()
                                .collect();
                            self.window_bufs.clear();

                            let event = SessionEvent::Silence {
                                track_id: track_id.to_string(),
                                timestamp: crate::media::get_timestamp(),
                                start_time,
                                duration,
                                samples: Some(samples_vec),
                            };
                            self.event_sender.send(event).ok();
                        }
                    }
                    self.triggered = false;
                    self.current_speech_start = None;
                    self.temp_end = Some(timestamp); // Update temp_end for silence timeout tracking
                }

                // Process silence timeout if configured
                if let Some(timeout) = self.option.silence_timeout {
                    // Use same safe calculation for silence timeout
                    let timeout_duration = timestamp.saturating_sub(temp_end);

                    if timeout_duration >= timeout {
                        let event = SessionEvent::Silence {
                            track_id: track_id.to_string(),
                            timestamp: crate::media::get_timestamp(),
                            start_time: temp_end,
                            duration: timeout_duration,
                            samples: None,
                        };
                        self.event_sender.send(event).ok();
                        // Restart the timeout window so repeated events fire
                        // every `timeout` ms of continued silence.
                        self.temp_end = Some(timestamp);
                    }
                }
            }
        } else if is_speaking && self.temp_end.is_some() {
            // Speech resumed while already triggered: discard the candidate
            // end-of-speech marker.
            self.temp_end = None;
        }

        Ok(())
    }
}
239
240impl VadProcessor {
241    pub fn create(
242        _token: CancellationToken,
243        event_sender: EventSender,
244        option: VADOption,
245    ) -> Result<Box<dyn Processor>> {
246        let vad: Box<dyn VadEngine> = match option.r#type {
247            VadType::Silero => Box::new(tiny_silero::TinySilero::new(option.clone())?),
248            VadType::Ten => Box::new(tiny_ten::TinyVad::new(option.clone())?),
249            _ => Box::new(NopVad::new()?),
250        };
251        Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
252    }
253
254    pub fn create_nop(
255        _token: CancellationToken,
256        event_sender: EventSender,
257        option: VADOption,
258    ) -> Result<Box<dyn Processor>> {
259        let vad: Box<dyn VadEngine> = match option.r#type {
260            _ => Box::new(NopVad::new()?),
261        };
262        Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
263    }
264
265    pub fn new(
266        engine: Box<dyn VadEngine>,
267        event_sender: EventSender,
268        option: VADOption,
269    ) -> Result<Self> {
270        let inner = VadProcessorInner {
271            vad: engine,
272            event_sender,
273            option,
274            window_bufs: Vec::new(),
275            triggered: false,
276            current_speech_start: None,
277            temp_end: None,
278        };
279        Ok(Self {
280            inner: RefCell::new(inner),
281        })
282    }
283}
284
impl Processor for VadProcessor {
    /// Forwards the frame to the inner state machine.
    ///
    /// NOTE(review): `borrow_mut` panics on re-entrant or concurrent
    /// access; combined with the unsafe `Send`/`Sync` impls above, this
    /// relies on the pipeline calling `process_frame` from one thread at a
    /// time — confirm that contract.
    fn process_frame(&self, frame: &mut AudioFrame) -> Result<()> {
        self.inner.borrow_mut().process_frame(frame)
    }
}
290
/// Fallback engine used when no real VAD backend is selected; performs a
/// trivial non-zero-sample check instead of model inference.
struct NopVad {}

impl NopVad {
    // Infallible; `Result` kept to match the real engines' constructors.
    pub fn new() -> Result<Self> {
        Ok(Self {})
    }
}
298
299impl VadEngine for NopVad {
300    fn process(&mut self, frame: &mut AudioFrame) -> Option<(bool, u64)> {
301        let samples = match &frame.samples {
302            Samples::PCM { samples } => samples,
303            _ => return Some((false, frame.timestamp)),
304        };
305        // Check if there are any non-zero samples
306        let has_speech = samples.iter().any(|&x| x != 0);
307        Some((has_speech, frame.timestamp))
308    }
309}