// active_call/media/vad/mod.rs

1use crate::event::{EventSender, SessionEvent};
2use crate::media::processor::Processor;
3use crate::media::{AudioFrame, PcmBuf, Samples};
4use anyhow::Result;
5use serde::{Deserialize, Serialize};
6use serde_with::skip_serializing_none;
7use std::any::Any;
8use std::cell::RefCell;
9use tokio_util::sync::CancellationToken;
10
// VAD engine implementations and shared helpers.
pub(crate) mod simd;
pub mod tiny_silero;
pub mod tiny_ten;
pub(crate) mod utils;
// Re-export the concrete engines at the module root for convenience.
pub use tiny_silero::TinySilero;
pub use tiny_ten::TinyTen;

#[cfg(test)]
mod benchmark_all;
#[cfg(test)]
mod tests;
22
/// Configuration for the VAD processor and its engine.
///
/// `#[serde(default)]` lets callers omit any field; `skip_serializing_none`
/// drops `None` optionals from the serialized form.
#[skip_serializing_none]
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "camelCase")]
#[serde(default)]
pub struct VADOption {
    /// Which engine to use (silero / ten / other).
    pub r#type: VadType,
    /// Input sample rate in Hz.
    pub samplerate: u32,
    /// Padding before speech detection (in ms)
    /// (used as the minimum speech-segment duration in `process_vad_logic`).
    pub speech_padding: u64,
    /// Padding after silence detection (in ms)
    /// (silence must last this long before a segment is closed).
    pub silence_padding: u64,
    // NOTE(review): semantics engine-specific; not referenced in this file.
    pub ratio: f32,
    /// Probability threshold above which a frame counts as speech
    /// (interpreted by the individual engines).
    pub voice_threshold: f32,
    // NOTE(review): not referenced in this file — presumably an engine-side cap.
    pub max_buffer_duration_secs: u64,
    /// Timeout duration for silence (in ms), None means disable this feature
    pub silence_timeout: Option<u64>,
    // Engine-specific remote-service settings (endpoint + credentials);
    // unused by the logic in this file.
    pub endpoint: Option<String>,
    pub secret_key: Option<String>,
    pub secret_id: Option<String>,
}
43
44impl Default for VADOption {
45    fn default() -> Self {
46        Self {
47            r#type: VadType::Silero,
48            samplerate: 16000,
49            // Python defaults: min_speech_duration_ms=250, min_silence_duration_ms=100, speech_pad_ms=30
50            speech_padding: 250,  // min_speech_duration_ms
51            silence_padding: 100, // min_silence_duration_ms
52            ratio: 0.5,
53            voice_threshold: 0.5,
54            max_buffer_duration_secs: 50,
55            silence_timeout: None,
56            endpoint: None,
57            secret_key: None,
58            secret_id: None,
59        }
60    }
61}
62
/// Identifies which VAD engine implementation to use.
#[derive(Clone, Debug, Serialize, Eq, Hash, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum VadType {
    Silero,
    Ten,
    /// Any unrecognized provider name, preserved verbatim.
    Other(String),
}
70
71impl<'de> Deserialize<'de> for VadType {
72    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
73    where
74        D: serde::Deserializer<'de>,
75    {
76        let value = String::deserialize(deserializer)?;
77        match value.as_str() {
78            "silero" => Ok(VadType::Silero),
79            "ten" => Ok(VadType::Ten),
80            _ => Ok(VadType::Other(value)),
81        }
82    }
83}
84
85impl std::fmt::Display for VadType {
86    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
87        match self {
88            VadType::Silero => write!(f, "silero"),
89            VadType::Ten => write!(f, "ten"),
90            VadType::Other(provider) => write!(f, "{}", provider),
91        }
92    }
93}
94
95impl TryFrom<&String> for VadType {
96    type Error = String;
97
98    fn try_from(value: &String) -> std::result::Result<Self, Self::Error> {
99        match value.as_str() {
100            "silero" => Ok(VadType::Silero),
101            "ten" => Ok(VadType::Ten),
102            other => Ok(VadType::Other(other.to_string())),
103        }
104    }
105}
/// One buffered chunk of PCM samples tagged with the VAD timestamp (ms)
/// it was observed at.
struct SpeechBuf {
    samples: PcmBuf,
    timestamp: u64,
}
110
/// Mutable VAD state: the engine plus speech-segmentation bookkeeping.
struct VadProcessorInner {
    vad: Box<dyn VadEngine>,
    event_sender: EventSender,
    option: VADOption,
    /// Frames buffered while speech is active (plus trailing context).
    window_bufs: Vec<SpeechBuf>,
    /// True while we are inside a speech segment.
    triggered: bool,
    /// Timestamp (ms) at which the current speech segment began.
    current_speech_start: Option<u64>,
    /// Timestamp (ms) at which the current stretch of silence began.
    temp_end: Option<u64>,
}
/// Processor wrapper that owns the VAD state behind a `RefCell`.
pub struct VadProcessor {
    inner: RefCell<VadProcessorInner>,
}
// SAFETY: NOTE(review) — these impls hand-assert thread-safety for a type
// built on `RefCell`, which provides no synchronization. This is only sound
// if the pipeline guarantees `process_frame` is never called concurrently
// from two threads; confirm that invariant or replace `RefCell` with `Mutex`.
unsafe impl Send for VadProcessor {}
unsafe impl Sync for VadProcessor {}
125
/// A pluggable voice-activity-detection engine.
pub trait VadEngine: Send + Sync + Any {
    /// Analyzes one audio frame and returns zero or more
    /// `(is_speaking, timestamp_ms)` decisions for it.
    fn process(&mut self, frame: &mut AudioFrame) -> Vec<(bool, u64)>;
}
129
impl VadProcessorInner {
    /// Feeds one PCM frame to the engine and reacts to each
    /// `(is_speaking, timestamp)` decision: buffers samples while speech is
    /// active, runs the segmentation state machine, and periodically prunes
    /// stale buffers. Non-PCM frames are ignored.
    pub fn process_frame(&mut self, frame: &mut AudioFrame) -> Result<()> {
        let samples = match &frame.samples {
            Samples::PCM { samples } => samples,
            _ => return Ok(()),
        };

        // Clone once up front: the engine call below borrows `frame` mutably,
        // so we cannot keep borrowing `samples` across it.
        let samples_cloned = samples.to_owned();
        let results = self.vad.process(frame);
        for (is_speaking, timestamp) in results {
            if is_speaking || self.triggered {
                // Buffer the whole frame under this decision's timestamp so a
                // finished speech segment can be reassembled later.
                // NOTE(review): if the engine yields several decisions per
                // frame, the same samples are buffered once per decision —
                // confirm this duplication is intended.
                let current_buf = SpeechBuf {
                    samples: samples_cloned.clone(),
                    timestamp,
                };
                self.window_bufs.push(current_buf);
            }
            self.process_vad_logic(is_speaking, timestamp, &frame.track_id)?;

            // Clean up old buffers periodically
            if self.window_bufs.len() > 1000 || !self.triggered {
                let cutoff = if self.triggered {
                    // Mid-speech: keep at most the last 5 seconds.
                    timestamp.saturating_sub(5000)
                } else {
                    // Idle: keep only the pre-speech padding window.
                    timestamp.saturating_sub(self.option.silence_padding)
                };
                self.window_bufs.retain(|buf| buf.timestamp > cutoff);
            }
        }

        Ok(())
    }

    /// Segmentation state machine driven by per-decision VAD output.
    ///
    /// Transitions:
    /// - silence -> speech: emit `SessionEvent::Speaking`, remember start.
    /// - speech -> sustained silence (>= `silence_padding` ms): if the segment
    ///   lasted at least `speech_padding` ms, emit `SessionEvent::Silence`
    ///   carrying the buffered samples; then reset to idle.
    /// - optional `silence_timeout`: emit sample-less `Silence` events
    ///   repeatedly while silence persists.
    fn process_vad_logic(
        &mut self,
        is_speaking: bool,
        timestamp: u64,
        track_id: &str,
    ) -> Result<()> {
        if is_speaking && !self.triggered {
            // Rising edge: speech just started.
            self.triggered = true;
            self.current_speech_start = Some(timestamp);
            let event = SessionEvent::Speaking {
                track_id: track_id.to_string(),
                timestamp: crate::media::get_timestamp(),
                start_time: timestamp,
            };
            // Event delivery is best-effort; a closed channel is ignored.
            self.event_sender.send(event).ok();
        } else if !is_speaking {
            // Remember when the current stretch of silence began.
            if self.temp_end.is_none() {
                self.temp_end = Some(timestamp);
            }

            if let Some(temp_end) = self.temp_end {
                // Use saturating_sub to handle timestamp wrapping or out-of-order frames
                let silence_duration = timestamp.saturating_sub(temp_end);

                // Process regular silence detection for speech segments
                if self.triggered && silence_duration >= self.option.silence_padding {
                    if let Some(start_time) = self.current_speech_start {
                        // Use safe duration calculation
                        let duration = temp_end.saturating_sub(start_time);
                        if duration >= self.option.speech_padding {
                            // Stitch the buffered frames of this segment back
                            // into one contiguous sample vector.
                            let samples_vec = self
                                .window_bufs
                                .iter()
                                .filter(|buf| {
                                    buf.timestamp >= start_time && buf.timestamp <= temp_end
                                })
                                .flat_map(|buf| buf.samples.iter())
                                .cloned()
                                .collect();
                            self.window_bufs.clear();

                            let event = SessionEvent::Silence {
                                track_id: track_id.to_string(),
                                timestamp: crate::media::get_timestamp(),
                                start_time,
                                duration,
                                samples: Some(samples_vec),
                            };
                            self.event_sender.send(event).ok();
                        }
                    }
                    self.triggered = false;
                    self.current_speech_start = None;
                    self.temp_end = Some(timestamp); // Update temp_end for silence timeout tracking
                }

                // Process silence timeout if configured
                if let Some(timeout) = self.option.silence_timeout {
                    // Use same safe calculation for silence timeout
                    let timeout_duration = timestamp.saturating_sub(temp_end);

                    if timeout_duration >= timeout {
                        let event = SessionEvent::Silence {
                            track_id: track_id.to_string(),
                            timestamp: crate::media::get_timestamp(),
                            start_time: temp_end,
                            duration: timeout_duration,
                            samples: None,
                        };
                        self.event_sender.send(event).ok();
                        // Restart the timeout window so events repeat every
                        // `timeout` ms while silence continues.
                        self.temp_end = Some(timestamp);
                    }
                }
            }
        } else if is_speaking && self.temp_end.is_some() {
            // Speech resumed before the silence threshold: drop the marker.
            self.temp_end = None;
        }

        Ok(())
    }
}
244
245impl VadProcessor {
246    pub fn create(
247        _token: CancellationToken,
248        event_sender: EventSender,
249        option: VADOption,
250    ) -> Result<Box<dyn Processor>> {
251        let vad: Box<dyn VadEngine> = match option.r#type {
252            VadType::Silero => Box::new(tiny_silero::TinySilero::new(option.clone())?),
253            VadType::Ten => Box::new(tiny_ten::TinyTen::new(option.clone())?),
254            _ => Box::new(NopVad::new()?),
255        };
256        Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
257    }
258
259    pub fn create_nop(
260        _token: CancellationToken,
261        event_sender: EventSender,
262        option: VADOption,
263    ) -> Result<Box<dyn Processor>> {
264        let vad: Box<dyn VadEngine> = match option.r#type {
265            _ => Box::new(NopVad::new()?),
266        };
267        Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
268    }
269
270    pub fn new(
271        engine: Box<dyn VadEngine>,
272        event_sender: EventSender,
273        option: VADOption,
274    ) -> Result<Self> {
275        let inner = VadProcessorInner {
276            vad: engine,
277            event_sender,
278            option,
279            window_bufs: Vec::new(),
280            triggered: false,
281            current_speech_start: None,
282            temp_end: None,
283        };
284        Ok(Self {
285            inner: RefCell::new(inner),
286        })
287    }
288}
289
impl Processor for VadProcessor {
    /// Delegates to the inner state machine.
    ///
    /// NOTE(review): `borrow_mut` panics if this is ever re-entered or called
    /// concurrently — see the `unsafe impl Send/Sync` note on the struct.
    fn process_frame(&self, frame: &mut AudioFrame) -> Result<()> {
        self.inner.borrow_mut().process_frame(frame)
    }
}
295
296struct NopVad {}
297
298impl NopVad {
299    pub fn new() -> Result<Self> {
300        Ok(Self {})
301    }
302}
303
304impl VadEngine for NopVad {
305    fn process(&mut self, frame: &mut AudioFrame) -> Vec<(bool, u64)> {
306        let samples = match &frame.samples {
307            Samples::PCM { samples } => samples,
308            _ => return vec![(false, frame.timestamp)],
309        };
310        // Check if there are any non-zero samples
311        let has_speech = samples.iter().any(|&x| x != 0);
312        vec![(has_speech, frame.timestamp)]
313    }
314}