1use crate::event::{EventSender, SessionEvent};
2use crate::media::processor::Processor;
3use crate::media::{AudioFrame, PcmBuf, Samples};
4use anyhow::Result;
5use serde::{Deserialize, Serialize};
6use serde_with::skip_serializing_none;
7use std::any::Any;
8use std::cell::RefCell;
9use tokio_util::sync::CancellationToken;
10
11pub(crate) mod simd;
12pub mod tiny_silero;
13pub mod tiny_ten;
14pub(crate) mod utils;
15pub use tiny_silero::TinySilero;
16pub use tiny_ten::TinyTen;
17
18#[cfg(test)]
19mod benchmark_all;
20#[cfg(test)]
21mod tests;
22
23#[skip_serializing_none]
24#[derive(Clone, Debug, Deserialize, Serialize)]
25#[serde(rename_all = "camelCase")]
26#[serde(default)]
27pub struct VADOption {
28 pub r#type: VadType,
29 pub samplerate: u32,
30 pub speech_padding: u64,
32 pub silence_padding: u64,
34 pub ratio: f32,
35 pub voice_threshold: f32,
36 pub max_buffer_duration_secs: u64,
37 pub silence_timeout: Option<u64>,
39 pub endpoint: Option<String>,
40 pub secret_key: Option<String>,
41 pub secret_id: Option<String>,
42}
43
44impl Default for VADOption {
45 fn default() -> Self {
46 Self {
47 r#type: VadType::Silero,
48 samplerate: 16000,
49 speech_padding: 250, silence_padding: 100, ratio: 0.5,
53 voice_threshold: 0.5,
54 max_buffer_duration_secs: 50,
55 silence_timeout: None,
56 endpoint: None,
57 secret_key: None,
58 secret_id: None,
59 }
60 }
61}
62
63#[derive(Clone, Debug, Serialize, Eq, Hash, PartialEq)]
64#[serde(rename_all = "lowercase")]
65pub enum VadType {
66 Silero,
67 Ten,
68 Other(String),
69}
70
71impl<'de> Deserialize<'de> for VadType {
72 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
73 where
74 D: serde::Deserializer<'de>,
75 {
76 let value = String::deserialize(deserializer)?;
77 match value.as_str() {
78 "silero" => Ok(VadType::Silero),
79 "ten" => Ok(VadType::Ten),
80 _ => Ok(VadType::Other(value)),
81 }
82 }
83}
84
85impl std::fmt::Display for VadType {
86 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
87 match self {
88 VadType::Silero => write!(f, "silero"),
89 VadType::Ten => write!(f, "ten"),
90 VadType::Other(provider) => write!(f, "{}", provider),
91 }
92 }
93}
94
95impl TryFrom<&String> for VadType {
96 type Error = String;
97
98 fn try_from(value: &String) -> std::result::Result<Self, Self::Error> {
99 match value.as_str() {
100 "silero" => Ok(VadType::Silero),
101 "ten" => Ok(VadType::Ten),
102 other => Ok(VadType::Other(other.to_string())),
103 }
104 }
105}
106struct SpeechBuf {
107 samples: PcmBuf,
108 timestamp: u64,
109}
110
111struct VadProcessorInner {
112 vad: Box<dyn VadEngine>,
113 event_sender: EventSender,
114 option: VADOption,
115 window_bufs: Vec<SpeechBuf>,
116 triggered: bool,
117 current_speech_start: Option<u64>,
118 temp_end: Option<u64>,
119}
120pub struct VadProcessor {
121 inner: RefCell<VadProcessorInner>,
122}
123unsafe impl Send for VadProcessor {}
124unsafe impl Sync for VadProcessor {}
125
126pub trait VadEngine: Send + Sync + Any {
127 fn process(&mut self, frame: &mut AudioFrame) -> Vec<(bool, u64)>;
128}
129
130impl VadProcessorInner {
131 pub fn process_frame(&mut self, frame: &mut AudioFrame) -> Result<()> {
132 let samples = match &frame.samples {
133 Samples::PCM { samples } => samples,
134 _ => return Ok(()),
135 };
136
137 let samples_cloned = samples.to_owned();
138 let results = self.vad.process(frame);
139 for (is_speaking, timestamp) in results {
140 if is_speaking || self.triggered {
141 let current_buf = SpeechBuf {
142 samples: samples_cloned.clone(),
143 timestamp,
144 };
145 self.window_bufs.push(current_buf);
146 }
147 self.process_vad_logic(is_speaking, timestamp, &frame.track_id)?;
148
149 if self.window_bufs.len() > 1000 || !self.triggered {
151 let cutoff = if self.triggered {
152 timestamp.saturating_sub(5000)
153 } else {
154 timestamp.saturating_sub(self.option.silence_padding)
155 };
156 self.window_bufs.retain(|buf| buf.timestamp > cutoff);
157 }
158 }
159
160 Ok(())
161 }
162
163 fn process_vad_logic(
164 &mut self,
165 is_speaking: bool,
166 timestamp: u64,
167 track_id: &str,
168 ) -> Result<()> {
169 if is_speaking && !self.triggered {
170 self.triggered = true;
171 self.current_speech_start = Some(timestamp);
172 let event = SessionEvent::Speaking {
173 track_id: track_id.to_string(),
174 timestamp: crate::media::get_timestamp(),
175 start_time: timestamp,
176 };
177 self.event_sender.send(event).ok();
178 } else if !is_speaking {
179 if self.temp_end.is_none() {
180 self.temp_end = Some(timestamp);
181 }
182
183 if let Some(temp_end) = self.temp_end {
184 let silence_duration = timestamp.saturating_sub(temp_end);
186
187 if self.triggered && silence_duration >= self.option.silence_padding {
189 if let Some(start_time) = self.current_speech_start {
190 let duration = temp_end.saturating_sub(start_time);
192 if duration >= self.option.speech_padding {
193 let samples_vec = self
194 .window_bufs
195 .iter()
196 .filter(|buf| {
197 buf.timestamp >= start_time && buf.timestamp <= temp_end
198 })
199 .flat_map(|buf| buf.samples.iter())
200 .cloned()
201 .collect();
202 self.window_bufs.clear();
203
204 let event = SessionEvent::Silence {
205 track_id: track_id.to_string(),
206 timestamp: crate::media::get_timestamp(),
207 start_time,
208 duration,
209 samples: Some(samples_vec),
210 };
211 self.event_sender.send(event).ok();
212 }
213 }
214 self.triggered = false;
215 self.current_speech_start = None;
216 self.temp_end = Some(timestamp); }
218
219 if let Some(timeout) = self.option.silence_timeout {
221 let timeout_duration = timestamp.saturating_sub(temp_end);
223
224 if timeout_duration >= timeout {
225 let event = SessionEvent::Silence {
226 track_id: track_id.to_string(),
227 timestamp: crate::media::get_timestamp(),
228 start_time: temp_end,
229 duration: timeout_duration,
230 samples: None,
231 };
232 self.event_sender.send(event).ok();
233 self.temp_end = Some(timestamp);
234 }
235 }
236 }
237 } else if is_speaking && self.temp_end.is_some() {
238 self.temp_end = None;
239 }
240
241 Ok(())
242 }
243}
244
245impl VadProcessor {
246 pub fn create(
247 _token: CancellationToken,
248 event_sender: EventSender,
249 option: VADOption,
250 ) -> Result<Box<dyn Processor>> {
251 let vad: Box<dyn VadEngine> = match option.r#type {
252 VadType::Silero => Box::new(tiny_silero::TinySilero::new(option.clone())?),
253 VadType::Ten => Box::new(tiny_ten::TinyTen::new(option.clone())?),
254 _ => Box::new(NopVad::new()?),
255 };
256 Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
257 }
258
259 pub fn create_nop(
260 _token: CancellationToken,
261 event_sender: EventSender,
262 option: VADOption,
263 ) -> Result<Box<dyn Processor>> {
264 let vad: Box<dyn VadEngine> = match option.r#type {
265 _ => Box::new(NopVad::new()?),
266 };
267 Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
268 }
269
270 pub fn new(
271 engine: Box<dyn VadEngine>,
272 event_sender: EventSender,
273 option: VADOption,
274 ) -> Result<Self> {
275 let inner = VadProcessorInner {
276 vad: engine,
277 event_sender,
278 option,
279 window_bufs: Vec::new(),
280 triggered: false,
281 current_speech_start: None,
282 temp_end: None,
283 };
284 Ok(Self {
285 inner: RefCell::new(inner),
286 })
287 }
288}
289
290impl Processor for VadProcessor {
291 fn process_frame(&self, frame: &mut AudioFrame) -> Result<()> {
292 self.inner.borrow_mut().process_frame(frame)
293 }
294}
295
296struct NopVad {}
297
298impl NopVad {
299 pub fn new() -> Result<Self> {
300 Ok(Self {})
301 }
302}
303
304impl VadEngine for NopVad {
305 fn process(&mut self, frame: &mut AudioFrame) -> Vec<(bool, u64)> {
306 let samples = match &frame.samples {
307 Samples::PCM { samples } => samples,
308 _ => return vec![(false, frame.timestamp)],
309 };
310 let has_speech = samples.iter().any(|&x| x != 0);
312 vec![(has_speech, frame.timestamp)]
313 }
314}