1use crate::event::{EventSender, SessionEvent};
2use crate::media::processor::Processor;
3use crate::media::{AudioFrame, PcmBuf, Samples};
4use anyhow::Result;
5use serde::{Deserialize, Serialize};
6use serde_with::skip_serializing_none;
7use std::any::Any;
8use std::cell::RefCell;
9use tokio_util::sync::CancellationToken;
10
11pub(crate) mod simd;
12pub mod tiny_silero;
13pub mod tiny_ten;
14pub(crate) mod utils;
15pub use tiny_silero::TinySilero;
16pub use tiny_ten::TinyTen;
17
18#[cfg(test)]
19mod benchmark_all;
20#[cfg(test)]
21mod tests;
22
23#[skip_serializing_none]
24#[derive(Clone, Debug, Deserialize, Serialize)]
25#[serde(rename_all = "camelCase")]
26#[serde(default)]
27pub struct VADOption {
28 pub r#type: VadType,
29 pub samplerate: u32,
30 pub speech_padding: u64,
32 pub silence_padding: u64,
34 pub ratio: f32,
35 pub voice_threshold: f32,
36 pub max_buffer_duration_secs: u64,
37 pub silence_timeout: Option<u64>,
39 pub endpoint: Option<String>,
40 pub secret_key: Option<String>,
41 pub secret_id: Option<String>,
42}
43
44impl Default for VADOption {
45 fn default() -> Self {
46 Self {
47 r#type: VadType::Silero,
48 samplerate: 16000,
49 speech_padding: 250, silence_padding: 100, ratio: 0.5,
53 voice_threshold: 0.5,
54 max_buffer_duration_secs: 50,
55 silence_timeout: None,
56 endpoint: None,
57 secret_key: None,
58 secret_id: None,
59 }
60 }
61}
62
63#[derive(Clone, Debug, Serialize, Eq, Hash, PartialEq)]
64#[serde(rename_all = "lowercase")]
65pub enum VadType {
66 Silero,
67 Ten,
68 Other(String),
69}
70
71impl<'de> Deserialize<'de> for VadType {
72 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
73 where
74 D: serde::Deserializer<'de>,
75 {
76 let value = String::deserialize(deserializer)?;
77 match value.as_str() {
78 "silero" => Ok(VadType::Silero),
79 "ten" => Ok(VadType::Ten),
80 _ => Ok(VadType::Other(value)),
81 }
82 }
83}
84
85impl std::fmt::Display for VadType {
86 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
87 match self {
88 VadType::Silero => write!(f, "silero"),
89 VadType::Ten => write!(f, "ten"),
90 VadType::Other(provider) => write!(f, "{}", provider),
91 }
92 }
93}
94
95impl TryFrom<&String> for VadType {
96 type Error = String;
97
98 fn try_from(value: &String) -> std::result::Result<Self, Self::Error> {
99 match value.as_str() {
100 "silero" => Ok(VadType::Silero),
101 "ten" => Ok(VadType::Ten),
102 other => Ok(VadType::Other(other.to_string())),
103 }
104 }
105}
106struct SpeechBuf {
107 samples: PcmBuf,
108 timestamp: u64,
109}
110
111struct VadProcessorInner {
112 vad: Box<dyn VadEngine>,
113 event_sender: EventSender,
114 option: VADOption,
115 window_bufs: Vec<SpeechBuf>,
116 triggered: bool,
117 current_speech_start: Option<u64>,
118 temp_end: Option<u64>,
119}
120pub struct VadProcessor {
121 inner: RefCell<VadProcessorInner>,
122}
123unsafe impl Send for VadProcessor {}
124unsafe impl Sync for VadProcessor {}
125
126pub trait VadEngine: Send + Sync + Any {
127 fn process(&mut self, frame: &mut AudioFrame) -> Option<(bool, u64)>;
128}
129
130impl VadProcessorInner {
131 pub fn process_frame(&mut self, frame: &mut AudioFrame) -> Result<()> {
132 let samples = match &frame.samples {
133 Samples::PCM { samples } => samples,
134 _ => return Ok(()),
135 };
136
137 let samples = samples.to_owned();
138 let result = self.vad.process(frame);
139 if let Some((is_speaking, timestamp)) = result {
140 if is_speaking || self.triggered {
141 let current_buf = SpeechBuf { samples, timestamp };
142 self.window_bufs.push(current_buf);
143 }
144 self.process_vad_logic(is_speaking, timestamp, &frame.track_id)?;
145
146 if self.window_bufs.len() > 1000 || !self.triggered {
148 let cutoff = if self.triggered {
149 timestamp.saturating_sub(5000)
150 } else {
151 timestamp.saturating_sub(self.option.silence_padding)
152 };
153 self.window_bufs.retain(|buf| buf.timestamp > cutoff);
154 }
155 }
156
157 Ok(())
158 }
159
160 fn process_vad_logic(
161 &mut self,
162 is_speaking: bool,
163 timestamp: u64,
164 track_id: &str,
165 ) -> Result<()> {
166 if is_speaking && !self.triggered {
167 self.triggered = true;
168 self.current_speech_start = Some(timestamp);
169 let event = SessionEvent::Speaking {
170 track_id: track_id.to_string(),
171 timestamp: crate::media::get_timestamp(),
172 start_time: timestamp,
173 };
174 self.event_sender.send(event).ok();
175 } else if !is_speaking {
176 if self.temp_end.is_none() {
177 self.temp_end = Some(timestamp);
178 }
179
180 if let Some(temp_end) = self.temp_end {
181 let silence_duration = timestamp.saturating_sub(temp_end);
183
184 if self.triggered && silence_duration >= self.option.silence_padding {
186 if let Some(start_time) = self.current_speech_start {
187 let duration = temp_end.saturating_sub(start_time);
189 if duration >= self.option.speech_padding {
190 let samples_vec = self
191 .window_bufs
192 .iter()
193 .filter(|buf| {
194 buf.timestamp >= start_time && buf.timestamp <= temp_end
195 })
196 .flat_map(|buf| buf.samples.iter())
197 .cloned()
198 .collect();
199 self.window_bufs.clear();
200
201 let event = SessionEvent::Silence {
202 track_id: track_id.to_string(),
203 timestamp: crate::media::get_timestamp(),
204 start_time,
205 duration,
206 samples: Some(samples_vec),
207 };
208 self.event_sender.send(event).ok();
209 }
210 }
211 self.triggered = false;
212 self.current_speech_start = None;
213 self.temp_end = Some(timestamp); }
215
216 if let Some(timeout) = self.option.silence_timeout {
218 let timeout_duration = timestamp.saturating_sub(temp_end);
220
221 if timeout_duration >= timeout {
222 let event = SessionEvent::Silence {
223 track_id: track_id.to_string(),
224 timestamp: crate::media::get_timestamp(),
225 start_time: temp_end,
226 duration: timeout_duration,
227 samples: None,
228 };
229 self.event_sender.send(event).ok();
230 self.temp_end = Some(timestamp);
231 }
232 }
233 }
234 } else if is_speaking && self.temp_end.is_some() {
235 self.temp_end = None;
236 }
237
238 Ok(())
239 }
240}
241
242impl VadProcessor {
243 pub fn create(
244 _token: CancellationToken,
245 event_sender: EventSender,
246 option: VADOption,
247 ) -> Result<Box<dyn Processor>> {
248 let vad: Box<dyn VadEngine> = match option.r#type {
249 VadType::Silero => Box::new(tiny_silero::TinySilero::new(option.clone())?),
250 VadType::Ten => Box::new(tiny_ten::TinyTen::new(option.clone())?),
251 _ => Box::new(NopVad::new()?),
252 };
253 Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
254 }
255
256 pub fn create_nop(
257 _token: CancellationToken,
258 event_sender: EventSender,
259 option: VADOption,
260 ) -> Result<Box<dyn Processor>> {
261 let vad: Box<dyn VadEngine> = match option.r#type {
262 _ => Box::new(NopVad::new()?),
263 };
264 Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
265 }
266
267 pub fn new(
268 engine: Box<dyn VadEngine>,
269 event_sender: EventSender,
270 option: VADOption,
271 ) -> Result<Self> {
272 let inner = VadProcessorInner {
273 vad: engine,
274 event_sender,
275 option,
276 window_bufs: Vec::new(),
277 triggered: false,
278 current_speech_start: None,
279 temp_end: None,
280 };
281 Ok(Self {
282 inner: RefCell::new(inner),
283 })
284 }
285}
286
287impl Processor for VadProcessor {
288 fn process_frame(&self, frame: &mut AudioFrame) -> Result<()> {
289 self.inner.borrow_mut().process_frame(frame)
290 }
291}
292
293struct NopVad {}
294
295impl NopVad {
296 pub fn new() -> Result<Self> {
297 Ok(Self {})
298 }
299}
300
301impl VadEngine for NopVad {
302 fn process(&mut self, frame: &mut AudioFrame) -> Option<(bool, u64)> {
303 let samples = match &frame.samples {
304 Samples::PCM { samples } => samples,
305 _ => return Some((false, frame.timestamp)),
306 };
307 let has_speech = samples.iter().any(|&x| x != 0);
309 Some((has_speech, frame.timestamp))
310 }
311}