1use crate::event::{EventSender, SessionEvent};
2use crate::media::processor::Processor;
3use crate::media::{AudioFrame, PcmBuf, Samples};
4use anyhow::Result;
5use serde::{Deserialize, Serialize};
6use serde_with::skip_serializing_none;
7use std::any::Any;
8use std::cell::RefCell;
9use tokio_util::sync::CancellationToken;
10
11pub(crate) mod simd;
12pub(crate) mod tiny_silero;
13pub(crate) mod tiny_ten;
14pub(crate) mod utils;
15
16#[cfg(test)]
17mod benchmark_all;
18#[cfg(test)]
19mod tests;
20
21#[skip_serializing_none]
22#[derive(Clone, Debug, Deserialize, Serialize)]
23#[serde(rename_all = "camelCase")]
24#[serde(default)]
25pub struct VADOption {
26 pub r#type: VadType,
27 pub samplerate: u32,
28 pub speech_padding: u64,
30 pub silence_padding: u64,
32 pub ratio: f32,
33 pub voice_threshold: f32,
34 pub max_buffer_duration_secs: u64,
35 pub silence_timeout: Option<u64>,
37 pub endpoint: Option<String>,
38 pub secret_key: Option<String>,
39 pub secret_id: Option<String>,
40}
41
42impl Default for VADOption {
43 fn default() -> Self {
44 Self {
45 r#type: VadType::Silero,
46 samplerate: 16000,
47 speech_padding: 250, silence_padding: 100, ratio: 0.5,
51 voice_threshold: 0.5,
52 max_buffer_duration_secs: 50,
53 silence_timeout: None,
54 endpoint: None,
55 secret_key: None,
56 secret_id: None,
57 }
58 }
59}
60
61#[derive(Clone, Debug, Serialize, Eq, Hash, PartialEq)]
62#[serde(rename_all = "lowercase")]
63pub enum VadType {
64 Silero,
65 Ten,
66 Other(String),
67}
68
69impl<'de> Deserialize<'de> for VadType {
70 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
71 where
72 D: serde::Deserializer<'de>,
73 {
74 let value = String::deserialize(deserializer)?;
75 match value.as_str() {
76 "silero" => Ok(VadType::Silero),
77 "ten" => Ok(VadType::Ten),
78 _ => Ok(VadType::Other(value)),
79 }
80 }
81}
82
83impl std::fmt::Display for VadType {
84 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
85 match self {
86 VadType::Silero => write!(f, "silero"),
87 VadType::Ten => write!(f, "ten"),
88 VadType::Other(provider) => write!(f, "{}", provider),
89 }
90 }
91}
92
93impl TryFrom<&String> for VadType {
94 type Error = String;
95
96 fn try_from(value: &String) -> std::result::Result<Self, Self::Error> {
97 match value.as_str() {
98 "silero" => Ok(VadType::Silero),
99 "ten" => Ok(VadType::Ten),
100 other => Ok(VadType::Other(other.to_string())),
101 }
102 }
103}
104struct SpeechBuf {
105 samples: PcmBuf,
106 timestamp: u64,
107}
108
109struct VadProcessorInner {
110 vad: Box<dyn VadEngine>,
111 event_sender: EventSender,
112 option: VADOption,
113 window_bufs: Vec<SpeechBuf>,
114 triggered: bool,
115 current_speech_start: Option<u64>,
116 temp_end: Option<u64>,
117}
118pub struct VadProcessor {
119 inner: RefCell<VadProcessorInner>,
120}
121unsafe impl Send for VadProcessor {}
122unsafe impl Sync for VadProcessor {}
123
124pub trait VadEngine: Send + Sync + Any {
125 fn process(&mut self, frame: &mut AudioFrame) -> Option<(bool, u64)>;
126}
127
128impl VadProcessorInner {
129 pub fn process_frame(&mut self, frame: &mut AudioFrame) -> Result<()> {
130 let samples = match &frame.samples {
131 Samples::PCM { samples } => samples,
132 _ => return Ok(()),
133 };
134
135 let samples = samples.to_owned();
136 let result = self.vad.process(frame);
137 if let Some((is_speaking, timestamp)) = result {
138 if is_speaking || self.triggered {
139 let current_buf = SpeechBuf { samples, timestamp };
140 self.window_bufs.push(current_buf);
141 }
142 self.process_vad_logic(is_speaking, timestamp, &frame.track_id)?;
143
144 if self.window_bufs.len() > 1000 || !self.triggered {
146 let cutoff = if self.triggered {
147 timestamp.saturating_sub(5000)
148 } else {
149 timestamp.saturating_sub(self.option.silence_padding)
150 };
151 self.window_bufs.retain(|buf| buf.timestamp > cutoff);
152 }
153 }
154
155 Ok(())
156 }
157
158 fn process_vad_logic(
159 &mut self,
160 is_speaking: bool,
161 timestamp: u64,
162 track_id: &str,
163 ) -> Result<()> {
164 if is_speaking && !self.triggered {
165 self.triggered = true;
166 self.current_speech_start = Some(timestamp);
167 let event = SessionEvent::Speaking {
168 track_id: track_id.to_string(),
169 timestamp: crate::media::get_timestamp(),
170 start_time: timestamp,
171 };
172 self.event_sender.send(event).ok();
173 } else if !is_speaking {
174 if self.temp_end.is_none() {
175 self.temp_end = Some(timestamp);
176 }
177
178 if let Some(temp_end) = self.temp_end {
179 let silence_duration = timestamp.saturating_sub(temp_end);
181
182 if self.triggered && silence_duration >= self.option.silence_padding {
184 if let Some(start_time) = self.current_speech_start {
185 let duration = temp_end.saturating_sub(start_time);
187 if duration >= self.option.speech_padding {
188 let samples_vec = self
189 .window_bufs
190 .iter()
191 .filter(|buf| {
192 buf.timestamp >= start_time && buf.timestamp <= temp_end
193 })
194 .flat_map(|buf| buf.samples.iter())
195 .cloned()
196 .collect();
197 self.window_bufs.clear();
198
199 let event = SessionEvent::Silence {
200 track_id: track_id.to_string(),
201 timestamp: crate::media::get_timestamp(),
202 start_time,
203 duration,
204 samples: Some(samples_vec),
205 };
206 self.event_sender.send(event).ok();
207 }
208 }
209 self.triggered = false;
210 self.current_speech_start = None;
211 self.temp_end = Some(timestamp); }
213
214 if let Some(timeout) = self.option.silence_timeout {
216 let timeout_duration = timestamp.saturating_sub(temp_end);
218
219 if timeout_duration >= timeout {
220 let event = SessionEvent::Silence {
221 track_id: track_id.to_string(),
222 timestamp: crate::media::get_timestamp(),
223 start_time: temp_end,
224 duration: timeout_duration,
225 samples: None,
226 };
227 self.event_sender.send(event).ok();
228 self.temp_end = Some(timestamp);
229 }
230 }
231 }
232 } else if is_speaking && self.temp_end.is_some() {
233 self.temp_end = None;
234 }
235
236 Ok(())
237 }
238}
239
240impl VadProcessor {
241 pub fn create(
242 _token: CancellationToken,
243 event_sender: EventSender,
244 option: VADOption,
245 ) -> Result<Box<dyn Processor>> {
246 let vad: Box<dyn VadEngine> = match option.r#type {
247 VadType::Silero => Box::new(tiny_silero::TinySilero::new(option.clone())?),
248 VadType::Ten => Box::new(tiny_ten::TinyVad::new(option.clone())?),
249 _ => Box::new(NopVad::new()?),
250 };
251 Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
252 }
253
254 pub fn create_nop(
255 _token: CancellationToken,
256 event_sender: EventSender,
257 option: VADOption,
258 ) -> Result<Box<dyn Processor>> {
259 let vad: Box<dyn VadEngine> = match option.r#type {
260 _ => Box::new(NopVad::new()?),
261 };
262 Ok(Box::new(VadProcessor::new(vad, event_sender, option)?))
263 }
264
265 pub fn new(
266 engine: Box<dyn VadEngine>,
267 event_sender: EventSender,
268 option: VADOption,
269 ) -> Result<Self> {
270 let inner = VadProcessorInner {
271 vad: engine,
272 event_sender,
273 option,
274 window_bufs: Vec::new(),
275 triggered: false,
276 current_speech_start: None,
277 temp_end: None,
278 };
279 Ok(Self {
280 inner: RefCell::new(inner),
281 })
282 }
283}
284
285impl Processor for VadProcessor {
286 fn process_frame(&self, frame: &mut AudioFrame) -> Result<()> {
287 self.inner.borrow_mut().process_frame(frame)
288 }
289}
290
291struct NopVad {}
292
293impl NopVad {
294 pub fn new() -> Result<Self> {
295 Ok(Self {})
296 }
297}
298
299impl VadEngine for NopVad {
300 fn process(&mut self, frame: &mut AudioFrame) -> Option<(bool, u64)> {
301 let samples = match &frame.samples {
302 Samples::PCM { samples } => samples,
303 _ => return Some((false, frame.timestamp)),
304 };
305 let has_speech = samples.iter().any(|&x| x != 0);
307 Some((has_speech, frame.timestamp))
308 }
309}