1use super::audio_processor::VadAudioProcessor;
2use crate::config::VadConfig;
3use crate::{Result, error::SubXError};
4use log::{debug, trace, warn};
5use std::time::{Duration, Instant};
6use voice_activity_detector::{IteratorExt, LabeledAudio, VoiceActivityDetector};
7
8pub struct LocalVadDetector {
14 config: VadConfig,
15 audio_processor: VadAudioProcessor,
16}
17
18impl LocalVadDetector {
19 pub fn new(config: VadConfig) -> Result<Self> {
33 debug!("Initializing LocalVadDetector with config: {:?}", config);
34 Ok(Self {
35 config,
36 audio_processor: VadAudioProcessor::new()?,
37 })
38 }
39
40 pub async fn detect_speech_from_data(
50 &self,
51 mut audio_data: crate::services::vad::audio_processor::ProcessedAudioData,
52 ) -> Result<VadResult> {
53 debug!(
54 "Starting speech detection for ProcessedAudioData: sample_rate={}, duration={}",
55 audio_data.info.sample_rate, audio_data.info.duration_seconds
56 );
57 if audio_data.samples.is_empty() {
59 return Err(SubXError::audio_processing(
60 "Audio data is empty".to_string(),
61 ));
62 }
63 let start_time = Instant::now();
64
65 if audio_data.info.sample_rate != 8000 && audio_data.info.sample_rate != 16000 {
67 debug!(
68 "Resampling audio from {}Hz to 16000Hz...",
69 audio_data.info.sample_rate
70 );
71 use crate::services::vad::resample::resample_to_target_rate;
72 let resampled =
73 resample_to_target_rate(&audio_data.samples, audio_data.info.sample_rate, 16000)?;
74 let new_len = resampled.len();
75 audio_data.samples = resampled;
76 audio_data.info.sample_rate = 16000;
77 audio_data.info.duration_seconds = new_len as f64 / 16000.0;
78 audio_data.info.total_samples = new_len;
79 debug!(
80 "Resampling complete: new sample_rate=16000, total_samples={}, duration={:.3}s",
81 new_len, audio_data.info.duration_seconds
82 );
83 }
84
85 let chunk_size = self.calculate_chunk_size(audio_data.info.sample_rate);
87 debug!(
88 "Calculated VAD chunk_size={} for sample_rate={}",
89 chunk_size, audio_data.info.sample_rate
90 );
91 let vad = VoiceActivityDetector::builder()
92 .sample_rate(audio_data.info.sample_rate)
93 .chunk_size(chunk_size)
94 .build()
95 .map_err(|e| {
96 warn!("Failed to create VAD instance: {}", e);
97 SubXError::audio_processing(format!("Failed to create VAD: {}", e))
98 })?;
99
100 trace!("Running speech segment detection");
102 let speech_segments =
103 self.detect_speech_segments(vad, &audio_data.samples, audio_data.info.sample_rate)?;
104
105 let processing_duration = start_time.elapsed();
106 debug!(
107 "Speech detection completed in {:?} seconds, segments found: {}",
108 processing_duration,
109 speech_segments.len()
110 );
111
112 Ok(VadResult {
113 speech_segments,
114 processing_duration,
115 audio_info: audio_data.info,
116 })
117 }
118
119 fn detect_speech_segments(
120 &self,
121 mut vad: VoiceActivityDetector,
122 samples: &[i16],
123 sample_rate: u32,
124 ) -> Result<Vec<SpeechSegment>> {
125 trace!(
126 "Detecting speech segments: samples={}, sample_rate={}",
127 samples.len(),
128 sample_rate
129 );
130 let mut segments = Vec::new();
131 let chunk_size = self.calculate_chunk_size(sample_rate);
132 let chunk_duration_seconds = chunk_size as f64 / sample_rate as f64;
133
134 let vad_threshold = 1.0 - self.config.sensitivity;
136 debug!(
137 "VAD threshold set to {} (sensitivity={})",
138 vad_threshold, self.config.sensitivity
139 );
140 let labels: Vec<LabeledAudio<i16>> = samples
141 .iter()
142 .copied()
143 .label(&mut vad, vad_threshold, self.config.padding_chunks as usize)
144 .collect();
145 trace!("Labeling complete, total chunks: {}", labels.len());
146
147 let mut current_speech_start: Option<f64> = None;
148 let mut chunk_index = 0;
149
150 for label in labels {
151 let chunk_start_time = chunk_index as f64 * chunk_duration_seconds;
152 match label {
153 LabeledAudio::Speech(_chunk) => {
154 if current_speech_start.is_none() {
155 trace!(
156 "Speech started at {:.3}s (chunk #{})",
157 chunk_start_time, chunk_index
158 );
159 current_speech_start = Some(chunk_start_time);
160 }
161 }
162 LabeledAudio::NonSpeech(_chunk) => {
163 if let Some(start_time) = current_speech_start.take() {
164 let end_time = chunk_start_time;
165 let duration = end_time - start_time;
166 trace!(
167 "Speech ended at {:.3}s (duration {:.3}s)",
168 end_time, duration
169 );
170 if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
172 trace!(
173 "Detected speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
174 start_time, end_time, duration
175 );
176 segments.push(SpeechSegment {
177 start_time,
178 end_time,
179 duration,
180 });
181 } else {
182 trace!(
183 "Discarded short segment: start={:.3}s, end={:.3}s, duration={:.3}s",
184 start_time, end_time, duration
185 );
186 }
187 }
188 }
189 }
190 chunk_index += 1;
191 }
192
193 if let Some(start_time) = current_speech_start {
195 let end_time = chunk_index as f64 * chunk_duration_seconds;
196 let duration = end_time - start_time;
197 trace!(
198 "Final speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
199 start_time, end_time, duration
200 );
201 if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
202 trace!(
203 "Detected speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
204 start_time, end_time, duration
205 );
206 segments.push(SpeechSegment {
207 start_time,
208 end_time,
209 duration,
210 });
211 } else {
212 trace!(
213 "Discarded short final segment: start={:.3}s, end={:.3}s, duration={:.3}s",
214 start_time, end_time, duration
215 );
216 }
217 }
218
219 debug!("Speech segments detected: {}", segments.len());
220 Ok(segments)
221 }
222
223 pub fn calculate_chunk_size(&self, sample_rate: u32) -> usize {
254 trace!("Calculating chunk size for sample_rate={}", sample_rate);
255 let chunk_size = match sample_rate {
256 8000 => 256,
257 16000 => 512,
258 _ => panic!(
259 "Unsupported VAD sample_rate={}. Only 8kHz/256, 16kHz/512 are allowed.",
260 sample_rate
261 ),
262 };
263 debug!(
264 "Final chunk_size for sample_rate {}: {}",
265 sample_rate, chunk_size
266 );
267 chunk_size
268 }
269
270 pub fn audio_processor(&self) -> &VadAudioProcessor {
272 &self.audio_processor
273 }
274}
275
276#[derive(Debug, Clone)]
281pub struct VadResult {
282 pub speech_segments: Vec<SpeechSegment>,
284 pub processing_duration: Duration,
286 pub audio_info: AudioInfo,
288}
289
/// A continuous stretch of detected speech, expressed in seconds from the
/// start of the analysed audio.
#[derive(Clone, Debug)]
pub struct SpeechSegment {
    /// Offset, in seconds, at which the speech begins.
    pub start_time: f64,
    /// Offset, in seconds, at which the speech ends.
    pub end_time: f64,
    /// Length of the segment in seconds (`end_time - start_time`).
    pub duration: f64,
}
303
/// Basic metadata describing a buffer of audio samples.
#[derive(Clone, Debug)]
pub struct AudioInfo {
    /// Sample rate in Hz.
    pub sample_rate: u32,
    /// Channel count (presumably 1 for mono; confirm against the producer).
    pub channels: u16,
    /// Length of the audio in seconds.
    pub duration_seconds: f64,
    /// Number of samples in the buffer.
    pub total_samples: usize,
}