// subx_cli/services/vad/detector.rs
1use super::audio_processor::VadAudioProcessor;
2use crate::config::VadConfig;
3use crate::{Result, error::SubXError};
4use log::{debug, trace, warn};
5use std::path::Path;
6use std::time::{Duration, Instant};
7use voice_activity_detector::{IteratorExt, LabeledAudio, VoiceActivityDetector};
8
/// Local voice activity detector.
///
/// Provides voice activity detection using local processing without
/// external API calls. Uses the `voice_activity_detector` crate for
/// speech detection and analysis.
pub struct LocalVadDetector {
    // VAD tuning parameters: sensitivity, padding_chunks and
    // min_speech_duration_ms are read during segment detection.
    config: VadConfig,
    // Loads and prepares audio files into the i16 sample stream the VAD
    // consumes (see `load_and_prepare_audio_direct`).
    audio_processor: VadAudioProcessor,
}
18
19impl LocalVadDetector {
20 /// Create a new local VAD detector.
21 ///
22 /// # Arguments
23 ///
24 /// * `config` - VAD configuration parameters
25 ///
26 /// # Returns
27 ///
28 /// A new `LocalVadDetector` instance
29 ///
30 /// # Errors
31 ///
32 /// Returns an error if the audio processor cannot be initialized
33 pub fn new(config: VadConfig) -> Result<Self> {
34 debug!("Initializing LocalVadDetector with config: {:?}", config);
35 Ok(Self {
36 config,
37 audio_processor: VadAudioProcessor::new()?,
38 })
39 }
40
41 /// Detect speech activity in an audio file.
42 ///
43 /// Processes the entire audio file to identify speech segments
44 /// with timestamps and confidence scores.
45 ///
46 /// # Arguments
47 ///
48 /// * `audio_path` - Path to the audio file to analyze
49 ///
50 /// # Returns
51 ///
52 /// VAD analysis results including speech segments and metadata
53 ///
54 /// # Errors
55 ///
56 /// Returns an error if:
57 /// - Audio file cannot be loaded
58 /// - VAD processing fails
59 /// - Audio format is unsupported
60 pub async fn detect_speech(&self, audio_path: &Path) -> Result<VadResult> {
61 debug!("Starting speech detection for audio: {:?}", audio_path);
62 let start_time = Instant::now();
63
64 // 1. Load and preprocess audio
65 trace!("Loading and preprocessing audio file: {:?}", audio_path);
66 let audio_data = self
67 .audio_processor
68 .load_and_prepare_audio_direct(audio_path)
69 .await?;
70 debug!(
71 "Audio loaded: sample_rate={}Hz, channels={}, duration={}s, total_samples={}",
72 audio_data.info.sample_rate,
73 audio_data.info.channels,
74 audio_data.info.duration_seconds,
75 audio_data.info.total_samples
76 );
77
78 // 2. Calculate chunk size and create VAD with actual sample rate
79 let chunk_size = self.calculate_chunk_size(audio_data.info.sample_rate);
80 debug!(
81 "Calculated VAD chunk_size={} for sample_rate={}",
82 chunk_size, audio_data.info.sample_rate
83 );
84 let vad = VoiceActivityDetector::builder()
85 .sample_rate(audio_data.info.sample_rate)
86 .chunk_size(chunk_size)
87 .build()
88 .map_err(|e| {
89 warn!("Failed to create VAD instance: {}", e);
90 SubXError::audio_processing(format!("Failed to create VAD: {}", e))
91 })?;
92
93 // 3. Execute speech detection
94 trace!("Running speech segment detection");
95 let speech_segments =
96 self.detect_speech_segments(vad, &audio_data.samples, audio_data.info.sample_rate)?;
97
98 let processing_duration = start_time.elapsed();
99 debug!(
100 "Speech detection completed in {:?} seconds, segments found: {}",
101 processing_duration,
102 speech_segments.len()
103 );
104
105 Ok(VadResult {
106 speech_segments,
107 processing_duration,
108 audio_info: audio_data.info,
109 })
110 }
111
112 fn detect_speech_segments(
113 &self,
114 vad: VoiceActivityDetector,
115 samples: &[i16],
116 sample_rate: u32,
117 ) -> Result<Vec<SpeechSegment>> {
118 trace!(
119 "Detecting speech segments: samples={}, sample_rate={}",
120 samples.len(),
121 sample_rate
122 );
123 let mut segments = Vec::new();
124 let chunk_size = self.calculate_chunk_size(sample_rate);
125 let chunk_duration_seconds = chunk_size as f64 / sample_rate as f64;
126
127 // Use label functionality to identify speech and non-speech segments
128 let vad_threshold = 1.0 - self.config.sensitivity;
129 debug!(
130 "VAD threshold set to {} (sensitivity={})",
131 vad_threshold, self.config.sensitivity
132 );
133 let labels: Vec<LabeledAudio<i16>> = samples
134 .iter()
135 .copied()
136 .label(vad, vad_threshold, self.config.padding_chunks as usize)
137 .collect();
138 trace!("Labeling complete, total chunks: {}", labels.len());
139
140 let mut current_speech_start: Option<f64> = None;
141 let mut chunk_index = 0;
142
143 for label in labels {
144 let chunk_start_time = chunk_index as f64 * chunk_duration_seconds;
145 match label {
146 LabeledAudio::Speech(_chunk) => {
147 if current_speech_start.is_none() {
148 trace!(
149 "Speech started at {:.3}s (chunk #{})",
150 chunk_start_time, chunk_index
151 );
152 current_speech_start = Some(chunk_start_time);
153 }
154 }
155 LabeledAudio::NonSpeech(_chunk) => {
156 if let Some(start_time) = current_speech_start.take() {
157 let end_time = chunk_start_time;
158 let duration = end_time - start_time;
159 trace!(
160 "Speech ended at {:.3}s (duration {:.3}s)",
161 end_time, duration
162 );
163 // Filter out speech segments that are too short
164 if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
165 trace!(
166 "Detected speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
167 start_time, end_time, duration
168 );
169 segments.push(SpeechSegment {
170 start_time,
171 end_time,
172 duration,
173 });
174 } else {
175 trace!(
176 "Discarded short segment: start={:.3}s, end={:.3}s, duration={:.3}s",
177 start_time, end_time, duration
178 );
179 }
180 }
181 }
182 }
183 chunk_index += 1;
184 }
185
186 // Handle the last speech segment (if exists)
187 if let Some(start_time) = current_speech_start {
188 let end_time = chunk_index as f64 * chunk_duration_seconds;
189 let duration = end_time - start_time;
190 trace!(
191 "Final speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
192 start_time, end_time, duration
193 );
194 if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
195 trace!(
196 "Detected speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
197 start_time, end_time, duration
198 );
199 segments.push(SpeechSegment {
200 start_time,
201 end_time,
202 duration,
203 });
204 } else {
205 trace!(
206 "Discarded short final segment: start={:.3}s, end={:.3}s, duration={:.3}s",
207 start_time, end_time, duration
208 );
209 }
210 }
211
212 debug!("Speech segments detected: {}", segments.len());
213 Ok(segments)
214 }
215
216 /// Dynamically calculates the optimal VAD chunk size for a given audio sample rate.
217 ///
218 /// This function selects a chunk size (in samples) that is compatible with the VAD model's requirements
219 /// and recommended for common sample rates. For 8000 Hz and 16000 Hz, it uses 512 samples by default,
220 /// which is within the recommended range (512, 768, or 1024). For other sample rates, it uses a 30 ms
221 /// window as the baseline, with a minimum of 1024 samples. The function also ensures that the chunk size
222 /// always satisfies the model's constraint: `sample_rate <= 31.25 * chunk_size`.
223 ///
224 /// # Arguments
225 ///
226 /// - `sample_rate`: The audio sample rate in Hz (e.g., 16000 for 16kHz audio)
227 ///
228 /// # Returns
229 ///
230 /// The chunk size in number of samples, selected for optimal model compatibility.
231 ///
232 /// # Examples
233 ///
234 /// Basic usage:
235 ///
236 /// ```rust
237 /// use subx_cli::services::vad::LocalVadDetector;
238 /// let detector = LocalVadDetector::new(Default::default()).unwrap();
239 /// let chunk_size = detector.calculate_chunk_size(16000);
240 /// assert_eq!(chunk_size, 512);
241 /// ```
242 ///
243 /// # Model Constraint
244 ///
245 /// The returned chunk size always satisfies: `sample_rate <= 31.25 * chunk_size`.
246 pub fn calculate_chunk_size(&self, sample_rate: u32) -> usize {
247 trace!("Calculating chunk size for sample_rate={}", sample_rate);
248 let mut chunk_size = match sample_rate {
249 8000 => 512, // recommended: 512, 768, 1024
250 16000 => 512, // recommended: 512, 768, 1024
251 _ => {
252 let chunk_ms = 30f32;
253 let size = ((sample_rate as f32) * chunk_ms / 1000.0).round() as usize;
254 size.max(1024)
255 }
256 };
257 let min_chunk_size = ((sample_rate as f64) / 31.25).ceil() as usize;
258 if chunk_size < min_chunk_size {
259 warn!(
260 "Chunk size {} too small for sample_rate {}, adjusting to {}",
261 chunk_size, sample_rate, min_chunk_size
262 );
263 chunk_size = min_chunk_size;
264 }
265 debug!(
266 "Final chunk_size for sample_rate {}: {}",
267 sample_rate, chunk_size
268 );
269 chunk_size
270 }
271}
272
/// VAD detection result containing speech segments and processing metadata.
///
/// Represents the complete result of voice activity detection analysis,
/// including identified speech segments, timing information, and audio metadata.
#[derive(Debug, Clone)]
pub struct VadResult {
    /// Detected speech segments with start/end timestamps and durations
    pub speech_segments: Vec<SpeechSegment>,
    /// Wall-clock time taken to process the audio file
    pub processing_duration: Duration,
    /// Metadata of the analyzed audio (sample rate, channels, duration)
    pub audio_info: AudioInfo,
}
286
/// Individual speech segment identified by VAD.
///
/// Represents a continuous segment of detected speech. All times are in
/// seconds, measured from the start of the audio file.
#[derive(Debug, Clone)]
pub struct SpeechSegment {
    /// Start time of the speech segment in seconds
    pub start_time: f64,
    /// End time of the speech segment in seconds
    pub end_time: f64,
    /// Duration of the speech segment in seconds (`end_time - start_time`)
    pub duration: f64,
}
300
/// Audio file metadata and properties.
///
/// Contains technical information about the processed audio file
/// including format, duration, and sample information.
#[derive(Debug, Clone)]
pub struct AudioInfo {
    /// Audio sample rate in Hz (e.g. 16000)
    pub sample_rate: u32,
    /// Number of audio channels (1 = mono, 2 = stereo)
    pub channels: u16,
    /// Total duration of the audio in seconds
    pub duration_seconds: f64,
    /// Total number of audio samples
    pub total_samples: usize,
}