subx_cli/services/vad/detector.rs

use super::audio_processor::VadAudioProcessor;
use crate::config::VadConfig;
use crate::{Result, error::SubXError};
use log::{debug, trace, warn};
use std::path::Path;
use std::time::{Duration, Instant};
use voice_activity_detector::{IteratorExt, LabeledAudio, VoiceActivityDetector};

/// Local voice activity detector.
///
/// Provides voice activity detection using local processing without
/// external API calls. Uses the `voice_activity_detector` crate for
/// speech detection and analysis.
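///
/// # Examples
///
/// A minimal usage sketch; the audio file path is illustrative and the call
/// needs a real audio file and an async runtime, so it is not run as a doctest:
///
/// ```no_run
/// use std::path::Path;
/// use subx_cli::services::vad::LocalVadDetector;
///
/// # async fn run() {
/// let detector = LocalVadDetector::new(Default::default()).unwrap();
/// let result = detector.detect_speech(Path::new("episode_01.wav")).await.unwrap();
/// for segment in &result.speech_segments {
///     println!("{:.3}s - {:.3}s ({:.3}s)", segment.start_time, segment.end_time, segment.duration);
/// }
/// # }
/// ```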
pub struct LocalVadDetector {
    config: VadConfig,
    audio_processor: VadAudioProcessor,
}

impl LocalVadDetector {
    /// Create a new local VAD detector.
    ///
    /// # Arguments
    ///
    /// * `config` - VAD configuration parameters
    ///
    /// # Returns
    ///
    /// A new `LocalVadDetector` instance
    ///
    /// # Errors
    ///
    /// Returns an error if the audio processor cannot be initialized
    pub fn new(config: VadConfig) -> Result<Self> {
        debug!("Initializing LocalVadDetector with config: {:?}", config);
        Ok(Self {
            config,
            audio_processor: VadAudioProcessor::new()?,
        })
    }

    /// Detect speech activity in an audio file.
    ///
    /// Processes the entire audio file to identify speech segments
    /// with start and end timestamps and their durations.
    ///
    /// # Arguments
    ///
    /// * `audio_path` - Path to the audio file to analyze
    ///
    /// # Returns
    ///
    /// VAD analysis results including speech segments and audio metadata
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - Audio file cannot be loaded
    /// - VAD processing fails
    /// - Audio format is unsupported
    pub async fn detect_speech(&self, audio_path: &Path) -> Result<VadResult> {
        debug!("Starting speech detection for audio: {:?}", audio_path);
        let start_time = Instant::now();

        // 1. Load and preprocess audio
        trace!("Loading and preprocessing audio file: {:?}", audio_path);
        let audio_data = self
            .audio_processor
            .load_and_prepare_audio_direct(audio_path)
            .await?;
        debug!(
            "Audio loaded: sample_rate={}Hz, channels={}, duration={}s, total_samples={}",
            audio_data.info.sample_rate,
            audio_data.info.channels,
            audio_data.info.duration_seconds,
            audio_data.info.total_samples
        );

        // 2. Calculate chunk size and create VAD with actual sample rate
        let chunk_size = self.calculate_chunk_size(audio_data.info.sample_rate);
        debug!(
            "Calculated VAD chunk_size={} for sample_rate={}",
            chunk_size, audio_data.info.sample_rate
        );
        let vad = VoiceActivityDetector::builder()
            .sample_rate(audio_data.info.sample_rate)
            .chunk_size(chunk_size)
            .build()
            .map_err(|e| {
                warn!("Failed to create VAD instance: {}", e);
                SubXError::audio_processing(format!("Failed to create VAD: {}", e))
            })?;

        // 3. Execute speech detection
        trace!("Running speech segment detection");
        let speech_segments =
            self.detect_speech_segments(vad, &audio_data.samples, audio_data.info.sample_rate)?;

        let processing_duration = start_time.elapsed();
        debug!(
            "Speech detection completed in {:?}, segments found: {}",
            processing_duration,
            speech_segments.len()
        );

        Ok(VadResult {
            speech_segments,
            processing_duration,
            audio_info: audio_data.info,
        })
    }

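    /// Label fixed-size chunks of `samples` as speech or non-speech and merge
    /// consecutive speech chunks into [`SpeechSegment`]s, dropping segments
    /// shorter than the configured minimum speech duration.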
    fn detect_speech_segments(
        &self,
        vad: VoiceActivityDetector,
        samples: &[i16],
        sample_rate: u32,
    ) -> Result<Vec<SpeechSegment>> {
        trace!(
            "Detecting speech segments: samples={}, sample_rate={}",
            samples.len(),
            sample_rate
        );
        let mut segments = Vec::new();
        let chunk_size = self.calculate_chunk_size(sample_rate);
        let chunk_duration_seconds = chunk_size as f64 / sample_rate as f64;

        // Use label functionality to identify speech and non-speech segments
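        // The crate's `label` API takes a speech-probability threshold, so the
        // user-facing sensitivity is inverted here; `padding_chunks` is passed
        // through to pad detected speech with neighboring chunks.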
        let vad_threshold = 1.0 - self.config.sensitivity;
        debug!(
            "VAD threshold set to {} (sensitivity={})",
            vad_threshold, self.config.sensitivity
        );
        let labels: Vec<LabeledAudio<i16>> = samples
            .iter()
            .copied()
            .label(vad, vad_threshold, self.config.padding_chunks as usize)
            .collect();
        trace!("Labeling complete, total chunks: {}", labels.len());

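        // Walk the labeled chunks in order: `current_speech_start` holds the start
        // time of the speech run in progress, and a transition to a non-speech
        // chunk closes the run and emits a segment.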
        let mut current_speech_start: Option<f64> = None;
        let mut chunk_index = 0;

        for label in labels {
            let chunk_start_time = chunk_index as f64 * chunk_duration_seconds;
            match label {
                LabeledAudio::Speech(_chunk) => {
                    if current_speech_start.is_none() {
                        trace!(
                            "Speech started at {:.3}s (chunk #{})",
                            chunk_start_time, chunk_index
                        );
                        current_speech_start = Some(chunk_start_time);
                    }
                }
                LabeledAudio::NonSpeech(_chunk) => {
                    if let Some(start_time) = current_speech_start.take() {
                        let end_time = chunk_start_time;
                        let duration = end_time - start_time;
                        trace!(
                            "Speech ended at {:.3}s (duration {:.3}s)",
                            end_time, duration
                        );
                        // Filter out speech segments that are too short
                        if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
                            trace!(
                                "Detected speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
                                start_time, end_time, duration
                            );
                            segments.push(SpeechSegment {
                                start_time,
                                end_time,
                                duration,
                            });
                        } else {
                            trace!(
                                "Discarded short segment: start={:.3}s, end={:.3}s, duration={:.3}s",
                                start_time, end_time, duration
                            );
                        }
                    }
                }
            }
            chunk_index += 1;
        }

        // Handle the last speech segment (if any)
        if let Some(start_time) = current_speech_start {
            let end_time = chunk_index as f64 * chunk_duration_seconds;
            let duration = end_time - start_time;
            trace!(
                "Final speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
                start_time, end_time, duration
            );
            if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
                trace!(
                    "Detected speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
                    start_time, end_time, duration
                );
                segments.push(SpeechSegment {
                    start_time,
                    end_time,
                    duration,
                });
            } else {
                trace!(
                    "Discarded short final segment: start={:.3}s, end={:.3}s, duration={:.3}s",
                    start_time, end_time, duration
                );
            }
        }

        debug!("Speech segments detected: {}", segments.len());
        Ok(segments)
    }

    /// Dynamically calculates the optimal VAD chunk size for a given audio sample rate.
    ///
    /// This function selects a chunk size (in samples) that is compatible with the VAD model's requirements
    /// and recommended for common sample rates. For 8000 Hz and 16000 Hz, it uses 512 samples by default,
    /// which is within the recommended range (512, 768, or 1024). For other sample rates, it uses a 30 ms
    /// window as the baseline, with a minimum of 1024 samples. The function also ensures that the chunk size
    /// always satisfies the model's constraint: `sample_rate <= 31.25 * chunk_size`.
    ///
    /// # Arguments
    ///
    /// - `sample_rate`: The audio sample rate in Hz (e.g., 16000 for 16kHz audio)
    ///
    /// # Returns
    ///
    /// The chunk size in number of samples, selected for optimal model compatibility.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```rust
    /// use subx_cli::services::vad::LocalVadDetector;
    /// let detector = LocalVadDetector::new(Default::default()).unwrap();
    /// let chunk_size = detector.calculate_chunk_size(16000);
    /// assert_eq!(chunk_size, 512);
    /// ```
    ///
    /// # Model Constraint
    ///
    /// The returned chunk size always satisfies: `sample_rate <= 31.25 * chunk_size`.
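    ///
    /// For sample rates other than 8000 Hz and 16000 Hz, the result follows the
    /// 30 ms baseline and the constraint above (the value below is derived from
    /// those formulas, not from external documentation):
    ///
    /// ```rust
    /// # use subx_cli::services::vad::LocalVadDetector;
    /// # let detector = LocalVadDetector::new(Default::default()).unwrap();
    /// // 44100 * 0.030 = 1323 samples, but ceil(44100 / 31.25) = 1412 is larger,
    /// // so the constraint takes precedence.
    /// assert_eq!(detector.calculate_chunk_size(44100), 1412);
    /// ```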
    pub fn calculate_chunk_size(&self, sample_rate: u32) -> usize {
        trace!("Calculating chunk size for sample_rate={}", sample_rate);
        let mut chunk_size = match sample_rate {
            8000 => 512,  // recommended: 512, 768, 1024
            16000 => 512, // recommended: 512, 768, 1024
            _ => {
                let chunk_ms = 30f32;
                let size = ((sample_rate as f32) * chunk_ms / 1000.0).round() as usize;
                size.max(1024)
            }
        };
        let min_chunk_size = ((sample_rate as f64) / 31.25).ceil() as usize;
        if chunk_size < min_chunk_size {
            warn!(
                "Chunk size {} too small for sample_rate {}, adjusting to {}",
                chunk_size, sample_rate, min_chunk_size
            );
            chunk_size = min_chunk_size;
        }
        debug!(
            "Final chunk_size for sample_rate {}: {}",
            sample_rate, chunk_size
        );
        chunk_size
    }
}

/// VAD detection result containing speech segments and processing metadata.
///
/// Represents the complete result of voice activity detection analysis,
/// including identified speech segments, timing information, and audio metadata.
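///
/// # Examples
///
/// Summing the total detected speech time (a sketch; `result` would come from
/// [`LocalVadDetector::detect_speech`]):
///
/// ```ignore
/// let total_speech: f64 = result.speech_segments.iter().map(|s| s.duration).sum();
/// println!("speech: {:.1}s of {:.1}s", total_speech, result.audio_info.duration_seconds);
/// ```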
#[derive(Debug, Clone)]
pub struct VadResult {
    /// Detected speech segments with timing information
    pub speech_segments: Vec<SpeechSegment>,
    /// Time taken to process the audio file
    pub processing_duration: Duration,
    /// Original audio file information
    pub audio_info: AudioInfo,
}

/// Individual speech segment identified by VAD.
///
/// Represents a continuous segment of detected speech with its start time,
/// end time, and duration.
#[derive(Debug, Clone)]
pub struct SpeechSegment {
    /// Start time of the speech segment in seconds
    pub start_time: f64,
    /// End time of the speech segment in seconds
    pub end_time: f64,
    /// Duration of the speech segment in seconds
    pub duration: f64,
}

/// Audio file metadata and properties.
///
/// Contains technical information about the processed audio file
/// including format, duration, and sample information.
#[derive(Debug, Clone)]
pub struct AudioInfo {
    /// Audio sample rate in Hz
    pub sample_rate: u32,
    /// Number of audio channels
    pub channels: u16,
    /// Total duration of audio in seconds
    pub duration_seconds: f64,
    /// Total number of audio samples
    pub total_samples: usize,
}