subx_cli/services/vad/
detector.rs

1use super::audio_processor::VadAudioProcessor;
2use crate::config::VadConfig;
3use crate::{Result, error::SubXError};
4use log::{debug, trace, warn};
5use std::time::{Duration, Instant};
6use voice_activity_detector::{IteratorExt, LabeledAudio, VoiceActivityDetector};
7
/// Local voice activity detector.
///
/// Provides voice activity detection using local processing without
/// external API calls. Uses the `voice_activity_detector` crate for
/// speech detection and analysis.
pub struct LocalVadDetector {
    /// Detection parameters: sensitivity, padding chunks, and minimum
    /// speech-segment duration (all read during segment detection).
    config: VadConfig,
    /// Audio pre-processing helper; exposed read-only via `audio_processor()`.
    audio_processor: VadAudioProcessor,
}
17
18impl LocalVadDetector {
19    /// Create a new local VAD detector.
20    ///
21    /// # Arguments
22    ///
23    /// * `config` - VAD configuration parameters
24    ///
25    /// # Returns
26    ///
27    /// A new `LocalVadDetector` instance
28    ///
29    /// # Errors
30    ///
31    /// Returns an error if the audio processor cannot be initialized
32    pub fn new(config: VadConfig) -> Result<Self> {
33        debug!("Initializing LocalVadDetector with config: {:?}", config);
34        Ok(Self {
35            config,
36            audio_processor: VadAudioProcessor::new()?,
37        })
38    }
39
40    /// Detect speech activity in a ProcessedAudioData (for partial audio analysis).
41    ///
42    /// # Arguments
43    ///
44    /// * `audio_data` - Pre-processed audio data (can be cropped)
45    ///
46    /// # Returns
47    ///
48    /// VAD analysis results including speech segments and metadata
49    pub async fn detect_speech_from_data(
50        &self,
51        mut audio_data: crate::services::vad::audio_processor::ProcessedAudioData,
52    ) -> Result<VadResult> {
53        debug!(
54            "Starting speech detection for ProcessedAudioData: sample_rate={}, duration={}",
55            audio_data.info.sample_rate, audio_data.info.duration_seconds
56        );
57        // New: return error directly if audio data is empty
58        if audio_data.samples.is_empty() {
59            return Err(SubXError::audio_processing(
60                "Audio data is empty".to_string(),
61            ));
62        }
63        let start_time = Instant::now();
64
65        // 1.5. Resample if needed (always to 16000 if not 8000/16000)
66        if audio_data.info.sample_rate != 8000 && audio_data.info.sample_rate != 16000 {
67            debug!(
68                "Resampling audio from {}Hz to 16000Hz...",
69                audio_data.info.sample_rate
70            );
71            use crate::services::vad::resample::resample_to_target_rate;
72            let resampled =
73                resample_to_target_rate(&audio_data.samples, audio_data.info.sample_rate, 16000)?;
74            let new_len = resampled.len();
75            audio_data.samples = resampled;
76            audio_data.info.sample_rate = 16000;
77            audio_data.info.duration_seconds = new_len as f64 / 16000.0;
78            audio_data.info.total_samples = new_len;
79            debug!(
80                "Resampling complete: new sample_rate=16000, total_samples={}, duration={:.3}s",
81                new_len, audio_data.info.duration_seconds
82            );
83        }
84
85        // 2. Calculate chunk size and create VAD with actual sample rate
86        let chunk_size = self.calculate_chunk_size(audio_data.info.sample_rate);
87        debug!(
88            "Calculated VAD chunk_size={} for sample_rate={}",
89            chunk_size, audio_data.info.sample_rate
90        );
91        let vad = VoiceActivityDetector::builder()
92            .sample_rate(audio_data.info.sample_rate)
93            .chunk_size(chunk_size)
94            .build()
95            .map_err(|e| {
96                warn!("Failed to create VAD instance: {}", e);
97                SubXError::audio_processing(format!("Failed to create VAD: {}", e))
98            })?;
99
100        // 3. Execute speech detection
101        trace!("Running speech segment detection");
102        let speech_segments =
103            self.detect_speech_segments(vad, &audio_data.samples, audio_data.info.sample_rate)?;
104
105        let processing_duration = start_time.elapsed();
106        debug!(
107            "Speech detection completed in {:?} seconds, segments found: {}",
108            processing_duration,
109            speech_segments.len()
110        );
111
112        Ok(VadResult {
113            speech_segments,
114            processing_duration,
115            audio_info: audio_data.info,
116        })
117    }
118
119    fn detect_speech_segments(
120        &self,
121        mut vad: VoiceActivityDetector,
122        samples: &[i16],
123        sample_rate: u32,
124    ) -> Result<Vec<SpeechSegment>> {
125        trace!(
126            "Detecting speech segments: samples={}, sample_rate={}",
127            samples.len(),
128            sample_rate
129        );
130        let mut segments = Vec::new();
131        let chunk_size = self.calculate_chunk_size(sample_rate);
132        let chunk_duration_seconds = chunk_size as f64 / sample_rate as f64;
133
134        // Use label functionality to identify speech and non-speech segments
135        let vad_threshold = 1.0 - self.config.sensitivity;
136        debug!(
137            "VAD threshold set to {} (sensitivity={})",
138            vad_threshold, self.config.sensitivity
139        );
140        let labels: Vec<LabeledAudio<i16>> = samples
141            .iter()
142            .copied()
143            .label(&mut vad, vad_threshold, self.config.padding_chunks as usize)
144            .collect();
145        trace!("Labeling complete, total chunks: {}", labels.len());
146
147        let mut current_speech_start: Option<f64> = None;
148        let mut chunk_index = 0;
149
150        for label in labels {
151            let chunk_start_time = chunk_index as f64 * chunk_duration_seconds;
152            match label {
153                LabeledAudio::Speech(_chunk) => {
154                    if current_speech_start.is_none() {
155                        trace!(
156                            "Speech started at {:.3}s (chunk #{})",
157                            chunk_start_time, chunk_index
158                        );
159                        current_speech_start = Some(chunk_start_time);
160                    }
161                }
162                LabeledAudio::NonSpeech(_chunk) => {
163                    if let Some(start_time) = current_speech_start.take() {
164                        let end_time = chunk_start_time;
165                        let duration = end_time - start_time;
166                        trace!(
167                            "Speech ended at {:.3}s (duration {:.3}s)",
168                            end_time, duration
169                        );
170                        // Filter out speech segments that are too short
171                        if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
172                            trace!(
173                                "Detected speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
174                                start_time, end_time, duration
175                            );
176                            segments.push(SpeechSegment {
177                                start_time,
178                                end_time,
179                                duration,
180                            });
181                        } else {
182                            trace!(
183                                "Discarded short segment: start={:.3}s, end={:.3}s, duration={:.3}s",
184                                start_time, end_time, duration
185                            );
186                        }
187                    }
188                }
189            }
190            chunk_index += 1;
191        }
192
193        // Handle the last speech segment (if exists)
194        if let Some(start_time) = current_speech_start {
195            let end_time = chunk_index as f64 * chunk_duration_seconds;
196            let duration = end_time - start_time;
197            trace!(
198                "Final speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
199                start_time, end_time, duration
200            );
201            if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
202                trace!(
203                    "Detected speech segment: start={:.3}s, end={:.3}s, duration={:.3}s",
204                    start_time, end_time, duration
205                );
206                segments.push(SpeechSegment {
207                    start_time,
208                    end_time,
209                    duration,
210                });
211            } else {
212                trace!(
213                    "Discarded short final segment: start={:.3}s, end={:.3}s, duration={:.3}s",
214                    start_time, end_time, duration
215                );
216            }
217        }
218
219        debug!("Speech segments detected: {}", segments.len());
220        Ok(segments)
221    }
222
223    /// Dynamically calculates the optimal VAD chunk size for a given audio sample rate using the Silero VAD V5 model.
224    ///
225    /// This function selects a chunk size (in samples) compatible with the Silero VAD V5 model's strict requirements.
226    /// For 8 kHz audio, only a 256-sample window is supported. For 16 kHz audio, only a 512-sample window is supported.
227    /// For sample rates that are a multiple of 16 kHz (e.g., 32 kHz, 48 kHz), a 512-sample window is also used, as required by the model.
228    ///
229    /// # Arguments
230    ///
231    /// - `sample_rate`: The audio sample rate in Hz (e.g., 8000, 16000)
232    ///
233    /// # Returns
234    ///
235    /// The chunk size in number of samples, as required by the Silero VAD V5 model.
236    ///
237    /// # Model Reference
238    ///
239    /// This logic follows the requirements of the [Silero VAD V5 model.](https://github.com/snakers4/silero-vad/releases/tag/v5.0)
240    ///
241    /// # Panics
242    ///
243    /// This function will panic if the sample rate is not supported by the model.
244    ///
245    /// # Examples
246    ///
247    /// ```rust
248    /// use subx_cli::services::vad::LocalVadDetector;
249    /// let detector = LocalVadDetector::new(Default::default()).unwrap();
250    /// assert_eq!(detector.calculate_chunk_size(8000), 256);
251    /// assert_eq!(detector.calculate_chunk_size(16000), 512);
252    /// ```
253    pub fn calculate_chunk_size(&self, sample_rate: u32) -> usize {
254        trace!("Calculating chunk size for sample_rate={}", sample_rate);
255        let chunk_size = match sample_rate {
256            8000 => 256,
257            16000 => 512,
258            _ => panic!(
259                "Unsupported VAD sample_rate={}. Only 8kHz/256, 16kHz/512 are allowed.",
260                sample_rate
261            ),
262        };
263        debug!(
264            "Final chunk_size for sample_rate {}: {}",
265            sample_rate, chunk_size
266        );
267        chunk_size
268    }
269
270    /// Get the internal VadAudioProcessor instance (for advanced use, e.g. partial audio cropping)
271    pub fn audio_processor(&self) -> &VadAudioProcessor {
272        &self.audio_processor
273    }
274}
275
/// VAD detection result containing speech segments and processing metadata.
///
/// Represents the complete result of voice activity detection analysis,
/// including identified speech segments, timing information, and audio metadata.
#[derive(Debug, Clone)]
pub struct VadResult {
    /// Detected speech segments with timing information
    pub speech_segments: Vec<SpeechSegment>,
    /// Time taken to process the audio data
    pub processing_duration: Duration,
    /// Metadata of the analyzed audio (reflects any resampling applied before detection)
    pub audio_info: AudioInfo,
}
289
/// Individual speech segment identified by VAD.
///
/// Represents a continuous segment of detected speech with timing
/// information (no per-segment confidence score is stored).
#[derive(Debug, Clone)]
pub struct SpeechSegment {
    /// Start time of the speech segment in seconds
    pub start_time: f64,
    /// End time of the speech segment in seconds
    pub end_time: f64,
    /// Duration of the speech segment in seconds (`end_time - start_time`)
    pub duration: f64,
}
303
/// Audio metadata and properties.
///
/// Contains technical information about the processed audio
/// including sample rate, channel count, duration, and sample count.
#[derive(Debug, Clone)]
pub struct AudioInfo {
    /// Audio sample rate in Hz
    pub sample_rate: u32,
    /// Number of audio channels
    pub channels: u16,
    /// Total duration of audio in seconds
    pub duration_seconds: f64,
    /// Total number of audio samples
    pub total_samples: usize,
}