subx_cli/services/vad/
detector.rs

use super::audio_processor::VadAudioProcessor;
use crate::config::VadConfig;
use crate::{Result, error::SubXError};
use std::path::Path;
use std::time::{Duration, Instant};
use voice_activity_detector::{IteratorExt, LabeledAudio, VoiceActivityDetector};

/// Local voice activity detector.
///
/// Provides voice activity detection using local processing without
/// external API calls. Uses the `voice_activity_detector` crate for
/// speech detection and analysis.
pub struct LocalVadDetector {
    config: VadConfig,
    audio_processor: VadAudioProcessor,
}

impl LocalVadDetector {
    /// Create a new local VAD detector.
    ///
    /// # Arguments
    ///
    /// * `config` - VAD configuration parameters
    ///
    /// # Returns
    ///
    /// A new `LocalVadDetector` instance
    ///
    /// # Errors
    ///
    /// Returns an error if the audio processor cannot be initialized
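    ///
    /// # Examples
    ///
    /// A minimal construction sketch, not compiled as a doctest. It assumes
    /// `VadConfig` provides a `Default` implementation and that the type is
    /// reachable at this module path; adjust both to the actual crate layout.
    ///
    /// ```ignore
    /// use subx_cli::config::VadConfig;
    /// use subx_cli::services::vad::detector::LocalVadDetector;
    ///
    /// let detector = LocalVadDetector::new(VadConfig::default())?;
    /// ```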
    pub fn new(config: VadConfig) -> Result<Self> {
        // Build the audio processor first so that `config` can then be moved
        // into the struct without an intermediate clone.
        let audio_processor = VadAudioProcessor::new(config.sample_rate, 1)?;
        Ok(Self {
            config,
            audio_processor,
        })
    }

    /// Detect speech activity in an audio file.
    ///
    /// Processes the entire audio file to identify speech segments
    /// with timestamps and confidence scores.
    ///
    /// # Arguments
    ///
    /// * `audio_path` - Path to the audio file to analyze
    ///
    /// # Returns
    ///
    /// VAD analysis results including speech segments and metadata
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - Audio file cannot be loaded
    /// - VAD processing fails
    /// - Audio format is unsupported
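    ///
    /// # Examples
    ///
    /// A usage sketch, not compiled as a doctest. It assumes a `detector`
    /// built with [`LocalVadDetector::new`] and an `audio.wav` on disk:
    ///
    /// ```ignore
    /// use std::path::Path;
    ///
    /// let result = detector.detect_speech(Path::new("audio.wav")).await?;
    /// for segment in &result.speech_segments {
    ///     println!("speech: {:.2}s - {:.2}s", segment.start_time, segment.end_time);
    /// }
    /// ```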
    pub async fn detect_speech(&self, audio_path: &Path) -> Result<VadResult> {
        let start_time = Instant::now();

        // 1. Load and preprocess audio
        let audio_data = self
            .audio_processor
            .load_and_prepare_audio(audio_path)
            .await?;

        // 2. Create VAD instance
        let vad = VoiceActivityDetector::builder()
            .sample_rate(self.config.sample_rate)
            .chunk_size(self.config.chunk_size)
            .build()
            .map_err(|e| SubXError::audio_processing(format!("Failed to create VAD: {}", e)))?;

        // 3. Execute speech detection
        let speech_segments = self.detect_speech_segments(vad, &audio_data.samples)?;

        let processing_duration = start_time.elapsed();

        Ok(VadResult {
            speech_segments,
            processing_duration,
            audio_info: audio_data.info,
        })
    }

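    /// Label each fixed-size chunk as speech or non-speech, then fold the
    /// labels into time-stamped segments.
    ///
    /// Each chunk covers `chunk_size / sample_rate` seconds; for example,
    /// with a chunk size of 512 samples at 16 kHz a chunk spans 32 ms.
    /// Consecutive speech chunks are coalesced, segments shorter than
    /// `min_speech_duration_ms` are dropped, and nearby segments are then
    /// merged by [`Self::merge_close_segments`].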
    fn detect_speech_segments(
        &self,
        vad: VoiceActivityDetector,
        samples: &[i16],
    ) -> Result<Vec<SpeechSegment>> {
        let mut segments = Vec::new();
        let chunk_duration_seconds = self.config.chunk_size as f64 / self.config.sample_rate as f64;

        // Use label functionality to identify speech and non-speech segments
        let labels: Vec<LabeledAudio<i16>> = samples
            .iter()
            .copied()
            .label(
                vad,
                self.config.sensitivity,
                self.config.padding_chunks as usize,
            )
            .collect();

        let mut current_speech_start: Option<f64> = None;
        let mut chunk_index = 0;

        for label in labels {
            let chunk_start_time = chunk_index as f64 * chunk_duration_seconds;

            match label {
                LabeledAudio::Speech(_chunk) => {
                    if current_speech_start.is_none() {
                        current_speech_start = Some(chunk_start_time);
                    }
                }
                LabeledAudio::NonSpeech(_chunk) => {
                    if let Some(start_time) = current_speech_start.take() {
                        let end_time = chunk_start_time;
                        let duration = end_time - start_time;

                        // Filter out speech segments that are too short
                        if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
                            segments.push(SpeechSegment {
                                start_time,
                                end_time,
                                probability: self.config.sensitivity, // Use configured sensitivity as probability
                                duration,
                            });
                        }
                    }
                }
            }

            chunk_index += 1;
        }

        // Close out any speech segment still open at the end of the audio
        if let Some(start_time) = current_speech_start {
            let end_time = chunk_index as f64 * chunk_duration_seconds;
            let duration = end_time - start_time;

            if duration >= self.config.min_speech_duration_ms as f64 / 1000.0 {
                segments.push(SpeechSegment {
                    start_time,
                    end_time,
                    probability: self.config.sensitivity,
                    duration,
                });
            }
        }

        // Merge close segments
        Ok(self.merge_close_segments(segments))
    }

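    /// Merge segments separated by a gap of at most `speech_merge_gap_ms`.
    ///
    /// For example, with a 200 ms merge gap, segments at 1.00-2.00 s and
    /// 2.15-3.00 s (a 150 ms gap) become a single 1.00-3.00 s segment that
    /// keeps the higher of the two probabilities.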
    fn merge_close_segments(&self, segments: Vec<SpeechSegment>) -> Vec<SpeechSegment> {
        if segments.is_empty() {
            return segments;
        }

        let mut merged = Vec::new();
        let mut current = segments[0].clone();
        let merge_threshold = self.config.speech_merge_gap_ms as f64 / 1000.0;

        for segment in segments.into_iter().skip(1) {
            if segment.start_time - current.end_time <= merge_threshold {
                // Merge segments
                current.end_time = segment.end_time;
                current.duration = current.end_time - current.start_time;
                current.probability = current.probability.max(segment.probability);
            } else {
                // Push the completed segment and start a new one
                merged.push(current);
                current = segment;
            }
        }

        merged.push(current);
        merged
    }
}

/// VAD detection result containing speech segments and processing metadata.
///
/// Represents the complete result of voice activity detection analysis,
/// including identified speech segments, timing information, and audio metadata.
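///
/// # Examples
///
/// Consuming a result (a sketch; `result` is assumed to come from
/// [`LocalVadDetector::detect_speech`]):
///
/// ```ignore
/// let total_speech: f64 = result.speech_segments.iter().map(|s| s.duration).sum();
/// println!(
///     "{:.1}s of speech in {:.1}s of audio",
///     total_speech, result.audio_info.duration_seconds
/// );
/// ```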
#[derive(Debug, Clone)]
pub struct VadResult {
    /// Detected speech segments with timing and confidence
    pub speech_segments: Vec<SpeechSegment>,
    /// Time taken to process the audio file
    pub processing_duration: Duration,
    /// Original audio file information
    pub audio_info: AudioInfo,
}

/// Individual speech segment identified by VAD.
///
/// Represents a continuous segment of detected speech with timing
/// and confidence information.
#[derive(Debug, Clone)]
pub struct SpeechSegment {
    /// Start time of the speech segment in seconds
    pub start_time: f64,
    /// End time of the speech segment in seconds
    pub end_time: f64,
    /// Detection confidence in the range 0.0-1.0 (currently the configured sensitivity)
    pub probability: f32,
    /// Duration of the speech segment in seconds
    pub duration: f64,
}

/// Audio file metadata and properties.
///
/// Contains technical information about the processed audio file
/// including format, duration, and sample information.
#[derive(Debug, Clone)]
pub struct AudioInfo {
    /// Audio sample rate in Hz
    pub sample_rate: u32,
    /// Number of audio channels
    pub channels: u16,
    /// Total duration of audio in seconds
    pub duration_seconds: f64,
    /// Total number of audio samples
    pub total_samples: usize,
}