subx_cli/services/vad/
sync_detector.rs

1use super::{LocalVadDetector, VadResult};
2use crate::config::VadConfig;
3use crate::core::formats::{Subtitle, SubtitleEntry};
4use crate::core::sync::{SyncMethod, SyncResult};
5use crate::{Result, error::SubXError};
6use log::debug;
7use serde_json::json;
8use std::path::Path;
9
/// VAD-based subtitle synchronization detector.
///
/// Uses Voice Activity Detection to analyze audio files and calculate
/// subtitle timing offsets by comparing detected speech segments with
/// subtitle timing information.
pub struct VadSyncDetector {
    // Underlying local VAD engine; owns audio loading/preparation and
    // speech-segment detection (see `LocalVadDetector`).
    vad_detector: LocalVadDetector,
}
18
19impl VadSyncDetector {
20    /// Create a new VAD sync detector.
21    ///
22    /// # Arguments
23    ///
24    /// * `config` - VAD configuration parameters
25    ///
26    /// # Returns
27    ///
28    /// A new `VadSyncDetector` instance
29    ///
30    /// # Errors
31    ///
32    /// Returns an error if the VAD detector cannot be initialized
33    pub fn new(config: VadConfig) -> Result<Self> {
34        Ok(Self {
35            vad_detector: LocalVadDetector::new(config)?,
36        })
37    }
38
39    /// Detect synchronization offset between audio and subtitle.
40    ///
41    /// Analyzes the entire audio file using VAD to identify speech segments
42    /// and compares them with subtitle timing to calculate the offset.
43    ///
44    /// # Arguments
45    ///
46    /// * `audio_path` - Path to the audio file to analyze
47    /// * `subtitle` - Subtitle data with timing information
48    /// * `_analysis_window_seconds` - Ignored parameter (processes entire file)
49    ///
50    /// # Returns
51    ///
52    /// Synchronization result with detected offset and confidence
53    ///
54    /// # Errors
55    ///
56    /// Returns an error if:
57    /// - Audio analysis fails
58    /// - Subtitle has no entries
59    /// - VAD processing fails
60    pub async fn detect_sync_offset(
61        &self,
62        audio_path: &Path,
63        subtitle: &Subtitle,
64        analysis_window_seconds: u32,
65    ) -> Result<SyncResult> {
66        debug!(
67            "[VadSyncDetector] Starting sync offset detection | audio_path: {:?}, subtitle entries: {}",
68            audio_path,
69            subtitle.entries.len()
70        );
71        // 1. Get expected start time of first subtitle
72        let first_entry = self.get_first_subtitle_entry(subtitle)?;
73        debug!(
74            "[VadSyncDetector] First subtitle entry: start_time = {:.3}, end_time = {:.3}",
75            first_entry.start_time.as_secs_f64(),
76            first_entry.end_time.as_secs_f64()
77        );
78
79        // 2. Load audio and crop if analysis window is specified (in seconds)
80        debug!(
81            "[VadSyncDetector] Loading and cropping audio for VAD analysis: {:?}",
82            audio_path
83        );
84        let mut audio_data = self
85            .vad_detector
86            .audio_processor()
87            .load_and_prepare_audio_direct(audio_path)
88            .await?;
89        if analysis_window_seconds > 0 {
90            let sample_rate = audio_data.info.sample_rate;
91            let max_samples = (sample_rate as usize * analysis_window_seconds as usize)
92                .min(audio_data.samples.len());
93            audio_data.samples.truncate(max_samples);
94            audio_data.info.duration_seconds = audio_data.samples.len() as f64 / sample_rate as f64;
95            audio_data.info.total_samples = audio_data.samples.len();
96            debug!(
97                "[VadSyncDetector] Cropped audio to first {} seconds ({} samples)",
98                analysis_window_seconds, max_samples
99            );
100        }
101
102        // 3. Perform VAD analysis
103        debug!(
104            "[VadSyncDetector] Performing VAD analysis on (possibly cropped) audio file: {:?}",
105            audio_path
106        );
107        let vad_result = self
108            .vad_detector
109            .detect_speech_from_data(audio_data)
110            .await?;
111        debug!(
112            "[VadSyncDetector] VAD analysis complete | speech_segments: {}, processing_time_ms: {}",
113            vad_result.speech_segments.len(),
114            vad_result.processing_duration.as_millis()
115        );
116
117        // 4. Analyze results: compare first speech segment with first subtitle timing
118        debug!("[VadSyncDetector] Analyzing VAD result and subtitle alignment...");
119        let analysis_result = self.analyze_vad_result(&vad_result, first_entry)?;
120
121        debug!(
122            "[VadSyncDetector] Sync offset detection finished | offset_seconds: {:.3}, confidence: {:.3}",
123            analysis_result.offset_seconds, analysis_result.confidence
124        );
125        Ok(analysis_result)
126    }
127
128    fn get_first_subtitle_entry<'a>(&self, subtitle: &'a Subtitle) -> Result<&'a SubtitleEntry> {
129        subtitle
130            .entries
131            .first()
132            .ok_or_else(move || SubXError::audio_processing("No subtitle entries found"))
133    }
134
135    fn analyze_vad_result(
136        &self,
137        vad_result: &VadResult,
138        first_entry: &SubtitleEntry,
139    ) -> Result<SyncResult> {
140        // Detect first significant speech segment
141        let first_speech_time = self.find_first_significant_speech(vad_result)?;
142        debug!(
143            "[VadSyncDetector] Detected first significant speech segment: first_speech_time = {:.3} (seconds)",
144            first_speech_time
145        );
146        debug!(
147            "[VadSyncDetector] Speech segments count: {} | First segment: start = {:.3}, duration = {:.3}",
148            vad_result.speech_segments.len(),
149            vad_result
150                .speech_segments
151                .first()
152                .map(|s| s.start_time)
153                .unwrap_or(-1.0),
154            vad_result
155                .speech_segments
156                .first()
157                .map(|s| s.duration)
158                .unwrap_or(-1.0)
159        );
160
161        // Calculate offset: actual speech start time - expected subtitle start time
162        let expected_start = first_entry.start_time.as_secs_f64();
163        debug!(
164            "[VadSyncDetector] Expected subtitle start time: expected_start = {:.3} (seconds)",
165            expected_start
166        );
167        let offset_seconds = first_speech_time - expected_start;
168        debug!(
169            "[VadSyncDetector] Calculated offset_seconds = {:.3} (speech - subtitle)",
170            offset_seconds
171        );
172
173        // Calculate confidence
174        let confidence = self.calculate_confidence(vad_result);
175        debug!(
176            "[VadSyncDetector] Calculated confidence score: {:.3}",
177            confidence
178        );
179
180        let additional_info = Some(json!({
181            "speech_segments_count": vad_result.speech_segments.len(),
182            "first_speech_start": first_speech_time,
183            "expected_subtitle_start": expected_start,
184            "processing_time_ms": vad_result.processing_duration.as_millis(),
185            "audio_duration": vad_result.audio_info.duration_seconds,
186            "detected_segments": vad_result.speech_segments.iter().map(|s| {
187                json!({
188                    "start": s.start_time,
189                    "end": s.end_time,
190                    "duration": s.duration
191                })
192            }).collect::<Vec<_>>(),
193        }));
194
195        Ok(SyncResult {
196            offset_seconds: offset_seconds as f32,
197            confidence,
198            method_used: SyncMethod::LocalVad,
199            correlation_peak: 0.0,
200            additional_info,
201            processing_duration: vad_result.processing_duration,
202            warnings: Vec::new(),
203        })
204    }
205
206    fn find_first_significant_speech(&self, vad_result: &VadResult) -> Result<f64> {
207        // Find the first significant speech segment
208        for segment in &vad_result.speech_segments {
209            // Check if segment is long enough
210            if segment.duration >= 0.1 {
211                return Ok(segment.start_time);
212            }
213        }
214
215        // If no significant speech segment found but speech segments exist, return first one
216        if let Some(first_segment) = vad_result.speech_segments.first() {
217            return Ok(first_segment.start_time);
218        }
219
220        Err(SubXError::audio_processing(
221            "No significant speech segments found in audio",
222        ))
223    }
224
225    fn calculate_confidence(&self, vad_result: &VadResult) -> f32 {
226        if vad_result.speech_segments.is_empty() {
227            return 0.0;
228        }
229
230        let mut confidence: f32 = 0.6; // Base local VAD confidence
231
232        // Adjust confidence based on speech segment count
233        let segments_count = vad_result.speech_segments.len();
234        if segments_count >= 1 {
235            confidence += 0.1;
236        }
237        if segments_count >= 3 {
238            confidence += 0.1;
239        }
240
241        // Adjust confidence based on first speech segment quality
242        if let Some(first_segment) = vad_result.speech_segments.first() {
243            // Longer speech segments increase confidence
244            if first_segment.duration >= 0.5 {
245                confidence += 0.1;
246            }
247            if first_segment.duration >= 1.0 {
248                confidence += 0.05;
249            }
250        }
251
252        // Adjust confidence based on processing speed (local processing is usually fast)
253        if vad_result.processing_duration.as_secs() <= 1 {
254            confidence += 0.05;
255        }
256
257        confidence.min(0.95_f32) // Local VAD maximum confidence limit is 95%
258    }
259}