subx_cli/services/vad/
sync_detector.rs

1use super::{LocalVadDetector, VadResult};
2use crate::config::VadConfig;
3use crate::core::formats::{Subtitle, SubtitleEntry};
4use crate::core::sync::{SyncMethod, SyncResult};
5use crate::{Result, error::SubXError};
6use log::debug;
7use serde_json::json;
8use std::path::Path;
9
/// VAD-based subtitle synchronization detector.
///
/// Uses Voice Activity Detection to analyze audio files and calculate
/// subtitle timing offsets by comparing detected speech segments with
/// subtitle timing information.
///
/// Construct with [`VadSyncDetector::new`]; the main entry point is
/// `detect_sync_offset`, which processes the entire audio file.
pub struct VadSyncDetector {
    // Local VAD engine that produces the speech segments this detector
    // compares against subtitle timing.
    vad_detector: LocalVadDetector,
}
18
19impl VadSyncDetector {
20    /// Create a new VAD sync detector.
21    ///
22    /// # Arguments
23    ///
24    /// * `config` - VAD configuration parameters
25    ///
26    /// # Returns
27    ///
28    /// A new `VadSyncDetector` instance
29    ///
30    /// # Errors
31    ///
32    /// Returns an error if the VAD detector cannot be initialized
33    pub fn new(config: VadConfig) -> Result<Self> {
34        Ok(Self {
35            vad_detector: LocalVadDetector::new(config)?,
36        })
37    }
38
39    /// Detect synchronization offset between audio and subtitle.
40    ///
41    /// Analyzes the entire audio file using VAD to identify speech segments
42    /// and compares them with subtitle timing to calculate the offset.
43    ///
44    /// # Arguments
45    ///
46    /// * `audio_path` - Path to the audio file to analyze
47    /// * `subtitle` - Subtitle data with timing information
48    /// * `_analysis_window_seconds` - Ignored parameter (processes entire file)
49    ///
50    /// # Returns
51    ///
52    /// Synchronization result with detected offset and confidence
53    ///
54    /// # Errors
55    ///
56    /// Returns an error if:
57    /// - Audio analysis fails
58    /// - Subtitle has no entries
59    /// - VAD processing fails
60    pub async fn detect_sync_offset(
61        &self,
62        audio_path: &Path,
63        subtitle: &Subtitle,
64        _analysis_window_seconds: u32, // Ignore this parameter, process entire file
65    ) -> Result<SyncResult> {
66        debug!(
67            "[VadSyncDetector] Starting sync offset detection | audio_path: {:?}, subtitle entries: {}",
68            audio_path,
69            subtitle.entries.len()
70        );
71        // 1. Get expected start time of first subtitle
72        let first_entry = self.get_first_subtitle_entry(subtitle)?;
73        debug!(
74            "[VadSyncDetector] First subtitle entry: start_time = {:.3}, end_time = {:.3}",
75            first_entry.start_time.as_secs_f64(),
76            first_entry.end_time.as_secs_f64()
77        );
78
79        // 2. Perform VAD analysis on entire audio file
80        debug!(
81            "[VadSyncDetector] Performing VAD analysis on audio file: {:?}",
82            audio_path
83        );
84        let vad_result = self.vad_detector.detect_speech(audio_path).await?;
85        debug!(
86            "[VadSyncDetector] VAD analysis complete | speech_segments: {}, processing_time_ms: {}",
87            vad_result.speech_segments.len(),
88            vad_result.processing_duration.as_millis()
89        );
90
91        // 3. Analyze results: compare first speech segment with first subtitle timing
92        debug!("[VadSyncDetector] Analyzing VAD result and subtitle alignment...");
93        let analysis_result = self.analyze_vad_result(&vad_result, first_entry)?;
94
95        debug!(
96            "[VadSyncDetector] Sync offset detection finished | offset_seconds: {:.3}, confidence: {:.3}",
97            analysis_result.offset_seconds, analysis_result.confidence
98        );
99        Ok(analysis_result)
100    }
101
102    fn get_first_subtitle_entry<'a>(&self, subtitle: &'a Subtitle) -> Result<&'a SubtitleEntry> {
103        subtitle
104            .entries
105            .first()
106            .ok_or_else(move || SubXError::audio_processing("No subtitle entries found"))
107    }
108
109    fn analyze_vad_result(
110        &self,
111        vad_result: &VadResult,
112        first_entry: &SubtitleEntry,
113    ) -> Result<SyncResult> {
114        // Detect first significant speech segment
115        let first_speech_time = self.find_first_significant_speech(vad_result)?;
116        debug!(
117            "[VadSyncDetector] Detected first significant speech segment: first_speech_time = {:.3} (seconds)",
118            first_speech_time
119        );
120        debug!(
121            "[VadSyncDetector] Speech segments count: {} | First segment: start = {:.3}, duration = {:.3}",
122            vad_result.speech_segments.len(),
123            vad_result
124                .speech_segments
125                .first()
126                .map(|s| s.start_time)
127                .unwrap_or(-1.0),
128            vad_result
129                .speech_segments
130                .first()
131                .map(|s| s.duration)
132                .unwrap_or(-1.0)
133        );
134
135        // Calculate offset: actual speech start time - expected subtitle start time
136        let expected_start = first_entry.start_time.as_secs_f64();
137        debug!(
138            "[VadSyncDetector] Expected subtitle start time: expected_start = {:.3} (seconds)",
139            expected_start
140        );
141        let offset_seconds = first_speech_time - expected_start;
142        debug!(
143            "[VadSyncDetector] Calculated offset_seconds = {:.3} (speech - subtitle)",
144            offset_seconds
145        );
146
147        // Calculate confidence
148        let confidence = self.calculate_confidence(vad_result);
149        debug!(
150            "[VadSyncDetector] Calculated confidence score: {:.3}",
151            confidence
152        );
153
154        let additional_info = Some(json!({
155            "speech_segments_count": vad_result.speech_segments.len(),
156            "first_speech_start": first_speech_time,
157            "expected_subtitle_start": expected_start,
158            "processing_time_ms": vad_result.processing_duration.as_millis(),
159            "audio_duration": vad_result.audio_info.duration_seconds,
160            "detected_segments": vad_result.speech_segments.iter().map(|s| {
161                json!({
162                    "start": s.start_time,
163                    "end": s.end_time,
164                    "duration": s.duration
165                })
166            }).collect::<Vec<_>>(),
167        }));
168
169        Ok(SyncResult {
170            offset_seconds: offset_seconds as f32,
171            confidence,
172            method_used: SyncMethod::LocalVad,
173            correlation_peak: 0.0,
174            additional_info,
175            processing_duration: vad_result.processing_duration,
176            warnings: Vec::new(),
177        })
178    }
179
180    fn find_first_significant_speech(&self, vad_result: &VadResult) -> Result<f64> {
181        // Find the first significant speech segment
182        for segment in &vad_result.speech_segments {
183            // Check if segment is long enough
184            if segment.duration >= 0.1 {
185                return Ok(segment.start_time);
186            }
187        }
188
189        // If no significant speech segment found but speech segments exist, return first one
190        if let Some(first_segment) = vad_result.speech_segments.first() {
191            return Ok(first_segment.start_time);
192        }
193
194        Err(SubXError::audio_processing(
195            "No significant speech segments found in audio",
196        ))
197    }
198
199    fn calculate_confidence(&self, vad_result: &VadResult) -> f32 {
200        if vad_result.speech_segments.is_empty() {
201            return 0.0;
202        }
203
204        let mut confidence: f32 = 0.6; // Base local VAD confidence
205
206        // Adjust confidence based on speech segment count
207        let segments_count = vad_result.speech_segments.len();
208        if segments_count >= 1 {
209            confidence += 0.1;
210        }
211        if segments_count >= 3 {
212            confidence += 0.1;
213        }
214
215        // Adjust confidence based on first speech segment quality
216        if let Some(first_segment) = vad_result.speech_segments.first() {
217            // Longer speech segments increase confidence
218            if first_segment.duration >= 0.5 {
219                confidence += 0.1;
220            }
221            if first_segment.duration >= 1.0 {
222                confidence += 0.05;
223            }
224        }
225
226        // Adjust confidence based on processing speed (local processing is usually fast)
227        if vad_result.processing_duration.as_secs() <= 1 {
228            confidence += 0.05;
229        }
230
231        confidence.min(0.95_f32) // Local VAD maximum confidence limit is 95%
232    }
233}