subx_cli/services/vad/sync_detector.rs

1use super::{LocalVadDetector, VadResult};
2use crate::config::VadConfig;
3use crate::core::formats::{Subtitle, SubtitleEntry};
4use crate::core::sync::{SyncMethod, SyncResult};
5use crate::{Result, error::SubXError};
6use serde_json::json;
7use std::path::Path;
8
9/// VAD-based subtitle synchronization detector.
10///
11/// Uses Voice Activity Detection to analyze audio files and calculate
12/// subtitle timing offsets by comparing detected speech segments with
13/// subtitle timing information.
14pub struct VadSyncDetector {
15    vad_detector: LocalVadDetector,
16}
17
18impl VadSyncDetector {
19    /// Create a new VAD sync detector.
20    ///
21    /// # Arguments
22    ///
23    /// * `config` - VAD configuration parameters
24    ///
25    /// # Returns
26    ///
27    /// A new `VadSyncDetector` instance
28    ///
29    /// # Errors
30    ///
31    /// Returns an error if the VAD detector cannot be initialized
32    pub fn new(config: VadConfig) -> Result<Self> {
33        Ok(Self {
34            vad_detector: LocalVadDetector::new(config)?,
35        })
36    }
37
38    /// Detect synchronization offset between audio and subtitle.
39    ///
40    /// Analyzes the entire audio file using VAD to identify speech segments
41    /// and compares them with subtitle timing to calculate the offset.
42    ///
43    /// # Arguments
44    ///
45    /// * `audio_path` - Path to the audio file to analyze
46    /// * `subtitle` - Subtitle data with timing information
47    /// * `_analysis_window_seconds` - Ignored parameter (processes entire file)
48    ///
49    /// # Returns
50    ///
51    /// Synchronization result with detected offset and confidence
52    ///
53    /// # Errors
54    ///
55    /// Returns an error if:
56    /// - Audio analysis fails
57    /// - Subtitle has no entries
58    /// - VAD processing fails
59    pub async fn detect_sync_offset(
60        &self,
61        audio_path: &Path,
62        subtitle: &Subtitle,
63        _analysis_window_seconds: u32, // Ignore this parameter, process entire file
64    ) -> Result<SyncResult> {
65        // 1. Get expected start time of first subtitle
66        let first_entry = self.get_first_subtitle_entry(subtitle)?;
67
68        // 2. Perform VAD analysis on entire audio file
69        let vad_result = self.vad_detector.detect_speech(audio_path).await?;
70
71        // 3. Analyze results: compare first speech segment with first subtitle timing
72        let analysis_result = self.analyze_vad_result(&vad_result, first_entry)?;
73
74        Ok(analysis_result)
75    }
76
77    fn get_first_subtitle_entry<'a>(&self, subtitle: &'a Subtitle) -> Result<&'a SubtitleEntry> {
78        subtitle
79            .entries
80            .first()
81            .ok_or_else(move || SubXError::audio_processing("No subtitle entries found"))
82    }
83
84    fn analyze_vad_result(
85        &self,
86        vad_result: &VadResult,
87        first_entry: &SubtitleEntry,
88    ) -> Result<SyncResult> {
89        // Detect first significant speech segment
90        let first_speech_time = self.find_first_significant_speech(vad_result)?;
91
92        // Calculate offset: actual speech start time - expected subtitle start time
93        let expected_start = first_entry.start_time.as_secs_f64();
94        let offset_seconds = first_speech_time - expected_start;
95
96        // Calculate confidence
97        let confidence = self.calculate_confidence(vad_result);
98
99        Ok(SyncResult {
100            offset_seconds: offset_seconds as f32,
101            confidence,
102            method_used: SyncMethod::LocalVad,
103            correlation_peak: 0.0,
104            additional_info: Some(json!({
105                "speech_segments_count": vad_result.speech_segments.len(),
106                "first_speech_start": first_speech_time,
107                "expected_subtitle_start": expected_start,
108                "processing_time_ms": vad_result.processing_duration.as_millis(),
109                "audio_duration": vad_result.audio_info.duration_seconds,
110                "detected_segments": vad_result.speech_segments.iter().map(|s| {
111                    json!({
112                        "start": s.start_time,
113                        "end": s.end_time,
114                        "duration": s.duration,
115                        "probability": s.probability
116                    })
117                }).collect::<Vec<_>>(),
118            })),
119            processing_duration: vad_result.processing_duration,
120            warnings: Vec::new(),
121        })
122    }
123
124    fn find_first_significant_speech(&self, vad_result: &VadResult) -> Result<f64> {
125        // Find the first significant speech segment
126        for segment in &vad_result.speech_segments {
127            // Check if segment is long enough and has high enough probability
128            if segment.duration >= 0.1 && segment.probability >= 0.5 {
129                return Ok(segment.start_time);
130            }
131        }
132
133        // If no significant speech segment found but speech segments exist, return first one
134        if let Some(first_segment) = vad_result.speech_segments.first() {
135            return Ok(first_segment.start_time);
136        }
137
138        Err(SubXError::audio_processing(
139            "No significant speech segments found in audio",
140        ))
141    }
142
143    fn calculate_confidence(&self, vad_result: &VadResult) -> f32 {
144        if vad_result.speech_segments.is_empty() {
145            return 0.0;
146        }
147
148        let mut confidence: f32 = 0.6; // Base local VAD confidence
149
150        // Adjust confidence based on speech segment count
151        let segments_count = vad_result.speech_segments.len();
152        if segments_count >= 1 {
153            confidence += 0.1;
154        }
155        if segments_count >= 3 {
156            confidence += 0.1;
157        }
158
159        // Adjust confidence based on first speech segment quality
160        if let Some(first_segment) = vad_result.speech_segments.first() {
161            // Longer speech segments increase confidence
162            if first_segment.duration >= 0.5 {
163                confidence += 0.1;
164            }
165            if first_segment.duration >= 1.0 {
166                confidence += 0.05;
167            }
168
169            // Higher probability increases confidence
170            if first_segment.probability >= 0.8 {
171                confidence += 0.05;
172            }
173        }
174
175        // Adjust confidence based on processing speed (local processing is usually fast)
176        if vad_result.processing_duration.as_secs() <= 1 {
177            confidence += 0.05;
178        }
179
180        confidence.min(0.95_f32) // Local VAD maximum confidence limit is 95%
181    }
182}