// subx_cli/services/vad/sync_detector.rs
use super::{LocalVadDetector, VadResult};
use crate::config::VadConfig;
use crate::core::formats::{Subtitle, SubtitleEntry};
use crate::core::sync::{SyncMethod, SyncResult};
use crate::{Result, error::SubXError};
use log::debug;
use serde_json::json;
use std::path::Path;
9
10pub struct VadSyncDetector {
16 vad_detector: LocalVadDetector,
17}
18
19impl VadSyncDetector {
20 pub fn new(config: VadConfig) -> Result<Self> {
34 Ok(Self {
35 vad_detector: LocalVadDetector::new(config)?,
36 })
37 }
38
39 pub async fn detect_sync_offset(
61 &self,
62 audio_path: &Path,
63 subtitle: &Subtitle,
64 _analysis_window_seconds: u32, ) -> Result<SyncResult> {
66 debug!(
67 "[VadSyncDetector] Starting sync offset detection | audio_path: {:?}, subtitle entries: {}",
68 audio_path,
69 subtitle.entries.len()
70 );
71 let first_entry = self.get_first_subtitle_entry(subtitle)?;
73 debug!(
74 "[VadSyncDetector] First subtitle entry: start_time = {:.3}, end_time = {:.3}",
75 first_entry.start_time.as_secs_f64(),
76 first_entry.end_time.as_secs_f64()
77 );
78
79 debug!(
81 "[VadSyncDetector] Performing VAD analysis on audio file: {:?}",
82 audio_path
83 );
84 let vad_result = self.vad_detector.detect_speech(audio_path).await?;
85 debug!(
86 "[VadSyncDetector] VAD analysis complete | speech_segments: {}, processing_time_ms: {}",
87 vad_result.speech_segments.len(),
88 vad_result.processing_duration.as_millis()
89 );
90
91 debug!("[VadSyncDetector] Analyzing VAD result and subtitle alignment...");
93 let analysis_result = self.analyze_vad_result(&vad_result, first_entry)?;
94
95 debug!(
96 "[VadSyncDetector] Sync offset detection finished | offset_seconds: {:.3}, confidence: {:.3}",
97 analysis_result.offset_seconds, analysis_result.confidence
98 );
99 Ok(analysis_result)
100 }
101
102 fn get_first_subtitle_entry<'a>(&self, subtitle: &'a Subtitle) -> Result<&'a SubtitleEntry> {
103 subtitle
104 .entries
105 .first()
106 .ok_or_else(move || SubXError::audio_processing("No subtitle entries found"))
107 }
108
109 fn analyze_vad_result(
110 &self,
111 vad_result: &VadResult,
112 first_entry: &SubtitleEntry,
113 ) -> Result<SyncResult> {
114 let first_speech_time = self.find_first_significant_speech(vad_result)?;
116 debug!(
117 "[VadSyncDetector] Detected first significant speech segment: first_speech_time = {:.3} (seconds)",
118 first_speech_time
119 );
120 debug!(
121 "[VadSyncDetector] Speech segments count: {} | First segment: start = {:.3}, duration = {:.3}",
122 vad_result.speech_segments.len(),
123 vad_result
124 .speech_segments
125 .first()
126 .map(|s| s.start_time)
127 .unwrap_or(-1.0),
128 vad_result
129 .speech_segments
130 .first()
131 .map(|s| s.duration)
132 .unwrap_or(-1.0)
133 );
134
135 let expected_start = first_entry.start_time.as_secs_f64();
137 debug!(
138 "[VadSyncDetector] Expected subtitle start time: expected_start = {:.3} (seconds)",
139 expected_start
140 );
141 let offset_seconds = first_speech_time - expected_start;
142 debug!(
143 "[VadSyncDetector] Calculated offset_seconds = {:.3} (speech - subtitle)",
144 offset_seconds
145 );
146
147 let confidence = self.calculate_confidence(vad_result);
149 debug!(
150 "[VadSyncDetector] Calculated confidence score: {:.3}",
151 confidence
152 );
153
154 let additional_info = Some(json!({
155 "speech_segments_count": vad_result.speech_segments.len(),
156 "first_speech_start": first_speech_time,
157 "expected_subtitle_start": expected_start,
158 "processing_time_ms": vad_result.processing_duration.as_millis(),
159 "audio_duration": vad_result.audio_info.duration_seconds,
160 "detected_segments": vad_result.speech_segments.iter().map(|s| {
161 json!({
162 "start": s.start_time,
163 "end": s.end_time,
164 "duration": s.duration
165 })
166 }).collect::<Vec<_>>(),
167 }));
168
169 Ok(SyncResult {
170 offset_seconds: offset_seconds as f32,
171 confidence,
172 method_used: SyncMethod::LocalVad,
173 correlation_peak: 0.0,
174 additional_info,
175 processing_duration: vad_result.processing_duration,
176 warnings: Vec::new(),
177 })
178 }
179
180 fn find_first_significant_speech(&self, vad_result: &VadResult) -> Result<f64> {
181 for segment in &vad_result.speech_segments {
183 if segment.duration >= 0.1 {
185 return Ok(segment.start_time);
186 }
187 }
188
189 if let Some(first_segment) = vad_result.speech_segments.first() {
191 return Ok(first_segment.start_time);
192 }
193
194 Err(SubXError::audio_processing(
195 "No significant speech segments found in audio",
196 ))
197 }
198
199 fn calculate_confidence(&self, vad_result: &VadResult) -> f32 {
200 if vad_result.speech_segments.is_empty() {
201 return 0.0;
202 }
203
204 let mut confidence: f32 = 0.6; let segments_count = vad_result.speech_segments.len();
208 if segments_count >= 1 {
209 confidence += 0.1;
210 }
211 if segments_count >= 3 {
212 confidence += 0.1;
213 }
214
215 if let Some(first_segment) = vad_result.speech_segments.first() {
217 if first_segment.duration >= 0.5 {
219 confidence += 0.1;
220 }
221 if first_segment.duration >= 1.0 {
222 confidence += 0.05;
223 }
224 }
225
226 if vad_result.processing_duration.as_secs() <= 1 {
228 confidence += 0.05;
229 }
230
231 confidence.min(0.95_f32) }
233}