// subx_cli/services/vad/sync_detector.rs
use super::{LocalVadDetector, VadResult};
2use crate::config::VadConfig;
3use crate::core::formats::{Subtitle, SubtitleEntry};
4use crate::core::sync::{SyncMethod, SyncResult};
5use crate::{Result, error::SubXError};
6use log::debug;
7use serde_json::json;
8use std::path::Path;
9
/// Detects the time offset between an audio track and a subtitle file
/// using local voice-activity detection (VAD).
pub struct VadSyncDetector {
    /// Local VAD engine used to locate speech segments in the audio.
    vad_detector: LocalVadDetector,
}
18
19impl VadSyncDetector {
20 pub fn new(config: VadConfig) -> Result<Self> {
34 Ok(Self {
35 vad_detector: LocalVadDetector::new(config)?,
36 })
37 }
38
39 pub async fn detect_sync_offset(
61 &self,
62 audio_path: &Path,
63 subtitle: &Subtitle,
64 analysis_window_seconds: u32,
65 ) -> Result<SyncResult> {
66 debug!(
67 "[VadSyncDetector] Starting sync offset detection | audio_path: {:?}, subtitle entries: {}",
68 audio_path,
69 subtitle.entries.len()
70 );
71 let first_entry = self.get_first_subtitle_entry(subtitle)?;
73 debug!(
74 "[VadSyncDetector] First subtitle entry: start_time = {:.3}, end_time = {:.3}",
75 first_entry.start_time.as_secs_f64(),
76 first_entry.end_time.as_secs_f64()
77 );
78
79 debug!(
81 "[VadSyncDetector] Loading and cropping audio for VAD analysis: {:?}",
82 audio_path
83 );
84 let mut audio_data = self
85 .vad_detector
86 .audio_processor()
87 .load_and_prepare_audio_direct(audio_path)
88 .await?;
89 if analysis_window_seconds > 0 {
90 let sample_rate = audio_data.info.sample_rate;
91 let max_samples = (sample_rate as usize * analysis_window_seconds as usize)
92 .min(audio_data.samples.len());
93 audio_data.samples.truncate(max_samples);
94 audio_data.info.duration_seconds = audio_data.samples.len() as f64 / sample_rate as f64;
95 audio_data.info.total_samples = audio_data.samples.len();
96 debug!(
97 "[VadSyncDetector] Cropped audio to first {} seconds ({} samples)",
98 analysis_window_seconds, max_samples
99 );
100 }
101
102 debug!(
104 "[VadSyncDetector] Performing VAD analysis on (possibly cropped) audio file: {:?}",
105 audio_path
106 );
107 let vad_result = self
108 .vad_detector
109 .detect_speech_from_data(audio_data)
110 .await?;
111 debug!(
112 "[VadSyncDetector] VAD analysis complete | speech_segments: {}, processing_time_ms: {}",
113 vad_result.speech_segments.len(),
114 vad_result.processing_duration.as_millis()
115 );
116
117 debug!("[VadSyncDetector] Analyzing VAD result and subtitle alignment...");
119 let analysis_result = self.analyze_vad_result(&vad_result, first_entry)?;
120
121 debug!(
122 "[VadSyncDetector] Sync offset detection finished | offset_seconds: {:.3}, confidence: {:.3}",
123 analysis_result.offset_seconds, analysis_result.confidence
124 );
125 Ok(analysis_result)
126 }
127
128 fn get_first_subtitle_entry<'a>(&self, subtitle: &'a Subtitle) -> Result<&'a SubtitleEntry> {
129 subtitle
130 .entries
131 .first()
132 .ok_or_else(move || SubXError::audio_processing("No subtitle entries found"))
133 }
134
135 fn analyze_vad_result(
136 &self,
137 vad_result: &VadResult,
138 first_entry: &SubtitleEntry,
139 ) -> Result<SyncResult> {
140 let first_speech_time = self.find_first_significant_speech(vad_result)?;
142 debug!(
143 "[VadSyncDetector] Detected first significant speech segment: first_speech_time = {:.3} (seconds)",
144 first_speech_time
145 );
146 debug!(
147 "[VadSyncDetector] Speech segments count: {} | First segment: start = {:.3}, duration = {:.3}",
148 vad_result.speech_segments.len(),
149 vad_result
150 .speech_segments
151 .first()
152 .map(|s| s.start_time)
153 .unwrap_or(-1.0),
154 vad_result
155 .speech_segments
156 .first()
157 .map(|s| s.duration)
158 .unwrap_or(-1.0)
159 );
160
161 let expected_start = first_entry.start_time.as_secs_f64();
163 debug!(
164 "[VadSyncDetector] Expected subtitle start time: expected_start = {:.3} (seconds)",
165 expected_start
166 );
167 let offset_seconds = first_speech_time - expected_start;
168 debug!(
169 "[VadSyncDetector] Calculated offset_seconds = {:.3} (speech - subtitle)",
170 offset_seconds
171 );
172
173 let confidence = self.calculate_confidence(vad_result);
175 debug!(
176 "[VadSyncDetector] Calculated confidence score: {:.3}",
177 confidence
178 );
179
180 let additional_info = Some(json!({
181 "speech_segments_count": vad_result.speech_segments.len(),
182 "first_speech_start": first_speech_time,
183 "expected_subtitle_start": expected_start,
184 "processing_time_ms": vad_result.processing_duration.as_millis(),
185 "audio_duration": vad_result.audio_info.duration_seconds,
186 "detected_segments": vad_result.speech_segments.iter().map(|s| {
187 json!({
188 "start": s.start_time,
189 "end": s.end_time,
190 "duration": s.duration
191 })
192 }).collect::<Vec<_>>(),
193 }));
194
195 Ok(SyncResult {
196 offset_seconds: offset_seconds as f32,
197 confidence,
198 method_used: SyncMethod::LocalVad,
199 correlation_peak: 0.0,
200 additional_info,
201 processing_duration: vad_result.processing_duration,
202 warnings: Vec::new(),
203 })
204 }
205
206 fn find_first_significant_speech(&self, vad_result: &VadResult) -> Result<f64> {
207 for segment in &vad_result.speech_segments {
209 if segment.duration >= 0.1 {
211 return Ok(segment.start_time);
212 }
213 }
214
215 if let Some(first_segment) = vad_result.speech_segments.first() {
217 return Ok(first_segment.start_time);
218 }
219
220 Err(SubXError::audio_processing(
221 "No significant speech segments found in audio",
222 ))
223 }
224
225 fn calculate_confidence(&self, vad_result: &VadResult) -> f32 {
226 if vad_result.speech_segments.is_empty() {
227 return 0.0;
228 }
229
230 let mut confidence: f32 = 0.6; let segments_count = vad_result.speech_segments.len();
234 if segments_count >= 1 {
235 confidence += 0.1;
236 }
237 if segments_count >= 3 {
238 confidence += 0.1;
239 }
240
241 if let Some(first_segment) = vad_result.speech_segments.first() {
243 if first_segment.duration >= 0.5 {
245 confidence += 0.1;
246 }
247 if first_segment.duration >= 1.0 {
248 confidence += 0.05;
249 }
250 }
251
252 if vad_result.processing_duration.as_secs() <= 1 {
254 confidence += 0.05;
255 }
256
257 confidence.min(0.95_f32) }
259}