// subx_cli/services/vad/sync_detector.rs

use super::{LocalVadDetector, VadResult};
use crate::config::VadConfig;
use crate::core::formats::{Subtitle, SubtitleEntry};
use crate::core::sync::{SyncMethod, SyncResult};
use crate::{Result, error::SubXError};
use serde_json::json;
use std::path::Path;

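/// Voice-activity-based sync detector: locates the first significant speech
/// segment in the audio and compares it with the start time of the first
/// subtitle entry to estimate the subtitle offset.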
pub struct VadSyncDetector {
    vad_detector: LocalVadDetector,
}

impl VadSyncDetector {
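    /// Creates a detector backed by a `LocalVadDetector` built from `config`.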
    pub fn new(config: VadConfig) -> Result<Self> {
        Ok(Self {
            vad_detector: LocalVadDetector::new(config)?,
        })
    }

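    /// Runs speech detection on `audio_path` and returns the estimated offset
    /// between the detected speech and the subtitle timeline.
    ///
    /// `_analysis_window_seconds` is currently unused by this implementation.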
    pub async fn detect_sync_offset(
        &self,
        audio_path: &Path,
        subtitle: &Subtitle,
        _analysis_window_seconds: u32,
    ) -> Result<SyncResult> {
        let first_entry = self.get_first_subtitle_entry(subtitle)?;

        let vad_result = self.vad_detector.detect_speech(audio_path).await?;

        let analysis_result = self.analyze_vad_result(&vad_result, first_entry)?;

        Ok(analysis_result)
    }

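    /// Returns the first subtitle entry, or an error if the subtitle is empty.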
    fn get_first_subtitle_entry<'a>(&self, subtitle: &'a Subtitle) -> Result<&'a SubtitleEntry> {
        subtitle
            .entries
            .first()
            .ok_or_else(|| SubXError::audio_processing("No subtitle entries found"))
    }

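    /// Builds a `SyncResult` from the VAD output: the offset is the difference
    /// between the first significant speech time and the expected start of the
    /// first subtitle entry.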
    fn analyze_vad_result(
        &self,
        vad_result: &VadResult,
        first_entry: &SubtitleEntry,
    ) -> Result<SyncResult> {
        let first_speech_time = self.find_first_significant_speech(vad_result)?;

        let expected_start = first_entry.start_time.as_secs_f64();
        let offset_seconds = first_speech_time - expected_start;

        let confidence = self.calculate_confidence(vad_result);

        Ok(SyncResult {
            offset_seconds: offset_seconds as f32,
            confidence,
            method_used: SyncMethod::LocalVad,
            correlation_peak: 0.0,
            additional_info: Some(json!({
                "speech_segments_count": vad_result.speech_segments.len(),
                "first_speech_start": first_speech_time,
                "expected_subtitle_start": expected_start,
                "processing_time_ms": vad_result.processing_duration.as_millis(),
                "audio_duration": vad_result.audio_info.duration_seconds,
                "detected_segments": vad_result.speech_segments.iter().map(|s| {
                    json!({
                        "start": s.start_time,
                        "end": s.end_time,
                        "duration": s.duration,
                        "probability": s.probability
                    })
                }).collect::<Vec<_>>(),
            })),
            processing_duration: vad_result.processing_duration,
            warnings: Vec::new(),
        })
    }

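    /// Finds the start time of the first speech segment that is long and
    /// confident enough to be treated as real speech, falling back to the
    /// first detected segment if none qualifies.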
    fn find_first_significant_speech(&self, vad_result: &VadResult) -> Result<f64> {
        // A segment counts as significant if it lasts at least 100 ms and has a
        // detection probability of at least 0.5.
        for segment in &vad_result.speech_segments {
            if segment.duration >= 0.1 && segment.probability >= 0.5 {
                return Ok(segment.start_time);
            }
        }

        // No segment met the threshold; fall back to the first detected segment.
        if let Some(first_segment) = vad_result.speech_segments.first() {
            return Ok(first_segment.start_time);
        }

        Err(SubXError::audio_processing(
            "No significant speech segments found in audio",
        ))
    }

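    /// Heuristic confidence score: starts from a 0.6 baseline and adds small
    /// bonuses for segment count, the length and probability of the first
    /// segment, and fast processing, capped at 0.95.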
    fn calculate_confidence(&self, vad_result: &VadResult) -> f32 {
        if vad_result.speech_segments.is_empty() {
            return 0.0;
        }

        let mut confidence: f32 = 0.6;

        let segments_count = vad_result.speech_segments.len();
        if segments_count >= 1 {
            confidence += 0.1;
        }
        if segments_count >= 3 {
            confidence += 0.1;
        }

        if let Some(first_segment) = vad_result.speech_segments.first() {
            if first_segment.duration >= 0.5 {
                confidence += 0.1;
            }
            if first_segment.duration >= 1.0 {
                confidence += 0.05;
            }

            if first_segment.probability >= 0.8 {
                confidence += 0.05;
            }
        }

        if vad_result.processing_duration.as_secs() <= 1 {
            confidence += 0.05;
        }

        confidence.min(0.95_f32)
    }
}
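
// Example usage (illustrative sketch, not from the crate's docs; it assumes a
// Tokio runtime, that `VadConfig` implements `Default`, and that `subtitle`
// was already parsed elsewhere):
//
//     let detector = VadSyncDetector::new(VadConfig::default())?;
//     let result = detector
//         .detect_sync_offset(Path::new("movie.wav"), &subtitle, 30)
//         .await?;
//     println!(
//         "offset: {:.2}s (confidence {:.2})",
//         result.offset_seconds, result.confidence
//     );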