Skip to main content

oximedia_align/
lip_sync.rs

1//! Lip sync alignment for audio/video synchronization.
2//!
3//! Provides A/V offset detection, automatic correction, and tolerance checking
4//! to ensure lips match audio in video content.
5
6#![allow(dead_code)]
7#![allow(clippy::cast_precision_loss)]
8#![allow(clippy::too_many_arguments)]
9
10use serde::{Deserialize, Serialize};
11
12/// Lip sync tolerance in milliseconds attributed to ITU-R BT.1359; callers compare it against the absolute offset, so it acts as a symmetric window.
13pub const ITU_TOLERANCE_MS: f64 = 45.0; // ±45 ms. NOTE(review): BT.1359 is usually quoted with different limits for early vs. late audio - confirm a symmetric ±45 ms is intended.
14
15/// Wider "comfortable viewing" tolerance in milliseconds; offsets between the ITU value and this one are rated `SyncSeverity::Minor`.
16pub const COMFORTABLE_TOLERANCE_MS: f64 = 90.0;
17
18/// A single measured audio/video timing offset, together with its confidence and the method that produced it.
19#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
20pub struct AvOffset {
21    /// Signed offset in milliseconds: positive means audio is ahead of video, negative means video is ahead.
22    pub offset_ms: f64,
23    /// Confidence in the measurement, nominally 0.0 to 1.0; `AvOffset::new` stores it as given without range-checking.
24    pub confidence: f64,
25    /// Which detection technique produced this measurement.
26    pub method: DetectionMethod,
27}
28
29impl AvOffset {
30    /// Create a new A/V offset
31    #[must_use]
32    pub fn new(offset_ms: f64, confidence: f64, method: DetectionMethod) -> Self {
33        Self {
34            offset_ms,
35            confidence,
36            method,
37        }
38    }
39
40    /// Convert offset to samples at a given sample rate
41    #[must_use]
42    pub fn to_samples(&self, sample_rate: u32) -> i64 {
43        (self.offset_ms * f64::from(sample_rate) / 1000.0).round() as i64
44    }
45
46    /// Convert offset to frames at a given frame rate
47    #[must_use]
48    pub fn to_frames(&self, fps: f64) -> f64 {
49        self.offset_ms * fps / 1000.0
50    }
51
52    /// Is the offset within the ITU tolerance window?
53    #[must_use]
54    pub fn within_itu_tolerance(&self) -> bool {
55        self.offset_ms.abs() <= ITU_TOLERANCE_MS
56    }
57
58    /// Is the offset within comfortable viewer tolerance?
59    #[must_use]
60    pub fn within_comfortable_tolerance(&self) -> bool {
61        self.offset_ms.abs() <= COMFORTABLE_TOLERANCE_MS
62    }
63}
64
65/// How an `AvOffset` measurement was obtained.
66#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
67pub enum DetectionMethod {
68    /// Cross-correlation of an audio envelope with video activity; the tag `LipSyncAnalyzer::detect_offset_from_envelopes` puts on its results.
69    AudioMotionCorrelation,
70    /// Speech onset detection.
71    SpeechOnset,
72    /// Visual analysis of mouth movement.
73    MouthMovement,
74    /// Clapper board detection.
75    ClapperBoard,
76    /// Manual annotation.
77    Manual,
78    /// Combination of several methods; also the tag `LipSyncAnalyzer::recommend_correction` puts on the median of its history.
79    Hybrid,
80}
81
/// Lip sync detector configuration.
///
/// Durations are given in milliseconds and converted to sample counts with
/// [`LipSyncConfig::window_samples`] and [`LipSyncConfig::search_range_samples`].
#[derive(Debug, Clone)]
pub struct LipSyncConfig {
    /// Analysis window in milliseconds
    pub window_ms: f64,
    /// Search range in milliseconds (how far either side of zero lag is examined)
    pub search_range_ms: f64,
    /// Minimum confidence to accept a detection
    pub min_confidence: f64,
    /// Sample rate of the audio, in Hz
    pub sample_rate: u32,
    /// Frames per second of the video
    pub fps: f64,
}

impl Default for LipSyncConfig {
    /// 500 ms window, 500 ms search range, 0.6 minimum confidence, 48 kHz audio, 25 fps video.
    fn default() -> Self {
        Self {
            window_ms: 500.0,
            search_range_ms: 500.0,
            min_confidence: Self::DEFAULT_MIN_CONFIDENCE,
            sample_rate: 48_000,
            fps: 25.0,
        }
    }
}

impl LipSyncConfig {
    /// Minimum confidence used by both [`Default`] and [`LipSyncConfig::new`].
    const DEFAULT_MIN_CONFIDENCE: f64 = 0.6;

    /// Create a new config with custom parameters.
    ///
    /// `min_confidence` is set to the same value the default configuration uses (0.6);
    /// assign the public field afterwards to change it.
    #[must_use]
    pub fn new(window_ms: f64, search_range_ms: f64, sample_rate: u32, fps: f64) -> Self {
        Self {
            window_ms,
            search_range_ms,
            min_confidence: Self::DEFAULT_MIN_CONFIDENCE,
            sample_rate,
            fps,
        }
    }

    /// Convert the analysis window to a sample count, rounded to the nearest sample.
    #[must_use]
    pub fn window_samples(&self) -> usize {
        self.ms_to_samples(self.window_ms)
    }

    /// Convert the search range to a sample count, rounded to the nearest sample.
    #[must_use]
    pub fn search_range_samples(&self) -> usize {
        self.ms_to_samples(self.search_range_ms)
    }

    /// Convert a duration in milliseconds to samples at `self.sample_rate`.
    ///
    /// The value is rounded rather than truncated, matching `AvOffset::to_samples`:
    /// truncation loses a whole sample whenever floating-point error leaves the
    /// product a hair below an integer (4.35 ms at 100 kHz evaluates to
    /// 434.99999999999994). Negative and NaN durations map to 0 because the
    /// float-to-integer cast saturates.
    fn ms_to_samples(&self, duration_ms: f64) -> usize {
        (duration_ms * f64::from(self.sample_rate) / 1000.0).round() as usize
    }
}
134
135/// Lip sync correction to be applied
136#[derive(Debug, Clone, Copy)]
137pub struct LipSyncCorrection {
138    /// Delay to apply to audio (positive = delay, negative = advance)
139    pub audio_delay_ms: f64,
140    /// Delay to apply to video (positive = delay, negative = advance)
141    pub video_delay_ms: f64,
142    /// Whether correction is needed at all
143    pub needs_correction: bool,
144}
145
146impl LipSyncCorrection {
147    /// Create correction from an A/V offset
148    /// Positive `offset_ms` means audio is ahead, so delay audio
149    #[must_use]
150    pub fn from_offset(offset: &AvOffset, tolerance_ms: f64) -> Self {
151        if offset.offset_ms.abs() <= tolerance_ms {
152            return Self {
153                audio_delay_ms: 0.0,
154                video_delay_ms: 0.0,
155                needs_correction: false,
156            };
157        }
158
159        // Prefer delaying one stream rather than advancing the other
160        if offset.offset_ms > 0.0 {
161            // Audio is ahead: delay audio by the offset amount
162            Self {
163                audio_delay_ms: offset.offset_ms,
164                video_delay_ms: 0.0,
165                needs_correction: true,
166            }
167        } else {
168            // Video is ahead: delay video
169            Self {
170                audio_delay_ms: 0.0,
171                video_delay_ms: -offset.offset_ms,
172                needs_correction: true,
173            }
174        }
175    }
176
177    /// Total correction magnitude in ms
178    #[must_use]
179    pub fn magnitude_ms(&self) -> f64 {
180        self.audio_delay_ms + self.video_delay_ms
181    }
182}
183
184/// Lip sync analyzer
185#[derive(Debug, Clone)]
186pub struct LipSyncAnalyzer {
187    config: LipSyncConfig,
188    /// History of detected offsets
189    offset_history: Vec<AvOffset>,
190}
191
192impl LipSyncAnalyzer {
193    /// Create a new analyzer
194    #[must_use]
195    pub fn new(config: LipSyncConfig) -> Self {
196        Self {
197            config,
198            offset_history: Vec::new(),
199        }
200    }
201
202    /// Detect offset using cross-correlation of audio envelope and video activity
203    pub fn detect_offset_from_envelopes(
204        &mut self,
205        audio_envelope: &[f32],
206        video_activity: &[f32],
207    ) -> Option<AvOffset> {
208        if audio_envelope.is_empty() || video_activity.is_empty() {
209            return None;
210        }
211
212        let max_lag = self
213            .config
214            .search_range_samples()
215            .min(audio_envelope.len() / 2);
216        let window = self.config.window_samples().min(audio_envelope.len());
217
218        let mut best_lag = 0i64;
219        let mut best_corr = f64::NEG_INFINITY;
220
221        for lag in -(max_lag as i64)..=(max_lag as i64) {
222            let corr = cross_correlate_at_lag(audio_envelope, video_activity, lag, window);
223            if corr > best_corr {
224                best_corr = corr;
225                best_lag = lag;
226            }
227        }
228
229        // Normalize correlation
230        let audio_power: f64 = audio_envelope
231            .iter()
232            .map(|&x| f64::from(x) * f64::from(x))
233            .sum::<f64>()
234            / audio_envelope.len() as f64;
235        let video_power: f64 = video_activity
236            .iter()
237            .map(|&x| f64::from(x) * f64::from(x))
238            .sum::<f64>()
239            / video_activity.len() as f64;
240
241        let max_possible = (audio_power * video_power).sqrt() * window as f64;
242        let confidence = if max_possible > 0.0 {
243            (best_corr / max_possible).clamp(0.0, 1.0)
244        } else {
245            0.0
246        };
247
248        let offset_ms = best_lag as f64 / f64::from(self.config.sample_rate) * 1000.0;
249        let offset = AvOffset::new(
250            offset_ms,
251            confidence,
252            DetectionMethod::AudioMotionCorrelation,
253        );
254
255        if confidence >= self.config.min_confidence {
256            self.offset_history.push(offset);
257        }
258
259        Some(offset)
260    }
261
262    /// Get the median offset from history (more robust than latest)
263    #[must_use]
264    pub fn median_offset(&self) -> Option<f64> {
265        if self.offset_history.is_empty() {
266            return None;
267        }
268        let mut offsets: Vec<f64> = self.offset_history.iter().map(|o| o.offset_ms).collect();
269        offsets.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
270        let mid = offsets.len() / 2;
271        Some(if offsets.len() % 2 == 0 {
272            (offsets[mid - 1] + offsets[mid]) / 2.0
273        } else {
274            offsets[mid]
275        })
276    }
277
278    /// Get recommended correction based on history
279    #[must_use]
280    pub fn recommend_correction(&self, tolerance_ms: f64) -> Option<LipSyncCorrection> {
281        let median = self.median_offset()?;
282        let offset = AvOffset::new(median, 1.0, DetectionMethod::Hybrid);
283        Some(LipSyncCorrection::from_offset(&offset, tolerance_ms))
284    }
285
286    /// Clear the offset history
287    pub fn clear_history(&mut self) {
288        self.offset_history.clear();
289    }
290
291    /// Number of measurements in history
292    #[must_use]
293    pub fn history_len(&self) -> usize {
294        self.offset_history.len()
295    }
296}
297
/// Raw (unnormalized) cross-correlation of `a` against `b` shifted by `lag`.
///
/// Sums `a[i] * b[i + lag]` for `i` in `0..min(window, a.len(), b.len())`,
/// skipping every term whose shifted index falls outside `b`. Accumulation is
/// done in `f64`.
fn cross_correlate_at_lag(a: &[f32], b: &[f32], lag: i64, window: usize) -> f64 {
    let span = window.min(a.len()).min(b.len());

    a[..span]
        .iter()
        .enumerate()
        .filter_map(|(idx, &lhs)| {
            let shifted = idx as i64 + lag;
            if shifted < 0 {
                return None;
            }
            // `get` rejects indices past the end of `b`.
            b.get(shifted as usize)
                .map(|&rhs| f64::from(lhs) * f64::from(rhs))
        })
        .fold(0.0_f64, |acc, term| acc + term)
}
310
311/// Tolerance checker for lip sync
312#[derive(Debug, Clone, Copy)]
313pub struct ToleranceChecker {
314    /// ITU-R BT.1359 tolerance in ms
315    pub itu_tolerance_ms: f64,
316    /// Custom tolerance in ms
317    pub custom_tolerance_ms: f64,
318}
319
320impl ToleranceChecker {
321    /// Create a new tolerance checker
322    #[must_use]
323    pub fn new(custom_tolerance_ms: f64) -> Self {
324        Self {
325            itu_tolerance_ms: ITU_TOLERANCE_MS,
326            custom_tolerance_ms,
327        }
328    }
329
330    /// Check if an offset passes ITU tolerance
331    #[must_use]
332    pub fn passes_itu(&self, offset_ms: f64) -> bool {
333        offset_ms.abs() <= self.itu_tolerance_ms
334    }
335
336    /// Check if an offset passes custom tolerance
337    #[must_use]
338    pub fn passes_custom(&self, offset_ms: f64) -> bool {
339        offset_ms.abs() <= self.custom_tolerance_ms
340    }
341
342    /// Rate the severity of the offset
343    #[must_use]
344    pub fn severity(&self, offset_ms: f64) -> SyncSeverity {
345        let abs_ms = offset_ms.abs();
346        if abs_ms <= ITU_TOLERANCE_MS {
347            SyncSeverity::None
348        } else if abs_ms <= COMFORTABLE_TOLERANCE_MS {
349            SyncSeverity::Minor
350        } else if abs_ms <= 200.0 {
351            SyncSeverity::Moderate
352        } else {
353            SyncSeverity::Severe
354        }
355    }
356}
357
358impl Default for ToleranceChecker {
359    fn default() -> Self {
360        Self::new(ITU_TOLERANCE_MS)
361    }
362}
363
364/// Severity rating of a lip sync error, as returned by `ToleranceChecker::severity`.
365#[derive(Debug, Clone, Copy, PartialEq, Eq)]
366pub enum SyncSeverity {
367    /// Offset magnitude is within the ITU tolerance; not noticeable
368    None,
369    /// Beyond the ITU tolerance but within `COMFORTABLE_TOLERANCE_MS`; barely noticeable
370    Minor,
371    /// Beyond the comfortable tolerance, up to 200 ms; clearly noticeable lip sync error
372    Moderate,
373    /// More than 200 ms, or not comparable (NaN); severe lip sync error, very distracting
374    Severe,
375}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Push a hand-made measurement straight into the analyzer's history.
    fn push_offset(analyzer: &mut LipSyncAnalyzer, offset_ms: f64) {
        analyzer
            .offset_history
            .push(AvOffset::new(offset_ms, 0.9, DetectionMethod::Manual));
    }

    #[test]
    fn test_av_offset_creation() {
        let offset = AvOffset::new(20.0, 0.9, DetectionMethod::Manual);
        assert!((offset.offset_ms - 20.0).abs() < f64::EPSILON);
        assert!((offset.confidence - 0.9).abs() < f64::EPSILON);
        assert_eq!(offset.method, DetectionMethod::Manual);
    }

    #[test]
    fn test_av_offset_to_samples() {
        let offset = AvOffset::new(100.0, 0.9, DetectionMethod::Manual);
        assert_eq!(offset.to_samples(48000), 4800);

        // Negative offsets keep their sign.
        let negative = AvOffset::new(-100.0, 0.9, DetectionMethod::Manual);
        assert_eq!(negative.to_samples(48000), -4800);
    }

    #[test]
    fn test_av_offset_to_frames() {
        let offset = AvOffset::new(40.0, 0.9, DetectionMethod::Manual);
        let frames = offset.to_frames(25.0);
        assert!((frames - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_av_offset_itu_tolerance() {
        let within = AvOffset::new(40.0, 0.9, DetectionMethod::Manual);
        assert!(within.within_itu_tolerance());

        let outside = AvOffset::new(50.0, 0.9, DetectionMethod::Manual);
        assert!(!outside.within_itu_tolerance());

        // The window is symmetric around zero.
        let video_ahead = AvOffset::new(-50.0, 0.9, DetectionMethod::Manual);
        assert!(!video_ahead.within_itu_tolerance());
    }

    #[test]
    fn test_av_offset_comfortable_tolerance() {
        let within = AvOffset::new(80.0, 0.9, DetectionMethod::Manual);
        assert!(within.within_comfortable_tolerance());

        let outside = AvOffset::new(100.0, 0.9, DetectionMethod::Manual);
        assert!(!outside.within_comfortable_tolerance());
    }

    #[test]
    fn test_lip_sync_config_default() {
        let config = LipSyncConfig::default();
        assert_eq!(config.sample_rate, 48000);
        assert!((config.fps - 25.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_lip_sync_config_window_samples() {
        let config = LipSyncConfig::default(); // window_ms=500, sample_rate=48000
        assert_eq!(config.window_samples(), 24000);
    }

    #[test]
    fn test_lip_sync_config_search_range_samples() {
        let config = LipSyncConfig::default(); // search_range_ms=500, sample_rate=48000
        assert_eq!(config.search_range_samples(), 24000);
    }

    #[test]
    fn test_lip_sync_correction_no_correction_needed() {
        let offset = AvOffset::new(10.0, 0.9, DetectionMethod::Manual);
        let correction = LipSyncCorrection::from_offset(&offset, 45.0);
        assert!(!correction.needs_correction);
    }

    #[test]
    fn test_lip_sync_correction_audio_ahead() {
        let offset = AvOffset::new(100.0, 0.9, DetectionMethod::Manual);
        let correction = LipSyncCorrection::from_offset(&offset, 45.0);
        assert!(correction.needs_correction);
        assert!(correction.audio_delay_ms > 0.0);
        assert_eq!(correction.video_delay_ms, 0.0);
    }

    #[test]
    fn test_lip_sync_correction_video_ahead() {
        let offset = AvOffset::new(-100.0, 0.9, DetectionMethod::Manual);
        let correction = LipSyncCorrection::from_offset(&offset, 45.0);
        assert!(correction.needs_correction);
        assert_eq!(correction.audio_delay_ms, 0.0);
        assert!(correction.video_delay_ms > 0.0);
    }

    #[test]
    fn test_lip_sync_correction_magnitude() {
        let offset = AvOffset::new(100.0, 0.9, DetectionMethod::Manual);
        let correction = LipSyncCorrection::from_offset(&offset, 45.0);
        assert!((correction.magnitude_ms() - 100.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_analyzer_detect_from_envelopes() {
        let config = LipSyncConfig::new(100.0, 200.0, 48000, 25.0);
        let mut analyzer = LipSyncAnalyzer::new(config);

        // Create simple test signals
        let n = 5000;
        let mut audio = vec![0.0f32; n];
        let mut video = vec![0.0f32; n];

        // Place a transient at position 1000 in audio and 1100 in video
        audio[1000] = 1.0;
        audio[1001] = 0.8;
        video[1100] = 1.0;
        video[1101] = 0.8;

        let offset = analyzer
            .detect_offset_from_envelopes(&audio, &video)
            .expect("non-empty envelopes must produce a measurement");

        // The video transient trails the audio one by 100 samples: audio is ahead.
        let expected_ms = 100.0 / 48_000.0 * 1000.0;
        assert!((offset.offset_ms - expected_ms).abs() < 1e-9);
        assert!(offset.confidence >= 0.99);
        assert_eq!(offset.method, DetectionMethod::AudioMotionCorrelation);
        assert!(offset.within_itu_tolerance());

        // A confident detection is recorded in the history.
        assert_eq!(analyzer.history_len(), 1);
        let median = analyzer.median_offset().expect("history is non-empty");
        assert!((median - expected_ms).abs() < 1e-9);
    }

    #[test]
    fn test_analyzer_detect_empty_input() {
        let mut analyzer = LipSyncAnalyzer::new(LipSyncConfig::default());
        assert!(analyzer.detect_offset_from_envelopes(&[], &[1.0]).is_none());
        assert!(analyzer.detect_offset_from_envelopes(&[1.0], &[]).is_none());
        assert_eq!(analyzer.history_len(), 0);
    }

    #[test]
    fn test_analyzer_silent_input_is_not_recorded() {
        let config = LipSyncConfig::new(100.0, 200.0, 48000, 25.0);
        let mut analyzer = LipSyncAnalyzer::new(config);
        let silence = vec![0.0f32; 1000];

        let offset = analyzer
            .detect_offset_from_envelopes(&silence, &silence)
            .expect("non-empty envelopes must produce a measurement");
        assert_eq!(offset.confidence, 0.0);
        assert_eq!(analyzer.history_len(), 0);
    }

    #[test]
    fn test_analyzer_median_offset_empty() {
        let analyzer = LipSyncAnalyzer::new(LipSyncConfig::default());
        assert!(analyzer.median_offset().is_none());
    }

    #[test]
    fn test_analyzer_median_offset_odd_and_even() {
        let mut analyzer = LipSyncAnalyzer::new(LipSyncConfig::default());
        for ms in [30.0, 10.0, 20.0] {
            push_offset(&mut analyzer, ms);
        }
        assert_eq!(analyzer.median_offset(), Some(20.0));

        push_offset(&mut analyzer, 40.0);
        assert_eq!(analyzer.median_offset(), Some(25.0));
    }

    #[test]
    fn test_analyzer_recommend_correction() {
        let mut analyzer = LipSyncAnalyzer::new(LipSyncConfig::default());
        assert!(analyzer.recommend_correction(ITU_TOLERANCE_MS).is_none());

        push_offset(&mut analyzer, 100.0);
        let correction = analyzer
            .recommend_correction(ITU_TOLERANCE_MS)
            .expect("history is non-empty");
        assert!(correction.needs_correction);
        assert!((correction.audio_delay_ms - 100.0).abs() < f64::EPSILON);
        assert_eq!(correction.video_delay_ms, 0.0);
    }

    #[test]
    fn test_analyzer_clear_history() {
        let config = LipSyncConfig::default();
        let mut analyzer = LipSyncAnalyzer::new(config);
        // Add manual entry to history
        push_offset(&mut analyzer, 10.0);
        assert_eq!(analyzer.history_len(), 1);
        analyzer.clear_history();
        assert_eq!(analyzer.history_len(), 0);
    }

    #[test]
    fn test_tolerance_checker_itu() {
        let checker = ToleranceChecker::default();
        assert!(checker.passes_itu(44.9));
        assert!(!checker.passes_itu(45.1));
    }

    #[test]
    fn test_tolerance_checker_custom() {
        let checker = ToleranceChecker::new(60.0);
        assert!(checker.passes_custom(-55.0));
        assert!(!checker.passes_custom(65.0));
    }

    #[test]
    fn test_tolerance_checker_severity() {
        let checker = ToleranceChecker::default();
        assert_eq!(checker.severity(30.0), SyncSeverity::None);
        assert_eq!(checker.severity(70.0), SyncSeverity::Minor);
        assert_eq!(checker.severity(150.0), SyncSeverity::Moderate);
        assert_eq!(checker.severity(250.0), SyncSeverity::Severe);
    }

    #[test]
    fn test_tolerance_checker_severity_boundaries() {
        // Upper bounds are inclusive, and the sign of the offset is ignored.
        let checker = ToleranceChecker::default();
        assert_eq!(checker.severity(45.0), SyncSeverity::None);
        assert_eq!(checker.severity(-90.0), SyncSeverity::Minor);
        assert_eq!(checker.severity(200.0), SyncSeverity::Moderate);
        assert_eq!(checker.severity(-200.5), SyncSeverity::Severe);
    }

    #[test]
    fn test_cross_correlate_at_lag() {
        let a = vec![1.0f32, 0.0, 0.0, 0.0, 0.0];
        let b = vec![0.0f32, 1.0, 0.0, 0.0, 0.0];
        // At lag=1, a[0] aligns with b[1], should give 1.0
        let corr = cross_correlate_at_lag(&a, &b, 1, 5);
        assert!((corr - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_cross_correlate_negative_and_out_of_range_lag() {
        let a = vec![0.0f32, 1.0, 0.0, 0.0, 0.0];
        let b = vec![1.0f32, 0.0, 0.0, 0.0, 0.0];
        // At lag=-1, a[1] aligns with b[0]
        let corr = cross_correlate_at_lag(&a, &b, -1, 5);
        assert!((corr - 1.0).abs() < 1e-6);
        // Shifts entirely outside `b` contribute nothing
        assert_eq!(cross_correlate_at_lag(&a, &b, 10, 5), 0.0);
        assert_eq!(cross_correlate_at_lag(&a, &b, -10, 5), 0.0);
    }
}
528}