Skip to main content

oximedia_dedup/
video_segment_dedup.rs

1//! Video segment deduplication using perceptual hashing and temporal windowing.
2//!
3//! This module detects duplicate or near-duplicate video segments within and
4//! across media files by:
5//! - Computing perceptual hashes over frame data
6//! - Using temporal windowing to compare ordered frame sequences
7//! - Scoring similarity via Hamming distance on 64-bit hashes
8
9#![allow(dead_code)]
10
11use std::collections::HashMap;
12
13// ── SegmentFingerprint ────────────────────────────────────────────────────────
14
15/// A compact fingerprint for a single video segment (group of frames).
16#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct SegmentFingerprint {
18    /// Perceptual hash of the segment (64-bit).
19    pub hash: u64,
20    /// Number of frames in this segment.
21    pub frame_count: usize,
22    /// Duration of this segment in milliseconds.
23    pub duration_ms: u64,
24}
25
26impl SegmentFingerprint {
27    /// Create a `SegmentFingerprint` from pre-computed values.
28    #[must_use]
29    pub fn new(hash: u64, frame_count: usize, duration_ms: u64) -> Self {
30        Self {
31            hash,
32            frame_count,
33            duration_ms,
34        }
35    }
36
37    /// Derive a `SegmentFingerprint` from raw frame data.
38    ///
39    /// Uses a two-pass FNV-1a–based perceptual hash that is sensitive to
40    /// the distribution of byte values (luminance-like) rather than the exact
41    /// raw bytes, giving some robustness to minor encoding differences.
42    #[must_use]
43    pub fn from_frame_data(data: &[u8], frame_count: usize, duration_ms: u64) -> Self {
44        let hash = perceptual_hash_u64(data);
45        Self {
46            hash,
47            frame_count,
48            duration_ms,
49        }
50    }
51
52    /// Compute the Hamming distance (number of differing bits) between two hashes.
53    #[must_use]
54    pub fn hamming_distance(&self, other: &Self) -> u32 {
55        (self.hash ^ other.hash).count_ones()
56    }
57
58    /// Returns the average milliseconds per frame.
59    ///
60    /// Returns 0 if `frame_count` is zero.
61    #[must_use]
62    pub fn ms_per_frame(&self) -> u64 {
63        if self.frame_count == 0 {
64            0
65        } else {
66            self.duration_ms / self.frame_count as u64
67        }
68    }
69}
70
71/// Compute a 64-bit perceptual hash from raw byte data.
72///
73/// Implements a difference-hash (dHash) approach over 65 equal segments:
74/// each bit of the output hash encodes whether the FNV-1a state of segment
75/// `i` is greater than the state of segment `i+1`. This captures gradient
76/// direction and is robust to absolute value shifts.
77fn perceptual_hash_u64(data: &[u8]) -> u64 {
78    if data.is_empty() {
79        return 0;
80    }
81
82    // Use 65 buckets to get 64 comparison pairs
83    const SEGS: usize = 65;
84    let seg_size = (data.len() + SEGS - 1) / SEGS;
85    let mut seg_vals = [0u64; SEGS];
86
87    for (i, chunk) in data.chunks(seg_size.max(1)).enumerate() {
88        if i >= SEGS {
89            break;
90        }
91        // FNV-1a with segment-index mixing to distinguish adjacent equal-value regions
92        let fnv_offset: u64 =
93            0xcbf2_9ce4_8422_2325u64.wrapping_add((i as u64).wrapping_mul(0x9e37_79b9_7f4a_7c15));
94        let mut state: u64 = fnv_offset;
95        for &b in chunk {
96            state ^= u64::from(b);
97            state = state.wrapping_mul(0x0100_0000_01b3);
98        }
99        seg_vals[i] = state;
100    }
101
102    // dHash: bit i = 1 if seg_vals[i] > seg_vals[i+1]
103    let mut hash = 0u64;
104    for i in 0..64 {
105        if seg_vals[i] > seg_vals[i + 1] {
106            hash |= 1u64 << i;
107        }
108    }
109    hash
110}
111
112// ── match_segments ────────────────────────────────────────────────────────────
113
114/// Compute the similarity score between two segment fingerprints.
115///
116/// Returns a value in `[0.0, 1.0]` where:
117/// - `1.0` = identical hashes
118/// - `0.0` = maximum Hamming distance (all 64 bits differ)
119///
120/// Frame count and duration are NOT used for scoring; they are metadata.
121#[must_use]
122pub fn match_segments(a: &SegmentFingerprint, b: &SegmentFingerprint) -> f32 {
123    let differing_bits = (a.hash ^ b.hash).count_ones();
124    // 64 bits total; normalize so 0 diff → 1.0, 64 diff → 0.0
125    1.0 - (differing_bits as f32 / 64.0)
126}
127
128// ── TemporalWindow ────────────────────────────────────────────────────────────
129
130/// A sliding window of `SegmentFingerprint` hashes used for temporal matching.
131#[derive(Debug, Clone)]
132pub struct TemporalWindow {
133    /// Hashes extracted from the fingerprints in this window.
134    pub hashes: Vec<u64>,
135    /// Index in the original sequence where this window starts.
136    pub start_idx: usize,
137    /// Total duration covered by this window (sum of segment durations), ms.
138    pub duration_ms: u64,
139}
140
141impl TemporalWindow {
142    /// Create a window from a slice of fingerprints at a given start index.
143    #[must_use]
144    pub fn from_fingerprints(fps: &[SegmentFingerprint], start_idx: usize) -> Self {
145        let hashes = fps.iter().map(|f| f.hash).collect();
146        let duration_ms = fps.iter().map(|f| f.duration_ms).sum();
147        Self {
148            hashes,
149            start_idx,
150            duration_ms,
151        }
152    }
153
154    /// Compare this window to another by average Hamming similarity.
155    ///
156    /// Returns `0.0` if either window is empty or they have different lengths.
157    #[must_use]
158    pub fn similarity(&self, other: &Self) -> f32 {
159        if self.hashes.is_empty() || other.hashes.is_empty() {
160            return 0.0;
161        }
162        if self.hashes.len() != other.hashes.len() {
163            return 0.0;
164        }
165        let total: f32 = self
166            .hashes
167            .iter()
168            .zip(other.hashes.iter())
169            .map(|(&a, &b)| {
170                let diff = (a ^ b).count_ones();
171                1.0 - (diff as f32 / 64.0)
172            })
173            .sum();
174        total / self.hashes.len() as f32
175    }
176}
177
178// ── TemporalWindowMatcher ─────────────────────────────────────────────────────
179
180/// Compares two sequences of `SegmentFingerprint`s using a sliding temporal window.
181///
182/// The matcher slides a window of `window_size` fingerprints across both
183/// sequences (with `stride` step), computing the average per-hash Hamming
184/// similarity for each aligned window pair.
185#[derive(Debug, Clone)]
186pub struct TemporalWindowMatcher {
187    /// Number of segments per window.
188    pub window_size: usize,
189    /// Step between consecutive windows.
190    pub stride: usize,
191}
192
193impl TemporalWindowMatcher {
194    /// Create a new matcher with explicit window/stride.
195    #[must_use]
196    pub fn new(window_size: usize, stride: usize) -> Self {
197        Self {
198            window_size: window_size.max(1),
199            stride: stride.max(1),
200        }
201    }
202
203    /// Create a matcher with sensible defaults (window=4, stride=2).
204    #[must_use]
205    pub fn with_defaults() -> Self {
206        Self::new(4, 2)
207    }
208
209    /// Compare two sequences by finding the highest-scoring window alignment.
210    ///
211    /// Extracts windows from `a` and `b` and tries all combinations, returning
212    /// the maximum similarity found across all window pairs.
213    ///
214    /// Returns `0.0` if either sequence is shorter than `window_size`.
215    #[must_use]
216    pub fn compare_sequences(&self, a: &[SegmentFingerprint], b: &[SegmentFingerprint]) -> f32 {
217        if a.len() < self.window_size || b.len() < self.window_size {
218            return 0.0;
219        }
220
221        let windows_a = self.extract_windows(a);
222        let windows_b = self.extract_windows(b);
223
224        let mut best = 0.0f32;
225        for wa in &windows_a {
226            for wb in &windows_b {
227                let sim = wa.similarity(wb);
228                if sim > best {
229                    best = sim;
230                }
231            }
232        }
233        best
234    }
235
236    /// Find the best-aligned window pair between two sequences.
237    ///
238    /// Returns `Some((offset_a, offset_b, similarity))` for the best match,
239    /// or `None` if either sequence is too short.
240    #[must_use]
241    pub fn find_best_alignment(
242        &self,
243        a: &[SegmentFingerprint],
244        b: &[SegmentFingerprint],
245    ) -> Option<(usize, usize, f32)> {
246        if a.len() < self.window_size || b.len() < self.window_size {
247            return None;
248        }
249
250        let windows_a = self.extract_windows(a);
251        let windows_b = self.extract_windows(b);
252
253        let mut best_sim = 0.0f32;
254        let mut best_offset_a = 0usize;
255        let mut best_offset_b = 0usize;
256
257        for wa in &windows_a {
258            for wb in &windows_b {
259                let sim = wa.similarity(wb);
260                if sim > best_sim {
261                    best_sim = sim;
262                    best_offset_a = wa.start_idx;
263                    best_offset_b = wb.start_idx;
264                }
265            }
266        }
267
268        if best_sim > 0.0 {
269            Some((best_offset_a, best_offset_b, best_sim))
270        } else {
271            None
272        }
273    }
274
275    /// Extract all windows from a sequence with the configured stride.
276    fn extract_windows(&self, seq: &[SegmentFingerprint]) -> Vec<TemporalWindow> {
277        if seq.len() < self.window_size {
278            return Vec::new();
279        }
280
281        let mut windows = Vec::new();
282        let mut start = 0;
283        while start + self.window_size <= seq.len() {
284            let slice = &seq[start..start + self.window_size];
285            windows.push(TemporalWindow::from_fingerprints(slice, start));
286            start += self.stride;
287        }
288        windows
289    }
290}
291
292// ── SegmentMatch ──────────────────────────────────────────────────────────────
293
294/// A detected match between two segments from possibly different videos.
295#[derive(Debug, Clone, PartialEq)]
296pub struct SegmentMatch {
297    /// Identifier of the first video.
298    pub video_a: String,
299    /// Index of the matching segment in video A's fingerprint sequence.
300    pub segment_a_idx: usize,
301    /// Identifier of the second video.
302    pub video_b: String,
303    /// Index of the matching segment in video B's fingerprint sequence.
304    pub segment_b_idx: usize,
305    /// Similarity score in `[0.0, 1.0]`.
306    pub similarity: f32,
307    /// Estimated temporal offset between the segments in milliseconds.
308    ///
309    /// Positive value means segment B starts later than segment A.
310    pub time_offset_ms: i64,
311}
312
313// ── VideoSegmentDeduplicator ──────────────────────────────────────────────────
314
315/// Finds duplicate video segments across a collection of indexed videos.
316///
317/// Each video is represented as an ordered sequence of [`SegmentFingerprint`]s.
318/// The deduplicator compares all pairs of fingerprint sequences and reports
319/// segment-level matches above a configurable similarity threshold.
320#[derive(Debug, Default)]
321pub struct VideoSegmentDeduplicator {
322    /// Stored fingerprint sequences keyed by video identifier.
323    videos: HashMap<String, Vec<SegmentFingerprint>>,
324    /// Default similarity threshold (0.0–1.0).
325    threshold: f32,
326}
327
328impl VideoSegmentDeduplicator {
329    /// Create a new deduplicator with the default threshold (0.8).
330    #[must_use]
331    pub fn new() -> Self {
332        Self {
333            videos: HashMap::new(),
334            threshold: 0.8,
335        }
336    }
337
338    /// Create a deduplicator with a custom similarity threshold.
339    #[must_use]
340    pub fn with_threshold(threshold: f32) -> Self {
341        Self {
342            videos: HashMap::new(),
343            threshold: threshold.clamp(0.0, 1.0),
344        }
345    }
346
347    /// Register a video's fingerprint sequence.
348    pub fn add_video(&mut self, video_id: &str, fingerprints: Vec<SegmentFingerprint>) {
349        self.videos.insert(video_id.to_owned(), fingerprints);
350    }
351
352    /// Returns the number of indexed videos.
353    #[must_use]
354    pub fn video_count(&self) -> usize {
355        self.videos.len()
356    }
357
358    /// Returns `true` if no videos have been added.
359    #[must_use]
360    pub fn is_empty(&self) -> bool {
361        self.videos.is_empty()
362    }
363
364    /// Find all matching segment pairs across all indexed videos.
365    ///
366    /// Performs pairwise comparison between each combination of
367    /// (video_a, segment_i) × (video_b, segment_j) for all distinct
368    /// video pairs. Within-video self-comparisons are skipped.
369    ///
370    /// Uses the deduplicator's configured threshold.
371    #[must_use]
372    pub fn find_duplicate_segments(&self) -> Vec<SegmentMatch> {
373        self.find_duplicate_segments_with_threshold(self.threshold)
374    }
375
376    /// Find matching segment pairs using an explicit similarity threshold.
377    #[must_use]
378    pub fn find_duplicate_segments_with_threshold(&self, threshold: f32) -> Vec<SegmentMatch> {
379        let mut matches = Vec::new();
380
381        let video_ids: Vec<&String> = self.videos.keys().collect();
382        let n = video_ids.len();
383
384        for i in 0..n {
385            for j in (i + 1)..n {
386                let id_a = video_ids[i];
387                let id_b = video_ids[j];
388                let fps_a = &self.videos[id_a];
389                let fps_b = &self.videos[id_b];
390
391                let mut video_matches =
392                    compare_segment_sequences(id_a, fps_a, id_b, fps_b, threshold);
393                matches.append(&mut video_matches);
394            }
395        }
396
397        matches
398    }
399
400    /// Find matches between a newly submitted video and all indexed videos
401    /// without permanently adding it to the index.
402    #[must_use]
403    pub fn query(
404        &self,
405        video_id: &str,
406        fingerprints: &[SegmentFingerprint],
407        threshold: f32,
408    ) -> Vec<SegmentMatch> {
409        let mut matches = Vec::new();
410
411        for (indexed_id, indexed_fps) in &self.videos {
412            if indexed_id == video_id {
413                continue;
414            }
415            let mut m = compare_segment_sequences(
416                video_id,
417                fingerprints,
418                indexed_id,
419                indexed_fps,
420                threshold,
421            );
422            matches.append(&mut m);
423        }
424
425        matches
426    }
427}
428
429/// Compare two ordered fingerprint sequences and return all segment-level matches.
430fn compare_segment_sequences(
431    id_a: &str,
432    fps_a: &[SegmentFingerprint],
433    id_b: &str,
434    fps_b: &[SegmentFingerprint],
435    threshold: f32,
436) -> Vec<SegmentMatch> {
437    let mut matches = Vec::new();
438
439    for (i, fp_a) in fps_a.iter().enumerate() {
440        for (j, fp_b) in fps_b.iter().enumerate() {
441            let sim = match_segments(fp_a, fp_b);
442            if sim >= threshold {
443                // Compute approximate time offset based on segment positions and durations.
444                let time_a: i64 = fps_a[..i].iter().map(|f| f.duration_ms as i64).sum();
445                let time_b: i64 = fps_b[..j].iter().map(|f| f.duration_ms as i64).sum();
446                let time_offset_ms = time_b - time_a;
447
448                matches.push(SegmentMatch {
449                    video_a: id_a.to_owned(),
450                    segment_a_idx: i,
451                    video_b: id_b.to_owned(),
452                    segment_b_idx: j,
453                    similarity: sim,
454                    time_offset_ms,
455                });
456            }
457        }
458    }
459
460    matches
461}
462
463// ── Tests ─────────────────────────────────────────────────────────────────────
464
465#[cfg(test)]
466mod tests {
467    use super::*;
468
469    // ── SegmentFingerprint ─────────────────────────────────────────────────
470
471    #[test]
472    fn test_segment_fingerprint_new() {
473        let fp = SegmentFingerprint::new(0xDEAD_BEEF_1234_5678, 30, 1000);
474        assert_eq!(fp.hash, 0xDEAD_BEEF_1234_5678);
475        assert_eq!(fp.frame_count, 30);
476        assert_eq!(fp.duration_ms, 1000);
477    }
478
479    #[test]
480    fn test_from_frame_data_deterministic() {
481        let data = b"hello video frame data here";
482        let fp1 = SegmentFingerprint::from_frame_data(data, 10, 500);
483        let fp2 = SegmentFingerprint::from_frame_data(data, 10, 500);
484        assert_eq!(fp1.hash, fp2.hash);
485        assert_eq!(fp1.frame_count, 10);
486        assert_eq!(fp1.duration_ms, 500);
487    }
488
489    #[test]
490    fn test_from_frame_data_different_inputs_differ() {
491        // Use substantially different, longer inputs to ensure distinct hashes.
492        // The perceptual hash is designed for frame-sized data (>= 64 bytes).
493        let data_a: Vec<u8> = (0u8..=127).collect();
494        let data_b: Vec<u8> = (128u8..=255).collect();
495        let fp_a = SegmentFingerprint::from_frame_data(&data_a, 10, 500);
496        let fp_b = SegmentFingerprint::from_frame_data(&data_b, 10, 500);
497        assert_ne!(fp_a.hash, fp_b.hash);
498    }
499
500    #[test]
501    fn test_from_frame_data_empty() {
502        let fp = SegmentFingerprint::from_frame_data(b"", 0, 0);
503        assert_eq!(fp.hash, 0);
504        assert_eq!(fp.frame_count, 0);
505    }
506
507    #[test]
508    fn test_hamming_distance_identical() {
509        let fp = SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 30, 1000);
510        assert_eq!(fp.hamming_distance(&fp), 0);
511    }
512
513    #[test]
514    fn test_hamming_distance_all_differ() {
515        let fp_a = SegmentFingerprint::new(0x0000_0000_0000_0000, 30, 1000);
516        let fp_b = SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 30, 1000);
517        assert_eq!(fp_a.hamming_distance(&fp_b), 64);
518    }
519
520    #[test]
521    fn test_ms_per_frame() {
522        let fp = SegmentFingerprint::new(0, 30, 1000);
523        assert_eq!(fp.ms_per_frame(), 33); // floor(1000/30)
524    }
525
526    #[test]
527    fn test_ms_per_frame_zero_frames() {
528        let fp = SegmentFingerprint::new(0, 0, 1000);
529        assert_eq!(fp.ms_per_frame(), 0);
530    }
531
532    // ── match_segments ─────────────────────────────────────────────────────
533
534    #[test]
535    fn test_match_segments_identical() {
536        let fp = SegmentFingerprint::new(0x1234_5678_ABCD_EF01, 30, 1000);
537        let sim = match_segments(&fp, &fp);
538        assert!((sim - 1.0).abs() < f32::EPSILON);
539    }
540
541    #[test]
542    fn test_match_segments_completely_different() {
543        let fp_a = SegmentFingerprint::new(0x0000_0000_0000_0000, 30, 1000);
544        let fp_b = SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 30, 1000);
545        let sim = match_segments(&fp_a, &fp_b);
546        assert!((sim - 0.0).abs() < f32::EPSILON);
547    }
548
549    #[test]
550    fn test_match_segments_half_different() {
551        // 32 bits differ → 32/64 = 0.5 diff → 0.5 similarity
552        let fp_a = SegmentFingerprint::new(0x0000_0000_FFFF_FFFF, 30, 1000);
553        let fp_b = SegmentFingerprint::new(0xFFFF_FFFF_0000_0000, 30, 1000);
554        // XOR = 0xFFFF_FFFF_FFFF_FFFF → 64 bits → sim = 0.0  (all bits differ when combined)
555        // Actually 0x0000_0000_FFFF_FFFF ^ 0xFFFF_FFFF_0000_0000 = 0xFFFF_FFFF_FFFF_FFFF → 64 ones
556        let sim = match_segments(&fp_a, &fp_b);
557        assert!((sim - 0.0).abs() < f32::EPSILON);
558    }
559
560    #[test]
561    fn test_match_segments_near_identical() {
562        // Only 1 bit differs
563        let fp_a = SegmentFingerprint::new(0b1000, 30, 1000);
564        let fp_b = SegmentFingerprint::new(0b1001, 30, 1000); // 1 bit diff
565        let sim = match_segments(&fp_a, &fp_b);
566        let expected = 1.0 - (1.0 / 64.0);
567        assert!((sim - expected).abs() < 0.001);
568    }
569
570    #[test]
571    fn test_match_segments_ignores_frame_count() {
572        let hash = 0xCAFE_BABE_DEAD_BEEF;
573        let fp_a = SegmentFingerprint::new(hash, 10, 500);
574        let fp_b = SegmentFingerprint::new(hash, 99, 9999);
575        // Same hash → similarity = 1.0 regardless of metadata
576        let sim = match_segments(&fp_a, &fp_b);
577        assert!((sim - 1.0).abs() < f32::EPSILON);
578    }
579
580    // ── TemporalWindow ─────────────────────────────────────────────────────
581
582    #[test]
583    fn test_temporal_window_from_fingerprints() {
584        let fps = vec![
585            SegmentFingerprint::new(0xAA, 10, 300),
586            SegmentFingerprint::new(0xBB, 10, 400),
587            SegmentFingerprint::new(0xCC, 10, 500),
588        ];
589        let win = TemporalWindow::from_fingerprints(&fps, 5);
590        assert_eq!(win.hashes, vec![0xAA, 0xBB, 0xCC]);
591        assert_eq!(win.start_idx, 5);
592        assert_eq!(win.duration_ms, 1200);
593    }
594
595    #[test]
596    fn test_temporal_window_similarity_identical() {
597        let fps = vec![
598            SegmentFingerprint::new(0x11, 10, 100),
599            SegmentFingerprint::new(0x22, 10, 100),
600        ];
601        let w1 = TemporalWindow::from_fingerprints(&fps, 0);
602        let w2 = TemporalWindow::from_fingerprints(&fps, 0);
603        assert!((w1.similarity(&w2) - 1.0).abs() < 0.001);
604    }
605
606    #[test]
607    fn test_temporal_window_similarity_different_lengths() {
608        let fps_a = vec![SegmentFingerprint::new(0x11, 10, 100)];
609        let fps_b = vec![
610            SegmentFingerprint::new(0x11, 10, 100),
611            SegmentFingerprint::new(0x22, 10, 100),
612        ];
613        let w1 = TemporalWindow::from_fingerprints(&fps_a, 0);
614        let w2 = TemporalWindow::from_fingerprints(&fps_b, 0);
615        assert!((w1.similarity(&w2) - 0.0).abs() < f32::EPSILON);
616    }
617
618    #[test]
619    fn test_temporal_window_similarity_empty() {
620        let w1 = TemporalWindow {
621            hashes: vec![],
622            start_idx: 0,
623            duration_ms: 0,
624        };
625        let w2 = TemporalWindow {
626            hashes: vec![],
627            start_idx: 0,
628            duration_ms: 0,
629        };
630        assert!((w1.similarity(&w2) - 0.0).abs() < f32::EPSILON);
631    }
632
633    // ── TemporalWindowMatcher ──────────────────────────────────────────────
634
635    #[test]
636    fn test_matcher_compare_identical_sequences() {
637        let fps: Vec<SegmentFingerprint> = (0..8)
638            .map(|i| SegmentFingerprint::new(i * 0x1111_1111_1111_1111, 10, 100))
639            .collect();
640        let matcher = TemporalWindowMatcher::new(4, 2);
641        let sim = matcher.compare_sequences(&fps, &fps);
642        assert!((sim - 1.0).abs() < 0.001);
643    }
644
645    #[test]
646    fn test_matcher_compare_too_short() {
647        let fps = vec![SegmentFingerprint::new(0xAB, 10, 100)];
648        let matcher = TemporalWindowMatcher::new(4, 2);
649        let sim = matcher.compare_sequences(&fps, &fps);
650        assert!((sim - 0.0).abs() < f32::EPSILON);
651    }
652
653    #[test]
654    fn test_matcher_find_best_alignment_identical() {
655        let fps: Vec<SegmentFingerprint> = (0..6)
656            .map(|i| SegmentFingerprint::new(i as u64, 10, 100))
657            .collect();
658        let matcher = TemporalWindowMatcher::new(3, 1);
659        let result = matcher.find_best_alignment(&fps, &fps);
660        assert!(result.is_some());
661        let (_, _, sim) = result.expect("alignment found");
662        assert!((sim - 1.0).abs() < 0.001);
663    }
664
665    #[test]
666    fn test_matcher_find_best_alignment_empty_sequences() {
667        let fps: Vec<SegmentFingerprint> = vec![];
668        let matcher = TemporalWindowMatcher::new(4, 2);
669        assert!(matcher.find_best_alignment(&fps, &fps).is_none());
670    }
671
672    // ── VideoSegmentDeduplicator ───────────────────────────────────────────
673
674    #[test]
675    fn test_deduplicator_empty() {
676        let dedup = VideoSegmentDeduplicator::new();
677        assert_eq!(dedup.video_count(), 0);
678        assert!(dedup.is_empty());
679        assert!(dedup.find_duplicate_segments().is_empty());
680    }
681
682    #[test]
683    fn test_deduplicator_single_video_no_matches() {
684        let mut dedup = VideoSegmentDeduplicator::new();
685        let fps = vec![SegmentFingerprint::new(0x1234, 10, 500)];
686        dedup.add_video("video_a", fps);
687        assert_eq!(dedup.video_count(), 1);
688        // Single video → no cross-video matches
689        assert!(dedup.find_duplicate_segments().is_empty());
690    }
691
692    #[test]
693    fn test_deduplicator_identical_videos_match() {
694        let mut dedup = VideoSegmentDeduplicator::with_threshold(0.9);
695        let fps: Vec<SegmentFingerprint> = vec![
696            SegmentFingerprint::new(0xAAAA_AAAA_AAAA_AAAA, 10, 500),
697            SegmentFingerprint::new(0xBBBB_BBBB_BBBB_BBBB, 10, 500),
698        ];
699        dedup.add_video("video_a", fps.clone());
700        dedup.add_video("video_b", fps);
701
702        let matches = dedup.find_duplicate_segments();
703        assert!(!matches.is_empty());
704        for m in &matches {
705            assert!(m.similarity >= 0.9);
706        }
707    }
708
709    #[test]
710    fn test_deduplicator_query_without_indexing() {
711        let mut dedup = VideoSegmentDeduplicator::new();
712        let fps_indexed = vec![SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500)];
713        dedup.add_video("indexed", fps_indexed);
714
715        let fps_query = vec![SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500)];
716        let results = dedup.query("query_video", &fps_query, 0.99);
717        assert_eq!(results.len(), 1);
718        assert_eq!(results[0].video_a, "query_video");
719        assert_eq!(results[0].video_b, "indexed");
720    }
721
722    #[test]
723    fn test_deduplicator_time_offset_computed() {
724        let mut dedup = VideoSegmentDeduplicator::with_threshold(0.99);
725        // video_a: 2 segments of 500ms each
726        let fps_a = vec![
727            SegmentFingerprint::new(0x0000_0000_0000_0001, 10, 500),
728            SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500),
729        ];
730        // video_b: same second segment first
731        let fps_b = vec![SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500)];
732        dedup.add_video("a", fps_a);
733        dedup.add_video("b", fps_b);
734
735        let matches = dedup.find_duplicate_segments();
736        assert!(!matches.is_empty(), "expected at least one match");
737
738        // Find the match regardless of which video appears as A vs B
739        // (HashMap iteration order is non-deterministic).
740        let m = matches.iter().find(|m| {
741            // Case 1: a[1] matched b[0]
742            (m.video_a == "a" && m.segment_a_idx == 1 && m.video_b == "b" && m.segment_b_idx == 0)
743            // Case 2: b[0] matched a[1] (reversed iteration)
744            || (m.video_a == "b" && m.segment_a_idx == 0 && m.video_b == "a" && m.segment_b_idx == 1)
745        });
746        assert!(m.is_some(), "expected match between a[1] and b[0]");
747
748        let m = m.expect("match found");
749        // time_offset = time_b - time_a
750        // case 1: time_b(b[0]=0ms) - time_a(a[1]=500ms) = -500
751        // case 2: time_b(a[1]=500ms) - time_a(b[0]=0ms) = +500
752        assert!(
753            m.time_offset_ms == -500 || m.time_offset_ms == 500,
754            "expected ±500ms offset, got {}",
755            m.time_offset_ms
756        );
757    }
758}