Skip to main content

oximedia_dedup/
video_dedup.rs

1//! Video-level deduplication.
2//!
3//! This module provides:
4//! - `VideoFingerprint`: frame-level hash + temporal hash for a video
5//! - `VideoDuplicateDetector`: find duplicate or near-duplicate videos
6//! - `TrimmedDuplicateDetector`: sliding-window for trimmed/shifted duplicates
7//! - `DuplicatePair` / `DuplicateCluster`: result types
8
9#![allow(dead_code)]
10
11// ---------------------------------------------------------------------------
12// VideoFingerprint
13// ---------------------------------------------------------------------------
14
15/// A compact fingerprint for a video.
16///
17/// Contains hashes of sampled keyframes plus a temporal hash that encodes
18/// the overall frame sequence order.
19#[derive(Debug, Clone, PartialEq, Eq)]
20pub struct VideoFingerprint {
21    /// Unique identifier for this video.
22    pub video_id: u64,
23    /// Hashes of sampled keyframes.
24    pub keyframe_hashes: Vec<u64>,
25    /// Temporal hash computed from the ordered keyframe sequence.
26    pub temporal_hash: u64,
27    /// Duration in milliseconds.
28    pub duration_ms: u64,
29}
30
31impl VideoFingerprint {
32    /// Create a new fingerprint.
33    #[must_use]
34    pub fn new(video_id: u64, keyframe_hashes: Vec<u64>, duration_ms: u64) -> Self {
35        let temporal_hash = Self::compute_temporal_hash(&keyframe_hashes);
36        Self {
37            video_id,
38            keyframe_hashes,
39            temporal_hash,
40            duration_ms,
41        }
42    }
43
44    /// Compute a temporal hash from an ordered sequence of frame hashes.
45    ///
46    /// Accumulates with XOR and a left-rotation so order matters.
47    #[must_use]
48    pub fn compute_temporal_hash(frame_hashes: &[u64]) -> u64 {
49        let mut acc: u64 = 0;
50        for &h in frame_hashes {
51            acc = acc.rotate_left(7) ^ h;
52        }
53        acc
54    }
55
56    /// Estimate similarity to another fingerprint using set-intersection / union
57    /// of keyframe hash sets.
58    #[must_use]
59    pub fn keyframe_similarity(&self, other: &Self) -> f32 {
60        if self.keyframe_hashes.is_empty() && other.keyframe_hashes.is_empty() {
61            return 1.0;
62        }
63
64        let intersection = self
65            .keyframe_hashes
66            .iter()
67            .filter(|h| other.keyframe_hashes.contains(h))
68            .count();
69
70        // Union = |A| + |B| - |A ∩ B|
71        let union = self.keyframe_hashes.len() + other.keyframe_hashes.len() - intersection;
72
73        if union == 0 {
74            return 0.0;
75        }
76
77        intersection as f32 / union as f32
78    }
79}
80
81// ---------------------------------------------------------------------------
82// DuplicatePair
83// ---------------------------------------------------------------------------
84
85/// A detected duplicate pair of videos.
86#[derive(Debug, Clone, PartialEq)]
87pub struct DuplicatePair {
88    /// ID of the first video.
89    pub a_id: u64,
90    /// ID of the second video.
91    pub b_id: u64,
92    /// Similarity score in [0.0, 1.0].
93    pub similarity: f32,
94    /// Estimated temporal offset in milliseconds (positive = b starts later).
95    pub offset_ms: i64,
96    /// True if detected as a trimmed (start/end cut) duplicate.
97    pub is_trimmed: bool,
98}
99
100// ---------------------------------------------------------------------------
101// VideoDuplicateDetector
102// ---------------------------------------------------------------------------
103
104/// Detects duplicate and near-duplicate videos by comparing fingerprints.
105pub struct VideoDuplicateDetector {
106    /// Stored fingerprints.
107    fingerprints: Vec<VideoFingerprint>,
108}
109
110impl VideoDuplicateDetector {
111    /// Create an empty detector.
112    #[must_use]
113    pub fn new() -> Self {
114        Self {
115            fingerprints: Vec::new(),
116        }
117    }
118
119    /// Add a fingerprint to the detector.
120    pub fn add(&mut self, fingerprint: VideoFingerprint) {
121        self.fingerprints.push(fingerprint);
122    }
123
124    /// Find all duplicate pairs above the given similarity threshold.
125    ///
126    /// Returns a list of `(id_a, id_b, similarity)` tuples.
127    #[must_use]
128    pub fn find_duplicates(&self, threshold: f32) -> Vec<(u64, u64, f32)> {
129        let mut results = Vec::new();
130
131        let n = self.fingerprints.len();
132        for i in 0..n {
133            for j in (i + 1)..n {
134                let fp_a = &self.fingerprints[i];
135                let fp_b = &self.fingerprints[j];
136
137                let sim = fp_a.keyframe_similarity(fp_b);
138                if sim >= threshold {
139                    results.push((fp_a.video_id, fp_b.video_id, sim));
140                }
141            }
142        }
143
144        results
145    }
146
147    /// Return the number of indexed fingerprints.
148    #[must_use]
149    pub fn len(&self) -> usize {
150        self.fingerprints.len()
151    }
152
153    /// Return true if no fingerprints have been added.
154    #[must_use]
155    pub fn is_empty(&self) -> bool {
156        self.fingerprints.is_empty()
157    }
158}
159
160impl Default for VideoDuplicateDetector {
161    fn default() -> Self {
162        Self::new()
163    }
164}
165
166// ---------------------------------------------------------------------------
167// TrimmedDuplicateDetector
168// ---------------------------------------------------------------------------
169
170/// Detects trimmed (start/end cut) duplicates using sliding-window matching.
171pub struct TrimmedDuplicateDetector;
172
173impl TrimmedDuplicateDetector {
174    /// Find trimmed duplicate pairs among the given fingerprints.
175    ///
176    /// Uses a sliding window over keyframe hash sequences.
177    /// `threshold` is the minimum fraction of matching hashes in the window.
178    #[must_use]
179    pub fn find_trimmed_duplicates(fingerprints: &[VideoFingerprint]) -> Vec<DuplicatePair> {
180        let mut pairs = Vec::new();
181        let n = fingerprints.len();
182
183        for i in 0..n {
184            for j in (i + 1)..n {
185                let fp_a = &fingerprints[i];
186                let fp_b = &fingerprints[j];
187
188                if let Some((sim, offset_ms)) = sliding_window_match(fp_a, fp_b) {
189                    pairs.push(DuplicatePair {
190                        a_id: fp_a.video_id,
191                        b_id: fp_b.video_id,
192                        similarity: sim,
193                        offset_ms,
194                        is_trimmed: offset_ms != 0,
195                    });
196                }
197            }
198        }
199
200        pairs
201    }
202}
203
204/// Find the best sliding-window alignment between two keyframe hash sequences.
205///
206/// Returns `Some((similarity, offset_ms))` if the best window alignment
207/// produces a match ratio ≥ 0.5, or `None` otherwise.
208fn sliding_window_match(fp_a: &VideoFingerprint, fp_b: &VideoFingerprint) -> Option<(f32, i64)> {
209    let a = &fp_a.keyframe_hashes;
210    let b = &fp_b.keyframe_hashes;
211
212    if a.is_empty() || b.is_empty() {
213        return None;
214    }
215
216    // ms per keyframe (approximate)
217    let ms_per_frame_a = if a.len() > 1 {
218        fp_a.duration_ms as i64 / a.len() as i64
219    } else {
220        0
221    };
222
223    let mut best_sim = 0.0f32;
224    let mut best_offset_ms: i64 = 0;
225
226    // Try all offsets where the shorter sequence aligns against the longer
227    let (shorter, longer) = if a.len() <= b.len() { (a, b) } else { (b, a) };
228    let max_offset = longer.len().saturating_sub(shorter.len()) + 1;
229
230    for offset in 0..max_offset {
231        let window = &longer[offset..offset + shorter.len()];
232        let matches = shorter
233            .iter()
234            .zip(window.iter())
235            .filter(|(x, y)| x == y)
236            .count();
237        let sim = matches as f32 / shorter.len() as f32;
238        if sim > best_sim {
239            best_sim = sim;
240            best_offset_ms = offset as i64 * ms_per_frame_a;
241        }
242    }
243
244    if best_sim >= 0.5 {
245        Some((best_sim, best_offset_ms))
246    } else {
247        None
248    }
249}
250
251// ---------------------------------------------------------------------------
252// DuplicateCluster
253// ---------------------------------------------------------------------------
254
255/// A cluster of duplicate videos.
256#[derive(Debug, Clone)]
257pub struct DuplicateCluster {
258    /// ID of the representative (first seen) video in the cluster.
259    pub representative: u64,
260    /// IDs of all other members of the cluster.
261    pub members: Vec<u64>,
262}
263
264impl DuplicateCluster {
265    /// Build duplicate clusters from a list of pairs using union-find.
266    #[must_use]
267    pub fn build_clusters(pairs: &[DuplicatePair]) -> Vec<Self> {
268        // Collect all unique IDs
269        let mut ids: Vec<u64> = Vec::new();
270        for pair in pairs {
271            if !ids.contains(&pair.a_id) {
272                ids.push(pair.a_id);
273            }
274            if !ids.contains(&pair.b_id) {
275                ids.push(pair.b_id);
276            }
277        }
278
279        // Union-find: parent[i] = index of parent in `ids`
280        let mut parent: Vec<usize> = (0..ids.len()).collect();
281
282        let find = |parent: &mut Vec<usize>, mut x: usize| -> usize {
283            while parent[x] != x {
284                parent[x] = parent[parent[x]]; // path compression
285                x = parent[x];
286            }
287            x
288        };
289
290        // Union sets for each pair
291        for pair in pairs {
292            if let (Some(ai), Some(bi)) = (
293                ids.iter().position(|&id| id == pair.a_id),
294                ids.iter().position(|&id| id == pair.b_id),
295            ) {
296                let ra = find(&mut parent, ai);
297                let rb = find(&mut parent, bi);
298                if ra != rb {
299                    parent[rb] = ra;
300                }
301            }
302        }
303
304        // Flatten all nodes to roots
305        let roots: Vec<usize> = (0..ids.len()).map(|i| find(&mut parent, i)).collect();
306
307        // Group by root
308        let mut cluster_map: std::collections::HashMap<usize, Vec<u64>> =
309            std::collections::HashMap::new();
310        for (i, &root) in roots.iter().enumerate() {
311            cluster_map.entry(root).or_default().push(ids[i]);
312        }
313
314        // Build result: only include clusters with more than one member
315        cluster_map
316            .into_values()
317            .filter(|members| members.len() > 1)
318            .map(|mut members| {
319                members.sort_unstable();
320                let representative = members[0];
321                let rest = members[1..].to_vec();
322                DuplicateCluster {
323                    representative,
324                    members: rest,
325                }
326            })
327            .collect()
328    }
329}
330
331// ---------------------------------------------------------------------------
332// Unit tests
333// ---------------------------------------------------------------------------
334
335#[cfg(test)]
336mod tests {
337    use super::*;
338
339    fn make_fp(id: u64, hashes: Vec<u64>, duration_ms: u64) -> VideoFingerprint {
340        VideoFingerprint::new(id, hashes, duration_ms)
341    }
342
343    // --- VideoFingerprint tests ---
344
345    #[test]
346    fn test_temporal_hash_empty() {
347        let h = VideoFingerprint::compute_temporal_hash(&[]);
348        assert_eq!(h, 0);
349    }
350
351    #[test]
352    fn test_temporal_hash_deterministic() {
353        let hashes = vec![1u64, 2, 3, 4, 5];
354        let h1 = VideoFingerprint::compute_temporal_hash(&hashes);
355        let h2 = VideoFingerprint::compute_temporal_hash(&hashes);
356        assert_eq!(h1, h2);
357    }
358
359    #[test]
360    fn test_temporal_hash_order_sensitive() {
361        let h1 = VideoFingerprint::compute_temporal_hash(&[1, 2, 3]);
362        let h2 = VideoFingerprint::compute_temporal_hash(&[3, 2, 1]);
363        assert_ne!(h1, h2);
364    }
365
366    #[test]
367    fn test_keyframe_similarity_identical() {
368        let fp = make_fp(1, vec![1, 2, 3, 4], 4000);
369        assert_eq!(fp.keyframe_similarity(&fp), 1.0);
370    }
371
372    #[test]
373    fn test_keyframe_similarity_disjoint() {
374        let fp_a = make_fp(1, vec![1, 2, 3], 3000);
375        let fp_b = make_fp(2, vec![4, 5, 6], 3000);
376        assert_eq!(fp_a.keyframe_similarity(&fp_b), 0.0);
377    }
378
379    #[test]
380    fn test_keyframe_similarity_partial() {
381        let fp_a = make_fp(1, vec![1, 2, 3, 4], 4000);
382        let fp_b = make_fp(2, vec![3, 4, 5, 6], 4000);
383        let sim = fp_a.keyframe_similarity(&fp_b);
384        // intersection=2, union=6 → 2/6 ≈ 0.333
385        assert!((sim - 1.0 / 3.0).abs() < 0.01);
386    }
387
388    #[test]
389    fn test_keyframe_similarity_empty_both() {
390        let fp_a = make_fp(1, vec![], 0);
391        let fp_b = make_fp(2, vec![], 0);
392        assert_eq!(fp_a.keyframe_similarity(&fp_b), 1.0);
393    }
394
395    // --- VideoDuplicateDetector tests ---
396
397    #[test]
398    fn test_detector_empty() {
399        let detector = VideoDuplicateDetector::new();
400        let dups = detector.find_duplicates(0.5);
401        assert!(dups.is_empty());
402    }
403
404    #[test]
405    fn test_detector_single() {
406        let mut detector = VideoDuplicateDetector::new();
407        detector.add(make_fp(1, vec![1, 2, 3], 3000));
408        let dups = detector.find_duplicates(0.5);
409        assert!(dups.is_empty());
410    }
411
412    #[test]
413    fn test_detector_identical_pair() {
414        let mut detector = VideoDuplicateDetector::new();
415        detector.add(make_fp(1, vec![1, 2, 3, 4, 5], 5000));
416        detector.add(make_fp(2, vec![1, 2, 3, 4, 5], 5000));
417        let dups = detector.find_duplicates(0.9);
418        assert_eq!(dups.len(), 1);
419        assert_eq!(dups[0].0, 1);
420        assert_eq!(dups[0].1, 2);
421        assert_eq!(dups[0].2, 1.0);
422    }
423
424    #[test]
425    fn test_detector_no_match_below_threshold() {
426        let mut detector = VideoDuplicateDetector::new();
427        detector.add(make_fp(1, vec![1, 2, 3], 3000));
428        detector.add(make_fp(2, vec![4, 5, 6], 3000));
429        let dups = detector.find_duplicates(0.5);
430        assert!(dups.is_empty());
431    }
432
433    // --- TrimmedDuplicateDetector tests ---
434
435    #[test]
436    fn test_trimmed_identical() {
437        let hashes = vec![1u64, 2, 3, 4, 5, 6, 7, 8];
438        let fps = vec![
439            make_fp(1, hashes.clone(), 8000),
440            make_fp(2, hashes.clone(), 8000),
441        ];
442        let pairs = TrimmedDuplicateDetector::find_trimmed_duplicates(&fps);
443        assert!(!pairs.is_empty());
444        assert_eq!(pairs[0].similarity, 1.0);
445        assert!(!pairs[0].is_trimmed);
446    }
447
448    #[test]
449    fn test_trimmed_offset() {
450        // B is A with 2 frames prepended → trimmed duplicate
451        let hashes_a = vec![3u64, 4, 5, 6, 7];
452        let hashes_b = vec![1u64, 2, 3, 4, 5, 6, 7];
453        let fps = vec![make_fp(1, hashes_a, 5000), make_fp(2, hashes_b, 7000)];
454        let pairs = TrimmedDuplicateDetector::find_trimmed_duplicates(&fps);
455        assert!(!pairs.is_empty());
456        assert_eq!(pairs[0].similarity, 1.0);
457        assert!(pairs[0].is_trimmed);
458    }
459
460    #[test]
461    fn test_trimmed_no_match() {
462        let fps = vec![
463            make_fp(1, vec![1, 2, 3], 3000),
464            make_fp(2, vec![7, 8, 9], 3000),
465        ];
466        let pairs = TrimmedDuplicateDetector::find_trimmed_duplicates(&fps);
467        assert!(pairs.is_empty());
468    }
469
470    // --- DuplicateCluster tests ---
471
472    #[test]
473    fn test_clusters_empty_pairs() {
474        let clusters = DuplicateCluster::build_clusters(&[]);
475        assert!(clusters.is_empty());
476    }
477
478    #[test]
479    fn test_clusters_single_pair() {
480        let pairs = vec![DuplicatePair {
481            a_id: 1,
482            b_id: 2,
483            similarity: 1.0,
484            offset_ms: 0,
485            is_trimmed: false,
486        }];
487        let clusters = DuplicateCluster::build_clusters(&pairs);
488        assert_eq!(clusters.len(), 1);
489        assert_eq!(clusters[0].representative, 1);
490        assert!(clusters[0].members.contains(&2));
491    }
492
493    #[test]
494    fn test_clusters_chain() {
495        // 1-2, 2-3, 3-4 → single cluster {1,2,3,4}
496        let pairs = vec![
497            DuplicatePair {
498                a_id: 1,
499                b_id: 2,
500                similarity: 1.0,
501                offset_ms: 0,
502                is_trimmed: false,
503            },
504            DuplicatePair {
505                a_id: 2,
506                b_id: 3,
507                similarity: 1.0,
508                offset_ms: 0,
509                is_trimmed: false,
510            },
511            DuplicatePair {
512                a_id: 3,
513                b_id: 4,
514                similarity: 1.0,
515                offset_ms: 0,
516                is_trimmed: false,
517            },
518        ];
519        let clusters = DuplicateCluster::build_clusters(&pairs);
520        assert_eq!(clusters.len(), 1);
521        let total: usize = 1 + clusters[0].members.len();
522        assert_eq!(total, 4);
523    }
524
525    #[test]
526    fn test_clusters_two_separate() {
527        // {1,2} and {3,4} are independent clusters
528        let pairs = vec![
529            DuplicatePair {
530                a_id: 1,
531                b_id: 2,
532                similarity: 1.0,
533                offset_ms: 0,
534                is_trimmed: false,
535            },
536            DuplicatePair {
537                a_id: 3,
538                b_id: 4,
539                similarity: 1.0,
540                offset_ms: 0,
541                is_trimmed: false,
542            },
543        ];
544        let clusters = DuplicateCluster::build_clusters(&pairs);
545        assert_eq!(clusters.len(), 2);
546    }
547}