Skip to main content

oximedia_dedup/
segment_dedup.rs

1//! Segment-level deduplication for media streams.
2//!
3//! Detects duplicate segments within or across media files using
4//! rolling window hashing over frame sequences.
5
6#![allow(dead_code)]
7
8use std::collections::{BTreeMap, HashMap};
9use std::path::{Path, PathBuf};
10
11use crate::DedupResult;
12
13/// Hash value representing a media segment.
14#[derive(Debug, Clone, PartialEq, Eq, Hash)]
15pub struct SegmentHash {
16    /// Raw hash bytes (32-byte BLAKE3-style)
17    data: [u8; 32],
18    /// Number of frames covered
19    frame_count: usize,
20}
21
22impl SegmentHash {
23    /// Create a new segment hash from raw bytes and frame count.
24    #[must_use]
25    pub fn new(data: [u8; 32], frame_count: usize) -> Self {
26        Self { data, frame_count }
27    }
28
29    /// Construct a segment hash by XOR-folding a byte slice into 32 bytes.
30    #[must_use]
31    pub fn from_bytes(bytes: &[u8], frame_count: usize) -> Self {
32        // Use a simple FNV-1a–style rolling hash that feeds into the 32-byte
33        // output, ensuring different byte sequences produce different hashes
34        // even when the same byte value repeats at stride-32 offsets.
35        let mut data = [0u8; 32];
36        let mut state: u64 = 0xcbf2_9ce4_8422_2325; // FNV offset basis
37        for &b in bytes {
38            state ^= u64::from(b);
39            state = state.wrapping_mul(0x0100_0000_01b3); // FNV prime
40        }
41        // Spread the 64-bit state into the first 8 bytes
42        let state_bytes = state.to_le_bytes();
43        data[..8].copy_from_slice(&state_bytes);
44        // Fill the remaining 24 bytes with additional mixing rounds
45        for chunk_idx in 1..4u64 {
46            state ^= chunk_idx;
47            for &b in bytes {
48                state ^= u64::from(b);
49                state = state.wrapping_mul(0x0100_0000_01b3);
50            }
51            let s = state.to_le_bytes();
52            let offset = chunk_idx as usize * 8;
53            data[offset..offset + 8].copy_from_slice(&s);
54        }
55        Self { data, frame_count }
56    }
57
58    /// Returns `true` if this hash matches another within a given
59    /// Hamming-bit tolerance.
60    #[must_use]
61    pub fn is_match(&self, other: &Self, max_diff_bits: u32) -> bool {
62        if self.frame_count != other.frame_count {
63            return false;
64        }
65        let diff: u32 = self
66            .data
67            .iter()
68            .zip(other.data.iter())
69            .map(|(a, b)| (a ^ b).count_ones())
70            .sum();
71        diff <= max_diff_bits
72    }
73
74    /// Returns the raw bytes of this hash.
75    #[must_use]
76    pub fn as_bytes(&self) -> &[u8; 32] {
77        &self.data
78    }
79
80    /// Returns the number of frames this hash covers.
81    #[must_use]
82    pub fn frame_count(&self) -> usize {
83        self.frame_count
84    }
85}
86
87// ── Config ────────────────────────────────────────────────────────────────────
88
89/// Configuration for segment-level deduplication.
90#[derive(Debug, Clone)]
91pub struct SegmentDedupConfig {
92    /// Number of frames per dedup window.
93    pub window_size_frames: usize,
94    /// Step between consecutive windows (stride).
95    pub stride_frames: usize,
96    /// Minimum number of consecutive matching frames to report as a shared clip.
97    pub min_match_length_frames: usize,
98    /// Maximum Hamming-bit distance to consider two segments identical.
99    pub max_diff_bits: u32,
100}
101
102impl Default for SegmentDedupConfig {
103    fn default() -> Self {
104        Self {
105            window_size_frames: 30,
106            stride_frames: 15,
107            min_match_length_frames: 15,
108            max_diff_bits: 4,
109        }
110    }
111}
112
113impl SegmentDedupConfig {
114    /// Create a new config with explicit parameters.
115    #[must_use]
116    pub fn new(window_size_frames: usize, stride_frames: usize, max_diff_bits: u32) -> Self {
117        Self {
118            window_size_frames,
119            stride_frames,
120            min_match_length_frames: window_size_frames / 2,
121            max_diff_bits,
122        }
123    }
124
125    /// Returns the window size in frames.
126    #[must_use]
127    pub fn window_size_frames(&self) -> usize {
128        self.window_size_frames
129    }
130
131    /// Returns the stride in frames.
132    #[must_use]
133    pub fn stride_frames(&self) -> usize {
134        self.stride_frames
135    }
136
137    /// Returns the minimum match length in frames required to emit a result.
138    #[must_use]
139    pub fn min_match_length_frames(&self) -> usize {
140        self.min_match_length_frames
141    }
142
143    /// Returns the maximum allowed Hamming-bit difference.
144    #[must_use]
145    pub fn max_diff_bits(&self) -> u32 {
146        self.max_diff_bits
147    }
148}
149
150// ── Segment record ────────────────────────────────────────────────────────────
151
152/// A record of one segment inserted into the deduplicator.
153#[derive(Debug, Clone)]
154pub struct SegmentRecord {
155    /// Source identifier (e.g. file path or stream ID).
156    pub source_id: String,
157    /// Frame offset where this segment begins.
158    pub frame_offset: usize,
159    /// The hash of this segment.
160    pub hash: SegmentHash,
161}
162
163// ── Deduplicator ──────────────────────────────────────────────────────────────
164
165/// Performs segment-level deduplication over a corpus of media segments.
166#[derive(Debug, Default)]
167pub struct SegmentDeduplicator {
168    config: SegmentDedupConfig,
169    /// Map from hash -> list of segment records sharing that hash.
170    index: HashMap<[u8; 32], Vec<SegmentRecord>>,
171    /// Total unique hashes inserted.
172    unique_count: usize,
173}
174
175impl SegmentDeduplicator {
176    /// Create a new deduplicator with default configuration.
177    #[must_use]
178    pub fn new() -> Self {
179        Self::with_config(SegmentDedupConfig::default())
180    }
181
182    /// Create a new deduplicator with explicit configuration.
183    #[must_use]
184    pub fn with_config(config: SegmentDedupConfig) -> Self {
185        Self {
186            config,
187            index: HashMap::new(),
188            unique_count: 0,
189        }
190    }
191
192    /// Add a segment identified by `source_id` starting at `frame_offset`
193    /// with the given raw bytes (representing that segment's content).
194    pub fn add_segment(&mut self, source_id: &str, frame_offset: usize, bytes: &[u8]) {
195        let hash = SegmentHash::from_bytes(bytes, self.config.window_size_frames);
196        let key = *hash.as_bytes();
197        let is_new = !self.index.contains_key(&key);
198        self.index.entry(key).or_default().push(SegmentRecord {
199            source_id: source_id.to_string(),
200            frame_offset,
201            hash,
202        });
203        if is_new {
204            self.unique_count += 1;
205        }
206    }
207
208    /// Find all duplicate segment groups (groups where 2+ sources share a hash).
209    #[must_use]
210    pub fn find_duplicates(&self) -> Vec<Vec<&SegmentRecord>> {
211        self.index
212            .values()
213            .filter(|group| group.len() > 1)
214            .map(|group| group.iter().collect())
215            .collect()
216    }
217
218    /// Returns the number of unique segment hashes stored.
219    #[must_use]
220    pub fn unique_count(&self) -> usize {
221        self.unique_count
222    }
223
224    /// Returns the total number of segment records stored.
225    #[must_use]
226    pub fn total_count(&self) -> usize {
227        self.index.values().map(Vec::len).sum()
228    }
229
230    /// Returns the underlying config.
231    #[must_use]
232    pub fn config(&self) -> &SegmentDedupConfig {
233        &self.config
234    }
235}
236
237// ── Shared-clip detection ─────────────────────────────────────────────────────
238
239/// A match describing a shared clip found between two video files.
240#[derive(Debug, Clone, PartialEq)]
241pub struct SharedClipMatch {
242    /// First file in the pair.
243    pub file_a: PathBuf,
244    /// Second file in the pair.
245    pub file_b: PathBuf,
246    /// Frame offset in `file_a` where the shared clip begins.
247    pub offset_a_frames: usize,
248    /// Frame offset in `file_b` where the shared clip begins.
249    pub offset_b_frames: usize,
250    /// Length of the shared clip in frames.
251    pub length_frames: usize,
252    /// Similarity confidence in [0.0, 1.0].
253    pub confidence: f32,
254}
255
256/// Compute a compact 64-bit rolling hash over a window of `u64` pHash values.
257///
258/// Uses a Rabin-fingerprint-style polynomial with multiplication + XOR fold so
259/// that the hash changes with every slide.
260fn hash_window(frames: &[u64]) -> u64 {
261    const BASE: u64 = 0x517c_c1b7_2722_0a95;
262    frames
263        .iter()
264        .fold(0u64, |acc, &h| acc.wrapping_mul(BASE).wrapping_add(h))
265}
266
267/// Find shared clips across pairs of pHash frame sequences.
268///
269/// For each `(file_a, file_b)` pair in `file_pairs`, the function receives
270/// the pHash frame sequence for each file via `hash_provider`, then slides a
271/// window of `config.window_size_frames` over both sequences in strides of
272/// `config.stride_frames`.  Matching windows (by hash value) that can be
273/// extended to at least `config.min_match_length_frames` are returned as
274/// [`SharedClipMatch`] entries.
275///
276/// The `hash_provider` closure is called with a file path and must return the
277/// ordered sequence of per-frame pHash values for that file.
278pub fn find_shared_clips_with_hashes(
279    file_pairs: &[(PathBuf, PathBuf)],
280    frame_hashes_a: &[Vec<u64>],
281    frame_hashes_b: &[Vec<u64>],
282    config: &SegmentDedupConfig,
283) -> DedupResult<Vec<SharedClipMatch>> {
284    assert_eq!(
285        file_pairs.len(),
286        frame_hashes_a.len(),
287        "file_pairs and frame_hashes_a must have the same length"
288    );
289    assert_eq!(
290        file_pairs.len(),
291        frame_hashes_b.len(),
292        "file_pairs and frame_hashes_b must have the same length"
293    );
294
295    let win = config.window_size_frames().max(1);
296    let stride = config.stride_frames().max(1);
297    let min_len = config.min_match_length_frames().max(1);
298
299    let mut matches = Vec::new();
300
301    for idx in 0..file_pairs.len() {
302        let (ref path_a, ref path_b) = file_pairs[idx];
303        let hashes_a = &frame_hashes_a[idx];
304        let hashes_b = &frame_hashes_b[idx];
305
306        // Build a BTreeMap<window_hash, Vec<offset_in_b>> from file_b's windows.
307        let mut index_b: BTreeMap<u64, Vec<usize>> = BTreeMap::new();
308        let mut off = 0;
309        while off + win <= hashes_b.len() {
310            let wh = hash_window(&hashes_b[off..off + win]);
311            index_b.entry(wh).or_default().push(off);
312            off += stride;
313        }
314
315        // Slide over file_a and look up matching windows.
316        let mut off_a = 0;
317        while off_a + win <= hashes_a.len() {
318            let wh = hash_window(&hashes_a[off_a..off_a + win]);
319            if let Some(b_offsets) = index_b.get(&wh) {
320                for &off_b in b_offsets {
321                    // Verify element-wise equality (guard against hash collisions).
322                    let equal = hashes_a[off_a..off_a + win] == hashes_b[off_b..off_b + win];
323                    if !equal {
324                        off_a += stride;
325                        continue;
326                    }
327
328                    // Extend the match as far as frames are identical.
329                    let mut length = win;
330                    while off_a + length < hashes_a.len()
331                        && off_b + length < hashes_b.len()
332                        && hashes_a[off_a + length] == hashes_b[off_b + length]
333                    {
334                        length += 1;
335                    }
336
337                    if length >= min_len {
338                        // Compute confidence: fraction of matched frames vs pairwise union.
339                        let max_possible = hashes_a.len().max(hashes_b.len());
340                        #[allow(clippy::cast_precision_loss)]
341                        let confidence = (length as f32) / (max_possible as f32).max(1.0);
342                        let confidence = confidence.min(1.0_f32);
343
344                        matches.push(SharedClipMatch {
345                            file_a: path_a.clone(),
346                            file_b: path_b.clone(),
347                            offset_a_frames: off_a,
348                            offset_b_frames: off_b,
349                            length_frames: length,
350                            confidence,
351                        });
352                    }
353                }
354            }
355            off_a += stride;
356        }
357    }
358
359    Ok(matches)
360}
361
362/// Find shared clips across file pairs.
363///
364/// This is a convenience wrapper that uses a caller-supplied closure to obtain
365/// per-frame pHash sequences for each file path, then delegates to
366/// [`find_shared_clips_with_hashes`].
367pub fn find_shared_clips<F>(
368    file_pairs: &[(PathBuf, PathBuf)],
369    config: &SegmentDedupConfig,
370    mut hash_provider: F,
371) -> DedupResult<Vec<SharedClipMatch>>
372where
373    F: FnMut(&Path) -> DedupResult<Vec<u64>>,
374{
375    let mut hashes_a = Vec::with_capacity(file_pairs.len());
376    let mut hashes_b = Vec::with_capacity(file_pairs.len());
377
378    for (path_a, path_b) in file_pairs {
379        hashes_a.push(hash_provider(path_a)?);
380        hashes_b.push(hash_provider(path_b)?);
381    }
382
383    find_shared_clips_with_hashes(file_pairs, &hashes_a, &hashes_b, config)
384}
385
386// ── Tests ─────────────────────────────────────────────────────────────────────
387
388#[cfg(test)]
389mod tests {
390    use super::*;
391
392    fn make_hash(byte: u8, frames: usize) -> SegmentHash {
393        let mut data = [0u8; 32];
394        data[0] = byte;
395        SegmentHash::new(data, frames)
396    }
397
398    #[test]
399    fn test_segment_hash_is_match_exact() {
400        let h1 = make_hash(0xAB, 30);
401        let h2 = make_hash(0xAB, 30);
402        assert!(h1.is_match(&h2, 0));
403    }
404
405    #[test]
406    fn test_segment_hash_no_match_different_frames() {
407        let h1 = make_hash(0xAB, 30);
408        let h2 = make_hash(0xAB, 60);
409        assert!(!h1.is_match(&h2, 100));
410    }
411
412    #[test]
413    fn test_segment_hash_hamming_tolerance() {
414        let mut d1 = [0u8; 32];
415        let mut d2 = [0u8; 32];
416        d1[0] = 0b0000_0001;
417        d2[0] = 0b0000_0011; // 1-bit difference
418        let h1 = SegmentHash::new(d1, 30);
419        let h2 = SegmentHash::new(d2, 30);
420        assert!(h1.is_match(&h2, 1));
421        assert!(!h1.is_match(&h2, 0));
422    }
423
424    #[test]
425    fn test_segment_hash_from_bytes() {
426        let h = SegmentHash::from_bytes(b"hello world", 15);
427        assert_eq!(h.frame_count(), 15);
428        assert_ne!(h.as_bytes(), &[0u8; 32]);
429    }
430
431    #[test]
432    fn test_config_window_size_frames() {
433        let cfg = SegmentDedupConfig::new(48, 24, 8);
434        assert_eq!(cfg.window_size_frames(), 48);
435        assert_eq!(cfg.stride_frames(), 24);
436        assert_eq!(cfg.max_diff_bits(), 8);
437    }
438
439    #[test]
440    fn test_config_default() {
441        let cfg = SegmentDedupConfig::default();
442        assert_eq!(cfg.window_size_frames(), 30);
443    }
444
445    #[test]
446    fn test_add_segment_unique_count() {
447        let mut dedup = SegmentDeduplicator::new();
448        dedup.add_segment("source_a", 0, b"segment_content_one");
449        dedup.add_segment("source_b", 0, b"segment_content_two");
450        assert_eq!(dedup.unique_count(), 2);
451    }
452
453    #[test]
454    fn test_add_segment_duplicate_increments_total_not_unique() {
455        let mut dedup = SegmentDeduplicator::new();
456        dedup.add_segment("source_a", 0, b"same_content");
457        dedup.add_segment("source_b", 0, b"same_content");
458        // same bytes → same hash → unique_count stays 1, total = 2
459        assert_eq!(dedup.unique_count(), 1);
460        assert_eq!(dedup.total_count(), 2);
461    }
462
463    #[test]
464    fn test_find_duplicates_empty() {
465        let dedup = SegmentDeduplicator::new();
466        assert!(dedup.find_duplicates().is_empty());
467    }
468
469    #[test]
470    fn test_find_duplicates_no_dups() {
471        let mut dedup = SegmentDeduplicator::new();
472        dedup.add_segment("a", 0, b"aaa");
473        dedup.add_segment("b", 0, b"bbb");
474        assert!(dedup.find_duplicates().is_empty());
475    }
476
477    #[test]
478    fn test_find_duplicates_with_dups() {
479        let mut dedup = SegmentDeduplicator::new();
480        dedup.add_segment("src_a", 0, b"identical_bytes");
481        dedup.add_segment("src_b", 0, b"identical_bytes");
482        dedup.add_segment("src_c", 30, b"different");
483        let dups = dedup.find_duplicates();
484        assert_eq!(dups.len(), 1);
485        assert_eq!(dups[0].len(), 2);
486    }
487
488    #[test]
489    fn test_with_config_preserves_config() {
490        let cfg = SegmentDedupConfig::new(60, 30, 2);
491        let dedup = SegmentDeduplicator::with_config(cfg);
492        assert_eq!(dedup.config().window_size_frames(), 60);
493    }
494
495    #[test]
496    fn test_segment_record_fields() {
497        let mut dedup = SegmentDeduplicator::new();
498        dedup.add_segment("my_video.mp4", 120, b"frame_data_xyz");
499        // Verify the record is stored correctly.
500        let total = dedup.total_count();
501        assert_eq!(total, 1);
502    }
503
504    #[test]
505    fn test_multiple_sources_multiple_segments() {
506        let mut dedup = SegmentDeduplicator::new();
507        for i in 0u8..5 {
508            dedup.add_segment("fileA", (i as usize) * 30, &[i; 64]);
509            dedup.add_segment("fileB", (i as usize) * 30, &[i; 64]);
510        }
511        // 5 unique hashes, 10 total
512        assert_eq!(dedup.unique_count(), 5);
513        assert_eq!(dedup.total_count(), 10);
514        assert_eq!(dedup.find_duplicates().len(), 5);
515    }
516
517    // ── find_shared_clips tests ───────────────────────────────────────────────
518
519    /// Helper: build a Vec<u64> from a repeating pattern of values.
520    fn phash_seq(values: &[u64], repeat: usize) -> Vec<u64> {
521        values
522            .iter()
523            .cloned()
524            .cycle()
525            .take(values.len() * repeat)
526            .collect()
527    }
528
529    #[test]
530    fn test_segment_dedup_identical_content_detected() {
531        // Both files have the exact same 60-frame pHash sequence.
532        let frames: Vec<u64> = (0u64..60).map(|i| i * 0x1234_5678).collect();
533        let config = SegmentDedupConfig {
534            window_size_frames: 10,
535            stride_frames: 5,
536            min_match_length_frames: 10,
537            max_diff_bits: 0,
538        };
539        let path_a = PathBuf::from("video_a.mp4");
540        let path_b = PathBuf::from("video_b.mp4");
541        let pairs = vec![(path_a.clone(), path_b.clone())];
542
543        let result = find_shared_clips_with_hashes(
544            &pairs,
545            std::slice::from_ref(&frames),
546            std::slice::from_ref(&frames),
547            &config,
548        )
549        .expect("find_shared_clips_with_hashes should succeed");
550
551        assert!(!result.is_empty(), "should detect at least one shared clip");
552        // The best match should have confidence = 1.0 when sequences are identical.
553        let best = result
554            .iter()
555            .max_by(|a, b| a.confidence.partial_cmp(&b.confidence).unwrap())
556            .expect("at least one match");
557        assert!(
558            (best.confidence - 1.0).abs() < f32::EPSILON,
559            "confidence should be 1.0 for identical sequences, got {}",
560            best.confidence
561        );
562    }
563
564    #[test]
565    fn test_segment_dedup_no_match_returns_empty() {
566        // File A and File B have completely different pHash sequences.
567        let frames_a: Vec<u64> = (0u64..60).map(|i| i * 0xAAAA_AAAA).collect();
568        let frames_b: Vec<u64> = (0u64..60).map(|i| i * 0x5555_5555 + 1).collect();
569
570        let config = SegmentDedupConfig {
571            window_size_frames: 10,
572            stride_frames: 5,
573            min_match_length_frames: 10,
574            max_diff_bits: 0,
575        };
576        let pairs = vec![(PathBuf::from("unique_a.mp4"), PathBuf::from("unique_b.mp4"))];
577
578        let result = find_shared_clips_with_hashes(&pairs, &[frames_a], &[frames_b], &config)
579            .expect("find_shared_clips_with_hashes should succeed");
580
581        assert!(
582            result.is_empty(),
583            "completely different sequences should produce no matches"
584        );
585    }
586}