1#![allow(dead_code)]
7
8use std::collections::{BTreeMap, HashMap};
9use std::path::{Path, PathBuf};
10
11use crate::DedupResult;
12
13#[derive(Debug, Clone, PartialEq, Eq, Hash)]
15pub struct SegmentHash {
16 data: [u8; 32],
18 frame_count: usize,
20}
21
22impl SegmentHash {
23 #[must_use]
25 pub fn new(data: [u8; 32], frame_count: usize) -> Self {
26 Self { data, frame_count }
27 }
28
29 #[must_use]
31 pub fn from_bytes(bytes: &[u8], frame_count: usize) -> Self {
32 let mut data = [0u8; 32];
36 let mut state: u64 = 0xcbf2_9ce4_8422_2325; for &b in bytes {
38 state ^= u64::from(b);
39 state = state.wrapping_mul(0x0100_0000_01b3); }
41 let state_bytes = state.to_le_bytes();
43 data[..8].copy_from_slice(&state_bytes);
44 for chunk_idx in 1..4u64 {
46 state ^= chunk_idx;
47 for &b in bytes {
48 state ^= u64::from(b);
49 state = state.wrapping_mul(0x0100_0000_01b3);
50 }
51 let s = state.to_le_bytes();
52 let offset = chunk_idx as usize * 8;
53 data[offset..offset + 8].copy_from_slice(&s);
54 }
55 Self { data, frame_count }
56 }
57
58 #[must_use]
61 pub fn is_match(&self, other: &Self, max_diff_bits: u32) -> bool {
62 if self.frame_count != other.frame_count {
63 return false;
64 }
65 let diff: u32 = self
66 .data
67 .iter()
68 .zip(other.data.iter())
69 .map(|(a, b)| (a ^ b).count_ones())
70 .sum();
71 diff <= max_diff_bits
72 }
73
74 #[must_use]
76 pub fn as_bytes(&self) -> &[u8; 32] {
77 &self.data
78 }
79
80 #[must_use]
82 pub fn frame_count(&self) -> usize {
83 self.frame_count
84 }
85}
86
87#[derive(Debug, Clone)]
91pub struct SegmentDedupConfig {
92 pub window_size_frames: usize,
94 pub stride_frames: usize,
96 pub min_match_length_frames: usize,
98 pub max_diff_bits: u32,
100}
101
102impl Default for SegmentDedupConfig {
103 fn default() -> Self {
104 Self {
105 window_size_frames: 30,
106 stride_frames: 15,
107 min_match_length_frames: 15,
108 max_diff_bits: 4,
109 }
110 }
111}
112
113impl SegmentDedupConfig {
114 #[must_use]
116 pub fn new(window_size_frames: usize, stride_frames: usize, max_diff_bits: u32) -> Self {
117 Self {
118 window_size_frames,
119 stride_frames,
120 min_match_length_frames: window_size_frames / 2,
121 max_diff_bits,
122 }
123 }
124
125 #[must_use]
127 pub fn window_size_frames(&self) -> usize {
128 self.window_size_frames
129 }
130
131 #[must_use]
133 pub fn stride_frames(&self) -> usize {
134 self.stride_frames
135 }
136
137 #[must_use]
139 pub fn min_match_length_frames(&self) -> usize {
140 self.min_match_length_frames
141 }
142
143 #[must_use]
145 pub fn max_diff_bits(&self) -> u32 {
146 self.max_diff_bits
147 }
148}
149
150#[derive(Debug, Clone)]
154pub struct SegmentRecord {
155 pub source_id: String,
157 pub frame_offset: usize,
159 pub hash: SegmentHash,
161}
162
163#[derive(Debug, Default)]
167pub struct SegmentDeduplicator {
168 config: SegmentDedupConfig,
169 index: HashMap<[u8; 32], Vec<SegmentRecord>>,
171 unique_count: usize,
173}
174
175impl SegmentDeduplicator {
176 #[must_use]
178 pub fn new() -> Self {
179 Self::with_config(SegmentDedupConfig::default())
180 }
181
182 #[must_use]
184 pub fn with_config(config: SegmentDedupConfig) -> Self {
185 Self {
186 config,
187 index: HashMap::new(),
188 unique_count: 0,
189 }
190 }
191
192 pub fn add_segment(&mut self, source_id: &str, frame_offset: usize, bytes: &[u8]) {
195 let hash = SegmentHash::from_bytes(bytes, self.config.window_size_frames);
196 let key = *hash.as_bytes();
197 let is_new = !self.index.contains_key(&key);
198 self.index.entry(key).or_default().push(SegmentRecord {
199 source_id: source_id.to_string(),
200 frame_offset,
201 hash,
202 });
203 if is_new {
204 self.unique_count += 1;
205 }
206 }
207
208 #[must_use]
210 pub fn find_duplicates(&self) -> Vec<Vec<&SegmentRecord>> {
211 self.index
212 .values()
213 .filter(|group| group.len() > 1)
214 .map(|group| group.iter().collect())
215 .collect()
216 }
217
218 #[must_use]
220 pub fn unique_count(&self) -> usize {
221 self.unique_count
222 }
223
224 #[must_use]
226 pub fn total_count(&self) -> usize {
227 self.index.values().map(Vec::len).sum()
228 }
229
230 #[must_use]
232 pub fn config(&self) -> &SegmentDedupConfig {
233 &self.config
234 }
235}
236
237#[derive(Debug, Clone, PartialEq)]
241pub struct SharedClipMatch {
242 pub file_a: PathBuf,
244 pub file_b: PathBuf,
246 pub offset_a_frames: usize,
248 pub offset_b_frames: usize,
250 pub length_frames: usize,
252 pub confidence: f32,
254}
255
256fn hash_window(frames: &[u64]) -> u64 {
261 const BASE: u64 = 0x517c_c1b7_2722_0a95;
262 frames
263 .iter()
264 .fold(0u64, |acc, &h| acc.wrapping_mul(BASE).wrapping_add(h))
265}
266
267pub fn find_shared_clips_with_hashes(
279 file_pairs: &[(PathBuf, PathBuf)],
280 frame_hashes_a: &[Vec<u64>],
281 frame_hashes_b: &[Vec<u64>],
282 config: &SegmentDedupConfig,
283) -> DedupResult<Vec<SharedClipMatch>> {
284 assert_eq!(
285 file_pairs.len(),
286 frame_hashes_a.len(),
287 "file_pairs and frame_hashes_a must have the same length"
288 );
289 assert_eq!(
290 file_pairs.len(),
291 frame_hashes_b.len(),
292 "file_pairs and frame_hashes_b must have the same length"
293 );
294
295 let win = config.window_size_frames().max(1);
296 let stride = config.stride_frames().max(1);
297 let min_len = config.min_match_length_frames().max(1);
298
299 let mut matches = Vec::new();
300
301 for idx in 0..file_pairs.len() {
302 let (ref path_a, ref path_b) = file_pairs[idx];
303 let hashes_a = &frame_hashes_a[idx];
304 let hashes_b = &frame_hashes_b[idx];
305
306 let mut index_b: BTreeMap<u64, Vec<usize>> = BTreeMap::new();
308 let mut off = 0;
309 while off + win <= hashes_b.len() {
310 let wh = hash_window(&hashes_b[off..off + win]);
311 index_b.entry(wh).or_default().push(off);
312 off += stride;
313 }
314
315 let mut off_a = 0;
317 while off_a + win <= hashes_a.len() {
318 let wh = hash_window(&hashes_a[off_a..off_a + win]);
319 if let Some(b_offsets) = index_b.get(&wh) {
320 for &off_b in b_offsets {
321 let equal = hashes_a[off_a..off_a + win] == hashes_b[off_b..off_b + win];
323 if !equal {
324 off_a += stride;
325 continue;
326 }
327
328 let mut length = win;
330 while off_a + length < hashes_a.len()
331 && off_b + length < hashes_b.len()
332 && hashes_a[off_a + length] == hashes_b[off_b + length]
333 {
334 length += 1;
335 }
336
337 if length >= min_len {
338 let max_possible = hashes_a.len().max(hashes_b.len());
340 #[allow(clippy::cast_precision_loss)]
341 let confidence = (length as f32) / (max_possible as f32).max(1.0);
342 let confidence = confidence.min(1.0_f32);
343
344 matches.push(SharedClipMatch {
345 file_a: path_a.clone(),
346 file_b: path_b.clone(),
347 offset_a_frames: off_a,
348 offset_b_frames: off_b,
349 length_frames: length,
350 confidence,
351 });
352 }
353 }
354 }
355 off_a += stride;
356 }
357 }
358
359 Ok(matches)
360}
361
362pub fn find_shared_clips<F>(
368 file_pairs: &[(PathBuf, PathBuf)],
369 config: &SegmentDedupConfig,
370 mut hash_provider: F,
371) -> DedupResult<Vec<SharedClipMatch>>
372where
373 F: FnMut(&Path) -> DedupResult<Vec<u64>>,
374{
375 let mut hashes_a = Vec::with_capacity(file_pairs.len());
376 let mut hashes_b = Vec::with_capacity(file_pairs.len());
377
378 for (path_a, path_b) in file_pairs {
379 hashes_a.push(hash_provider(path_a)?);
380 hashes_b.push(hash_provider(path_b)?);
381 }
382
383 find_shared_clips_with_hashes(file_pairs, &hashes_a, &hashes_b, config)
384}
385
386#[cfg(test)]
389mod tests {
390 use super::*;
391
392 fn make_hash(byte: u8, frames: usize) -> SegmentHash {
393 let mut data = [0u8; 32];
394 data[0] = byte;
395 SegmentHash::new(data, frames)
396 }
397
398 #[test]
399 fn test_segment_hash_is_match_exact() {
400 let h1 = make_hash(0xAB, 30);
401 let h2 = make_hash(0xAB, 30);
402 assert!(h1.is_match(&h2, 0));
403 }
404
405 #[test]
406 fn test_segment_hash_no_match_different_frames() {
407 let h1 = make_hash(0xAB, 30);
408 let h2 = make_hash(0xAB, 60);
409 assert!(!h1.is_match(&h2, 100));
410 }
411
412 #[test]
413 fn test_segment_hash_hamming_tolerance() {
414 let mut d1 = [0u8; 32];
415 let mut d2 = [0u8; 32];
416 d1[0] = 0b0000_0001;
417 d2[0] = 0b0000_0011; let h1 = SegmentHash::new(d1, 30);
419 let h2 = SegmentHash::new(d2, 30);
420 assert!(h1.is_match(&h2, 1));
421 assert!(!h1.is_match(&h2, 0));
422 }
423
424 #[test]
425 fn test_segment_hash_from_bytes() {
426 let h = SegmentHash::from_bytes(b"hello world", 15);
427 assert_eq!(h.frame_count(), 15);
428 assert_ne!(h.as_bytes(), &[0u8; 32]);
429 }
430
431 #[test]
432 fn test_config_window_size_frames() {
433 let cfg = SegmentDedupConfig::new(48, 24, 8);
434 assert_eq!(cfg.window_size_frames(), 48);
435 assert_eq!(cfg.stride_frames(), 24);
436 assert_eq!(cfg.max_diff_bits(), 8);
437 }
438
439 #[test]
440 fn test_config_default() {
441 let cfg = SegmentDedupConfig::default();
442 assert_eq!(cfg.window_size_frames(), 30);
443 }
444
445 #[test]
446 fn test_add_segment_unique_count() {
447 let mut dedup = SegmentDeduplicator::new();
448 dedup.add_segment("source_a", 0, b"segment_content_one");
449 dedup.add_segment("source_b", 0, b"segment_content_two");
450 assert_eq!(dedup.unique_count(), 2);
451 }
452
453 #[test]
454 fn test_add_segment_duplicate_increments_total_not_unique() {
455 let mut dedup = SegmentDeduplicator::new();
456 dedup.add_segment("source_a", 0, b"same_content");
457 dedup.add_segment("source_b", 0, b"same_content");
458 assert_eq!(dedup.unique_count(), 1);
460 assert_eq!(dedup.total_count(), 2);
461 }
462
463 #[test]
464 fn test_find_duplicates_empty() {
465 let dedup = SegmentDeduplicator::new();
466 assert!(dedup.find_duplicates().is_empty());
467 }
468
469 #[test]
470 fn test_find_duplicates_no_dups() {
471 let mut dedup = SegmentDeduplicator::new();
472 dedup.add_segment("a", 0, b"aaa");
473 dedup.add_segment("b", 0, b"bbb");
474 assert!(dedup.find_duplicates().is_empty());
475 }
476
477 #[test]
478 fn test_find_duplicates_with_dups() {
479 let mut dedup = SegmentDeduplicator::new();
480 dedup.add_segment("src_a", 0, b"identical_bytes");
481 dedup.add_segment("src_b", 0, b"identical_bytes");
482 dedup.add_segment("src_c", 30, b"different");
483 let dups = dedup.find_duplicates();
484 assert_eq!(dups.len(), 1);
485 assert_eq!(dups[0].len(), 2);
486 }
487
488 #[test]
489 fn test_with_config_preserves_config() {
490 let cfg = SegmentDedupConfig::new(60, 30, 2);
491 let dedup = SegmentDeduplicator::with_config(cfg);
492 assert_eq!(dedup.config().window_size_frames(), 60);
493 }
494
495 #[test]
496 fn test_segment_record_fields() {
497 let mut dedup = SegmentDeduplicator::new();
498 dedup.add_segment("my_video.mp4", 120, b"frame_data_xyz");
499 let total = dedup.total_count();
501 assert_eq!(total, 1);
502 }
503
504 #[test]
505 fn test_multiple_sources_multiple_segments() {
506 let mut dedup = SegmentDeduplicator::new();
507 for i in 0u8..5 {
508 dedup.add_segment("fileA", (i as usize) * 30, &[i; 64]);
509 dedup.add_segment("fileB", (i as usize) * 30, &[i; 64]);
510 }
511 assert_eq!(dedup.unique_count(), 5);
513 assert_eq!(dedup.total_count(), 10);
514 assert_eq!(dedup.find_duplicates().len(), 5);
515 }
516
517 fn phash_seq(values: &[u64], repeat: usize) -> Vec<u64> {
521 values
522 .iter()
523 .cloned()
524 .cycle()
525 .take(values.len() * repeat)
526 .collect()
527 }
528
529 #[test]
530 fn test_segment_dedup_identical_content_detected() {
531 let frames: Vec<u64> = (0u64..60).map(|i| i * 0x1234_5678).collect();
533 let config = SegmentDedupConfig {
534 window_size_frames: 10,
535 stride_frames: 5,
536 min_match_length_frames: 10,
537 max_diff_bits: 0,
538 };
539 let path_a = PathBuf::from("video_a.mp4");
540 let path_b = PathBuf::from("video_b.mp4");
541 let pairs = vec![(path_a.clone(), path_b.clone())];
542
543 let result = find_shared_clips_with_hashes(
544 &pairs,
545 std::slice::from_ref(&frames),
546 std::slice::from_ref(&frames),
547 &config,
548 )
549 .expect("find_shared_clips_with_hashes should succeed");
550
551 assert!(!result.is_empty(), "should detect at least one shared clip");
552 let best = result
554 .iter()
555 .max_by(|a, b| a.confidence.partial_cmp(&b.confidence).unwrap())
556 .expect("at least one match");
557 assert!(
558 (best.confidence - 1.0).abs() < f32::EPSILON,
559 "confidence should be 1.0 for identical sequences, got {}",
560 best.confidence
561 );
562 }
563
564 #[test]
565 fn test_segment_dedup_no_match_returns_empty() {
566 let frames_a: Vec<u64> = (0u64..60).map(|i| i * 0xAAAA_AAAA).collect();
568 let frames_b: Vec<u64> = (0u64..60).map(|i| i * 0x5555_5555 + 1).collect();
569
570 let config = SegmentDedupConfig {
571 window_size_frames: 10,
572 stride_frames: 5,
573 min_match_length_frames: 10,
574 max_diff_bits: 0,
575 };
576 let pairs = vec![(PathBuf::from("unique_a.mp4"), PathBuf::from("unique_b.mp4"))];
577
578 let result = find_shared_clips_with_hashes(&pairs, &[frames_a], &[frames_b], &config)
579 .expect("find_shared_clips_with_hashes should succeed");
580
581 assert!(
582 result.is_empty(),
583 "completely different sequences should produce no matches"
584 );
585 }
586}