1#![allow(dead_code)]
10
11use std::collections::HashMap;
12
13#[derive(Debug, Clone, PartialEq, Eq)]
17pub struct SegmentFingerprint {
18 pub hash: u64,
20 pub frame_count: usize,
22 pub duration_ms: u64,
24}
25
26impl SegmentFingerprint {
27 #[must_use]
29 pub fn new(hash: u64, frame_count: usize, duration_ms: u64) -> Self {
30 Self {
31 hash,
32 frame_count,
33 duration_ms,
34 }
35 }
36
37 #[must_use]
43 pub fn from_frame_data(data: &[u8], frame_count: usize, duration_ms: u64) -> Self {
44 let hash = perceptual_hash_u64(data);
45 Self {
46 hash,
47 frame_count,
48 duration_ms,
49 }
50 }
51
52 #[must_use]
54 pub fn hamming_distance(&self, other: &Self) -> u32 {
55 (self.hash ^ other.hash).count_ones()
56 }
57
58 #[must_use]
62 pub fn ms_per_frame(&self) -> u64 {
63 if self.frame_count == 0 {
64 0
65 } else {
66 self.duration_ms / self.frame_count as u64
67 }
68 }
69}
70
71fn perceptual_hash_u64(data: &[u8]) -> u64 {
78 if data.is_empty() {
79 return 0;
80 }
81
82 const SEGS: usize = 65;
84 let seg_size = (data.len() + SEGS - 1) / SEGS;
85 let mut seg_vals = [0u64; SEGS];
86
87 for (i, chunk) in data.chunks(seg_size.max(1)).enumerate() {
88 if i >= SEGS {
89 break;
90 }
91 let fnv_offset: u64 =
93 0xcbf2_9ce4_8422_2325u64.wrapping_add((i as u64).wrapping_mul(0x9e37_79b9_7f4a_7c15));
94 let mut state: u64 = fnv_offset;
95 for &b in chunk {
96 state ^= u64::from(b);
97 state = state.wrapping_mul(0x0100_0000_01b3);
98 }
99 seg_vals[i] = state;
100 }
101
102 let mut hash = 0u64;
104 for i in 0..64 {
105 if seg_vals[i] > seg_vals[i + 1] {
106 hash |= 1u64 << i;
107 }
108 }
109 hash
110}
111
112#[must_use]
122pub fn match_segments(a: &SegmentFingerprint, b: &SegmentFingerprint) -> f32 {
123 let differing_bits = (a.hash ^ b.hash).count_ones();
124 1.0 - (differing_bits as f32 / 64.0)
126}
127
128#[derive(Debug, Clone)]
132pub struct TemporalWindow {
133 pub hashes: Vec<u64>,
135 pub start_idx: usize,
137 pub duration_ms: u64,
139}
140
141impl TemporalWindow {
142 #[must_use]
144 pub fn from_fingerprints(fps: &[SegmentFingerprint], start_idx: usize) -> Self {
145 let hashes = fps.iter().map(|f| f.hash).collect();
146 let duration_ms = fps.iter().map(|f| f.duration_ms).sum();
147 Self {
148 hashes,
149 start_idx,
150 duration_ms,
151 }
152 }
153
154 #[must_use]
158 pub fn similarity(&self, other: &Self) -> f32 {
159 if self.hashes.is_empty() || other.hashes.is_empty() {
160 return 0.0;
161 }
162 if self.hashes.len() != other.hashes.len() {
163 return 0.0;
164 }
165 let total: f32 = self
166 .hashes
167 .iter()
168 .zip(other.hashes.iter())
169 .map(|(&a, &b)| {
170 let diff = (a ^ b).count_ones();
171 1.0 - (diff as f32 / 64.0)
172 })
173 .sum();
174 total / self.hashes.len() as f32
175 }
176}
177
178#[derive(Debug, Clone)]
186pub struct TemporalWindowMatcher {
187 pub window_size: usize,
189 pub stride: usize,
191}
192
193impl TemporalWindowMatcher {
194 #[must_use]
196 pub fn new(window_size: usize, stride: usize) -> Self {
197 Self {
198 window_size: window_size.max(1),
199 stride: stride.max(1),
200 }
201 }
202
203 #[must_use]
205 pub fn with_defaults() -> Self {
206 Self::new(4, 2)
207 }
208
209 #[must_use]
216 pub fn compare_sequences(&self, a: &[SegmentFingerprint], b: &[SegmentFingerprint]) -> f32 {
217 if a.len() < self.window_size || b.len() < self.window_size {
218 return 0.0;
219 }
220
221 let windows_a = self.extract_windows(a);
222 let windows_b = self.extract_windows(b);
223
224 let mut best = 0.0f32;
225 for wa in &windows_a {
226 for wb in &windows_b {
227 let sim = wa.similarity(wb);
228 if sim > best {
229 best = sim;
230 }
231 }
232 }
233 best
234 }
235
236 #[must_use]
241 pub fn find_best_alignment(
242 &self,
243 a: &[SegmentFingerprint],
244 b: &[SegmentFingerprint],
245 ) -> Option<(usize, usize, f32)> {
246 if a.len() < self.window_size || b.len() < self.window_size {
247 return None;
248 }
249
250 let windows_a = self.extract_windows(a);
251 let windows_b = self.extract_windows(b);
252
253 let mut best_sim = 0.0f32;
254 let mut best_offset_a = 0usize;
255 let mut best_offset_b = 0usize;
256
257 for wa in &windows_a {
258 for wb in &windows_b {
259 let sim = wa.similarity(wb);
260 if sim > best_sim {
261 best_sim = sim;
262 best_offset_a = wa.start_idx;
263 best_offset_b = wb.start_idx;
264 }
265 }
266 }
267
268 if best_sim > 0.0 {
269 Some((best_offset_a, best_offset_b, best_sim))
270 } else {
271 None
272 }
273 }
274
275 fn extract_windows(&self, seq: &[SegmentFingerprint]) -> Vec<TemporalWindow> {
277 if seq.len() < self.window_size {
278 return Vec::new();
279 }
280
281 let mut windows = Vec::new();
282 let mut start = 0;
283 while start + self.window_size <= seq.len() {
284 let slice = &seq[start..start + self.window_size];
285 windows.push(TemporalWindow::from_fingerprints(slice, start));
286 start += self.stride;
287 }
288 windows
289 }
290}
291
292#[derive(Debug, Clone, PartialEq)]
296pub struct SegmentMatch {
297 pub video_a: String,
299 pub segment_a_idx: usize,
301 pub video_b: String,
303 pub segment_b_idx: usize,
305 pub similarity: f32,
307 pub time_offset_ms: i64,
311}
312
313#[derive(Debug, Default)]
321pub struct VideoSegmentDeduplicator {
322 videos: HashMap<String, Vec<SegmentFingerprint>>,
324 threshold: f32,
326}
327
328impl VideoSegmentDeduplicator {
329 #[must_use]
331 pub fn new() -> Self {
332 Self {
333 videos: HashMap::new(),
334 threshold: 0.8,
335 }
336 }
337
338 #[must_use]
340 pub fn with_threshold(threshold: f32) -> Self {
341 Self {
342 videos: HashMap::new(),
343 threshold: threshold.clamp(0.0, 1.0),
344 }
345 }
346
347 pub fn add_video(&mut self, video_id: &str, fingerprints: Vec<SegmentFingerprint>) {
349 self.videos.insert(video_id.to_owned(), fingerprints);
350 }
351
352 #[must_use]
354 pub fn video_count(&self) -> usize {
355 self.videos.len()
356 }
357
358 #[must_use]
360 pub fn is_empty(&self) -> bool {
361 self.videos.is_empty()
362 }
363
364 #[must_use]
372 pub fn find_duplicate_segments(&self) -> Vec<SegmentMatch> {
373 self.find_duplicate_segments_with_threshold(self.threshold)
374 }
375
376 #[must_use]
378 pub fn find_duplicate_segments_with_threshold(&self, threshold: f32) -> Vec<SegmentMatch> {
379 let mut matches = Vec::new();
380
381 let video_ids: Vec<&String> = self.videos.keys().collect();
382 let n = video_ids.len();
383
384 for i in 0..n {
385 for j in (i + 1)..n {
386 let id_a = video_ids[i];
387 let id_b = video_ids[j];
388 let fps_a = &self.videos[id_a];
389 let fps_b = &self.videos[id_b];
390
391 let mut video_matches =
392 compare_segment_sequences(id_a, fps_a, id_b, fps_b, threshold);
393 matches.append(&mut video_matches);
394 }
395 }
396
397 matches
398 }
399
400 #[must_use]
403 pub fn query(
404 &self,
405 video_id: &str,
406 fingerprints: &[SegmentFingerprint],
407 threshold: f32,
408 ) -> Vec<SegmentMatch> {
409 let mut matches = Vec::new();
410
411 for (indexed_id, indexed_fps) in &self.videos {
412 if indexed_id == video_id {
413 continue;
414 }
415 let mut m = compare_segment_sequences(
416 video_id,
417 fingerprints,
418 indexed_id,
419 indexed_fps,
420 threshold,
421 );
422 matches.append(&mut m);
423 }
424
425 matches
426 }
427}
428
429fn compare_segment_sequences(
431 id_a: &str,
432 fps_a: &[SegmentFingerprint],
433 id_b: &str,
434 fps_b: &[SegmentFingerprint],
435 threshold: f32,
436) -> Vec<SegmentMatch> {
437 let mut matches = Vec::new();
438
439 for (i, fp_a) in fps_a.iter().enumerate() {
440 for (j, fp_b) in fps_b.iter().enumerate() {
441 let sim = match_segments(fp_a, fp_b);
442 if sim >= threshold {
443 let time_a: i64 = fps_a[..i].iter().map(|f| f.duration_ms as i64).sum();
445 let time_b: i64 = fps_b[..j].iter().map(|f| f.duration_ms as i64).sum();
446 let time_offset_ms = time_b - time_a;
447
448 matches.push(SegmentMatch {
449 video_a: id_a.to_owned(),
450 segment_a_idx: i,
451 video_b: id_b.to_owned(),
452 segment_b_idx: j,
453 similarity: sim,
454 time_offset_ms,
455 });
456 }
457 }
458 }
459
460 matches
461}
462
463#[cfg(test)]
466mod tests {
467 use super::*;
468
469 #[test]
472 fn test_segment_fingerprint_new() {
473 let fp = SegmentFingerprint::new(0xDEAD_BEEF_1234_5678, 30, 1000);
474 assert_eq!(fp.hash, 0xDEAD_BEEF_1234_5678);
475 assert_eq!(fp.frame_count, 30);
476 assert_eq!(fp.duration_ms, 1000);
477 }
478
479 #[test]
480 fn test_from_frame_data_deterministic() {
481 let data = b"hello video frame data here";
482 let fp1 = SegmentFingerprint::from_frame_data(data, 10, 500);
483 let fp2 = SegmentFingerprint::from_frame_data(data, 10, 500);
484 assert_eq!(fp1.hash, fp2.hash);
485 assert_eq!(fp1.frame_count, 10);
486 assert_eq!(fp1.duration_ms, 500);
487 }
488
489 #[test]
490 fn test_from_frame_data_different_inputs_differ() {
491 let data_a: Vec<u8> = (0u8..=127).collect();
494 let data_b: Vec<u8> = (128u8..=255).collect();
495 let fp_a = SegmentFingerprint::from_frame_data(&data_a, 10, 500);
496 let fp_b = SegmentFingerprint::from_frame_data(&data_b, 10, 500);
497 assert_ne!(fp_a.hash, fp_b.hash);
498 }
499
500 #[test]
501 fn test_from_frame_data_empty() {
502 let fp = SegmentFingerprint::from_frame_data(b"", 0, 0);
503 assert_eq!(fp.hash, 0);
504 assert_eq!(fp.frame_count, 0);
505 }
506
507 #[test]
508 fn test_hamming_distance_identical() {
509 let fp = SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 30, 1000);
510 assert_eq!(fp.hamming_distance(&fp), 0);
511 }
512
513 #[test]
514 fn test_hamming_distance_all_differ() {
515 let fp_a = SegmentFingerprint::new(0x0000_0000_0000_0000, 30, 1000);
516 let fp_b = SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 30, 1000);
517 assert_eq!(fp_a.hamming_distance(&fp_b), 64);
518 }
519
520 #[test]
521 fn test_ms_per_frame() {
522 let fp = SegmentFingerprint::new(0, 30, 1000);
523 assert_eq!(fp.ms_per_frame(), 33); }
525
526 #[test]
527 fn test_ms_per_frame_zero_frames() {
528 let fp = SegmentFingerprint::new(0, 0, 1000);
529 assert_eq!(fp.ms_per_frame(), 0);
530 }
531
532 #[test]
535 fn test_match_segments_identical() {
536 let fp = SegmentFingerprint::new(0x1234_5678_ABCD_EF01, 30, 1000);
537 let sim = match_segments(&fp, &fp);
538 assert!((sim - 1.0).abs() < f32::EPSILON);
539 }
540
541 #[test]
542 fn test_match_segments_completely_different() {
543 let fp_a = SegmentFingerprint::new(0x0000_0000_0000_0000, 30, 1000);
544 let fp_b = SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 30, 1000);
545 let sim = match_segments(&fp_a, &fp_b);
546 assert!((sim - 0.0).abs() < f32::EPSILON);
547 }
548
549 #[test]
550 fn test_match_segments_half_different() {
551 let fp_a = SegmentFingerprint::new(0x0000_0000_FFFF_FFFF, 30, 1000);
553 let fp_b = SegmentFingerprint::new(0xFFFF_FFFF_0000_0000, 30, 1000);
554 let sim = match_segments(&fp_a, &fp_b);
557 assert!((sim - 0.0).abs() < f32::EPSILON);
558 }
559
560 #[test]
561 fn test_match_segments_near_identical() {
562 let fp_a = SegmentFingerprint::new(0b1000, 30, 1000);
564 let fp_b = SegmentFingerprint::new(0b1001, 30, 1000); let sim = match_segments(&fp_a, &fp_b);
566 let expected = 1.0 - (1.0 / 64.0);
567 assert!((sim - expected).abs() < 0.001);
568 }
569
570 #[test]
571 fn test_match_segments_ignores_frame_count() {
572 let hash = 0xCAFE_BABE_DEAD_BEEF;
573 let fp_a = SegmentFingerprint::new(hash, 10, 500);
574 let fp_b = SegmentFingerprint::new(hash, 99, 9999);
575 let sim = match_segments(&fp_a, &fp_b);
577 assert!((sim - 1.0).abs() < f32::EPSILON);
578 }
579
580 #[test]
583 fn test_temporal_window_from_fingerprints() {
584 let fps = vec![
585 SegmentFingerprint::new(0xAA, 10, 300),
586 SegmentFingerprint::new(0xBB, 10, 400),
587 SegmentFingerprint::new(0xCC, 10, 500),
588 ];
589 let win = TemporalWindow::from_fingerprints(&fps, 5);
590 assert_eq!(win.hashes, vec![0xAA, 0xBB, 0xCC]);
591 assert_eq!(win.start_idx, 5);
592 assert_eq!(win.duration_ms, 1200);
593 }
594
595 #[test]
596 fn test_temporal_window_similarity_identical() {
597 let fps = vec![
598 SegmentFingerprint::new(0x11, 10, 100),
599 SegmentFingerprint::new(0x22, 10, 100),
600 ];
601 let w1 = TemporalWindow::from_fingerprints(&fps, 0);
602 let w2 = TemporalWindow::from_fingerprints(&fps, 0);
603 assert!((w1.similarity(&w2) - 1.0).abs() < 0.001);
604 }
605
606 #[test]
607 fn test_temporal_window_similarity_different_lengths() {
608 let fps_a = vec![SegmentFingerprint::new(0x11, 10, 100)];
609 let fps_b = vec![
610 SegmentFingerprint::new(0x11, 10, 100),
611 SegmentFingerprint::new(0x22, 10, 100),
612 ];
613 let w1 = TemporalWindow::from_fingerprints(&fps_a, 0);
614 let w2 = TemporalWindow::from_fingerprints(&fps_b, 0);
615 assert!((w1.similarity(&w2) - 0.0).abs() < f32::EPSILON);
616 }
617
618 #[test]
619 fn test_temporal_window_similarity_empty() {
620 let w1 = TemporalWindow {
621 hashes: vec![],
622 start_idx: 0,
623 duration_ms: 0,
624 };
625 let w2 = TemporalWindow {
626 hashes: vec![],
627 start_idx: 0,
628 duration_ms: 0,
629 };
630 assert!((w1.similarity(&w2) - 0.0).abs() < f32::EPSILON);
631 }
632
633 #[test]
636 fn test_matcher_compare_identical_sequences() {
637 let fps: Vec<SegmentFingerprint> = (0..8)
638 .map(|i| SegmentFingerprint::new(i * 0x1111_1111_1111_1111, 10, 100))
639 .collect();
640 let matcher = TemporalWindowMatcher::new(4, 2);
641 let sim = matcher.compare_sequences(&fps, &fps);
642 assert!((sim - 1.0).abs() < 0.001);
643 }
644
645 #[test]
646 fn test_matcher_compare_too_short() {
647 let fps = vec![SegmentFingerprint::new(0xAB, 10, 100)];
648 let matcher = TemporalWindowMatcher::new(4, 2);
649 let sim = matcher.compare_sequences(&fps, &fps);
650 assert!((sim - 0.0).abs() < f32::EPSILON);
651 }
652
653 #[test]
654 fn test_matcher_find_best_alignment_identical() {
655 let fps: Vec<SegmentFingerprint> = (0..6)
656 .map(|i| SegmentFingerprint::new(i as u64, 10, 100))
657 .collect();
658 let matcher = TemporalWindowMatcher::new(3, 1);
659 let result = matcher.find_best_alignment(&fps, &fps);
660 assert!(result.is_some());
661 let (_, _, sim) = result.expect("alignment found");
662 assert!((sim - 1.0).abs() < 0.001);
663 }
664
665 #[test]
666 fn test_matcher_find_best_alignment_empty_sequences() {
667 let fps: Vec<SegmentFingerprint> = vec![];
668 let matcher = TemporalWindowMatcher::new(4, 2);
669 assert!(matcher.find_best_alignment(&fps, &fps).is_none());
670 }
671
672 #[test]
675 fn test_deduplicator_empty() {
676 let dedup = VideoSegmentDeduplicator::new();
677 assert_eq!(dedup.video_count(), 0);
678 assert!(dedup.is_empty());
679 assert!(dedup.find_duplicate_segments().is_empty());
680 }
681
682 #[test]
683 fn test_deduplicator_single_video_no_matches() {
684 let mut dedup = VideoSegmentDeduplicator::new();
685 let fps = vec![SegmentFingerprint::new(0x1234, 10, 500)];
686 dedup.add_video("video_a", fps);
687 assert_eq!(dedup.video_count(), 1);
688 assert!(dedup.find_duplicate_segments().is_empty());
690 }
691
692 #[test]
693 fn test_deduplicator_identical_videos_match() {
694 let mut dedup = VideoSegmentDeduplicator::with_threshold(0.9);
695 let fps: Vec<SegmentFingerprint> = vec![
696 SegmentFingerprint::new(0xAAAA_AAAA_AAAA_AAAA, 10, 500),
697 SegmentFingerprint::new(0xBBBB_BBBB_BBBB_BBBB, 10, 500),
698 ];
699 dedup.add_video("video_a", fps.clone());
700 dedup.add_video("video_b", fps);
701
702 let matches = dedup.find_duplicate_segments();
703 assert!(!matches.is_empty());
704 for m in &matches {
705 assert!(m.similarity >= 0.9);
706 }
707 }
708
709 #[test]
710 fn test_deduplicator_query_without_indexing() {
711 let mut dedup = VideoSegmentDeduplicator::new();
712 let fps_indexed = vec![SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500)];
713 dedup.add_video("indexed", fps_indexed);
714
715 let fps_query = vec![SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500)];
716 let results = dedup.query("query_video", &fps_query, 0.99);
717 assert_eq!(results.len(), 1);
718 assert_eq!(results[0].video_a, "query_video");
719 assert_eq!(results[0].video_b, "indexed");
720 }
721
722 #[test]
723 fn test_deduplicator_time_offset_computed() {
724 let mut dedup = VideoSegmentDeduplicator::with_threshold(0.99);
725 let fps_a = vec![
727 SegmentFingerprint::new(0x0000_0000_0000_0001, 10, 500),
728 SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500),
729 ];
730 let fps_b = vec![SegmentFingerprint::new(0xFFFF_FFFF_FFFF_FFFF, 10, 500)];
732 dedup.add_video("a", fps_a);
733 dedup.add_video("b", fps_b);
734
735 let matches = dedup.find_duplicate_segments();
736 assert!(!matches.is_empty(), "expected at least one match");
737
738 let m = matches.iter().find(|m| {
741 (m.video_a == "a" && m.segment_a_idx == 1 && m.video_b == "b" && m.segment_b_idx == 0)
743 || (m.video_a == "b" && m.segment_a_idx == 0 && m.video_b == "a" && m.segment_b_idx == 1)
745 });
746 assert!(m.is_some(), "expected match between a[1] and b[0]");
747
748 let m = m.expect("match found");
749 assert!(
753 m.time_offset_ms == -500 || m.time_offset_ms == 500,
754 "expected ±500ms offset, got {}",
755 m.time_offset_ms
756 );
757 }
758}