1#![allow(dead_code)]
10
11#[derive(Debug, Clone, PartialEq, Eq)]
20pub struct VideoFingerprint {
21 pub video_id: u64,
23 pub keyframe_hashes: Vec<u64>,
25 pub temporal_hash: u64,
27 pub duration_ms: u64,
29}
30
31impl VideoFingerprint {
32 #[must_use]
34 pub fn new(video_id: u64, keyframe_hashes: Vec<u64>, duration_ms: u64) -> Self {
35 let temporal_hash = Self::compute_temporal_hash(&keyframe_hashes);
36 Self {
37 video_id,
38 keyframe_hashes,
39 temporal_hash,
40 duration_ms,
41 }
42 }
43
44 #[must_use]
48 pub fn compute_temporal_hash(frame_hashes: &[u64]) -> u64 {
49 let mut acc: u64 = 0;
50 for &h in frame_hashes {
51 acc = acc.rotate_left(7) ^ h;
52 }
53 acc
54 }
55
56 #[must_use]
59 pub fn keyframe_similarity(&self, other: &Self) -> f32 {
60 if self.keyframe_hashes.is_empty() && other.keyframe_hashes.is_empty() {
61 return 1.0;
62 }
63
64 let intersection = self
65 .keyframe_hashes
66 .iter()
67 .filter(|h| other.keyframe_hashes.contains(h))
68 .count();
69
70 let union = self.keyframe_hashes.len() + other.keyframe_hashes.len() - intersection;
72
73 if union == 0 {
74 return 0.0;
75 }
76
77 intersection as f32 / union as f32
78 }
79}
80
81#[derive(Debug, Clone, PartialEq)]
87pub struct DuplicatePair {
88 pub a_id: u64,
90 pub b_id: u64,
92 pub similarity: f32,
94 pub offset_ms: i64,
96 pub is_trimmed: bool,
98}
99
100pub struct VideoDuplicateDetector {
106 fingerprints: Vec<VideoFingerprint>,
108}
109
110impl VideoDuplicateDetector {
111 #[must_use]
113 pub fn new() -> Self {
114 Self {
115 fingerprints: Vec::new(),
116 }
117 }
118
119 pub fn add(&mut self, fingerprint: VideoFingerprint) {
121 self.fingerprints.push(fingerprint);
122 }
123
124 #[must_use]
128 pub fn find_duplicates(&self, threshold: f32) -> Vec<(u64, u64, f32)> {
129 let mut results = Vec::new();
130
131 let n = self.fingerprints.len();
132 for i in 0..n {
133 for j in (i + 1)..n {
134 let fp_a = &self.fingerprints[i];
135 let fp_b = &self.fingerprints[j];
136
137 let sim = fp_a.keyframe_similarity(fp_b);
138 if sim >= threshold {
139 results.push((fp_a.video_id, fp_b.video_id, sim));
140 }
141 }
142 }
143
144 results
145 }
146
147 #[must_use]
149 pub fn len(&self) -> usize {
150 self.fingerprints.len()
151 }
152
153 #[must_use]
155 pub fn is_empty(&self) -> bool {
156 self.fingerprints.is_empty()
157 }
158}
159
160impl Default for VideoDuplicateDetector {
161 fn default() -> Self {
162 Self::new()
163 }
164}
165
166pub struct TrimmedDuplicateDetector;
172
173impl TrimmedDuplicateDetector {
174 #[must_use]
179 pub fn find_trimmed_duplicates(fingerprints: &[VideoFingerprint]) -> Vec<DuplicatePair> {
180 let mut pairs = Vec::new();
181 let n = fingerprints.len();
182
183 for i in 0..n {
184 for j in (i + 1)..n {
185 let fp_a = &fingerprints[i];
186 let fp_b = &fingerprints[j];
187
188 if let Some((sim, offset_ms)) = sliding_window_match(fp_a, fp_b) {
189 pairs.push(DuplicatePair {
190 a_id: fp_a.video_id,
191 b_id: fp_b.video_id,
192 similarity: sim,
193 offset_ms,
194 is_trimmed: offset_ms != 0,
195 });
196 }
197 }
198 }
199
200 pairs
201 }
202}
203
204fn sliding_window_match(fp_a: &VideoFingerprint, fp_b: &VideoFingerprint) -> Option<(f32, i64)> {
209 let a = &fp_a.keyframe_hashes;
210 let b = &fp_b.keyframe_hashes;
211
212 if a.is_empty() || b.is_empty() {
213 return None;
214 }
215
216 let ms_per_frame_a = if a.len() > 1 {
218 fp_a.duration_ms as i64 / a.len() as i64
219 } else {
220 0
221 };
222
223 let mut best_sim = 0.0f32;
224 let mut best_offset_ms: i64 = 0;
225
226 let (shorter, longer) = if a.len() <= b.len() { (a, b) } else { (b, a) };
228 let max_offset = longer.len().saturating_sub(shorter.len()) + 1;
229
230 for offset in 0..max_offset {
231 let window = &longer[offset..offset + shorter.len()];
232 let matches = shorter
233 .iter()
234 .zip(window.iter())
235 .filter(|(x, y)| x == y)
236 .count();
237 let sim = matches as f32 / shorter.len() as f32;
238 if sim > best_sim {
239 best_sim = sim;
240 best_offset_ms = offset as i64 * ms_per_frame_a;
241 }
242 }
243
244 if best_sim >= 0.5 {
245 Some((best_sim, best_offset_ms))
246 } else {
247 None
248 }
249}
250
251#[derive(Debug, Clone)]
257pub struct DuplicateCluster {
258 pub representative: u64,
260 pub members: Vec<u64>,
262}
263
264impl DuplicateCluster {
265 #[must_use]
267 pub fn build_clusters(pairs: &[DuplicatePair]) -> Vec<Self> {
268 let mut ids: Vec<u64> = Vec::new();
270 for pair in pairs {
271 if !ids.contains(&pair.a_id) {
272 ids.push(pair.a_id);
273 }
274 if !ids.contains(&pair.b_id) {
275 ids.push(pair.b_id);
276 }
277 }
278
279 let mut parent: Vec<usize> = (0..ids.len()).collect();
281
282 let find = |parent: &mut Vec<usize>, mut x: usize| -> usize {
283 while parent[x] != x {
284 parent[x] = parent[parent[x]]; x = parent[x];
286 }
287 x
288 };
289
290 for pair in pairs {
292 if let (Some(ai), Some(bi)) = (
293 ids.iter().position(|&id| id == pair.a_id),
294 ids.iter().position(|&id| id == pair.b_id),
295 ) {
296 let ra = find(&mut parent, ai);
297 let rb = find(&mut parent, bi);
298 if ra != rb {
299 parent[rb] = ra;
300 }
301 }
302 }
303
304 let roots: Vec<usize> = (0..ids.len()).map(|i| find(&mut parent, i)).collect();
306
307 let mut cluster_map: std::collections::HashMap<usize, Vec<u64>> =
309 std::collections::HashMap::new();
310 for (i, &root) in roots.iter().enumerate() {
311 cluster_map.entry(root).or_default().push(ids[i]);
312 }
313
314 cluster_map
316 .into_values()
317 .filter(|members| members.len() > 1)
318 .map(|mut members| {
319 members.sort_unstable();
320 let representative = members[0];
321 let rest = members[1..].to_vec();
322 DuplicateCluster {
323 representative,
324 members: rest,
325 }
326 })
327 .collect()
328 }
329}
330
331#[cfg(test)]
336mod tests {
337 use super::*;
338
339 fn make_fp(id: u64, hashes: Vec<u64>, duration_ms: u64) -> VideoFingerprint {
340 VideoFingerprint::new(id, hashes, duration_ms)
341 }
342
343 #[test]
346 fn test_temporal_hash_empty() {
347 let h = VideoFingerprint::compute_temporal_hash(&[]);
348 assert_eq!(h, 0);
349 }
350
351 #[test]
352 fn test_temporal_hash_deterministic() {
353 let hashes = vec![1u64, 2, 3, 4, 5];
354 let h1 = VideoFingerprint::compute_temporal_hash(&hashes);
355 let h2 = VideoFingerprint::compute_temporal_hash(&hashes);
356 assert_eq!(h1, h2);
357 }
358
359 #[test]
360 fn test_temporal_hash_order_sensitive() {
361 let h1 = VideoFingerprint::compute_temporal_hash(&[1, 2, 3]);
362 let h2 = VideoFingerprint::compute_temporal_hash(&[3, 2, 1]);
363 assert_ne!(h1, h2);
364 }
365
366 #[test]
367 fn test_keyframe_similarity_identical() {
368 let fp = make_fp(1, vec![1, 2, 3, 4], 4000);
369 assert_eq!(fp.keyframe_similarity(&fp), 1.0);
370 }
371
372 #[test]
373 fn test_keyframe_similarity_disjoint() {
374 let fp_a = make_fp(1, vec![1, 2, 3], 3000);
375 let fp_b = make_fp(2, vec![4, 5, 6], 3000);
376 assert_eq!(fp_a.keyframe_similarity(&fp_b), 0.0);
377 }
378
379 #[test]
380 fn test_keyframe_similarity_partial() {
381 let fp_a = make_fp(1, vec![1, 2, 3, 4], 4000);
382 let fp_b = make_fp(2, vec![3, 4, 5, 6], 4000);
383 let sim = fp_a.keyframe_similarity(&fp_b);
384 assert!((sim - 1.0 / 3.0).abs() < 0.01);
386 }
387
388 #[test]
389 fn test_keyframe_similarity_empty_both() {
390 let fp_a = make_fp(1, vec![], 0);
391 let fp_b = make_fp(2, vec![], 0);
392 assert_eq!(fp_a.keyframe_similarity(&fp_b), 1.0);
393 }
394
395 #[test]
398 fn test_detector_empty() {
399 let detector = VideoDuplicateDetector::new();
400 let dups = detector.find_duplicates(0.5);
401 assert!(dups.is_empty());
402 }
403
404 #[test]
405 fn test_detector_single() {
406 let mut detector = VideoDuplicateDetector::new();
407 detector.add(make_fp(1, vec![1, 2, 3], 3000));
408 let dups = detector.find_duplicates(0.5);
409 assert!(dups.is_empty());
410 }
411
412 #[test]
413 fn test_detector_identical_pair() {
414 let mut detector = VideoDuplicateDetector::new();
415 detector.add(make_fp(1, vec![1, 2, 3, 4, 5], 5000));
416 detector.add(make_fp(2, vec![1, 2, 3, 4, 5], 5000));
417 let dups = detector.find_duplicates(0.9);
418 assert_eq!(dups.len(), 1);
419 assert_eq!(dups[0].0, 1);
420 assert_eq!(dups[0].1, 2);
421 assert_eq!(dups[0].2, 1.0);
422 }
423
424 #[test]
425 fn test_detector_no_match_below_threshold() {
426 let mut detector = VideoDuplicateDetector::new();
427 detector.add(make_fp(1, vec![1, 2, 3], 3000));
428 detector.add(make_fp(2, vec![4, 5, 6], 3000));
429 let dups = detector.find_duplicates(0.5);
430 assert!(dups.is_empty());
431 }
432
433 #[test]
436 fn test_trimmed_identical() {
437 let hashes = vec![1u64, 2, 3, 4, 5, 6, 7, 8];
438 let fps = vec![
439 make_fp(1, hashes.clone(), 8000),
440 make_fp(2, hashes.clone(), 8000),
441 ];
442 let pairs = TrimmedDuplicateDetector::find_trimmed_duplicates(&fps);
443 assert!(!pairs.is_empty());
444 assert_eq!(pairs[0].similarity, 1.0);
445 assert!(!pairs[0].is_trimmed);
446 }
447
448 #[test]
449 fn test_trimmed_offset() {
450 let hashes_a = vec![3u64, 4, 5, 6, 7];
452 let hashes_b = vec![1u64, 2, 3, 4, 5, 6, 7];
453 let fps = vec![make_fp(1, hashes_a, 5000), make_fp(2, hashes_b, 7000)];
454 let pairs = TrimmedDuplicateDetector::find_trimmed_duplicates(&fps);
455 assert!(!pairs.is_empty());
456 assert_eq!(pairs[0].similarity, 1.0);
457 assert!(pairs[0].is_trimmed);
458 }
459
460 #[test]
461 fn test_trimmed_no_match() {
462 let fps = vec![
463 make_fp(1, vec![1, 2, 3], 3000),
464 make_fp(2, vec![7, 8, 9], 3000),
465 ];
466 let pairs = TrimmedDuplicateDetector::find_trimmed_duplicates(&fps);
467 assert!(pairs.is_empty());
468 }
469
470 #[test]
473 fn test_clusters_empty_pairs() {
474 let clusters = DuplicateCluster::build_clusters(&[]);
475 assert!(clusters.is_empty());
476 }
477
478 #[test]
479 fn test_clusters_single_pair() {
480 let pairs = vec![DuplicatePair {
481 a_id: 1,
482 b_id: 2,
483 similarity: 1.0,
484 offset_ms: 0,
485 is_trimmed: false,
486 }];
487 let clusters = DuplicateCluster::build_clusters(&pairs);
488 assert_eq!(clusters.len(), 1);
489 assert_eq!(clusters[0].representative, 1);
490 assert!(clusters[0].members.contains(&2));
491 }
492
493 #[test]
494 fn test_clusters_chain() {
495 let pairs = vec![
497 DuplicatePair {
498 a_id: 1,
499 b_id: 2,
500 similarity: 1.0,
501 offset_ms: 0,
502 is_trimmed: false,
503 },
504 DuplicatePair {
505 a_id: 2,
506 b_id: 3,
507 similarity: 1.0,
508 offset_ms: 0,
509 is_trimmed: false,
510 },
511 DuplicatePair {
512 a_id: 3,
513 b_id: 4,
514 similarity: 1.0,
515 offset_ms: 0,
516 is_trimmed: false,
517 },
518 ];
519 let clusters = DuplicateCluster::build_clusters(&pairs);
520 assert_eq!(clusters.len(), 1);
521 let total: usize = 1 + clusters[0].members.len();
522 assert_eq!(total, 4);
523 }
524
525 #[test]
526 fn test_clusters_two_separate() {
527 let pairs = vec![
529 DuplicatePair {
530 a_id: 1,
531 b_id: 2,
532 similarity: 1.0,
533 offset_ms: 0,
534 is_trimmed: false,
535 },
536 DuplicatePair {
537 a_id: 3,
538 b_id: 4,
539 similarity: 1.0,
540 offset_ms: 0,
541 is_trimmed: false,
542 },
543 ];
544 let clusters = DuplicateCluster::build_clusters(&pairs);
545 assert_eq!(clusters.len(), 2);
546 }
547}