#![allow(dead_code)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VideoFingerprint {
pub video_id: u64,
pub keyframe_hashes: Vec<u64>,
pub temporal_hash: u64,
pub duration_ms: u64,
}
impl VideoFingerprint {
#[must_use]
pub fn new(video_id: u64, keyframe_hashes: Vec<u64>, duration_ms: u64) -> Self {
let temporal_hash = Self::compute_temporal_hash(&keyframe_hashes);
Self {
video_id,
keyframe_hashes,
temporal_hash,
duration_ms,
}
}
#[must_use]
pub fn compute_temporal_hash(frame_hashes: &[u64]) -> u64 {
let mut acc: u64 = 0;
for &h in frame_hashes {
acc = acc.rotate_left(7) ^ h;
}
acc
}
#[must_use]
pub fn keyframe_similarity(&self, other: &Self) -> f32 {
if self.keyframe_hashes.is_empty() && other.keyframe_hashes.is_empty() {
return 1.0;
}
let intersection = self
.keyframe_hashes
.iter()
.filter(|h| other.keyframe_hashes.contains(h))
.count();
let union = self.keyframe_hashes.len() + other.keyframe_hashes.len() - intersection;
if union == 0 {
return 0.0;
}
intersection as f32 / union as f32
}
}
#[derive(Debug, Clone, PartialEq)]
pub struct DuplicatePair {
pub a_id: u64,
pub b_id: u64,
pub similarity: f32,
pub offset_ms: i64,
pub is_trimmed: bool,
}
pub struct VideoDuplicateDetector {
fingerprints: Vec<VideoFingerprint>,
}
impl VideoDuplicateDetector {
#[must_use]
pub fn new() -> Self {
Self {
fingerprints: Vec::new(),
}
}
pub fn add(&mut self, fingerprint: VideoFingerprint) {
self.fingerprints.push(fingerprint);
}
#[must_use]
pub fn find_duplicates(&self, threshold: f32) -> Vec<(u64, u64, f32)> {
let mut results = Vec::new();
let n = self.fingerprints.len();
for i in 0..n {
for j in (i + 1)..n {
let fp_a = &self.fingerprints[i];
let fp_b = &self.fingerprints[j];
let sim = fp_a.keyframe_similarity(fp_b);
if sim >= threshold {
results.push((fp_a.video_id, fp_b.video_id, sim));
}
}
}
results
}
#[must_use]
pub fn len(&self) -> usize {
self.fingerprints.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.fingerprints.is_empty()
}
}
impl Default for VideoDuplicateDetector {
fn default() -> Self {
Self::new()
}
}
pub struct TrimmedDuplicateDetector;
impl TrimmedDuplicateDetector {
#[must_use]
pub fn find_trimmed_duplicates(fingerprints: &[VideoFingerprint]) -> Vec<DuplicatePair> {
let mut pairs = Vec::new();
let n = fingerprints.len();
for i in 0..n {
for j in (i + 1)..n {
let fp_a = &fingerprints[i];
let fp_b = &fingerprints[j];
if let Some((sim, offset_ms)) = sliding_window_match(fp_a, fp_b) {
pairs.push(DuplicatePair {
a_id: fp_a.video_id,
b_id: fp_b.video_id,
similarity: sim,
offset_ms,
is_trimmed: offset_ms != 0,
});
}
}
}
pairs
}
}
fn sliding_window_match(fp_a: &VideoFingerprint, fp_b: &VideoFingerprint) -> Option<(f32, i64)> {
let a = &fp_a.keyframe_hashes;
let b = &fp_b.keyframe_hashes;
if a.is_empty() || b.is_empty() {
return None;
}
let ms_per_frame_a = if a.len() > 1 {
fp_a.duration_ms as i64 / a.len() as i64
} else {
0
};
let mut best_sim = 0.0f32;
let mut best_offset_ms: i64 = 0;
let (shorter, longer) = if a.len() <= b.len() { (a, b) } else { (b, a) };
let max_offset = longer.len().saturating_sub(shorter.len()) + 1;
for offset in 0..max_offset {
let window = &longer[offset..offset + shorter.len()];
let matches = shorter
.iter()
.zip(window.iter())
.filter(|(x, y)| x == y)
.count();
let sim = matches as f32 / shorter.len() as f32;
if sim > best_sim {
best_sim = sim;
best_offset_ms = offset as i64 * ms_per_frame_a;
}
}
if best_sim >= 0.5 {
Some((best_sim, best_offset_ms))
} else {
None
}
}
#[derive(Debug, Clone)]
pub struct DuplicateCluster {
pub representative: u64,
pub members: Vec<u64>,
}
impl DuplicateCluster {
#[must_use]
pub fn build_clusters(pairs: &[DuplicatePair]) -> Vec<Self> {
let mut ids: Vec<u64> = Vec::new();
for pair in pairs {
if !ids.contains(&pair.a_id) {
ids.push(pair.a_id);
}
if !ids.contains(&pair.b_id) {
ids.push(pair.b_id);
}
}
let mut parent: Vec<usize> = (0..ids.len()).collect();
let find = |parent: &mut Vec<usize>, mut x: usize| -> usize {
while parent[x] != x {
parent[x] = parent[parent[x]]; x = parent[x];
}
x
};
for pair in pairs {
if let (Some(ai), Some(bi)) = (
ids.iter().position(|&id| id == pair.a_id),
ids.iter().position(|&id| id == pair.b_id),
) {
let ra = find(&mut parent, ai);
let rb = find(&mut parent, bi);
if ra != rb {
parent[rb] = ra;
}
}
}
let roots: Vec<usize> = (0..ids.len()).map(|i| find(&mut parent, i)).collect();
let mut cluster_map: std::collections::HashMap<usize, Vec<u64>> =
std::collections::HashMap::new();
for (i, &root) in roots.iter().enumerate() {
cluster_map.entry(root).or_default().push(ids[i]);
}
cluster_map
.into_values()
.filter(|members| members.len() > 1)
.map(|mut members| {
members.sort_unstable();
let representative = members[0];
let rest = members[1..].to_vec();
DuplicateCluster {
representative,
members: rest,
}
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn make_fp(id: u64, hashes: Vec<u64>, duration_ms: u64) -> VideoFingerprint {
VideoFingerprint::new(id, hashes, duration_ms)
}
#[test]
fn test_temporal_hash_empty() {
let h = VideoFingerprint::compute_temporal_hash(&[]);
assert_eq!(h, 0);
}
#[test]
fn test_temporal_hash_deterministic() {
let hashes = vec![1u64, 2, 3, 4, 5];
let h1 = VideoFingerprint::compute_temporal_hash(&hashes);
let h2 = VideoFingerprint::compute_temporal_hash(&hashes);
assert_eq!(h1, h2);
}
#[test]
fn test_temporal_hash_order_sensitive() {
let h1 = VideoFingerprint::compute_temporal_hash(&[1, 2, 3]);
let h2 = VideoFingerprint::compute_temporal_hash(&[3, 2, 1]);
assert_ne!(h1, h2);
}
#[test]
fn test_keyframe_similarity_identical() {
let fp = make_fp(1, vec![1, 2, 3, 4], 4000);
assert_eq!(fp.keyframe_similarity(&fp), 1.0);
}
#[test]
fn test_keyframe_similarity_disjoint() {
let fp_a = make_fp(1, vec![1, 2, 3], 3000);
let fp_b = make_fp(2, vec![4, 5, 6], 3000);
assert_eq!(fp_a.keyframe_similarity(&fp_b), 0.0);
}
#[test]
fn test_keyframe_similarity_partial() {
let fp_a = make_fp(1, vec![1, 2, 3, 4], 4000);
let fp_b = make_fp(2, vec![3, 4, 5, 6], 4000);
let sim = fp_a.keyframe_similarity(&fp_b);
assert!((sim - 1.0 / 3.0).abs() < 0.01);
}
#[test]
fn test_keyframe_similarity_empty_both() {
let fp_a = make_fp(1, vec![], 0);
let fp_b = make_fp(2, vec![], 0);
assert_eq!(fp_a.keyframe_similarity(&fp_b), 1.0);
}
#[test]
fn test_detector_empty() {
let detector = VideoDuplicateDetector::new();
let dups = detector.find_duplicates(0.5);
assert!(dups.is_empty());
}
#[test]
fn test_detector_single() {
let mut detector = VideoDuplicateDetector::new();
detector.add(make_fp(1, vec![1, 2, 3], 3000));
let dups = detector.find_duplicates(0.5);
assert!(dups.is_empty());
}
#[test]
fn test_detector_identical_pair() {
let mut detector = VideoDuplicateDetector::new();
detector.add(make_fp(1, vec![1, 2, 3, 4, 5], 5000));
detector.add(make_fp(2, vec![1, 2, 3, 4, 5], 5000));
let dups = detector.find_duplicates(0.9);
assert_eq!(dups.len(), 1);
assert_eq!(dups[0].0, 1);
assert_eq!(dups[0].1, 2);
assert_eq!(dups[0].2, 1.0);
}
#[test]
fn test_detector_no_match_below_threshold() {
let mut detector = VideoDuplicateDetector::new();
detector.add(make_fp(1, vec![1, 2, 3], 3000));
detector.add(make_fp(2, vec![4, 5, 6], 3000));
let dups = detector.find_duplicates(0.5);
assert!(dups.is_empty());
}
#[test]
fn test_trimmed_identical() {
let hashes = vec![1u64, 2, 3, 4, 5, 6, 7, 8];
let fps = vec![
make_fp(1, hashes.clone(), 8000),
make_fp(2, hashes.clone(), 8000),
];
let pairs = TrimmedDuplicateDetector::find_trimmed_duplicates(&fps);
assert!(!pairs.is_empty());
assert_eq!(pairs[0].similarity, 1.0);
assert!(!pairs[0].is_trimmed);
}
#[test]
fn test_trimmed_offset() {
let hashes_a = vec![3u64, 4, 5, 6, 7];
let hashes_b = vec![1u64, 2, 3, 4, 5, 6, 7];
let fps = vec![make_fp(1, hashes_a, 5000), make_fp(2, hashes_b, 7000)];
let pairs = TrimmedDuplicateDetector::find_trimmed_duplicates(&fps);
assert!(!pairs.is_empty());
assert_eq!(pairs[0].similarity, 1.0);
assert!(pairs[0].is_trimmed);
}
#[test]
fn test_trimmed_no_match() {
let fps = vec![
make_fp(1, vec![1, 2, 3], 3000),
make_fp(2, vec![7, 8, 9], 3000),
];
let pairs = TrimmedDuplicateDetector::find_trimmed_duplicates(&fps);
assert!(pairs.is_empty());
}
#[test]
fn test_clusters_empty_pairs() {
let clusters = DuplicateCluster::build_clusters(&[]);
assert!(clusters.is_empty());
}
#[test]
fn test_clusters_single_pair() {
let pairs = vec![DuplicatePair {
a_id: 1,
b_id: 2,
similarity: 1.0,
offset_ms: 0,
is_trimmed: false,
}];
let clusters = DuplicateCluster::build_clusters(&pairs);
assert_eq!(clusters.len(), 1);
assert_eq!(clusters[0].representative, 1);
assert!(clusters[0].members.contains(&2));
}
#[test]
fn test_clusters_chain() {
let pairs = vec![
DuplicatePair {
a_id: 1,
b_id: 2,
similarity: 1.0,
offset_ms: 0,
is_trimmed: false,
},
DuplicatePair {
a_id: 2,
b_id: 3,
similarity: 1.0,
offset_ms: 0,
is_trimmed: false,
},
DuplicatePair {
a_id: 3,
b_id: 4,
similarity: 1.0,
offset_ms: 0,
is_trimmed: false,
},
];
let clusters = DuplicateCluster::build_clusters(&pairs);
assert_eq!(clusters.len(), 1);
let total: usize = 1 + clusters[0].members.len();
assert_eq!(total, 4);
}
#[test]
fn test_clusters_two_separate() {
let pairs = vec![
DuplicatePair {
a_id: 1,
b_id: 2,
similarity: 1.0,
offset_ms: 0,
is_trimmed: false,
},
DuplicatePair {
a_id: 3,
b_id: 4,
similarity: 1.0,
offset_ms: 0,
is_trimmed: false,
},
];
let clusters = DuplicateCluster::build_clusters(&pairs);
assert_eq!(clusters.len(), 2);
}
}