#![allow(dead_code)]
#![allow(clippy::cast_precision_loss)]
use std::collections::HashMap;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct DHash(pub u64);
impl DHash {
#[must_use]
pub fn from_pixels(pixels: &[u8]) -> Option<Self> {
if pixels.len() < 72 {
return None;
}
let mut hash = 0u64;
for row in 0..8usize {
for col in 0..8usize {
let left = pixels[row * 9 + col];
let right = pixels[row * 9 + col + 1];
if left > right {
hash |= 1u64 << (row * 8 + col);
}
}
}
Some(Self(hash))
}
#[must_use]
pub fn hamming(self, other: Self) -> u32 {
(self.0 ^ other.0).count_ones()
}
#[must_use]
pub fn similarity(self, other: Self) -> f64 {
1.0 - f64::from(self.hamming(other)) / 64.0
}
}
#[derive(Debug, Clone, Copy)]
pub struct KeyframeHash {
pub frame_index: usize,
pub hash: DHash,
}
impl KeyframeHash {
#[must_use]
pub fn new(frame_index: usize, hash: DHash) -> Self {
Self { frame_index, hash }
}
}
#[derive(Debug, Clone)]
pub struct VideoDescriptor {
pub id: u64,
pub label: String,
pub keyframe_pixels: Vec<Vec<u8>>,
pub duration_ms: u64,
pub size_bytes: u64,
}
impl VideoDescriptor {
#[must_use]
pub fn new(
id: u64,
label: impl Into<String>,
keyframe_pixels: Vec<Vec<u8>>,
duration_ms: u64,
size_bytes: u64,
) -> Self {
Self {
id,
label: label.into(),
keyframe_pixels,
duration_ms,
size_bytes,
}
}
}
#[derive(Debug, Clone)]
struct VideoHashEntry {
id: u64,
label: String,
hashes: Vec<KeyframeHash>,
duration_ms: u64,
size_bytes: u64,
}
impl VideoHashEntry {
fn from_descriptor(desc: &VideoDescriptor) -> Self {
let hashes: Vec<KeyframeHash> = desc
.keyframe_pixels
.iter()
.enumerate()
.filter_map(|(i, pixels)| DHash::from_pixels(pixels).map(|h| KeyframeHash::new(i, h)))
.collect();
Self {
id: desc.id,
label: desc.label.clone(),
hashes,
duration_ms: desc.duration_ms,
size_bytes: desc.size_bytes,
}
}
fn similarity_to(&self, other: &Self, hamming_threshold: u32) -> f64 {
if self.hashes.is_empty() && other.hashes.is_empty() {
return 1.0;
}
if self.hashes.is_empty() || other.hashes.is_empty() {
return 0.0;
}
let matches = self
.hashes
.iter()
.filter(|kfa| {
other
.hashes
.iter()
.any(|kfb| kfa.hash.hamming(kfb.hash) <= hamming_threshold)
})
.count();
let union = self.hashes.len() + other.hashes.len() - matches;
if union == 0 {
return 0.0;
}
matches as f64 / union as f64
}
}
#[derive(Debug, Clone)]
pub struct SimilarityMatrix {
pub n: usize,
scores: Vec<f64>,
pub ids: Vec<u64>,
}
impl SimilarityMatrix {
fn build(entries: &[VideoHashEntry], hamming_threshold: u32) -> Self {
let n = entries.len();
let mut scores = vec![0.0f64; n * n];
let ids: Vec<u64> = entries.iter().map(|e| e.id).collect();
for i in 0..n {
scores[i * n + i] = 1.0;
for j in (i + 1)..n {
let sim = entries[i].similarity_to(&entries[j], hamming_threshold);
scores[i * n + j] = sim;
scores[j * n + i] = sim;
}
}
Self { n, scores, ids }
}
#[must_use]
pub fn get(&self, i: usize, j: usize) -> f64 {
if i >= self.n || j >= self.n {
return 0.0;
}
self.scores[i * self.n + j]
}
#[must_use]
pub fn pairs_above(&self, threshold: f64) -> Vec<(usize, usize, f64)> {
let mut result = Vec::new();
for i in 0..self.n {
for j in (i + 1)..self.n {
let s = self.scores[i * self.n + j];
if s >= threshold {
result.push((i, j, s));
}
}
}
result
}
}
#[derive(Debug, Clone)]
pub struct DuplicateVideoGroup {
pub id: usize,
pub video_ids: Vec<u64>,
pub labels: Vec<String>,
pub mean_similarity: f64,
pub representative_idx: usize,
pub reclaimable_bytes: u64,
}
impl DuplicateVideoGroup {
#[must_use]
pub fn representative_label(&self) -> &str {
self.labels
.get(self.representative_idx)
.map(String::as_str)
.unwrap_or("")
}
}
#[derive(Debug, Clone)]
pub struct PipelineStats {
pub total_videos: usize,
pub duplicate_groups: usize,
pub duplicate_videos: usize,
pub reclaimable_bytes: u64,
pub mean_group_similarity: f64,
}
impl PipelineStats {
#[must_use]
pub fn summary(&self) -> String {
format!(
"{} videos processed: {} groups, {} duplicates, {} bytes reclaimable, mean_sim={:.3}",
self.total_videos,
self.duplicate_groups,
self.duplicate_videos,
self.reclaimable_bytes,
self.mean_group_similarity,
)
}
}
#[derive(Debug, Clone)]
pub struct PipelineResult {
pub groups: Vec<DuplicateVideoGroup>,
pub stats: PipelineStats,
pub similarity_matrix: SimilarityMatrix,
}
#[derive(Debug, Clone)]
pub struct VideoDedupPipeline {
pub similarity_threshold: f64,
pub hamming_threshold: u32,
}
impl Default for VideoDedupPipeline {
fn default() -> Self {
Self::new()
}
}
impl VideoDedupPipeline {
#[must_use]
pub fn new() -> Self {
Self {
similarity_threshold: 0.80,
hamming_threshold: 8,
}
}
#[must_use]
pub fn with_similarity_threshold(mut self, t: f64) -> Self {
self.similarity_threshold = t;
self
}
#[must_use]
pub fn with_hamming_threshold(mut self, t: u32) -> Self {
self.hamming_threshold = t;
self
}
#[must_use]
pub fn run(&self, descriptors: &[VideoDescriptor]) -> PipelineResult {
if descriptors.is_empty() {
return PipelineResult {
groups: Vec::new(),
stats: PipelineStats {
total_videos: 0,
duplicate_groups: 0,
duplicate_videos: 0,
reclaimable_bytes: 0,
mean_group_similarity: 0.0,
},
similarity_matrix: SimilarityMatrix {
n: 0,
scores: Vec::new(),
ids: Vec::new(),
},
};
}
let entries: Vec<VideoHashEntry> = descriptors
.iter()
.map(VideoHashEntry::from_descriptor)
.collect();
let matrix = SimilarityMatrix::build(&entries, self.hamming_threshold);
let n = entries.len();
let mut parent: Vec<usize> = (0..n).collect();
let mut rank: Vec<usize> = vec![0; n];
let valid_pairs = matrix.pairs_above(self.similarity_threshold);
for &(i, j, _) in &valid_pairs {
Self::union_by_rank(&mut parent, &mut rank, i, j);
}
let mut group_map: HashMap<usize, Vec<usize>> = HashMap::new();
for i in 0..n {
let root = Self::find_root(&mut parent, i);
group_map.entry(root).or_default().push(i);
}
let id_to_entry: HashMap<u64, &VideoHashEntry> =
entries.iter().map(|e| (e.id, e)).collect();
let mut groups: Vec<DuplicateVideoGroup> = group_map
.into_values()
.filter(|members| members.len() >= 2)
.enumerate()
.map(|(gid, members)| {
let mut sim_sum = 0.0f64;
let mut sim_count = 0usize;
for ii in 0..members.len() {
for jj in (ii + 1)..members.len() {
sim_sum += matrix.get(members[ii], members[jj]);
sim_count += 1;
}
}
let mean_similarity = if sim_count > 0 {
sim_sum / sim_count as f64
} else {
0.0
};
let mut best_idx = 0usize;
let mut best_sim = f64::NEG_INFINITY;
for (li, &gi) in members.iter().enumerate() {
let total: f64 = members
.iter()
.enumerate()
.filter(|&(lj, _)| lj != li)
.map(|(_, &gj)| matrix.get(gi, gj))
.sum();
if total > best_sim {
best_sim = total;
best_idx = li;
}
}
let reclaimable_bytes: u64 = members
.iter()
.enumerate()
.filter(|&(li, _)| li != best_idx)
.filter_map(|(_, &gi)| entries.get(gi).map(|e| e.size_bytes))
.sum();
let video_ids: Vec<u64> = members.iter().map(|&gi| entries[gi].id).collect();
let labels: Vec<String> = members
.iter()
.map(|&gi| entries[gi].label.clone())
.collect();
for &vid in &video_ids {
let _ = id_to_entry.get(&vid);
}
DuplicateVideoGroup {
id: gid,
video_ids,
labels,
mean_similarity,
representative_idx: best_idx,
reclaimable_bytes,
}
})
.collect();
groups.sort_by_key(|g| g.id);
let duplicate_videos: usize = groups.iter().map(|g| g.video_ids.len()).sum();
let reclaimable_bytes: u64 = groups.iter().map(|g| g.reclaimable_bytes).sum();
let mean_group_similarity = if groups.is_empty() {
0.0
} else {
groups.iter().map(|g| g.mean_similarity).sum::<f64>() / groups.len() as f64
};
PipelineResult {
stats: PipelineStats {
total_videos: n,
duplicate_groups: groups.len(),
duplicate_videos,
reclaimable_bytes,
mean_group_similarity,
},
groups,
similarity_matrix: matrix,
}
}
fn find_root(parent: &mut Vec<usize>, x: usize) -> usize {
let mut root = x;
while parent[root] != root {
root = parent[root];
}
let mut cur = x;
while cur != root {
let next = parent[cur];
parent[cur] = root;
cur = next;
}
root
}
fn union_by_rank(parent: &mut Vec<usize>, rank: &mut Vec<usize>, a: usize, b: usize) {
let ra = Self::find_root(parent, a);
let rb = Self::find_root(parent, b);
if ra == rb {
return;
}
match rank[ra].cmp(&rank[rb]) {
std::cmp::Ordering::Less => parent[ra] = rb,
std::cmp::Ordering::Greater => parent[rb] = ra,
std::cmp::Ordering::Equal => {
parent[rb] = ra;
rank[ra] += 1;
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn make_pixels(seed: u8) -> Vec<u8> {
(0..72u8)
.map(|i| {
let v = (i as u16)
.wrapping_mul(37)
.wrapping_add(seed as u16)
.wrapping_mul(19);
(v & 0xFF) as u8
})
.collect()
}
#[test]
fn test_dhash_from_pixels_too_small() {
let pixels = vec![0u8; 10];
assert!(DHash::from_pixels(&pixels).is_none());
}
#[test]
fn test_dhash_from_pixels_identical() {
let pixels = make_pixels(0);
let h1 = DHash::from_pixels(&pixels).expect("hash should succeed");
let h2 = DHash::from_pixels(&pixels).expect("hash should succeed");
assert_eq!(h1, h2);
assert_eq!(h1.hamming(h2), 0);
assert_eq!(h1.similarity(h2), 1.0);
}
#[test]
fn test_dhash_different_seeds_differ() {
let h1 = DHash::from_pixels(&make_pixels(0)).expect("hash");
let h2 = DHash::from_pixels(&make_pixels(100)).expect("hash");
assert_ne!(h1, h2);
assert!(h1.hamming(h2) > 0);
}
#[test]
fn test_dhash_similarity_range() {
let h1 = DHash::from_pixels(&make_pixels(0)).expect("hash");
let h2 = DHash::from_pixels(&make_pixels(50)).expect("hash");
let sim = h1.similarity(h2);
assert!((0.0..=1.0).contains(&sim));
}
#[test]
fn test_similarity_matrix_self_similarity() {
let desc = VideoDescriptor::new(
1,
"a.mp4",
vec![make_pixels(0), make_pixels(10)],
5000,
1024,
);
let entry = VideoHashEntry::from_descriptor(&desc);
let entries = vec![entry];
let matrix = SimilarityMatrix::build(&entries, 8);
assert_eq!(matrix.get(0, 0), 1.0);
}
#[test]
fn test_similarity_matrix_out_of_bounds() {
let desc = VideoDescriptor::new(1, "a.mp4", vec![make_pixels(0)], 1000, 512);
let entry = VideoHashEntry::from_descriptor(&desc);
let matrix = SimilarityMatrix::build(&[entry], 8);
assert_eq!(matrix.get(99, 99), 0.0);
}
#[test]
fn test_pipeline_empty() {
let result = VideoDedupPipeline::new().run(&[]);
assert!(result.groups.is_empty());
assert_eq!(result.stats.total_videos, 0);
}
#[test]
fn test_pipeline_single_video_no_duplicates() {
let desc = VideoDescriptor::new(1, "solo.mp4", vec![make_pixels(0)], 3000, 512);
let result = VideoDedupPipeline::new().run(&[desc]);
assert!(result.groups.is_empty());
assert_eq!(result.stats.duplicate_videos, 0);
}
#[test]
fn test_pipeline_identical_videos_grouped() {
let pixels = vec![make_pixels(0), make_pixels(10), make_pixels(20)];
let a = VideoDescriptor::new(1, "a.mp4", pixels.clone(), 5000, 1024);
let b = VideoDescriptor::new(2, "b.mp4", pixels.clone(), 5000, 1024);
let result = VideoDedupPipeline::new()
.with_similarity_threshold(0.5)
.run(&[a, b]);
assert_eq!(result.groups.len(), 1);
assert_eq!(result.groups[0].video_ids.len(), 2);
assert!(result.stats.reclaimable_bytes > 0);
}
#[test]
fn test_pipeline_distinct_videos_not_grouped() {
let a = VideoDescriptor::new(1, "a.mp4", vec![make_pixels(0)], 1000, 512);
let b = VideoDescriptor::new(2, "b.mp4", vec![make_pixels(200)], 1000, 512);
let result = VideoDedupPipeline::new()
.with_similarity_threshold(0.99)
.with_hamming_threshold(0)
.run(&[a, b]);
assert!(result.groups.is_empty());
}
#[test]
fn test_pipeline_representative_selected() {
let pixels = vec![make_pixels(0), make_pixels(5)];
let a = VideoDescriptor::new(1, "a.mp4", pixels.clone(), 5000, 2048);
let b = VideoDescriptor::new(2, "b.mp4", pixels.clone(), 5000, 1024);
let result = VideoDedupPipeline::new()
.with_similarity_threshold(0.5)
.run(&[a, b]);
assert_eq!(result.groups.len(), 1);
let group = &result.groups[0];
assert!(group.representative_idx < group.video_ids.len());
assert!(!group.representative_label().is_empty());
}
#[test]
fn test_pipeline_stats_summary() {
let result = VideoDedupPipeline::new().run(&[]);
assert!(!result.stats.summary().is_empty());
}
#[test]
fn test_pipeline_reclaimable_bytes() {
let pixels = vec![make_pixels(0)];
let a = VideoDescriptor::new(1, "a.mp4", pixels.clone(), 1000, 500);
let b = VideoDescriptor::new(2, "b.mp4", pixels.clone(), 1000, 800);
let c = VideoDescriptor::new(3, "c.mp4", pixels.clone(), 1000, 300);
let result = VideoDedupPipeline::new()
.with_similarity_threshold(0.5)
.run(&[a, b, c]);
if !result.groups.is_empty() {
assert!(result.stats.reclaimable_bytes > 0);
}
}
}