#![allow(dead_code)]
#![allow(clippy::cast_precision_loss)]
use std::collections::HashMap;
use crate::lsh_index::BitLshIndex;
use crate::visual::{self, Image, SsimParams};
#[derive(Debug, Clone)]
pub struct HierarchicalConfig {
pub enable_fast_pass: bool,
pub enable_medium_pass: bool,
pub enable_slow_pass: bool,
pub perceptual_max_distance: u32,
pub lsh_num_tables: usize,
pub lsh_bits_per_table: usize,
pub lsh_seed: u64,
pub ssim_threshold: f64,
pub ssim_thumbnail_size: usize,
}
impl Default for HierarchicalConfig {
fn default() -> Self {
Self {
enable_fast_pass: true,
enable_medium_pass: true,
enable_slow_pass: true,
perceptual_max_distance: 10,
lsh_num_tables: 8,
lsh_bits_per_table: 8,
lsh_seed: 42,
ssim_threshold: 0.85,
ssim_thumbnail_size: 16,
}
}
}
#[derive(Debug, Clone)]
pub struct DedupItem {
pub id: String,
pub content_hash: Vec<u8>,
pub perceptual_hash: u64,
pub thumbnail: Vec<u8>,
}
#[derive(Debug, Clone)]
pub struct DedupGroup {
pub item_ids: Vec<String>,
pub pass: DetectionPass,
pub similarity: f64,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionPass {
FastHash,
MediumPerceptual,
SlowSsim,
}
impl DetectionPass {
#[must_use]
pub fn label(self) -> &'static str {
match self {
Self::FastHash => "exact-hash",
Self::MediumPerceptual => "perceptual-lsh",
Self::SlowSsim => "ssim-verified",
}
}
}
#[derive(Debug, Clone)]
pub struct HierarchicalResult {
pub exact_groups: Vec<DedupGroup>,
pub perceptual_groups: Vec<DedupGroup>,
pub ssim_groups: Vec<DedupGroup>,
pub total_items: usize,
pub fast_pass_hits: usize,
pub medium_pass_candidates: usize,
pub slow_pass_verified: usize,
}
impl HierarchicalResult {
#[must_use]
pub fn total_groups(&self) -> usize {
self.exact_groups.len() + self.perceptual_groups.len() + self.ssim_groups.len()
}
#[must_use]
pub fn total_duplicates(&self) -> usize {
let count_dupes = |groups: &[DedupGroup]| -> usize {
groups
.iter()
.map(|g| g.item_ids.len().saturating_sub(1))
.sum()
};
count_dupes(&self.exact_groups)
+ count_dupes(&self.perceptual_groups)
+ count_dupes(&self.ssim_groups)
}
#[must_use]
pub fn all_groups(&self) -> Vec<&DedupGroup> {
let mut all = Vec::new();
for g in &self.exact_groups {
all.push(g);
}
for g in &self.perceptual_groups {
all.push(g);
}
for g in &self.ssim_groups {
all.push(g);
}
all
}
}
pub struct HierarchicalDedup {
config: HierarchicalConfig,
items: Vec<DedupItem>,
}
impl HierarchicalDedup {
#[must_use]
pub fn new(config: HierarchicalConfig) -> Self {
Self {
config,
items: Vec::new(),
}
}
pub fn add_item(
&mut self,
id: &str,
content_hash: &[u8],
perceptual_hash: u64,
thumbnail: &[u8],
) {
self.items.push(DedupItem {
id: id.to_string(),
content_hash: content_hash.to_vec(),
perceptual_hash,
thumbnail: thumbnail.to_vec(),
});
}
#[must_use]
pub fn item_count(&self) -> usize {
self.items.len()
}
#[must_use]
pub fn run(&self) -> HierarchicalResult {
let mut result = HierarchicalResult {
exact_groups: Vec::new(),
perceptual_groups: Vec::new(),
ssim_groups: Vec::new(),
total_items: self.items.len(),
fast_pass_hits: 0,
medium_pass_candidates: 0,
slow_pass_verified: 0,
};
if self.items.len() < 2 {
return result;
}
let mut assigned_exact = vec![false; self.items.len()];
if self.config.enable_fast_pass {
let groups = self.fast_pass();
for group in &groups {
for id in &group.item_ids {
if let Some(pos) = self.items.iter().position(|item| item.id == *id) {
assigned_exact[pos] = true;
}
}
}
result.fast_pass_hits = groups
.iter()
.map(|g| g.item_ids.len().saturating_sub(1))
.sum();
result.exact_groups = groups;
}
let remaining: Vec<usize> = (0..self.items.len())
.filter(|&i| !assigned_exact[i])
.collect();
if remaining.len() < 2 {
return result;
}
if self.config.enable_medium_pass {
let (perceptual_groups, candidate_count) = self.medium_pass(&remaining);
result.medium_pass_candidates = candidate_count;
if self.config.enable_slow_pass {
let (verified, verified_count) = self.slow_pass(&perceptual_groups);
result.slow_pass_verified = verified_count;
result.ssim_groups = verified;
} else {
result.perceptual_groups = perceptual_groups;
}
}
result
}
fn fast_pass(&self) -> Vec<DedupGroup> {
let mut hash_groups: HashMap<Vec<u8>, Vec<usize>> = HashMap::new();
for (i, item) in self.items.iter().enumerate() {
hash_groups
.entry(item.content_hash.clone())
.or_default()
.push(i);
}
hash_groups
.values()
.filter(|indices| indices.len() >= 2)
.map(|indices| DedupGroup {
item_ids: indices.iter().map(|&i| self.items[i].id.clone()).collect(),
pass: DetectionPass::FastHash,
similarity: 1.0,
})
.collect()
}
fn medium_pass(&self, remaining: &[usize]) -> (Vec<DedupGroup>, usize) {
let mut lsh = BitLshIndex::new(
self.config.lsh_num_tables,
self.config.lsh_bits_per_table,
self.config.lsh_seed,
);
for &idx in remaining {
lsh.insert(idx as u64, self.items[idx].perceptual_hash);
}
let mut seen_pairs = std::collections::HashSet::new();
let mut candidate_count = 0usize;
let mut edges: Vec<(usize, usize, f64)> = Vec::new();
for &idx in remaining {
let candidates = lsh.query_candidates(self.items[idx].perceptual_hash);
for (cid, chash) in candidates {
let cidx = cid as usize;
if cidx == idx {
continue;
}
let (lo, hi) = if idx < cidx { (idx, cidx) } else { (cidx, idx) };
if seen_pairs.insert((lo, hi)) {
candidate_count += 1;
let dist = (self.items[idx].perceptual_hash ^ chash).count_ones();
if dist <= self.config.perceptual_max_distance {
let similarity = 1.0 - f64::from(dist) / 64.0;
edges.push((lo, hi, similarity));
}
}
}
}
let groups = self.union_find_groups(&edges);
(groups, candidate_count)
}
fn slow_pass(&self, candidate_groups: &[DedupGroup]) -> (Vec<DedupGroup>, usize) {
let res = self.config.ssim_thumbnail_size.max(4);
let expected_pixels = res * res;
let ssim_params = SsimParams::default();
let mut verified_groups = Vec::new();
let mut verified_count = 0usize;
for group in candidate_groups {
let mut images: Vec<(String, Option<Image>)> = Vec::new();
for id in &group.item_ids {
if let Some(item) = self.items.iter().find(|it| it.id == *id) {
let img = if item.thumbnail.len() == expected_pixels {
Image::from_data(res, res, 1, item.thumbnail.clone()).ok()
} else {
None
};
images.push((id.clone(), img));
}
}
let mut verified_members: Vec<String> = Vec::new();
let mut best_ssim = 0.0f64;
if images.len() >= 2 {
if let Some(ref anchor_img) = images[0].1 {
verified_members.push(images[0].0.clone());
for (id, img_opt) in images.iter().skip(1) {
if let Some(ref img) = img_opt {
let ssim = visual::compute_ssim(anchor_img, img, &ssim_params);
verified_count += 1;
if ssim >= self.config.ssim_threshold {
verified_members.push(id.clone());
if ssim > best_ssim {
best_ssim = ssim;
}
}
}
}
}
}
if verified_members.len() >= 2 {
verified_groups.push(DedupGroup {
item_ids: verified_members,
pass: DetectionPass::SlowSsim,
similarity: best_ssim,
});
}
}
(verified_groups, verified_count)
}
fn union_find_groups(&self, edges: &[(usize, usize, f64)]) -> Vec<DedupGroup> {
if edges.is_empty() {
return Vec::new();
}
let mut all_indices = std::collections::HashSet::new();
for &(a, b, _) in edges {
all_indices.insert(a);
all_indices.insert(b);
}
let idx_list: Vec<usize> = all_indices.into_iter().collect();
let mut parent: HashMap<usize, usize> = idx_list.iter().map(|&i| (i, i)).collect();
fn find(parent: &mut HashMap<usize, usize>, x: usize) -> usize {
let p = parent.get(&x).copied().unwrap_or(x);
if p == x {
return x;
}
let root = find(parent, p);
parent.insert(x, root);
root
}
for &(a, b, _) in edges {
let ra = find(&mut parent, a);
let rb = find(&mut parent, b);
if ra != rb {
parent.insert(ra, rb);
}
}
let mut groups: HashMap<usize, Vec<usize>> = HashMap::new();
for &i in &idx_list {
let root = find(&mut parent, i);
groups.entry(root).or_default().push(i);
}
let mut group_best_sim: HashMap<usize, f64> = HashMap::new();
for &(a, _, sim) in edges {
let root = find(&mut parent, a);
let entry = group_best_sim.entry(root).or_insert(0.0);
if sim > *entry {
*entry = sim;
}
}
groups
.into_iter()
.filter(|(_, members)| members.len() >= 2)
.map(|(root, members)| DedupGroup {
item_ids: members.iter().map(|&i| self.items[i].id.clone()).collect(),
pass: DetectionPass::MediumPerceptual,
similarity: group_best_sim.get(&root).copied().unwrap_or(0.0),
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn make_hash(byte: u8) -> Vec<u8> {
vec![byte; 32]
}
fn make_thumbnail(val: u8, size: usize) -> Vec<u8> {
vec![val; size * size]
}
#[test]
fn test_hierarchical_config_default() {
let cfg = HierarchicalConfig::default();
assert!(cfg.enable_fast_pass);
assert!(cfg.enable_medium_pass);
assert!(cfg.enable_slow_pass);
assert_eq!(cfg.perceptual_max_distance, 10);
}
#[test]
fn test_detection_pass_label() {
assert_eq!(DetectionPass::FastHash.label(), "exact-hash");
assert_eq!(DetectionPass::MediumPerceptual.label(), "perceptual-lsh");
assert_eq!(DetectionPass::SlowSsim.label(), "ssim-verified");
}
#[test]
fn test_empty_pipeline() {
let dedup = HierarchicalDedup::new(HierarchicalConfig::default());
let result = dedup.run();
assert_eq!(result.total_items, 0);
assert_eq!(result.total_groups(), 0);
assert_eq!(result.total_duplicates(), 0);
}
#[test]
fn test_single_item() {
let mut dedup = HierarchicalDedup::new(HierarchicalConfig::default());
dedup.add_item(
"only.mp4",
&make_hash(0xAB),
0xDEAD,
&make_thumbnail(128, 16),
);
let result = dedup.run();
assert_eq!(result.total_items, 1);
assert_eq!(result.total_groups(), 0);
}
#[test]
fn test_exact_duplicates_fast_pass() {
let mut dedup = HierarchicalDedup::new(HierarchicalConfig::default());
let hash = make_hash(0xAB);
let thumb = make_thumbnail(128, 16);
dedup.add_item("a.mp4", &hash, 0xDEAD_BEEF, &thumb);
dedup.add_item("b.mp4", &hash, 0xDEAD_BEEF, &thumb);
dedup.add_item(
"c.mp4",
&make_hash(0xCD),
0x1234_5678,
&make_thumbnail(64, 16),
);
let result = dedup.run();
assert_eq!(result.exact_groups.len(), 1);
assert_eq!(result.exact_groups[0].item_ids.len(), 2);
assert_eq!(result.exact_groups[0].pass, DetectionPass::FastHash);
assert_eq!(result.exact_groups[0].similarity, 1.0);
assert_eq!(result.fast_pass_hits, 1);
}
#[test]
fn test_perceptual_near_duplicates() {
let mut cfg = HierarchicalConfig::default();
cfg.enable_slow_pass = false; cfg.perceptual_max_distance = 5;
let mut dedup = HierarchicalDedup::new(cfg);
let base_hash = 0xFFFF_FFFF_FFFF_FFFFu64;
let similar_hash = base_hash ^ 0b111;
dedup.add_item(
"a.mp4",
&make_hash(0xAA),
base_hash,
&make_thumbnail(128, 16),
);
dedup.add_item(
"b.mp4",
&make_hash(0xBB),
similar_hash,
&make_thumbnail(130, 16),
);
let result = dedup.run();
assert!(result.exact_groups.is_empty(), "Different hashes");
if !result.perceptual_groups.is_empty() {
assert_eq!(
result.perceptual_groups[0].pass,
DetectionPass::MediumPerceptual
);
assert!(result.perceptual_groups[0].similarity > 0.9);
}
}
#[test]
fn test_ssim_verification_pass() {
let cfg = HierarchicalConfig {
ssim_thumbnail_size: 8,
perceptual_max_distance: 10,
ssim_threshold: 0.5, ..HierarchicalConfig::default()
};
let mut dedup = HierarchicalDedup::new(cfg);
let thumb = make_thumbnail(128, 8);
let base_hash = 0xFFFF_FFFF_FFFF_FFFFu64;
let similar_hash = base_hash ^ 0b11;
dedup.add_item("x.mp4", &make_hash(0x11), base_hash, &thumb);
dedup.add_item("y.mp4", &make_hash(0x22), similar_hash, &thumb);
let result = dedup.run();
assert!(result.exact_groups.is_empty());
if !result.ssim_groups.is_empty() {
assert_eq!(result.ssim_groups[0].pass, DetectionPass::SlowSsim);
}
}
#[test]
fn test_three_exact_duplicates() {
let mut dedup = HierarchicalDedup::new(HierarchicalConfig::default());
let hash = make_hash(0xFF);
let thumb = make_thumbnail(200, 16);
dedup.add_item("a.mp4", &hash, 0x1111, &thumb);
dedup.add_item("b.mp4", &hash, 0x1111, &thumb);
dedup.add_item("c.mp4", &hash, 0x1111, &thumb);
let result = dedup.run();
assert_eq!(result.exact_groups.len(), 1);
assert_eq!(result.exact_groups[0].item_ids.len(), 3);
assert_eq!(result.total_duplicates(), 2);
}
#[test]
fn test_fast_pass_only() {
let cfg = HierarchicalConfig {
enable_fast_pass: true,
enable_medium_pass: false,
enable_slow_pass: false,
..HierarchicalConfig::default()
};
let mut dedup = HierarchicalDedup::new(cfg);
let hash = make_hash(0xAA);
dedup.add_item("a.mp4", &hash, 0xDEAD, &make_thumbnail(128, 16));
dedup.add_item("b.mp4", &hash, 0xDEAD, &make_thumbnail(128, 16));
let result = dedup.run();
assert_eq!(result.exact_groups.len(), 1);
assert!(result.perceptual_groups.is_empty());
assert!(result.ssim_groups.is_empty());
}
#[test]
fn test_all_groups_aggregation() {
let mut dedup = HierarchicalDedup::new(HierarchicalConfig::default());
let hash = make_hash(0x01);
let thumb = make_thumbnail(128, 16);
dedup.add_item("a.mp4", &hash, 0x1111, &thumb);
dedup.add_item("b.mp4", &hash, 0x1111, &thumb);
let result = dedup.run();
let all = result.all_groups();
assert!(!all.is_empty());
}
#[test]
fn test_item_count() {
let mut dedup = HierarchicalDedup::new(HierarchicalConfig::default());
assert_eq!(dedup.item_count(), 0);
dedup.add_item("a.mp4", &[0; 32], 0, &[128; 64]);
assert_eq!(dedup.item_count(), 1);
}
#[test]
fn test_distant_perceptual_hashes_not_grouped() {
let mut cfg = HierarchicalConfig::default();
cfg.enable_slow_pass = false;
cfg.perceptual_max_distance = 3;
let mut dedup = HierarchicalDedup::new(cfg);
dedup.add_item(
"a.mp4",
&make_hash(0xAA),
0x0000_0000_0000_0000,
&make_thumbnail(128, 16),
);
dedup.add_item(
"b.mp4",
&make_hash(0xBB),
0xFFFF_FFFF_FFFF_FFFF,
&make_thumbnail(200, 16),
);
let result = dedup.run();
assert!(result.exact_groups.is_empty());
assert!(result.perceptual_groups.is_empty());
}
#[test]
fn test_mixed_exact_and_near_duplicates() {
let mut cfg = HierarchicalConfig::default();
cfg.enable_slow_pass = false;
cfg.perceptual_max_distance = 5;
let mut dedup = HierarchicalDedup::new(cfg);
let hash_a = make_hash(0xAA);
dedup.add_item("exact1.mp4", &hash_a, 0x1111, &make_thumbnail(128, 16));
dedup.add_item("exact2.mp4", &hash_a, 0x1111, &make_thumbnail(128, 16));
let base = 0xFFFF_FFFF_FFFF_FFFFu64;
let near = base ^ 0b11;
dedup.add_item(
"near1.mp4",
&make_hash(0xBB),
base,
&make_thumbnail(200, 16),
);
dedup.add_item(
"near2.mp4",
&make_hash(0xCC),
near,
&make_thumbnail(202, 16),
);
let result = dedup.run();
assert_eq!(result.exact_groups.len(), 1);
assert_eq!(result.total_items, 4);
}
#[test]
fn test_hierarchical_result_total_groups() {
let result = HierarchicalResult {
exact_groups: vec![DedupGroup {
item_ids: vec!["a".into(), "b".into()],
pass: DetectionPass::FastHash,
similarity: 1.0,
}],
perceptual_groups: vec![DedupGroup {
item_ids: vec!["c".into(), "d".into()],
pass: DetectionPass::MediumPerceptual,
similarity: 0.95,
}],
ssim_groups: Vec::new(),
total_items: 4,
fast_pass_hits: 1,
medium_pass_candidates: 5,
slow_pass_verified: 0,
};
assert_eq!(result.total_groups(), 2);
assert_eq!(result.total_duplicates(), 2);
}
}