use std::collections::HashMap;
/// Identity key used to decide whether two images are the same image.
///
/// Two fingerprints compare equal when the content hash and both dimensions
/// match. The derived `Eq` and `Hash` impls allow this type to be used as a
/// `HashMap` key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ImageFingerprint {
/// 64-bit hash of the image bytes; `fingerprint` fills it with FNV-1a of `ImageRef::data`.
pub content_hash: u64,
/// Width copied from the source `ImageRef` (units are not shown in this file).
pub width: u32,
/// Height copied from the source `ImageRef` (units are not shown in this file).
pub height: u32,
}
/// One distinct image together with every place it was seen.
///
/// Produced by `deduplicate`, which emits one value per distinct fingerprint.
#[derive(Debug, Clone)]
pub struct DeduplicatedImage {
/// Fingerprint shared by every occurrence in this group.
pub fingerprint: ImageFingerprint,
/// Page number of each occurrence, one entry per occurrence in input order
/// (the same page appears more than once if the image repeats on it).
pub page_numbers: Vec<u32>,
/// Number of occurrences; `deduplicate` sets it to `page_numbers.len()`.
pub occurrence_count: usize,
}
/// A single image occurrence handed to the deduplication functions.
#[derive(Debug, Clone)]
pub struct ImageRef {
/// Image bytes that `fingerprint` hashes.
/// NOTE(review): whether these are encoded or decoded bytes is not shown here — confirm with the producer.
pub data: Vec<u8>,
/// Image width; copied verbatim into the fingerprint.
pub width: u32,
/// Image height; copied verbatim into the fingerprint.
pub height: u32,
/// Page this occurrence was found on; collected into `DeduplicatedImage::page_numbers`.
/// NOTE(review): 0- vs 1-based numbering is not established in this file.
pub page_number: u32,
/// Optional identifier of the underlying object; not read by `fingerprint`.
/// NOTE(review): looks like an (object number, generation) pair — confirm against the caller.
pub object_id: Option<(u32, u16)>,
}
pub fn fingerprint(image: &ImageRef) -> ImageFingerprint {
ImageFingerprint {
content_hash: fnv1a_hash(&image.data),
width: image.width,
height: image.height,
}
}
/// Groups `images` by [`fingerprint`] and reports each distinct image once.
///
/// Every returned entry carries the page number of each occurrence (one entry
/// per occurrence, in input order) and the number of occurrences.
///
/// The result is ordered by `occurrence_count`, highest first. Entries with
/// equal counts keep the order in which their first occurrence appears in
/// `images`, so the output is fully deterministic for a given input. An empty
/// slice yields an empty vector.
pub fn deduplicate(images: &[ImageRef]) -> Vec<DeduplicatedImage> {
    // `HashMap` iteration order is unspecified, so the groups themselves live
    // in a `Vec` (first-seen order) and the map only remembers each group's slot.
    let mut slot_by_fingerprint: HashMap<ImageFingerprint, usize> = HashMap::new();
    let mut result: Vec<DeduplicatedImage> = Vec::new();

    for img in images {
        let fp = fingerprint(img);
        if let Some(&slot) = slot_by_fingerprint.get(&fp) {
            let group = &mut result[slot];
            group.page_numbers.push(img.page_number);
            group.occurrence_count += 1;
        } else {
            slot_by_fingerprint.insert(fp.clone(), result.len());
            result.push(DeduplicatedImage {
                fingerprint: fp,
                page_numbers: vec![img.page_number],
                occurrence_count: 1,
            });
        }
    }

    // `sort_by` is stable, so groups with equal counts stay in first-seen order.
    result.sort_by(|a, b| b.occurrence_count.cmp(&a.occurrence_count));
    result
}
/// Returns how many entries of `images` are repeats of another image.
///
/// This is the total number of images minus the number of distinct
/// fingerprints reported by [`deduplicate`]. The subtraction saturates at
/// zero, although the distinct count can never exceed the total.
pub fn duplicate_count(images: &[ImageRef]) -> usize {
    images.len().saturating_sub(deduplicate(images).len())
}
/// Returns the fraction of `images` that are redundant copies.
///
/// The value is [`duplicate_count`] divided by the number of images. An empty
/// slice yields `0.0` rather than dividing zero by zero.
pub fn savings_ratio(images: &[ImageRef]) -> f64 {
    match images.len() {
        0 => 0.0,
        total => duplicate_count(images) as f64 / total as f64,
    }
}
/// Hashes `data` with the 64-bit FNV-1a algorithm.
///
/// Starting from the offset basis, each byte is XORed into the running hash,
/// which is then multiplied by the FNV prime with wrapping arithmetic. Empty
/// input returns the offset basis unchanged.
fn fnv1a_hash(data: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01b3;

    data.iter().fold(OFFSET_BASIS, |acc, &byte| {
        // XOR first, multiply second: this order is what makes it FNV-1a (not FNV-1).
        (acc ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an `ImageRef` with the given bytes, page and dimensions and no object id.
    fn make_image(data: &[u8], page: u32, w: u32, h: u32) -> ImageRef {
        ImageRef {
            page_number: page,
            width: w,
            height: h,
            object_id: None,
            data: data.to_vec(),
        }
    }

    /// Fingerprinting the same image twice gives equal results.
    #[test]
    fn test_fingerprint_deterministic() {
        let image = make_image(b"hello image data", 1, 100, 50);
        let first = fingerprint(&image);
        let second = fingerprint(&image);
        assert_eq!(first, second);
    }

    /// Images with identical dimensions but different bytes get different fingerprints.
    #[test]
    fn test_fingerprint_different_data() {
        let image_a = make_image(b"image A", 1, 100, 50);
        let image_b = make_image(b"image B", 1, 100, 50);
        assert_ne!(fingerprint(&image_a), fingerprint(&image_b));
    }

    /// Three copies of a logo plus one photo collapse into two groups, most frequent first.
    #[test]
    fn test_deduplicate() {
        let logo_pages: [u32; 3] = [1, 2, 3];
        let mut images: Vec<ImageRef> = logo_pages
            .iter()
            .map(|&page| make_image(b"logo", page, 50, 50))
            .collect();
        images.push(make_image(b"photo", 1, 200, 150));

        let groups = deduplicate(&images);
        assert_eq!(groups.len(), 2);

        let (most_common, least_common) = (&groups[0], &groups[1]);
        assert_eq!(most_common.occurrence_count, 3);
        assert_eq!(most_common.page_numbers.len(), 3);
        assert_eq!(least_common.occurrence_count, 1);
    }

    /// Two identical images and one different image give exactly one duplicate.
    #[test]
    fn test_duplicate_count() {
        let images = [
            make_image(b"same", 1, 10, 10),
            make_image(b"same", 2, 10, 10),
            make_image(b"diff", 3, 10, 10),
        ];
        assert_eq!(duplicate_count(&images), 1);
    }

    /// Four copies of one image mean three of the four are redundant.
    #[test]
    fn test_savings_ratio() {
        let images: Vec<ImageRef> = (1u32..=4)
            .map(|page| make_image(b"x", page, 10, 10))
            .collect();
        let ratio = savings_ratio(&images);
        assert!((ratio - 0.75).abs() < 0.001);
    }

    /// Empty input produces zero duplicates, a zero ratio and no groups.
    #[test]
    fn test_empty_images() {
        let images: Vec<ImageRef> = Vec::new();
        assert_eq!(duplicate_count(&images), 0);
        assert_eq!(savings_ratio(&images), 0.0);
        assert!(deduplicate(&images).is_empty());
    }
}