1use std::collections::HashMap;
5
6#[derive(Debug, Clone, PartialEq, Eq, Hash)]
8pub struct ImageFingerprint {
9 pub content_hash: u64,
11 pub width: u32,
13 pub height: u32,
15}
16
17#[derive(Debug, Clone)]
19pub struct DeduplicatedImage {
20 pub fingerprint: ImageFingerprint,
22 pub page_numbers: Vec<u32>,
24 pub occurrence_count: usize,
26}
27
28#[derive(Debug, Clone)]
30pub struct ImageRef {
31 pub data: Vec<u8>,
33 pub width: u32,
35 pub height: u32,
37 pub page_number: u32,
39 pub object_id: Option<(u32, u16)>,
41}
42
43pub fn fingerprint(image: &ImageRef) -> ImageFingerprint {
45 ImageFingerprint {
46 content_hash: fnv1a_hash(&image.data),
47 width: image.width,
48 height: image.height,
49 }
50}
51
52pub fn deduplicate(images: &[ImageRef]) -> Vec<DeduplicatedImage> {
54 let mut groups: HashMap<ImageFingerprint, Vec<u32>> = HashMap::new();
55
56 for img in images {
57 let fp = fingerprint(img);
58 groups.entry(fp).or_default().push(img.page_number);
59 }
60
61 let mut result: Vec<DeduplicatedImage> = groups
62 .into_iter()
63 .map(|(fp, pages)| DeduplicatedImage {
64 fingerprint: fp,
65 occurrence_count: pages.len(),
66 page_numbers: pages,
67 })
68 .collect();
69
70 result.sort_by(|a, b| b.occurrence_count.cmp(&a.occurrence_count));
71 result
72}
73
74pub fn duplicate_count(images: &[ImageRef]) -> usize {
76 let deduped = deduplicate(images);
77 let unique = deduped.len();
78 images.len().saturating_sub(unique)
79}
80
81pub fn savings_ratio(images: &[ImageRef]) -> f64 {
83 if images.is_empty() {
84 return 0.0;
85 }
86 duplicate_count(images) as f64 / images.len() as f64
87}
88
89fn fnv1a_hash(data: &[u8]) -> u64 {
91 let mut hash: u64 = 0xcbf29ce484222325;
92 for &byte in data {
93 hash ^= byte as u64;
94 hash = hash.wrapping_mul(0x100000001b3);
95 }
96 hash
97}
98
99#[cfg(test)]
100mod tests {
101 use super::*;
102
103 fn make_image(data: &[u8], page: u32, w: u32, h: u32) -> ImageRef {
104 ImageRef {
105 data: data.to_vec(),
106 width: w,
107 height: h,
108 page_number: page,
109 object_id: None,
110 }
111 }
112
113 #[test]
114 fn test_fingerprint_deterministic() {
115 let img = make_image(b"hello image data", 1, 100, 50);
116 let fp1 = fingerprint(&img);
117 let fp2 = fingerprint(&img);
118 assert_eq!(fp1, fp2);
119 }
120
121 #[test]
122 fn test_fingerprint_different_data() {
123 let img1 = make_image(b"image A", 1, 100, 50);
124 let img2 = make_image(b"image B", 1, 100, 50);
125 assert_ne!(fingerprint(&img1), fingerprint(&img2));
126 }
127
128 #[test]
129 fn test_deduplicate() {
130 let images = vec![
131 make_image(b"logo", 1, 50, 50),
132 make_image(b"logo", 2, 50, 50), make_image(b"logo", 3, 50, 50), make_image(b"photo", 1, 200, 150), ];
136 let deduped = deduplicate(&images);
137 assert_eq!(deduped.len(), 2); assert_eq!(deduped[0].occurrence_count, 3);
140 assert_eq!(deduped[0].page_numbers.len(), 3);
141 assert_eq!(deduped[1].occurrence_count, 1);
142 }
143
144 #[test]
145 fn test_duplicate_count() {
146 let images = vec![
147 make_image(b"same", 1, 10, 10),
148 make_image(b"same", 2, 10, 10),
149 make_image(b"diff", 3, 10, 10),
150 ];
151 assert_eq!(duplicate_count(&images), 1); }
153
154 #[test]
155 fn test_savings_ratio() {
156 let images = vec![
157 make_image(b"x", 1, 10, 10),
158 make_image(b"x", 2, 10, 10),
159 make_image(b"x", 3, 10, 10),
160 make_image(b"x", 4, 10, 10),
161 ];
162 let ratio = savings_ratio(&images);
163 assert!((ratio - 0.75).abs() < 0.001);
165 }
166
167 #[test]
168 fn test_empty_images() {
169 let images: Vec<ImageRef> = vec![];
170 assert_eq!(duplicate_count(&images), 0);
171 assert_eq!(savings_ratio(&images), 0.0);
172 assert!(deduplicate(&images).is_empty());
173 }
174}