Skip to main content

edgeparse_core/utils/
image_dedup.rs

1//! Image deduplication — identifies duplicate images across pages
2//! using content hashing to reduce output size and improve processing.
3
4use std::collections::HashMap;
5
6/// An image fingerprint for deduplication.
7#[derive(Debug, Clone, PartialEq, Eq, Hash)]
8pub struct ImageFingerprint {
9    /// Hash of the raw image data.
10    pub content_hash: u64,
11    /// Width in pixels.
12    pub width: u32,
13    /// Height in pixels.
14    pub height: u32,
15}
16
17/// Information about a deduplicated image.
18#[derive(Debug, Clone)]
19pub struct DeduplicatedImage {
20    /// Unique fingerprint for this image.
21    pub fingerprint: ImageFingerprint,
22    /// Page numbers where this image appears (1-based).
23    pub page_numbers: Vec<u32>,
24    /// Number of occurrences.
25    pub occurrence_count: usize,
26}
27
28/// An image reference before deduplication.
29#[derive(Debug, Clone)]
30pub struct ImageRef {
31    /// Raw image data (or a portion for fingerprinting).
32    pub data: Vec<u8>,
33    /// Width in pixels.
34    pub width: u32,
35    /// Height in pixels.
36    pub height: u32,
37    /// Page number (1-based).
38    pub page_number: u32,
39    /// Object ID in the PDF.
40    pub object_id: Option<(u32, u16)>,
41}
42
43/// Compute a fingerprint for image data using FNV-1a hash.
44pub fn fingerprint(image: &ImageRef) -> ImageFingerprint {
45    ImageFingerprint {
46        content_hash: fnv1a_hash(&image.data),
47        width: image.width,
48        height: image.height,
49    }
50}
51
52/// Deduplicate a list of image references, grouping identical images.
53pub fn deduplicate(images: &[ImageRef]) -> Vec<DeduplicatedImage> {
54    let mut groups: HashMap<ImageFingerprint, Vec<u32>> = HashMap::new();
55
56    for img in images {
57        let fp = fingerprint(img);
58        groups.entry(fp).or_default().push(img.page_number);
59    }
60
61    let mut result: Vec<DeduplicatedImage> = groups
62        .into_iter()
63        .map(|(fp, pages)| DeduplicatedImage {
64            fingerprint: fp,
65            occurrence_count: pages.len(),
66            page_numbers: pages,
67        })
68        .collect();
69
70    result.sort_by(|a, b| b.occurrence_count.cmp(&a.occurrence_count));
71    result
72}
73
74/// Count how many images are duplicates (total - unique).
75pub fn duplicate_count(images: &[ImageRef]) -> usize {
76    let deduped = deduplicate(images);
77    let unique = deduped.len();
78    images.len().saturating_sub(unique)
79}
80
81/// Compute deduplication savings ratio (0.0 = no savings, 1.0 = all duplicates).
82pub fn savings_ratio(images: &[ImageRef]) -> f64 {
83    if images.is_empty() {
84        return 0.0;
85    }
86    duplicate_count(images) as f64 / images.len() as f64
87}
88
89/// FNV-1a 64-bit hash for byte slices.
90fn fnv1a_hash(data: &[u8]) -> u64 {
91    let mut hash: u64 = 0xcbf29ce484222325;
92    for &byte in data {
93        hash ^= byte as u64;
94        hash = hash.wrapping_mul(0x100000001b3);
95    }
96    hash
97}
98
99#[cfg(test)]
100mod tests {
101    use super::*;
102
103    fn make_image(data: &[u8], page: u32, w: u32, h: u32) -> ImageRef {
104        ImageRef {
105            data: data.to_vec(),
106            width: w,
107            height: h,
108            page_number: page,
109            object_id: None,
110        }
111    }
112
113    #[test]
114    fn test_fingerprint_deterministic() {
115        let img = make_image(b"hello image data", 1, 100, 50);
116        let fp1 = fingerprint(&img);
117        let fp2 = fingerprint(&img);
118        assert_eq!(fp1, fp2);
119    }
120
121    #[test]
122    fn test_fingerprint_different_data() {
123        let img1 = make_image(b"image A", 1, 100, 50);
124        let img2 = make_image(b"image B", 1, 100, 50);
125        assert_ne!(fingerprint(&img1), fingerprint(&img2));
126    }
127
128    #[test]
129    fn test_deduplicate() {
130        let images = vec![
131            make_image(b"logo", 1, 50, 50),
132            make_image(b"logo", 2, 50, 50),    // duplicate
133            make_image(b"logo", 3, 50, 50),    // duplicate
134            make_image(b"photo", 1, 200, 150), // unique
135        ];
136        let deduped = deduplicate(&images);
137        assert_eq!(deduped.len(), 2); // 2 unique images
138                                      // "logo" has 3 occurrences, should be first (sorted by count desc)
139        assert_eq!(deduped[0].occurrence_count, 3);
140        assert_eq!(deduped[0].page_numbers.len(), 3);
141        assert_eq!(deduped[1].occurrence_count, 1);
142    }
143
144    #[test]
145    fn test_duplicate_count() {
146        let images = vec![
147            make_image(b"same", 1, 10, 10),
148            make_image(b"same", 2, 10, 10),
149            make_image(b"diff", 3, 10, 10),
150        ];
151        assert_eq!(duplicate_count(&images), 1); // 3 images - 2 unique = 1 dup
152    }
153
154    #[test]
155    fn test_savings_ratio() {
156        let images = vec![
157            make_image(b"x", 1, 10, 10),
158            make_image(b"x", 2, 10, 10),
159            make_image(b"x", 3, 10, 10),
160            make_image(b"x", 4, 10, 10),
161        ];
162        let ratio = savings_ratio(&images);
163        // 4 images, 1 unique = 3 duplicates. 3/4 = 0.75
164        assert!((ratio - 0.75).abs() < 0.001);
165    }
166
167    #[test]
168    fn test_empty_images() {
169        let images: Vec<ImageRef> = vec![];
170        assert_eq!(duplicate_count(&images), 0);
171        assert_eq!(savings_ratio(&images), 0.0);
172        assert!(deduplicate(&images).is_empty());
173    }
174}