Skip to main content

justpdf_core/writer/
clean.rs

1use std::collections::HashMap;
2
3use crate::object::{PdfDict, PdfObject};
4
/// Summary counters produced by one cleanup run over an object list.
///
/// Every field is a plain count, so the struct is cheap to copy, compare and
/// default-construct.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct CleanStats {
    /// Number of objects dropped because an earlier object had identical content.
    pub duplicate_objects_removed: usize,
    /// Number of null objects dropped because no remaining object referenced them.
    pub empty_objects_removed: usize,
    /// Length of the object list before any cleanup pass ran.
    pub total_objects_before: usize,
    /// Length of the object list after all cleanup passes ran.
    pub total_objects_after: usize,
}
13
14/// Clean and optimize a PDF document's object list.
15///
16/// Performs the following operations:
17/// 1. Removes duplicate objects (identical content), rewriting references
18/// 2. Removes null/empty objects
19/// 3. Compacts object numbers sequentially to eliminate gaps
20///
21/// Returns statistics about what was cleaned.
22pub fn clean_objects(objects: &mut Vec<(u32, PdfObject)>) -> CleanStats {
23    let total_before = objects.len();
24
25    // Step 1: Remove duplicates by hashing object content
26    let dups_removed = dedup_objects(objects);
27
28    // Step 2: Remove null objects
29    let nulls_removed = remove_null_objects(objects);
30
31    // Step 3: Compact object numbers
32    compact_object_numbers(objects);
33
34    CleanStats {
35        duplicate_objects_removed: dups_removed,
36        empty_objects_removed: nulls_removed,
37        total_objects_before: total_before,
38        total_objects_after: objects.len(),
39    }
40}
41
42/// Hash a PdfObject to a canonical string for deduplication.
43/// This uses the Display representation which is deterministic for our types.
44fn hash_object(obj: &PdfObject) -> String {
45    // Use format! which delegates to Display
46    format!("{}", obj)
47}
48
49/// Remove duplicate objects: when two objects have identical content,
50/// keep the first and rewrite all references to point to it.
51/// Returns the number of duplicates removed.
52fn dedup_objects(objects: &mut Vec<(u32, PdfObject)>) -> usize {
53    // Build a map from object hash -> first object number with that hash
54    let mut hash_to_first: HashMap<String, u32> = HashMap::new();
55    // Map from removed obj_num -> replacement obj_num
56    let mut remap: HashMap<u32, u32> = HashMap::new();
57
58    for (obj_num, obj) in objects.iter() {
59        let h = hash_object(obj);
60        match hash_to_first.get(&h) {
61            Some(&first_num) if first_num != *obj_num => {
62                remap.insert(*obj_num, first_num);
63            }
64            _ => {
65                hash_to_first.insert(h, *obj_num);
66            }
67        }
68    }
69
70    if remap.is_empty() {
71        return 0;
72    }
73
74    let removed = remap.len();
75
76    // Remove the duplicate objects
77    objects.retain(|(obj_num, _)| !remap.contains_key(obj_num));
78
79    // Rewrite all references in remaining objects
80    for (_, obj) in objects.iter_mut() {
81        rewrite_references(obj, &remap);
82    }
83
84    removed
85}
86
87/// Recursively rewrite indirect references according to the remap table.
88pub(crate) fn rewrite_references(obj: &mut PdfObject, remap: &HashMap<u32, u32>) {
89    match obj {
90        PdfObject::Reference(r) => {
91            if let Some(&new_num) = remap.get(&r.obj_num) {
92                r.obj_num = new_num;
93            }
94        }
95        PdfObject::Array(items) => {
96            for item in items.iter_mut() {
97                rewrite_references(item, remap);
98            }
99        }
100        PdfObject::Dict(dict) => {
101            rewrite_references_in_dict(dict, remap);
102        }
103        PdfObject::Stream { dict, .. } => {
104            rewrite_references_in_dict(dict, remap);
105        }
106        _ => {}
107    }
108}
109
110/// Rewrite references within a dictionary.
111fn rewrite_references_in_dict(dict: &mut PdfDict, remap: &HashMap<u32, u32>) {
112    // We need to collect keys first since we can't mutate while iterating
113    let keys: Vec<Vec<u8>> = dict.keys().cloned().collect();
114    for key in keys {
115        if let Some(val) = dict.get(&key) {
116            let mut val = val.clone();
117            rewrite_references(&mut val, remap);
118            dict.insert(key, val);
119        }
120    }
121}
122
123/// Remove null objects from the list. Returns the number removed.
124fn remove_null_objects(objects: &mut Vec<(u32, PdfObject)>) -> usize {
125    // Collect object numbers that are referenced by other objects
126    let mut referenced: std::collections::HashSet<u32> = std::collections::HashSet::new();
127    for (_, obj) in objects.iter() {
128        collect_references(obj, &mut referenced);
129    }
130
131    let before = objects.len();
132
133    // Remove null objects that are not referenced
134    objects.retain(|(obj_num, obj)| {
135        if obj.is_null() && !referenced.contains(obj_num) {
136            false
137        } else {
138            true
139        }
140    });
141
142    before - objects.len()
143}
144
145/// Collect all object numbers referenced by indirect references in an object.
146fn collect_references(obj: &PdfObject, refs: &mut std::collections::HashSet<u32>) {
147    match obj {
148        PdfObject::Reference(r) => {
149            refs.insert(r.obj_num);
150        }
151        PdfObject::Array(items) => {
152            for item in items {
153                collect_references(item, refs);
154            }
155        }
156        PdfObject::Dict(dict) => {
157            for (_, val) in dict.iter() {
158                collect_references(val, refs);
159            }
160        }
161        PdfObject::Stream { dict, .. } => {
162            for (_, val) in dict.iter() {
163                collect_references(val, refs);
164            }
165        }
166        _ => {}
167    }
168}
169
170/// Compact object numbers sequentially starting from 1, rewriting all references.
171fn compact_object_numbers(objects: &mut Vec<(u32, PdfObject)>) {
172    // Build a mapping from old obj_num -> new obj_num
173    let mut remap: HashMap<u32, u32> = HashMap::new();
174    for (i, (obj_num, _)) in objects.iter().enumerate() {
175        let new_num = (i + 1) as u32;
176        if *obj_num != new_num {
177            remap.insert(*obj_num, new_num);
178        }
179    }
180
181    if remap.is_empty() {
182        return;
183    }
184
185    // Renumber objects
186    for (i, (obj_num, _)) in objects.iter_mut().enumerate() {
187        *obj_num = (i + 1) as u32;
188    }
189
190    // Rewrite references
191    for (_, obj) in objects.iter_mut() {
192        rewrite_references(obj, &remap);
193    }
194}
195
#[cfg(test)]
mod tests {
    use super::*;
    use crate::object::{IndirectRef, PdfDict, PdfObject};

    /// Build an indirect reference object pointing at `obj_num`, generation 0.
    fn reference(obj_num: u32) -> PdfObject {
        PdfObject::Reference(IndirectRef { obj_num, gen_num: 0 })
    }

    /// Build a dictionary object with a single `BaseFont` entry holding `name`.
    fn base_font_dict(name: &[u8]) -> PdfObject {
        let mut dict = PdfDict::new();
        dict.insert(b"BaseFont".to_vec(), PdfObject::Name(name.to_vec()));
        PdfObject::Dict(dict)
    }

    /// Return the object number `obj` refers to; fail the test if it is not a reference.
    fn reference_target(obj: &PdfObject) -> u32 {
        match obj {
            PdfObject::Reference(r) => r.obj_num,
            _ => panic!("expected reference"),
        }
    }

    /// Collect the object numbers of `objects` in list order.
    fn object_numbers(objects: &[(u32, PdfObject)]) -> Vec<u32> {
        objects.iter().map(|(num, _)| *num).collect()
    }

    #[test]
    fn test_clean_removes_duplicates() {
        let mut objects = vec![
            (1, PdfObject::Integer(42)),
            (2, PdfObject::Integer(42)), // duplicate of 1
            (3, PdfObject::Array(vec![reference(2)])),
        ];

        let stats = clean_objects(&mut objects);

        assert_eq!(stats.duplicate_objects_removed, 1);
        assert_eq!(stats.total_objects_before, 3);
        assert_eq!(stats.total_objects_after, 2);

        // Object 2 is gone and the survivors are renumbered without gaps.
        assert_eq!(object_numbers(&objects), vec![1, 2]);

        // The array (originally object 3) must now reference the surviving object 1.
        // A missing array or a non-reference element fails the test instead of
        // being silently skipped.
        let items = objects
            .iter()
            .find_map(|(_, obj)| match obj {
                PdfObject::Array(items) => Some(items),
                _ => None,
            })
            .expect("array object should survive cleanup");
        assert_eq!(items.len(), 1);
        assert_eq!(reference_target(&items[0]), 1);
    }

    #[test]
    fn test_clean_removes_null_objects() {
        let mut objects = vec![
            (1, PdfObject::Integer(10)),
            (2, PdfObject::Null), // unreferenced null -> removed
            (3, PdfObject::String(b"hello".to_vec())),
        ];

        let stats = clean_objects(&mut objects);

        assert_eq!(stats.empty_objects_removed, 1);
        assert_eq!(stats.total_objects_after, 2);
        assert!(objects.iter().all(|(_, obj)| !obj.is_null()));
    }

    #[test]
    fn test_clean_preserves_referenced_null() {
        let mut objects = vec![
            (1, reference(2)),
            (2, PdfObject::Null), // referenced null -> kept
        ];

        let stats = clean_objects(&mut objects);

        assert_eq!(stats.empty_objects_removed, 0);
        assert_eq!(stats.total_objects_after, 2);
        assert!(objects[1].1.is_null());
    }

    #[test]
    fn test_compact_renumbering() {
        let mut objects = vec![
            (1, PdfObject::Integer(10)),
            (5, PdfObject::Integer(20)),
            (10, reference(5)),
        ];

        compact_object_numbers(&mut objects);

        // Should be renumbered to 1, 2, 3
        assert_eq!(object_numbers(&objects), vec![1, 2, 3]);

        // Reference should be updated: old 5 -> new 2
        assert_eq!(reference_target(&objects[2].1), 2);
    }

    #[test]
    fn test_compact_already_sequential() {
        let mut objects = vec![
            (1, PdfObject::Integer(10)),
            (2, PdfObject::Integer(20)),
            (3, PdfObject::Integer(30)),
        ];

        compact_object_numbers(&mut objects);

        assert_eq!(object_numbers(&objects), vec![1, 2, 3]);
    }

    #[test]
    fn test_clean_stats_correct() {
        let mut objects = vec![
            (1, PdfObject::Integer(42)),
            (2, PdfObject::Integer(42)), // dup
            (3, PdfObject::Null),        // unreferenced null
            (4, PdfObject::String(b"keep".to_vec())),
            (5, PdfObject::Integer(99)),
        ];

        let stats = clean_objects(&mut objects);

        assert_eq!(stats.total_objects_before, 5);
        assert_eq!(stats.duplicate_objects_removed, 1);
        assert_eq!(stats.empty_objects_removed, 1);
        assert_eq!(stats.total_objects_after, 3); // 5 - 1 dup - 1 null
        assert_eq!(object_numbers(&objects), vec![1, 2, 3]);
    }

    #[test]
    fn test_dedup_dict_objects() {
        let mut d1 = PdfDict::new();
        d1.insert(b"Key".to_vec(), PdfObject::Integer(1));
        let mut d2 = PdfDict::new();
        d2.insert(b"Key".to_vec(), PdfObject::Integer(1));

        let mut objects = vec![(1, PdfObject::Dict(d1)), (2, PdfObject::Dict(d2))];

        let removed = dedup_objects(&mut objects);
        assert_eq!(removed, 1);
        // The first occurrence is the one that survives.
        assert_eq!(object_numbers(&objects), vec![1]);
    }

    #[test]
    fn test_rewrite_references_nested() {
        let mut remap = HashMap::new();
        remap.insert(5u32, 1u32);

        let mut inner = PdfDict::new();
        inner.insert(b"Ref".to_vec(), reference(5));
        let mut obj = PdfObject::Array(vec![PdfObject::Dict(inner)]);

        rewrite_references(&mut obj, &remap);

        // Every shape mismatch panics, so the test cannot pass vacuously.
        let items = match &obj {
            PdfObject::Array(items) => items,
            _ => panic!("expected array"),
        };
        let dict = match &items[0] {
            PdfObject::Dict(d) => d,
            _ => panic!("expected dict"),
        };
        match dict.get(b"Ref") {
            Some(value) => assert_eq!(reference_target(value), 1),
            None => panic!("expected reference"),
        }
    }

    #[test]
    fn test_clean_empty_list() {
        let mut objects: Vec<(u32, PdfObject)> = vec![];
        let stats = clean_objects(&mut objects);

        assert_eq!(stats.total_objects_before, 0);
        assert_eq!(stats.total_objects_after, 0);
        assert_eq!(stats.duplicate_objects_removed, 0);
        assert_eq!(stats.empty_objects_removed, 0);
    }

    // ── Name/String special char tests for dedup ────────────────────

    #[test]
    fn test_dedup_names_with_spaces() {
        // Two dicts with the same Name that contains a space
        // must be correctly identified as duplicates
        let mut objects = vec![
            (1, base_font_dict(b"Pretendard Black")),
            (2, base_font_dict(b"Pretendard Black")),
        ];

        let removed = dedup_objects(&mut objects);
        assert_eq!(removed, 1, "Identical dicts with space-names should dedup");
    }

    #[test]
    fn test_no_false_dedup_similar_names() {
        // "Pretendard Black" and "Pretendard Bold" must NOT dedup
        let mut objects = vec![
            (1, base_font_dict(b"Pretendard Black")),
            (2, base_font_dict(b"Pretendard Bold")),
        ];

        let removed = dedup_objects(&mut objects);
        assert_eq!(removed, 0, "Different names must not dedup");
        assert_eq!(objects.len(), 2);
    }

    #[test]
    fn test_hash_name_with_special_chars() {
        // Same name with spaces must hash identically
        let obj1 = PdfObject::Name(b"Font Name Here".to_vec());
        let obj2 = PdfObject::Name(b"Font Name Here".to_vec());
        assert_eq!(hash_object(&obj1), hash_object(&obj2));

        // Different names must hash differently
        let obj3 = PdfObject::Name(b"Font Name There".to_vec());
        assert_ne!(hash_object(&obj1), hash_object(&obj3));
    }

    #[test]
    fn test_hash_string_with_parens() {
        let obj1 = PdfObject::String(b"hello(world)".to_vec());
        let obj2 = PdfObject::String(b"hello(world)".to_vec());
        assert_eq!(hash_object(&obj1), hash_object(&obj2));
    }
}
439}