1use std::collections::HashMap;
2
3use crate::object::{PdfDict, PdfObject};
4
/// Summary of what a [`clean_objects`] run did to an object list.
///
/// All fields are plain counters, so the struct is cheap to copy and compare.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct CleanStats {
    /// Number of objects dropped because they duplicated an earlier object.
    pub duplicate_objects_removed: usize,
    /// Number of null objects dropped because nothing referenced them.
    pub empty_objects_removed: usize,
    /// Length of the object list before cleaning started.
    pub total_objects_before: usize,
    /// Length of the object list after cleaning finished.
    pub total_objects_after: usize,
}
13
14pub fn clean_objects(objects: &mut Vec<(u32, PdfObject)>) -> CleanStats {
23 let total_before = objects.len();
24
25 let dups_removed = dedup_objects(objects);
27
28 let nulls_removed = remove_null_objects(objects);
30
31 compact_object_numbers(objects);
33
34 CleanStats {
35 duplicate_objects_removed: dups_removed,
36 empty_objects_removed: nulls_removed,
37 total_objects_before: total_before,
38 total_objects_after: objects.len(),
39 }
40}
41
42fn hash_object(obj: &PdfObject) -> String {
45 format!("{}", obj)
47}
48
49fn dedup_objects(objects: &mut Vec<(u32, PdfObject)>) -> usize {
53 let mut hash_to_first: HashMap<String, u32> = HashMap::new();
55 let mut remap: HashMap<u32, u32> = HashMap::new();
57
58 for (obj_num, obj) in objects.iter() {
59 let h = hash_object(obj);
60 match hash_to_first.get(&h) {
61 Some(&first_num) if first_num != *obj_num => {
62 remap.insert(*obj_num, first_num);
63 }
64 _ => {
65 hash_to_first.insert(h, *obj_num);
66 }
67 }
68 }
69
70 if remap.is_empty() {
71 return 0;
72 }
73
74 let removed = remap.len();
75
76 objects.retain(|(obj_num, _)| !remap.contains_key(obj_num));
78
79 for (_, obj) in objects.iter_mut() {
81 rewrite_references(obj, &remap);
82 }
83
84 removed
85}
86
87pub(crate) fn rewrite_references(obj: &mut PdfObject, remap: &HashMap<u32, u32>) {
89 match obj {
90 PdfObject::Reference(r) => {
91 if let Some(&new_num) = remap.get(&r.obj_num) {
92 r.obj_num = new_num;
93 }
94 }
95 PdfObject::Array(items) => {
96 for item in items.iter_mut() {
97 rewrite_references(item, remap);
98 }
99 }
100 PdfObject::Dict(dict) => {
101 rewrite_references_in_dict(dict, remap);
102 }
103 PdfObject::Stream { dict, .. } => {
104 rewrite_references_in_dict(dict, remap);
105 }
106 _ => {}
107 }
108}
109
110fn rewrite_references_in_dict(dict: &mut PdfDict, remap: &HashMap<u32, u32>) {
112 let keys: Vec<Vec<u8>> = dict.keys().cloned().collect();
114 for key in keys {
115 if let Some(val) = dict.get(&key) {
116 let mut val = val.clone();
117 rewrite_references(&mut val, remap);
118 dict.insert(key, val);
119 }
120 }
121}
122
123fn remove_null_objects(objects: &mut Vec<(u32, PdfObject)>) -> usize {
125 let mut referenced: std::collections::HashSet<u32> = std::collections::HashSet::new();
127 for (_, obj) in objects.iter() {
128 collect_references(obj, &mut referenced);
129 }
130
131 let before = objects.len();
132
133 objects.retain(|(obj_num, obj)| {
135 if obj.is_null() && !referenced.contains(obj_num) {
136 false
137 } else {
138 true
139 }
140 });
141
142 before - objects.len()
143}
144
145fn collect_references(obj: &PdfObject, refs: &mut std::collections::HashSet<u32>) {
147 match obj {
148 PdfObject::Reference(r) => {
149 refs.insert(r.obj_num);
150 }
151 PdfObject::Array(items) => {
152 for item in items {
153 collect_references(item, refs);
154 }
155 }
156 PdfObject::Dict(dict) => {
157 for (_, val) in dict.iter() {
158 collect_references(val, refs);
159 }
160 }
161 PdfObject::Stream { dict, .. } => {
162 for (_, val) in dict.iter() {
163 collect_references(val, refs);
164 }
165 }
166 _ => {}
167 }
168}
169
170fn compact_object_numbers(objects: &mut Vec<(u32, PdfObject)>) {
172 let mut remap: HashMap<u32, u32> = HashMap::new();
174 for (i, (obj_num, _)) in objects.iter().enumerate() {
175 let new_num = (i + 1) as u32;
176 if *obj_num != new_num {
177 remap.insert(*obj_num, new_num);
178 }
179 }
180
181 if remap.is_empty() {
182 return;
183 }
184
185 for (i, (obj_num, _)) in objects.iter_mut().enumerate() {
187 *obj_num = (i + 1) as u32;
188 }
189
190 for (_, obj) in objects.iter_mut() {
192 rewrite_references(obj, &remap);
193 }
194}
195
#[cfg(test)]
mod tests {
    use super::*;
    use crate::object::{IndirectRef, PdfDict, PdfObject};

    /// Builds a generation-0 indirect reference to `obj_num`.
    fn reference(obj_num: u32) -> PdfObject {
        PdfObject::Reference(IndirectRef { obj_num, gen_num: 0 })
    }

    /// Returns the object number `obj` points at, failing the test if `obj`
    /// is not a reference.
    fn referenced_number(obj: &PdfObject) -> u32 {
        match obj {
            PdfObject::Reference(r) => r.obj_num,
            _ => panic!("expected reference"),
        }
    }

    /// Builds a dictionary object with a single `BaseFont` name entry.
    fn font_dict(name: &[u8]) -> PdfObject {
        let mut dict = PdfDict::new();
        dict.insert(b"BaseFont".to_vec(), PdfObject::Name(name.to_vec()));
        PdfObject::Dict(dict)
    }

    /// Builds a dictionary object with a single `Key` integer entry.
    fn key_dict(value: i64) -> PdfObject {
        let mut dict = PdfDict::new();
        dict.insert(b"Key".to_vec(), PdfObject::Integer(value));
        PdfObject::Dict(dict)
    }

    /// A duplicate is removed and references to it move to the first copy.
    #[test]
    fn test_clean_removes_duplicates() {
        let mut objects = vec![
            (1, PdfObject::Integer(42)),
            (2, PdfObject::Integer(42)),
            (3, PdfObject::Array(vec![reference(2)])),
        ];

        let stats = clean_objects(&mut objects);

        assert_eq!(stats.duplicate_objects_removed, 1);
        assert_eq!(stats.total_objects_before, 3);
        assert_eq!(stats.total_objects_after, 2);
        assert_eq!(objects.len(), 2);

        // Fail loudly if the array did not survive, rather than skipping the check.
        let items = objects
            .iter()
            .find_map(|(_, obj)| match obj {
                PdfObject::Array(items) => Some(items),
                _ => None,
            })
            .expect("array object should survive cleaning");
        assert_eq!(items.len(), 1);
        assert_eq!(referenced_number(&items[0]), 1);
    }

    /// An unreferenced null object is dropped.
    #[test]
    fn test_clean_removes_null_objects() {
        let mut objects = vec![
            (1, PdfObject::Integer(10)),
            (2, PdfObject::Null),
            (3, PdfObject::String(b"hello".to_vec())),
        ];

        let stats = clean_objects(&mut objects);

        assert_eq!(stats.empty_objects_removed, 1);
        assert_eq!(stats.total_objects_after, 2);
    }

    /// A null object that is still referenced is kept.
    #[test]
    fn test_clean_preserves_referenced_null() {
        let mut objects = vec![(1, reference(2)), (2, PdfObject::Null)];

        let stats = clean_objects(&mut objects);

        assert_eq!(stats.empty_objects_removed, 0);
        assert_eq!(stats.total_objects_after, 2);
    }

    /// Sparse numbers are compacted and references follow the renumbering.
    #[test]
    fn test_compact_renumbering() {
        let mut objects = vec![
            (1, PdfObject::Integer(10)),
            (5, PdfObject::Integer(20)),
            (10, reference(5)),
        ];

        compact_object_numbers(&mut objects);

        let numbers: Vec<u32> = objects.iter().map(|(num, _)| *num).collect();
        assert_eq!(numbers, vec![1, 2, 3]);
        assert_eq!(referenced_number(&objects[2].1), 2);
    }

    /// Already-sequential numbers are left unchanged.
    #[test]
    fn test_compact_already_sequential() {
        let mut objects = vec![
            (1, PdfObject::Integer(10)),
            (2, PdfObject::Integer(20)),
            (3, PdfObject::Integer(30)),
        ];

        compact_object_numbers(&mut objects);

        let numbers: Vec<u32> = objects.iter().map(|(num, _)| *num).collect();
        assert_eq!(numbers, vec![1, 2, 3]);
    }

    /// Stats add up when both a duplicate and a null are removed.
    #[test]
    fn test_clean_stats_correct() {
        let mut objects = vec![
            (1, PdfObject::Integer(42)),
            (2, PdfObject::Integer(42)),
            (3, PdfObject::Null),
            (4, PdfObject::String(b"keep".to_vec())),
            (5, PdfObject::Integer(99)),
        ];

        let stats = clean_objects(&mut objects);

        assert_eq!(stats.total_objects_before, 5);
        assert_eq!(stats.duplicate_objects_removed, 1);
        assert_eq!(stats.empty_objects_removed, 1);
        assert_eq!(stats.total_objects_after, 3);
    }

    /// Two dictionaries with identical content are merged.
    #[test]
    fn test_dedup_dict_objects() {
        let mut objects = vec![(1, key_dict(1)), (2, key_dict(1))];

        let removed = dedup_objects(&mut objects);

        assert_eq!(removed, 1);
        assert_eq!(objects.len(), 1);
    }

    /// References nested inside array -> dict are rewritten.
    #[test]
    fn test_rewrite_references_nested() {
        let mut remap = HashMap::new();
        remap.insert(5u32, 1u32);

        let mut inner = PdfDict::new();
        inner.insert(b"Ref".to_vec(), reference(5));
        let mut obj = PdfObject::Array(vec![PdfObject::Dict(inner)]);

        rewrite_references(&mut obj, &remap);

        // Each level panics on an unexpected shape so the test cannot pass vacuously.
        let items = match &obj {
            PdfObject::Array(items) => items,
            _ => panic!("expected array"),
        };
        let dict = match &items[0] {
            PdfObject::Dict(dict) => dict,
            _ => panic!("expected dict"),
        };
        match dict.get(b"Ref") {
            Some(PdfObject::Reference(r)) => assert_eq!(r.obj_num, 1),
            _ => panic!("expected reference"),
        }
    }

    /// Cleaning an empty list is a no-op with all-zero stats.
    #[test]
    fn test_clean_empty_list() {
        let mut objects: Vec<(u32, PdfObject)> = vec![];

        let stats = clean_objects(&mut objects);

        assert_eq!(stats.total_objects_before, 0);
        assert_eq!(stats.total_objects_after, 0);
        assert_eq!(stats.duplicate_objects_removed, 0);
        assert_eq!(stats.empty_objects_removed, 0);
    }

    /// Names containing spaces do not prevent identical dicts from merging.
    #[test]
    fn test_dedup_names_with_spaces() {
        let mut objects = vec![
            (1, font_dict(b"Pretendard Black")),
            (2, font_dict(b"Pretendard Black")),
        ];

        let removed = dedup_objects(&mut objects);

        assert_eq!(removed, 1, "Identical dicts with space-names should dedup");
    }

    /// Names sharing a prefix up to a space are still treated as different.
    #[test]
    fn test_no_false_dedup_similar_names() {
        let mut objects = vec![
            (1, font_dict(b"Pretendard Black")),
            (2, font_dict(b"Pretendard Bold")),
        ];

        let removed = dedup_objects(&mut objects);

        assert_eq!(removed, 0, "Different names must not dedup");
        assert_eq!(objects.len(), 2);
    }

    /// Equal names hash equal; names differing after a space hash differently.
    #[test]
    fn test_hash_name_with_special_chars() {
        let obj1 = PdfObject::Name(b"Font Name Here".to_vec());
        let obj2 = PdfObject::Name(b"Font Name Here".to_vec());
        assert_eq!(hash_object(&obj1), hash_object(&obj2));

        let obj3 = PdfObject::Name(b"Font Name There".to_vec());
        assert_ne!(hash_object(&obj1), hash_object(&obj3));
    }

    /// Strings containing parentheses hash consistently.
    #[test]
    fn test_hash_string_with_parens() {
        let obj1 = PdfObject::String(b"hello(world)".to_vec());
        let obj2 = PdfObject::String(b"hello(world)".to_vec());
        assert_eq!(hash_object(&obj1), hash_object(&obj2));
    }
}