Skip to main content

pdfluent_lopdf/
processor.rs

1use crate::Result;
2use crate::{Document, Object, ObjectId};
3use std::collections::BTreeMap;
4use std::fs::File;
5use std::io::Write;
6
7impl Document {
8    /// Change producer of document information dictionary.
9    pub fn change_producer(&mut self, producer: &str) {
10        if let Ok(info) = self.trailer.get_mut(b"Info") {
11            if let Some(dict) = match info {
12                Object::Dictionary(dict) => Some(dict),
13                Object::Reference(id) => {
14                    self.objects.get_mut(id).and_then(|o| o.as_dict_mut().ok())
15                }
16                _ => None,
17            } {
18                dict.set("Producer", Object::string_literal(producer));
19            }
20        }
21    }
22
23    /// Compress PDF stream objects.
24    pub fn compress(&mut self) {
25        for object in self.objects.values_mut() {
26            if let Object::Stream(stream) = object {
27                if stream.allows_compression {
28                    // Ignore any error and continue to compress other streams.
29                    let _ = stream.compress();
30                }
31            }
32        }
33    }
34
35    /// Decompress PDF stream objects.
36    pub fn decompress(&mut self) {
37        for object in self.objects.values_mut() {
38            if let Object::Stream(stream) = object {
39                let _ = stream.decompress();
40            }
41        }
42    }
43
44    /// Delete pages.
45    pub fn delete_pages(&mut self, page_numbers: &[u32]) {
46        // Collect ObjectIds for all pages-to-delete in one pass through get_pages().
47        // Then remove page references from Kids arrays and update Count — all in a
48        // single object traversal rather than calling delete_object() (which calls
49        // traverse_objects()) once per page.  The original O(n_pages × n_objects)
50        // loop caused 126 s for a 91-page PDF. (#manipulation-timeout)
51        use std::collections::HashSet;
52
53        let pages = self.get_pages();
54        let ids_to_delete: HashSet<ObjectId> = page_numbers
55            .iter()
56            .filter_map(|pn| pages.get(pn).copied())
57            .collect();
58
59        if ids_to_delete.is_empty() {
60            return;
61        }
62
63        // Track which page-tree nodes need their Count decremented and by how much.
64        let mut count_delta: BTreeMap<ObjectId, i64> = BTreeMap::new();
65
66        for &page_id in &ids_to_delete {
67            // Walk up the Parent chain and record count decrements.
68            if let Some(page_obj) = self.objects.get(&page_id) {
69                let parent_ref = page_obj
70                    .as_dict()
71                    .ok()
72                    .and_then(|d| d.get(b"Parent").ok())
73                    .and_then(|o| o.as_reference().ok());
74                let mut cur = parent_ref;
75                while let Some(tree_id) = cur {
76                    *count_delta.entry(tree_id).or_insert(0) += 1;
77                    cur = self
78                        .objects
79                        .get(&tree_id)
80                        .and_then(|o| o.as_dict().ok())
81                        .and_then(|d| d.get(b"Parent").ok())
82                        .and_then(|o| o.as_reference().ok());
83                }
84            }
85        }
86
87        // Remove deleted page references from all Kids arrays in a single pass.
88        for obj in self.objects.values_mut() {
89            match obj {
90                Object::Array(arr) => {
91                    arr.retain(|item| match item {
92                        Object::Reference(r) => !ids_to_delete.contains(r),
93                        _ => true,
94                    });
95                }
96                Object::Dictionary(dict) => {
97                    if let Ok(Object::Array(arr)) = dict.get_mut(b"Kids") {
98                        arr.retain(|item| match item {
99                            Object::Reference(r) => !ids_to_delete.contains(r),
100                            _ => true,
101                        });
102                    }
103                }
104                _ => {}
105            }
106        }
107
108        // Apply Count decrements to page-tree nodes.
109        for (tree_id, delta) in count_delta {
110            if let Some(obj) = self.objects.get_mut(&tree_id) {
111                if let Ok(dict) = obj.as_dict_mut() {
112                    if let Ok(count) = dict.get(b"Count").and_then(Object::as_i64) {
113                        dict.set("Count", (count - delta).max(0));
114                    }
115                }
116            }
117        }
118
119        // Remove the page objects themselves.
120        for page_id in ids_to_delete {
121            self.objects.remove(&page_id);
122        }
123    }
124
125    /// Prune all unused objects.
126    pub fn prune_objects(&mut self) -> Vec<ObjectId> {
127        let mut ids = vec![];
128        let refs = self.traverse_objects(|_| {});
129        for id in self.objects.keys() {
130            if !refs.contains(id) {
131                ids.push(*id);
132            }
133        }
134
135        for id in &ids {
136            self.objects.remove(id);
137        }
138
139        ids
140    }
141
142    /// Delete object by object ID.
143    pub fn delete_object(&mut self, id: ObjectId) -> Option<Object> {
144        let action = |object: &mut Object| match object {
145            Object::Array(array) => {
146                if let Some(index) = array.iter().position(|item: &Object| match *item {
147                    Object::Reference(ref_id) => ref_id == id,
148                    _ => false,
149                }) {
150                    array.remove(index);
151                }
152            }
153            Object::Dictionary(dict) => {
154                let keys: Vec<Vec<u8>> = dict
155                    .iter()
156                    .filter(|&(_, item): &(&Vec<u8>, &Object)| match *item {
157                        Object::Reference(ref_id) => ref_id == id,
158                        _ => false,
159                    })
160                    .map(|(k, _)| k.clone())
161                    .collect();
162                for key in keys {
163                    dict.remove(&key);
164                }
165            }
166            _ => {}
167        };
168        self.traverse_objects(action);
169        self.objects.remove(&id)
170    }
171
172    /// Delete zero length stream objects.
173    pub fn delete_zero_length_streams(&mut self) -> Vec<ObjectId> {
174        let mut ids = vec![];
175        for id in self.objects.keys() {
176            if self
177                .objects
178                .get(id)
179                .and_then(|o| Object::as_stream(o).ok())
180                .map(|stream| stream.content.is_empty())
181                .unwrap_or(false)
182            {
183                ids.push(*id);
184            }
185        }
186
187        for id in &ids {
188            self.delete_object(*id);
189        }
190
191        ids
192    }
193
194    /// Renumber objects, normally called after delete_unused_objects.
195    pub fn renumber_objects(&mut self) {
196        self.renumber_objects_with(1)
197    }
198
199    fn update_bookmark_pages(&mut self, bookmarks: &[u32], old: &ObjectId, new: &ObjectId) {
200        for id in bookmarks {
201            let (children, page) = match self.bookmark_table.get(id) {
202                Some(n) => (n.children.clone(), n.page),
203                None => return,
204            };
205
206            if page == *old {
207                let bookmark = self.bookmark_table.get_mut(id).unwrap();
208                bookmark.page = *new;
209            }
210
211            if !children.is_empty() {
212                self.update_bookmark_pages(&children[..], old, new);
213            }
214        }
215    }
216
217    pub fn renumber_bookmarks(&mut self, old: &ObjectId, new: &ObjectId) {
218        if !self.bookmarks.is_empty() {
219            self.update_bookmark_pages(&self.bookmarks.clone(), old, new);
220        }
221    }
222
223    /// Renumber objects with a custom starting id, this is very useful in case of multiple
224    /// document object insertions in a single main document
225    pub fn renumber_objects_with(&mut self, starting_id: u32) {
226        let mut replace = BTreeMap::new();
227        let mut new_id = starting_id;
228        let mut i = 0;
229
230        // Check if we need to order the pages first, as this means the first page doesn't have a lower ID.
231        // So it ends up in a random spot based on its ID. We check first to avoid double traversal, unless we have too.
232
233        let mut page_order: Vec<(i32, (u32, u16))> = self
234            .page_iter()
235            .map(|id| {
236                i += 1;
237                (i, id)
238            })
239            .collect();
240
241        page_order.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
242
243        i = 0;
244
245        let needs_ordering = page_order.iter().any(|a| {
246            i += 1;
247            a.0 != i
248        });
249
250        if needs_ordering {
251            let mut pages = page_order.clone();
252            pages.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
253            let mut objects = BTreeMap::new();
254
255            for (old, new) in pages.iter().zip(page_order) {
256                if let Some(object) = self.objects.remove(&old.1) {
257                    objects.insert((new.1.0, old.1.1), object);
258                    replace.insert(old.1, (new.1.0, old.1.1));
259                }
260
261                if old.1 != new.1 {
262                    self.renumber_bookmarks(&old.1, &(new.1.0, old.1.1));
263                }
264            }
265
266            for (new, object) in objects {
267                self.objects.insert(new, object);
268            }
269
270            let action = |object: &mut Object| {
271                if let Object::Reference(id) = object {
272                    if replace.contains_key(id) {
273                        *id = replace[id];
274                    }
275                }
276            };
277
278            self.traverse_objects(action);
279            replace.clear();
280        }
281
282        let mut ids = self.objects.keys().cloned().collect::<Vec<ObjectId>>();
283        ids.sort_unstable();
284
285        for id in ids {
286            if id.0 != new_id {
287                replace.insert(id, (new_id, id.1));
288            }
289
290            new_id += 1;
291        }
292
293        let mut objects = BTreeMap::new();
294
295        // remove and collect all removed objects
296        for (old, new) in &replace {
297            if let Some(object) = self.objects.remove(old) {
298                objects.insert(*new, object);
299            }
300
301            if old != new {
302                self.renumber_bookmarks(old, new);
303            }
304        }
305
306        // insert new replaced keys objects
307        for (new, object) in objects {
308            self.objects.insert(new, object);
309        }
310
311        let action = |object: &mut Object| {
312            if let Object::Reference(id) = object {
313                if replace.contains_key(id) {
314                    *id = replace[id];
315                }
316            }
317        };
318
319        self.traverse_objects(action);
320
321        self.max_id = new_id - 1;
322    }
323
324    pub fn change_content_stream(&mut self, stream_id: ObjectId, content: Vec<u8>) {
325        if let Some(Object::Stream(stream)) = self.objects.get_mut(&stream_id) {
326            stream.set_plain_content(content);
327            // Ignore any compression error.
328            let _ = stream.compress();
329        }
330    }
331
332    pub fn change_page_content(&mut self, page_id: ObjectId, content: Vec<u8>) -> Result<()> {
333        let contents = self
334            .get_dictionary(page_id)
335            .and_then(|page| page.get(b"Contents"))?;
336        match contents {
337            Object::Reference(id) => self.change_content_stream(*id, content),
338            Object::Array(arr) => {
339                if arr.len() == 1 {
340                    if let Ok(id) = arr[0].as_reference() {
341                        self.change_content_stream(id, content)
342                    }
343                } else {
344                    let new_stream = self.add_object(super::Stream::new(dictionary! {}, content));
345                    if let Ok(Object::Dictionary(dict)) = self.get_object_mut(page_id) {
346                        dict.set("Contents", new_stream);
347                    }
348                }
349            }
350            _ => {}
351        }
352        Ok(())
353    }
354
355    pub fn extract_stream(&self, stream_id: ObjectId, decompress: bool) -> Result<()> {
356        let mut file = File::create(format!("{stream_id:?}.bin"))?;
357        if let Ok(Object::Stream(stream)) = self.get_object(stream_id) {
358            if decompress {
359                if let Ok(data) = stream.decompressed_content() {
360                    file.write_all(&data)?;
361                } else {
362                    file.write_all(&stream.content)?;
363                }
364            } else {
365                file.write_all(&stream.content)?;
366            }
367        }
368        Ok(())
369    }
370}