Skip to main content

pdfluent_lopdf/
document.rs

1use super::encodings::Encoding;
2use super::{Bookmark, Dictionary, Object, ObjectId};
3use crate::encryption::crypt_filters::*;
4use crate::encryption::{self, EncryptionState, PasswordAlgorithm};
5use crate::xobject::PdfImage;
6use crate::xref::{Xref, XrefType};
7use crate::{Error, ObjectStream, Result, Stream};
8use log::debug;
9use std::cmp::max;
10use std::collections::{BTreeMap, HashMap, HashSet};
11use std::io::Write;
12use std::str;
13use std::sync::Arc;
14
15/// A PDF document.
16///
17/// This can both be a combination of multiple incremental updates
18/// or just one (the last) incremental update in a PDF file.
19#[derive(Debug, Clone)]
20pub struct Document {
21    /// The version of the PDF specification to which the file conforms.
22    pub version: String,
23
24    /// The binary mark important for PDF A/2,3 tells various software tools to classify
25    /// the file as containing 8-bit binary that should be preserved during processing
26    pub binary_mark: Vec<u8>,
27
28    /// The trailer gives the location of the cross-reference table and of certain special objects.
29    pub trailer: Dictionary,
30
31    /// The cross-reference table contains locations of the indirect objects.
32    pub reference_table: Xref,
33
34    /// The objects that make up the document contained in the file.
35    pub objects: BTreeMap<ObjectId, Object>,
36
37    /// Current maximum object id within the document.
38    pub max_id: u32,
39
40    /// Current maximum object id within Bookmarks.
41    pub max_bookmark_id: u32,
42
43    /// The bookmarks in the document. Render at the very end of document after renumbering objects.
44    pub bookmarks: Vec<u32>,
45
46    /// used to locate a stored Bookmark so children can be appended to it via its id. Otherwise we
47    /// need to do recursive lookups and returns on the bookmarks internal layout Vec
48    pub bookmark_table: HashMap<u32, Bookmark>,
49
50    /// The byte the cross-reference table starts at.
51    /// This value is only set during reading, but not when writing the file.
52    /// It is used to support incremental updates in PDFs.
53    /// Default value is `0`.
54    pub xref_start: usize,
55
56    /// The encryption state stores the parameters that were used to decrypt this document if the
57    /// document has been decrypted.
58    pub encryption_state: Option<EncryptionState>,
59
60    /// ObjStm container object IDs that have not yet been decompressed.
61    ///
62    /// Populated only when `LoadOptions::lazy_objstm` is set during loading.
63    /// Call `Document::resolve_pending_object_streams` to extract the objects
64    /// contained in these streams before accessing them.
65    pub pending_obj_streams: Vec<ObjectId>,
66}
67
68impl Document {
69    /// Create new PDF document.
70    pub fn new() -> Self {
71        Self {
72            version: "1.4".to_string(),
73            binary_mark: vec![0xBB, 0xAD, 0xC0, 0xDE],
74            trailer: Dictionary::new(),
75            reference_table: Xref::new(0, XrefType::CrossReferenceStream),
76            objects: BTreeMap::new(),
77            max_id: 0,
78            max_bookmark_id: 0,
79            bookmarks: Vec::new(),
80            bookmark_table: HashMap::new(),
81            xref_start: 0,
82            encryption_state: None,
83            pending_obj_streams: Vec::new(),
84        }
85    }
86
87    /// Create a new PDF document that is an incremental update to a previous document.
88    pub fn new_from_prev(prev: &Document) -> Self {
89        let mut new_trailer = prev.trailer.clone();
90        new_trailer.set("Prev", Object::Integer(prev.xref_start as i64));
91        Self {
92            version: "1.4".to_string(),
93            binary_mark: vec![0xBB, 0xAD, 0xC0, 0xDE],
94            trailer: new_trailer,
95            reference_table: Xref::new(0, prev.reference_table.cross_reference_type),
96            objects: BTreeMap::new(),
97            max_id: prev.max_id,
98            max_bookmark_id: prev.max_bookmark_id,
99            bookmarks: Vec::new(),
100            bookmark_table: HashMap::new(),
101            xref_start: 0,
102            encryption_state: None,
103            pending_obj_streams: Vec::new(),
104        }
105    }
106
/// Maximum number of chained references that are followed while resolving
/// an object; it bounds the work done on long or cyclic reference chains.
const DEREF_LIMIT: usize = 128;
108
109    fn recursive_fix_pages(&mut self, bookmarks: &[u32], first: bool) -> ObjectId {
110        if !bookmarks.is_empty() {
111            for id in bookmarks {
112                let (children, mut page) = match self.bookmark_table.get(id) {
113                    Some(n) => (n.children.clone(), n.page),
114                    None => return (0, 0),
115                };
116
117                if 0 == page.0 && !children.is_empty() {
118                    let objectid = self.recursive_fix_pages(&children[..], false);
119
120                    let bookmark = self.bookmark_table.get_mut(id).unwrap();
121                    bookmark.page = objectid;
122                    page = objectid;
123                }
124
125                if !first && 0 != page.0 {
126                    return page;
127                }
128
129                if first && !children.is_empty() {
130                    self.recursive_fix_pages(&children[..], first);
131                }
132            }
133        }
134
135        (0, 0)
136    }
137
138    /// Adjusts the Parents that have a ObjectId of (0,_) to that
139    /// of their first child. will recurse through all entries
140    /// till all parents of children are set. This should be
141    /// ran before building the final bookmark objects but after
142    /// renumbering of objects.
143    pub fn adjust_zero_pages(&mut self) {
144        self.recursive_fix_pages(&self.bookmarks.clone(), true);
145    }
146
147    /// Follow references if the supplied object is a reference.
148    ///
149    /// Returns a tuple of an optional object id and final object.
150    /// The object id will be None if the object was not a
151    /// reference. Otherwise, it will be the last object id in the
152    /// reference chain.
153    pub fn dereference<'a>(
154        &'a self,
155        mut object: &'a Object,
156    ) -> Result<(Option<ObjectId>, &'a Object)> {
157        let mut nb_deref = 0;
158        let mut id = None;
159
160        while let Ok(ref_id) = object.as_reference() {
161            id = Some(ref_id);
162            object = self
163                .objects
164                .get(&ref_id)
165                .ok_or(Error::ObjectNotFound(ref_id))?;
166
167            nb_deref += 1;
168            if nb_deref > Self::DEREF_LIMIT {
169                return Err(Error::ReferenceLimit);
170            }
171        }
172
173        Ok((id, object))
174    }
175
176    /// Get object by object id, will iteratively dereference a referenced object.
177    pub fn get_object(&self, id: ObjectId) -> Result<&Object> {
178        let object = self.objects.get(&id).ok_or(Error::ObjectNotFound(id))?;
179        self.dereference(object).map(|(_, object)| object)
180    }
181
182    /// Determines if an object exists in the current document (or incremental update.)
183    /// with the given `ObjectId`.
184    /// `true` if the object exists, `false` if it does not exist.
185    pub fn has_object(&self, id: ObjectId) -> bool {
186        self.objects.contains_key(&id)
187    }
188
189    /// Get mutable reference to object by object ID, will iteratively dereference a referenced object.
190    pub fn get_object_mut(&mut self, id: ObjectId) -> Result<&mut Object> {
191        let object = self.objects.get(&id).ok_or(Error::ObjectNotFound(id))?;
192        let (ref_id, _obj) = self.dereference(object)?;
193
194        let target_id = ref_id.unwrap_or(id);
195        self.objects
196            .get_mut(&target_id)
197            .ok_or(Error::ObjectNotFound(target_id))
198    }
199
200    /// Decompress and extract all ObjStm streams deferred by `LoadOptions::lazy_objstm`.
201    ///
202    /// Must be called before accessing any object that resides inside an ObjStm
203    /// container when the document was loaded with `lazy_objstm: true`.  Safe to
204    /// call on documents loaded without the lazy flag (it is a no-op then).
205    ///
206    /// After this call `pending_obj_streams` is empty and every contained object
207    /// is accessible via `get_object`.
208    pub fn resolve_pending_object_streams(&mut self) -> Result<()> {
209        // Drain the pending list so we can iterate while mutating self.objects.
210        let ids: Vec<ObjectId> = self.pending_obj_streams.drain(..).collect();
211        for container_id in ids {
212            let mut stream = self
213                .objects
214                .get(&container_id)
215                .ok_or(Error::ObjStmDecompress {
216                    container_id: container_id.0,
217                })?
218                .as_stream()?
219                .clone();
220            let obj_stream =
221                ObjectStream::new(&mut stream).map_err(|_| Error::ObjStmDecompress {
222                    container_id: container_id.0,
223                })?;
224            // Only insert objects whose xref entry assigns them to this container.
225            // This prevents stale copies from older ObjStm containers (incremental
226            // saves) from winning non-deterministically.
227            for (id, object) in obj_stream.objects {
228                if self
229                    .reference_table
230                    .compressed_object_belongs_to(id, container_id)
231                {
232                    self.objects.entry(id).or_insert(object);
233                }
234            }
235            // The container is no longer needed; drop it to free memory.
236            self.objects.remove(&container_id);
237        }
238        Ok(())
239    }
240
241    /// Get the object ID of the page that contains `id`.
242    pub fn get_object_page(&self, id: ObjectId) -> Result<ObjectId> {
243        for (_, object_id) in self.get_pages() {
244            let page = self.get_object(object_id)?.as_dict()?;
245            let annots = page.get(b"Annots")?.as_array()?;
246            let mut objects_ids = annots.iter().map(Object::as_reference);
247
248            let contains = objects_ids.any(|object_id| Some(id) == object_id.ok());
249            if contains {
250                return Ok(object_id);
251            }
252        }
253
254        Err(Error::PageNumberNotFound(0))
255    }
256
257    /// Get dictionary object by id.
258    pub fn get_dictionary(&self, id: ObjectId) -> Result<&Dictionary> {
259        self.get_object(id).and_then(Object::as_dict)
260    }
261
262    /// Get a mutable dictionary object by id.
263    pub fn get_dictionary_mut(&mut self, id: ObjectId) -> Result<&mut Dictionary> {
264        self.get_object_mut(id).and_then(Object::as_dict_mut)
265    }
266
267    /// Get dictionary in dictionary by key.
268    pub fn get_dict_in_dict<'a>(
269        &'a self,
270        node: &'a Dictionary,
271        key: &[u8],
272    ) -> Result<&'a Dictionary> {
273        match node.get(key)? {
274            Object::Reference(object_id) => self.get_dictionary(*object_id),
275            Object::Dictionary(dic) => Ok(dic),
276            obj => Err(Error::ObjectType {
277                expected: "Dictionary",
278                found: obj.enum_variant(),
279            }),
280        }
281    }
282
283    /// Traverse objects from trailer recursively, return all referenced object IDs.
284    pub fn traverse_objects<A: Fn(&mut Object)>(&mut self, action: A) -> Vec<ObjectId> {
285        fn traverse_array<A: Fn(&mut Object)>(
286            array: &mut [Object],
287            action: &A,
288            refs: &mut Vec<ObjectId>,
289        ) {
290            for item in array.iter_mut() {
291                traverse_object(item, action, refs);
292            }
293        }
294        fn traverse_dictionary<A: Fn(&mut Object)>(
295            dict: &mut Dictionary,
296            action: &A,
297            refs: &mut Vec<ObjectId>,
298        ) {
299            for (_, v) in dict.iter_mut() {
300                traverse_object(v, action, refs);
301            }
302        }
303        fn traverse_object<A: Fn(&mut Object)>(
304            object: &mut Object,
305            action: &A,
306            refs: &mut Vec<ObjectId>,
307        ) {
308            action(object);
309            match object {
310                Object::Array(array) => traverse_array(array, action, refs),
311                Object::Dictionary(dict) => traverse_dictionary(dict, action, refs),
312                Object::Stream(stream) => traverse_dictionary(&mut stream.dict, action, refs),
313                Object::Reference(id) if !refs.contains(id) => {
314                    refs.push(*id);
315                }
316                _ => {}
317            }
318        }
319        let mut refs = vec![];
320        traverse_dictionary(&mut self.trailer, &action, &mut refs);
321        let mut index = 0;
322        while index < refs.len() {
323            if let Some(object) = self.objects.get_mut(&refs[index]) {
324                traverse_object(object, &action, &mut refs);
325            }
326            index += 1;
327        }
328        refs
329    }
330
331    /// Return dictionary with encryption information
332    pub fn get_encrypted(&self) -> Result<&Dictionary> {
333        self.trailer
334            .get(b"Encrypt")
335            .and_then(Object::as_reference)
336            .and_then(|id| self.get_dictionary(id))
337    }
338
339    /// Return true if PDF document is currently encrypted
340    pub fn is_encrypted(&self) -> bool {
341        self.get_encrypted().is_ok()
342    }
343
344    /// Return true if the document was originally encrypted when loaded
345    pub fn was_encrypted(&self) -> bool {
346        self.encryption_state.is_some()
347    }
348
349    /// Authenticate the provided owner password directly as bytes without sanitization
350    pub fn authenticate_raw_owner_password<P>(&self, password: P) -> Result<()>
351    where
352        P: AsRef<[u8]>,
353    {
354        if !self.is_encrypted() {
355            return Err(Error::NotEncrypted);
356        }
357
358        let password = password.as_ref();
359        let algorithm = PasswordAlgorithm::try_from(self)?;
360        algorithm.authenticate_owner_password(self, password)?;
361
362        Ok(())
363    }
364
365    /// Authenticate the provided user password directly as bytes without sanitization
366    pub fn authenticate_raw_user_password<P>(&self, password: P) -> Result<()>
367    where
368        P: AsRef<[u8]>,
369    {
370        if !self.is_encrypted() {
371            return Err(Error::NotEncrypted);
372        }
373
374        let password = password.as_ref();
375        let algorithm = PasswordAlgorithm::try_from(self)?;
376        algorithm.authenticate_user_password(self, password)?;
377
378        Ok(())
379    }
380
381    /// Authenticate the provided owner/user password as bytes without sanitization
382    pub fn authenticate_raw_password<P>(&self, password: P) -> Result<()>
383    where
384        P: AsRef<[u8]>,
385    {
386        if !self.is_encrypted() {
387            return Err(Error::NotEncrypted);
388        }
389
390        let password = password.as_ref();
391        let algorithm = PasswordAlgorithm::try_from(self)?;
392        algorithm
393            .authenticate_owner_password(self, password)
394            .or(algorithm.authenticate_user_password(self, password))?;
395
396        Ok(())
397    }
398
399    /// Authenticate the provided owner password
400    pub fn authenticate_owner_password(&self, password: &str) -> Result<()> {
401        if !self.is_encrypted() {
402            return Err(Error::NotEncrypted);
403        }
404
405        let algorithm = PasswordAlgorithm::try_from(self)?;
406        let password = algorithm.sanitize_password(password)?;
407        algorithm.authenticate_owner_password(self, &password)?;
408
409        Ok(())
410    }
411
412    /// Authenticate the provided user password
413    pub fn authenticate_user_password(&self, password: &str) -> Result<()> {
414        if !self.is_encrypted() {
415            return Err(Error::NotEncrypted);
416        }
417
418        let algorithm = PasswordAlgorithm::try_from(self)?;
419        let password = algorithm.sanitize_password(password)?;
420        algorithm.authenticate_user_password(self, &password)?;
421
422        Ok(())
423    }
424
425    /// Authenticate the provided owner/user password
426    pub fn authenticate_password(&self, password: &str) -> Result<()> {
427        if !self.is_encrypted() {
428            return Err(Error::NotEncrypted);
429        }
430
431        let algorithm = PasswordAlgorithm::try_from(self)?;
432        let password = algorithm.sanitize_password(password)?;
433        algorithm
434            .authenticate_owner_password(self, &password)
435            .or(algorithm.authenticate_user_password(self, &password))?;
436
437        Ok(())
438    }
439
440    /// Returns a `BTreeMap` of the crypt filters available in the PDF document if any.
441    pub fn get_crypt_filters(&self) -> BTreeMap<Vec<u8>, Arc<dyn CryptFilter>> {
442        let mut crypt_filters = BTreeMap::new();
443
444        if let Ok(filters) = self
445            .get_encrypted()
446            .and_then(|dict| dict.get(b"CF"))
447            .and_then(|object| object.as_dict())
448        {
449            for (name, filter) in filters {
450                let Ok(filter) = filter.as_dict() else {
451                    continue;
452                };
453
454                if filter.get(b"Type").is_ok() && !filter.has_type(b"CryptFilter") {
455                    continue;
456                }
457
458                // Get the Crypt Filter Method (CFM) used, if any, by the PDF reader to decrypt data.
459                let cfm = filter.get(b"CFM").and_then(|object| object.as_name()).ok();
460
461                let crypt_filter: Arc<dyn CryptFilter> = match cfm {
462                    // The application shall ask the security handler for the file encryption key
463                    // and shall implicitly decrypt data using the RC4 algorithm.
464                    Some(b"V2") => Arc::new(Rc4CryptFilter),
465                    // The application shall ask the security handler for the file encryption key
466                    // and shall implicitly decrypt data using the AES-128 algorithm in Cipher
467                    // Block Chaining (CBC) mode with a 16-byte block size and an initialization
468                    // vector that shall be randomly generated and placed as the first 16 bytes in
469                    // the stream or string. The key size (Length) shall be 128 bits.
470                    Some(b"AESV2") => Arc::new(Aes128CryptFilter),
471                    // The application shall ask the security handler for the file encryption key
472                    // and shall implicitly decrypt data using the AES-256 algorithm in Cipher
473                    // Block Chaining (CBC) with padding mode with a 16-byte block size and an
474                    // initialization vector that is randomly generated and placed as the first 16
475                    // bytes in the stream or string. The key size (Length) shall be 256 bits.
476                    Some(b"AESV3") => Arc::new(Aes256CryptFilter),
477                    // The application shall not decrypt data but shall direct the input stream to
478                    // the security handler for decryption.
479                    Some(b"Identity") | None => Arc::new(IdentityCryptFilter),
480                    // Unknown crypt filter method.
481                    _ => continue,
482                };
483
484                crypt_filters.insert(name.to_vec(), crypt_filter);
485            }
486        }
487
488        crypt_filters
489    }
490
491    /// Replaces all encrypted Strings and Streams with their encrypted contents
492    pub fn encrypt(&mut self, state: &EncryptionState) -> Result<()> {
493        if self.is_encrypted() {
494            return Err(Error::AlreadyEncrypted);
495        }
496
497        let encrypted = state.encode()?;
498
499        for (&id, obj) in self.objects.iter_mut() {
500            encryption::encrypt_object(state, id, obj)?;
501        }
502
503        let object_id = self.add_object(encrypted);
504        self.trailer.set(b"Encrypt", Object::Reference(object_id));
505        self.encryption_state = None;
506
507        Ok(())
508    }
509
510    /// Replaces all encrypted Strings and Streams with their decrypted contents
511    pub fn decrypt(&mut self, password: &str) -> Result<()> {
512        if !self.is_encrypted() {
513            return Err(Error::NotEncrypted);
514        }
515
516        let algorithm = PasswordAlgorithm::try_from(&*self)?;
517        let password = algorithm.sanitize_password(password)?;
518        self.decrypt_raw(&password)
519    }
520
521    /// Replaces all encrypted Strings and Streams with their decrypted contents with the password
522    /// provided directly as bytes without sanitization
523    pub fn decrypt_raw<P>(&mut self, password: P) -> Result<()>
524    where
525        P: AsRef<[u8]>,
526    {
527        if !self.is_encrypted() {
528            return Err(Error::NotEncrypted);
529        }
530
531        self.authenticate_raw_password(&password)?;
532
533        // Find the ID of the encryption dict; we'll want to skip it when decrypting
534        let encryption_obj_id = self
535            .trailer
536            .get(b"Encrypt")
537            .and_then(Object::as_reference)?;
538
539        let state = EncryptionState::decode(&*self, password)?;
540
541        for (&id, obj) in self.objects.iter_mut() {
542            // The encryption dictionary is not encrypted, leave it alone
543            if id == encryption_obj_id {
544                continue;
545            }
546
547            encryption::decrypt_object(&state, id, obj)?;
548        }
549
550        // Add the objects from the object streams now that they have been decrypted.
551        let mut object_streams = vec![];
552
553        for (_, object) in self.objects.iter_mut() {
554            let Ok(ref mut stream) = object.as_stream_mut() else {
555                continue;
556            };
557
558            if !stream.dict.has_type(b"ObjStm") {
559                continue;
560            }
561
562            let Some(obj_stream) = ObjectStream::new(stream).ok() else {
563                continue;
564            };
565
566            // TODO: Is insert and replace intended behavior?
567            // See https://github.com/J-F-Liu/lopdf/issues/160 for more info
568            object_streams.extend(obj_stream.objects);
569        }
570
571        // Only add entries, but never replace entries
572        for (id, entry) in object_streams {
573            self.objects.entry(id).or_insert(entry);
574        }
575
576        let object_id = self.trailer.remove(b"Encrypt").unwrap().as_reference()?;
577        self.objects.remove(&object_id);
578
579        self.encryption_state = Some(state);
580
581        Ok(())
582    }
583
584    /// Return the PDF document catalog, which is the root of the document's object graph.
585    pub fn catalog(&self) -> Result<&Dictionary> {
586        self.trailer
587            .get(b"Root")
588            .and_then(Object::as_reference)
589            .and_then(|id| self.get_dictionary(id))
590    }
591
592    /// Return a mutable reference to the PDF document catalog, which is the root of the document's
593    /// object graph.
594    pub fn catalog_mut(&mut self) -> Result<&mut Dictionary> {
595        self.trailer
596            .get(b"Root")
597            .and_then(Object::as_reference)
598            .and_then(move |id| self.get_dictionary_mut(id))
599    }
600
601    /// Get page numbers and corresponding object ids.
602    pub fn get_pages(&self) -> BTreeMap<u32, ObjectId> {
603        self.page_iter()
604            .enumerate()
605            .map(|(i, p)| ((i + 1) as u32, p))
606            .collect()
607    }
608
609    pub fn page_iter(&self) -> impl Iterator<Item = ObjectId> + '_ {
610        PageTreeIter::new(self)
611    }
612
613    /// Get content stream object ids of a page.
614    pub fn get_page_contents(&self, page_id: ObjectId) -> Vec<ObjectId> {
615        let mut streams = vec![];
616        if let Ok(page) = self.get_dictionary(page_id) {
617            let mut nb_deref = 0;
618            // Since we're looking for object IDs, we can't use get_deref
619            // so manually walk any references in contents object
620            if let Ok(mut contents) = page.get(b"Contents") {
621                loop {
622                    match contents {
623                        Object::Reference(id) => match self.objects.get(id) {
624                            None | Some(Object::Stream(_)) => {
625                                streams.push(*id);
626                            }
627                            Some(o) => {
628                                nb_deref += 1;
629                                if nb_deref < Self::DEREF_LIMIT {
630                                    contents = o;
631                                    continue;
632                                }
633                            }
634                        },
635                        Object::Array(arr) => {
636                            for content in arr {
637                                if let Ok(id) = content.as_reference() {
638                                    streams.push(id)
639                                }
640                            }
641                        }
642                        _ => {}
643                    }
644                    break;
645                }
646            }
647        }
648        streams
649    }
650
651    /// Add content to a page. All existing content will be unchanged.
652    pub fn add_page_contents(&mut self, page_id: ObjectId, content: Vec<u8>) -> Result<()> {
653        let page = self.get_dictionary(page_id)?;
654        let mut current_content_list: Vec<Object> = match page.get(b"Contents") {
655            Ok(Object::Reference(id)) => {
656                vec![Object::Reference(*id)]
657            }
658            Ok(Object::Array(arr)) => arr.clone(),
659            _ => vec![],
660        };
661        let content_object_id =
662            self.add_object(Object::Stream(Stream::new(Dictionary::new(), content)));
663        current_content_list.push(Object::Reference(content_object_id));
664
665        let page_mut = self.get_object_mut(page_id).and_then(Object::as_dict_mut)?;
666        page_mut.set("Contents", current_content_list);
667        Ok(())
668    }
669
670    /// Get content of a page.
671    pub fn get_page_content(&self, page_id: ObjectId) -> Result<Vec<u8>> {
672        let mut content = Vec::new();
673        let content_streams = self.get_page_contents(page_id);
674        for object_id in content_streams {
675            if let Ok(content_stream) = self.get_object(object_id).and_then(Object::as_stream) {
676                match content_stream.decompressed_content() {
677                    Ok(data) => content.write_all(&data)?,
678                    Err(_) => content.write_all(&content_stream.content)?,
679                };
680            }
681        }
682        Ok(content)
683    }
684
685    /// Get resources used by a page.
686    pub fn get_page_resources(
687        &self,
688        page_id: ObjectId,
689    ) -> Result<(Option<&Dictionary>, Vec<ObjectId>)> {
690        fn collect_resources(
691            page_node: &Dictionary,
692            resource_ids: &mut Vec<ObjectId>,
693            doc: &Document,
694            already_seen: &mut HashSet<ObjectId>,
695        ) -> Result<()> {
696            if let Ok(resource_id) = page_node.get(b"Resources").and_then(Object::as_reference) {
697                resource_ids.push(resource_id);
698            }
699            if let Ok(parent_id) = page_node.get(b"Parent").and_then(Object::as_reference) {
700                if already_seen.contains(&parent_id) {
701                    return Err(Error::ReferenceCycle(parent_id));
702                }
703                already_seen.insert(parent_id);
704                let parent_dict = doc.get_dictionary(parent_id)?;
705                collect_resources(parent_dict, resource_ids, doc, already_seen)?;
706            }
707            Ok(())
708        }
709
710        let mut resource_dict = None;
711        let mut resource_ids = Vec::new();
712        if let Ok(page) = self.get_dictionary(page_id) {
713            resource_dict = page.get(b"Resources").and_then(Object::as_dict).ok();
714            collect_resources(page, &mut resource_ids, self, &mut HashSet::new())?;
715        }
716        Ok((resource_dict, resource_ids))
717    }
718
719    /// Get fonts used by a page.
720    pub fn get_page_fonts(&self, page_id: ObjectId) -> Result<BTreeMap<Vec<u8>, &Dictionary>> {
721        fn collect_fonts_from_resources<'a>(
722            resources: &'a Dictionary,
723            fonts: &mut BTreeMap<Vec<u8>, &'a Dictionary>,
724            doc: &'a Document,
725        ) {
726            if let Ok(font) = resources.get(b"Font") {
727                let font_dict = match font {
728                    Object::Reference(id) => doc.get_object(*id).and_then(Object::as_dict).ok(),
729                    Object::Dictionary(dict) => Some(dict),
730                    _ => None,
731                };
732                if let Some(font_dict) = font_dict {
733                    for (name, value) in font_dict.iter() {
734                        let font = match value {
735                            Object::Reference(id) => doc.get_dictionary(*id).ok(),
736                            Object::Dictionary(dict) => Some(dict),
737                            _ => None,
738                        };
739                        if !fonts.contains_key(name) {
740                            font.map(|font| fonts.insert(name.clone(), font));
741                        }
742                    }
743                }
744            }
745        }
746
747        let mut fonts = BTreeMap::new();
748        let (resource_dict, resource_ids) = self.get_page_resources(page_id)?;
749        if let Some(resources) = resource_dict {
750            collect_fonts_from_resources(resources, &mut fonts, self);
751        }
752        for resource_id in resource_ids {
753            if let Ok(resources) = self.get_dictionary(resource_id) {
754                collect_fonts_from_resources(resources, &mut fonts, self);
755            }
756        }
757        Ok(fonts)
758    }
759
760    /// Get the PDF annotations of a page. The /Subtype of each annotation dictionary defines the
761    /// annotation type (Text, Link, Highlight, Underline, Ink, Popup, Widget, etc.). The /Rect of
762    /// an annotation dictionary defines its location on the page.
763    pub fn get_page_annotations(&self, page_id: ObjectId) -> Result<Vec<&Dictionary>> {
764        let mut annotations = vec![];
765        if let Ok(page) = self.get_dictionary(page_id) {
766            match page.get(b"Annots") {
767                Ok(Object::Reference(id)) => self
768                    .get_object(*id)
769                    .and_then(Object::as_array)?
770                    .iter()
771                    .flat_map(Object::as_reference)
772                    .flat_map(|id| self.get_dictionary(id))
773                    .for_each(|a| annotations.push(a)),
774                Ok(Object::Array(a)) => a
775                    .iter()
776                    .flat_map(Object::as_reference)
777                    .flat_map(|id| self.get_dictionary(id))
778                    .for_each(|a| annotations.push(a)),
779                _ => {}
780            }
781        }
782        Ok(annotations)
783    }
784
785    pub fn get_page_images(&'_ self, page_id: ObjectId) -> Result<Vec<PdfImage<'_>>> {
786        let mut images = vec![];
787        if let Ok(page) = self.get_dictionary(page_id) {
788            let resources = self.get_dict_in_dict(page, b"Resources")?;
789            let xobject = match self.get_dict_in_dict(resources, b"XObject") {
790                Ok(xobject) => xobject,
791                Err(err) => match err {
792                    // XObject is optional, no images found
793                    Error::DictKey(_) => return Ok(Vec::default()),
794                    _ => Err(err)?,
795                },
796            };
797
798            for (_, xvalue) in xobject.iter() {
799                let id = xvalue.as_reference()?;
800                let xvalue = self.get_object(id)?;
801                let xvalue = xvalue.as_stream()?;
802                let dict = &xvalue.dict;
803                if dict.get(b"Subtype")?.as_name()? != b"Image" {
804                    continue;
805                }
806                let width = dict.get(b"Width")?.as_i64()?;
807                let height = dict.get(b"Height")?.as_i64()?;
808                let color_space = match dict.get(b"ColorSpace") {
809                    Ok(cs) => match cs {
810                        Object::Array(array) => {
811                            Some(String::from_utf8_lossy(array[0].as_name()?).to_string())
812                        }
813                        Object::Name(name) => Some(String::from_utf8_lossy(name).to_string()),
814                        _ => None,
815                    },
816                    Err(_) => None,
817                };
818                let bits_per_component = match dict.get(b"BitsPerComponent") {
819                    Ok(bpc) => Some(bpc.as_i64()?),
820                    Err(_) => None,
821                };
822                let mut filters = vec![];
823                if let Ok(filter) = dict.get(b"Filter") {
824                    match filter {
825                        Object::Array(array) => {
826                            for obj in array.iter() {
827                                let name = obj.as_name()?;
828                                filters.push(String::from_utf8_lossy(name).to_string());
829                            }
830                        }
831                        Object::Name(name) => {
832                            filters.push(String::from_utf8_lossy(name).to_string());
833                        }
834                        _ => {}
835                    }
836                };
837
838                images.push(PdfImage {
839                    id,
840                    width,
841                    height,
842                    color_space,
843                    bits_per_component,
844                    filters: Some(filters),
845                    content: &xvalue.content,
846                    origin_dict: &xvalue.dict,
847                });
848            }
849        }
850        Ok(images)
851    }
852
853    pub fn decode_text(encoding: &Encoding, bytes: &[u8]) -> Result<String> {
854        debug!("Decoding text with {encoding:#?}");
855        encoding.bytes_to_string(bytes)
856    }
857
858    pub fn encode_text(encoding: &Encoding, text: &str) -> Vec<u8> {
859        encoding.string_to_bytes(text)
860    }
861}
862
863impl Default for Document {
864    fn default() -> Self {
865        Self::new()
866    }
867}
868
869struct PageTreeIter<'a> {
870    doc: &'a Document,
871    stack: Vec<&'a [Object]>,
872    kids: Option<&'a [Object]>,
873    iter_limit: usize,
874}
875
876impl<'a> PageTreeIter<'a> {
877    const PAGE_TREE_DEPTH_LIMIT: usize = 256;
878
879    fn new(doc: &'a Document) -> Self {
880        if let Ok(page_tree_id) = doc
881            .catalog()
882            .and_then(|cat| cat.get(b"Pages"))
883            .and_then(Object::as_reference)
884        {
885            Self {
886                doc,
887                kids: Self::kids(doc, page_tree_id),
888                stack: Vec::with_capacity(32),
889                iter_limit: doc.objects.len(),
890            }
891        } else {
892            Self {
893                doc,
894                kids: None,
895                stack: Vec::new(),
896                iter_limit: doc.objects.len(),
897            }
898        }
899    }
900
901    fn kids(doc: &Document, page_tree_id: ObjectId) -> Option<&[Object]> {
902        doc.get_dictionary(page_tree_id)
903            .and_then(|page_tree| page_tree.get_deref(b"Kids", doc))
904            .and_then(Object::as_array)
905            .map(|k| k.as_slice())
906            .ok()
907    }
908}
909
910impl Iterator for PageTreeIter<'_> {
911    type Item = ObjectId;
912
913    fn next(&mut self) -> Option<Self::Item> {
914        loop {
915            while let Some((kid, new_kids)) = self.kids.and_then(|k| k.split_first()) {
916                if self.iter_limit == 0 {
917                    return None;
918                }
919                self.iter_limit -= 1;
920
921                self.kids = Some(new_kids);
922
923                if let Ok(kid_id) = kid.as_reference() {
924                    if let Ok(type_name) = self
925                        .doc
926                        .get_dictionary(kid_id)
927                        .and_then(Dictionary::get_type)
928                    {
929                        match type_name {
930                            b"Page" => {
931                                return Some(kid_id);
932                            }
933                            b"Pages" if self.stack.len() < Self::PAGE_TREE_DEPTH_LIMIT => {
934                                let kids = self.kids.unwrap();
935                                if !kids.is_empty() {
936                                    self.stack.push(kids);
937                                }
938                                self.kids = Self::kids(self.doc, kid_id);
939                            }
940                            _ => {}
941                        }
942                    }
943                }
944            }
945
946            // Current level exhausted, try to pop.
947            if let kids @ Some(_) = self.stack.pop() {
948                self.kids = kids;
949            } else {
950                return None;
951            }
952        }
953    }
954
955    fn size_hint(&self) -> (usize, Option<usize>) {
956        let kids = self.kids.unwrap_or(&[]);
957
958        let nb_pages: usize = kids
959            .iter()
960            .chain(self.stack.iter().flat_map(|k| k.iter()))
961            .map(|kid| {
962                if let Ok(dict) = kid
963                    .as_reference()
964                    .and_then(|id| self.doc.get_dictionary(id))
965                {
966                    if let Ok(b"Pages") = dict.get_type() {
967                        let count = dict
968                            .get_deref(b"Count", self.doc)
969                            .and_then(Object::as_i64)
970                            .unwrap_or(0);
971                        // Don't let page count go backwards in case of an invalid document.
972                        max(0, count) as usize
973                    } else {
974                        1
975                    }
976                } else {
977                    1
978                }
979            })
980            .sum();
981
982        (nb_pages, Some(nb_pages))
983    }
984}
985
986impl std::iter::FusedIterator for PageTreeIter<'_> {}