Skip to main content

lopdf/
document.rs

1use super::encodings::Encoding;
2use super::{Bookmark, Dictionary, Object, ObjectId};
3use crate::encryption::crypt_filters::*;
4use crate::encryption::{self, EncryptionState, PasswordAlgorithm};
5use crate::xobject::PdfImage;
6use crate::xref::{Xref, XrefType};
7use crate::{Error, ObjectStream, Result, Stream};
8use log::debug;
9use std::cmp::max;
10use std::collections::{BTreeMap, HashMap, HashSet};
11use std::io::Write;
12use std::str;
13use std::sync::Arc;
14
15/// A PDF document.
16///
17/// This can both be a combination of multiple incremental updates
18/// or just one (the last) incremental update in a PDF file.
19#[derive(Debug, Clone)]
20pub struct Document {
21    /// The version of the PDF specification to which the file conforms.
22    pub version: String,
23
24    /// The binary mark important for PDF A/2,3 tells various software tools to classify
25    /// the file as containing 8-bit binary that should be preserved during processing
26    pub binary_mark: Vec<u8>,
27
28    /// The trailer gives the location of the cross-reference table and of certain special objects.
29    pub trailer: Dictionary,
30
31    /// The cross-reference table contains locations of the indirect objects.
32    pub reference_table: Xref,
33
34    /// The objects that make up the document contained in the file.
35    pub objects: BTreeMap<ObjectId, Object>,
36
37    /// Current maximum object id within the document.
38    pub max_id: u32,
39
40    /// Current maximum object id within Bookmarks.
41    pub max_bookmark_id: u32,
42
43    /// The bookmarks in the document. Render at the very end of document after renumbering objects.
44    pub bookmarks: Vec<u32>,
45
46    /// used to locate a stored Bookmark so children can be appended to it via its id. Otherwise we
47    /// need to do recursive lookups and returns on the bookmarks internal layout Vec
48    pub bookmark_table: HashMap<u32, Bookmark>,
49
50    /// The byte the cross-reference table starts at.
51    /// This value is only set during reading, but not when writing the file.
52    /// It is used to support incremental updates in PDFs.
53    /// Default value is `0`.
54    pub xref_start: usize,
55
56    /// The encryption state stores the parameters that were used to decrypt this document if the
57    /// document has been decrypted.
58    pub encryption_state: Option<EncryptionState>,
59}
60
61impl Document {
62    /// Create new PDF document.
63    pub fn new() -> Self {
64        Self {
65            version: "1.4".to_string(),
66            binary_mark: vec![0xBB, 0xAD, 0xC0, 0xDE],
67            trailer: Dictionary::new(),
68            reference_table: Xref::new(0, XrefType::CrossReferenceStream),
69            objects: BTreeMap::new(),
70            max_id: 0,
71            max_bookmark_id: 0,
72            bookmarks: Vec::new(),
73            bookmark_table: HashMap::new(),
74            xref_start: 0,
75            encryption_state: None,
76        }
77    }
78
79    /// Create a new PDF document that is an incremental update to a previous document.
80    pub fn new_from_prev(prev: &Document) -> Self {
81        let mut new_trailer = prev.trailer.clone();
82        new_trailer.set("Prev", Object::Integer(prev.xref_start as i64));
83        Self {
84            version: "1.4".to_string(),
85            binary_mark: vec![0xBB, 0xAD, 0xC0, 0xDE],
86            trailer: new_trailer,
87            reference_table: Xref::new(0, prev.reference_table.cross_reference_type),
88            objects: BTreeMap::new(),
89            max_id: prev.max_id,
90            max_bookmark_id: prev.max_bookmark_id,
91            bookmarks: Vec::new(),
92            bookmark_table: HashMap::new(),
93            xref_start: 0,
94            encryption_state: None,
95        }
96    }
97
98    const DEREF_LIMIT: usize = 128;
99
100    fn recursive_fix_pages(&mut self, bookmarks: &[u32], first: bool) -> ObjectId {
101        if !bookmarks.is_empty() {
102            for id in bookmarks {
103                let (children, mut page) = match self.bookmark_table.get(id) {
104                    Some(n) => (n.children.clone(), n.page),
105                    None => return (0, 0),
106                };
107
108                if 0 == page.0 && !children.is_empty() {
109                    let objectid = self.recursive_fix_pages(&children[..], false);
110
111                    let bookmark = self.bookmark_table.get_mut(id).unwrap();
112                    bookmark.page = objectid;
113                    page = objectid;
114                }
115
116                if !first && 0 != page.0 {
117                    return page;
118                }
119
120                if first && !children.is_empty() {
121                    self.recursive_fix_pages(&children[..], first);
122                }
123            }
124        }
125
126        (0, 0)
127    }
128
129    /// Adjusts the Parents that have a ObjectId of (0,_) to that
130    /// of their first child. will recurse through all entries
131    /// till all parents of children are set. This should be
132    /// ran before building the final bookmark objects but after
133    /// renumbering of objects.
134    pub fn adjust_zero_pages(&mut self) {
135        self.recursive_fix_pages(&self.bookmarks.clone(), true);
136    }
137
138    /// Follow references if the supplied object is a reference.
139    ///
140    /// Returns a tuple of an optional object id and final object.
141    /// The object id will be None if the object was not a
142    /// reference. Otherwise, it will be the last object id in the
143    /// reference chain.
144    pub fn dereference<'a>(&'a self, mut object: &'a Object) -> Result<(Option<ObjectId>, &'a Object)> {
145        let mut nb_deref = 0;
146        let mut id = None;
147
148        while let Ok(ref_id) = object.as_reference() {
149            id = Some(ref_id);
150            object = self.objects.get(&ref_id).ok_or(Error::ObjectNotFound(ref_id))?;
151
152            nb_deref += 1;
153            if nb_deref > Self::DEREF_LIMIT {
154                return Err(Error::ReferenceLimit);
155            }
156        }
157
158        Ok((id, object))
159    }
160
161    /// Get object by object id, will iteratively dereference a referenced object.
162    pub fn get_object(&self, id: ObjectId) -> Result<&Object> {
163        let object = self.objects.get(&id).ok_or(Error::ObjectNotFound(id))?;
164        self.dereference(object).map(|(_, object)| object)
165    }
166
167    /// Determines if an object exists in the current document (or incremental update.)
168    /// with the given `ObjectId`.
169    /// `true` if the object exists, `false` if it does not exist.
170    pub fn has_object(&self, id: ObjectId) -> bool {
171        self.objects.contains_key(&id)
172    }
173
174    /// Get mutable reference to object by object ID, will iteratively dereference a referenced object.
175    pub fn get_object_mut(&mut self, id: ObjectId) -> Result<&mut Object> {
176        let object = self.objects.get(&id).ok_or(Error::ObjectNotFound(id))?;
177        let (ref_id, _obj) = self.dereference(object)?;
178
179        Ok(self.objects.get_mut(&ref_id.unwrap_or(id)).unwrap())
180    }
181
182    /// Get the object ID of the page that contains `id`.
183    pub fn get_object_page(&self, id: ObjectId) -> Result<ObjectId> {
184        for (_, object_id) in self.get_pages() {
185            let page = self.get_object(object_id)?.as_dict()?;
186            let annots = page.get(b"Annots")?.as_array()?;
187            let mut objects_ids = annots.iter().map(Object::as_reference);
188
189            let contains = objects_ids.any(|object_id| Some(id) == object_id.ok());
190            if contains {
191                return Ok(object_id);
192            }
193        }
194
195        Err(Error::PageNumberNotFound(0))
196    }
197
198    /// Get dictionary object by id.
199    pub fn get_dictionary(&self, id: ObjectId) -> Result<&Dictionary> {
200        self.get_object(id).and_then(Object::as_dict)
201    }
202
203    /// Get a mutable dictionary object by id.
204    pub fn get_dictionary_mut(&mut self, id: ObjectId) -> Result<&mut Dictionary> {
205        self.get_object_mut(id).and_then(Object::as_dict_mut)
206    }
207
208    /// Get dictionary in dictionary by key.
209    pub fn get_dict_in_dict<'a>(&'a self, node: &'a Dictionary, key: &[u8]) -> Result<&'a Dictionary> {
210        match node.get(key)? {
211            Object::Reference(object_id) => self.get_dictionary(*object_id),
212            Object::Dictionary(dic) => Ok(dic),
213            obj => Err(Error::ObjectType {
214                expected: "Dictionary",
215                found: obj.enum_variant(),
216            }),
217        }
218    }
219
220    /// Traverse objects from trailer recursively, return all referenced object IDs.
221    pub fn traverse_objects<A: Fn(&mut Object)>(&mut self, action: A) -> Vec<ObjectId> {
222        fn traverse_array<A: Fn(&mut Object)>(array: &mut [Object], action: &A, refs: &mut Vec<ObjectId>) {
223            for item in array.iter_mut() {
224                traverse_object(item, action, refs);
225            }
226        }
227        fn traverse_dictionary<A: Fn(&mut Object)>(dict: &mut Dictionary, action: &A, refs: &mut Vec<ObjectId>) {
228            for (_, v) in dict.iter_mut() {
229                traverse_object(v, action, refs);
230            }
231        }
232        fn traverse_object<A: Fn(&mut Object)>(object: &mut Object, action: &A, refs: &mut Vec<ObjectId>) {
233            action(object);
234            match object {
235                Object::Array(array) => traverse_array(array, action, refs),
236                Object::Dictionary(dict) => traverse_dictionary(dict, action, refs),
237                Object::Stream(stream) => traverse_dictionary(&mut stream.dict, action, refs),
238                Object::Reference(id) => {
239                    if !refs.contains(id) {
240                        refs.push(*id);
241                    }
242                }
243                _ => {}
244            }
245        }
246        let mut refs = vec![];
247        traverse_dictionary(&mut self.trailer, &action, &mut refs);
248        let mut index = 0;
249        while index < refs.len() {
250            if let Some(object) = self.objects.get_mut(&refs[index]) {
251                traverse_object(object, &action, &mut refs);
252            }
253            index += 1;
254        }
255        refs
256    }
257
258    /// Return dictionary with encryption information
259    pub fn get_encrypted(&self) -> Result<&Dictionary> {
260        self.trailer
261            .get(b"Encrypt")
262            .and_then(Object::as_reference)
263            .and_then(|id| self.get_dictionary(id))
264    }
265
266    /// Return true if PDF document is currently encrypted
267    pub fn is_encrypted(&self) -> bool {
268        self.get_encrypted().is_ok()
269    }
270
271    /// Return true if the document was originally encrypted when loaded
272    pub fn was_encrypted(&self) -> bool {
273        self.encryption_state.is_some()
274    }
275
276    /// Authenticate the provided owner password directly as bytes without sanitization
277    pub fn authenticate_raw_owner_password<P>(&self, password: P) -> Result<()>
278    where
279        P: AsRef<[u8]>,
280    {
281        if !self.is_encrypted() {
282            return Err(Error::NotEncrypted);
283        }
284
285        let password = password.as_ref();
286        let algorithm = PasswordAlgorithm::try_from(self)?;
287        algorithm.authenticate_owner_password(self, password)?;
288
289        Ok(())
290    }
291
292    /// Authenticate the provided user password directly as bytes without sanitization
293    pub fn authenticate_raw_user_password<P>(&self, password: P) -> Result<()>
294    where
295        P: AsRef<[u8]>,
296    {
297        if !self.is_encrypted() {
298            return Err(Error::NotEncrypted);
299        }
300
301        let password = password.as_ref();
302        let algorithm = PasswordAlgorithm::try_from(self)?;
303        algorithm.authenticate_user_password(self, password)?;
304
305        Ok(())
306    }
307
308    /// Authenticate the provided owner/user password as bytes without sanitization
309    pub fn authenticate_raw_password<P>(&self, password: P) -> Result<()>
310    where
311        P: AsRef<[u8]>,
312    {
313        if !self.is_encrypted() {
314            return Err(Error::NotEncrypted);
315        }
316
317        let password = password.as_ref();
318        let algorithm = PasswordAlgorithm::try_from(self)?;
319        algorithm
320            .authenticate_owner_password(self, password)
321            .or(algorithm.authenticate_user_password(self, password))?;
322
323        Ok(())
324    }
325
326    /// Authenticate the provided owner password
327    pub fn authenticate_owner_password(&self, password: &str) -> Result<()> {
328        if !self.is_encrypted() {
329            return Err(Error::NotEncrypted);
330        }
331
332        let algorithm = PasswordAlgorithm::try_from(self)?;
333        let password = algorithm.sanitize_password(password)?;
334        algorithm.authenticate_owner_password(self, &password)?;
335
336        Ok(())
337    }
338
339    /// Authenticate the provided user password
340    pub fn authenticate_user_password(&self, password: &str) -> Result<()> {
341        if !self.is_encrypted() {
342            return Err(Error::NotEncrypted);
343        }
344
345        let algorithm = PasswordAlgorithm::try_from(self)?;
346        let password = algorithm.sanitize_password(password)?;
347        algorithm.authenticate_user_password(self, &password)?;
348
349        Ok(())
350    }
351
352    /// Authenticate the provided owner/user password
353    pub fn authenticate_password(&self, password: &str) -> Result<()> {
354        if !self.is_encrypted() {
355            return Err(Error::NotEncrypted);
356        }
357
358        let algorithm = PasswordAlgorithm::try_from(self)?;
359        let password = algorithm.sanitize_password(password)?;
360        algorithm
361            .authenticate_owner_password(self, &password)
362            .or(algorithm.authenticate_user_password(self, &password))?;
363
364        Ok(())
365    }
366
367    /// Returns a `BTreeMap` of the crypt filters available in the PDF document if any.
368    pub fn get_crypt_filters(&self) -> BTreeMap<Vec<u8>, Arc<dyn CryptFilter>> {
369        let mut crypt_filters = BTreeMap::new();
370
371        if let Ok(filters) = self
372            .get_encrypted()
373            .and_then(|dict| dict.get(b"CF"))
374            .and_then(|object| object.as_dict())
375        {
376            for (name, filter) in filters {
377                let Ok(filter) = filter.as_dict() else {
378                    continue;
379                };
380
381                if filter.get(b"Type").is_ok() && !filter.has_type(b"CryptFilter") {
382                    continue;
383                }
384
385                // Get the Crypt Filter Method (CFM) used, if any, by the PDF reader to decrypt data.
386                let cfm = filter.get(b"CFM").and_then(|object| object.as_name()).ok();
387
388                let crypt_filter: Arc<dyn CryptFilter> = match cfm {
389                    // The application shall ask the security handler for the file encryption key
390                    // and shall implicitly decrypt data using the RC4 algorithm.
391                    Some(b"V2") => Arc::new(Rc4CryptFilter),
392                    // The application shall ask the security handler for the file encryption key
393                    // and shall implicitly decrypt data using the AES-128 algorithm in Cipher
394                    // Block Chaining (CBC) mode with a 16-byte block size and an initialization
395                    // vector that shall be randomly generated and placed as the first 16 bytes in
396                    // the stream or string. The key size (Length) shall be 128 bits.
397                    Some(b"AESV2") => Arc::new(Aes128CryptFilter),
398                    // The application shall ask the security handler for the file encryption key
399                    // and shall implicitly decrypt data using the AES-256 algorithm in Cipher
400                    // Block Chaining (CBC) with padding mode with a 16-byte block size and an
401                    // initialization vector that is randomly generated and placed as the first 16
402                    // bytes in the stream or string. The key size (Length) shall be 256 bits.
403                    Some(b"AESV3") => Arc::new(Aes256CryptFilter),
404                    // The application shall not decrypt data but shall direct the input stream to
405                    // the security handler for decryption.
406                    Some(b"Identity") | None => Arc::new(IdentityCryptFilter),
407                    // Unknown crypt filter method.
408                    _ => continue,
409                };
410
411                crypt_filters.insert(name.to_vec(), crypt_filter);
412            }
413        }
414
415        crypt_filters
416    }
417
418    /// Replaces all encrypted Strings and Streams with their encrypted contents
419    pub fn encrypt(&mut self, state: &EncryptionState) -> Result<()> {
420        if self.is_encrypted() {
421            return Err(Error::AlreadyEncrypted);
422        }
423
424        let encrypted = state.encode()?;
425
426        for (&id, obj) in self.objects.iter_mut() {
427            encryption::encrypt_object(state, id, obj)?;
428        }
429
430        let object_id = self.add_object(encrypted);
431        self.trailer.set(b"Encrypt", Object::Reference(object_id));
432        self.encryption_state = None;
433
434        Ok(())
435    }
436
437    /// Replaces all encrypted Strings and Streams with their decrypted contents
438    pub fn decrypt(&mut self, password: &str) -> Result<()> {
439        if !self.is_encrypted() {
440            return Err(Error::NotEncrypted);
441        }
442
443        let algorithm = PasswordAlgorithm::try_from(&*self)?;
444        let password = algorithm.sanitize_password(password)?;
445        self.decrypt_raw(&password)
446    }
447
448    /// Replaces all encrypted Strings and Streams with their decrypted contents with the password
449    /// provided directly as bytes without sanitization
450    pub fn decrypt_raw<P>(&mut self, password: P) -> Result<()>
451    where
452        P: AsRef<[u8]>,
453    {
454        if !self.is_encrypted() {
455            return Err(Error::NotEncrypted);
456        }
457
458        self.authenticate_raw_password(&password)?;
459
460        // Find the ID of the encryption dict; we'll want to skip it when decrypting
461        let encryption_obj_id = self.trailer.get(b"Encrypt").and_then(Object::as_reference)?;
462
463        let state = EncryptionState::decode(&*self, password)?;
464
465        for (&id, obj) in self.objects.iter_mut() {
466            // The encryption dictionary is not encrypted, leave it alone
467            if id == encryption_obj_id {
468                continue;
469            }
470
471            encryption::decrypt_object(&state, id, obj)?;
472        }
473
474        // Add the objects from the object streams now that they have been decrypted.
475        let mut object_streams = vec![];
476
477        for (_, object) in self.objects.iter_mut() {
478            let Ok(ref mut stream) = object.as_stream_mut() else {
479                continue;
480            };
481
482            if !stream.dict.has_type(b"ObjStm") {
483                continue;
484            }
485
486            let Some(obj_stream) = ObjectStream::new(stream).ok() else {
487                continue;
488            };
489
490            // TODO: Is insert and replace intended behavior?
491            // See https://github.com/J-F-Liu/lopdf/issues/160 for more info
492            object_streams.extend(obj_stream.objects);
493        }
494
495        // Only add entries, but never replace entries
496        for (id, entry) in object_streams {
497            self.objects.entry(id).or_insert(entry);
498        }
499
500        let object_id = self.trailer.remove(b"Encrypt").unwrap().as_reference()?;
501        self.objects.remove(&object_id);
502
503        self.encryption_state = Some(state);
504
505        Ok(())
506    }
507
508    /// Return the PDF document catalog, which is the root of the document's object graph.
509    pub fn catalog(&self) -> Result<&Dictionary> {
510        self.trailer
511            .get(b"Root")
512            .and_then(Object::as_reference)
513            .and_then(|id| self.get_dictionary(id))
514    }
515
516    /// Return a mutable reference to the PDF document catalog, which is the root of the document's
517    /// object graph.
518    pub fn catalog_mut(&mut self) -> Result<&mut Dictionary> {
519        self.trailer
520            .get(b"Root")
521            .and_then(Object::as_reference)
522            .and_then(move |id| self.get_dictionary_mut(id))
523    }
524
525    /// Get page numbers and corresponding object ids.
526    pub fn get_pages(&self) -> BTreeMap<u32, ObjectId> {
527        self.page_iter().enumerate().map(|(i, p)| ((i + 1) as u32, p)).collect()
528    }
529
530    pub fn page_iter(&self) -> impl Iterator<Item = ObjectId> + '_ {
531        PageTreeIter::new(self)
532    }
533
534    /// Get content stream object ids of a page.
535    pub fn get_page_contents(&self, page_id: ObjectId) -> Vec<ObjectId> {
536        let mut streams = vec![];
537        if let Ok(page) = self.get_dictionary(page_id) {
538            let mut nb_deref = 0;
539            // Since we're looking for object IDs, we can't use get_deref
540            // so manually walk any references in contents object
541            if let Ok(mut contents) = page.get(b"Contents") {
542                loop {
543                    match contents {
544                        Object::Reference(id) => match self.objects.get(id) {
545                            None | Some(Object::Stream(_)) => {
546                                streams.push(*id);
547                            }
548                            Some(o) => {
549                                nb_deref += 1;
550                                if nb_deref < Self::DEREF_LIMIT {
551                                    contents = o;
552                                    continue;
553                                }
554                            }
555                        },
556                        Object::Array(arr) => {
557                            for content in arr {
558                                if let Ok(id) = content.as_reference() {
559                                    streams.push(id)
560                                }
561                            }
562                        }
563                        _ => {}
564                    }
565                    break;
566                }
567            }
568        }
569        streams
570    }
571
572    /// Add content to a page. All existing content will be unchanged.
573    pub fn add_page_contents(&mut self, page_id: ObjectId, content: Vec<u8>) -> Result<()> {
574        let page = self.get_dictionary(page_id)?;
575        let mut current_content_list: Vec<Object> = match page.get(b"Contents") {
576            Ok(Object::Reference(id)) => {
577                vec![Object::Reference(*id)]
578            }
579            Ok(Object::Array(arr)) => arr.clone(),
580            _ => vec![],
581        };
582        let content_object_id = self.add_object(Object::Stream(Stream::new(Dictionary::new(), content)));
583        current_content_list.push(Object::Reference(content_object_id));
584
585        let page_mut = self.get_object_mut(page_id).and_then(Object::as_dict_mut)?;
586        page_mut.set("Contents", current_content_list);
587        Ok(())
588    }
589
590    /// Get content of a page.
591    pub fn get_page_content(&self, page_id: ObjectId) -> Result<Vec<u8>> {
592        let mut content = Vec::new();
593        let content_streams = self.get_page_contents(page_id);
594        for object_id in content_streams {
595            if let Ok(content_stream) = self.get_object(object_id).and_then(Object::as_stream) {
596                match content_stream.decompressed_content() {
597                    Ok(data) => content.write_all(&data)?,
598                    Err(_) => content.write_all(&content_stream.content)?,
599                };
600            }
601        }
602        Ok(content)
603    }
604
605    /// Get resources used by a page.
606    pub fn get_page_resources(&self, page_id: ObjectId) -> Result<(Option<&Dictionary>, Vec<ObjectId>)> {
607        fn collect_resources(
608            page_node: &Dictionary, resource_ids: &mut Vec<ObjectId>, doc: &Document,
609            already_seen: &mut HashSet<ObjectId>,
610        ) -> Result<()> {
611            if let Ok(resource_id) = page_node.get(b"Resources").and_then(Object::as_reference) {
612                resource_ids.push(resource_id);
613            }
614            if let Ok(parent_id) = page_node.get(b"Parent").and_then(Object::as_reference) {
615                if already_seen.contains(&parent_id) {
616                    return Err(Error::ReferenceCycle(parent_id));
617                }
618                already_seen.insert(parent_id);
619                let parent_dict = doc.get_dictionary(parent_id)?;
620                collect_resources(parent_dict, resource_ids, doc, already_seen)?;
621            }
622            Ok(())
623        }
624
625        let mut resource_dict = None;
626        let mut resource_ids = Vec::new();
627        if let Ok(page) = self.get_dictionary(page_id) {
628            resource_dict = page.get(b"Resources").and_then(Object::as_dict).ok();
629            collect_resources(page, &mut resource_ids, self, &mut HashSet::new())?;
630        }
631        Ok((resource_dict, resource_ids))
632    }
633
634    /// Get fonts used by a page.
635    pub fn get_page_fonts(&self, page_id: ObjectId) -> Result<BTreeMap<Vec<u8>, &Dictionary>> {
636        fn collect_fonts_from_resources<'a>(
637            resources: &'a Dictionary, fonts: &mut BTreeMap<Vec<u8>, &'a Dictionary>, doc: &'a Document,
638        ) {
639            if let Ok(font) = resources.get(b"Font") {
640                let font_dict = match font {
641                    Object::Reference(id) => doc.get_object(*id).and_then(Object::as_dict).ok(),
642                    Object::Dictionary(dict) => Some(dict),
643                    _ => None,
644                };
645                if let Some(font_dict) = font_dict {
646                    for (name, value) in font_dict.iter() {
647                        let font = match value {
648                            Object::Reference(id) => doc.get_dictionary(*id).ok(),
649                            Object::Dictionary(dict) => Some(dict),
650                            _ => None,
651                        };
652                        if !fonts.contains_key(name) {
653                            font.map(|font| fonts.insert(name.clone(), font));
654                        }
655                    }
656                }
657            }
658        }
659
660        let mut fonts = BTreeMap::new();
661        let (resource_dict, resource_ids) = self.get_page_resources(page_id)?;
662        if let Some(resources) = resource_dict {
663            collect_fonts_from_resources(resources, &mut fonts, self);
664        }
665        for resource_id in resource_ids {
666            if let Ok(resources) = self.get_dictionary(resource_id) {
667                collect_fonts_from_resources(resources, &mut fonts, self);
668            }
669        }
670        Ok(fonts)
671    }
672
673    /// Get the PDF annotations of a page. The /Subtype of each annotation dictionary defines the
674    /// annotation type (Text, Link, Highlight, Underline, Ink, Popup, Widget, etc.). The /Rect of
675    /// an annotation dictionary defines its location on the page.
676    pub fn get_page_annotations(&self, page_id: ObjectId) -> Result<Vec<&Dictionary>> {
677        let mut annotations = vec![];
678        if let Ok(page) = self.get_dictionary(page_id) {
679            match page.get(b"Annots") {
680                Ok(Object::Reference(id)) => self
681                    .get_object(*id)
682                    .and_then(Object::as_array)?
683                    .iter()
684                    .flat_map(Object::as_reference)
685                    .flat_map(|id| self.get_dictionary(id))
686                    .for_each(|a| annotations.push(a)),
687                Ok(Object::Array(a)) => a
688                    .iter()
689                    .flat_map(Object::as_reference)
690                    .flat_map(|id| self.get_dictionary(id))
691                    .for_each(|a| annotations.push(a)),
692                _ => {}
693            }
694        }
695        Ok(annotations)
696    }
697
698    pub fn get_page_images(&'_ self, page_id: ObjectId) -> Result<Vec<PdfImage<'_>>> {
699        let mut images = vec![];
700        if let Ok(page) = self.get_dictionary(page_id) {
701            let resources = self.get_dict_in_dict(page, b"Resources")?;
702            let xobject = match self.get_dict_in_dict(resources, b"XObject") {
703                Ok(xobject) => xobject,
704                Err(err) => match err {
705                    // XObject is optional, no images found
706                    Error::DictKey(_) => return Ok(Vec::default()),
707                    _ => Err(err)?,
708                },
709            };
710
711            for (_, xvalue) in xobject.iter() {
712                let id = xvalue.as_reference()?;
713                let xvalue = self.get_object(id)?;
714                let xvalue = xvalue.as_stream()?;
715                let dict = &xvalue.dict;
716                if dict.get(b"Subtype")?.as_name()? != b"Image" {
717                    continue;
718                }
719                let width = dict.get(b"Width")?.as_i64()?;
720                let height = dict.get(b"Height")?.as_i64()?;
721                let color_space = match dict.get(b"ColorSpace") {
722                    Ok(cs) => match cs {
723                        Object::Array(array) => Some(String::from_utf8_lossy(array[0].as_name()?).to_string()),
724                        Object::Name(name) => Some(String::from_utf8_lossy(name).to_string()),
725                        _ => None,
726                    },
727                    Err(_) => None,
728                };
729                let bits_per_component = match dict.get(b"BitsPerComponent") {
730                    Ok(bpc) => Some(bpc.as_i64()?),
731                    Err(_) => None,
732                };
733                let mut filters = vec![];
734                if let Ok(filter) = dict.get(b"Filter") {
735                    match filter {
736                        Object::Array(array) => {
737                            for obj in array.iter() {
738                                let name = obj.as_name()?;
739                                filters.push(String::from_utf8_lossy(name).to_string());
740                            }
741                        }
742                        Object::Name(name) => {
743                            filters.push(String::from_utf8_lossy(name).to_string());
744                        }
745                        _ => {}
746                    }
747                };
748
749                images.push(PdfImage {
750                    id,
751                    width,
752                    height,
753                    color_space,
754                    bits_per_component,
755                    filters: Some(filters),
756                    content: &xvalue.content,
757                    origin_dict: &xvalue.dict,
758                });
759            }
760        }
761        Ok(images)
762    }
763
764    pub fn decode_text(encoding: &Encoding, bytes: &[u8]) -> Result<String> {
765        debug!("Decoding text with {encoding:#?}");
766        encoding.bytes_to_string(bytes)
767    }
768
769    pub fn encode_text(encoding: &Encoding, text: &str) -> Vec<u8> {
770        encoding.string_to_bytes(text)
771    }
772}
773
774impl Default for Document {
775    fn default() -> Self {
776        Self::new()
777    }
778}
779
780struct PageTreeIter<'a> {
781    doc: &'a Document,
782    stack: Vec<&'a [Object]>,
783    kids: Option<&'a [Object]>,
784    iter_limit: usize,
785}
786
787impl<'a> PageTreeIter<'a> {
788    const PAGE_TREE_DEPTH_LIMIT: usize = 256;
789
790    fn new(doc: &'a Document) -> Self {
791        if let Ok(page_tree_id) = doc
792            .catalog()
793            .and_then(|cat| cat.get(b"Pages"))
794            .and_then(Object::as_reference)
795        {
796            Self {
797                doc,
798                kids: Self::kids(doc, page_tree_id),
799                stack: Vec::with_capacity(32),
800                iter_limit: doc.objects.len(),
801            }
802        } else {
803            Self {
804                doc,
805                kids: None,
806                stack: Vec::new(),
807                iter_limit: doc.objects.len(),
808            }
809        }
810    }
811
812    fn kids(doc: &Document, page_tree_id: ObjectId) -> Option<&[Object]> {
813        doc.get_dictionary(page_tree_id)
814            .and_then(|page_tree| page_tree.get_deref(b"Kids", doc))
815            .and_then(Object::as_array)
816            .map(|k| k.as_slice())
817            .ok()
818    }
819}
820
821impl Iterator for PageTreeIter<'_> {
822    type Item = ObjectId;
823
824    fn next(&mut self) -> Option<Self::Item> {
825        loop {
826            while let Some((kid, new_kids)) = self.kids.and_then(|k| k.split_first()) {
827                if self.iter_limit == 0 {
828                    return None;
829                }
830                self.iter_limit -= 1;
831
832                self.kids = Some(new_kids);
833
834                if let Ok(kid_id) = kid.as_reference() {
835                    if let Ok(type_name) = self.doc.get_dictionary(kid_id).and_then(Dictionary::get_type) {
836                        match type_name {
837                            b"Page" => {
838                                return Some(kid_id);
839                            }
840                            b"Pages" => {
841                                if self.stack.len() < Self::PAGE_TREE_DEPTH_LIMIT {
842                                    let kids = self.kids.unwrap();
843                                    if !kids.is_empty() {
844                                        self.stack.push(kids);
845                                    }
846                                    self.kids = Self::kids(self.doc, kid_id);
847                                }
848                            }
849                            _ => {}
850                        }
851                    }
852                }
853            }
854
855            // Current level exhausted, try to pop.
856            if let kids @ Some(_) = self.stack.pop() {
857                self.kids = kids;
858            } else {
859                return None;
860            }
861        }
862    }
863
864    fn size_hint(&self) -> (usize, Option<usize>) {
865        let kids = self.kids.unwrap_or(&[]);
866
867        let nb_pages: usize = kids
868            .iter()
869            .chain(self.stack.iter().flat_map(|k| k.iter()))
870            .map(|kid| {
871                if let Ok(dict) = kid.as_reference().and_then(|id| self.doc.get_dictionary(id)) {
872                    if let Ok(b"Pages") = dict.get_type() {
873                        let count = dict.get_deref(b"Count", self.doc).and_then(Object::as_i64).unwrap_or(0);
874                        // Don't let page count go backwards in case of an invalid document.
875                        max(0, count) as usize
876                    } else {
877                        1
878                    }
879                } else {
880                    1
881                }
882            })
883            .sum();
884
885        (nb_pages, Some(nb_pages))
886    }
887}
888
889impl std::iter::FusedIterator for PageTreeIter<'_> {}