Skip to main content

pdf_objects/
document.rs

1use pdf_graphics::{PageBox, Rect};
2
3use crate::error::{PdfError, PdfResult};
4use crate::types::{ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfValue};
5
6#[derive(Debug, Clone)]
7pub struct DocumentCatalog {
8    pub catalog_ref: ObjectRef,
9    pub pages_ref: ObjectRef,
10}
11
12#[derive(Debug, Clone)]
13pub struct PageInfo {
14    pub page_ref: ObjectRef,
15    pub resources: PdfDictionary,
16    pub page_box: PageBox,
17    pub content_refs: Vec<ObjectRef>,
18    pub annotation_refs: Vec<ObjectRef>,
19}
20
21#[derive(Debug, Clone)]
22pub struct ParsedDocument {
23    pub file: PdfFile,
24    pub catalog: DocumentCatalog,
25    pub pages: Vec<PageInfo>,
26}
27
28pub fn build_document(file: PdfFile) -> PdfResult<ParsedDocument> {
29    // Encrypted PDFs are decrypted in-place earlier in the pipeline
30    // (parser::decrypt_document_if_encrypted removes the /Encrypt entry from
31    // the trailer on success). A trailer that still contains /Encrypt by the
32    // time we get here means the Standard Security Handler code path was
33    // skipped entirely — an unsupported scheme, for example.
34    if file.trailer.contains_key("Encrypt") {
35        return Err(PdfError::Unsupported(
36            "encrypted PDFs are not supported".to_string(),
37        ));
38    }
39
40    let root = file
41        .trailer
42        .get("Root")
43        .ok_or_else(|| PdfError::Corrupt("trailer is missing Root".to_string()))?;
44    let root_ref = match root {
45        PdfValue::Reference(object_ref) => *object_ref,
46        _ => return Err(PdfError::Corrupt("Root is not a reference".to_string())),
47    };
48    let root_dict = file.get_dictionary(root_ref)?;
49    if root_dict.get("Type").and_then(PdfValue::as_name) != Some("Catalog") {
50        return Err(PdfError::Corrupt("Root catalog has wrong type".to_string()));
51    }
52
53    let pages_ref = match root_dict.get("Pages") {
54        Some(PdfValue::Reference(object_ref)) => *object_ref,
55        _ => return Err(PdfError::Corrupt("Catalog is missing Pages".to_string())),
56    };
57    let catalog = DocumentCatalog {
58        catalog_ref: root_ref,
59        pages_ref,
60    };
61
62    let mut pages = Vec::new();
63    let mut visited = std::collections::BTreeSet::new();
64    collect_pages(
65        &file,
66        pages_ref,
67        &mut pages,
68        None,
69        None,
70        None,
71        0,
72        &mut visited,
73    )?;
74
75    Ok(ParsedDocument {
76        file,
77        catalog,
78        pages,
79    })
80}
81
82const MAX_PAGE_TREE_DEPTH: usize = 64;
83
84#[allow(clippy::too_many_arguments)]
85fn collect_pages(
86    file: &PdfFile,
87    node_ref: ObjectRef,
88    output: &mut Vec<PageInfo>,
89    inherited_resources: Option<&PdfDictionary>,
90    inherited_media_box: Option<Rect>,
91    inherited_rotate: Option<i32>,
92    depth: usize,
93    visited: &mut std::collections::BTreeSet<ObjectRef>,
94) -> PdfResult<()> {
95    if depth > MAX_PAGE_TREE_DEPTH {
96        return Err(PdfError::Corrupt(
97            "page tree exceeds maximum depth".to_string(),
98        ));
99    }
100    if !visited.insert(node_ref) {
101        return Err(PdfError::Corrupt("cycle detected in page tree".to_string()));
102    }
103    let dictionary = file.get_dictionary(node_ref)?;
104    match dictionary.get("Type").and_then(PdfValue::as_name) {
105        Some("Pages") => {
106            let resources = dictionary
107                .get("Resources")
108                .map(|value| file.resolve_dict(value))
109                .transpose()?
110                .or(inherited_resources);
111            let media_box = dictionary
112                .get("MediaBox")
113                .map(|value| parse_rect(file.resolve(value)?))
114                .transpose()?
115                .or(inherited_media_box);
116            let rotate = dictionary
117                .get("Rotate")
118                .map(|value| parse_rotation(file.resolve(value)?))
119                .transpose()?
120                .or(inherited_rotate);
121            let kids = dictionary
122                .get("Kids")
123                .and_then(PdfValue::as_array)
124                .ok_or_else(|| PdfError::Corrupt("Pages node is missing Kids".to_string()))?;
125            for kid in kids {
126                let kid_ref = match kid {
127                    PdfValue::Reference(object_ref) => *object_ref,
128                    _ => {
129                        return Err(PdfError::Corrupt(
130                            "Pages Kids entry is not an object reference".to_string(),
131                        ));
132                    }
133                };
134                collect_pages(
135                    file,
136                    kid_ref,
137                    output,
138                    resources,
139                    media_box,
140                    rotate,
141                    depth + 1,
142                    visited,
143                )?;
144            }
145        }
146        Some("Page") => {
147            let resources = dictionary
148                .get("Resources")
149                .map(|value| file.resolve_dict(value))
150                .transpose()?
151                .or(inherited_resources)
152                .cloned()
153                .ok_or_else(|| PdfError::Corrupt("page is missing Resources".to_string()))?;
154            let media_box = dictionary
155                .get("MediaBox")
156                .map(|value| parse_rect(file.resolve(value)?))
157                .transpose()?
158                .or(inherited_media_box)
159                .ok_or_else(|| PdfError::Corrupt("page is missing MediaBox".to_string()))?;
160            let crop_box = dictionary
161                .get("CropBox")
162                .map(|value| parse_rect(file.resolve(value)?))
163                .transpose()?
164                .unwrap_or(media_box);
165            let rotate = dictionary
166                .get("Rotate")
167                .map(|value| parse_rotation(file.resolve(value)?))
168                .transpose()?
169                .or(inherited_rotate)
170                .unwrap_or(0);
171            let content_refs = parse_contents_refs(dictionary)?;
172            let annotation_refs = dictionary
173                .get("Annots")
174                .and_then(PdfValue::as_array)
175                .map(|entries| {
176                    entries
177                        .iter()
178                        .map(|entry| match entry {
179                            PdfValue::Reference(object_ref) => Ok(*object_ref),
180                            _ => Err(PdfError::Corrupt(
181                                "annotation entry is not a reference".to_string(),
182                            )),
183                        })
184                        .collect::<PdfResult<Vec<_>>>()
185                })
186                .transpose()?
187                .unwrap_or_default();
188            output.push(PageInfo {
189                page_ref: node_ref,
190                resources,
191                page_box: PageBox {
192                    media_box,
193                    crop_box,
194                    rotate,
195                },
196                content_refs,
197                annotation_refs,
198            });
199        }
200        other => {
201            return Err(PdfError::Corrupt(format!(
202                "unexpected page tree node type: {other:?}"
203            )));
204        }
205    }
206    Ok(())
207}
208
209fn parse_rotation(value: &PdfValue) -> PdfResult<i32> {
210    value
211        .as_integer()
212        .map(|value| value as i32)
213        .ok_or_else(|| PdfError::Corrupt("Rotate is not an integer".to_string()))
214}
215
216fn parse_rect(value: &PdfValue) -> PdfResult<Rect> {
217    let array = value
218        .as_array()
219        .ok_or_else(|| PdfError::Corrupt("expected box array".to_string()))?;
220    if array.len() != 4 {
221        return Err(PdfError::Corrupt(
222            "box array must contain four numbers".to_string(),
223        ));
224    }
225    let left = array[0]
226        .as_number()
227        .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
228    let bottom = array[1]
229        .as_number()
230        .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
231    let right = array[2]
232        .as_number()
233        .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
234    let top = array[3]
235        .as_number()
236        .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
237    Ok(Rect {
238        x: left,
239        y: bottom,
240        width: right - left,
241        height: top - bottom,
242    }
243    .normalize())
244}
245
246fn parse_contents_refs(page: &PdfDictionary) -> PdfResult<Vec<ObjectRef>> {
247    match page.get("Contents") {
248        Some(PdfValue::Reference(object_ref)) => Ok(vec![*object_ref]),
249        Some(PdfValue::Array(entries)) => entries
250            .iter()
251            .map(|entry| match entry {
252                PdfValue::Reference(object_ref) => Ok(*object_ref),
253                _ => Err(PdfError::Unsupported(
254                    "direct content streams are not supported".to_string(),
255                )),
256            })
257            .collect(),
258        Some(PdfValue::Dictionary(_)) => Err(PdfError::Unsupported(
259            "direct content streams are not supported".to_string(),
260        )),
261        Some(_) => Err(PdfError::Corrupt(
262            "page Contents entry is not a reference or array".to_string(),
263        )),
264        None => Ok(Vec::new()),
265    }
266}
267
268pub fn get_stream(file: &PdfFile, object_ref: ObjectRef) -> PdfResult<&crate::types::PdfStream> {
269    match file.get_object(object_ref)? {
270        PdfObject::Stream(stream) => Ok(stream),
271        PdfObject::Value(_) => Err(PdfError::Corrupt(format!(
272            "expected stream object at {} {}",
273            object_ref.object_number, object_ref.generation
274        ))),
275    }
276}