Skip to main content

pdf_objects/
document.rs

1use pdf_graphics::{PageBox, Rect};
2
3use crate::error::{PdfError, PdfResult};
4use crate::types::{ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfValue};
5
6#[derive(Debug, Clone)]
7pub struct DocumentCatalog {
8    pub catalog_ref: ObjectRef,
9    pub pages_ref: ObjectRef,
10}
11
12#[derive(Debug, Clone)]
13pub struct PageInfo {
14    pub page_ref: ObjectRef,
15    pub resources: PdfDictionary,
16    pub page_box: PageBox,
17    pub content_refs: Vec<ObjectRef>,
18    pub annotation_refs: Vec<ObjectRef>,
19}
20
21#[derive(Debug, Clone)]
22pub struct ParsedDocument {
23    pub file: PdfFile,
24    pub catalog: DocumentCatalog,
25    pub pages: Vec<PageInfo>,
26}
27
28pub fn build_document(file: PdfFile) -> PdfResult<ParsedDocument> {
29    if file.trailer.contains_key("Encrypt") {
30        return Err(PdfError::Unsupported(
31            "encrypted PDFs are not supported".to_string(),
32        ));
33    }
34
35    let root = file
36        .trailer
37        .get("Root")
38        .ok_or_else(|| PdfError::Corrupt("trailer is missing Root".to_string()))?;
39    let root_ref = match root {
40        PdfValue::Reference(object_ref) => *object_ref,
41        _ => return Err(PdfError::Corrupt("Root is not a reference".to_string())),
42    };
43    let root_dict = file.get_dictionary(root_ref)?;
44    if root_dict.get("Type").and_then(PdfValue::as_name) != Some("Catalog") {
45        return Err(PdfError::Corrupt("Root catalog has wrong type".to_string()));
46    }
47
48    let pages_ref = match root_dict.get("Pages") {
49        Some(PdfValue::Reference(object_ref)) => *object_ref,
50        _ => return Err(PdfError::Corrupt("Catalog is missing Pages".to_string())),
51    };
52    let catalog = DocumentCatalog {
53        catalog_ref: root_ref,
54        pages_ref,
55    };
56
57    let mut pages = Vec::new();
58    let mut visited = std::collections::BTreeSet::new();
59    collect_pages(
60        &file,
61        pages_ref,
62        &mut pages,
63        None,
64        None,
65        None,
66        0,
67        &mut visited,
68    )?;
69
70    Ok(ParsedDocument {
71        file,
72        catalog,
73        pages,
74    })
75}
76
77const MAX_PAGE_TREE_DEPTH: usize = 64;
78
79#[allow(clippy::too_many_arguments)]
80fn collect_pages(
81    file: &PdfFile,
82    node_ref: ObjectRef,
83    output: &mut Vec<PageInfo>,
84    inherited_resources: Option<&PdfDictionary>,
85    inherited_media_box: Option<Rect>,
86    inherited_rotate: Option<i32>,
87    depth: usize,
88    visited: &mut std::collections::BTreeSet<ObjectRef>,
89) -> PdfResult<()> {
90    if depth > MAX_PAGE_TREE_DEPTH {
91        return Err(PdfError::Corrupt(
92            "page tree exceeds maximum depth".to_string(),
93        ));
94    }
95    if !visited.insert(node_ref) {
96        return Err(PdfError::Corrupt("cycle detected in page tree".to_string()));
97    }
98    let dictionary = file.get_dictionary(node_ref)?;
99    match dictionary.get("Type").and_then(PdfValue::as_name) {
100        Some("Pages") => {
101            let resources = dictionary
102                .get("Resources")
103                .map(|value| file.resolve_dict(value))
104                .transpose()?
105                .or(inherited_resources);
106            let media_box = dictionary
107                .get("MediaBox")
108                .map(|value| parse_rect(file.resolve(value)?))
109                .transpose()?
110                .or(inherited_media_box);
111            let rotate = dictionary
112                .get("Rotate")
113                .map(|value| parse_rotation(file.resolve(value)?))
114                .transpose()?
115                .or(inherited_rotate);
116            let kids = dictionary
117                .get("Kids")
118                .and_then(PdfValue::as_array)
119                .ok_or_else(|| PdfError::Corrupt("Pages node is missing Kids".to_string()))?;
120            for kid in kids {
121                let kid_ref = match kid {
122                    PdfValue::Reference(object_ref) => *object_ref,
123                    _ => {
124                        return Err(PdfError::Corrupt(
125                            "Pages Kids entry is not an object reference".to_string(),
126                        ));
127                    }
128                };
129                collect_pages(
130                    file,
131                    kid_ref,
132                    output,
133                    resources,
134                    media_box,
135                    rotate,
136                    depth + 1,
137                    visited,
138                )?;
139            }
140        }
141        Some("Page") => {
142            let resources = dictionary
143                .get("Resources")
144                .map(|value| file.resolve_dict(value))
145                .transpose()?
146                .or(inherited_resources)
147                .cloned()
148                .ok_or_else(|| PdfError::Corrupt("page is missing Resources".to_string()))?;
149            let media_box = dictionary
150                .get("MediaBox")
151                .map(|value| parse_rect(file.resolve(value)?))
152                .transpose()?
153                .or(inherited_media_box)
154                .ok_or_else(|| PdfError::Corrupt("page is missing MediaBox".to_string()))?;
155            let crop_box = dictionary
156                .get("CropBox")
157                .map(|value| parse_rect(file.resolve(value)?))
158                .transpose()?
159                .unwrap_or(media_box);
160            let rotate = dictionary
161                .get("Rotate")
162                .map(|value| parse_rotation(file.resolve(value)?))
163                .transpose()?
164                .or(inherited_rotate)
165                .unwrap_or(0);
166            let content_refs = parse_contents_refs(dictionary)?;
167            let annotation_refs = dictionary
168                .get("Annots")
169                .and_then(PdfValue::as_array)
170                .map(|entries| {
171                    entries
172                        .iter()
173                        .map(|entry| match entry {
174                            PdfValue::Reference(object_ref) => Ok(*object_ref),
175                            _ => Err(PdfError::Corrupt(
176                                "annotation entry is not a reference".to_string(),
177                            )),
178                        })
179                        .collect::<PdfResult<Vec<_>>>()
180                })
181                .transpose()?
182                .unwrap_or_default();
183            output.push(PageInfo {
184                page_ref: node_ref,
185                resources,
186                page_box: PageBox {
187                    media_box,
188                    crop_box,
189                    rotate,
190                },
191                content_refs,
192                annotation_refs,
193            });
194        }
195        other => {
196            return Err(PdfError::Corrupt(format!(
197                "unexpected page tree node type: {other:?}"
198            )));
199        }
200    }
201    Ok(())
202}
203
204fn parse_rotation(value: &PdfValue) -> PdfResult<i32> {
205    value
206        .as_integer()
207        .map(|value| value as i32)
208        .ok_or_else(|| PdfError::Corrupt("Rotate is not an integer".to_string()))
209}
210
211fn parse_rect(value: &PdfValue) -> PdfResult<Rect> {
212    let array = value
213        .as_array()
214        .ok_or_else(|| PdfError::Corrupt("expected box array".to_string()))?;
215    if array.len() != 4 {
216        return Err(PdfError::Corrupt(
217            "box array must contain four numbers".to_string(),
218        ));
219    }
220    let left = array[0]
221        .as_number()
222        .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
223    let bottom = array[1]
224        .as_number()
225        .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
226    let right = array[2]
227        .as_number()
228        .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
229    let top = array[3]
230        .as_number()
231        .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
232    Ok(Rect {
233        x: left,
234        y: bottom,
235        width: right - left,
236        height: top - bottom,
237    }
238    .normalize())
239}
240
241fn parse_contents_refs(page: &PdfDictionary) -> PdfResult<Vec<ObjectRef>> {
242    match page.get("Contents") {
243        Some(PdfValue::Reference(object_ref)) => Ok(vec![*object_ref]),
244        Some(PdfValue::Array(entries)) => entries
245            .iter()
246            .map(|entry| match entry {
247                PdfValue::Reference(object_ref) => Ok(*object_ref),
248                _ => Err(PdfError::Unsupported(
249                    "direct content streams are not supported".to_string(),
250                )),
251            })
252            .collect(),
253        Some(PdfValue::Dictionary(_)) => Err(PdfError::Unsupported(
254            "direct content streams are not supported".to_string(),
255        )),
256        Some(_) => Err(PdfError::Corrupt(
257            "page Contents entry is not a reference or array".to_string(),
258        )),
259        None => Ok(Vec::new()),
260    }
261}
262
263pub fn get_stream(file: &PdfFile, object_ref: ObjectRef) -> PdfResult<&crate::types::PdfStream> {
264    match file.get_object(object_ref)? {
265        PdfObject::Stream(stream) => Ok(stream),
266        PdfObject::Value(_) => Err(PdfError::Corrupt(format!(
267            "expected stream object at {} {}",
268            object_ref.object_number, object_ref.generation
269        ))),
270    }
271}