1use pdf_graphics::{PageBox, Rect};
2
3use crate::error::{PdfError, PdfResult};
4use crate::types::{ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfValue};
5
6#[derive(Debug, Clone)]
7pub struct DocumentCatalog {
8 pub catalog_ref: ObjectRef,
9 pub pages_ref: ObjectRef,
10}
11
12#[derive(Debug, Clone)]
13pub struct PageInfo {
14 pub page_ref: ObjectRef,
15 pub resources: PdfDictionary,
16 pub page_box: PageBox,
17 pub content_refs: Vec<ObjectRef>,
18 pub annotation_refs: Vec<ObjectRef>,
19}
20
21#[derive(Debug, Clone)]
22pub struct ParsedDocument {
23 pub file: PdfFile,
24 pub catalog: DocumentCatalog,
25 pub pages: Vec<PageInfo>,
26}
27
28pub fn build_document(file: PdfFile) -> PdfResult<ParsedDocument> {
29 if file.trailer.contains_key("Encrypt") {
30 return Err(PdfError::Unsupported(
31 "encrypted PDFs are not supported".to_string(),
32 ));
33 }
34
35 let root = file
36 .trailer
37 .get("Root")
38 .ok_or_else(|| PdfError::Corrupt("trailer is missing Root".to_string()))?;
39 let root_ref = match root {
40 PdfValue::Reference(object_ref) => *object_ref,
41 _ => return Err(PdfError::Corrupt("Root is not a reference".to_string())),
42 };
43 let root_dict = file.get_dictionary(root_ref)?;
44 if root_dict.get("Type").and_then(PdfValue::as_name) != Some("Catalog") {
45 return Err(PdfError::Corrupt("Root catalog has wrong type".to_string()));
46 }
47
48 let pages_ref = match root_dict.get("Pages") {
49 Some(PdfValue::Reference(object_ref)) => *object_ref,
50 _ => return Err(PdfError::Corrupt("Catalog is missing Pages".to_string())),
51 };
52 let catalog = DocumentCatalog {
53 catalog_ref: root_ref,
54 pages_ref,
55 };
56
57 let mut pages = Vec::new();
58 let mut visited = std::collections::BTreeSet::new();
59 collect_pages(
60 &file,
61 pages_ref,
62 &mut pages,
63 None,
64 None,
65 None,
66 0,
67 &mut visited,
68 )?;
69
70 Ok(ParsedDocument {
71 file,
72 catalog,
73 pages,
74 })
75}
76
77const MAX_PAGE_TREE_DEPTH: usize = 64;
78
79#[allow(clippy::too_many_arguments)]
80fn collect_pages(
81 file: &PdfFile,
82 node_ref: ObjectRef,
83 output: &mut Vec<PageInfo>,
84 inherited_resources: Option<&PdfDictionary>,
85 inherited_media_box: Option<Rect>,
86 inherited_rotate: Option<i32>,
87 depth: usize,
88 visited: &mut std::collections::BTreeSet<ObjectRef>,
89) -> PdfResult<()> {
90 if depth > MAX_PAGE_TREE_DEPTH {
91 return Err(PdfError::Corrupt(
92 "page tree exceeds maximum depth".to_string(),
93 ));
94 }
95 if !visited.insert(node_ref) {
96 return Err(PdfError::Corrupt("cycle detected in page tree".to_string()));
97 }
98 let dictionary = file.get_dictionary(node_ref)?;
99 match dictionary.get("Type").and_then(PdfValue::as_name) {
100 Some("Pages") => {
101 let resources = dictionary
102 .get("Resources")
103 .map(|value| file.resolve_dict(value))
104 .transpose()?
105 .or(inherited_resources);
106 let media_box = dictionary
107 .get("MediaBox")
108 .map(|value| parse_rect(file.resolve(value)?))
109 .transpose()?
110 .or(inherited_media_box);
111 let rotate = dictionary
112 .get("Rotate")
113 .map(|value| parse_rotation(file.resolve(value)?))
114 .transpose()?
115 .or(inherited_rotate);
116 let kids = dictionary
117 .get("Kids")
118 .and_then(PdfValue::as_array)
119 .ok_or_else(|| PdfError::Corrupt("Pages node is missing Kids".to_string()))?;
120 for kid in kids {
121 let kid_ref = match kid {
122 PdfValue::Reference(object_ref) => *object_ref,
123 _ => {
124 return Err(PdfError::Corrupt(
125 "Pages Kids entry is not an object reference".to_string(),
126 ));
127 }
128 };
129 collect_pages(
130 file,
131 kid_ref,
132 output,
133 resources,
134 media_box,
135 rotate,
136 depth + 1,
137 visited,
138 )?;
139 }
140 }
141 Some("Page") => {
142 let resources = dictionary
143 .get("Resources")
144 .map(|value| file.resolve_dict(value))
145 .transpose()?
146 .or(inherited_resources)
147 .cloned()
148 .ok_or_else(|| PdfError::Corrupt("page is missing Resources".to_string()))?;
149 let media_box = dictionary
150 .get("MediaBox")
151 .map(|value| parse_rect(file.resolve(value)?))
152 .transpose()?
153 .or(inherited_media_box)
154 .ok_or_else(|| PdfError::Corrupt("page is missing MediaBox".to_string()))?;
155 let crop_box = dictionary
156 .get("CropBox")
157 .map(|value| parse_rect(file.resolve(value)?))
158 .transpose()?
159 .unwrap_or(media_box);
160 let rotate = dictionary
161 .get("Rotate")
162 .map(|value| parse_rotation(file.resolve(value)?))
163 .transpose()?
164 .or(inherited_rotate)
165 .unwrap_or(0);
166 let content_refs = parse_contents_refs(dictionary)?;
167 let annotation_refs = dictionary
168 .get("Annots")
169 .and_then(PdfValue::as_array)
170 .map(|entries| {
171 entries
172 .iter()
173 .map(|entry| match entry {
174 PdfValue::Reference(object_ref) => Ok(*object_ref),
175 _ => Err(PdfError::Corrupt(
176 "annotation entry is not a reference".to_string(),
177 )),
178 })
179 .collect::<PdfResult<Vec<_>>>()
180 })
181 .transpose()?
182 .unwrap_or_default();
183 output.push(PageInfo {
184 page_ref: node_ref,
185 resources,
186 page_box: PageBox {
187 media_box,
188 crop_box,
189 rotate,
190 },
191 content_refs,
192 annotation_refs,
193 });
194 }
195 other => {
196 return Err(PdfError::Corrupt(format!(
197 "unexpected page tree node type: {other:?}"
198 )));
199 }
200 }
201 Ok(())
202}
203
204fn parse_rotation(value: &PdfValue) -> PdfResult<i32> {
205 value
206 .as_integer()
207 .map(|value| value as i32)
208 .ok_or_else(|| PdfError::Corrupt("Rotate is not an integer".to_string()))
209}
210
211fn parse_rect(value: &PdfValue) -> PdfResult<Rect> {
212 let array = value
213 .as_array()
214 .ok_or_else(|| PdfError::Corrupt("expected box array".to_string()))?;
215 if array.len() != 4 {
216 return Err(PdfError::Corrupt(
217 "box array must contain four numbers".to_string(),
218 ));
219 }
220 let left = array[0]
221 .as_number()
222 .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
223 let bottom = array[1]
224 .as_number()
225 .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
226 let right = array[2]
227 .as_number()
228 .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
229 let top = array[3]
230 .as_number()
231 .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
232 Ok(Rect {
233 x: left,
234 y: bottom,
235 width: right - left,
236 height: top - bottom,
237 }
238 .normalize())
239}
240
241fn parse_contents_refs(page: &PdfDictionary) -> PdfResult<Vec<ObjectRef>> {
242 match page.get("Contents") {
243 Some(PdfValue::Reference(object_ref)) => Ok(vec![*object_ref]),
244 Some(PdfValue::Array(entries)) => entries
245 .iter()
246 .map(|entry| match entry {
247 PdfValue::Reference(object_ref) => Ok(*object_ref),
248 _ => Err(PdfError::Unsupported(
249 "direct content streams are not supported".to_string(),
250 )),
251 })
252 .collect(),
253 Some(PdfValue::Dictionary(_)) => Err(PdfError::Unsupported(
254 "direct content streams are not supported".to_string(),
255 )),
256 Some(_) => Err(PdfError::Corrupt(
257 "page Contents entry is not a reference or array".to_string(),
258 )),
259 None => Ok(Vec::new()),
260 }
261}
262
263pub fn get_stream(file: &PdfFile, object_ref: ObjectRef) -> PdfResult<&crate::types::PdfStream> {
264 match file.get_object(object_ref)? {
265 PdfObject::Stream(stream) => Ok(stream),
266 PdfObject::Value(_) => Err(PdfError::Corrupt(format!(
267 "expected stream object at {} {}",
268 object_ref.object_number, object_ref.generation
269 ))),
270 }
271}