1use pdf_graphics::{PageBox, Rect};
2
3use crate::error::{PdfError, PdfResult};
4use crate::types::{ObjectRef, PdfDictionary, PdfFile, PdfObject, PdfValue};
5
6#[derive(Debug, Clone)]
7pub struct DocumentCatalog {
8 pub catalog_ref: ObjectRef,
9 pub pages_ref: ObjectRef,
10}
11
12#[derive(Debug, Clone)]
13pub struct PageInfo {
14 pub page_ref: ObjectRef,
15 pub resources: PdfDictionary,
16 pub page_box: PageBox,
17 pub content_refs: Vec<ObjectRef>,
18 pub annotation_refs: Vec<ObjectRef>,
19}
20
21#[derive(Debug, Clone)]
22pub struct ParsedDocument {
23 pub file: PdfFile,
24 pub catalog: DocumentCatalog,
25 pub pages: Vec<PageInfo>,
26}
27
28pub fn build_document(file: PdfFile) -> PdfResult<ParsedDocument> {
29 if file.trailer.contains_key("Encrypt") {
35 return Err(PdfError::Unsupported(
36 "encrypted PDFs are not supported".to_string(),
37 ));
38 }
39
40 let root = file
41 .trailer
42 .get("Root")
43 .ok_or_else(|| PdfError::Corrupt("trailer is missing Root".to_string()))?;
44 let root_ref = match root {
45 PdfValue::Reference(object_ref) => *object_ref,
46 _ => return Err(PdfError::Corrupt("Root is not a reference".to_string())),
47 };
48 let root_dict = file.get_dictionary(root_ref)?;
49 if root_dict.get("Type").and_then(PdfValue::as_name) != Some("Catalog") {
50 return Err(PdfError::Corrupt("Root catalog has wrong type".to_string()));
51 }
52
53 let pages_ref = match root_dict.get("Pages") {
54 Some(PdfValue::Reference(object_ref)) => *object_ref,
55 _ => return Err(PdfError::Corrupt("Catalog is missing Pages".to_string())),
56 };
57 let catalog = DocumentCatalog {
58 catalog_ref: root_ref,
59 pages_ref,
60 };
61
62 let mut pages = Vec::new();
63 let mut visited = std::collections::BTreeSet::new();
64 collect_pages(
65 &file,
66 pages_ref,
67 &mut pages,
68 None,
69 None,
70 None,
71 0,
72 &mut visited,
73 )?;
74
75 Ok(ParsedDocument {
76 file,
77 catalog,
78 pages,
79 })
80}
81
82const MAX_PAGE_TREE_DEPTH: usize = 64;
83
84#[allow(clippy::too_many_arguments)]
85fn collect_pages(
86 file: &PdfFile,
87 node_ref: ObjectRef,
88 output: &mut Vec<PageInfo>,
89 inherited_resources: Option<&PdfDictionary>,
90 inherited_media_box: Option<Rect>,
91 inherited_rotate: Option<i32>,
92 depth: usize,
93 visited: &mut std::collections::BTreeSet<ObjectRef>,
94) -> PdfResult<()> {
95 if depth > MAX_PAGE_TREE_DEPTH {
96 return Err(PdfError::Corrupt(
97 "page tree exceeds maximum depth".to_string(),
98 ));
99 }
100 if !visited.insert(node_ref) {
101 return Err(PdfError::Corrupt("cycle detected in page tree".to_string()));
102 }
103 let dictionary = file.get_dictionary(node_ref)?;
104 match dictionary.get("Type").and_then(PdfValue::as_name) {
105 Some("Pages") => {
106 let resources = dictionary
107 .get("Resources")
108 .map(|value| file.resolve_dict(value))
109 .transpose()?
110 .or(inherited_resources);
111 let media_box = dictionary
112 .get("MediaBox")
113 .map(|value| parse_rect(file.resolve(value)?))
114 .transpose()?
115 .or(inherited_media_box);
116 let rotate = dictionary
117 .get("Rotate")
118 .map(|value| parse_rotation(file.resolve(value)?))
119 .transpose()?
120 .or(inherited_rotate);
121 let kids = dictionary
122 .get("Kids")
123 .and_then(PdfValue::as_array)
124 .ok_or_else(|| PdfError::Corrupt("Pages node is missing Kids".to_string()))?;
125 for kid in kids {
126 let kid_ref = match kid {
127 PdfValue::Reference(object_ref) => *object_ref,
128 _ => {
129 return Err(PdfError::Corrupt(
130 "Pages Kids entry is not an object reference".to_string(),
131 ));
132 }
133 };
134 collect_pages(
135 file,
136 kid_ref,
137 output,
138 resources,
139 media_box,
140 rotate,
141 depth + 1,
142 visited,
143 )?;
144 }
145 }
146 Some("Page") => {
147 let resources = dictionary
148 .get("Resources")
149 .map(|value| file.resolve_dict(value))
150 .transpose()?
151 .or(inherited_resources)
152 .cloned()
153 .ok_or_else(|| PdfError::Corrupt("page is missing Resources".to_string()))?;
154 let media_box = dictionary
155 .get("MediaBox")
156 .map(|value| parse_rect(file.resolve(value)?))
157 .transpose()?
158 .or(inherited_media_box)
159 .ok_or_else(|| PdfError::Corrupt("page is missing MediaBox".to_string()))?;
160 let crop_box = dictionary
161 .get("CropBox")
162 .map(|value| parse_rect(file.resolve(value)?))
163 .transpose()?
164 .unwrap_or(media_box);
165 let rotate = dictionary
166 .get("Rotate")
167 .map(|value| parse_rotation(file.resolve(value)?))
168 .transpose()?
169 .or(inherited_rotate)
170 .unwrap_or(0);
171 let content_refs = parse_contents_refs(dictionary)?;
172 let annotation_refs = dictionary
173 .get("Annots")
174 .and_then(PdfValue::as_array)
175 .map(|entries| {
176 entries
177 .iter()
178 .map(|entry| match entry {
179 PdfValue::Reference(object_ref) => Ok(*object_ref),
180 _ => Err(PdfError::Corrupt(
181 "annotation entry is not a reference".to_string(),
182 )),
183 })
184 .collect::<PdfResult<Vec<_>>>()
185 })
186 .transpose()?
187 .unwrap_or_default();
188 output.push(PageInfo {
189 page_ref: node_ref,
190 resources,
191 page_box: PageBox {
192 media_box,
193 crop_box,
194 rotate,
195 },
196 content_refs,
197 annotation_refs,
198 });
199 }
200 other => {
201 return Err(PdfError::Corrupt(format!(
202 "unexpected page tree node type: {other:?}"
203 )));
204 }
205 }
206 Ok(())
207}
208
209fn parse_rotation(value: &PdfValue) -> PdfResult<i32> {
210 value
211 .as_integer()
212 .map(|value| value as i32)
213 .ok_or_else(|| PdfError::Corrupt("Rotate is not an integer".to_string()))
214}
215
216fn parse_rect(value: &PdfValue) -> PdfResult<Rect> {
217 let array = value
218 .as_array()
219 .ok_or_else(|| PdfError::Corrupt("expected box array".to_string()))?;
220 if array.len() != 4 {
221 return Err(PdfError::Corrupt(
222 "box array must contain four numbers".to_string(),
223 ));
224 }
225 let left = array[0]
226 .as_number()
227 .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
228 let bottom = array[1]
229 .as_number()
230 .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
231 let right = array[2]
232 .as_number()
233 .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
234 let top = array[3]
235 .as_number()
236 .ok_or_else(|| PdfError::Corrupt("invalid box value".to_string()))?;
237 Ok(Rect {
238 x: left,
239 y: bottom,
240 width: right - left,
241 height: top - bottom,
242 }
243 .normalize())
244}
245
246fn parse_contents_refs(page: &PdfDictionary) -> PdfResult<Vec<ObjectRef>> {
247 match page.get("Contents") {
248 Some(PdfValue::Reference(object_ref)) => Ok(vec![*object_ref]),
249 Some(PdfValue::Array(entries)) => entries
250 .iter()
251 .map(|entry| match entry {
252 PdfValue::Reference(object_ref) => Ok(*object_ref),
253 _ => Err(PdfError::Unsupported(
254 "direct content streams are not supported".to_string(),
255 )),
256 })
257 .collect(),
258 Some(PdfValue::Dictionary(_)) => Err(PdfError::Unsupported(
259 "direct content streams are not supported".to_string(),
260 )),
261 Some(_) => Err(PdfError::Corrupt(
262 "page Contents entry is not a reference or array".to_string(),
263 )),
264 None => Ok(Vec::new()),
265 }
266}
267
268pub fn get_stream(file: &PdfFile, object_ref: ObjectRef) -> PdfResult<&crate::types::PdfStream> {
269 match file.get_object(object_ref)? {
270 PdfObject::Stream(stream) => Ok(stream),
271 PdfObject::Value(_) => Err(PdfError::Corrupt(format!(
272 "expected stream object at {} {}",
273 object_ref.object_number, object_ref.generation
274 ))),
275 }
276}