1use std::borrow::Cow;
2use std::collections::{HashMap, HashSet};
3
4use tracing::warn;
5use zpdf_core::{ObjectId, PdfDict, PdfObject, Rect, Result};
6use zpdf_parser::PdfFile;
7
/// Upper bound on the number of `/Parent` links followed while walking up the
/// page tree to gather inherited attributes. Once the walk exceeds this many
/// hops it logs a warning and stops, keeping whatever was found so far.
pub(crate) const MAX_PAGE_TREE_DEPTH: usize = 64;
12
/// A single page, with inheritable attributes already resolved against its
/// page-tree ancestors.
#[derive(Debug)]
pub struct PdfPage {
    /// Object id the page dictionary was loaded from.
    pub id: ObjectId,
    /// `/MediaBox`, taken from the page or the nearest ancestor that has one.
    pub media_box: Rect,
    /// `/CropBox`; equals `media_box` when neither the page nor an ancestor
    /// supplies one.
    pub crop_box: Rect,
    /// `/Rotate` value as stored in the file; `0` when absent.
    pub rotate: i32,
    /// Parsed `/Resources`; empty when absent.
    pub resources: ResourceDict,
    /// Indirect references found under the page's own `/Contents` entry.
    pub contents: Vec<ObjectId>,
    /// Indirect references found in the page's own `/Annots` array.
    pub annots: Vec<ObjectId>,
}
24
/// Flattened view of a `/Resources` dictionary, keyed by resource name.
///
/// Entries given as indirect references are kept as [`ObjectId`]s; the
/// `*_inline` maps hold entries that were written directly inside the
/// resource dictionary.
#[derive(Debug, Default)]
pub struct ResourceDict {
    /// `/Font` entries given as indirect references.
    pub fonts: HashMap<String, ObjectId>,
    /// `/XObject` entries given as indirect references.
    pub xobjects: HashMap<String, ObjectId>,
    /// `/ExtGState` entries given as indirect references.
    pub ext_g_state: HashMap<String, ObjectId>,
    /// `/ExtGState` entries given as direct dictionaries.
    pub ext_g_state_inline: HashMap<String, zpdf_core::PdfDict>,
    /// `/ColorSpace` entries given as indirect references.
    pub color_spaces: HashMap<String, ObjectId>,
    /// `/ColorSpace` entries given directly as an array or a name.
    pub color_spaces_inline: HashMap<String, PdfObject>,
    /// `/Pattern` entries given as indirect references.
    pub patterns: HashMap<String, ObjectId>,
    /// `/Shading` entries given as indirect references.
    pub shadings: HashMap<String, ObjectId>,
    /// `/Shading` entries given as direct dictionaries (stored as the whole object).
    pub shadings_inline: HashMap<String, PdfObject>,
    /// `/Properties` entries given as indirect references.
    pub properties: HashMap<String, ObjectId>,
    /// `/Properties` entries given as direct dictionaries.
    pub properties_inline: HashMap<String, zpdf_core::PdfDict>,
}
42
43impl PdfPage {
44 pub fn from_object(file: &PdfFile, page_id: ObjectId) -> Result<Self> {
45 let obj = file.resolve(page_id)?;
46 let dict = obj.as_dict()?;
47
48 let inherited = InheritedAttrs::gather(file, dict);
52
53 let media_box = inherited
54 .media_box
55 .ok_or_else(|| zpdf_core::Error::MissingKey("MediaBox".into()))?;
56 let crop_box = inherited.crop_box.unwrap_or(media_box);
57 let rotate = inherited.rotate.unwrap_or(0);
58 let resources = inherited.resources.unwrap_or_default();
59
60 let contents = Self::collect_content_refs(file, dict.get("Contents"));
61 let annots = Self::collect_annot_refs(file, dict.get("Annots"));
62
63 Ok(Self {
64 id: page_id,
65 media_box,
66 crop_box,
67 rotate,
68 resources,
69 contents,
70 annots,
71 })
72 }
73
74 fn collect_content_refs(file: &PdfFile, contents: Option<&PdfObject>) -> Vec<ObjectId> {
80 fn refs_from_array(arr: &[PdfObject]) -> Vec<ObjectId> {
81 arr.iter()
82 .filter_map(|o| match o {
83 PdfObject::Ref(r) => Some(*r),
84 _ => None,
85 })
86 .collect()
87 }
88 match contents {
89 Some(PdfObject::Array(arr)) => refs_from_array(arr),
90 Some(PdfObject::Ref(r)) => match file.resolve(*r) {
91 Ok(PdfObject::Array(arr)) => refs_from_array(&arr),
93 Ok(PdfObject::Stream(_)) => vec![*r],
95 _ => vec![*r],
98 },
99 _ => vec![],
100 }
101 }
102
103 fn collect_annot_refs(file: &PdfFile, annots: Option<&PdfObject>) -> Vec<ObjectId> {
106 fn refs_from_array(arr: &[PdfObject]) -> Vec<ObjectId> {
107 arr.iter()
108 .filter_map(|o| match o {
109 PdfObject::Ref(r) => Some(*r),
110 _ => None,
111 })
112 .collect()
113 }
114 match annots {
115 Some(PdfObject::Array(arr)) => refs_from_array(arr),
116 Some(PdfObject::Ref(r)) => match file.resolve(*r) {
117 Ok(PdfObject::Array(arr)) => refs_from_array(&arr),
118 _ => Vec::new(),
119 },
120 _ => Vec::new(),
121 }
122 }
123
124 pub fn width(&self) -> f64 {
125 self.media_box.width()
126 }
127
128 pub fn height(&self) -> f64 {
129 self.media_box.height()
130 }
131
132 pub fn effective_box(&self) -> Rect {
137 let media = self.media_box.normalize();
138 let crop = self.crop_box.normalize();
139 let inter = Rect::new(
140 crop.x0.max(media.x0),
141 crop.y0.max(media.y0),
142 crop.x1.min(media.x1),
143 crop.y1.min(media.y1),
144 );
145 if inter.x1 > inter.x0 && inter.y1 > inter.y0 {
146 inter
147 } else {
148 media
149 }
150 }
151}
152
/// Accumulator for the page attributes that can be supplied by an ancestor
/// page-tree node. Each field stays `None` until a value has been found.
#[derive(Default)]
struct InheritedAttrs {
    /// First `/MediaBox` found on the way up.
    media_box: Option<Rect>,
    /// First `/CropBox` found on the way up.
    crop_box: Option<Rect>,
    /// First `/Rotate` found on the way up.
    rotate: Option<i32>,
    /// First `/Resources` that parsed successfully on the way up.
    resources: Option<ResourceDict>,
}
162
163impl InheritedAttrs {
164 fn is_complete(&self) -> bool {
165 self.media_box.is_some()
166 && self.crop_box.is_some()
167 && self.rotate.is_some()
168 && self.resources.is_some()
169 }
170
171 fn gather(file: &PdfFile, leaf: &PdfDict) -> Self {
172 let mut attrs = Self::default();
173 let mut visited: HashSet<ObjectId> = HashSet::new();
174 let mut current: Cow<'_, PdfDict> = Cow::Borrowed(leaf);
175 let mut depth = 0usize;
176
177 loop {
178 attrs.absorb(file, ¤t);
179 if attrs.is_complete() {
180 break;
181 }
182 let parent_ref = match current.get("Parent") {
183 Some(PdfObject::Ref(r)) => *r,
184 _ => break,
185 };
186 depth += 1;
187 if depth > MAX_PAGE_TREE_DEPTH {
188 warn!("page-tree /Parent chain deeper than {MAX_PAGE_TREE_DEPTH}; stopping inheritance walk");
189 break;
190 }
191 if !visited.insert(parent_ref) {
192 warn!("page-tree /Parent cycle at {parent_ref}; stopping inheritance walk");
193 break;
194 }
195 match file.resolve(parent_ref) {
196 Ok(PdfObject::Dict(d)) => current = Cow::Owned(d),
197 Ok(PdfObject::Null) => {
198 warn!(
199 "page-tree parent {parent_ref} resolves to null; stopping inheritance walk"
200 );
201 break;
202 }
203 Ok(other) => {
204 warn!(
205 "page-tree parent {parent_ref} is {}, expected Dict; stopping inheritance walk",
206 other.type_name()
207 );
208 break;
209 }
210 Err(e) => {
211 warn!("failed to resolve page-tree parent {parent_ref}: {e}");
212 break;
213 }
214 }
215 }
216 attrs
217 }
218
219 fn absorb(&mut self, file: &PdfFile, dict: &PdfDict) {
222 if self.media_box.is_none() {
223 self.media_box = resolve_rect(file, dict, "MediaBox");
224 }
225 if self.crop_box.is_none() {
226 self.crop_box = resolve_rect(file, dict, "CropBox");
227 }
228 if self.rotate.is_none() {
229 self.rotate = resolve_i64(file, dict.get("Rotate")).map(|n| n as i32);
230 }
231 if self.resources.is_none() {
232 if let Some(d) = resolve_sub_dict(dict, "Resources", file) {
233 match parse_resource_dict(&d, file) {
234 Ok(r) => self.resources = Some(r),
235 Err(e) => warn!("failed to parse /Resources: {e}"),
236 }
237 }
238 }
239 }
240}
241
242pub(crate) fn resolve_rect(file: &PdfFile, dict: &PdfDict, key: &str) -> Option<Rect> {
245 let arr: Cow<'_, [PdfObject]> = match dict.get(key)? {
246 PdfObject::Array(a) => Cow::Borrowed(a.as_slice()),
247 PdfObject::Ref(r) => match file.resolve(*r) {
248 Ok(PdfObject::Array(a)) => Cow::Owned(a),
249 Ok(other) => {
250 warn!(
251 "/{key} ref {r} resolved to {}, expected Array",
252 other.type_name()
253 );
254 return None;
255 }
256 Err(e) => {
257 warn!("failed to resolve /{key} ref {r}: {e}");
258 return None;
259 }
260 },
261 _ => return None,
262 };
263 if arr.len() != 4 {
264 warn!("/{key} array has {} elements, expected 4", arr.len());
265 return None;
266 }
267 let mut v = [0f64; 4];
268 for (slot, obj) in v.iter_mut().zip(arr.iter()) {
269 *slot = match obj {
270 PdfObject::Ref(r) => file.resolve(*r).ok()?.as_f64().ok()?,
271 other => other.as_f64().ok()?,
272 };
273 }
274 Some(Rect::new(v[0], v[1], v[2], v[3]))
275}
276
277fn resolve_i64(file: &PdfFile, value: Option<&PdfObject>) -> Option<i64> {
279 match value? {
280 PdfObject::Integer(n) => Some(*n),
281 PdfObject::Real(r) => Some(*r as i64),
282 PdfObject::Ref(r) => match file.resolve(*r).ok()? {
283 PdfObject::Integer(n) => Some(n),
284 PdfObject::Real(r) => Some(r as i64),
285 _ => None,
286 },
287 _ => None,
288 }
289}
290
291fn resolve_sub_dict<'a>(
292 dict: &'a zpdf_core::PdfDict,
293 key: &str,
294 file: &'a PdfFile,
295) -> Option<std::borrow::Cow<'a, zpdf_core::PdfDict>> {
296 match dict.get(key) {
297 Some(PdfObject::Dict(d)) => Some(std::borrow::Cow::Borrowed(d)),
298 Some(PdfObject::Ref(r)) => file.resolve(*r).ok().and_then(|o| match o {
299 PdfObject::Dict(d) => Some(std::borrow::Cow::Owned(d)),
300 _ => None,
301 }),
302 _ => None,
303 }
304}
305
306pub fn parse_resource_dict(dict: &zpdf_core::PdfDict, file: &PdfFile) -> Result<ResourceDict> {
307 let mut res = ResourceDict::default();
308
309 if let Some(fonts) = resolve_sub_dict(dict, "Font", file) {
310 for (name, obj) in &fonts.0 {
311 if let PdfObject::Ref(r) = obj {
312 res.fonts.insert(name.0.clone(), *r);
313 }
314 }
315 }
316
317 if let Some(xobjects) = resolve_sub_dict(dict, "XObject", file) {
318 for (name, obj) in &xobjects.0 {
319 if let PdfObject::Ref(r) = obj {
320 res.xobjects.insert(name.0.clone(), *r);
321 }
322 }
323 }
324
325 if let Some(gs) = resolve_sub_dict(dict, "ExtGState", file) {
326 for (name, obj) in &gs.0 {
327 match obj {
328 PdfObject::Ref(r) => {
329 res.ext_g_state.insert(name.0.clone(), *r);
330 }
331 PdfObject::Dict(d) => {
332 res.ext_g_state_inline.insert(name.0.clone(), d.clone());
333 }
334 _ => {}
335 }
336 }
337 }
338
339 if let Some(cs) = resolve_sub_dict(dict, "ColorSpace", file) {
340 for (name, obj) in &cs.0 {
341 match obj {
342 PdfObject::Ref(r) => {
343 res.color_spaces.insert(name.0.clone(), *r);
344 }
345 other @ (PdfObject::Array(_) | PdfObject::Name(_)) => {
346 res.color_spaces_inline
347 .insert(name.0.clone(), other.clone());
348 }
349 _ => {}
350 }
351 }
352 }
353
354 if let Some(pat) = resolve_sub_dict(dict, "Pattern", file) {
355 for (name, obj) in &pat.0 {
356 if let PdfObject::Ref(r) = obj {
357 res.patterns.insert(name.0.clone(), *r);
358 }
359 }
360 }
361
362 if let Some(sh) = resolve_sub_dict(dict, "Shading", file) {
363 for (name, obj) in &sh.0 {
364 match obj {
365 PdfObject::Ref(r) => {
366 res.shadings.insert(name.0.clone(), *r);
367 }
368 other @ PdfObject::Dict(_) => {
369 res.shadings_inline.insert(name.0.clone(), other.clone());
370 }
371 _ => {}
372 }
373 }
374 }
375
376 if let Some(props) = resolve_sub_dict(dict, "Properties", file) {
377 for (name, obj) in &props.0 {
378 match obj {
379 PdfObject::Ref(r) => {
380 res.properties.insert(name.0.clone(), *r);
381 }
382 PdfObject::Dict(d) => {
383 res.properties_inline.insert(name.0.clone(), d.clone());
384 }
385 _ => {}
386 }
387 }
388 }
389
390 Ok(res)
391}
392
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_util::build_pdf;
    use crate::PdfDocument;

    // Fixture objects shared by several tests. The fixtures refer to each
    // other as `N 0 R`, so `build_pdf` presumably numbers objects from 1 in
    // slice order.
    const CATALOG: &str = "<< /Type /Catalog /Pages 2 0 R >>";
    const PAGES_WITH_DEFAULTS: &str = "<< /Type /Pages /Kids [3 0 R] /Count 1 /MediaBox [0 0 612 792] /Rotate 90 /Resources << /Font << /F1 4 0 R >> >> >>";
    const PAGES_BARE: &str = "<< /Type /Pages /Kids [3 0 R] /Count 1 >>";
    const HELVETICA: &str = "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>";

    /// Builds a document from `objects` and returns its first page.
    fn page0(objects: &[&str]) -> PdfPage {
        let pdf = build_pdf(objects);
        let doc = PdfDocument::open(pdf).expect("open");
        doc.page(0).expect("page")
    }

    /// 612 x 792 media box used by the `effective_box` tests.
    fn letter() -> Rect {
        Rect::new(0.0, 0.0, 612.0, 792.0)
    }

    /// A page with no resources, contents or annotations and the given boxes.
    fn page_with_boxes(media: Rect, crop: Rect) -> PdfPage {
        PdfPage {
            id: ObjectId(1, 0),
            media_box: media,
            crop_box: crop,
            rotate: 0,
            resources: ResourceDict::default(),
            contents: Vec::new(),
            annots: Vec::new(),
        }
    }

    /// Attributes missing on the leaf come from the /Pages node.
    #[test]
    fn rotate_and_resources_inherited_from_pages_node() {
        let page = page0(&[
            CATALOG,
            PAGES_WITH_DEFAULTS,
            "<< /Type /Page /Parent 2 0 R >>",
            HELVETICA,
        ]);

        assert_eq!(page.rotate, 90);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 612.0, 792.0));
        assert_eq!(page.resources.fonts.get("F1"), Some(&ObjectId(4, 0)));
    }

    /// Attributes present on the leaf replace the ancestor's, not merge with them.
    #[test]
    fn leaf_attributes_override_inherited() {
        let page = page0(&[
            CATALOG,
            PAGES_WITH_DEFAULTS,
            "<< /Type /Page /Parent 2 0 R /Rotate 180 /Resources << /Font << /F2 4 0 R >> >> >>",
            HELVETICA,
        ]);

        assert_eq!(page.rotate, 180);
        let fonts = &page.resources.fonts;
        assert!(fonts.contains_key("F2"));
        assert!(!fonts.contains_key("F1"));
    }

    /// Boxes may be referenced as a whole or element by element.
    #[test]
    fn indirect_media_and_crop_boxes_resolve() {
        let page = page0(&[
            CATALOG,
            PAGES_BARE,
            "<< /Type /Page /Parent 2 0 R /MediaBox 4 0 R /CropBox [10 10 5 0 R 200] >>",
            "[0 0 300 400]",
            "100",
        ]);

        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 300.0, 400.0));
        assert_eq!(page.crop_box, Rect::new(10.0, 10.0, 100.0, 200.0));
    }

    /// A /Parent loop between page and /Pages node must not hang or lose data.
    #[test]
    fn parent_cycle_terminates_and_keeps_found_values() {
        let page = page0(&[
            CATALOG,
            "<< /Type /Pages /Kids [3 0 R] /Count 1 /Parent 3 0 R /MediaBox [0 0 100 100] >>",
            "<< /Type /Page /Parent 2 0 R >>",
        ]);

        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 100.0, 100.0));
        assert_eq!(page.rotate, 0);
    }

    /// References in a direct /Annots array are collected in order.
    #[test]
    fn annots_refs_collected() {
        let page = page0(&[
            CATALOG,
            PAGES_BARE,
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] /Annots [4 0 R 5 0 R] >>",
            "<< /Type /Annot /Subtype /Link >>",
            "<< /Type /Annot /Subtype /Square >>",
        ]);

        let expected = vec![ObjectId(4, 0), ObjectId(5, 0)];
        assert_eq!(page.annots, expected);
    }

    /// Crop inside, around, and partially overlapping the media box.
    #[test]
    fn effective_box_intersects_crop_with_media() {
        let media = letter();
        let cases = [
            (
                Rect::new(10.0, 20.0, 500.0, 700.0),
                Rect::new(10.0, 20.0, 500.0, 700.0),
            ),
            (Rect::new(-50.0, -50.0, 700.0, 800.0), media),
            (
                Rect::new(300.0, 400.0, 900.0, 900.0),
                Rect::new(300.0, 400.0, 612.0, 792.0),
            ),
        ];

        for (crop, expected) in cases {
            let page = page_with_boxes(media, crop);
            assert_eq!(page.effective_box(), expected);
        }
    }

    /// Disjoint, zero-area, and identical crop boxes all yield the media box.
    #[test]
    fn effective_box_falls_back_to_media_box() {
        let media = letter();
        let crops = [
            Rect::new(1000.0, 1000.0, 1100.0, 1100.0),
            Rect::new(100.0, 100.0, 100.0, 100.0),
            media,
        ];

        for crop in crops {
            let page = page_with_boxes(media, crop);
            assert_eq!(page.effective_box(), media);
        }
    }

    /// A crop box with swapped corners is normalized before clipping.
    #[test]
    fn effective_box_normalizes_inverted_crop() {
        let inverted = Rect::new(500.0, 700.0, 10.0, 20.0);
        let page = page_with_boxes(letter(), inverted);

        assert_eq!(page.effective_box(), Rect::new(10.0, 20.0, 500.0, 700.0));
    }
}