1use std::borrow::Cow;
2use std::collections::{HashMap, HashSet};
3
4use tracing::warn;
5use zpdf_core::{ObjectId, PdfDict, PdfObject, Rect, Result};
6use zpdf_parser::PdfFile;
7
/// Maximum number of `/Parent` hops followed while walking up the page tree.
///
/// `InheritedAttrs::gather` logs a warning and stops its inheritance walk once
/// the hop count exceeds this value.
pub(crate) const MAX_PAGE_TREE_DEPTH: usize = 64;

/// Media box used by `PdfPage::from_object` when no usable `/MediaBox` is found
/// on the page or its ancestors: 612 x 792 units anchored at the origin
/// (presumably US Letter in PDF points).
const DEFAULT_MEDIA_BOX: Rect = Rect {
    x0: 0.0,
    y0: 0.0,
    x1: 612.0,
    y1: 792.0,
};
21
22fn is_usable_box(r: &Rect) -> bool {
26 if ![r.x0, r.y0, r.x1, r.y1].iter().all(|v| v.is_finite()) {
27 return false;
28 }
29 let n = r.normalize();
30 n.width() > 0.0 && n.height() > 0.0
31}
32
/// A single page with its inheritable attributes already resolved.
///
/// Instances are produced by `PdfPage::from_object`.
#[derive(Debug)]
pub struct PdfPage {
    /// Object id of the page dictionary this page was built from.
    pub id: ObjectId,
    /// `/MediaBox` found on the page or one of its ancestors, stored as given
    /// (not normalized); `DEFAULT_MEDIA_BOX` when none is usable.
    pub media_box: Rect,
    /// `/CropBox` found on the page or one of its ancestors, stored as given;
    /// falls back to `media_box` when none is usable.
    pub crop_box: Rect,
    /// `/Rotate` value found on the page or one of its ancestors; `0` when absent.
    pub rotate: i32,
    /// Parsed `/Resources` of the page or its nearest ancestor that has one;
    /// empty when none could be parsed.
    pub resources: ResourceDict,
    /// Indirect references gathered from the page's `/Contents` entry.
    pub contents: Vec<ObjectId>,
    /// Indirect references gathered from the page's `/Annots` entry.
    pub annots: Vec<ObjectId>,
}
44
/// Resource names of a `/Resources` dictionary, grouped by category.
///
/// Filled by `parse_resource_dict`. Each map is keyed by the resource name.
/// Entries given as indirect references go into the plain maps; categories
/// that also accept direct values keep those in the matching `*_inline` map.
#[derive(Debug, Default)]
pub struct ResourceDict {
    /// `/Font` entries given as indirect references.
    pub fonts: HashMap<String, ObjectId>,
    /// `/XObject` entries given as indirect references.
    pub xobjects: HashMap<String, ObjectId>,
    /// `/ExtGState` entries given as indirect references.
    pub ext_g_state: HashMap<String, ObjectId>,
    /// `/ExtGState` entries given as direct dictionaries.
    pub ext_g_state_inline: HashMap<String, zpdf_core::PdfDict>,
    /// `/ColorSpace` entries given as indirect references.
    pub color_spaces: HashMap<String, ObjectId>,
    /// `/ColorSpace` entries given as direct arrays or names.
    pub color_spaces_inline: HashMap<String, PdfObject>,
    /// `/Pattern` entries given as indirect references.
    pub patterns: HashMap<String, ObjectId>,
    /// `/Shading` entries given as indirect references.
    pub shadings: HashMap<String, ObjectId>,
    /// `/Shading` entries given as direct dictionaries (stored as the whole object).
    pub shadings_inline: HashMap<String, PdfObject>,
    /// `/Properties` entries given as indirect references.
    pub properties: HashMap<String, ObjectId>,
    /// `/Properties` entries given as direct dictionaries.
    pub properties_inline: HashMap<String, zpdf_core::PdfDict>,
}
62
63impl PdfPage {
64 pub fn from_object(file: &PdfFile, page_id: ObjectId) -> Result<Self> {
65 let obj = file.resolve(page_id)?;
66 let dict = obj.as_dict()?;
67
68 let inherited = InheritedAttrs::gather(file, dict);
72
73 let media_box = inherited
78 .media_box
79 .filter(is_usable_box)
80 .unwrap_or(DEFAULT_MEDIA_BOX);
81 let crop_box = inherited
82 .crop_box
83 .filter(is_usable_box)
84 .unwrap_or(media_box);
85 let rotate = inherited.rotate.unwrap_or(0);
86 let resources = inherited.resources.unwrap_or_default();
87
88 let contents = Self::collect_content_refs(file, dict.get("Contents"));
89 let annots = Self::collect_annot_refs(file, dict.get("Annots"));
90
91 Ok(Self {
92 id: page_id,
93 media_box,
94 crop_box,
95 rotate,
96 resources,
97 contents,
98 annots,
99 })
100 }
101
102 fn collect_content_refs(file: &PdfFile, contents: Option<&PdfObject>) -> Vec<ObjectId> {
108 fn refs_from_array(arr: &[PdfObject]) -> Vec<ObjectId> {
109 arr.iter()
110 .filter_map(|o| match o {
111 PdfObject::Ref(r) => Some(*r),
112 _ => None,
113 })
114 .collect()
115 }
116 match contents {
117 Some(PdfObject::Array(arr)) => refs_from_array(arr),
118 Some(PdfObject::Ref(r)) => match file.resolve(*r) {
119 Ok(PdfObject::Array(arr)) => refs_from_array(&arr),
121 Ok(PdfObject::Stream(_)) => vec![*r],
123 _ => vec![*r],
126 },
127 _ => vec![],
128 }
129 }
130
131 fn collect_annot_refs(file: &PdfFile, annots: Option<&PdfObject>) -> Vec<ObjectId> {
134 fn refs_from_array(arr: &[PdfObject]) -> Vec<ObjectId> {
135 arr.iter()
136 .filter_map(|o| match o {
137 PdfObject::Ref(r) => Some(*r),
138 _ => None,
139 })
140 .collect()
141 }
142 match annots {
143 Some(PdfObject::Array(arr)) => refs_from_array(arr),
144 Some(PdfObject::Ref(r)) => match file.resolve(*r) {
145 Ok(PdfObject::Array(arr)) => refs_from_array(&arr),
146 _ => Vec::new(),
147 },
148 _ => Vec::new(),
149 }
150 }
151
152 pub fn width(&self) -> f64 {
153 self.media_box.width()
154 }
155
156 pub fn height(&self) -> f64 {
157 self.media_box.height()
158 }
159
160 pub fn effective_box(&self) -> Rect {
165 let media = self.media_box.normalize();
166 let crop = self.crop_box.normalize();
167 let inter = Rect::new(
168 crop.x0.max(media.x0),
169 crop.y0.max(media.y0),
170 crop.x1.min(media.x1),
171 crop.y1.min(media.y1),
172 );
173 if inter.x1 > inter.x0 && inter.y1 > inter.y0 {
174 inter
175 } else {
176 media
177 }
178 }
179}
180
/// Inheritable page attributes collected while walking from a page dictionary
/// up its `/Parent` chain.
///
/// A field stays `None` until some dictionary on the chain supplies a value
/// that could be read; the first value found is kept.
#[derive(Default)]
struct InheritedAttrs {
    /// First readable `/MediaBox` on the chain.
    media_box: Option<Rect>,
    /// First readable `/CropBox` on the chain.
    crop_box: Option<Rect>,
    /// First readable `/Rotate` on the chain.
    rotate: Option<i32>,
    /// First `/Resources` on the chain that parsed successfully.
    resources: Option<ResourceDict>,
}
190
191impl InheritedAttrs {
192 fn is_complete(&self) -> bool {
193 self.media_box.is_some()
194 && self.crop_box.is_some()
195 && self.rotate.is_some()
196 && self.resources.is_some()
197 }
198
199 fn gather(file: &PdfFile, leaf: &PdfDict) -> Self {
200 let mut attrs = Self::default();
201 let mut visited: HashSet<ObjectId> = HashSet::new();
202 let mut current: Cow<'_, PdfDict> = Cow::Borrowed(leaf);
203 let mut depth = 0usize;
204
205 loop {
206 attrs.absorb(file, ¤t);
207 if attrs.is_complete() {
208 break;
209 }
210 let parent_ref = match current.get("Parent") {
211 Some(PdfObject::Ref(r)) => *r,
212 _ => break,
213 };
214 depth += 1;
215 if depth > MAX_PAGE_TREE_DEPTH {
216 warn!("page-tree /Parent chain deeper than {MAX_PAGE_TREE_DEPTH}; stopping inheritance walk");
217 break;
218 }
219 if !visited.insert(parent_ref) {
220 warn!("page-tree /Parent cycle at {parent_ref}; stopping inheritance walk");
221 break;
222 }
223 match file.resolve(parent_ref) {
224 Ok(PdfObject::Dict(d)) => current = Cow::Owned(d),
225 Ok(PdfObject::Null) => {
226 warn!(
227 "page-tree parent {parent_ref} resolves to null; stopping inheritance walk"
228 );
229 break;
230 }
231 Ok(other) => {
232 warn!(
233 "page-tree parent {parent_ref} is {}, expected Dict; stopping inheritance walk",
234 other.type_name()
235 );
236 break;
237 }
238 Err(e) => {
239 warn!("failed to resolve page-tree parent {parent_ref}: {e}");
240 break;
241 }
242 }
243 }
244 attrs
245 }
246
247 fn absorb(&mut self, file: &PdfFile, dict: &PdfDict) {
250 if self.media_box.is_none() {
251 self.media_box = resolve_rect(file, dict, "MediaBox");
252 }
253 if self.crop_box.is_none() {
254 self.crop_box = resolve_rect(file, dict, "CropBox");
255 }
256 if self.rotate.is_none() {
257 self.rotate = resolve_i64(file, dict.get("Rotate")).map(|n| n as i32);
258 }
259 if self.resources.is_none() {
260 if let Some(d) = resolve_sub_dict(dict, "Resources", file) {
261 match parse_resource_dict(&d, file) {
262 Ok(r) => self.resources = Some(r),
263 Err(e) => warn!("failed to parse /Resources: {e}"),
264 }
265 }
266 }
267 }
268}
269
270pub(crate) fn resolve_rect(file: &PdfFile, dict: &PdfDict, key: &str) -> Option<Rect> {
273 let arr: Cow<'_, [PdfObject]> = match dict.get(key)? {
274 PdfObject::Array(a) => Cow::Borrowed(a.as_slice()),
275 PdfObject::Ref(r) => match file.resolve(*r) {
276 Ok(PdfObject::Array(a)) => Cow::Owned(a),
277 Ok(other) => {
278 warn!(
279 "/{key} ref {r} resolved to {}, expected Array",
280 other.type_name()
281 );
282 return None;
283 }
284 Err(e) => {
285 warn!("failed to resolve /{key} ref {r}: {e}");
286 return None;
287 }
288 },
289 _ => return None,
290 };
291 if arr.len() != 4 {
292 warn!("/{key} array has {} elements, expected 4", arr.len());
293 return None;
294 }
295 let mut v = [0f64; 4];
296 for (slot, obj) in v.iter_mut().zip(arr.iter()) {
297 *slot = match obj {
298 PdfObject::Ref(r) => file.resolve(*r).ok()?.as_f64().ok()?,
299 other => other.as_f64().ok()?,
300 };
301 }
302 Some(Rect::new(v[0], v[1], v[2], v[3]))
303}
304
305fn resolve_i64(file: &PdfFile, value: Option<&PdfObject>) -> Option<i64> {
307 match value? {
308 PdfObject::Integer(n) => Some(*n),
309 PdfObject::Real(r) => Some(*r as i64),
310 PdfObject::Ref(r) => match file.resolve(*r).ok()? {
311 PdfObject::Integer(n) => Some(n),
312 PdfObject::Real(r) => Some(r as i64),
313 _ => None,
314 },
315 _ => None,
316 }
317}
318
319fn resolve_sub_dict<'a>(
320 dict: &'a zpdf_core::PdfDict,
321 key: &str,
322 file: &'a PdfFile,
323) -> Option<std::borrow::Cow<'a, zpdf_core::PdfDict>> {
324 match dict.get(key) {
325 Some(PdfObject::Dict(d)) => Some(std::borrow::Cow::Borrowed(d)),
326 Some(PdfObject::Ref(r)) => file.resolve(*r).ok().and_then(|o| match o {
327 PdfObject::Dict(d) => Some(std::borrow::Cow::Owned(d)),
328 _ => None,
329 }),
330 _ => None,
331 }
332}
333
334pub fn parse_resource_dict(dict: &zpdf_core::PdfDict, file: &PdfFile) -> Result<ResourceDict> {
335 let mut res = ResourceDict::default();
336
337 if let Some(fonts) = resolve_sub_dict(dict, "Font", file) {
338 for (name, obj) in &fonts.0 {
339 if let PdfObject::Ref(r) = obj {
340 res.fonts.insert(name.0.clone(), *r);
341 }
342 }
343 }
344
345 if let Some(xobjects) = resolve_sub_dict(dict, "XObject", file) {
346 for (name, obj) in &xobjects.0 {
347 if let PdfObject::Ref(r) = obj {
348 res.xobjects.insert(name.0.clone(), *r);
349 }
350 }
351 }
352
353 if let Some(gs) = resolve_sub_dict(dict, "ExtGState", file) {
354 for (name, obj) in &gs.0 {
355 match obj {
356 PdfObject::Ref(r) => {
357 res.ext_g_state.insert(name.0.clone(), *r);
358 }
359 PdfObject::Dict(d) => {
360 res.ext_g_state_inline.insert(name.0.clone(), d.clone());
361 }
362 _ => {}
363 }
364 }
365 }
366
367 if let Some(cs) = resolve_sub_dict(dict, "ColorSpace", file) {
368 for (name, obj) in &cs.0 {
369 match obj {
370 PdfObject::Ref(r) => {
371 res.color_spaces.insert(name.0.clone(), *r);
372 }
373 other @ (PdfObject::Array(_) | PdfObject::Name(_)) => {
374 res.color_spaces_inline
375 .insert(name.0.clone(), other.clone());
376 }
377 _ => {}
378 }
379 }
380 }
381
382 if let Some(pat) = resolve_sub_dict(dict, "Pattern", file) {
383 for (name, obj) in &pat.0 {
384 if let PdfObject::Ref(r) = obj {
385 res.patterns.insert(name.0.clone(), *r);
386 }
387 }
388 }
389
390 if let Some(sh) = resolve_sub_dict(dict, "Shading", file) {
391 for (name, obj) in &sh.0 {
392 match obj {
393 PdfObject::Ref(r) => {
394 res.shadings.insert(name.0.clone(), *r);
395 }
396 other @ PdfObject::Dict(_) => {
397 res.shadings_inline.insert(name.0.clone(), other.clone());
398 }
399 _ => {}
400 }
401 }
402 }
403
404 if let Some(props) = resolve_sub_dict(dict, "Properties", file) {
405 for (name, obj) in &props.0 {
406 match obj {
407 PdfObject::Ref(r) => {
408 res.properties.insert(name.0.clone(), *r);
409 }
410 PdfObject::Dict(d) => {
411 res.properties_inline.insert(name.0.clone(), d.clone());
412 }
413 _ => {}
414 }
415 }
416 }
417
418 Ok(res)
419}
420
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_util::build_pdf;
    use crate::PdfDocument;

    /// Builds a document from `objects` and returns its first page.
    ///
    /// NOTE(review): the fixtures refer to the strings as objects 1, 2, 3, ...
    /// in list order, so `build_pdf` presumably numbers them that way; confirm
    /// against `test_util`.
    fn page0(objects: &[&str]) -> PdfPage {
        let doc = PdfDocument::open(build_pdf(objects)).expect("open");
        doc.page(0).expect("page")
    }

    /// A page with no attributes of its own takes media box, rotation and
    /// resources from its parent `/Pages` node.
    #[test]
    fn rotate_and_resources_inherited_from_pages_node() {
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 /MediaBox [0 0 612 792] /Rotate 90 /Resources << /Font << /F1 4 0 R >> >> >>",
            "<< /Type /Page /Parent 2 0 R >>",
            "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
        ]);
        assert_eq!(page.rotate, 90);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 612.0, 792.0));
        assert_eq!(page.resources.fonts.get("F1"), Some(&ObjectId(4, 0)));
    }

    /// Values on the page itself replace the parent's; resources are replaced
    /// as a whole rather than merged, so the parent's `F1` must not appear.
    #[test]
    fn leaf_attributes_override_inherited() {
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 /MediaBox [0 0 612 792] /Rotate 90 /Resources << /Font << /F1 4 0 R >> >> >>",
            "<< /Type /Page /Parent 2 0 R /Rotate 180 /Resources << /Font << /F2 4 0 R >> >> >>",
            "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
        ]);
        assert_eq!(page.rotate, 180);
        assert!(page.resources.fonts.contains_key("F2"));
        assert!(!page.resources.fonts.contains_key("F1"));
    }

    /// `/MediaBox` given as a reference to an array, and a `/CropBox` whose
    /// third element is a reference to a number, are both resolved.
    #[test]
    fn indirect_media_and_crop_boxes_resolve() {
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox 4 0 R /CropBox [10 10 5 0 R 200] >>",
            "[0 0 300 400]",
            "100",
        ]);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 300.0, 400.0));
        assert_eq!(page.crop_box, Rect::new(10.0, 10.0, 100.0, 200.0));
    }

    /// The `/Pages` node names the page as its own `/Parent`, forming a cycle.
    /// The walk must stop, keep the media box it already found, and leave the
    /// rotation at its default.
    #[test]
    fn parent_cycle_terminates_and_keeps_found_values() {
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 /Parent 3 0 R /MediaBox [0 0 100 100] >>",
            "<< /Type /Page /Parent 2 0 R >>",
        ]);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 100.0, 100.0));
        assert_eq!(page.rotate, 0);
    }

    /// References in a direct `/Annots` array are collected in order.
    #[test]
    fn annots_refs_collected() {
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] /Annots [4 0 R 5 0 R] >>",
            "<< /Type /Annot /Subtype /Link >>",
            "<< /Type /Annot /Subtype /Square >>",
        ]);
        assert_eq!(page.annots, vec![ObjectId(4, 0), ObjectId(5, 0)]);
    }

    /// Builds a page directly from the two boxes, bypassing any file parsing.
    fn page_with_boxes(media: Rect, crop: Rect) -> PdfPage {
        PdfPage {
            id: ObjectId(1, 0),
            media_box: media,
            crop_box: crop,
            rotate: 0,
            resources: ResourceDict::default(),
            contents: vec![],
            annots: vec![],
        }
    }

    #[test]
    fn effective_box_intersects_crop_with_media() {
        let media = Rect::new(0.0, 0.0, 612.0, 792.0);
        // Crop box entirely inside the media box: returned unchanged.
        let p = page_with_boxes(media, Rect::new(10.0, 20.0, 500.0, 700.0));
        assert_eq!(p.effective_box(), Rect::new(10.0, 20.0, 500.0, 700.0));
        // Crop box larger than the media box on every side: clipped to the media box.
        let p = page_with_boxes(media, Rect::new(-50.0, -50.0, 700.0, 800.0));
        assert_eq!(p.effective_box(), media);
        // Crop box overlapping only the upper-right part: the overlap is returned.
        let p = page_with_boxes(media, Rect::new(300.0, 400.0, 900.0, 900.0));
        assert_eq!(p.effective_box(), Rect::new(300.0, 400.0, 612.0, 792.0));
    }

    #[test]
    fn effective_box_falls_back_to_media_box() {
        let media = Rect::new(0.0, 0.0, 612.0, 792.0);
        // Crop box completely outside the media box.
        let p = page_with_boxes(media, Rect::new(1000.0, 1000.0, 1100.0, 1100.0));
        assert_eq!(p.effective_box(), media);
        // Crop box with zero area.
        let p = page_with_boxes(media, Rect::new(100.0, 100.0, 100.0, 100.0));
        assert_eq!(p.effective_box(), media);
        // Crop box identical to the media box.
        let p = page_with_boxes(media, media);
        assert_eq!(p.effective_box(), media);
    }

    /// A crop box given with swapped corners is normalized before clipping.
    #[test]
    fn effective_box_normalizes_inverted_crop() {
        let media = Rect::new(0.0, 0.0, 612.0, 792.0);
        let p = page_with_boxes(media, Rect::new(500.0, 700.0, 10.0, 20.0));
        assert_eq!(p.effective_box(), Rect::new(10.0, 20.0, 500.0, 700.0));
    }
}