Skip to main content

zpdf_document/
page.rs

1use std::borrow::Cow;
2use std::collections::{HashMap, HashSet};
3
4use tracing::warn;
5use zpdf_core::{ObjectId, PdfDict, PdfObject, Rect, Result};
6use zpdf_parser::PdfFile;
7
/// Hard cap on page-tree walks (`/Parent` chains and `/Kids` recursion).
///
/// The limit is far deeper than any sane document needs. Together with the
/// visited-set cycle checks it bounds the work done on malformed or
/// adversarial page trees.
pub(crate) const MAX_PAGE_TREE_DEPTH: usize = 64;
12
/// US Letter (612 × 792, i.e. 8.5 × 11 in at 72 units per inch), used when a
/// page has no usable `/MediaBox` (missing, degenerate, or non-finite).
/// Matches the fallback mainstream PDF readers apply.
const DEFAULT_MEDIA_BOX: Rect = Rect {
    x0: 0.0,
    y0: 0.0,
    x1: 612.0,
    y1: 792.0,
};
21
22/// A box is usable only if all four corners are finite and it encloses a
23/// non-empty area once normalized. Rejects NaN/∞ (which would poison the raster
24/// dimension math downstream) and zero/negative-area rectangles.
25fn is_usable_box(r: &Rect) -> bool {
26    if ![r.x0, r.y0, r.x1, r.y1].iter().all(|v| v.is_finite()) {
27        return false;
28    }
29    let n = r.normalize();
30    n.width() > 0.0 && n.height() > 0.0
31}
32
/// A single page of a document, as assembled by [`PdfPage::from_object`].
#[derive(Debug)]
pub struct PdfPage {
    /// Object id of the page dictionary this page was built from.
    pub id: ObjectId,
    /// The page's `/MediaBox` (own or inherited); `from_object` substitutes
    /// US Letter when none is usable. Stored as read, i.e. not normalized.
    pub media_box: Rect,
    /// The page's `/CropBox` (own or inherited); `from_object` falls back to
    /// `media_box` when none is usable. Stored as read, i.e. not normalized.
    pub crop_box: Rect,
    /// Value of `/Rotate` (own or inherited); `from_object` uses 0 when absent.
    pub rotate: i32,
    /// Parsed `/Resources` (own or inherited); empty when none was found.
    pub resources: ResourceDict,
    /// Content-stream object ids taken from `/Contents`.
    pub contents: Vec<ObjectId>,
    /// Annotation object ids from `/Annots`, parsed but not yet rendered.
    pub annots: Vec<ObjectId>,
}
44
/// Name → object lookup tables extracted from a `/Resources` dictionary by
/// [`parse_resource_dict`]. Entries given as indirect references land in the
/// plain maps; categories that also accept direct values have a matching
/// `*_inline` map holding a clone of the direct value.
#[derive(Debug, Default)]
pub struct ResourceDict {
    /// `/Font` entries that are indirect references.
    pub fonts: HashMap<String, ObjectId>,
    /// `/XObject` entries that are indirect references.
    pub xobjects: HashMap<String, ObjectId>,
    /// `/ExtGState` entries that are indirect references.
    pub ext_g_state: HashMap<String, ObjectId>,
    /// `/ExtGState` entries given as direct dictionaries.
    pub ext_g_state_inline: HashMap<String, zpdf_core::PdfDict>,
    /// `/ColorSpace` entries that are indirect references.
    pub color_spaces: HashMap<String, ObjectId>,
    /// Colorspace resources whose value is a direct array/name rather than a
    /// reference (common from Quartz and Ghostscript).
    pub color_spaces_inline: HashMap<String, PdfObject>,
    /// `/Pattern` entries that are indirect references.
    pub patterns: HashMap<String, ObjectId>,
    /// `/Shading` entries that are indirect references.
    pub shadings: HashMap<String, ObjectId>,
    /// `/Shading` entries given as direct dictionaries (kept as `PdfObject`).
    pub shadings_inline: HashMap<String, PdfObject>,
    /// /Properties (marked-content property lists, e.g. BDC /OC lookups).
    pub properties: HashMap<String, ObjectId>,
    /// `/Properties` entries given as direct dictionaries.
    pub properties_inline: HashMap<String, zpdf_core::PdfDict>,
}
62
63impl PdfPage {
64    pub fn from_object(file: &PdfFile, page_id: ObjectId) -> Result<Self> {
65        let obj = file.resolve(page_id)?;
66        let dict = obj.as_dict()?;
67
68        // MediaBox, CropBox, Rotate and Resources are all inheritable page
69        // attributes (PDF 32000-1 Table 31): one guarded walk up /Parent
70        // gathers whichever values the leaf doesn't carry itself.
71        let inherited = InheritedAttrs::gather(file, dict);
72
73        // /MediaBox is required and inheritable, but real-world files routinely
74        // omit it or carry a degenerate/non-finite one. Mainstream readers fall
75        // back to US Letter rather than refusing the page; do the same so a
76        // single bad page never sinks the whole document.
77        let media_box = inherited
78            .media_box
79            .filter(is_usable_box)
80            .unwrap_or(DEFAULT_MEDIA_BOX);
81        let crop_box = inherited
82            .crop_box
83            .filter(is_usable_box)
84            .unwrap_or(media_box);
85        let rotate = inherited.rotate.unwrap_or(0);
86        let resources = inherited.resources.unwrap_or_default();
87
88        let contents = Self::collect_content_refs(file, dict.get("Contents"));
89        let annots = Self::collect_annot_refs(file, dict.get("Annots"));
90
91        Ok(Self {
92            id: page_id,
93            media_box,
94            crop_box,
95            rotate,
96            resources,
97            contents,
98            annots,
99        })
100    }
101
102    /// Collect the page's content-stream object ids from `/Contents`, which may
103    /// be: a single stream ref; a direct array of stream refs; or — as some
104    /// scanners emit — an indirect ref *to* an array of stream refs (double
105    /// indirection). The latter is resolved one level so the array is flattened
106    /// rather than mistaken for a single (non-stream) object.
107    fn collect_content_refs(file: &PdfFile, contents: Option<&PdfObject>) -> Vec<ObjectId> {
108        fn refs_from_array(arr: &[PdfObject]) -> Vec<ObjectId> {
109            arr.iter()
110                .filter_map(|o| match o {
111                    PdfObject::Ref(r) => Some(*r),
112                    _ => None,
113                })
114                .collect()
115        }
116        match contents {
117            Some(PdfObject::Array(arr)) => refs_from_array(arr),
118            Some(PdfObject::Ref(r)) => match file.resolve(*r) {
119                // Ref → array of stream refs: flatten it.
120                Ok(PdfObject::Array(arr)) => refs_from_array(&arr),
121                // Ref → a single content stream: keep the ref itself.
122                Ok(PdfObject::Stream(_)) => vec![*r],
123                // Anything else (incl. resolve failure): treat as the lone ref so
124                // a later resolve attempt surfaces the real error.
125                _ => vec![*r],
126            },
127            _ => vec![],
128        }
129    }
130
131    /// Collect annotation object ids from `/Annots` (a direct array or a ref
132    /// to an array). Parse-only plumbing: appearance streams are not rendered.
133    fn collect_annot_refs(file: &PdfFile, annots: Option<&PdfObject>) -> Vec<ObjectId> {
134        fn refs_from_array(arr: &[PdfObject]) -> Vec<ObjectId> {
135            arr.iter()
136                .filter_map(|o| match o {
137                    PdfObject::Ref(r) => Some(*r),
138                    _ => None,
139                })
140                .collect()
141        }
142        match annots {
143            Some(PdfObject::Array(arr)) => refs_from_array(arr),
144            Some(PdfObject::Ref(r)) => match file.resolve(*r) {
145                Ok(PdfObject::Array(arr)) => refs_from_array(&arr),
146                _ => Vec::new(),
147            },
148            _ => Vec::new(),
149        }
150    }
151
152    pub fn width(&self) -> f64 {
153        self.media_box.width()
154    }
155
156    pub fn height(&self) -> f64 {
157        self.media_box.height()
158    }
159
160    /// The rectangle the page is rendered into: `/CropBox` intersected with
161    /// `/MediaBox`. Per spec a CropBox extending beyond the MediaBox is
162    /// clamped to it; an empty or non-overlapping CropBox falls back to the
163    /// full MediaBox.
164    pub fn effective_box(&self) -> Rect {
165        let media = self.media_box.normalize();
166        let crop = self.crop_box.normalize();
167        let inter = Rect::new(
168            crop.x0.max(media.x0),
169            crop.y0.max(media.y0),
170            crop.x1.min(media.x1),
171            crop.y1.min(media.y1),
172        );
173        if inter.x1 > inter.x0 && inter.y1 > inter.y0 {
174            inter
175        } else {
176            media
177        }
178    }
179}
180
/// Inheritable page attributes (PDF 32000-1 Table 31), filled in leaf-first
/// while walking up the `/Parent` chain with cycle and depth guards.
///
/// Every slot starts as `None` and is set at most once, by the first
/// dictionary on the walk that provides a readable value.
#[derive(Default)]
struct InheritedAttrs {
    /// First readable `/MediaBox` found; not yet validated for usability.
    media_box: Option<Rect>,
    /// First readable `/CropBox` found; not yet validated for usability.
    crop_box: Option<Rect>,
    /// First readable `/Rotate` found.
    rotate: Option<i32>,
    /// First `/Resources` dictionary found that parsed successfully.
    resources: Option<ResourceDict>,
}
190
191impl InheritedAttrs {
192    fn is_complete(&self) -> bool {
193        self.media_box.is_some()
194            && self.crop_box.is_some()
195            && self.rotate.is_some()
196            && self.resources.is_some()
197    }
198
199    fn gather(file: &PdfFile, leaf: &PdfDict) -> Self {
200        let mut attrs = Self::default();
201        let mut visited: HashSet<ObjectId> = HashSet::new();
202        let mut current: Cow<'_, PdfDict> = Cow::Borrowed(leaf);
203        let mut depth = 0usize;
204
205        loop {
206            attrs.absorb(file, &current);
207            if attrs.is_complete() {
208                break;
209            }
210            let parent_ref = match current.get("Parent") {
211                Some(PdfObject::Ref(r)) => *r,
212                _ => break,
213            };
214            depth += 1;
215            if depth > MAX_PAGE_TREE_DEPTH {
216                warn!("page-tree /Parent chain deeper than {MAX_PAGE_TREE_DEPTH}; stopping inheritance walk");
217                break;
218            }
219            if !visited.insert(parent_ref) {
220                warn!("page-tree /Parent cycle at {parent_ref}; stopping inheritance walk");
221                break;
222            }
223            match file.resolve(parent_ref) {
224                Ok(PdfObject::Dict(d)) => current = Cow::Owned(d),
225                Ok(PdfObject::Null) => {
226                    warn!(
227                        "page-tree parent {parent_ref} resolves to null; stopping inheritance walk"
228                    );
229                    break;
230                }
231                Ok(other) => {
232                    warn!(
233                        "page-tree parent {parent_ref} is {}, expected Dict; stopping inheritance walk",
234                        other.type_name()
235                    );
236                    break;
237                }
238                Err(e) => {
239                    warn!("failed to resolve page-tree parent {parent_ref}: {e}");
240                    break;
241                }
242            }
243        }
244        attrs
245    }
246
247    /// Pick up any attribute the walk hasn't found yet from `dict`. Values
248    /// closer to the leaf win, so only `None` slots are filled.
249    fn absorb(&mut self, file: &PdfFile, dict: &PdfDict) {
250        if self.media_box.is_none() {
251            self.media_box = resolve_rect(file, dict, "MediaBox");
252        }
253        if self.crop_box.is_none() {
254            self.crop_box = resolve_rect(file, dict, "CropBox");
255        }
256        if self.rotate.is_none() {
257            self.rotate = resolve_i64(file, dict.get("Rotate")).map(|n| n as i32);
258        }
259        if self.resources.is_none() {
260            if let Some(d) = resolve_sub_dict(dict, "Resources", file) {
261                match parse_resource_dict(&d, file) {
262                    Ok(r) => self.resources = Some(r),
263                    Err(e) => warn!("failed to parse /Resources: {e}"),
264                }
265            }
266        }
267    }
268}
269
270/// Read a rectangle value that may be a direct array, an indirect ref to an
271/// array, or an array whose elements are themselves indirect number refs.
272pub(crate) fn resolve_rect(file: &PdfFile, dict: &PdfDict, key: &str) -> Option<Rect> {
273    let arr: Cow<'_, [PdfObject]> = match dict.get(key)? {
274        PdfObject::Array(a) => Cow::Borrowed(a.as_slice()),
275        PdfObject::Ref(r) => match file.resolve(*r) {
276            Ok(PdfObject::Array(a)) => Cow::Owned(a),
277            Ok(other) => {
278                warn!(
279                    "/{key} ref {r} resolved to {}, expected Array",
280                    other.type_name()
281                );
282                return None;
283            }
284            Err(e) => {
285                warn!("failed to resolve /{key} ref {r}: {e}");
286                return None;
287            }
288        },
289        _ => return None,
290    };
291    if arr.len() != 4 {
292        warn!("/{key} array has {} elements, expected 4", arr.len());
293        return None;
294    }
295    let mut v = [0f64; 4];
296    for (slot, obj) in v.iter_mut().zip(arr.iter()) {
297        *slot = match obj {
298            PdfObject::Ref(r) => file.resolve(*r).ok()?.as_f64().ok()?,
299            other => other.as_f64().ok()?,
300        };
301    }
302    Some(Rect::new(v[0], v[1], v[2], v[3]))
303}
304
305/// Read an integer value that may be direct or an indirect ref.
306fn resolve_i64(file: &PdfFile, value: Option<&PdfObject>) -> Option<i64> {
307    match value? {
308        PdfObject::Integer(n) => Some(*n),
309        PdfObject::Real(r) => Some(*r as i64),
310        PdfObject::Ref(r) => match file.resolve(*r).ok()? {
311            PdfObject::Integer(n) => Some(n),
312            PdfObject::Real(r) => Some(r as i64),
313            _ => None,
314        },
315        _ => None,
316    }
317}
318
319fn resolve_sub_dict<'a>(
320    dict: &'a zpdf_core::PdfDict,
321    key: &str,
322    file: &'a PdfFile,
323) -> Option<std::borrow::Cow<'a, zpdf_core::PdfDict>> {
324    match dict.get(key) {
325        Some(PdfObject::Dict(d)) => Some(std::borrow::Cow::Borrowed(d)),
326        Some(PdfObject::Ref(r)) => file.resolve(*r).ok().and_then(|o| match o {
327            PdfObject::Dict(d) => Some(std::borrow::Cow::Owned(d)),
328            _ => None,
329        }),
330        _ => None,
331    }
332}
333
334pub fn parse_resource_dict(dict: &zpdf_core::PdfDict, file: &PdfFile) -> Result<ResourceDict> {
335    let mut res = ResourceDict::default();
336
337    if let Some(fonts) = resolve_sub_dict(dict, "Font", file) {
338        for (name, obj) in &fonts.0 {
339            if let PdfObject::Ref(r) = obj {
340                res.fonts.insert(name.0.clone(), *r);
341            }
342        }
343    }
344
345    if let Some(xobjects) = resolve_sub_dict(dict, "XObject", file) {
346        for (name, obj) in &xobjects.0 {
347            if let PdfObject::Ref(r) = obj {
348                res.xobjects.insert(name.0.clone(), *r);
349            }
350        }
351    }
352
353    if let Some(gs) = resolve_sub_dict(dict, "ExtGState", file) {
354        for (name, obj) in &gs.0 {
355            match obj {
356                PdfObject::Ref(r) => {
357                    res.ext_g_state.insert(name.0.clone(), *r);
358                }
359                PdfObject::Dict(d) => {
360                    res.ext_g_state_inline.insert(name.0.clone(), d.clone());
361                }
362                _ => {}
363            }
364        }
365    }
366
367    if let Some(cs) = resolve_sub_dict(dict, "ColorSpace", file) {
368        for (name, obj) in &cs.0 {
369            match obj {
370                PdfObject::Ref(r) => {
371                    res.color_spaces.insert(name.0.clone(), *r);
372                }
373                other @ (PdfObject::Array(_) | PdfObject::Name(_)) => {
374                    res.color_spaces_inline
375                        .insert(name.0.clone(), other.clone());
376                }
377                _ => {}
378            }
379        }
380    }
381
382    if let Some(pat) = resolve_sub_dict(dict, "Pattern", file) {
383        for (name, obj) in &pat.0 {
384            if let PdfObject::Ref(r) = obj {
385                res.patterns.insert(name.0.clone(), *r);
386            }
387        }
388    }
389
390    if let Some(sh) = resolve_sub_dict(dict, "Shading", file) {
391        for (name, obj) in &sh.0 {
392            match obj {
393                PdfObject::Ref(r) => {
394                    res.shadings.insert(name.0.clone(), *r);
395                }
396                other @ PdfObject::Dict(_) => {
397                    res.shadings_inline.insert(name.0.clone(), other.clone());
398                }
399                _ => {}
400            }
401        }
402    }
403
404    if let Some(props) = resolve_sub_dict(dict, "Properties", file) {
405        for (name, obj) in &props.0 {
406            match obj {
407                PdfObject::Ref(r) => {
408                    res.properties.insert(name.0.clone(), *r);
409                }
410                PdfObject::Dict(d) => {
411                    res.properties_inline.insert(name.0.clone(), d.clone());
412                }
413                _ => {}
414            }
415        }
416    }
417
418    Ok(res)
419}
420
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_util::build_pdf;
    use crate::PdfDocument;

    // Fixture objects shared by several synthetic documents. Object numbers
    // follow the position in the slice handed to `build_pdf` (1-based).
    const CATALOG: &str = "<< /Type /Catalog /Pages 2 0 R >>";
    const PAGES_PLAIN: &str = "<< /Type /Pages /Kids [3 0 R] /Count 1 >>";
    const PAGES_WITH_ATTRS: &str = "<< /Type /Pages /Kids [3 0 R] /Count 1 /MediaBox [0 0 612 792] /Rotate 90 /Resources << /Font << /F1 4 0 R >> >> >>";
    const HELVETICA: &str = "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>";

    /// Build a synthetic PDF from `objects` and load its first page.
    fn page0(objects: &[&str]) -> PdfPage {
        let bytes = build_pdf(objects);
        let document = PdfDocument::open(bytes).expect("open");
        document.page(0).expect("page")
    }

    /// A page that exists only in memory, carrying just the two boxes.
    fn page_with_boxes(media: Rect, crop: Rect) -> PdfPage {
        PdfPage {
            id: ObjectId(1, 0),
            media_box: media,
            crop_box: crop,
            rotate: 0,
            resources: ResourceDict::default(),
            contents: vec![],
            annots: vec![],
        }
    }

    /// The US Letter box used as MediaBox by the `effective_box` tests.
    fn letter() -> Rect {
        Rect::new(0.0, 0.0, 612.0, 792.0)
    }

    #[test]
    fn rotate_and_resources_inherited_from_pages_node() {
        let page = page0(&[
            CATALOG,
            PAGES_WITH_ATTRS,
            "<< /Type /Page /Parent 2 0 R >>",
            HELVETICA,
        ]);
        assert_eq!(page.rotate, 90);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 612.0, 792.0));
        assert_eq!(page.resources.fonts.get("F1"), Some(&ObjectId(4, 0)));
    }

    #[test]
    fn leaf_attributes_override_inherited() {
        let page = page0(&[
            CATALOG,
            PAGES_WITH_ATTRS,
            "<< /Type /Page /Parent 2 0 R /Rotate 180 /Resources << /Font << /F2 4 0 R >> >> >>",
            HELVETICA,
        ]);
        assert_eq!(page.rotate, 180);
        let fonts = &page.resources.fonts;
        assert!(fonts.contains_key("F2"));
        // The leaf's own /Resources replaces (not merges with) the parent's.
        assert!(!fonts.contains_key("F1"));
    }

    #[test]
    fn indirect_media_and_crop_boxes_resolve() {
        // Object 4 is the whole MediaBox array; object 5 is a single CropBox
        // coordinate.
        let page = page0(&[
            CATALOG,
            PAGES_PLAIN,
            "<< /Type /Page /Parent 2 0 R /MediaBox 4 0 R /CropBox [10 10 5 0 R 200] >>",
            "[0 0 300 400]",
            "100",
        ]);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 300.0, 400.0));
        assert_eq!(page.crop_box, Rect::new(10.0, 10.0, 100.0, 200.0));
    }

    #[test]
    fn parent_cycle_terminates_and_keeps_found_values() {
        // Nodes 2 and 3 name each other as /Parent; the walk must terminate
        // and still pick up the MediaBox found before the cycle closes.
        let page = page0(&[
            CATALOG,
            "<< /Type /Pages /Kids [3 0 R] /Count 1 /Parent 3 0 R /MediaBox [0 0 100 100] >>",
            "<< /Type /Page /Parent 2 0 R >>",
        ]);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 100.0, 100.0));
        assert_eq!(page.rotate, 0);
    }

    #[test]
    fn annots_refs_collected() {
        let page = page0(&[
            CATALOG,
            PAGES_PLAIN,
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] /Annots [4 0 R 5 0 R] >>",
            "<< /Type /Annot /Subtype /Link >>",
            "<< /Type /Annot /Subtype /Square >>",
        ]);
        assert_eq!(page.annots, vec![ObjectId(4, 0), ObjectId(5, 0)]);
    }

    #[test]
    fn effective_box_intersects_crop_with_media() {
        let media = letter();
        let cases = [
            // CropBox inside MediaBox: used as-is.
            (
                Rect::new(10.0, 20.0, 500.0, 700.0),
                Rect::new(10.0, 20.0, 500.0, 700.0),
            ),
            // CropBox sticking out on every side: clamped to the MediaBox.
            (Rect::new(-50.0, -50.0, 700.0, 800.0), media),
            // Partial overlap: the intersection.
            (
                Rect::new(300.0, 400.0, 900.0, 900.0),
                Rect::new(300.0, 400.0, 612.0, 792.0),
            ),
        ];
        for (crop, expected) in cases {
            assert_eq!(page_with_boxes(media, crop).effective_box(), expected);
        }
    }

    #[test]
    fn effective_box_falls_back_to_media_box() {
        let media = letter();
        let crops = [
            // Disjoint CropBox.
            Rect::new(1000.0, 1000.0, 1100.0, 1100.0),
            // Degenerate (zero-area) CropBox.
            Rect::new(100.0, 100.0, 100.0, 100.0),
            // Default: CropBox == MediaBox.
            media,
        ];
        for crop in crops {
            assert_eq!(page_with_boxes(media, crop).effective_box(), media);
        }
    }

    #[test]
    fn effective_box_normalizes_inverted_crop() {
        let inverted_crop = Rect::new(500.0, 700.0, 10.0, 20.0);
        let page = page_with_boxes(letter(), inverted_crop);
        assert_eq!(page.effective_box(), Rect::new(10.0, 20.0, 500.0, 700.0));
    }
}