// zpdf_document/page.rs

1use std::borrow::Cow;
2use std::collections::{HashMap, HashSet};
3
4use tracing::warn;
5use zpdf_core::{ObjectId, PdfDict, PdfObject, Rect, Result};
6use zpdf_parser::PdfFile;
7
/// Hard cap on page-tree walks (`/Parent` chains and `/Kids` recursion).
///
/// Far deeper than any sane document needs; it bounds malformed or adversarial
/// trees in concert with the visited-set cycle checks.
pub(crate) const MAX_PAGE_TREE_DEPTH: usize = 64;
12
/// A single page as built by `PdfPage::from_object`: its boxes, rotation,
/// parsed resources, and the object ids of its content streams and
/// annotations.
#[derive(Debug)]
pub struct PdfPage {
    /// Object id of the page dictionary this page was built from.
    pub id: ObjectId,
    /// `/MediaBox`, taken from the page itself or inherited through `/Parent`.
    pub media_box: Rect,
    /// `/CropBox`, own or inherited; equals `media_box` when none is found.
    pub crop_box: Rect,
    /// `/Rotate` value, own or inherited; `0` when none is found.
    pub rotate: i32,
    /// Parsed `/Resources`, own or inherited; empty when none is found or
    /// none could be parsed.
    pub resources: ResourceDict,
    /// Object ids collected from `/Contents` (a single ref, or the refs of a
    /// direct or indirect array).
    pub contents: Vec<ObjectId>,
    /// Annotation object ids from `/Annots`, parsed but not yet rendered.
    pub annots: Vec<ObjectId>,
}
24
/// Named resources of a `/Resources` dictionary, one map per category.
///
/// Entries given as indirect references are stored by object id. Categories
/// that have an `*_inline` companion map also keep direct (non-reference)
/// values there; in the other categories direct values are dropped.
#[derive(Debug, Default)]
pub struct ResourceDict {
    /// `/Font` entries given as indirect references.
    pub fonts: HashMap<String, ObjectId>,
    /// `/XObject` entries given as indirect references.
    pub xobjects: HashMap<String, ObjectId>,
    /// `/ExtGState` entries given as indirect references.
    pub ext_g_state: HashMap<String, ObjectId>,
    /// `/ExtGState` entries given as direct dictionaries.
    pub ext_g_state_inline: HashMap<String, zpdf_core::PdfDict>,
    /// `/ColorSpace` entries given as indirect references.
    pub color_spaces: HashMap<String, ObjectId>,
    /// Colorspace resources whose value is a direct array/name rather than a
    /// reference (common from Quartz and Ghostscript).
    pub color_spaces_inline: HashMap<String, PdfObject>,
    /// `/Pattern` entries given as indirect references.
    pub patterns: HashMap<String, ObjectId>,
    /// `/Shading` entries given as indirect references.
    pub shadings: HashMap<String, ObjectId>,
    /// `/Shading` entries given as direct dictionaries, stored as the whole
    /// object.
    pub shadings_inline: HashMap<String, PdfObject>,
    /// `/Properties` entries given as indirect references (marked-content
    /// property lists, e.g. BDC /OC lookups).
    pub properties: HashMap<String, ObjectId>,
    /// `/Properties` entries given as direct dictionaries.
    pub properties_inline: HashMap<String, zpdf_core::PdfDict>,
}
42
43impl PdfPage {
44    pub fn from_object(file: &PdfFile, page_id: ObjectId) -> Result<Self> {
45        let obj = file.resolve(page_id)?;
46        let dict = obj.as_dict()?;
47
48        // MediaBox, CropBox, Rotate and Resources are all inheritable page
49        // attributes (PDF 32000-1 Table 31): one guarded walk up /Parent
50        // gathers whichever values the leaf doesn't carry itself.
51        let inherited = InheritedAttrs::gather(file, dict);
52
53        let media_box = inherited
54            .media_box
55            .ok_or_else(|| zpdf_core::Error::MissingKey("MediaBox".into()))?;
56        let crop_box = inherited.crop_box.unwrap_or(media_box);
57        let rotate = inherited.rotate.unwrap_or(0);
58        let resources = inherited.resources.unwrap_or_default();
59
60        let contents = Self::collect_content_refs(file, dict.get("Contents"));
61        let annots = Self::collect_annot_refs(file, dict.get("Annots"));
62
63        Ok(Self {
64            id: page_id,
65            media_box,
66            crop_box,
67            rotate,
68            resources,
69            contents,
70            annots,
71        })
72    }
73
74    /// Collect the page's content-stream object ids from `/Contents`, which may
75    /// be: a single stream ref; a direct array of stream refs; or — as some
76    /// scanners emit — an indirect ref *to* an array of stream refs (double
77    /// indirection). The latter is resolved one level so the array is flattened
78    /// rather than mistaken for a single (non-stream) object.
79    fn collect_content_refs(file: &PdfFile, contents: Option<&PdfObject>) -> Vec<ObjectId> {
80        fn refs_from_array(arr: &[PdfObject]) -> Vec<ObjectId> {
81            arr.iter()
82                .filter_map(|o| match o {
83                    PdfObject::Ref(r) => Some(*r),
84                    _ => None,
85                })
86                .collect()
87        }
88        match contents {
89            Some(PdfObject::Array(arr)) => refs_from_array(arr),
90            Some(PdfObject::Ref(r)) => match file.resolve(*r) {
91                // Ref → array of stream refs: flatten it.
92                Ok(PdfObject::Array(arr)) => refs_from_array(&arr),
93                // Ref → a single content stream: keep the ref itself.
94                Ok(PdfObject::Stream(_)) => vec![*r],
95                // Anything else (incl. resolve failure): treat as the lone ref so
96                // a later resolve attempt surfaces the real error.
97                _ => vec![*r],
98            },
99            _ => vec![],
100        }
101    }
102
103    /// Collect annotation object ids from `/Annots` (a direct array or a ref
104    /// to an array). Parse-only plumbing: appearance streams are not rendered.
105    fn collect_annot_refs(file: &PdfFile, annots: Option<&PdfObject>) -> Vec<ObjectId> {
106        fn refs_from_array(arr: &[PdfObject]) -> Vec<ObjectId> {
107            arr.iter()
108                .filter_map(|o| match o {
109                    PdfObject::Ref(r) => Some(*r),
110                    _ => None,
111                })
112                .collect()
113        }
114        match annots {
115            Some(PdfObject::Array(arr)) => refs_from_array(arr),
116            Some(PdfObject::Ref(r)) => match file.resolve(*r) {
117                Ok(PdfObject::Array(arr)) => refs_from_array(&arr),
118                _ => Vec::new(),
119            },
120            _ => Vec::new(),
121        }
122    }
123
124    pub fn width(&self) -> f64 {
125        self.media_box.width()
126    }
127
128    pub fn height(&self) -> f64 {
129        self.media_box.height()
130    }
131
132    /// The rectangle the page is rendered into: `/CropBox` intersected with
133    /// `/MediaBox`. Per spec a CropBox extending beyond the MediaBox is
134    /// clamped to it; an empty or non-overlapping CropBox falls back to the
135    /// full MediaBox.
136    pub fn effective_box(&self) -> Rect {
137        let media = self.media_box.normalize();
138        let crop = self.crop_box.normalize();
139        let inter = Rect::new(
140            crop.x0.max(media.x0),
141            crop.y0.max(media.y0),
142            crop.x1.min(media.x1),
143            crop.y1.min(media.y1),
144        );
145        if inter.x1 > inter.x0 && inter.y1 > inter.y0 {
146            inter
147        } else {
148            media
149        }
150    }
151}
152
/// Inheritable page attributes (PDF 32000-1 Table 31), filled in leaf-first
/// while walking up the `/Parent` chain with cycle and depth guards.
///
/// Every field starts as `None` and is set by the first node on the walk
/// that supplies a usable value.
#[derive(Default)]
struct InheritedAttrs {
    /// `/MediaBox`, once a readable one has been found.
    media_box: Option<Rect>,
    /// `/CropBox`, once a readable one has been found.
    crop_box: Option<Rect>,
    /// `/Rotate`, once a readable one has been found.
    rotate: Option<i32>,
    /// Parsed `/Resources`, once a dictionary has been found and parsed.
    resources: Option<ResourceDict>,
}
162
163impl InheritedAttrs {
164    fn is_complete(&self) -> bool {
165        self.media_box.is_some()
166            && self.crop_box.is_some()
167            && self.rotate.is_some()
168            && self.resources.is_some()
169    }
170
171    fn gather(file: &PdfFile, leaf: &PdfDict) -> Self {
172        let mut attrs = Self::default();
173        let mut visited: HashSet<ObjectId> = HashSet::new();
174        let mut current: Cow<'_, PdfDict> = Cow::Borrowed(leaf);
175        let mut depth = 0usize;
176
177        loop {
178            attrs.absorb(file, &current);
179            if attrs.is_complete() {
180                break;
181            }
182            let parent_ref = match current.get("Parent") {
183                Some(PdfObject::Ref(r)) => *r,
184                _ => break,
185            };
186            depth += 1;
187            if depth > MAX_PAGE_TREE_DEPTH {
188                warn!("page-tree /Parent chain deeper than {MAX_PAGE_TREE_DEPTH}; stopping inheritance walk");
189                break;
190            }
191            if !visited.insert(parent_ref) {
192                warn!("page-tree /Parent cycle at {parent_ref}; stopping inheritance walk");
193                break;
194            }
195            match file.resolve(parent_ref) {
196                Ok(PdfObject::Dict(d)) => current = Cow::Owned(d),
197                Ok(PdfObject::Null) => {
198                    warn!(
199                        "page-tree parent {parent_ref} resolves to null; stopping inheritance walk"
200                    );
201                    break;
202                }
203                Ok(other) => {
204                    warn!(
205                        "page-tree parent {parent_ref} is {}, expected Dict; stopping inheritance walk",
206                        other.type_name()
207                    );
208                    break;
209                }
210                Err(e) => {
211                    warn!("failed to resolve page-tree parent {parent_ref}: {e}");
212                    break;
213                }
214            }
215        }
216        attrs
217    }
218
219    /// Pick up any attribute the walk hasn't found yet from `dict`. Values
220    /// closer to the leaf win, so only `None` slots are filled.
221    fn absorb(&mut self, file: &PdfFile, dict: &PdfDict) {
222        if self.media_box.is_none() {
223            self.media_box = resolve_rect(file, dict, "MediaBox");
224        }
225        if self.crop_box.is_none() {
226            self.crop_box = resolve_rect(file, dict, "CropBox");
227        }
228        if self.rotate.is_none() {
229            self.rotate = resolve_i64(file, dict.get("Rotate")).map(|n| n as i32);
230        }
231        if self.resources.is_none() {
232            if let Some(d) = resolve_sub_dict(dict, "Resources", file) {
233                match parse_resource_dict(&d, file) {
234                    Ok(r) => self.resources = Some(r),
235                    Err(e) => warn!("failed to parse /Resources: {e}"),
236                }
237            }
238        }
239    }
240}
241
242/// Read a rectangle value that may be a direct array, an indirect ref to an
243/// array, or an array whose elements are themselves indirect number refs.
244pub(crate) fn resolve_rect(file: &PdfFile, dict: &PdfDict, key: &str) -> Option<Rect> {
245    let arr: Cow<'_, [PdfObject]> = match dict.get(key)? {
246        PdfObject::Array(a) => Cow::Borrowed(a.as_slice()),
247        PdfObject::Ref(r) => match file.resolve(*r) {
248            Ok(PdfObject::Array(a)) => Cow::Owned(a),
249            Ok(other) => {
250                warn!(
251                    "/{key} ref {r} resolved to {}, expected Array",
252                    other.type_name()
253                );
254                return None;
255            }
256            Err(e) => {
257                warn!("failed to resolve /{key} ref {r}: {e}");
258                return None;
259            }
260        },
261        _ => return None,
262    };
263    if arr.len() != 4 {
264        warn!("/{key} array has {} elements, expected 4", arr.len());
265        return None;
266    }
267    let mut v = [0f64; 4];
268    for (slot, obj) in v.iter_mut().zip(arr.iter()) {
269        *slot = match obj {
270            PdfObject::Ref(r) => file.resolve(*r).ok()?.as_f64().ok()?,
271            other => other.as_f64().ok()?,
272        };
273    }
274    Some(Rect::new(v[0], v[1], v[2], v[3]))
275}
276
277/// Read an integer value that may be direct or an indirect ref.
278fn resolve_i64(file: &PdfFile, value: Option<&PdfObject>) -> Option<i64> {
279    match value? {
280        PdfObject::Integer(n) => Some(*n),
281        PdfObject::Real(r) => Some(*r as i64),
282        PdfObject::Ref(r) => match file.resolve(*r).ok()? {
283            PdfObject::Integer(n) => Some(n),
284            PdfObject::Real(r) => Some(r as i64),
285            _ => None,
286        },
287        _ => None,
288    }
289}
290
291fn resolve_sub_dict<'a>(
292    dict: &'a zpdf_core::PdfDict,
293    key: &str,
294    file: &'a PdfFile,
295) -> Option<std::borrow::Cow<'a, zpdf_core::PdfDict>> {
296    match dict.get(key) {
297        Some(PdfObject::Dict(d)) => Some(std::borrow::Cow::Borrowed(d)),
298        Some(PdfObject::Ref(r)) => file.resolve(*r).ok().and_then(|o| match o {
299            PdfObject::Dict(d) => Some(std::borrow::Cow::Owned(d)),
300            _ => None,
301        }),
302        _ => None,
303    }
304}
305
306pub fn parse_resource_dict(dict: &zpdf_core::PdfDict, file: &PdfFile) -> Result<ResourceDict> {
307    let mut res = ResourceDict::default();
308
309    if let Some(fonts) = resolve_sub_dict(dict, "Font", file) {
310        for (name, obj) in &fonts.0 {
311            if let PdfObject::Ref(r) = obj {
312                res.fonts.insert(name.0.clone(), *r);
313            }
314        }
315    }
316
317    if let Some(xobjects) = resolve_sub_dict(dict, "XObject", file) {
318        for (name, obj) in &xobjects.0 {
319            if let PdfObject::Ref(r) = obj {
320                res.xobjects.insert(name.0.clone(), *r);
321            }
322        }
323    }
324
325    if let Some(gs) = resolve_sub_dict(dict, "ExtGState", file) {
326        for (name, obj) in &gs.0 {
327            match obj {
328                PdfObject::Ref(r) => {
329                    res.ext_g_state.insert(name.0.clone(), *r);
330                }
331                PdfObject::Dict(d) => {
332                    res.ext_g_state_inline.insert(name.0.clone(), d.clone());
333                }
334                _ => {}
335            }
336        }
337    }
338
339    if let Some(cs) = resolve_sub_dict(dict, "ColorSpace", file) {
340        for (name, obj) in &cs.0 {
341            match obj {
342                PdfObject::Ref(r) => {
343                    res.color_spaces.insert(name.0.clone(), *r);
344                }
345                other @ (PdfObject::Array(_) | PdfObject::Name(_)) => {
346                    res.color_spaces_inline
347                        .insert(name.0.clone(), other.clone());
348                }
349                _ => {}
350            }
351        }
352    }
353
354    if let Some(pat) = resolve_sub_dict(dict, "Pattern", file) {
355        for (name, obj) in &pat.0 {
356            if let PdfObject::Ref(r) = obj {
357                res.patterns.insert(name.0.clone(), *r);
358            }
359        }
360    }
361
362    if let Some(sh) = resolve_sub_dict(dict, "Shading", file) {
363        for (name, obj) in &sh.0 {
364            match obj {
365                PdfObject::Ref(r) => {
366                    res.shadings.insert(name.0.clone(), *r);
367                }
368                other @ PdfObject::Dict(_) => {
369                    res.shadings_inline.insert(name.0.clone(), other.clone());
370                }
371                _ => {}
372            }
373        }
374    }
375
376    if let Some(props) = resolve_sub_dict(dict, "Properties", file) {
377        for (name, obj) in &props.0 {
378            match obj {
379                PdfObject::Ref(r) => {
380                    res.properties.insert(name.0.clone(), *r);
381                }
382                PdfObject::Dict(d) => {
383                    res.properties_inline.insert(name.0.clone(), d.clone());
384                }
385                _ => {}
386            }
387        }
388    }
389
390    Ok(res)
391}
392
// Unit tests for page construction (attribute inheritance, indirect boxes,
// /Parent cycles, /Annots collection) and for `PdfPage::effective_box`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_util::build_pdf;
    use crate::PdfDocument;

    /// Open a synthetic PDF and return its first page.
    ///
    /// NOTE(review): the fixtures below assume `build_pdf` numbers `objects`
    /// 1, 2, 3, … in slice order (generation 0) — confirm against `test_util`.
    fn page0(objects: &[&str]) -> PdfPage {
        let doc = PdfDocument::open(build_pdf(objects)).expect("open");
        doc.page(0).expect("page")
    }

    /// A bare leaf page picks up MediaBox, Rotate and Resources from its
    /// parent /Pages node.
    #[test]
    fn rotate_and_resources_inherited_from_pages_node() {
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 /MediaBox [0 0 612 792] /Rotate 90 /Resources << /Font << /F1 4 0 R >> >> >>",
            "<< /Type /Page /Parent 2 0 R >>",
            "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
        ]);
        assert_eq!(page.rotate, 90);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 612.0, 792.0));
        assert_eq!(page.resources.fonts.get("F1"), Some(&ObjectId(4, 0)));
    }

    /// Values on the leaf take precedence over the parent's; the MediaBox is
    /// still inherited because the leaf has none.
    #[test]
    fn leaf_attributes_override_inherited() {
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 /MediaBox [0 0 612 792] /Rotate 90 /Resources << /Font << /F1 4 0 R >> >> >>",
            "<< /Type /Page /Parent 2 0 R /Rotate 180 /Resources << /Font << /F2 4 0 R >> >> >>",
            "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
        ]);
        assert_eq!(page.rotate, 180);
        assert!(page.resources.fonts.contains_key("F2"));
        // The leaf's own /Resources replaces (not merges with) the parent's.
        assert!(!page.resources.fonts.contains_key("F1"));
    }

    /// A MediaBox behind a ref and a CropBox with one indirect element both
    /// resolve to plain rectangles.
    #[test]
    fn indirect_media_and_crop_boxes_resolve() {
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox 4 0 R /CropBox [10 10 5 0 R 200] >>",
            "[0 0 300 400]",
            "100",
        ]);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 300.0, 400.0));
        assert_eq!(page.crop_box, Rect::new(10.0, 10.0, 100.0, 200.0));
    }

    /// A /Parent loop must not hang the inheritance walk.
    #[test]
    fn parent_cycle_terminates_and_keeps_found_values() {
        // Nodes 2 and 3 name each other as /Parent; the walk must terminate
        // and still pick up the MediaBox found before the cycle closes.
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 /Parent 3 0 R /MediaBox [0 0 100 100] >>",
            "<< /Type /Page /Parent 2 0 R >>",
        ]);
        assert_eq!(page.media_box, Rect::new(0.0, 0.0, 100.0, 100.0));
        // No /Rotate anywhere on the chain: the default applies.
        assert_eq!(page.rotate, 0);
    }

    /// Refs in a direct /Annots array are collected in order.
    #[test]
    fn annots_refs_collected() {
        let page = page0(&[
            "<< /Type /Catalog /Pages 2 0 R >>",
            "<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
            "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] /Annots [4 0 R 5 0 R] >>",
            "<< /Type /Annot /Subtype /Link >>",
            "<< /Type /Annot /Subtype /Square >>",
        ]);
        assert_eq!(page.annots, vec![ObjectId(4, 0), ObjectId(5, 0)]);
    }

    /// Build a page directly from two boxes, bypassing PDF parsing, for the
    /// `effective_box` tests below.
    fn page_with_boxes(media: Rect, crop: Rect) -> PdfPage {
        PdfPage {
            id: ObjectId(1, 0),
            media_box: media,
            crop_box: crop,
            rotate: 0,
            resources: ResourceDict::default(),
            contents: vec![],
            annots: vec![],
        }
    }

    /// Overlapping boxes: the effective box is their intersection.
    #[test]
    fn effective_box_intersects_crop_with_media() {
        let media = Rect::new(0.0, 0.0, 612.0, 792.0);
        // CropBox inside MediaBox: used as-is.
        let p = page_with_boxes(media, Rect::new(10.0, 20.0, 500.0, 700.0));
        assert_eq!(p.effective_box(), Rect::new(10.0, 20.0, 500.0, 700.0));
        // CropBox sticking out on every side: clamped to the MediaBox.
        let p = page_with_boxes(media, Rect::new(-50.0, -50.0, 700.0, 800.0));
        assert_eq!(p.effective_box(), media);
        // Partial overlap: the intersection.
        let p = page_with_boxes(media, Rect::new(300.0, 400.0, 900.0, 900.0));
        assert_eq!(p.effective_box(), Rect::new(300.0, 400.0, 612.0, 792.0));
    }

    /// No usable overlap: the effective box is the MediaBox.
    #[test]
    fn effective_box_falls_back_to_media_box() {
        let media = Rect::new(0.0, 0.0, 612.0, 792.0);
        // Disjoint CropBox.
        let p = page_with_boxes(media, Rect::new(1000.0, 1000.0, 1100.0, 1100.0));
        assert_eq!(p.effective_box(), media);
        // Degenerate (zero-area) CropBox.
        let p = page_with_boxes(media, Rect::new(100.0, 100.0, 100.0, 100.0));
        assert_eq!(p.effective_box(), media);
        // Default: CropBox == MediaBox.
        let p = page_with_boxes(media, media);
        assert_eq!(p.effective_box(), media);
    }

    /// A CropBox given with swapped corners is normalized before clipping.
    #[test]
    fn effective_box_normalizes_inverted_crop() {
        let media = Rect::new(0.0, 0.0, 612.0, 792.0);
        let p = page_with_boxes(media, Rect::new(500.0, 700.0, 10.0, 20.0));
        assert_eq!(p.effective_box(), Rect::new(10.0, 20.0, 500.0, 700.0));
    }
}