Skip to main content

pdf_syntax/
page.rs

1//! Reading the pages of a PDF document.
2
3use crate::content::{TypedIter, UntypedIter};
4use crate::object::Array;
5use crate::object::Dict;
6use crate::object::Name;
7use crate::object::Rect;
8use crate::object::Stream;
9use crate::object::dict::keys::*;
10use crate::object::{Object, ObjectLike};
11use crate::reader::ReaderContext;
12use crate::sync::OnceLock;
13use crate::util::FloatExt;
14use crate::xref::XRef;
15use alloc::boxed::Box;
16use alloc::vec;
17use alloc::vec::Vec;
18use core::ops::Deref;
19use log::warn;
20
21/// Attributes that can be inherited.
22#[derive(Debug, Clone)]
23struct PagesContext {
24    media_box: Option<Rect>,
25    crop_box: Option<Rect>,
26    rotate: Option<u32>,
27}
28
29impl PagesContext {
30    fn new() -> Self {
31        Self {
32            media_box: None,
33            crop_box: None,
34            rotate: None,
35        }
36    }
37}
38
39/// A structure holding the pages of a PDF document.
40pub struct Pages<'a> {
41    pages: Vec<Page<'a>>,
42    xref: &'a XRef,
43}
44
45impl<'a> Pages<'a> {
46    /// Create a new `Pages` object.
47    pub(crate) fn new(
48        pages_dict: &Dict<'a>,
49        ctx: &ReaderContext<'a>,
50        xref: &'a XRef,
51    ) -> Option<Self> {
52        let mut pages = vec![];
53        let pages_ctx = PagesContext::new();
54        resolve_pages(
55            pages_dict,
56            &mut pages,
57            pages_ctx,
58            Resources::new(Dict::empty(), None, ctx),
59        )?;
60
61        Some(Self { pages, xref })
62    }
63
64    /// Create a new `Pages` object by bruteforce-searching.
65    ///
66    /// Of course this could result in the order of pages being messed up, but
67    /// this is still better than nothing.
68    pub(crate) fn new_brute_force(ctx: &ReaderContext<'a>, xref: &'a XRef) -> Option<Self> {
69        let mut pages = vec![];
70
71        for object in xref.objects() {
72            if let Some(dict) = object.into_dict()
73                && let Some(page) = Page::new(
74                    &dict,
75                    &PagesContext::new(),
76                    Resources::new(Dict::empty(), None, ctx),
77                    true,
78                )
79            {
80                pages.push(page);
81            }
82        }
83
84        if pages.is_empty() {
85            return None;
86        }
87
88        Some(Self { pages, xref })
89    }
90
91    /// Return the xref table (of the document the pages belong to).   
92    pub fn xref(&self) -> &'a XRef {
93        self.xref
94    }
95}
96
97impl<'a> Deref for Pages<'a> {
98    type Target = [Page<'a>];
99
100    fn deref(&self) -> &Self::Target {
101        &self.pages
102    }
103}
104
/// Upper bound on the recursion depth while walking the page tree.
///
/// Guards against a stack overflow when a malformed PDF contains a deeply
/// nested or circular page tree.
const MAX_PAGE_TREE_DEPTH: usize = 256;

/// Upper bound on the number of pages collected from the page tree.
///
/// Guards against "page tree bombs", in which `Kids` nodes shared between
/// several parents make the same subtrees get visited again and again
/// (e.g. an 8 KB PDF expanding to 2^27 pages).
const MAX_PAGE_COUNT: usize = 100_000;
113
114fn resolve_pages<'a>(
115    pages_dict: &Dict<'a>,
116    entries: &mut Vec<Page<'a>>,
117    ctx: PagesContext,
118    resources: Resources<'a>,
119) -> Option<()> {
120    let max_depth = resources
121        .ctx
122        .load_limits()
123        .object_depth_limit()
124        .map(|d| d as usize)
125        .unwrap_or(MAX_PAGE_TREE_DEPTH);
126
127    resolve_pages_depth(pages_dict, entries, ctx, resources, 0, max_depth)
128}
129
130fn resolve_pages_depth<'a>(
131    pages_dict: &Dict<'a>,
132    entries: &mut Vec<Page<'a>>,
133    mut ctx: PagesContext,
134    resources: Resources<'a>,
135    depth: usize,
136    max_depth: usize,
137) -> Option<()> {
138    if depth > max_depth {
139        log::warn!("Page tree depth exceeds {max_depth}, stopping traversal");
140        return None;
141    }
142
143    if let Some(media_box) = pages_dict.get::<Rect>(MEDIA_BOX) {
144        ctx.media_box = Some(media_box);
145    }
146
147    if let Some(crop_box) = pages_dict.get::<Rect>(CROP_BOX) {
148        ctx.crop_box = Some(crop_box);
149    }
150
151    // /Rotate may be negative (e.g. -90), which is valid per the PDF spec.
152    // Normalise to [0, 360) via Euclidean remainder so u32 storage is safe.
153    if let Some(rotate) = pages_dict.get::<i32>(ROTATE) {
154        ctx.rotate = Some(rotate.rem_euclid(360) as u32);
155    }
156
157    let resources = Resources::from_parent(
158        pages_dict.get::<Dict<'_>>(RESOURCES).unwrap_or_default(),
159        resources.clone(),
160    );
161
162    let kids = pages_dict.get::<Array<'a>>(KIDS)?;
163
164    for dict in kids.iter::<Dict<'_>>() {
165        if entries.len() >= MAX_PAGE_COUNT {
166            log::warn!("Page count exceeds {MAX_PAGE_COUNT}, stopping page tree traversal");
167            return Some(());
168        }
169
170        match dict.get::<Name>(TYPE).as_deref() {
171            Some(PAGES) => {
172                resolve_pages_depth(
173                    &dict,
174                    entries,
175                    ctx.clone(),
176                    resources.clone(),
177                    depth + 1,
178                    max_depth,
179                );
180            }
181            // Let's be lenient and assume it's a `Page` in case it's `None` or something else
182            // (see corpus test case 0083781).
183            _ => {
184                if let Some(page) = Page::new(&dict, &ctx, resources.clone(), false) {
185                    entries.push(page);
186                }
187            }
188        }
189    }
190
191    Some(())
192}
193
/// How a page is rotated.
#[derive(Clone, Copy, Debug)]
pub enum Rotation {
    /// The page is not rotated.
    None,
    /// The page is rotated by 90 degrees.
    Horizontal,
    /// The page is rotated by 180 degrees.
    Flipped,
    /// The page is rotated by 270 degrees.
    FlippedHorizontal,
}
206
207/// A PDF page.
208pub struct Page<'a> {
209    inner: Dict<'a>,
210    media_box: Rect,
211    crop_box: Rect,
212    rotation: Rotation,
213    page_streams: OnceLock<Option<Vec<u8>>>,
214    resources: Resources<'a>,
215    ctx: ReaderContext<'a>,
216}
217
218impl<'a> Page<'a> {
219    fn new(
220        dict: &Dict<'a>,
221        ctx: &PagesContext,
222        resources: Resources<'a>,
223        brute_force: bool,
224    ) -> Option<Self> {
225        // In general, pages without content are allowed, but in case we are brute-forcing
226        // we ignore them.
227        if brute_force && !dict.contains_key(CONTENTS) {
228            return None;
229        }
230
231        let media_box = dict
232            .get::<Rect>(MEDIA_BOX)
233            .or(ctx.media_box)
234            .unwrap_or(US_LETTER);
235
236        let crop_box = dict
237            .get::<Rect>(CROP_BOX)
238            .or(ctx.crop_box)
239            .unwrap_or(media_box);
240
241        let rotation = match dict
242            .get::<i32>(ROTATE)
243            .map(|r| r.rem_euclid(360) as u32)
244            .or(ctx.rotate)
245            .unwrap_or(0)
246        {
247            0 => Rotation::None,
248            90 => Rotation::Horizontal,
249            180 => Rotation::Flipped,
250            270 => Rotation::FlippedHorizontal,
251            _ => Rotation::None,
252        };
253
254        let ctx = resources.ctx.clone();
255        let resources = Resources::from_parent(
256            dict.get::<Dict<'_>>(RESOURCES).unwrap_or_default(),
257            resources,
258        );
259
260        Some(Self {
261            inner: dict.clone(),
262            media_box,
263            crop_box,
264            rotation,
265            page_streams: OnceLock::new(),
266            resources,
267            ctx,
268        })
269    }
270
271    fn operations_impl(&self) -> Option<UntypedIter<'_>> {
272        let stream = self.page_stream()?;
273        let iter = UntypedIter::new(stream);
274
275        Some(iter)
276    }
277
278    /// Return the decoded content stream of the page.
279    pub fn page_stream(&self) -> Option<&[u8]> {
280        let convert_single = |s: Stream<'_>| {
281            let data = s.decoded().ok()?;
282            Some(data.to_vec())
283        };
284
285        self.page_streams
286            .get_or_init(|| {
287                if let Some(stream) = self.inner.get::<Stream<'_>>(CONTENTS) {
288                    convert_single(stream)
289                } else if let Some(array) = self.inner.get::<Array<'_>>(CONTENTS) {
290                    let streams = array.iter::<Stream<'_>>().flat_map(convert_single);
291
292                    let mut collected = vec![];
293
294                    for stream in streams {
295                        collected.extend(stream);
296                        // Streams must have at least one whitespace in-between.
297                        collected.push(b' ');
298                    }
299
300                    Some(collected)
301                } else {
302                    warn!("contents entry of page was neither stream nor array of streams");
303
304                    None
305                }
306            })
307            .as_ref()
308            .map(|d| d.as_slice())
309    }
310
311    /// Get the resources of the page.
312    pub fn resources(&self) -> &Resources<'a> {
313        &self.resources
314    }
315
316    /// Get the media box of the page.
317    pub fn media_box(&self) -> Rect {
318        self.media_box
319    }
320
321    /// Get the rotation of the page.
322    pub fn rotation(&self) -> Rotation {
323        self.rotation
324    }
325
326    /// Get the crop box of the page.
327    pub fn crop_box(&self) -> Rect {
328        self.crop_box
329    }
330
331    /// Return the intersection of crop box and media box.
332    pub fn intersected_crop_box(&self) -> Rect {
333        self.crop_box().intersect(self.media_box())
334    }
335
336    /// Return the base dimensions of the page used for the canvas size.
337    ///
338    /// When the CropBox origin is within the MediaBox (i.e. CropBox.x0 >=
339    /// MediaBox.x0 and CropBox.y0 >= MediaBox.y0), the canvas is sized to
340    /// intersect(CropBox, MediaBox).  This matches MuPDF's behaviour for
341    /// spec-violating PDFs where CropBox.y1 > MediaBox.y1 (e.g. gen-271:
342    /// CropBox=[0,0,595,793.7] vs MediaBox=[0,0,612,792] — using raw
343    /// CropBox gives 1654px height, a 4px vertical content offset, and
344    /// SSIM 0.49; intersecting gives 1650px matching MuPDF exactly).
345    ///
346    /// When CropBox extends below the MediaBox origin (gen-802 style:
347    /// CropBox=[0,0,684,864] vs MediaBox=[36,36,648,828]), MuPDF still uses
348    /// the full CropBox dimensions, so we do too. (#544, #558, gen-271)
349    pub fn base_dimensions(&self) -> (f32, f32) {
350        let crop_box = self.crop_box();
351        let media_box = self.media_box();
352
353        // Clip to MediaBox only when the CropBox origin lies within the
354        // MediaBox (both axes).  When the CropBox extends below the MediaBox
355        // origin MuPDF uses the raw CropBox, so preserve that behaviour.
356        let effective = if crop_box.x0 >= media_box.x0 && crop_box.y0 >= media_box.y0 {
357            crop_box.intersect(media_box)
358        } else {
359            crop_box
360        };
361
362        if (effective.width() as f32).is_nearly_zero()
363            || (effective.height() as f32).is_nearly_zero()
364        {
365            (US_LETTER.width() as f32, US_LETTER.height() as f32)
366        } else {
367            (
368                effective.width().max(1.0) as f32,
369                effective.height().max(1.0) as f32,
370            )
371        }
372    }
373
374    /// Return the with and height of the page that should be assumed when rendering the page.
375    ///
376    /// Depending on the document, it is either based on the media box or the crop box
377    /// of the page. In addition to that, it also takes the rotation of the page into account.
378    pub fn render_dimensions(&self) -> (f32, f32) {
379        let (mut base_width, mut base_height) = self.base_dimensions();
380
381        if matches!(
382            self.rotation(),
383            Rotation::Horizontal | Rotation::FlippedHorizontal
384        ) {
385            core::mem::swap(&mut base_width, &mut base_height);
386        }
387
388        (base_width, base_height)
389    }
390
391    /// Return an untyped iterator over the operators of the page's content stream.
392    pub fn operations(&self) -> UntypedIter<'_> {
393        self.operations_impl().unwrap_or(UntypedIter::empty())
394    }
395
396    /// Get the raw dictionary of the page.
397    pub fn raw(&self) -> &Dict<'a> {
398        &self.inner
399    }
400
401    /// Get the xref table (of the document the page belongs to).
402    pub fn xref(&self) -> &'a XRef {
403        self.ctx.xref()
404    }
405
406    /// Return a typed iterator over the operators of the page's content stream.
407    pub fn typed_operations(&self) -> TypedIter<'_> {
408        TypedIter::from_untyped(self.operations())
409    }
410
411    /// Return the annotation dictionaries for this page, if any.
412    pub fn annots(&self) -> Vec<Dict<'a>> {
413        self.inner
414            .get::<Array<'_>>(crate::object::dict::keys::ANNOTS)
415            .map(|arr| arr.iter::<Dict<'_>>().collect())
416            .unwrap_or_default()
417    }
418}
419
420/// A structure keeping track of the resources of a page.
421#[derive(Clone, Debug)]
422pub struct Resources<'a> {
423    parent: Option<Box<Self>>,
424    ctx: ReaderContext<'a>,
425    /// The raw dictionary of external graphics states.
426    pub ext_g_states: Dict<'a>,
427    /// The raw dictionary of fonts.
428    pub fonts: Dict<'a>,
429    /// The raw dictionary of properties.
430    pub properties: Dict<'a>,
431    /// The raw dictionary of color spaces.
432    pub color_spaces: Dict<'a>,
433    /// The raw dictionary of x objects.
434    pub x_objects: Dict<'a>,
435    /// The raw dictionary of patterns.
436    pub patterns: Dict<'a>,
437    /// The raw dictionary of shadings.
438    pub shadings: Dict<'a>,
439}
440
441impl<'a> Resources<'a> {
442    /// Create a new `Resources` object from a dictionary with a parent.
443    pub fn from_parent(resources: Dict<'a>, parent: Self) -> Self {
444        let ctx = parent.ctx.clone();
445
446        Self::new(resources, Some(parent), &ctx)
447    }
448
449    /// Create a new `Resources` object.
450    pub(crate) fn new(resources: Dict<'a>, parent: Option<Self>, ctx: &ReaderContext<'a>) -> Self {
451        let ext_g_states = resources.get::<Dict<'_>>(EXT_G_STATE).unwrap_or_default();
452        let fonts = resources.get::<Dict<'_>>(FONT).unwrap_or_default();
453        let color_spaces = resources.get::<Dict<'_>>(COLORSPACE).unwrap_or_default();
454        let x_objects = resources.get::<Dict<'_>>(XOBJECT).unwrap_or_default();
455        let patterns = resources.get::<Dict<'_>>(PATTERN).unwrap_or_default();
456        let shadings = resources.get::<Dict<'_>>(SHADING).unwrap_or_default();
457        let properties = resources.get::<Dict<'_>>(PROPERTIES).unwrap_or_default();
458
459        let parent = parent.map(Box::new);
460
461        Self {
462            parent,
463            ext_g_states,
464            fonts,
465            color_spaces,
466            properties,
467            x_objects,
468            patterns,
469            shadings,
470            ctx: ctx.clone(),
471        }
472    }
473
474    fn get_resource<T: ObjectLike<'a>>(&self, name: Name, dict: &Dict<'a>) -> Option<T> {
475        dict.get::<T>(name.deref())
476    }
477
478    /// Get the parent in the resource, chain, if available.
479    pub fn parent(&self) -> Option<&Self> {
480        self.parent.as_deref()
481    }
482
483    /// Get an external graphics state by name.
484    pub fn get_ext_g_state(&self, name: Name) -> Option<Dict<'a>> {
485        self.get_resource::<Dict<'_>>(name.clone(), &self.ext_g_states)
486            .or_else(|| self.parent.as_ref().and_then(|p| p.get_ext_g_state(name)))
487    }
488
489    /// Get a color space by name.
490    pub fn get_color_space(&self, name: Name) -> Option<Object<'a>> {
491        self.get_resource::<Object<'_>>(name.clone(), &self.color_spaces)
492            .or_else(|| self.parent.as_ref().and_then(|p| p.get_color_space(name)))
493    }
494
495    /// Get a font by name.
496    pub fn get_font(&self, name: Name) -> Option<Dict<'a>> {
497        self.get_resource::<Dict<'_>>(name.clone(), &self.fonts)
498            .or_else(|| self.parent.as_ref().and_then(|p| p.get_font(name)))
499    }
500
501    /// Get a pattern by name.
502    pub fn get_pattern(&self, name: Name) -> Option<Object<'a>> {
503        self.get_resource::<Object<'_>>(name.clone(), &self.patterns)
504            .or_else(|| self.parent.as_ref().and_then(|p| p.get_pattern(name)))
505    }
506
507    /// Get an x object by name.
508    pub fn get_x_object(&self, name: Name) -> Option<Stream<'a>> {
509        self.get_resource::<Stream<'_>>(name.clone(), &self.x_objects)
510            .or_else(|| self.parent.as_ref().and_then(|p| p.get_x_object(name)))
511    }
512
513    /// Get a shading by name.
514    pub fn get_shading(&self, name: Name) -> Option<Object<'a>> {
515        self.get_resource::<Object<'_>>(name.clone(), &self.shadings)
516            .or_else(|| self.parent.as_ref().and_then(|p| p.get_shading(name)))
517    }
518}
519
// Reference:
// <https://github.com/apache/pdfbox/blob/a53a70db16ea3133994120bcf1e216b9e760c05b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDRectangle.java#L38>
/// Number of points per inch.
const POINTS_PER_INCH: f64 = 72.0;
/// Number of points per millimetre, derived from `POINTS_PER_INCH` (1 in = 2.54 cm).
const POINTS_PER_MM: f64 = 1.0 / (10.0 * 2.54) * POINTS_PER_INCH;
523
524/// The dimension of an A4 page (kept for completeness).
525pub const A4: Rect = Rect {
526    x0: 0.0,
527    y0: 0.0,
528    x1: 210.0 * POINTS_PER_MM,
529    y1: 297.0 * POINTS_PER_MM,
530};
531
532/// US Letter (8.5×11 in) — used as fallback when no MediaBox is present.
533///
534/// Old PDF 1.0/1.1 documents (especially US-government publications from the
535/// 1990s) omit MediaBox entirely and assume a US-Letter canvas, which matches
536/// the PostScript default and MuPDF's behaviour.  Using A4 here causes a
537/// ~5 % scale mismatch that tanks SSIM against MuPDF's oracle renders.
538/// See render_mupdf_oracle failures gen-059, gen-069, …  (#544)
539const US_LETTER: Rect = Rect {
540    x0: 0.0,
541    y0: 0.0,
542    x1: 8.5 * POINTS_PER_INCH,
543    y1: 11.0 * POINTS_PER_INCH,
544};
545
546pub(crate) mod cached {
547    use crate::page::Pages;
548    use crate::reader::ReaderContext;
549    use crate::xref::XRef;
550    use core::ops::Deref;
551
552    // Keep in sync with the implementation in `sync`. We duplicate it here
553    // to make it more visible since we have unsafe code here.
554    #[cfg(feature = "std")]
555    pub(crate) use std::sync::Arc;
556
557    #[cfg(not(feature = "std"))]
558    pub(crate) use alloc::rc::Rc as Arc;
559
560    pub(crate) struct CachedPages {
561        pages: Pages<'static>,
562        // NOTE: `pages` references the data in `xref`, so it's important that `xref`
563        // appears after `pages` in the struct definition to ensure correct drop order.
564        _xref: Arc<XRef>,
565    }
566
567    impl CachedPages {
568        pub(crate) fn new(xref: Arc<XRef>) -> Option<Self> {
569            // SAFETY:
570            // - The XRef's location is stable in memory:
571            //   - We wrapped it in a `Arc` (or `Rc` in `no_std`), which implements `StableDeref`.
572            //   - The struct owns the `Arc`, ensuring that the inner value is not dropped during the whole
573            //     duration.
574            // - The internal 'static lifetime is not leaked because its rewritten
575            //   to the self-lifetime in `pages()`.
576            let xref_reference: &'static XRef = unsafe { core::mem::transmute(xref.deref()) };
577
578            let ctx = ReaderContext::new(xref_reference, false);
579            let pages = xref_reference
580                .get_with(xref.trailer_data().pages_ref, &ctx)
581                .and_then(|p| Pages::new(&p, &ctx, xref_reference))
582                .or_else(|| Pages::new_brute_force(&ctx, xref_reference))?;
583
584            Some(Self { pages, _xref: xref })
585        }
586
587        pub(crate) fn get(&self) -> &Pages<'_> {
588            &self.pages
589        }
590    }
591}