Skip to main content

pdf_syntax/
page.rs

1//! Reading the pages of a PDF document.
2
3use crate::content::{TypedIter, UntypedIter};
4use crate::object::Array;
5use crate::object::Dict;
6use crate::object::Name;
7use crate::object::Rect;
8use crate::object::Stream;
9use crate::object::dict::keys::*;
10use crate::object::{Object, ObjectLike};
11use crate::reader::ReaderContext;
12use crate::sync::OnceLock;
13use crate::util::FloatExt;
14use crate::xref::XRef;
15use alloc::boxed::Box;
16use alloc::vec;
17use alloc::vec::Vec;
18use core::ops::Deref;
19use log::warn;
20
21/// Attributes that can be inherited.
22#[derive(Debug, Clone)]
23struct PagesContext {
24    media_box: Option<Rect>,
25    crop_box: Option<Rect>,
26    rotate: Option<u32>,
27}
28
29impl PagesContext {
30    fn new() -> Self {
31        Self {
32            media_box: None,
33            crop_box: None,
34            rotate: None,
35        }
36    }
37}
38
/// A structure holding the pages of a PDF document.
///
/// Dereferences to a slice of [`Page`], so pages can be indexed and iterated
/// directly.
pub struct Pages<'a> {
    /// The pages that were collected for the document.
    pages: Vec<Page<'a>>,
    /// The xref table of the document the pages belong to.
    xref: &'a XRef,
}
44
45impl<'a> Pages<'a> {
46    /// Create a new `Pages` object.
47    pub(crate) fn new(
48        pages_dict: &Dict<'a>,
49        ctx: &ReaderContext<'a>,
50        xref: &'a XRef,
51    ) -> Option<Self> {
52        let mut pages = vec![];
53        let pages_ctx = PagesContext::new();
54        resolve_pages(
55            pages_dict,
56            &mut pages,
57            pages_ctx,
58            Resources::new(Dict::empty(), None, ctx),
59        )?;
60
61        Some(Self { pages, xref })
62    }
63
64    /// Create a new `Pages` object by bruteforce-searching.
65    ///
66    /// Of course this could result in the order of pages being messed up, but
67    /// this is still better than nothing.
68    pub(crate) fn new_brute_force(ctx: &ReaderContext<'a>, xref: &'a XRef) -> Option<Self> {
69        let mut pages = vec![];
70
71        for object in xref.objects() {
72            if let Some(dict) = object.into_dict()
73                && let Some(page) = Page::new(
74                    &dict,
75                    &PagesContext::new(),
76                    Resources::new(Dict::empty(), None, ctx),
77                    true,
78                )
79            {
80                pages.push(page);
81            }
82        }
83
84        if pages.is_empty() {
85            return None;
86        }
87
88        Some(Self { pages, xref })
89    }
90
91    /// Return the xref table (of the document the pages belong to).   
92    pub fn xref(&self) -> &'a XRef {
93        self.xref
94    }
95}
96
97impl<'a> Deref for Pages<'a> {
98    type Target = [Page<'a>];
99
100    fn deref(&self) -> &Self::Target {
101        &self.pages
102    }
103}
104
/// Maximum depth for recursive page tree traversal.
///
/// Prevents stack overflow on malformed PDFs with deeply nested or circular page trees.
/// A node nested deeper than this is not traversed.
const MAX_PAGE_TREE_DEPTH: usize = 256;

/// Maximum number of pages to collect from the page tree.
///
/// Prevents exponential blowup from "page tree bombs" where shared Kids nodes
/// cause the same subtrees to be visited multiple times (e.g., 8 KB PDF → 2^27 pages).
/// Once this many pages have been collected, traversal stops and the pages
/// gathered so far are kept.
const MAX_PAGE_COUNT: usize = 100_000;
113
114fn resolve_pages<'a>(
115    pages_dict: &Dict<'a>,
116    entries: &mut Vec<Page<'a>>,
117    ctx: PagesContext,
118    resources: Resources<'a>,
119) -> Option<()> {
120    resolve_pages_depth(pages_dict, entries, ctx, resources, 0)
121}
122
123fn resolve_pages_depth<'a>(
124    pages_dict: &Dict<'a>,
125    entries: &mut Vec<Page<'a>>,
126    mut ctx: PagesContext,
127    resources: Resources<'a>,
128    depth: usize,
129) -> Option<()> {
130    if depth > MAX_PAGE_TREE_DEPTH {
131        log::warn!("Page tree depth exceeds {MAX_PAGE_TREE_DEPTH}, stopping traversal");
132        return None;
133    }
134
135    if let Some(media_box) = pages_dict.get::<Rect>(MEDIA_BOX) {
136        ctx.media_box = Some(media_box);
137    }
138
139    if let Some(crop_box) = pages_dict.get::<Rect>(CROP_BOX) {
140        ctx.crop_box = Some(crop_box);
141    }
142
143    // /Rotate may be negative (e.g. -90), which is valid per the PDF spec.
144    // Normalise to [0, 360) via Euclidean remainder so u32 storage is safe.
145    if let Some(rotate) = pages_dict.get::<i32>(ROTATE) {
146        ctx.rotate = Some(rotate.rem_euclid(360) as u32);
147    }
148
149    let resources = Resources::from_parent(
150        pages_dict.get::<Dict<'_>>(RESOURCES).unwrap_or_default(),
151        resources.clone(),
152    );
153
154    let kids = pages_dict.get::<Array<'a>>(KIDS)?;
155
156    for dict in kids.iter::<Dict<'_>>() {
157        if entries.len() >= MAX_PAGE_COUNT {
158            log::warn!("Page count exceeds {MAX_PAGE_COUNT}, stopping page tree traversal");
159            return Some(());
160        }
161
162        match dict.get::<Name>(TYPE).as_deref() {
163            Some(PAGES) => {
164                resolve_pages_depth(&dict, entries, ctx.clone(), resources.clone(), depth + 1);
165            }
166            // Let's be lenient and assume it's a `Page` in case it's `None` or something else
167            // (see corpus test case 0083781).
168            _ => {
169                if let Some(page) = Page::new(&dict, &ctx, resources.clone(), false) {
170                    entries.push(page);
171                }
172            }
173        }
174    }
175
176    Some(())
177}
178
/// The rotation of the page.
///
/// Derived from the `Rotate` entry of the page (or the value inherited from the
/// page tree) after normalising it to `[0, 360)`. Any value other than 0, 90,
/// 180 or 270 is treated as no rotation.
#[derive(Debug, Copy, Clone)]
pub enum Rotation {
    /// No rotation.
    None,
    /// A rotation of 90 degrees.
    Horizontal,
    /// A rotation of 180 degrees.
    Flipped,
    /// A rotation of 270 degrees.
    FlippedHorizontal,
}
191
/// A PDF page.
pub struct Page<'a> {
    /// The raw page dictionary.
    inner: Dict<'a>,
    /// The media box: taken from the page itself, else inherited, else US Letter.
    media_box: Rect,
    /// The crop box: taken from the page itself, else inherited, else the media box.
    crop_box: Rect,
    /// The rotation derived from the page's own or inherited `Rotate` value.
    rotation: Rotation,
    /// Cache for the decoded content stream, filled on the first call to `page_stream`.
    page_streams: OnceLock<Option<Vec<u8>>>,
    /// The resources of the page, chained to the resources it inherits.
    resources: Resources<'a>,
    /// The reader context, cloned from the inherited resources.
    ctx: ReaderContext<'a>,
}
202
203impl<'a> Page<'a> {
204    fn new(
205        dict: &Dict<'a>,
206        ctx: &PagesContext,
207        resources: Resources<'a>,
208        brute_force: bool,
209    ) -> Option<Self> {
210        // In general, pages without content are allowed, but in case we are brute-forcing
211        // we ignore them.
212        if brute_force && !dict.contains_key(CONTENTS) {
213            return None;
214        }
215
216        let media_box = dict
217            .get::<Rect>(MEDIA_BOX)
218            .or(ctx.media_box)
219            .unwrap_or(US_LETTER);
220
221        let crop_box = dict
222            .get::<Rect>(CROP_BOX)
223            .or(ctx.crop_box)
224            .unwrap_or(media_box);
225
226        let rotation = match dict
227            .get::<i32>(ROTATE)
228            .map(|r| r.rem_euclid(360) as u32)
229            .or(ctx.rotate)
230            .unwrap_or(0)
231        {
232            0 => Rotation::None,
233            90 => Rotation::Horizontal,
234            180 => Rotation::Flipped,
235            270 => Rotation::FlippedHorizontal,
236            _ => Rotation::None,
237        };
238
239        let ctx = resources.ctx.clone();
240        let resources = Resources::from_parent(
241            dict.get::<Dict<'_>>(RESOURCES).unwrap_or_default(),
242            resources,
243        );
244
245        Some(Self {
246            inner: dict.clone(),
247            media_box,
248            crop_box,
249            rotation,
250            page_streams: OnceLock::new(),
251            resources,
252            ctx,
253        })
254    }
255
256    fn operations_impl(&self) -> Option<UntypedIter<'_>> {
257        let stream = self.page_stream()?;
258        let iter = UntypedIter::new(stream);
259
260        Some(iter)
261    }
262
263    /// Return the decoded content stream of the page.
264    pub fn page_stream(&self) -> Option<&[u8]> {
265        let convert_single = |s: Stream<'_>| {
266            let data = s.decoded().ok()?;
267            Some(data.to_vec())
268        };
269
270        self.page_streams
271            .get_or_init(|| {
272                if let Some(stream) = self.inner.get::<Stream<'_>>(CONTENTS) {
273                    convert_single(stream)
274                } else if let Some(array) = self.inner.get::<Array<'_>>(CONTENTS) {
275                    let streams = array.iter::<Stream<'_>>().flat_map(convert_single);
276
277                    let mut collected = vec![];
278
279                    for stream in streams {
280                        collected.extend(stream);
281                        // Streams must have at least one whitespace in-between.
282                        collected.push(b' ');
283                    }
284
285                    Some(collected)
286                } else {
287                    warn!("contents entry of page was neither stream nor array of streams");
288
289                    None
290                }
291            })
292            .as_ref()
293            .map(|d| d.as_slice())
294    }
295
296    /// Get the resources of the page.
297    pub fn resources(&self) -> &Resources<'a> {
298        &self.resources
299    }
300
301    /// Get the media box of the page.
302    pub fn media_box(&self) -> Rect {
303        self.media_box
304    }
305
306    /// Get the rotation of the page.
307    pub fn rotation(&self) -> Rotation {
308        self.rotation
309    }
310
311    /// Get the crop box of the page.
312    pub fn crop_box(&self) -> Rect {
313        self.crop_box
314    }
315
316    /// Return the intersection of crop box and media box.
317    pub fn intersected_crop_box(&self) -> Rect {
318        self.crop_box().intersect(self.media_box())
319    }
320
321    /// Return the base dimensions of the page used for the canvas size.
322    ///
323    /// When the CropBox origin is within the MediaBox (i.e. CropBox.x0 >=
324    /// MediaBox.x0 and CropBox.y0 >= MediaBox.y0), the canvas is sized to
325    /// intersect(CropBox, MediaBox).  This matches MuPDF's behaviour for
326    /// spec-violating PDFs where CropBox.y1 > MediaBox.y1 (e.g. gen-271:
327    /// CropBox=[0,0,595,793.7] vs MediaBox=[0,0,612,792] — using raw
328    /// CropBox gives 1654px height, a 4px vertical content offset, and
329    /// SSIM 0.49; intersecting gives 1650px matching MuPDF exactly).
330    ///
331    /// When CropBox extends below the MediaBox origin (gen-802 style:
332    /// CropBox=[0,0,684,864] vs MediaBox=[36,36,648,828]), MuPDF still uses
333    /// the full CropBox dimensions, so we do too. (#544, #558, gen-271)
334    pub fn base_dimensions(&self) -> (f32, f32) {
335        let crop_box = self.crop_box();
336        let media_box = self.media_box();
337
338        // Clip to MediaBox only when the CropBox origin lies within the
339        // MediaBox (both axes).  When the CropBox extends below the MediaBox
340        // origin MuPDF uses the raw CropBox, so preserve that behaviour.
341        let effective = if crop_box.x0 >= media_box.x0 && crop_box.y0 >= media_box.y0 {
342            crop_box.intersect(media_box)
343        } else {
344            crop_box
345        };
346
347        if (effective.width() as f32).is_nearly_zero()
348            || (effective.height() as f32).is_nearly_zero()
349        {
350            (US_LETTER.width() as f32, US_LETTER.height() as f32)
351        } else {
352            (
353                effective.width().max(1.0) as f32,
354                effective.height().max(1.0) as f32,
355            )
356        }
357    }
358
359    /// Return the with and height of the page that should be assumed when rendering the page.
360    ///
361    /// Depending on the document, it is either based on the media box or the crop box
362    /// of the page. In addition to that, it also takes the rotation of the page into account.
363    pub fn render_dimensions(&self) -> (f32, f32) {
364        let (mut base_width, mut base_height) = self.base_dimensions();
365
366        if matches!(
367            self.rotation(),
368            Rotation::Horizontal | Rotation::FlippedHorizontal
369        ) {
370            core::mem::swap(&mut base_width, &mut base_height);
371        }
372
373        (base_width, base_height)
374    }
375
376    /// Return an untyped iterator over the operators of the page's content stream.
377    pub fn operations(&self) -> UntypedIter<'_> {
378        self.operations_impl().unwrap_or(UntypedIter::empty())
379    }
380
381    /// Get the raw dictionary of the page.
382    pub fn raw(&self) -> &Dict<'a> {
383        &self.inner
384    }
385
386    /// Get the xref table (of the document the page belongs to).
387    pub fn xref(&self) -> &'a XRef {
388        self.ctx.xref()
389    }
390
391    /// Return a typed iterator over the operators of the page's content stream.
392    pub fn typed_operations(&self) -> TypedIter<'_> {
393        TypedIter::from_untyped(self.operations())
394    }
395
396    /// Return the annotation dictionaries for this page, if any.
397    pub fn annots(&self) -> Vec<Dict<'a>> {
398        self.inner
399            .get::<Array<'_>>(crate::object::dict::keys::ANNOTS)
400            .map(|arr| arr.iter::<Dict<'_>>().collect())
401            .unwrap_or_default()
402    }
403}
404
/// A structure keeping track of the resources of a page.
///
/// Resources form a chain: a lookup that finds nothing in this object is
/// continued in `parent`.
#[derive(Clone, Debug)]
pub struct Resources<'a> {
    /// The resources this object falls back to, if any.
    parent: Option<Box<Self>>,
    /// The reader context the resources were created with.
    ctx: ReaderContext<'a>,
    /// The raw dictionary of external graphics states.
    pub ext_g_states: Dict<'a>,
    /// The raw dictionary of fonts.
    pub fonts: Dict<'a>,
    /// The raw dictionary of properties.
    pub properties: Dict<'a>,
    /// The raw dictionary of color spaces.
    pub color_spaces: Dict<'a>,
    /// The raw dictionary of x objects.
    pub x_objects: Dict<'a>,
    /// The raw dictionary of patterns.
    pub patterns: Dict<'a>,
    /// The raw dictionary of shadings.
    pub shadings: Dict<'a>,
}
425
426impl<'a> Resources<'a> {
427    /// Create a new `Resources` object from a dictionary with a parent.
428    pub fn from_parent(resources: Dict<'a>, parent: Self) -> Self {
429        let ctx = parent.ctx.clone();
430
431        Self::new(resources, Some(parent), &ctx)
432    }
433
434    /// Create a new `Resources` object.
435    pub(crate) fn new(resources: Dict<'a>, parent: Option<Self>, ctx: &ReaderContext<'a>) -> Self {
436        let ext_g_states = resources.get::<Dict<'_>>(EXT_G_STATE).unwrap_or_default();
437        let fonts = resources.get::<Dict<'_>>(FONT).unwrap_or_default();
438        let color_spaces = resources.get::<Dict<'_>>(COLORSPACE).unwrap_or_default();
439        let x_objects = resources.get::<Dict<'_>>(XOBJECT).unwrap_or_default();
440        let patterns = resources.get::<Dict<'_>>(PATTERN).unwrap_or_default();
441        let shadings = resources.get::<Dict<'_>>(SHADING).unwrap_or_default();
442        let properties = resources.get::<Dict<'_>>(PROPERTIES).unwrap_or_default();
443
444        let parent = parent.map(Box::new);
445
446        Self {
447            parent,
448            ext_g_states,
449            fonts,
450            color_spaces,
451            properties,
452            x_objects,
453            patterns,
454            shadings,
455            ctx: ctx.clone(),
456        }
457    }
458
459    fn get_resource<T: ObjectLike<'a>>(&self, name: Name, dict: &Dict<'a>) -> Option<T> {
460        dict.get::<T>(name.deref())
461    }
462
463    /// Get the parent in the resource, chain, if available.
464    pub fn parent(&self) -> Option<&Self> {
465        self.parent.as_deref()
466    }
467
468    /// Get an external graphics state by name.
469    pub fn get_ext_g_state(&self, name: Name) -> Option<Dict<'a>> {
470        self.get_resource::<Dict<'_>>(name.clone(), &self.ext_g_states)
471            .or_else(|| self.parent.as_ref().and_then(|p| p.get_ext_g_state(name)))
472    }
473
474    /// Get a color space by name.
475    pub fn get_color_space(&self, name: Name) -> Option<Object<'a>> {
476        self.get_resource::<Object<'_>>(name.clone(), &self.color_spaces)
477            .or_else(|| self.parent.as_ref().and_then(|p| p.get_color_space(name)))
478    }
479
480    /// Get a font by name.
481    pub fn get_font(&self, name: Name) -> Option<Dict<'a>> {
482        self.get_resource::<Dict<'_>>(name.clone(), &self.fonts)
483            .or_else(|| self.parent.as_ref().and_then(|p| p.get_font(name)))
484    }
485
486    /// Get a pattern by name.
487    pub fn get_pattern(&self, name: Name) -> Option<Object<'a>> {
488        self.get_resource::<Object<'_>>(name.clone(), &self.patterns)
489            .or_else(|| self.parent.as_ref().and_then(|p| p.get_pattern(name)))
490    }
491
492    /// Get an x object by name.
493    pub fn get_x_object(&self, name: Name) -> Option<Stream<'a>> {
494        self.get_resource::<Stream<'_>>(name.clone(), &self.x_objects)
495            .or_else(|| self.parent.as_ref().and_then(|p| p.get_x_object(name)))
496    }
497
498    /// Get a shading by name.
499    pub fn get_shading(&self, name: Name) -> Option<Object<'a>> {
500        self.get_resource::<Object<'_>>(name.clone(), &self.shadings)
501            .or_else(|| self.parent.as_ref().and_then(|p| p.get_shading(name)))
502    }
503}
504
// Page-size constants, following PDFBox's `PDRectangle`:
// <https://github.com/apache/pdfbox/blob/a53a70db16ea3133994120bcf1e216b9e760c05b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDRectangle.java#L38>
/// Number of points per inch.
const POINTS_PER_INCH: f64 = 72.0;
/// Number of points per millimetre (one inch is 25.4 mm).
const POINTS_PER_MM: f64 = 1.0 / (10.0 * 2.54) * POINTS_PER_INCH;
508
/// The dimensions of an A4 page (210 × 297 mm), expressed in points
/// (kept for completeness).
pub const A4: Rect = Rect {
    x0: 0.0,
    y0: 0.0,
    x1: 210.0 * POINTS_PER_MM,
    y1: 297.0 * POINTS_PER_MM,
};
516
/// US Letter (8.5×11 in, i.e. 612×792 points) — used as fallback when no
/// MediaBox is present.
///
/// Old PDF 1.0/1.1 documents (especially US-government publications from the
/// 1990s) omit MediaBox entirely and assume a US-Letter canvas, which matches
/// the PostScript default and MuPDF's behaviour.  Using A4 here causes a
/// ~5 % scale mismatch that tanks SSIM against MuPDF's oracle renders.
/// See render_mupdf_oracle failures gen-059, gen-069, …  (#544)
const US_LETTER: Rect = Rect {
    x0: 0.0,
    y0: 0.0,
    x1: 8.5 * POINTS_PER_INCH,
    y1: 11.0 * POINTS_PER_INCH,
};
530
/// A self-contained page cache that owns the xref table its pages borrow from.
pub(crate) mod cached {
    use crate::page::Pages;
    use crate::reader::ReaderContext;
    use crate::xref::XRef;
    use core::ops::Deref;

    // Keep in sync with the implementation in `sync`. We duplicate it here
    // to make it more visible since we have unsafe code here.
    #[cfg(feature = "std")]
    pub(crate) use std::sync::Arc;

    #[cfg(not(feature = "std"))]
    pub(crate) use alloc::rc::Rc as Arc;

    /// The pages of a document together with the xref table they reference.
    pub(crate) struct CachedPages {
        // Declared as `'static`, but really borrows from `_xref`; the lifetime
        // is narrowed again in `get()`.
        pages: Pages<'static>,
        // NOTE: `pages` references the data in `xref`, so it's important that `xref`
        // appears after `pages` in the struct definition to ensure correct drop order.
        _xref: Arc<XRef>,
    }

    impl CachedPages {
        /// Resolve the pages of the document described by `xref`.
        ///
        /// The page tree referenced by the trailer is tried first; if that
        /// fails, the pages are collected by brute force. Returns `None` if
        /// both attempts fail.
        pub(crate) fn new(xref: Arc<XRef>) -> Option<Self> {
            // SAFETY:
            // - The XRef's location is stable in memory:
            //   - We wrapped it in a `Arc` (or `Rc` in `no_std`), which implements `StableDeref`.
            //   - The struct owns the `Arc`, ensuring that the inner value is not dropped during the whole
            //     duration.
            // - The internal 'static lifetime is not leaked because it is rewritten
            //   to the self-lifetime in `get()`.
            let xref_reference: &'static XRef = unsafe { core::mem::transmute(xref.deref()) };

            let ctx = ReaderContext::new(xref_reference, false);
            let pages = xref_reference
                .get_with(xref.trailer_data().pages_ref, &ctx)
                .and_then(|p| Pages::new(&p, &ctx, xref_reference))
                .or_else(|| Pages::new_brute_force(&ctx, xref_reference))?;

            Some(Self { pages, _xref: xref })
        }

        /// Borrow the cached pages, with their lifetime tied to `self`.
        pub(crate) fn get(&self) -> &Pages<'_> {
            &self.pages
        }
    }
}