Skip to main content

hayro_syntax/
page.rs

1//! Reading the pages of a PDF document.
2
3use crate::content::{TypedIter, UntypedIter};
4use crate::object::Array;
5use crate::object::Dict;
6use crate::object::Name;
7use crate::object::Rect;
8use crate::object::Stream;
9use crate::object::dict::keys::*;
10use crate::object::{Object, ObjectLike};
11use crate::reader::ReaderContext;
12use crate::sync::OnceLock;
13use crate::transform::Transform;
14use crate::util::FloatExt;
15use crate::xref::XRef;
16use alloc::boxed::Box;
17use alloc::vec;
18use alloc::vec::Vec;
19use core::ops::Deref;
20
21/// Attributes that can be inherited.
22#[derive(Debug, Clone)]
23struct PagesContext {
24    media_box: Option<Rect>,
25    crop_box: Option<Rect>,
26    rotate: Option<i32>,
27}
28
29impl PagesContext {
30    fn new() -> Self {
31        Self {
32            media_box: None,
33            crop_box: None,
34            rotate: None,
35        }
36    }
37}
38
/// A structure holding the pages of a PDF document.
///
/// The pages are exposed as a slice through the `Deref` implementation.
pub struct Pages<'a> {
    /// The pages that were collected while constructing this object, in collection order.
    pages: Vec<Page<'a>>,
    /// The xref table that was handed to the constructor.
    xref: &'a XRef,
}
44
45impl<'a> Pages<'a> {
46    /// Create a new `Pages` object.
47    pub(crate) fn new(
48        pages_dict: &Dict<'a>,
49        ctx: &ReaderContext<'a>,
50        xref: &'a XRef,
51    ) -> Option<Self> {
52        let mut pages = vec![];
53        let pages_ctx = PagesContext::new();
54        resolve_pages(
55            pages_dict,
56            &mut pages,
57            pages_ctx,
58            Resources::new(Dict::empty(), None, ctx),
59        )?;
60
61        Some(Self { pages, xref })
62    }
63
64    /// Create a new `Pages` object by bruteforce-searching.
65    ///
66    /// Of course this could result in the order of pages being messed up, but
67    /// this is still better than nothing.
68    pub(crate) fn new_brute_force(ctx: &ReaderContext<'a>, xref: &'a XRef) -> Option<Self> {
69        let mut pages = vec![];
70
71        for object in xref.objects() {
72            if let Some(dict) = object.into_dict()
73                && let Some(page) = Page::new(
74                    &dict,
75                    &PagesContext::new(),
76                    Resources::new(Dict::empty(), None, ctx),
77                    true,
78                )
79            {
80                pages.push(page);
81            }
82        }
83
84        if pages.is_empty() {
85            return None;
86        }
87
88        Some(Self { pages, xref })
89    }
90
91    /// Return the xref table (of the document the pages belong to).   
92    pub fn xref(&self) -> &'a XRef {
93        self.xref
94    }
95}
96
97impl<'a> Deref for Pages<'a> {
98    type Target = [Page<'a>];
99
100    fn deref(&self) -> &Self::Target {
101        &self.pages
102    }
103}
104
105fn resolve_pages<'a>(
106    pages_dict: &Dict<'a>,
107    entries: &mut Vec<Page<'a>>,
108    mut ctx: PagesContext,
109    resources: Resources<'a>,
110) -> Option<()> {
111    if let Some(media_box) = pages_dict.get::<Rect>(MEDIA_BOX) {
112        ctx.media_box = Some(media_box);
113    }
114
115    if let Some(crop_box) = pages_dict.get::<Rect>(CROP_BOX) {
116        ctx.crop_box = Some(crop_box);
117    }
118
119    if let Some(rotate) = pages_dict.get::<i32>(ROTATE) {
120        ctx.rotate = Some(rotate);
121    }
122
123    let resources = Resources::from_parent(
124        pages_dict.get::<Dict<'_>>(RESOURCES).unwrap_or_default(),
125        resources.clone(),
126    );
127
128    let kids = pages_dict.get::<Array<'a>>(KIDS)?;
129
130    for dict in kids.iter::<Dict<'_>>() {
131        match dict.get::<Name<'_>>(TYPE).as_deref() {
132            Some(PAGES) => {
133                resolve_pages(&dict, entries, ctx.clone(), resources.clone());
134            }
135            // Let's be lenient and assume it's a `Page` in case it's `None` or something else
136            // (see corpus test case 0083781).
137            _ => {
138                if let Some(page) = Page::new(&dict, &ctx, resources.clone(), false) {
139                    entries.push(page);
140                }
141            }
142        }
143    }
144
145    Some(())
146}
147
/// The rotation of the page.
///
/// It is derived from the page's (possibly inherited) `Rotate` value reduced modulo 360;
/// values other than 0, 90, 180 and 270 are treated as `None`.
#[derive(Debug, Copy, Clone)]
pub enum Rotation {
    /// No rotation.
    None,
    /// A rotation of 90 degrees.
    Horizontal,
    /// A rotation of 180 degrees.
    Flipped,
    /// A rotation of 270 degrees.
    FlippedHorizontal,
}
160
/// A PDF page.
///
/// Boxes, rotation and resources are resolved once when the page is constructed; the
/// content stream is decoded lazily on first access and then cached.
pub struct Page<'a> {
    /// The raw page dictionary.
    inner: Dict<'a>,
    /// The media box: the page's own entry, else the inherited one, else A4.
    media_box: Rect,
    /// The crop box: the page's own entry, else the inherited one, else the media box.
    crop_box: Rect,
    /// The rotation derived from the (possibly inherited) `Rotate` entry.
    rotation: Rotation,
    /// Cache for the decoded content stream; holds `None` if no content could be read.
    page_streams: OnceLock<Option<Vec<u8>>>,
    /// The resources of the page, chained to the resources of its ancestors.
    resources: Resources<'a>,
    /// The reader context, cloned from the resources passed to the constructor.
    ctx: ReaderContext<'a>,
}
171
172impl<'a> Page<'a> {
173    fn new(
174        dict: &Dict<'a>,
175        ctx: &PagesContext,
176        resources: Resources<'a>,
177        brute_force: bool,
178    ) -> Option<Self> {
179        // In general, pages without content are allowed, but in case we are brute-forcing
180        // we ignore them.
181        if brute_force && !dict.contains_key(CONTENTS) {
182            return None;
183        }
184
185        let media_box = dict.get::<Rect>(MEDIA_BOX).or(ctx.media_box).unwrap_or(A4);
186
187        let crop_box = dict
188            .get::<Rect>(CROP_BOX)
189            .or(ctx.crop_box)
190            .unwrap_or(media_box);
191
192        let rotation = match dict
193            .get::<i32>(ROTATE)
194            .or(ctx.rotate)
195            .unwrap_or(0)
196            .rem_euclid(360)
197        {
198            0 => Rotation::None,
199            90 => Rotation::Horizontal,
200            180 => Rotation::Flipped,
201            270 => Rotation::FlippedHorizontal,
202            _ => Rotation::None,
203        };
204
205        let ctx = resources.ctx.clone();
206        let resources = Resources::from_parent(
207            dict.get::<Dict<'_>>(RESOURCES).unwrap_or_default(),
208            resources,
209        );
210
211        Some(Self {
212            inner: dict.clone(),
213            media_box,
214            crop_box,
215            rotation,
216            page_streams: OnceLock::new(),
217            resources,
218            ctx,
219        })
220    }
221
222    fn operations_impl(&self) -> Option<UntypedIter<'_>> {
223        let stream = self.page_stream()?;
224        let iter = UntypedIter::new(stream);
225
226        Some(iter)
227    }
228
229    /// Return the decoded content stream of the page.
230    pub fn page_stream(&self) -> Option<&[u8]> {
231        let convert_single = |s: Stream<'_>| {
232            let data = s.decoded().ok()?;
233            Some(data.to_vec())
234        };
235
236        self.page_streams
237            .get_or_init(|| {
238                if let Some(stream) = self.inner.get::<Stream<'_>>(CONTENTS) {
239                    convert_single(stream)
240                } else if let Some(array) = self.inner.get::<Array<'_>>(CONTENTS) {
241                    let streams = array.iter::<Stream<'_>>().flat_map(convert_single);
242
243                    let mut collected = vec![];
244
245                    for stream in streams {
246                        collected.extend(stream);
247                        // Streams must have at least one whitespace in-between.
248                        collected.push(b' ');
249                    }
250
251                    Some(collected)
252                } else {
253                    warn!("contents entry of page was neither stream nor array of streams");
254
255                    None
256                }
257            })
258            .as_ref()
259            .map(|d| d.as_slice())
260    }
261
    /// Get the resources of the page.
    ///
    /// The returned object is chained to the resources inherited from the page tree, so
    /// lookups that miss in the page's own resource dictionary fall back to the ancestors.
    pub fn resources(&self) -> &Resources<'a> {
        &self.resources
    }
266
    /// Get the media box of the page.
    ///
    /// This is the page's own `MediaBox` entry if present, otherwise the inherited one and,
    /// failing that, [`A4`].
    pub fn media_box(&self) -> Rect {
        self.media_box
    }
271
    /// Get the rotation of the page.
    ///
    /// Rotation values that are not a multiple of 90 degrees are reported as
    /// [`Rotation::None`].
    pub fn rotation(&self) -> Rotation {
        self.rotation
    }
276
    /// Get the crop box of the page.
    ///
    /// This is the page's own `CropBox` entry if present, otherwise the inherited one and,
    /// failing that, the media box.
    pub fn crop_box(&self) -> Rect {
        self.crop_box
    }
281
282    /// Return the intersection of crop box and media box.
283    pub fn intersected_crop_box(&self) -> Rect {
284        self.crop_box().intersect(self.media_box())
285    }
286
287    /// Return the base dimensions of the page (same as `intersected_crop_box`, but with special
288    /// handling applied for zero-area pages).
289    pub fn base_dimensions(&self) -> (f32, f32) {
290        let crop_box = self.intersected_crop_box();
291
292        if (crop_box.width() as f32).is_nearly_zero() || (crop_box.height() as f32).is_nearly_zero()
293        {
294            (A4.width() as f32, A4.height() as f32)
295        } else {
296            (
297                crop_box.width().max(1.0) as f32,
298                crop_box.height().max(1.0) as f32,
299            )
300        }
301    }
302
303    /// Return the with and height of the page that should be assumed when rendering the page.
304    ///
305    /// Depending on the document, it is either based on the media box or the crop box
306    /// of the page. In addition to that, it also takes the rotation of the page into account.
307    pub fn render_dimensions(&self) -> (f32, f32) {
308        let (mut base_width, mut base_height) = self.base_dimensions();
309
310        if matches!(
311            self.rotation(),
312            Rotation::Horizontal | Rotation::FlippedHorizontal
313        ) {
314            core::mem::swap(&mut base_width, &mut base_height);
315        }
316
317        (base_width, base_height)
318    }
319
320    /// Return an untyped iterator over the operators of the page's content stream.
321    pub fn operations(&self) -> UntypedIter<'_> {
322        self.operations_impl().unwrap_or(UntypedIter::empty())
323    }
324
    /// Get the raw dictionary of the page.
    ///
    /// This is the dictionary the page was constructed from; inherited attributes are not
    /// merged into it.
    pub fn raw(&self) -> &Dict<'a> {
        &self.inner
    }
329
    /// Get the xref table (of the document the page belongs to).
    ///
    /// The table is obtained from the reader context stored in the page.
    pub fn xref(&self) -> &'a XRef {
        self.ctx.xref()
    }
334
335    /// Return a typed iterator over the operators of the page's content stream.
336    pub fn typed_operations(&self) -> TypedIter<'_> {
337        TypedIter::from_untyped(self.operations())
338    }
339
    /// Return the initial transform that should be applied when rendering.
    ///
    /// This accounts for the mismatch between PDF's y-up and most renderers'
    /// y-down coordinate system, the rotation of the page and the offset of
    /// the crop box.
    ///
    /// If `invert_y` is `true`, a vertical flip about the unrotated page height is part of
    /// the transform. If it is `false`, no flip is applied and the transforms used for the
    /// 90 and 270 degree rotations are exchanged.
    pub fn initial_transform(&self, invert_y: bool) -> Transform {
        // The visible area; its lower-left corner is moved to the origin below.
        let crop_box = self.intersected_crop_box();
        // The height before rotation is what the y-flip has to mirror around.
        let (_, base_height) = self.base_dimensions();
        // The dimensions after rotation, used to shift the rotated page back into view.
        let (width, height) = self.render_dimensions();

        // The two quarter-turn transforms. Which of them belongs to which rotation depends
        // on `invert_y`, see the `match` below.
        let horizontal_t = Transform::ROTATE_CW_90 * Transform::translate((0.0, -width as f64));
        let flipped_horizontal_t =
            Transform::translate((0.0, height as f64)) * Transform::ROTATE_CCW_90;

        let rotation_transform = match self.rotation() {
            Rotation::None => Transform::IDENTITY,
            Rotation::Horizontal => {
                if invert_y {
                    horizontal_t
                } else {
                    flipped_horizontal_t
                }
            }
            // A half turn is the same regardless of the direction of the y axis.
            Rotation::Flipped => {
                Transform::scale(-1.0) * Transform::translate((-width as f64, -height as f64))
            }
            Rotation::FlippedHorizontal => {
                if invert_y {
                    flipped_horizontal_t
                } else {
                    horizontal_t
                }
            }
        };

        // Negate y and shift by the page height (matrix `[1 0 0 -1 0 h]`).
        let inversion_transform = if invert_y {
            Transform::new([1.0, 0.0, 0.0, -1.0, 0.0, base_height as f64])
        } else {
            Transform::IDENTITY
        };

        // NOTE(review): presumably the right-most factor is applied to a point first, i.e.
        // crop box offset, then y-flip, then rotation; confirm against `Transform`'s `Mul`.
        rotation_transform
            * inversion_transform
            * Transform::translate((-crop_box.x0, -crop_box.y0))
    }
385}
386
/// A structure keeping track of the resources of a page.
///
/// Resources form a chain: the named lookups (`get_font`, `get_x_object`, ...) first
/// consult the dictionaries of this object and then fall back to `parent`.
#[derive(Clone, Debug)]
pub struct Resources<'a> {
    /// The next element of the resource chain, if any.
    parent: Option<Box<Self>>,
    /// The reader context the resources were created with.
    ctx: ReaderContext<'a>,
    /// The raw dictionary of external graphics states.
    pub ext_g_states: Dict<'a>,
    /// The raw dictionary of fonts.
    pub fonts: Dict<'a>,
    /// The raw dictionary of properties.
    pub properties: Dict<'a>,
    /// The raw dictionary of color spaces.
    pub color_spaces: Dict<'a>,
    /// The raw dictionary of x objects.
    pub x_objects: Dict<'a>,
    /// The raw dictionary of patterns.
    pub patterns: Dict<'a>,
    /// The raw dictionary of shadings.
    pub shadings: Dict<'a>,
}
407
408impl<'a> Resources<'a> {
409    /// Create a new `Resources` object from a dictionary with a parent.
410    pub fn from_parent(resources: Dict<'a>, parent: Self) -> Self {
411        let ctx = parent.ctx.clone();
412
413        Self::new(resources, Some(parent), &ctx)
414    }
415
416    /// Create a new `Resources` object.
417    pub(crate) fn new(resources: Dict<'a>, parent: Option<Self>, ctx: &ReaderContext<'a>) -> Self {
418        let ext_g_states = resources.get::<Dict<'_>>(EXT_G_STATE).unwrap_or_default();
419        let fonts = resources.get::<Dict<'_>>(FONT).unwrap_or_default();
420        let color_spaces = resources.get::<Dict<'_>>(COLORSPACE).unwrap_or_default();
421        let x_objects = resources.get::<Dict<'_>>(XOBJECT).unwrap_or_default();
422        let patterns = resources.get::<Dict<'_>>(PATTERN).unwrap_or_default();
423        let shadings = resources.get::<Dict<'_>>(SHADING).unwrap_or_default();
424        let properties = resources.get::<Dict<'_>>(PROPERTIES).unwrap_or_default();
425
426        let parent = parent.map(Box::new);
427
428        Self {
429            parent,
430            ext_g_states,
431            fonts,
432            color_spaces,
433            properties,
434            x_objects,
435            patterns,
436            shadings,
437            ctx: ctx.clone(),
438        }
439    }
440
441    fn get_resource<T: ObjectLike<'a>>(&self, name: &Name<'_>, dict: &Dict<'a>) -> Option<T> {
442        dict.get::<T>(name.deref())
443    }
444
445    /// Get the parent in the resource, chain, if available.
446    pub fn parent(&self) -> Option<&Self> {
447        self.parent.as_deref()
448    }
449
450    /// Get an external graphics state by name.
451    pub fn get_ext_g_state(&self, name: &Name<'_>) -> Option<Dict<'a>> {
452        self.get_resource::<Dict<'_>>(name, &self.ext_g_states)
453            .or_else(|| self.parent.as_ref().and_then(|p| p.get_ext_g_state(name)))
454    }
455
456    /// Get a color space by name.
457    pub fn get_color_space(&self, name: &Name<'_>) -> Option<Object<'a>> {
458        self.get_resource::<Object<'_>>(name, &self.color_spaces)
459            .or_else(|| self.parent.as_ref().and_then(|p| p.get_color_space(name)))
460    }
461
462    /// Get a font by name.
463    pub fn get_font(&self, name: &Name<'_>) -> Option<Dict<'a>> {
464        self.get_resource::<Dict<'_>>(name, &self.fonts)
465            .or_else(|| self.parent.as_ref().and_then(|p| p.get_font(name)))
466    }
467
468    /// Get a pattern by name.
469    pub fn get_pattern(&self, name: &Name<'_>) -> Option<Object<'a>> {
470        self.get_resource::<Object<'_>>(name, &self.patterns)
471            .or_else(|| self.parent.as_ref().and_then(|p| p.get_pattern(name)))
472    }
473
474    /// Get an x object by name.
475    pub fn get_x_object(&self, name: &Name<'_>) -> Option<Stream<'a>> {
476        self.get_resource::<Stream<'_>>(name, &self.x_objects)
477            .or_else(|| self.parent.as_ref().and_then(|p| p.get_x_object(name)))
478    }
479
480    /// Get a shading by name.
481    pub fn get_shading(&self, name: &Name<'_>) -> Option<Object<'a>> {
482        self.get_resource::<Object<'_>>(name, &self.shadings)
483            .or_else(|| self.parent.as_ref().and_then(|p| p.get_shading(name)))
484    }
485}
486
// <https://github.com/apache/pdfbox/blob/a53a70db16ea3133994120bcf1e216b9e760c05b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDRectangle.java#L38>
/// The number of points per inch.
const POINTS_PER_INCH: f64 = 72.0;
/// The number of points per millimeter (one inch is 25.4 mm).
const POINTS_PER_MM: f64 = 1.0 / (10.0 * 2.54) * POINTS_PER_INCH;

/// The dimension of an A4 page.
///
/// A4 measures 210 mm x 297 mm; the rectangle is anchored at the origin and expressed in
/// points. It serves as the fallback for pages without a media box and for pages with a
/// (nearly) zero-area crop box.
pub const A4: Rect = Rect {
    x0: 0.0,
    y0: 0.0,
    x1: 210.0 * POINTS_PER_MM,
    y1: 297.0 * POINTS_PER_MM,
};
498
/// A self-contained bundle of an xref table and the pages that borrow from it.
pub(crate) mod cached {
    use crate::page::Pages;
    use crate::reader::ReaderContext;
    use crate::xref::XRef;
    use core::ops::Deref;

    // Keep in sync with the implementation in `sync`. We duplicate it here
    // to make it more visible since we have unsafe code here.
    #[cfg(feature = "std")]
    pub(crate) use std::sync::Arc;

    #[cfg(not(feature = "std"))]
    pub(crate) use alloc::rc::Rc as Arc;

    /// The pages of a document together with the xref table they borrow from.
    pub(crate) struct CachedPages {
        /// The pages; the `'static` lifetime is a stand-in and never handed out, see `get`.
        pages: Pages<'static>,
        // NOTE: `pages` references the data in `xref`, so it's important that `xref`
        // appears after `pages` in the struct definition to ensure correct drop order.
        _xref: Arc<XRef>,
    }

    impl CachedPages {
        /// Resolve the pages of the document described by `xref`.
        ///
        /// The page tree referenced by the trailer data is tried first; if that fails, the
        /// pages are searched by brute force. Returns `None` if both attempts fail.
        pub(crate) fn new(xref: Arc<XRef>) -> Option<Self> {
            // SAFETY:
            // - The XRef's location is stable in memory:
            //   - We wrapped it in a `Arc` (or `Rc` in `no_std`), which implements `StableDeref`.
            //   - The struct owns the `Arc`, ensuring that the inner value is not dropped during the whole
            //     duration.
            // - The internal 'static lifetime is not leaked because it's rewritten
            //   to the self-lifetime in `get()`.
            let xref_reference: &'static XRef = unsafe { core::mem::transmute(xref.deref()) };

            let ctx = ReaderContext::new(xref_reference, false);
            let pages = xref_reference
                .get_with(xref.trailer_data().pages_ref, &ctx)
                .and_then(|p| Pages::new(&p, &ctx, xref_reference))
                .or_else(|| Pages::new_brute_force(&ctx, xref_reference))?;

            Some(Self { pages, _xref: xref })
        }

        /// Get the pages, with their lifetime tied to the borrow of `self`.
        pub(crate) fn get(&self) -> &Pages<'_> {
            &self.pages
        }
    }
}