hayro_syntax/
page.rs

1//! Reading the pages of a PDF document.
2
3use crate::content::{TypedIter, UntypedIter};
4use crate::object::Array;
5use crate::object::Dict;
6use crate::object::Name;
7use crate::object::Rect;
8use crate::object::Stream;
9use crate::object::dict::keys::*;
10use crate::object::{MaybeRef, ObjRef};
11use crate::object::{Object, ObjectLike};
12use crate::reader::ReaderContext;
13use crate::util::FloatExt;
14use crate::xref::XRef;
15use kurbo::Affine;
16use log::warn;
17use std::ops::Deref;
18use std::sync::OnceLock;
19
20/// Attributes that can be inherited.
21#[derive(Debug, Clone)]
22struct PagesContext {
23    media_box: Option<Rect>,
24    crop_box: Option<Rect>,
25    rotate: Option<u32>,
26}
27
28impl PagesContext {
29    fn new() -> Self {
30        Self {
31            media_box: None,
32            crop_box: None,
33            rotate: None,
34        }
35    }
36}
37
38/// A structure holding the pages of a PDF document.
39pub struct Pages<'a> {
40    pages: Vec<Page<'a>>,
41    xref: &'a XRef,
42}
43
44impl<'a> Pages<'a> {
45    /// Create a new `Pages` object.
46    pub(crate) fn new(
47        pages_dict: Dict<'a>,
48        ctx: ReaderContext<'a>,
49        xref: &'a XRef,
50    ) -> Option<Pages<'a>> {
51        let mut pages = vec![];
52        let pages_ctx = PagesContext::new();
53        resolve_pages(
54            pages_dict,
55            &mut pages,
56            pages_ctx,
57            Resources::new(Dict::empty(), None, ctx),
58        )?;
59
60        Some(Self { pages, xref })
61    }
62
63    /// Return the xref table (of the document the pages belong to).   
64    pub fn xref(&self) -> &'a XRef {
65        self.xref
66    }
67}
68
69impl<'a> Deref for Pages<'a> {
70    type Target = [Page<'a>];
71
72    fn deref(&self) -> &Self::Target {
73        &self.pages
74    }
75}
76
77fn resolve_pages<'a>(
78    pages_dict: Dict<'a>,
79    entries: &mut Vec<Page<'a>>,
80    mut ctx: PagesContext,
81    resources: Resources<'a>,
82) -> Option<()> {
83    if let Some(media_box) = pages_dict.get::<Rect>(MEDIA_BOX) {
84        ctx.media_box = Some(media_box);
85    }
86
87    if let Some(crop_box) = pages_dict.get::<Rect>(CROP_BOX) {
88        ctx.crop_box = Some(crop_box);
89    }
90
91    if let Some(rotate) = pages_dict.get::<u32>(ROTATE) {
92        ctx.rotate = Some(rotate);
93    }
94
95    let resources = Resources::from_parent(
96        pages_dict.get::<Dict>(RESOURCES).unwrap_or_default(),
97        resources.clone(),
98    );
99
100    let kids = pages_dict.get::<Array<'a>>(KIDS)?;
101
102    for dict in kids.iter::<Dict>() {
103        match dict.get::<Name>(TYPE)?.deref() {
104            PAGES => resolve_pages(dict, entries, ctx.clone(), resources.clone())?,
105            PAGE => entries.push(Page::new(dict, &ctx, resources.clone())),
106            _ => return None,
107        }
108    }
109
110    Some(())
111}
112
/// The rotation of the page.
///
/// The variants correspond to the four multiples of 90 degrees a page can be rotated by.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Rotation {
    /// No rotation.
    None,
    /// A rotation of 90 degrees.
    Horizontal,
    /// A rotation of 180 degrees.
    Flipped,
    /// A rotation of 270 degrees.
    FlippedHorizontal,
}
125
126/// A PDF page.
127pub struct Page<'a> {
128    inner: Dict<'a>,
129    media_box: kurbo::Rect,
130    crop_box: kurbo::Rect,
131    rotation: Rotation,
132    page_streams: OnceLock<Option<Vec<u8>>>,
133    resources: Resources<'a>,
134    ctx: ReaderContext<'a>,
135}
136
137impl<'a> Page<'a> {
138    fn new(dict: Dict<'a>, ctx: &PagesContext, resources: Resources<'a>) -> Page<'a> {
139        let media_box = dict.get::<Rect>(MEDIA_BOX).or(ctx.media_box).unwrap_or(A4);
140
141        let crop_box = dict
142            .get::<Rect>(CROP_BOX)
143            .or(ctx.crop_box)
144            .unwrap_or(media_box);
145
146        let rotation = match dict.get::<u32>(ROTATE).or(ctx.rotate).unwrap_or(0) % 360 {
147            0 => Rotation::None,
148            90 => Rotation::Horizontal,
149            180 => Rotation::Flipped,
150            270 => Rotation::FlippedHorizontal,
151            _ => Rotation::None,
152        };
153
154        let ctx = resources.ctx;
155        let resources =
156            Resources::from_parent(dict.get::<Dict>(RESOURCES).unwrap_or_default(), resources);
157
158        Self {
159            inner: dict,
160            media_box,
161            crop_box,
162            rotation,
163            page_streams: OnceLock::new(),
164            resources,
165            ctx,
166        }
167    }
168
169    fn operations_impl(&self) -> Option<UntypedIter<'_>> {
170        let stream = self.page_stream()?;
171        let iter = UntypedIter::new(stream);
172
173        Some(iter)
174    }
175
176    /// Return the decoded content stream of the page.
177    pub fn page_stream(&self) -> Option<&[u8]> {
178        let convert_single = |s: Stream| {
179            let data = s.decoded().ok()?;
180            Some(data.to_vec())
181        };
182
183        self.page_streams
184            .get_or_init(|| {
185                if let Some(stream) = self.inner.get::<Stream>(CONTENTS) {
186                    convert_single(stream)
187                } else if let Some(array) = self.inner.get::<Array>(CONTENTS) {
188                    let streams = array.iter::<Stream>().flat_map(convert_single);
189
190                    let mut collected = vec![];
191
192                    for stream in streams {
193                        collected.extend(stream);
194                        // Streams must have at least one whitespace in-between.
195                        collected.push(b' ')
196                    }
197
198                    Some(collected)
199                } else {
200                    warn!("contents entry of page was neither stream nor array of streams");
201
202                    None
203                }
204            })
205            .as_ref()
206            .map(|d| d.as_slice())
207    }
208
209    /// Get the resources of the page.
210    pub fn resources(&self) -> &Resources<'a> {
211        &self.resources
212    }
213
214    /// Get the media box of the page.
215    pub fn media_box(&self) -> Rect {
216        self.media_box
217    }
218
219    /// Get the rotation of the page.
220    pub fn rotation(&self) -> Rotation {
221        self.rotation
222    }
223
224    /// Get the crop box of the page.
225    pub fn crop_box(&self) -> Rect {
226        self.crop_box
227    }
228
229    /// Return the intersection of crop box and media box.
230    pub fn intersected_crop_box(&self) -> Rect {
231        self.crop_box().intersect(self.media_box())
232    }
233
234    fn base_dimensions(&self) -> (f32, f32) {
235        let crop_box = self.intersected_crop_box();
236
237        if (crop_box.width() as f32).is_nearly_zero() || (crop_box.height() as f32).is_nearly_zero()
238        {
239            (A4.width() as f32, A4.height() as f32)
240        } else {
241            (
242                crop_box.width().max(1.0) as f32,
243                crop_box.height().max(1.0) as f32,
244            )
245        }
246    }
247
248    /// Return the initial transform that should be applied when rendering. This accounts for a
249    /// number of factors, such as the mismatch between PDF's y-up and most renderers' y-down
250    /// coordinate system, the rotation of the page and the offset of the crop box.
251    pub fn initial_transform(&self, invert_y: bool) -> kurbo::Affine {
252        let crop_box = self.intersected_crop_box();
253        let (_, base_height) = self.base_dimensions();
254        let (width, height) = self.render_dimensions();
255
256        let horizontal_t =
257            Affine::rotate(90.0f64.to_radians()) * Affine::translate((0.0, -width as f64));
258        let flipped_horizontal_t =
259            Affine::translate((0.0, height as f64)) * Affine::rotate(-90.0f64.to_radians());
260
261        let rotation_transform = match self.rotation() {
262            Rotation::None => Affine::IDENTITY,
263            Rotation::Horizontal => {
264                if invert_y {
265                    horizontal_t
266                } else {
267                    flipped_horizontal_t
268                }
269            }
270            Rotation::Flipped => {
271                Affine::scale(-1.0) * Affine::translate((-width as f64, -height as f64))
272            }
273            Rotation::FlippedHorizontal => {
274                if invert_y {
275                    flipped_horizontal_t
276                } else {
277                    horizontal_t
278                }
279            }
280        };
281
282        let inversion_transform = if invert_y {
283            Affine::new([1.0, 0.0, 0.0, -1.0, 0.0, base_height as f64])
284        } else {
285            Affine::IDENTITY
286        };
287
288        rotation_transform * inversion_transform * Affine::translate((-crop_box.x0, -crop_box.y0))
289    }
290
291    /// Return the with and height of the page that should be assumed when rendering the page.
292    ///
293    /// Depending on the document, it is either based on the media box or the crop box
294    /// of the page. In addition to that, it also takes the rotation of the page into account.
295    pub fn render_dimensions(&self) -> (f32, f32) {
296        let (mut base_width, mut base_height) = self.base_dimensions();
297
298        if matches!(
299            self.rotation(),
300            Rotation::Horizontal | Rotation::FlippedHorizontal
301        ) {
302            std::mem::swap(&mut base_width, &mut base_height);
303        }
304
305        (base_width, base_height)
306    }
307
308    /// Return an untyped iterator over the operators of the page's content stream.
309    pub fn operations(&self) -> UntypedIter<'_> {
310        self.operations_impl().unwrap_or(UntypedIter::empty())
311    }
312
313    /// Get the raw dictionary of the page.
314    pub fn raw(&self) -> &Dict<'a> {
315        &self.inner
316    }
317
318    /// Get the xref table (of the document the page belongs to).
319    pub fn xref(&self) -> &'a XRef {
320        self.ctx.xref
321    }
322
323    /// Return a typed iterator over the operators of the page's content stream.
324    pub fn typed_operations(&self) -> TypedIter<'_> {
325        TypedIter::from_untyped(self.operations())
326    }
327}
328
329/// A structure keeping track of the resources of a page.
330#[derive(Clone, Debug)]
331pub struct Resources<'a> {
332    parent: Option<Box<Resources<'a>>>,
333    ctx: ReaderContext<'a>,
334    /// The raw dictionary of external graphics states.
335    pub ext_g_states: Dict<'a>,
336    /// The raw dictionary of fonts.
337    pub fonts: Dict<'a>,
338    /// The raw dictionary of properties.
339    pub properties: Dict<'a>,
340    /// The raw dictionary of color spaces.
341    pub color_spaces: Dict<'a>,
342    /// The raw dictionary of x objects.
343    pub x_objects: Dict<'a>,
344    /// The raw dictionary of patterns.
345    pub patterns: Dict<'a>,
346    /// The raw dictionary of shadings.
347    pub shadings: Dict<'a>,
348}
349
350impl<'a> Resources<'a> {
351    /// Create a new `Resources` object from a dictionary with a parent.
352    pub fn from_parent(resources: Dict<'a>, parent: Resources<'a>) -> Resources<'a> {
353        let ctx = parent.ctx;
354
355        Self::new(resources, Some(parent), ctx)
356    }
357
358    /// Create a new `Resources` object.
359    pub(crate) fn new(
360        resources: Dict<'a>,
361        parent: Option<Resources<'a>>,
362        ctx: ReaderContext<'a>,
363    ) -> Resources<'a> {
364        let ext_g_states = resources.get::<Dict>(EXT_G_STATE).unwrap_or_default();
365        let fonts = resources.get::<Dict>(FONT).unwrap_or_default();
366        let color_spaces = resources.get::<Dict>(COLORSPACE).unwrap_or_default();
367        let x_objects = resources.get::<Dict>(XOBJECT).unwrap_or_default();
368        let patterns = resources.get::<Dict>(PATTERN).unwrap_or_default();
369        let shadings = resources.get::<Dict>(SHADING).unwrap_or_default();
370        let properties = resources.get::<Dict>(PROPERTIES).unwrap_or_default();
371
372        let parent = parent.map(Box::new);
373
374        Self {
375            parent,
376            ext_g_states,
377            fonts,
378            color_spaces,
379            properties,
380            x_objects,
381            patterns,
382            shadings,
383            ctx,
384        }
385    }
386
387    /// Resolve an object reference to an object.
388    #[allow(private_bounds)]
389    pub fn resolve_ref<T: ObjectLike<'a>>(&self, ref_: ObjRef) -> Option<T> {
390        self.ctx.xref.get(ref_.into())
391    }
392
393    fn get_resource<T: ObjectLike<'a>, U>(
394        &self,
395        name: Name,
396        dict: &Dict<'a>,
397        mut cache: impl FnMut(ObjRef) -> Option<U>,
398        mut resolve: impl FnMut(T) -> Option<U>,
399    ) -> Option<U> {
400        // TODO: Cache non-ref resources as well
401
402        match dict.get_raw::<T>(name.deref())? {
403            MaybeRef::Ref(ref_) => {
404                cache(ref_).or_else(|| self.ctx.xref.get::<T>(ref_.into()).and_then(&mut resolve))
405            }
406            MaybeRef::NotRef(i) => resolve(i),
407        }
408    }
409
410    /// Get the parent in the resource, chain, if available.
411    pub fn parent(&self) -> Option<&Resources<'a>> {
412        self.parent.as_deref()
413    }
414
415    // TODO: Refactor caching mechanism
416
417    /// Get an external graphics state by name.
418    pub fn get_ext_g_state<U>(
419        &self,
420        name: Name,
421        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
422        mut resolve: Box<dyn FnMut(Dict<'a>) -> Option<U> + '_>,
423    ) -> Option<U> {
424        self.get_resource::<Dict, U>(name.clone(), &self.ext_g_states, &mut cache, &mut resolve)
425            .or_else(|| {
426                self.parent
427                    .as_ref()
428                    .and_then(|p| p.get_ext_g_state::<U>(name, cache, resolve))
429            })
430    }
431
432    /// Get a color space by name.
433    pub fn get_color_space<U>(
434        &self,
435        name: Name,
436        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
437        mut resolve: Box<dyn FnMut(Object<'a>) -> Option<U> + '_>,
438    ) -> Option<U> {
439        self.get_resource::<Object, U>(name.clone(), &self.color_spaces, &mut cache, &mut resolve)
440            .or_else(|| {
441                self.parent
442                    .as_ref()
443                    .and_then(|p| p.get_color_space::<U>(name, cache, resolve))
444            })
445    }
446
447    /// Get a font by name.
448    pub fn get_font<U>(
449        &self,
450        name: Name,
451        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
452        mut resolve: Box<dyn FnMut(Dict<'a>) -> Option<U> + '_>,
453    ) -> Option<U> {
454        self.get_resource::<Dict, U>(name.clone(), &self.fonts, &mut cache, &mut resolve)
455            .or_else(|| {
456                self.parent
457                    .as_ref()
458                    .and_then(|p| p.get_font::<U>(name, cache, resolve))
459            })
460    }
461
462    /// Get a pattern by name.
463    pub fn get_pattern<U>(
464        &self,
465        name: Name,
466        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
467        mut resolve: Box<dyn FnMut(Object<'a>) -> Option<U> + '_>,
468    ) -> Option<U> {
469        self.get_resource::<Object, U>(name.clone(), &self.patterns, &mut cache, &mut resolve)
470            .or_else(|| {
471                self.parent
472                    .as_ref()
473                    .and_then(|p| p.get_pattern::<U>(name, cache, resolve))
474            })
475    }
476
477    /// Get an x object by name.
478    pub fn get_x_object<U>(
479        &self,
480        name: Name,
481        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
482        mut resolve: Box<dyn FnMut(Stream<'a>) -> Option<U> + '_>,
483    ) -> Option<U> {
484        self.get_resource::<Stream, U>(name.clone(), &self.x_objects, &mut cache, &mut resolve)
485            .or_else(|| {
486                self.parent
487                    .as_ref()
488                    .and_then(|p| p.get_x_object::<U>(name, cache, resolve))
489            })
490    }
491
492    /// Get a shading by name.
493    pub fn get_shading<U>(
494        &self,
495        name: Name,
496        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
497        mut resolve: Box<dyn FnMut(Object<'a>) -> Option<U> + '_>,
498    ) -> Option<U> {
499        self.get_resource::<Object, U>(name.clone(), &self.shadings, &mut cache, &mut resolve)
500            .or_else(|| {
501                self.parent
502                    .as_ref()
503                    .and_then(|p| p.get_shading::<U>(name, cache, resolve))
504            })
505    }
506}
507
// <https://github.com/apache/pdfbox/blob/a53a70db16ea3133994120bcf1e216b9e760c05b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDRectangle.java#L38>
/// The number of points in one inch.
const POINTS_PER_INCH: f64 = 72.0;
/// The number of points in one millimetre (one inch being `10.0 * 2.54` millimetres).
const POINTS_PER_MM: f64 = 1.0 / (10.0 * 2.54) * POINTS_PER_INCH;

/// The dimension of an A4 page.
///
/// Spans from the origin to 210 mm x 297 mm, expressed in points. It serves as the fallback
/// whenever a page provides no usable box of its own.
pub const A4: Rect = Rect {
    x0: 0.0,
    y0: 0.0,
    x1: 210.0 * POINTS_PER_MM,
    y1: 297.0 * POINTS_PER_MM,
};
519
520pub(crate) mod cached {
521    use crate::page::Pages;
522    use crate::reader::ReaderContext;
523    use crate::xref::XRef;
524    use std::ops::Deref;
525    use std::sync::Arc;
526
527    pub(crate) struct CachedPages {
528        pages: Pages<'static>,
529        // NOTE: `pages` references the data in `xref`, so it's important that `xref`
530        // appears after `pages` in the struct definition to ensure correct drop order.
531        _xref: Arc<XRef>,
532    }
533
534    impl CachedPages {
535        pub(crate) fn new(xref: Arc<XRef>) -> Option<Self> {
536            // SAFETY:
537            // - The XRef's location is stable in memory:
538            //   - We wrapped it in a `Arc`, which implements `StableDeref`.
539            //   - The struct owns the `Arc`, ensuring that the inner value is not dropped during the whole
540            //     duration.
541            // - The internal 'static lifetime is not leaked because its rewritten
542            //   to the self-lifetime in `pages()`.
543            let xref_reference: &'static XRef = unsafe { std::mem::transmute(xref.deref()) };
544
545            let ctx = ReaderContext::new(xref_reference, false);
546            let pages = xref_reference
547                .get(xref.trailer_data().pages_ref)
548                .and_then(|p| Pages::new(p, ctx, xref_reference))?;
549
550            Some(Self { pages, _xref: xref })
551        }
552
553        pub(crate) fn get(&self) -> &Pages<'_> {
554            &self.pages
555        }
556    }
557}