hayro_syntax/
page.rs

1//! Reading the pages of a PDF document.
2
3use crate::content::{TypedIter, UntypedIter};
4use crate::object::Array;
5use crate::object::Dict;
6use crate::object::Name;
7use crate::object::Rect;
8use crate::object::Stream;
9use crate::object::dict::keys::*;
10use crate::object::{MaybeRef, ObjRef};
11use crate::object::{Object, ObjectLike};
12use crate::reader::ReaderContext;
13use crate::util::FloatExt;
14use crate::xref::XRef;
15use kurbo::Affine;
16use log::warn;
17use std::ops::Deref;
18use std::sync::OnceLock;
19
20/// Attributes that can be inherited.
21#[derive(Debug, Clone)]
22struct PagesContext {
23    media_box: Option<Rect>,
24    crop_box: Option<Rect>,
25    rotate: Option<u32>,
26}
27
28impl PagesContext {
29    fn new() -> Self {
30        Self {
31            media_box: None,
32            crop_box: None,
33            rotate: None,
34        }
35    }
36}
37
38/// A structure holding the pages of a PDF document.
39pub struct Pages<'a> {
40    pages: Vec<Page<'a>>,
41    xref: &'a XRef,
42}
43
44impl<'a> Pages<'a> {
45    /// Create a new `Pages` object.
46    pub(crate) fn new(
47        pages_dict: Dict<'a>,
48        ctx: &ReaderContext<'a>,
49        xref: &'a XRef,
50    ) -> Option<Pages<'a>> {
51        let mut pages = vec![];
52        let pages_ctx = PagesContext::new();
53        resolve_pages(
54            pages_dict,
55            &mut pages,
56            pages_ctx,
57            Resources::new(Dict::empty(), None, ctx),
58        )?;
59
60        Some(Self { pages, xref })
61    }
62
63    /// Return the xref table (of the document the pages belong to).   
64    pub fn xref(&self) -> &'a XRef {
65        self.xref
66    }
67}
68
69impl<'a> Deref for Pages<'a> {
70    type Target = [Page<'a>];
71
72    fn deref(&self) -> &Self::Target {
73        &self.pages
74    }
75}
76
77fn resolve_pages<'a>(
78    pages_dict: Dict<'a>,
79    entries: &mut Vec<Page<'a>>,
80    mut ctx: PagesContext,
81    resources: Resources<'a>,
82) -> Option<()> {
83    if let Some(media_box) = pages_dict.get::<Rect>(MEDIA_BOX) {
84        ctx.media_box = Some(media_box);
85    }
86
87    if let Some(crop_box) = pages_dict.get::<Rect>(CROP_BOX) {
88        ctx.crop_box = Some(crop_box);
89    }
90
91    if let Some(rotate) = pages_dict.get::<u32>(ROTATE) {
92        ctx.rotate = Some(rotate);
93    }
94
95    let resources = Resources::from_parent(
96        pages_dict.get::<Dict>(RESOURCES).unwrap_or_default(),
97        resources.clone(),
98    );
99
100    let kids = pages_dict.get::<Array<'a>>(KIDS)?;
101
102    for dict in kids.iter::<Dict>() {
103        match dict.get::<Name>(TYPE)?.deref() {
104            PAGES => resolve_pages(dict, entries, ctx.clone(), resources.clone())?,
105            PAGE => entries.push(Page::new(dict, &ctx, resources.clone())),
106            _ => return None,
107        }
108    }
109
110    Some(())
111}
112
/// The rotation of the page.
///
/// Values can be compared and hashed, so callers can test for a specific
/// rotation with `==` or use it as a map key.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum Rotation {
    /// No rotation.
    None,
    /// A rotation of 90 degrees.
    Horizontal,
    /// A rotation of 180 degrees.
    Flipped,
    /// A rotation of 270 degrees.
    FlippedHorizontal,
}
125
/// A PDF page.
///
/// Boxes and rotation are resolved once, when the page is constructed; the
/// content stream is decoded lazily on first access.
pub struct Page<'a> {
    /// The raw page dictionary.
    inner: Dict<'a>,
    /// The media box that applies to this page.
    media_box: kurbo::Rect,
    /// The crop box that applies to this page.
    crop_box: kurbo::Rect,
    /// The rotation that applies to this page.
    rotation: Rotation,
    /// Cache for the decoded content stream; the inner `Option` is `None` when
    /// no content could be obtained.
    page_streams: OnceLock<Option<Vec<u8>>>,
    /// The resource chain of this page (own resources first, then ancestors).
    resources: Resources<'a>,
    /// The reader context, taken from the resource chain the page was created with.
    ctx: ReaderContext<'a>,
}
136
137impl<'a> Page<'a> {
138    fn new(dict: Dict<'a>, ctx: &PagesContext, resources: Resources<'a>) -> Page<'a> {
139        let media_box = dict.get::<Rect>(MEDIA_BOX).or(ctx.media_box).unwrap_or(A4);
140
141        let crop_box = dict
142            .get::<Rect>(CROP_BOX)
143            .or(ctx.crop_box)
144            .unwrap_or(media_box);
145
146        let rotation = match dict.get::<u32>(ROTATE).or(ctx.rotate).unwrap_or(0) % 360 {
147            0 => Rotation::None,
148            90 => Rotation::Horizontal,
149            180 => Rotation::Flipped,
150            270 => Rotation::FlippedHorizontal,
151            _ => Rotation::None,
152        };
153
154        let ctx = resources.ctx.clone();
155        let resources =
156            Resources::from_parent(dict.get::<Dict>(RESOURCES).unwrap_or_default(), resources);
157
158        Self {
159            inner: dict,
160            media_box,
161            crop_box,
162            rotation,
163            page_streams: OnceLock::new(),
164            resources,
165            ctx,
166        }
167    }
168
169    fn operations_impl(&self) -> Option<UntypedIter<'_>> {
170        let stream = self.page_stream()?;
171        let iter = UntypedIter::new(stream);
172
173        Some(iter)
174    }
175
176    /// Return the decoded content stream of the page.
177    pub fn page_stream(&self) -> Option<&[u8]> {
178        let convert_single = |s: Stream| {
179            let data = s.decoded().ok()?;
180            Some(data.to_vec())
181        };
182
183        self.page_streams
184            .get_or_init(|| {
185                if let Some(stream) = self.inner.get::<Stream>(CONTENTS) {
186                    convert_single(stream)
187                } else if let Some(array) = self.inner.get::<Array>(CONTENTS) {
188                    let streams = array.iter::<Stream>().flat_map(convert_single);
189
190                    let mut collected = vec![];
191
192                    for stream in streams {
193                        collected.extend(stream);
194                        // Streams must have at least one whitespace in-between.
195                        collected.push(b' ')
196                    }
197
198                    Some(collected)
199                } else {
200                    warn!("contents entry of page was neither stream nor array of streams");
201
202                    None
203                }
204            })
205            .as_ref()
206            .map(|d| d.as_slice())
207    }
208
    /// Get the resources of the page.
    ///
    /// The returned value is the head of the page's resource chain; use
    /// [`Resources::parent`] to reach the resources of enclosing nodes.
    pub fn resources(&self) -> &Resources<'a> {
        &self.resources
    }
213
    /// Get the media box of the page.
    ///
    /// This is the value that was resolved when the page was constructed.
    pub fn media_box(&self) -> Rect {
        self.media_box
    }
218
    /// Get the rotation of the page.
    ///
    /// This is the value that was resolved when the page was constructed.
    pub fn rotation(&self) -> Rotation {
        self.rotation
    }
223
    /// Get the crop box of the page.
    ///
    /// This is the value that was resolved when the page was constructed; it is
    /// not clipped to the media box (see `intersected_crop_box` for that).
    pub fn crop_box(&self) -> Rect {
        self.crop_box
    }
228
229    /// Return the intersection of crop box and media box.
230    pub fn intersected_crop_box(&self) -> Rect {
231        self.crop_box().intersect(self.media_box())
232    }
233
234    fn base_dimensions(&self) -> (f32, f32) {
235        let crop_box = self.intersected_crop_box();
236
237        if (crop_box.width() as f32).is_nearly_zero() || (crop_box.height() as f32).is_nearly_zero()
238        {
239            (A4.width() as f32, A4.height() as f32)
240        } else {
241            (
242                crop_box.width().max(1.0) as f32,
243                crop_box.height().max(1.0) as f32,
244            )
245        }
246    }
247
248    /// Return the initial transform that should be applied when rendering. This accounts for a
249    /// number of factors, such as the mismatch between PDF's y-up and most renderers' y-down
250    /// coordinate system, the rotation of the page and the offset of the crop box.
251    pub fn initial_transform(&self, invert_y: bool) -> kurbo::Affine {
252        let crop_box = self.intersected_crop_box();
253        let (_, base_height) = self.base_dimensions();
254        let (width, height) = self.render_dimensions();
255
256        let horizontal_t =
257            Affine::rotate(90.0f64.to_radians()) * Affine::translate((0.0, -width as f64));
258        let flipped_horizontal_t =
259            Affine::translate((0.0, height as f64)) * Affine::rotate(-90.0f64.to_radians());
260
261        let rotation_transform = match self.rotation() {
262            Rotation::None => Affine::IDENTITY,
263            Rotation::Horizontal => {
264                if invert_y {
265                    horizontal_t
266                } else {
267                    flipped_horizontal_t
268                }
269            }
270            Rotation::Flipped => {
271                Affine::scale(-1.0) * Affine::translate((-width as f64, -height as f64))
272            }
273            Rotation::FlippedHorizontal => {
274                if invert_y {
275                    flipped_horizontal_t
276                } else {
277                    horizontal_t
278                }
279            }
280        };
281
282        let inversion_transform = if invert_y {
283            Affine::new([1.0, 0.0, 0.0, -1.0, 0.0, base_height as f64])
284        } else {
285            Affine::IDENTITY
286        };
287
288        rotation_transform * inversion_transform * Affine::translate((-crop_box.x0, -crop_box.y0))
289    }
290
291    /// Return the with and height of the page that should be assumed when rendering the page.
292    ///
293    /// Depending on the document, it is either based on the media box or the crop box
294    /// of the page. In addition to that, it also takes the rotation of the page into account.
295    pub fn render_dimensions(&self) -> (f32, f32) {
296        let (mut base_width, mut base_height) = self.base_dimensions();
297
298        if matches!(
299            self.rotation(),
300            Rotation::Horizontal | Rotation::FlippedHorizontal
301        ) {
302            std::mem::swap(&mut base_width, &mut base_height);
303        }
304
305        (base_width, base_height)
306    }
307
308    /// Return an untyped iterator over the operators of the page's content stream.
309    pub fn operations(&self) -> UntypedIter<'_> {
310        self.operations_impl().unwrap_or(UntypedIter::empty())
311    }
312
    /// Get the raw dictionary of the page.
    ///
    /// This is the dictionary as stored in the page tree; inherited attributes
    /// are not merged into it.
    pub fn raw(&self) -> &Dict<'a> {
        &self.inner
    }
317
    /// Get the xref table (of the document the page belongs to).
    ///
    /// The table is taken from the reader context stored in the page.
    pub fn xref(&self) -> &'a XRef {
        self.ctx.xref
    }
322
323    /// Return a typed iterator over the operators of the page's content stream.
324    pub fn typed_operations(&self) -> TypedIter<'_> {
325        TypedIter::from_untyped(self.operations())
326    }
327}
328
/// A structure keeping track of the resources of a page.
///
/// Resources form a chain: each value holds the sub-dictionaries of one
/// resource dictionary plus an optional parent that is consulted by the
/// `get_*` methods when a name cannot be resolved locally.
#[derive(Clone, Debug)]
pub struct Resources<'a> {
    /// The next link in the resource chain, if any.
    parent: Option<Box<Resources<'a>>>,
    /// The reader context used to resolve object references.
    ctx: ReaderContext<'a>,
    /// The raw dictionary of external graphics states.
    pub ext_g_states: Dict<'a>,
    /// The raw dictionary of fonts.
    pub fonts: Dict<'a>,
    /// The raw dictionary of properties.
    pub properties: Dict<'a>,
    /// The raw dictionary of color spaces.
    pub color_spaces: Dict<'a>,
    /// The raw dictionary of x objects.
    pub x_objects: Dict<'a>,
    /// The raw dictionary of patterns.
    pub patterns: Dict<'a>,
    /// The raw dictionary of shadings.
    pub shadings: Dict<'a>,
}
349
350impl<'a> Resources<'a> {
351    /// Create a new `Resources` object from a dictionary with a parent.
352    pub fn from_parent(resources: Dict<'a>, parent: Resources<'a>) -> Resources<'a> {
353        let ctx = parent.ctx.clone();
354
355        Self::new(resources, Some(parent), &ctx)
356    }
357
358    /// Create a new `Resources` object.
359    pub(crate) fn new(
360        resources: Dict<'a>,
361        parent: Option<Resources<'a>>,
362        ctx: &ReaderContext<'a>,
363    ) -> Resources<'a> {
364        let ext_g_states = resources.get::<Dict>(EXT_G_STATE).unwrap_or_default();
365        let fonts = resources.get::<Dict>(FONT).unwrap_or_default();
366        let color_spaces = resources.get::<Dict>(COLORSPACE).unwrap_or_default();
367        let x_objects = resources.get::<Dict>(XOBJECT).unwrap_or_default();
368        let patterns = resources.get::<Dict>(PATTERN).unwrap_or_default();
369        let shadings = resources.get::<Dict>(SHADING).unwrap_or_default();
370        let properties = resources.get::<Dict>(PROPERTIES).unwrap_or_default();
371
372        let parent = parent.map(Box::new);
373
374        Self {
375            parent,
376            ext_g_states,
377            fonts,
378            color_spaces,
379            properties,
380            x_objects,
381            patterns,
382            shadings,
383            ctx: ctx.clone(),
384        }
385    }
386
387    /// Resolve an object reference to an object.
388    #[allow(private_bounds)]
389    pub fn resolve_ref<T: ObjectLike<'a>>(&self, ref_: ObjRef) -> Option<T> {
390        self.ctx.xref.get_with(ref_.into(), &self.ctx)
391    }
392
393    fn get_resource<T: ObjectLike<'a>, U>(
394        &self,
395        name: Name,
396        dict: &Dict<'a>,
397        mut cache: impl FnMut(ObjRef) -> Option<U>,
398        mut resolve: impl FnMut(T) -> Option<U>,
399    ) -> Option<U> {
400        // TODO: Cache non-ref resources as well
401
402        match dict.get_raw::<T>(name.deref())? {
403            MaybeRef::Ref(ref_) => cache(ref_).or_else(|| {
404                self.ctx
405                    .xref
406                    .get_with::<T>(ref_.into(), &self.ctx)
407                    .and_then(&mut resolve)
408            }),
409            MaybeRef::NotRef(i) => resolve(i),
410        }
411    }
412
    /// Get the parent in the resource chain, if available.
    pub fn parent(&self) -> Option<&Resources<'a>> {
        self.parent.as_deref()
    }
417
418    // TODO: Refactor caching mechanism
419
420    /// Get an external graphics state by name.
421    pub fn get_ext_g_state<U>(
422        &self,
423        name: Name,
424        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
425        mut resolve: Box<dyn FnMut(Dict<'a>) -> Option<U> + '_>,
426    ) -> Option<U> {
427        self.get_resource::<Dict, U>(name.clone(), &self.ext_g_states, &mut cache, &mut resolve)
428            .or_else(|| {
429                self.parent
430                    .as_ref()
431                    .and_then(|p| p.get_ext_g_state::<U>(name, cache, resolve))
432            })
433    }
434
435    /// Get a color space by name.
436    pub fn get_color_space<U>(
437        &self,
438        name: Name,
439        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
440        mut resolve: Box<dyn FnMut(Object<'a>) -> Option<U> + '_>,
441    ) -> Option<U> {
442        self.get_resource::<Object, U>(name.clone(), &self.color_spaces, &mut cache, &mut resolve)
443            .or_else(|| {
444                self.parent
445                    .as_ref()
446                    .and_then(|p| p.get_color_space::<U>(name, cache, resolve))
447            })
448    }
449
450    /// Get a font by name.
451    pub fn get_font<U>(
452        &self,
453        name: Name,
454        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
455        mut resolve: Box<dyn FnMut(Dict<'a>) -> Option<U> + '_>,
456    ) -> Option<U> {
457        self.get_resource::<Dict, U>(name.clone(), &self.fonts, &mut cache, &mut resolve)
458            .or_else(|| {
459                self.parent
460                    .as_ref()
461                    .and_then(|p| p.get_font::<U>(name, cache, resolve))
462            })
463    }
464
465    /// Get a pattern by name.
466    pub fn get_pattern<U>(
467        &self,
468        name: Name,
469        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
470        mut resolve: Box<dyn FnMut(Object<'a>) -> Option<U> + '_>,
471    ) -> Option<U> {
472        self.get_resource::<Object, U>(name.clone(), &self.patterns, &mut cache, &mut resolve)
473            .or_else(|| {
474                self.parent
475                    .as_ref()
476                    .and_then(|p| p.get_pattern::<U>(name, cache, resolve))
477            })
478    }
479
480    /// Get an x object by name.
481    pub fn get_x_object<U>(
482        &self,
483        name: Name,
484        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
485        mut resolve: Box<dyn FnMut(Stream<'a>) -> Option<U> + '_>,
486    ) -> Option<U> {
487        self.get_resource::<Stream, U>(name.clone(), &self.x_objects, &mut cache, &mut resolve)
488            .or_else(|| {
489                self.parent
490                    .as_ref()
491                    .and_then(|p| p.get_x_object::<U>(name, cache, resolve))
492            })
493    }
494
495    /// Get a shading by name.
496    pub fn get_shading<U>(
497        &self,
498        name: Name,
499        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
500        mut resolve: Box<dyn FnMut(Object<'a>) -> Option<U> + '_>,
501    ) -> Option<U> {
502        self.get_resource::<Object, U>(name.clone(), &self.shadings, &mut cache, &mut resolve)
503            .or_else(|| {
504                self.parent
505                    .as_ref()
506                    .and_then(|p| p.get_shading::<U>(name, cache, resolve))
507            })
508    }
509}
510
// <https://github.com/apache/pdfbox/blob/a53a70db16ea3133994120bcf1e216b9e760c05b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDRectangle.java#L38>
// Number of points in one inch.
const POINTS_PER_INCH: f64 = 72.0;
// Number of points in one millimetre (one inch is 25.4 mm).
const POINTS_PER_MM: f64 = 1.0 / (10.0 * 2.54) * POINTS_PER_INCH;

/// The dimension of an A4 page.
///
/// The rectangle starts at the origin and spans 210 mm by 297 mm, expressed
/// in points.
pub const A4: Rect = Rect {
    x0: 0.0,
    y0: 0.0,
    x1: 210.0 * POINTS_PER_MM,
    y1: 297.0 * POINTS_PER_MM,
};
522
523pub(crate) mod cached {
524    use crate::page::Pages;
525    use crate::reader::ReaderContext;
526    use crate::xref::XRef;
527    use std::ops::Deref;
528    use std::sync::Arc;
529
530    pub(crate) struct CachedPages {
531        pages: Pages<'static>,
532        // NOTE: `pages` references the data in `xref`, so it's important that `xref`
533        // appears after `pages` in the struct definition to ensure correct drop order.
534        _xref: Arc<XRef>,
535    }
536
537    impl CachedPages {
538        pub(crate) fn new(xref: Arc<XRef>) -> Option<Self> {
539            // SAFETY:
540            // - The XRef's location is stable in memory:
541            //   - We wrapped it in a `Arc`, which implements `StableDeref`.
542            //   - The struct owns the `Arc`, ensuring that the inner value is not dropped during the whole
543            //     duration.
544            // - The internal 'static lifetime is not leaked because its rewritten
545            //   to the self-lifetime in `pages()`.
546            let xref_reference: &'static XRef = unsafe { std::mem::transmute(xref.deref()) };
547
548            let ctx = ReaderContext::new(xref_reference, false);
549            let pages = xref_reference
550                .get_with(xref.trailer_data().pages_ref, &ctx)
551                .and_then(|p| Pages::new(p, &ctx, xref_reference))?;
552
553            Some(Self { pages, _xref: xref })
554        }
555
556        pub(crate) fn get(&self) -> &Pages<'_> {
557            &self.pages
558        }
559    }
560}