hayro_syntax/
page.rs

1//! Reading the pages of a PDF document.
2
3use crate::content::{TypedIter, UntypedIter};
4use crate::object::Array;
5use crate::object::Dict;
6use crate::object::Name;
7use crate::object::Rect;
8use crate::object::Stream;
9use crate::object::dict::keys::*;
10use crate::object::{Object, ObjectLike};
11use crate::reader::ReaderContext;
12use crate::util::FloatExt;
13use crate::xref::XRef;
14use log::warn;
15use std::ops::Deref;
16use std::sync::OnceLock;
17
18/// Attributes that can be inherited.
19#[derive(Debug, Clone)]
20struct PagesContext {
21    media_box: Option<Rect>,
22    crop_box: Option<Rect>,
23    rotate: Option<u32>,
24}
25
26impl PagesContext {
27    fn new() -> Self {
28        Self {
29            media_box: None,
30            crop_box: None,
31            rotate: None,
32        }
33    }
34}
35
36/// A structure holding the pages of a PDF document.
37pub struct Pages<'a> {
38    pages: Vec<Page<'a>>,
39    xref: &'a XRef,
40}
41
42impl<'a> Pages<'a> {
43    /// Create a new `Pages` object.
44    pub(crate) fn new(
45        pages_dict: &Dict<'a>,
46        ctx: &ReaderContext<'a>,
47        xref: &'a XRef,
48    ) -> Option<Self> {
49        let mut pages = vec![];
50        let pages_ctx = PagesContext::new();
51        resolve_pages(
52            pages_dict,
53            &mut pages,
54            pages_ctx,
55            Resources::new(Dict::empty(), None, ctx),
56        )?;
57
58        Some(Self { pages, xref })
59    }
60
61    /// Create a new `Pages` object by bruteforce-searching.
62    ///
63    /// Of course this could result in the order of pages being messed up, but
64    /// this is still better than nothing.
65    pub(crate) fn new_brute_force(ctx: &ReaderContext<'a>, xref: &'a XRef) -> Option<Self> {
66        let mut pages = vec![];
67
68        for object in xref.objects() {
69            if let Some(dict) = object.into_dict()
70                && let Some(page) = Page::new(
71                    &dict,
72                    &PagesContext::new(),
73                    Resources::new(Dict::empty(), None, ctx),
74                )
75            {
76                pages.push(page);
77            }
78        }
79
80        if pages.is_empty() {
81            return None;
82        }
83
84        Some(Self { pages, xref })
85    }
86
87    /// Return the xref table (of the document the pages belong to).   
88    pub fn xref(&self) -> &'a XRef {
89        self.xref
90    }
91}
92
93impl<'a> Deref for Pages<'a> {
94    type Target = [Page<'a>];
95
96    fn deref(&self) -> &Self::Target {
97        &self.pages
98    }
99}
100
101fn resolve_pages<'a>(
102    pages_dict: &Dict<'a>,
103    entries: &mut Vec<Page<'a>>,
104    mut ctx: PagesContext,
105    resources: Resources<'a>,
106) -> Option<()> {
107    if let Some(media_box) = pages_dict.get::<Rect>(MEDIA_BOX) {
108        ctx.media_box = Some(media_box);
109    }
110
111    if let Some(crop_box) = pages_dict.get::<Rect>(CROP_BOX) {
112        ctx.crop_box = Some(crop_box);
113    }
114
115    if let Some(rotate) = pages_dict.get::<u32>(ROTATE) {
116        ctx.rotate = Some(rotate);
117    }
118
119    let resources = Resources::from_parent(
120        pages_dict.get::<Dict<'_>>(RESOURCES).unwrap_or_default(),
121        resources.clone(),
122    );
123
124    let kids = pages_dict.get::<Array<'a>>(KIDS)?;
125
126    for dict in kids.iter::<Dict<'_>>() {
127        match dict.get::<Name<'_>>(TYPE).as_deref() {
128            Some(PAGES) => {
129                resolve_pages(&dict, entries, ctx.clone(), resources.clone());
130            }
131            // Let's be lenient and assume it's a `Page` in case it's `None` or something else
132            // (see corpus test case 0083781).
133            _ => {
134                if let Some(page) = Page::new(&dict, &ctx, resources.clone()) {
135                    entries.push(page);
136                }
137            }
138        }
139    }
140
141    Some(())
142}
143
/// The rotation of the page.
///
/// Rotations can be compared with `==` / `!=`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Rotation {
    /// No rotation.
    None,
    /// A rotation of 90 degrees.
    Horizontal,
    /// A rotation of 180 degrees.
    Flipped,
    /// A rotation of 270 degrees.
    FlippedHorizontal,
}
156
157/// A PDF page.
158pub struct Page<'a> {
159    inner: Dict<'a>,
160    media_box: Rect,
161    crop_box: Rect,
162    rotation: Rotation,
163    page_streams: OnceLock<Option<Vec<u8>>>,
164    resources: Resources<'a>,
165    ctx: ReaderContext<'a>,
166}
167
168impl<'a> Page<'a> {
169    fn new(dict: &Dict<'a>, ctx: &PagesContext, resources: Resources<'a>) -> Option<Self> {
170        if !dict.contains_key(CONTENTS) {
171            return None;
172        }
173
174        let media_box = dict.get::<Rect>(MEDIA_BOX).or(ctx.media_box).unwrap_or(A4);
175
176        let crop_box = dict
177            .get::<Rect>(CROP_BOX)
178            .or(ctx.crop_box)
179            .unwrap_or(media_box);
180
181        let rotation = match dict.get::<u32>(ROTATE).or(ctx.rotate).unwrap_or(0) % 360 {
182            0 => Rotation::None,
183            90 => Rotation::Horizontal,
184            180 => Rotation::Flipped,
185            270 => Rotation::FlippedHorizontal,
186            _ => Rotation::None,
187        };
188
189        let ctx = resources.ctx.clone();
190        let resources = Resources::from_parent(
191            dict.get::<Dict<'_>>(RESOURCES).unwrap_or_default(),
192            resources,
193        );
194
195        Some(Self {
196            inner: dict.clone(),
197            media_box,
198            crop_box,
199            rotation,
200            page_streams: OnceLock::new(),
201            resources,
202            ctx,
203        })
204    }
205
206    fn operations_impl(&self) -> Option<UntypedIter<'_>> {
207        let stream = self.page_stream()?;
208        let iter = UntypedIter::new(stream);
209
210        Some(iter)
211    }
212
213    /// Return the decoded content stream of the page.
214    pub fn page_stream(&self) -> Option<&[u8]> {
215        let convert_single = |s: Stream<'_>| {
216            let data = s.decoded().ok()?;
217            Some(data.to_vec())
218        };
219
220        self.page_streams
221            .get_or_init(|| {
222                if let Some(stream) = self.inner.get::<Stream<'_>>(CONTENTS) {
223                    convert_single(stream)
224                } else if let Some(array) = self.inner.get::<Array<'_>>(CONTENTS) {
225                    let streams = array.iter::<Stream<'_>>().flat_map(convert_single);
226
227                    let mut collected = vec![];
228
229                    for stream in streams {
230                        collected.extend(stream);
231                        // Streams must have at least one whitespace in-between.
232                        collected.push(b' ');
233                    }
234
235                    Some(collected)
236                } else {
237                    warn!("contents entry of page was neither stream nor array of streams");
238
239                    None
240                }
241            })
242            .as_ref()
243            .map(|d| d.as_slice())
244    }
245
246    /// Get the resources of the page.
247    pub fn resources(&self) -> &Resources<'a> {
248        &self.resources
249    }
250
251    /// Get the media box of the page.
252    pub fn media_box(&self) -> Rect {
253        self.media_box
254    }
255
256    /// Get the rotation of the page.
257    pub fn rotation(&self) -> Rotation {
258        self.rotation
259    }
260
261    /// Get the crop box of the page.
262    pub fn crop_box(&self) -> Rect {
263        self.crop_box
264    }
265
266    /// Return the intersection of crop box and media box.
267    pub fn intersected_crop_box(&self) -> Rect {
268        self.crop_box().intersect(self.media_box())
269    }
270
271    /// Return the base dimensions of the page (same as `intersected_crop_box`, but with special
272    /// handling applied for zero-area pages).
273    pub fn base_dimensions(&self) -> (f32, f32) {
274        let crop_box = self.intersected_crop_box();
275
276        if (crop_box.width() as f32).is_nearly_zero() || (crop_box.height() as f32).is_nearly_zero()
277        {
278            (A4.width() as f32, A4.height() as f32)
279        } else {
280            (
281                crop_box.width().max(1.0) as f32,
282                crop_box.height().max(1.0) as f32,
283            )
284        }
285    }
286
287    /// Return the with and height of the page that should be assumed when rendering the page.
288    ///
289    /// Depending on the document, it is either based on the media box or the crop box
290    /// of the page. In addition to that, it also takes the rotation of the page into account.
291    pub fn render_dimensions(&self) -> (f32, f32) {
292        let (mut base_width, mut base_height) = self.base_dimensions();
293
294        if matches!(
295            self.rotation(),
296            Rotation::Horizontal | Rotation::FlippedHorizontal
297        ) {
298            std::mem::swap(&mut base_width, &mut base_height);
299        }
300
301        (base_width, base_height)
302    }
303
304    /// Return an untyped iterator over the operators of the page's content stream.
305    pub fn operations(&self) -> UntypedIter<'_> {
306        self.operations_impl().unwrap_or(UntypedIter::empty())
307    }
308
309    /// Get the raw dictionary of the page.
310    pub fn raw(&self) -> &Dict<'a> {
311        &self.inner
312    }
313
314    /// Get the xref table (of the document the page belongs to).
315    pub fn xref(&self) -> &'a XRef {
316        self.ctx.xref
317    }
318
319    /// Return a typed iterator over the operators of the page's content stream.
320    pub fn typed_operations(&self) -> TypedIter<'_> {
321        TypedIter::from_untyped(self.operations())
322    }
323}
324
325/// A structure keeping track of the resources of a page.
326#[derive(Clone, Debug)]
327pub struct Resources<'a> {
328    parent: Option<Box<Self>>,
329    ctx: ReaderContext<'a>,
330    /// The raw dictionary of external graphics states.
331    pub ext_g_states: Dict<'a>,
332    /// The raw dictionary of fonts.
333    pub fonts: Dict<'a>,
334    /// The raw dictionary of properties.
335    pub properties: Dict<'a>,
336    /// The raw dictionary of color spaces.
337    pub color_spaces: Dict<'a>,
338    /// The raw dictionary of x objects.
339    pub x_objects: Dict<'a>,
340    /// The raw dictionary of patterns.
341    pub patterns: Dict<'a>,
342    /// The raw dictionary of shadings.
343    pub shadings: Dict<'a>,
344}
345
346impl<'a> Resources<'a> {
347    /// Create a new `Resources` object from a dictionary with a parent.
348    pub fn from_parent(resources: Dict<'a>, parent: Self) -> Self {
349        let ctx = parent.ctx.clone();
350
351        Self::new(resources, Some(parent), &ctx)
352    }
353
354    /// Create a new `Resources` object.
355    pub(crate) fn new(resources: Dict<'a>, parent: Option<Self>, ctx: &ReaderContext<'a>) -> Self {
356        let ext_g_states = resources.get::<Dict<'_>>(EXT_G_STATE).unwrap_or_default();
357        let fonts = resources.get::<Dict<'_>>(FONT).unwrap_or_default();
358        let color_spaces = resources.get::<Dict<'_>>(COLORSPACE).unwrap_or_default();
359        let x_objects = resources.get::<Dict<'_>>(XOBJECT).unwrap_or_default();
360        let patterns = resources.get::<Dict<'_>>(PATTERN).unwrap_or_default();
361        let shadings = resources.get::<Dict<'_>>(SHADING).unwrap_or_default();
362        let properties = resources.get::<Dict<'_>>(PROPERTIES).unwrap_or_default();
363
364        let parent = parent.map(Box::new);
365
366        Self {
367            parent,
368            ext_g_states,
369            fonts,
370            color_spaces,
371            properties,
372            x_objects,
373            patterns,
374            shadings,
375            ctx: ctx.clone(),
376        }
377    }
378
379    fn get_resource<T: ObjectLike<'a>>(&self, name: Name<'_>, dict: &Dict<'a>) -> Option<T> {
380        dict.get::<T>(name.deref())
381    }
382
383    /// Get the parent in the resource, chain, if available.
384    pub fn parent(&self) -> Option<&Self> {
385        self.parent.as_deref()
386    }
387
388    /// Get an external graphics state by name.
389    pub fn get_ext_g_state(&self, name: Name<'_>) -> Option<Dict<'a>> {
390        self.get_resource::<Dict<'_>>(name.clone(), &self.ext_g_states)
391            .or_else(|| self.parent.as_ref().and_then(|p| p.get_ext_g_state(name)))
392    }
393
394    /// Get a color space by name.
395    pub fn get_color_space(&self, name: Name<'_>) -> Option<Object<'a>> {
396        self.get_resource::<Object<'_>>(name.clone(), &self.color_spaces)
397            .or_else(|| self.parent.as_ref().and_then(|p| p.get_color_space(name)))
398    }
399
400    /// Get a font by name.
401    pub fn get_font(&self, name: Name<'_>) -> Option<Dict<'a>> {
402        self.get_resource::<Dict<'_>>(name.clone(), &self.fonts)
403            .or_else(|| self.parent.as_ref().and_then(|p| p.get_font(name)))
404    }
405
406    /// Get a pattern by name.
407    pub fn get_pattern(&self, name: Name<'_>) -> Option<Object<'a>> {
408        self.get_resource::<Object<'_>>(name.clone(), &self.patterns)
409            .or_else(|| self.parent.as_ref().and_then(|p| p.get_pattern(name)))
410    }
411
412    /// Get an x object by name.
413    pub fn get_x_object(&self, name: Name<'_>) -> Option<Stream<'a>> {
414        self.get_resource::<Stream<'_>>(name.clone(), &self.x_objects)
415            .or_else(|| self.parent.as_ref().and_then(|p| p.get_x_object(name)))
416    }
417
418    /// Get a shading by name.
419    pub fn get_shading(&self, name: Name<'_>) -> Option<Object<'a>> {
420        self.get_resource::<Object<'_>>(name.clone(), &self.shadings)
421            .or_else(|| self.parent.as_ref().and_then(|p| p.get_shading(name)))
422    }
423}
424
// Conversion factors, see
// <https://github.com/apache/pdfbox/blob/a53a70db16ea3133994120bcf1e216b9e760c05b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDRectangle.java#L38>
/// Number of points in one inch.
const POINTS_PER_INCH: f64 = 72.0;
/// Number of millimeters in one inch.
const MM_PER_INCH: f64 = 10.0 * 2.54;
/// Number of points in one millimeter.
const POINTS_PER_MM: f64 = 1.0 / MM_PER_INCH * POINTS_PER_INCH;
428
429/// The dimension of an A4 page.
430pub const A4: Rect = Rect {
431    x0: 0.0,
432    y0: 0.0,
433    x1: 210.0 * POINTS_PER_MM,
434    y1: 297.0 * POINTS_PER_MM,
435};
436
437pub(crate) mod cached {
438    use crate::page::Pages;
439    use crate::reader::ReaderContext;
440    use crate::xref::XRef;
441    use std::ops::Deref;
442    use std::sync::Arc;
443
444    pub(crate) struct CachedPages {
445        pages: Pages<'static>,
446        // NOTE: `pages` references the data in `xref`, so it's important that `xref`
447        // appears after `pages` in the struct definition to ensure correct drop order.
448        _xref: Arc<XRef>,
449    }
450
451    impl CachedPages {
452        pub(crate) fn new(xref: Arc<XRef>) -> Option<Self> {
453            // SAFETY:
454            // - The XRef's location is stable in memory:
455            //   - We wrapped it in a `Arc`, which implements `StableDeref`.
456            //   - The struct owns the `Arc`, ensuring that the inner value is not dropped during the whole
457            //     duration.
458            // - The internal 'static lifetime is not leaked because its rewritten
459            //   to the self-lifetime in `pages()`.
460            let xref_reference: &'static XRef = unsafe { std::mem::transmute(xref.deref()) };
461
462            let ctx = ReaderContext::new(xref_reference, false);
463            let pages = xref_reference
464                .get_with(xref.trailer_data().pages_ref, &ctx)
465                .and_then(|p| Pages::new(&p, &ctx, xref_reference))
466                .or_else(|| Pages::new_brute_force(&ctx, xref_reference))?;
467
468            Some(Self { pages, _xref: xref })
469        }
470
471        pub(crate) fn get(&self) -> &Pages<'_> {
472            &self.pages
473        }
474    }
475}