hayro_syntax/document/
page.rs

1//! Reading the pages of a PDF document.
2
3use crate::content::{TypedIter, UntypedIter};
4use crate::object::array::Array;
5use crate::object::dict::Dict;
6use crate::object::dict::keys::*;
7use crate::object::name::Name;
8use crate::object::rect::Rect;
9use crate::object::r#ref::{MaybeRef, ObjRef};
10use crate::object::stream::Stream;
11use crate::object::{Object, ObjectLike};
12use crate::xref::XRef;
13use log::warn;
14use std::cell::OnceCell;
15
/// A structure holding the pages of a PDF document.
///
/// The pages are stored in the order in which they are encountered during a
/// depth-first traversal of the document's page tree.
pub struct Pages<'a> {
    /// The pages of the document.
    pub pages: Vec<Page<'a>>,
}
21
22/// Attributes that can be inherited.
23#[derive(Debug, Clone)]
24struct PagesContext {
25    media_box: Option<Rect>,
26    crop_box: Option<Rect>,
27    rotate: Option<u32>,
28}
29
30impl PagesContext {
31    fn new() -> Self {
32        Self {
33            media_box: None,
34            crop_box: None,
35            rotate: None,
36        }
37    }
38}
39
40impl<'a> Pages<'a> {
41    /// Create a new `Pages` object.
42    pub(crate) fn new(pages_dict: Dict<'a>, xref: &'a XRef) -> Option<Pages<'a>> {
43        let mut pages = vec![];
44        let ctx = PagesContext::new();
45        resolve_pages(
46            pages_dict,
47            &mut pages,
48            ctx,
49            Resources::new(Dict::empty(), None, xref),
50        )?;
51
52        Some(Self { pages })
53    }
54
55    /// The number of available pages.
56    pub fn len(&self) -> usize {
57        self.pages.len()
58    }
59}
60
61fn resolve_pages<'a>(
62    pages_dict: Dict<'a>,
63    entries: &mut Vec<Page<'a>>,
64    mut ctx: PagesContext,
65    resources: Resources<'a>,
66) -> Option<()> {
67    if let Some(media_box) = pages_dict.get::<Rect>(MEDIA_BOX) {
68        ctx.media_box = Some(media_box);
69    }
70
71    if let Some(crop_box) = pages_dict.get::<Rect>(CROP_BOX) {
72        ctx.crop_box = Some(crop_box);
73    }
74
75    if let Some(rotate) = pages_dict.get::<u32>(ROTATE) {
76        ctx.rotate = Some(rotate);
77    }
78
79    let resources = Resources::from_parent(
80        pages_dict.get::<Dict>(RESOURCES).unwrap_or_default(),
81        resources.clone(),
82    );
83
84    let kids = pages_dict.get::<Array<'a>>(KIDS)?;
85
86    for dict in kids.iter::<Dict>() {
87        match dict.get::<Name>(TYPE)? {
88            PAGES => resolve_pages(dict, entries, ctx.clone(), resources.clone())?,
89            PAGE => entries.push(Page::new(dict, &ctx, resources.clone())),
90            _ => return None,
91        }
92    }
93
94    Some(())
95}
96
/// The rotation of the page.
///
/// `Page::new` derives this from the page's (possibly inherited) `ROTATE`
/// value taken modulo 360; values that are not a multiple of 90 are mapped to
/// [`Rotation::None`].
#[derive(Debug, Copy, Clone)]
pub enum Rotation {
    /// No rotation.
    None,
    /// A rotation of 90 degrees.
    Horizontal,
    /// A rotation of 180 degrees.
    Flipped,
    /// A rotation of 270 degrees.
    FlippedHorizontal,
}
109
/// A PDF page.
pub struct Page<'a> {
    /// The dictionary of the page itself.
    inner: Dict<'a>,
    /// The media box, resolved when the page is created.
    media_box: kurbo::Rect,
    /// The crop box, resolved when the page is created.
    crop_box: kurbo::Rect,
    /// The rotation, resolved when the page is created.
    rotation: Rotation,
    /// The bytes of the page's content stream(s), computed on first access.
    /// Holds `None` if the contents could not be read.
    page_streams: OnceCell<Option<Vec<u8>>>,
    /// The resources of the page, chained to the resources of its ancestors.
    resources: Resources<'a>,
    /// The xref table, taken from the resources the page was created with.
    xref: &'a XRef,
}
120
121impl<'a> Page<'a> {
122    fn new(dict: Dict<'a>, ctx: &PagesContext, resources: Resources<'a>) -> Page<'a> {
123        let media_box = dict
124            .get::<Rect>(MEDIA_BOX)
125            .or_else(|| ctx.media_box)
126            .unwrap_or(A4);
127
128        let crop_box = dict
129            .get::<Rect>(CROP_BOX)
130            .or_else(|| ctx.crop_box)
131            .unwrap_or(media_box);
132
133        let rotation = match dict.get::<u32>(ROTATE).or_else(|| ctx.rotate).unwrap_or(0) % 360 {
134            0 => Rotation::None,
135            90 => Rotation::Horizontal,
136            180 => Rotation::Flipped,
137            270 => Rotation::FlippedHorizontal,
138            _ => Rotation::None,
139        };
140
141        let xref = resources.xref;
142        let resources =
143            Resources::from_parent(dict.get::<Dict>(RESOURCES).unwrap_or_default(), resources);
144
145        Self {
146            inner: dict,
147            media_box,
148            crop_box,
149            rotation,
150            page_streams: OnceCell::new(),
151            resources,
152            xref,
153        }
154    }
155
156    fn operations_impl(&self) -> Option<UntypedIter> {
157        let convert_single = |s: Stream| {
158            let data = s.decoded()?;
159            Some(data.to_vec())
160        };
161
162        let stream = self
163            .page_streams
164            .get_or_init(|| {
165                if let Some(stream) = self.inner.get::<Stream>(CONTENTS) {
166                    convert_single(stream)
167                } else if let Some(array) = self.inner.get::<Array>(CONTENTS) {
168                    let streams = array.iter::<Stream>().flat_map(convert_single);
169
170                    let mut collected = vec![];
171
172                    for stream in streams {
173                        collected.extend(stream);
174                        // Streams must have at least one whitespace in-between.
175                        collected.push(b' ')
176                    }
177
178                    Some(collected)
179                } else {
180                    warn!("contents entry of page was neither stream nor array of streams");
181
182                    return None;
183                }
184            })
185            .as_ref()?;
186
187        let iter = UntypedIter::new(&stream);
188
189        Some(iter)
190    }
191
192    /// Get the resources of the page.
193    pub fn resources(&self) -> &Resources<'a> {
194        &self.resources
195    }
196
197    /// Get the media box of the page.
198    pub fn media_box(&self) -> Rect {
199        self.media_box
200    }
201
202    /// Get the rotation of the page.
203    pub fn rotation(&self) -> Rotation {
204        self.rotation
205    }
206
207    /// Get the crop box of the page.
208    pub fn crop_box(&self) -> Rect {
209        self.crop_box
210    }
211
212    /// Get the operations of the content stream of the page.
213    pub fn operations(&self) -> UntypedIter {
214        self.operations_impl().unwrap_or(UntypedIter::empty())
215    }
216
217    // TODO: Remove?
218    /// Get the xref table (of the document the page belongs to).
219    pub fn xref(&self) -> &'a XRef {
220        self.xref
221    }
222
223    /// Return an iterator over the operators in the page's content stream.
224    pub fn typed_operations(&self) -> TypedIter {
225        TypedIter::new(self.operations().into_iter())
226    }
227}
228
/// A structure keeping track of the resources of a page.
///
/// Resources form a chain: when a name cannot be resolved in one link, the
/// lookup continues in its `parent`.
#[derive(Clone, Debug)]
pub struct Resources<'a> {
    /// The resources to fall back to, if any.
    parent: Option<Box<Resources<'a>>>,
    /// The xref table used to resolve indirect references.
    xref: &'a XRef,
    /// The dictionary read from the `EXT_G_STATE` entry.
    ext_g_states: Dict<'a>,
    /// The dictionary read from the `FONT` entry.
    fonts: Dict<'a>,
    /// The dictionary read from the `COLORSPACE` entry.
    color_spaces: Dict<'a>,
    /// The dictionary read from the `XOBJECT` entry.
    x_objects: Dict<'a>,
    /// The dictionary read from the `PATTERN` entry.
    patterns: Dict<'a>,
    /// The dictionary read from the `SHADING` entry.
    shadings: Dict<'a>,
}
241
242impl<'a> Resources<'a> {
243    /// Create a new `Resources` object from a dictionary with a parent.
244    pub fn from_parent(resources: Dict<'a>, parent: Resources<'a>) -> Resources<'a> {
245        let xref = parent.xref;
246
247        Self::new(resources, Some(parent), xref)
248    }
249
250    /// Create a new `Resources` object.
251    pub fn new(
252        resources: Dict<'a>,
253        parent: Option<Resources<'a>>,
254        xref: &'a XRef,
255    ) -> Resources<'a> {
256        let ext_g_states = resources.get::<Dict>(EXT_G_STATE).unwrap_or_default();
257        let fonts = resources.get::<Dict>(FONT).unwrap_or_default();
258        let color_spaces = resources.get::<Dict>(COLORSPACE).unwrap_or_default();
259        let x_objects = resources.get::<Dict>(XOBJECT).unwrap_or_default();
260        let patterns = resources.get::<Dict>(PATTERN).unwrap_or_default();
261        let shadings = resources.get::<Dict>(SHADING).unwrap_or_default();
262
263        let parent = parent.map(|r| Box::new(r));
264
265        Self {
266            parent,
267            ext_g_states,
268            fonts,
269            color_spaces,
270            x_objects,
271            patterns,
272            shadings,
273            xref,
274        }
275    }
276
277    /// Resolve an object reference to an object.
278    #[allow(private_bounds)]
279    pub fn resolve_ref<T: ObjectLike<'a>>(&self, ref_: ObjRef) -> Option<T> {
280        self.xref.get(ref_.into())
281    }
282
283    fn get_resource<T: ObjectLike<'a>, U>(
284        &self,
285        name: &Name,
286        dict: &Dict<'a>,
287        mut cache: impl FnMut(ObjRef) -> Option<U>,
288        mut resolve: impl FnMut(T) -> Option<U>,
289    ) -> Option<U> {
290        // TODO: Cache non-ref resources as well
291
292        match dict.get_raw::<T>(name)? {
293            MaybeRef::Ref(ref_) => {
294                cache(ref_).or_else(|| self.xref.get::<T>(ref_.into()).and_then(|t| resolve(t)))
295            }
296            MaybeRef::NotRef(i) => resolve(i),
297        }
298    }
299
300    // TODO: Refactor caching mechanism
301
302    /// Get an external graphics state by name.
303    pub fn get_ext_g_state<U>(
304        &self,
305        name: &Name,
306        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
307        mut resolve: Box<dyn FnMut(Dict<'a>) -> Option<U> + '_>,
308    ) -> Option<U> {
309        self.get_resource::<Dict, U>(name, &self.ext_g_states, &mut cache, &mut resolve)
310            .or_else(|| {
311                self.parent
312                    .as_ref()
313                    .and_then(|p| p.get_ext_g_state::<U>(name, cache, resolve))
314            })
315    }
316
317    /// Get a color space by name.
318    pub fn get_color_space<U>(
319        &self,
320        name: &Name,
321        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
322        mut resolve: Box<dyn FnMut(Object<'a>) -> Option<U> + '_>,
323    ) -> Option<U> {
324        self.get_resource::<Object, U>(name, &self.color_spaces, &mut cache, &mut resolve)
325            .or_else(|| {
326                self.parent
327                    .as_ref()
328                    .and_then(|p| p.get_color_space::<U>(name, cache, resolve))
329            })
330    }
331
332    /// Get a font by name.
333    pub fn get_font<U>(
334        &self,
335        name: &Name,
336        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
337        mut resolve: Box<dyn FnMut(Dict<'a>) -> Option<U> + '_>,
338    ) -> Option<U> {
339        self.get_resource::<Dict, U>(name, &self.fonts, &mut cache, &mut resolve)
340            .or_else(|| {
341                self.parent
342                    .as_ref()
343                    .and_then(|p| p.get_font::<U>(name, cache, resolve))
344            })
345    }
346
347    /// Get a pattern by name.
348    pub fn get_pattern<U>(
349        &self,
350        name: &Name,
351        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
352        mut resolve: Box<dyn FnMut(Object<'a>) -> Option<U> + '_>,
353    ) -> Option<U> {
354        self.get_resource::<Object, U>(name, &self.patterns, &mut cache, &mut resolve)
355            .or_else(|| {
356                self.parent
357                    .as_ref()
358                    .and_then(|p| p.get_pattern::<U>(name, cache, resolve))
359            })
360    }
361
362    /// Get an x object by name.
363    pub fn get_x_object<U>(
364        &self,
365        name: &Name,
366        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
367        mut resolve: Box<dyn FnMut(Stream<'a>) -> Option<U> + '_>,
368    ) -> Option<U> {
369        self.get_resource::<Stream, U>(name, &self.x_objects, &mut cache, &mut resolve)
370            .or_else(|| {
371                self.parent
372                    .as_ref()
373                    .and_then(|p| p.get_x_object::<U>(name, cache, resolve))
374            })
375    }
376
377    /// Get a shading by name.
378    pub fn get_shading<U>(
379        &self,
380        name: &Name,
381        mut cache: Box<dyn FnMut(ObjRef) -> Option<U> + '_>,
382        mut resolve: Box<dyn FnMut(Object<'a>) -> Option<U> + '_>,
383    ) -> Option<U> {
384        self.get_resource::<Object, U>(name, &self.shadings, &mut cache, &mut resolve)
385            .or_else(|| {
386                self.parent
387                    .as_ref()
388                    .and_then(|p| p.get_shading::<U>(name, cache, resolve))
389            })
390    }
391}
392
// <https://github.com/apache/pdfbox/blob/a53a70db16ea3133994120bcf1e216b9e760c05b/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/common/PDRectangle.java#L38>
/// The number of points in one inch.
const POINTS_PER_INCH: f64 = 72.0;
/// The number of points in one millimetre (one inch being `10.0 * 2.54` mm).
const POINTS_PER_MM: f64 = 1.0 / (10.0 * 2.54) * POINTS_PER_INCH;
396
/// The dimension of an A4 page.
///
/// A rectangle of 210 mm by 297 mm, converted to points and anchored at the
/// origin. `Page::new` uses it as the media box when neither the page nor the
/// inherited attributes provide one.
pub const A4: Rect = Rect {
    x0: 0.0,
    y0: 0.0,
    x1: 210.0 * POINTS_PER_MM,
    y1: 297.0 * POINTS_PER_MM,
};