hayro_write/
lib.rs

/*!
A crate for converting PDF pages into either `XObjects` or a new page via [`pdf-writer`](https://docs.rs/pdf-writer/).

This is an internal crate and not meant for external use. Therefore, it's not very
well-documented.
*/
7
8#![forbid(unsafe_code)]
9#![deny(missing_docs)]
10
11mod primitive;
12
13use crate::primitive::{WriteDirect, WriteIndirect};
14use flate2::Compression;
15use flate2::write::ZlibEncoder;
16use hayro_syntax::Pdf;
17use hayro_syntax::object::Dict;
18use hayro_syntax::object::Object;
19use hayro_syntax::object::dict::keys::{
20    COLORSPACE, EXT_G_STATE, FONT, GROUP, PATTERN, PROPERTIES, SHADING, XOBJECT,
21};
22use hayro_syntax::object::{MaybeRef, ObjRef};
23use hayro_syntax::page::{Page, Resources, Rotation};
24use kurbo::Affine;
25use log::warn;
26use pdf_writer::{Chunk, Content, Filter, Finish, Name, Rect, Ref};
27use std::collections::{BTreeMap, HashMap, HashSet};
28use std::ops::Deref;
29use std::ops::DerefMut;
30
31pub use hayro_syntax;
32
33/// Apply the extraction queries to the given PDF and return the results.
34pub fn extract<'a>(
35    pdf: &Pdf,
36    new_ref: Box<dyn FnMut() -> Ref + 'a>,
37    queries: &[ExtractionQuery],
38) -> Result<ExtractionResult, ExtractionError> {
39    let pages = pdf.pages();
40    let mut ctx = ExtractionContext::new(new_ref, pdf);
41
42    for query in queries {
43        let page = pages
44            .get(query.page_index)
45            .ok_or(ExtractionError::InvalidPageIndex(query.page_index))?;
46
47        let root_ref = ctx.new_ref();
48
49        let res = match query.query_type {
50            ExtractionQueryType::XObject => write_xobject(page, root_ref, &mut ctx),
51            ExtractionQueryType::Page => write_page(page, root_ref, query.page_index, &mut ctx),
52        };
53
54        ctx.root_refs.push(res.map(|_| root_ref));
55    }
56
57    // Now we have shallowly extracted all pages, now go through all dependencies until there aren't
58    // any anymore.
59    write_dependencies(pdf, &mut ctx);
60
61    let mut global_chunk = Chunk::new();
62
63    for chunk in &ctx.chunks {
64        global_chunk.extend(chunk);
65    }
66
67    Ok(ExtractionResult {
68        chunk: global_chunk,
69        root_refs: ctx.root_refs,
70        page_tree_parent_ref: ctx.page_tree_parent_ref,
71    })
72}
73
/// How a queried page should be materialized in the output.
#[derive(Copy, Clone, Debug)]
pub enum ExtractionQueryType {
    /// Serialize the page's content as a form `XObject`.
    XObject,
    /// Serialize the page as a proper page object.
    Page,
}
83
/// An extraction query.
#[derive(Copy, Clone, Debug)]
pub struct ExtractionQuery {
    // How the page should be materialized (full page vs. form XObject).
    query_type: ExtractionQueryType,
    // Zero-based index of the page in the source document.
    page_index: usize,
}
90
91impl ExtractionQuery {
92    /// Create a new page extraction query with the given page index.
93    pub fn new_page(page_index: usize) -> Self {
94        Self {
95            query_type: ExtractionQueryType::Page,
96            page_index,
97        }
98    }
99
100    /// Create a new `XObject` extraction query with the given page index.
101    pub fn new_xobject(page_index: usize) -> Self {
102        Self {
103            query_type: ExtractionQueryType::XObject,
104            page_index,
105        }
106    }
107}
108
/// An error that occurred during page extraction.
#[derive(Debug, Copy, Clone)]
pub enum ExtractionError {
    /// The requested page index does not exist in the source document.
    InvalidPageIndex(usize),
}
115
/// The result of an extraction.
pub struct ExtractionResult {
    /// The chunk containing all objects as well as their dependencies.
    pub chunk: Chunk,
    /// The root references of the pages/XObjects, one per extraction query,
    /// in query order.
    pub root_refs: Vec<Result<Ref, ExtractionError>>,
    /// The reference to the page tree parent that was generated. The caller is
    /// responsible for actually writing the corresponding `/Pages` object.
    pub page_tree_parent_ref: Ref,
}
125
// Shared mutable state threaded through a single `extract` run.
struct ExtractionContext<'a> {
    // One chunk per serialized object; merged into a single chunk at the end.
    chunks: Vec<Chunk>,
    // Objects already serialized, so cycles/duplicates are written only once.
    visited_objects: HashSet<ObjRef>,
    // Work queue of references discovered while serializing objects.
    to_visit_refs: Vec<ObjRef>,
    // NOTE(review): not read or written anywhere in this file — presumably
    // used by the `primitive` module to memoize reference validity; confirm.
    valid_ref_cache: HashMap<ObjRef, bool>,
    // Per-query results, in query order.
    root_refs: Vec<Result<Ref, ExtractionError>>,
    pdf: &'a Pdf,
    // Caller-supplied generator for fresh output references.
    new_ref: Box<dyn FnMut() -> Ref + 'a>,
    // Maps source-document references to output references.
    ref_map: HashMap<ObjRef, Ref>,
    // Content streams may be shared between duplicate page queries, keyed by
    // page index.
    cached_content_streams: HashMap<usize, Ref>,
    // Parent reference for all extracted page objects.
    page_tree_parent_ref: Ref,
}
138
139impl<'a> ExtractionContext<'a> {
140    fn new(mut new_ref: Box<dyn FnMut() -> Ref + 'a>, pdf: &'a Pdf) -> Self {
141        let page_tree_parent_ref = new_ref();
142        Self {
143            chunks: vec![],
144            visited_objects: HashSet::new(),
145            to_visit_refs: Vec::new(),
146            valid_ref_cache: HashMap::new(),
147            pdf,
148            new_ref,
149            ref_map: HashMap::new(),
150            cached_content_streams: HashMap::new(),
151            root_refs: Vec::new(),
152            page_tree_parent_ref,
153        }
154    }
155
156    pub(crate) fn map_ref(&mut self, ref_: ObjRef) -> Ref {
157        if let Some(ref_) = self.ref_map.get(&ref_) {
158            *ref_
159        } else {
160            let new_ref = self.new_ref();
161            self.ref_map.insert(ref_, new_ref);
162
163            new_ref
164        }
165    }
166
167    pub(crate) fn new_ref(&mut self) -> Ref {
168        (self.new_ref)()
169    }
170}
171
172fn write_dependencies(pdf: &Pdf, ctx: &mut ExtractionContext<'_>) {
173    while let Some(ref_) = ctx.to_visit_refs.pop() {
174        // Don't visit objects twice!
175        if ctx.visited_objects.contains(&ref_) {
176            continue;
177        }
178
179        let mut chunk = Chunk::new();
180        if let Some(object) = pdf.xref().get::<Object<'_>>(ref_.into()) {
181            let new_ref = ctx.map_ref(ref_);
182            object.write_indirect(&mut chunk, new_ref, ctx);
183            ctx.chunks.push(chunk);
184
185            ctx.visited_objects.insert(ref_);
186        } else {
187            warn!("failed to extract object with ref: {ref_:?}");
188        }
189    }
190}
191
192/// Extract the given pages from the PDF and resave them as a new PDF. This function shouldn't be
193/// used directly and only exists for test purposes.
194#[doc(hidden)]
195pub fn extract_pages_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
196    let mut pdf = pdf_writer::Pdf::new();
197    let mut next_ref = Ref::new(1);
198    let requests = page_indices
199        .iter()
200        .map(|i| ExtractionQuery {
201            query_type: ExtractionQueryType::Page,
202            page_index: *i,
203        })
204        .collect::<Vec<_>>();
205
206    let catalog_id = next_ref.bump();
207
208    let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
209    pdf.catalog(catalog_id)
210        .pages(extracted.page_tree_parent_ref);
211    let count = extracted.root_refs.len();
212    pdf.pages(extracted.page_tree_parent_ref)
213        .kids(extracted.root_refs.iter().map(|r| r.unwrap()))
214        .count(count as i32);
215    pdf.extend(&extracted.chunk);
216
217    pdf.finish()
218}
219
220/// Extract the given pages as XObjects from the PDF and resave them as a new PDF.
221/// This function shouldn't be used directly and only exists for test purposes.
222#[doc(hidden)]
223pub fn extract_pages_as_xobject_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
224    let hayro_pages = hayro_pdf.pages();
225    let page_list = hayro_pages.as_ref();
226
227    let mut pdf = pdf_writer::Pdf::new();
228    let mut next_ref = Ref::new(1);
229
230    let catalog_id = next_ref.bump();
231    let requests = page_indices
232        .iter()
233        .map(|i| ExtractionQuery {
234            query_type: ExtractionQueryType::XObject,
235            page_index: *i,
236        })
237        .collect::<Vec<_>>();
238
239    let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
240
241    pdf.catalog(catalog_id)
242        .pages(extracted.page_tree_parent_ref);
243    let mut page_refs = vec![];
244
245    for (x_object_ref, page_idx) in extracted.root_refs.iter().zip(page_indices) {
246        let page = &page_list[*page_idx];
247        let render_dimensions = page.render_dimensions();
248
249        let mut content = Content::new();
250        content.x_object(Name(b"O1"));
251
252        let finished = content.finish();
253
254        let page_id = next_ref.bump();
255        let stream_id = next_ref.bump();
256        page_refs.push(page_id);
257
258        let mut page = pdf.page(page_id);
259        page.resources()
260            .x_objects()
261            .pair(Name(b"O1"), x_object_ref.unwrap());
262        page.media_box(Rect::new(
263            0.0,
264            0.0,
265            render_dimensions.0,
266            render_dimensions.1,
267        ));
268        page.parent(extracted.page_tree_parent_ref);
269        page.contents(stream_id);
270        page.finish();
271
272        pdf.stream(stream_id, finished.as_slice());
273    }
274
275    let count = extracted.root_refs.len();
276    pdf.pages(extracted.page_tree_parent_ref)
277        .kids(page_refs)
278        .count(count as i32);
279    pdf.extend(&extracted.chunk);
280
281    pdf.finish()
282}
283
// Serialize `page` as a proper page object under `page_ref`, deflating its
// content stream and shallowly copying its resources. Discovered references
// are queued on `ctx` for `write_dependencies` to resolve later.
fn write_page(
    page: &Page<'_>,
    page_ref: Ref,
    page_idx: usize,
    ctx: &mut ExtractionContext<'_>,
) -> Result<(), ExtractionError> {
    let mut chunk = Chunk::new();
    // Note: We can cache content stream references, but _not_ the page references themselves.
    // Acrobat for some reason doesn't like duplicate page references in the page tree.
    let stream_ref = if let Some(cached) = ctx.cached_content_streams.get(&page_idx) {
        *cached
    } else {
        let stream_ref = ctx.new_ref();

        // A page without a content stream degrades to an empty stream.
        chunk
            .stream(
                stream_ref,
                &deflate_encode(page.page_stream().unwrap_or(b"")),
            )
            .filter(Filter::FlateDecode);
        ctx.cached_content_streams.insert(page_idx, stream_ref);

        stream_ref
    };

    let mut pdf_page = chunk.page(page_ref);

    pdf_page
        .media_box(convert_rect(&page.media_box()))
        .crop_box(convert_rect(&page.crop_box()))
        // Map the source page's rotation enum back to the /Rotate degrees.
        .rotate(match page.rotation() {
            Rotation::None => 0,
            Rotation::Horizontal => 90,
            Rotation::Flipped => 180,
            Rotation::FlippedHorizontal => 270,
        })
        .parent(ctx.page_tree_parent_ref)
        .contents(stream_ref);

    let raw_dict = page.raw();

    // Preserve the transparency group (e.g. blend color space) if present.
    if let Some(group) = raw_dict.get_raw::<Object<'_>>(GROUP) {
        group.write_direct(pdf_page.insert(Name(GROUP)), ctx);
    }

    serialize_resources(page.resources(), ctx, &mut pdf_page);

    pdf_page.finish();

    ctx.chunks.push(chunk);

    Ok(())
}
337
338fn write_xobject(
339    page: &Page<'_>,
340    xobj_ref: Ref,
341    ctx: &mut ExtractionContext<'_>,
342) -> Result<(), ExtractionError> {
343    let mut chunk = Chunk::new();
344    let encoded_stream = deflate_encode(page.page_stream().unwrap_or(b""));
345    let mut x_object = chunk.form_xobject(xobj_ref, &encoded_stream);
346    x_object.deref_mut().filter(Filter::FlateDecode);
347
348    let bbox = page.crop_box();
349    let initial_transform = page.initial_transform(false);
350
351    x_object.bbox(Rect::new(
352        bbox.x0 as f32,
353        bbox.y0 as f32,
354        bbox.x1 as f32,
355        bbox.y1 as f32,
356    ));
357
358    let i = initial_transform.as_coeffs();
359    x_object.matrix([
360        i[0] as f32,
361        i[1] as f32,
362        i[2] as f32,
363        i[3] as f32,
364        i[4] as f32,
365        i[5] as f32,
366    ]);
367
368    serialize_resources(page.resources(), ctx, &mut x_object);
369
370    x_object.finish();
371    ctx.chunks.push(chunk);
372
373    Ok(())
374}
375
// Flatten the (possibly inherited) resource dictionaries of `resources` and
// write them as a `/Resources` dictionary on `writer`. Entries are written
// directly; indirect references inside them are queued on `ctx`.
fn serialize_resources(
    resources: &Resources<'_>,
    ctx: &mut ExtractionContext<'_>,
    writer: &mut impl ResourcesExt,
) {
    // Each call merges the parent chain for one resource category.
    let ext_g_states = collect_resources(resources, |r| r.ext_g_states.clone());
    let shadings = collect_resources(resources, |r| r.shadings.clone());
    let patterns = collect_resources(resources, |r| r.patterns.clone());
    let x_objects = collect_resources(resources, |r| r.x_objects.clone());
    let color_spaces = collect_resources(resources, |r| r.color_spaces.clone());
    let fonts = collect_resources(resources, |r| r.fonts.clone());
    let properties = collect_resources(resources, |r| r.properties.clone());

    // Only emit a /Resources dictionary if at least one category is non-empty.
    if !(ext_g_states.is_empty()
        && shadings.is_empty()
        && patterns.is_empty()
        && x_objects.is_empty()
        && color_spaces.is_empty()
        && properties.is_empty()
        && fonts.is_empty())
    {
        let mut resources = writer.resources();

        // Writes one sub-dictionary per non-empty category. Note that this
        // macro shadows std's `write!` within this scope.
        macro_rules! write {
            ($name:ident, $key:expr) => {
                if !$name.is_empty() {
                    let mut dict = resources.insert(Name($key)).dict();

                    for (name, obj) in $name {
                        obj.write_direct(dict.insert(Name(name.deref())), ctx);
                    }
                }
            };
        }

        write!(ext_g_states, EXT_G_STATE);
        write!(shadings, SHADING);
        write!(patterns, PATTERN);
        write!(x_objects, XOBJECT);
        write!(color_spaces, COLORSPACE);
        write!(fonts, FONT);
        write!(properties, PROPERTIES);
    }
}
420
421fn collect_resources<'a>(
422    resources: &Resources<'a>,
423    get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
424) -> BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>> {
425    let mut map = BTreeMap::new();
426    collect_resources_inner(resources, get_dict, &mut map);
427    map
428}
429
430fn collect_resources_inner<'a>(
431    resources: &Resources<'a>,
432    mut get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
433    map: &mut BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>>,
434) {
435    // Process parents first, so that duplicates get overridden by the current dictionary.
436    // Since for inheritance, the current dictionary always has priority over entries in the
437    // parent dictionary.
438    if let Some(parent) = resources.parent() {
439        collect_resources_inner(parent, get_dict.clone(), map);
440    }
441
442    let dict = get_dict(resources);
443
444    for (name, object) in dict.entries() {
445        map.insert(name, object);
446    }
447}
448
449pub(crate) fn deflate_encode(data: &[u8]) -> Vec<u8> {
450    use std::io::Write;
451
452    const COMPRESSION_LEVEL: u8 = 6;
453    let mut e = ZlibEncoder::new(Vec::new(), Compression::new(COMPRESSION_LEVEL as u32));
454    e.write_all(data).unwrap();
455    e.finish().unwrap()
456}
457
458fn convert_rect(hy_rect: &hayro_syntax::object::Rect) -> Rect {
459    Rect::new(
460        hy_rect.x0 as f32,
461        hy_rect.y0 as f32,
462        hy_rect.x1 as f32,
463        hy_rect.y1 as f32,
464    )
465}
466
// Abstraction over writers that expose a `/Resources` dictionary, so pages
// and form XObjects can share `serialize_resources`.
trait ResourcesExt {
    fn resources(&mut self) -> pdf_writer::writers::Resources<'_>;
}
470
// Pages already have an inherent `resources` writer; forward to it.
impl ResourcesExt for pdf_writer::writers::Page<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources<'_> {
        Self::resources(self)
    }
}
476
// Form XObjects already have an inherent `resources` writer; forward to it.
impl ResourcesExt for pdf_writer::writers::FormXObject<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources<'_> {
        Self::resources(self)
    }
}
482
// Note: Keep in sync with `hayro-interpret`.
trait PageExt {
    /// Return the initial transform that should be applied when rendering. This accounts for a
    /// number of factors, such as the mismatch between PDF's y-up and most renderers' y-down
    /// coordinate system, the rotation of the page and the offset of the crop box.
    fn initial_transform(&self, invert_y: bool) -> Affine;
}
490
impl PageExt for Page<'_> {
    fn initial_transform(&self, invert_y: bool) -> Affine {
        let crop_box = self.intersected_crop_box();
        let (_, base_height) = self.base_dimensions();
        // Render dimensions already account for rotation (width/height swap
        // for 90°/270°) — assumed from the transforms below; TODO confirm.
        let (width, height) = self.render_dimensions();

        // Two candidate 90° rotations; which one applies depends on whether
        // the y-axis is flipped (see `rotation_transform` below).
        let horizontal_t =
            Affine::rotate(90.0_f64.to_radians()) * Affine::translate((0.0, -width as f64));
        let flipped_horizontal_t =
            Affine::translate((0.0, height as f64)) * Affine::rotate(-90.0_f64.to_radians());

        let rotation_transform = match self.rotation() {
            Rotation::None => Affine::IDENTITY,
            Rotation::Horizontal => {
                // Inverting y mirrors the rotation direction, so the 90°
                // and 270° cases swap their transforms.
                if invert_y {
                    horizontal_t
                } else {
                    flipped_horizontal_t
                }
            }
            Rotation::Flipped => {
                // 180°: point-reflect, then shift back into the page area.
                Affine::scale(-1.0) * Affine::translate((-width as f64, -height as f64))
            }
            Rotation::FlippedHorizontal => {
                if invert_y {
                    flipped_horizontal_t
                } else {
                    horizontal_t
                }
            }
        };

        // Flip PDF's y-up coordinates into a y-down system when requested.
        let inversion_transform = if invert_y {
            Affine::new([1.0, 0.0, 0.0, -1.0, 0.0, base_height as f64])
        } else {
            Affine::IDENTITY
        };

        // Applied right-to-left: shift the crop-box origin to (0, 0) first,
        // then (optionally) invert y, then rotate.
        rotation_transform * inversion_transform * Affine::translate((-crop_box.x0, -crop_box.y0))
    }
}
531}