hayro_write/
lib.rs

1/*!
2A crate for converting PDF pages into either XObjects or a new page via [`pdf-writer`](https://docs.rs/pdf-writer/).
3
4This is an internal crate and not meant for external use. Therefore, it's not very
5well-documented.
6*/
7
8#![forbid(unsafe_code)]
9#![deny(missing_docs)]
10
11mod primitive;
12
13use crate::primitive::{WriteDirect, WriteIndirect};
14use flate2::Compression;
15use flate2::write::ZlibEncoder;
16use hayro_syntax::object::Dict;
17use hayro_syntax::object::Object;
18use hayro_syntax::object::dict::keys::{
19    COLORSPACE, EXT_G_STATE, FONT, GROUP, PATTERN, PROPERTIES, SHADING, XOBJECT,
20};
21use hayro_syntax::object::{MaybeRef, ObjRef};
22use hayro_syntax::page::{Resources, Rotation};
23use log::warn;
24use pdf_writer::{Chunk, Content, Filter, Finish, Name, Rect, Ref};
25use std::collections::{BTreeMap, HashMap, HashSet};
26use std::ops::Deref;
27use std::ops::DerefMut;
28
29pub use hayro_syntax::page::{Page, Pages};
30pub use hayro_syntax::{LoadPdfError, Pdf, PdfData, PdfVersion};
31
32/// Apply the extraction queries to the given PDF and return the results.
33pub fn extract<'a>(
34    pdf: &Pdf,
35    new_ref: Box<dyn FnMut() -> Ref + 'a>,
36    queries: &[ExtractionQuery],
37) -> Result<ExtractionResult, ExtractionError> {
38    let pages = pdf.pages();
39    let mut ctx = ExtractionContext::new(new_ref, pdf);
40
41    for query in queries {
42        let page = pages
43            .get(query.page_index)
44            .ok_or(ExtractionError::InvalidPageIndex(query.page_index))?;
45
46        let root_ref = ctx.new_ref();
47
48        let res = match query.query_type {
49            ExtractionQueryType::XObject => write_xobject(page, root_ref, &mut ctx),
50            ExtractionQueryType::Page => write_page(page, root_ref, query.page_index, &mut ctx),
51        };
52
53        ctx.root_refs.push(res.map(|_| root_ref));
54    }
55
56    // Now we have shallowly extracted all pages, now go through all dependencies until there aren't
57    // any anymore.
58    write_dependencies(pdf, &mut ctx);
59
60    let mut global_chunk = Chunk::new();
61
62    for chunk in &ctx.chunks {
63        global_chunk.extend(chunk)
64    }
65
66    Ok(ExtractionResult {
67        chunk: global_chunk,
68        root_refs: ctx.root_refs,
69        page_tree_parent_ref: ctx.page_tree_parent_ref,
70    })
71}
72
/// A type of extraction query, indicating as what kind of
/// object you want to extract the page.
#[derive(Copy, Clone, Debug)]
pub enum ExtractionQueryType {
    /// Extract the page as a form XObject, so it can be embedded into the
    /// content of another page.
    XObject,
    /// Extract the page as a new, standalone page.
    Page,
}
82
/// An extraction query.
///
/// Construct one via [`ExtractionQuery::new_page`] or
/// [`ExtractionQuery::new_xobject`].
#[derive(Copy, Clone, Debug)]
pub struct ExtractionQuery {
    // How the page should be extracted (as a page or as an XObject).
    query_type: ExtractionQueryType,
    // Zero-based index of the page in the source PDF.
    page_index: usize,
}
89
90impl ExtractionQuery {
91    /// Create a new page extraction query with the given page index.
92    pub fn new_page(page_index: usize) -> Self {
93        Self {
94            query_type: ExtractionQueryType::Page,
95            page_index,
96        }
97    }
98
99    /// Create a new XObject extraction query with the given page index.
100    pub fn new_xobject(page_index: usize) -> Self {
101        Self {
102            query_type: ExtractionQueryType::XObject,
103            page_index,
104        }
105    }
106}
107
/// An error that occurred during page extraction.
#[derive(Debug, Copy, Clone)]
pub enum ExtractionError {
    /// An invalid page index was given.
    InvalidPageIndex(usize),
}

impl std::fmt::Display for ExtractionError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidPageIndex(idx) => write!(f, "invalid page index: {idx}"),
        }
    }
}

// Public error types of a library should implement the standard `Error`
// trait so callers can box them or attach context.
impl std::error::Error for ExtractionError {}
114
115/// The result of an extraction.
116pub struct ExtractionResult {
117    /// The chunk containing all objects as well as their dependencies.
118    pub chunk: Chunk,
119    /// The root references of the pages/XObject, one for each extraction query.
120    pub root_refs: Vec<Result<Ref, ExtractionError>>,
121    /// The reference to the page tree parent that was generated.
122    pub page_tree_parent_ref: Ref,
123}
124
/// Shared mutable state threaded through a whole extraction run.
struct ExtractionContext<'a> {
    // One chunk per serialized object/page; merged at the end of `extract`.
    chunks: Vec<Chunk>,
    // Source references that have already been serialized (prevents duplicates).
    visited_objects: HashSet<ObjRef>,
    // Work queue of referenced source objects that still need serializing.
    to_visit_refs: Vec<ObjRef>,
    // NOTE(review): not touched in this file — presumably caches whether a
    // source reference resolves to a valid object; confirm against `primitive`.
    valid_ref_cache: HashMap<ObjRef, bool>,
    // Outcome per extraction query, in query order.
    root_refs: Vec<Result<Ref, ExtractionError>>,
    pdf: &'a Pdf,
    // Allocator for fresh references in the output PDF, supplied by the caller.
    new_ref: Box<dyn FnMut() -> Ref + 'a>,
    // Mapping from source references to their output counterparts.
    ref_map: HashMap<ObjRef, Ref>,
    // Page index -> content stream reference, so shared streams are written once.
    cached_content_streams: HashMap<usize, Ref>,
    // Reference reserved for the caller-written page tree parent.
    page_tree_parent_ref: Ref,
}
137
138impl<'a> ExtractionContext<'a> {
139    fn new(mut new_ref: Box<dyn FnMut() -> Ref + 'a>, pdf: &'a Pdf) -> Self {
140        let page_tree_parent_ref = new_ref();
141        Self {
142            chunks: vec![],
143            visited_objects: HashSet::new(),
144            to_visit_refs: Vec::new(),
145            valid_ref_cache: HashMap::new(),
146            pdf,
147            new_ref,
148            ref_map: HashMap::new(),
149            cached_content_streams: HashMap::new(),
150            root_refs: Vec::new(),
151            page_tree_parent_ref,
152        }
153    }
154
155    pub(crate) fn map_ref(&mut self, ref_: ObjRef) -> Ref {
156        if let Some(ref_) = self.ref_map.get(&ref_) {
157            *ref_
158        } else {
159            let new_ref = self.new_ref();
160            self.ref_map.insert(ref_, new_ref);
161
162            new_ref
163        }
164    }
165
166    pub(crate) fn new_ref(&mut self) -> Ref {
167        (self.new_ref)()
168    }
169}
170
171fn write_dependencies(pdf: &Pdf, ctx: &mut ExtractionContext) {
172    while let Some(ref_) = ctx.to_visit_refs.pop() {
173        // Don't visit objects twice!
174        if ctx.visited_objects.contains(&ref_) {
175            continue;
176        }
177
178        let mut chunk = Chunk::new();
179        if let Some(object) = pdf.xref().get::<Object>(ref_.into()) {
180            let new_ref = ctx.map_ref(ref_);
181            object.write_indirect(&mut chunk, new_ref, ctx);
182            ctx.chunks.push(chunk);
183
184            ctx.visited_objects.insert(ref_);
185        } else {
186            warn!("failed to extract object with ref: {ref_:?}");
187        }
188    }
189}
190
191/// Extract the given pages from the PDF and resave them as a new PDF. This function shouldn't be
192/// used directly and only exists for test purposes.
193#[doc(hidden)]
194pub fn extract_pages_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
195    let mut pdf = pdf_writer::Pdf::new();
196    let mut next_ref = Ref::new(1);
197    let requests = page_indices
198        .iter()
199        .map(|i| ExtractionQuery {
200            query_type: ExtractionQueryType::Page,
201            page_index: *i,
202        })
203        .collect::<Vec<_>>();
204
205    let catalog_id = next_ref.bump();
206
207    let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
208    pdf.catalog(catalog_id)
209        .pages(extracted.page_tree_parent_ref);
210    let count = extracted.root_refs.len();
211    pdf.pages(extracted.page_tree_parent_ref)
212        .kids(extracted.root_refs.iter().map(|r| r.unwrap()))
213        .count(count as i32);
214    pdf.extend(&extracted.chunk);
215
216    pdf.finish()
217}
218
219/// Extract the given pages as XObjects from the PDF and resave them as a new PDF.
220/// This function shouldn't be used directly and only exists for test purposes.
221#[doc(hidden)]
222pub fn extract_pages_as_xobject_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
223    let hayro_pages = hayro_pdf.pages();
224    let page_list = hayro_pages.as_ref();
225
226    let mut pdf = pdf_writer::Pdf::new();
227    let mut next_ref = Ref::new(1);
228
229    let catalog_id = next_ref.bump();
230    let requests = page_indices
231        .iter()
232        .map(|i| ExtractionQuery {
233            query_type: ExtractionQueryType::XObject,
234            page_index: *i,
235        })
236        .collect::<Vec<_>>();
237
238    let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
239
240    pdf.catalog(catalog_id)
241        .pages(extracted.page_tree_parent_ref);
242    let mut page_refs = vec![];
243
244    for (x_object_ref, page_idx) in extracted.root_refs.iter().zip(page_indices) {
245        let page = &page_list[*page_idx];
246        let render_dimensions = page.render_dimensions();
247
248        let mut content = Content::new();
249        content.x_object(Name(b"O1"));
250
251        let finished = content.finish();
252
253        let page_id = next_ref.bump();
254        let stream_id = next_ref.bump();
255        page_refs.push(page_id);
256
257        let mut page = pdf.page(page_id);
258        page.resources()
259            .x_objects()
260            .pair(Name(b"O1"), x_object_ref.unwrap());
261        page.media_box(Rect::new(
262            0.0,
263            0.0,
264            render_dimensions.0,
265            render_dimensions.1,
266        ));
267        page.parent(extracted.page_tree_parent_ref);
268        page.contents(stream_id);
269        page.finish();
270
271        pdf.stream(stream_id, finished.as_slice());
272    }
273
274    let count = extracted.root_refs.len();
275    pdf.pages(extracted.page_tree_parent_ref)
276        .kids(page_refs)
277        .count(count as i32);
278    pdf.extend(&extracted.chunk);
279
280    pdf.finish()
281}
282
283fn write_page(
284    page: &hayro_syntax::page::Page,
285    page_ref: Ref,
286    page_idx: usize,
287    ctx: &mut ExtractionContext,
288) -> Result<(), ExtractionError> {
289    let mut chunk = Chunk::new();
290    // Note: We can cache content stream references, but _not_ the page references themselves.
291    // Acrobat for some reason doesn't like duplicate page references in the page tree.
292    let stream_ref = if let Some(cached) = ctx.cached_content_streams.get(&page_idx) {
293        *cached
294    } else {
295        let stream_ref = ctx.new_ref();
296
297        chunk
298            .stream(
299                stream_ref,
300                &deflate_encode(page.page_stream().unwrap_or(b"")),
301            )
302            .filter(Filter::FlateDecode);
303        ctx.cached_content_streams.insert(page_idx, stream_ref);
304
305        stream_ref
306    };
307
308    let mut pdf_page = chunk.page(page_ref);
309
310    pdf_page
311        .media_box(convert_rect(&page.media_box()))
312        .crop_box(convert_rect(&page.crop_box()))
313        .rotate(match page.rotation() {
314            Rotation::None => 0,
315            Rotation::Horizontal => 90,
316            Rotation::Flipped => 180,
317            Rotation::FlippedHorizontal => 270,
318        })
319        .parent(ctx.page_tree_parent_ref)
320        .contents(stream_ref);
321
322    let raw_dict = page.raw();
323
324    if let Some(group) = raw_dict.get_raw::<Object>(GROUP) {
325        group.write_direct(pdf_page.insert(pdf_writer::Name(GROUP)), ctx);
326    }
327
328    serialize_resources(page.resources(), ctx, &mut pdf_page);
329
330    pdf_page.finish();
331
332    ctx.chunks.push(chunk);
333
334    Ok(())
335}
336
337fn write_xobject(
338    page: &hayro_syntax::page::Page,
339    xobj_ref: Ref,
340    ctx: &mut ExtractionContext,
341) -> Result<(), ExtractionError> {
342    let mut chunk = Chunk::new();
343    let encoded_stream = deflate_encode(page.page_stream().unwrap_or(b""));
344    let mut x_object = chunk.form_xobject(xobj_ref, &encoded_stream);
345    x_object.deref_mut().filter(Filter::FlateDecode);
346
347    let bbox = page.crop_box();
348    let initial_transform = page.initial_transform(false);
349
350    x_object.bbox(pdf_writer::Rect::new(
351        bbox.x0 as f32,
352        bbox.y0 as f32,
353        bbox.x1 as f32,
354        bbox.y1 as f32,
355    ));
356
357    let i = initial_transform.as_coeffs();
358    x_object.matrix([
359        i[0] as f32,
360        i[1] as f32,
361        i[2] as f32,
362        i[3] as f32,
363        i[4] as f32,
364        i[5] as f32,
365    ]);
366
367    serialize_resources(page.resources(), ctx, &mut x_object);
368
369    x_object.finish();
370    ctx.chunks.push(chunk);
371
372    Ok(())
373}
374
375fn serialize_resources(
376    resources: &Resources,
377    ctx: &mut ExtractionContext,
378    writer: &mut impl ResourcesExt,
379) {
380    let ext_g_states = collect_resources(resources, |r| r.ext_g_states.clone());
381    let shadings = collect_resources(resources, |r| r.shadings.clone());
382    let patterns = collect_resources(resources, |r| r.patterns.clone());
383    let x_objects = collect_resources(resources, |r| r.x_objects.clone());
384    let color_spaces = collect_resources(resources, |r| r.color_spaces.clone());
385    let fonts = collect_resources(resources, |r| r.fonts.clone());
386    let properties = collect_resources(resources, |r| r.properties.clone());
387
388    if !(ext_g_states.is_empty()
389        && shadings.is_empty()
390        && patterns.is_empty()
391        && x_objects.is_empty()
392        && color_spaces.is_empty()
393        && properties.is_empty()
394        && fonts.is_empty())
395    {
396        let mut resources = writer.resources();
397
398        macro_rules! write {
399            ($name:ident, $key:expr) => {
400                if !$name.is_empty() {
401                    let mut dict = resources.insert(Name($key)).dict();
402
403                    for (name, obj) in $name {
404                        obj.write_direct(dict.insert(Name(name.deref())), ctx);
405                    }
406                }
407            };
408        }
409
410        write!(ext_g_states, EXT_G_STATE);
411        write!(shadings, SHADING);
412        write!(patterns, PATTERN);
413        write!(x_objects, XOBJECT);
414        write!(color_spaces, COLORSPACE);
415        write!(fonts, FONT);
416        write!(properties, PROPERTIES);
417    }
418}
419
420fn collect_resources<'a>(
421    resources: &Resources<'a>,
422    get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
423) -> BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>> {
424    let mut map = BTreeMap::new();
425    collect_resources_inner(resources, get_dict, &mut map);
426    map
427}
428
429fn collect_resources_inner<'a>(
430    resources: &Resources<'a>,
431    mut get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
432    map: &mut BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>>,
433) {
434    // Process parents first, so that duplicates get overridden by the current dictionary.
435    // Since for inheritance, the current dictionary always has priority over entries in the
436    // parent dictionary.
437    if let Some(parent) = resources.parent() {
438        collect_resources_inner(parent, get_dict.clone(), map);
439    }
440
441    let dict = get_dict(resources);
442
443    for (name, object) in dict.entries() {
444        map.insert(name, object);
445    }
446}
447
448pub(crate) fn deflate_encode(data: &[u8]) -> Vec<u8> {
449    use std::io::Write;
450
451    const COMPRESSION_LEVEL: u8 = 6;
452    let mut e = ZlibEncoder::new(Vec::new(), Compression::new(COMPRESSION_LEVEL as u32));
453    e.write_all(data).unwrap();
454    e.finish().unwrap()
455}
456
457fn convert_rect(hy_rect: &hayro_syntax::object::Rect) -> pdf_writer::Rect {
458    Rect::new(
459        hy_rect.x0 as f32,
460        hy_rect.y0 as f32,
461        hy_rect.x1 as f32,
462        hy_rect.y1 as f32,
463    )
464}
465
/// Abstraction over pdf-writer types that expose a `/Resources` dictionary
/// (pages and form XObjects), so `serialize_resources` can target either.
trait ResourcesExt {
    // Returns a writer for the object's `/Resources` dictionary.
    fn resources(&mut self) -> pdf_writer::writers::Resources;
}

impl ResourcesExt for pdf_writer::writers::Page<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources {
        // `Self::resources` resolves to the *inherent* `Page::resources`
        // method (inherent methods take precedence over trait methods in
        // path resolution), so this delegates rather than recursing.
        Self::resources(self)
    }
}

impl ResourcesExt for pdf_writer::writers::FormXObject<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources {
        // Same as above: delegates to the inherent `FormXObject::resources`.
        Self::resources(self)
    }
}