// hayro_write/lib.rs
/*!
A crate for converting PDF pages into either XObjects or a new page via [`pdf-writer`](https://docs.rs/pdf-writer/).

This is an internal crate and not meant for external use. Therefore, it's not very
well-documented.
*/
7
8#![forbid(unsafe_code)]
9#![deny(missing_docs)]
10
11mod primitive;
12
13use crate::primitive::{WriteDirect, WriteIndirect};
14use flate2::Compression;
15use flate2::write::ZlibEncoder;
16use hayro_syntax::object::Dict;
17use hayro_syntax::object::Object;
18use hayro_syntax::object::dict::keys::{
19    COLORSPACE, EXT_G_STATE, FONT, GROUP, PATTERN, PROPERTIES, SHADING, XOBJECT,
20};
21use hayro_syntax::object::{MaybeRef, ObjRef};
22use hayro_syntax::page::{Resources, Rotation};
23use log::warn;
24use pdf_writer::{Chunk, Content, Filter, Finish, Name, Rect, Ref};
25use std::collections::{BTreeMap, HashMap, HashSet};
26use std::ops::Deref;
27use std::ops::DerefMut;
28
29pub use hayro_syntax::page::{Page, Pages};
30pub use hayro_syntax::{LoadPdfError, Pdf, PdfData, PdfVersion};
31
32/// Apply the extraction queries to the given PDF and return the results.
33pub fn extract<'a>(
34    pdf: &Pdf,
35    new_ref: Box<dyn FnMut() -> Ref + 'a>,
36    queries: &[ExtractionQuery],
37) -> Result<ExtractionResult, ExtractionError> {
38    let pages = pdf.pages();
39    let mut ctx = ExtractionContext::new(new_ref);
40
41    for query in queries {
42        let page = pages
43            .get(query.page_index)
44            .ok_or(ExtractionError::InvalidPageIndex(query.page_index))?;
45
46        let root_ref = ctx.new_ref();
47
48        let res = match query.query_type {
49            ExtractionQueryType::XObject => write_xobject(page, root_ref, &mut ctx),
50            ExtractionQueryType::Page => write_page(page, root_ref, query.page_index, &mut ctx),
51        };
52
53        ctx.root_refs.push(res.map(|_| root_ref));
54    }
55
56    // Now we have shallowly extracted all pages, now go through all dependencies until there aren't
57    // any anymore.
58    write_dependencies(pdf, &mut ctx);
59
60    let mut global_chunk = Chunk::new();
61
62    for chunk in &ctx.chunks {
63        global_chunk.extend(chunk)
64    }
65
66    Ok(ExtractionResult {
67        chunk: global_chunk,
68        root_refs: ctx.root_refs,
69        page_tree_parent_ref: ctx.page_tree_parent_ref,
70    })
71}
72
/// A type of extraction query, indicating as what kind of
/// object you want to extract the page.
#[derive(Copy, Clone, Debug)]
pub enum ExtractionQueryType {
    /// Extract the page as an XObject (so it can be painted inside another page).
    XObject,
    /// Extract the page as a new page.
    Page,
}
82
/// An extraction query.
#[derive(Copy, Clone, Debug)]
pub struct ExtractionQuery {
    // Whether the page should be extracted as an XObject or as a page.
    query_type: ExtractionQueryType,
    // Zero-based index of the page in the document's page list.
    page_index: usize,
}
89
90impl ExtractionQuery {
91    /// Create a new page extraction query with the given page index.
92    pub fn new_page(page_index: usize) -> Self {
93        Self {
94            query_type: ExtractionQueryType::Page,
95            page_index,
96        }
97    }
98
99    /// Create a new XObject extraction query with the given page index.
100    pub fn new_xobject(page_index: usize) -> Self {
101        Self {
102            query_type: ExtractionQueryType::XObject,
103            page_index,
104        }
105    }
106}
107
/// An error that occurred during page extraction.
#[derive(Debug, Copy, Clone)]
pub enum ExtractionError {
    /// An invalid page index was given. Carries the offending index.
    InvalidPageIndex(usize),
}
114
/// The result of an extraction.
pub struct ExtractionResult {
    /// The chunk containing all objects as well as their dependencies.
    pub chunk: Chunk,
    /// The root references of the pages/XObject, one for each extraction query,
    /// in the same order as the queries.
    pub root_refs: Vec<Result<Ref, ExtractionError>>,
    /// The reference to the page tree parent that was generated. Extracted
    /// pages point to this node via their `/Parent` entry.
    pub page_tree_parent_ref: Ref,
}
124
// Mutable state threaded through a single `extract` run.
struct ExtractionContext<'a> {
    // One chunk per written object/page; merged into a single chunk at the end.
    chunks: Vec<Chunk>,
    // References that were already serialized, so nothing is written twice.
    visited_objects: HashSet<ObjRef>,
    // Work list of references still to be serialized; drained by
    // `write_dependencies` (presumably populated by the writers in the
    // `primitive` module — not visible from this file).
    to_visit_refs: Vec<ObjRef>,
    // One entry per extraction query, in query order.
    root_refs: Vec<Result<Ref, ExtractionError>>,
    // Allocator for fresh references in the output PDF.
    new_ref: Box<dyn FnMut() -> Ref + 'a>,
    // Maps references in the source PDF to references in the output PDF.
    ref_map: HashMap<ObjRef, Ref>,
    // Caches the content-stream ref written per page index, so extracting the
    // same page twice reuses the stream (see comment in `write_page`).
    cached_content_streams: HashMap<usize, Ref>,
    // Reference of the page tree parent node all extracted pages point to.
    page_tree_parent_ref: Ref,
}
135
136impl<'a> ExtractionContext<'a> {
137    fn new(mut new_ref: Box<dyn FnMut() -> Ref + 'a>) -> Self {
138        let page_tree_parent_ref = new_ref();
139        Self {
140            chunks: vec![],
141            visited_objects: HashSet::new(),
142            to_visit_refs: Vec::new(),
143            new_ref,
144            ref_map: HashMap::new(),
145            cached_content_streams: HashMap::new(),
146            root_refs: Vec::new(),
147            page_tree_parent_ref,
148        }
149    }
150
151    pub(crate) fn map_ref(&mut self, ref_: ObjRef) -> Ref {
152        if let Some(ref_) = self.ref_map.get(&ref_) {
153            *ref_
154        } else {
155            let new_ref = self.new_ref();
156            self.ref_map.insert(ref_, new_ref);
157
158            new_ref
159        }
160    }
161
162    pub(crate) fn new_ref(&mut self) -> Ref {
163        (self.new_ref)()
164    }
165}
166
167fn write_dependencies(pdf: &Pdf, ctx: &mut ExtractionContext) {
168    while let Some(ref_) = ctx.to_visit_refs.pop() {
169        // Don't visit objects twice!
170        if ctx.visited_objects.contains(&ref_) {
171            continue;
172        }
173
174        let mut chunk = Chunk::new();
175        if let Some(object) = pdf.xref().get::<Object>(ref_.into()) {
176            let new_ref = ctx.map_ref(ref_);
177            object.write_indirect(&mut chunk, new_ref, ctx);
178            ctx.chunks.push(chunk);
179
180            ctx.visited_objects.insert(ref_);
181        } else {
182            warn!("failed to extract object with ref: {ref_:?}");
183        }
184    }
185}
186
187/// Extract the given pages from the PDF and resave them as a new PDF. This function shouldn't be
188/// used directly and only exists for test purposes.
189#[doc(hidden)]
190pub fn extract_pages_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
191    let mut pdf = pdf_writer::Pdf::new();
192    let mut next_ref = Ref::new(1);
193    let requests = page_indices
194        .iter()
195        .map(|i| ExtractionQuery {
196            query_type: ExtractionQueryType::Page,
197            page_index: *i,
198        })
199        .collect::<Vec<_>>();
200
201    let catalog_id = next_ref.bump();
202
203    let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
204    pdf.catalog(catalog_id)
205        .pages(extracted.page_tree_parent_ref);
206    let count = extracted.root_refs.len();
207    pdf.pages(extracted.page_tree_parent_ref)
208        .kids(extracted.root_refs.iter().map(|r| r.unwrap()))
209        .count(count as i32);
210    pdf.extend(&extracted.chunk);
211
212    pdf.finish()
213}
214
215/// Extract the given pages as XObjects from the PDF and resave them as a new PDF.
216/// This function shouldn't be used directly and only exists for test purposes.
217#[doc(hidden)]
218pub fn extract_pages_as_xobject_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
219    let hayro_pages = hayro_pdf.pages();
220    let page_list = hayro_pages.as_ref();
221
222    let mut pdf = pdf_writer::Pdf::new();
223    let mut next_ref = Ref::new(1);
224
225    let catalog_id = next_ref.bump();
226    let requests = page_indices
227        .iter()
228        .map(|i| ExtractionQuery {
229            query_type: ExtractionQueryType::XObject,
230            page_index: *i,
231        })
232        .collect::<Vec<_>>();
233
234    let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
235
236    pdf.catalog(catalog_id)
237        .pages(extracted.page_tree_parent_ref);
238    let mut page_refs = vec![];
239
240    for (x_object_ref, page_idx) in extracted.root_refs.iter().zip(page_indices) {
241        let page = &page_list[*page_idx];
242        let render_dimensions = page.render_dimensions();
243
244        let mut content = Content::new();
245        content.x_object(Name(b"O1"));
246
247        let finished = content.finish();
248
249        let page_id = next_ref.bump();
250        let stream_id = next_ref.bump();
251        page_refs.push(page_id);
252
253        let mut page = pdf.page(page_id);
254        page.resources()
255            .x_objects()
256            .pair(Name(b"O1"), x_object_ref.unwrap());
257        page.media_box(Rect::new(
258            0.0,
259            0.0,
260            render_dimensions.0,
261            render_dimensions.1,
262        ));
263        page.parent(extracted.page_tree_parent_ref);
264        page.contents(stream_id);
265        page.finish();
266
267        pdf.stream(stream_id, finished.as_slice());
268    }
269
270    let count = extracted.root_refs.len();
271    pdf.pages(extracted.page_tree_parent_ref)
272        .kids(page_refs)
273        .count(count as i32);
274    pdf.extend(&extracted.chunk);
275
276    pdf.finish()
277}
278
279fn write_page(
280    page: &hayro_syntax::page::Page,
281    page_ref: Ref,
282    page_idx: usize,
283    ctx: &mut ExtractionContext,
284) -> Result<(), ExtractionError> {
285    let mut chunk = Chunk::new();
286    // Note: We can cache content stream references, but _not_ the page references themselves.
287    // Acrobat for some reason doesn't like duplicate page references in the page tree.
288    let stream_ref = if let Some(cached) = ctx.cached_content_streams.get(&page_idx) {
289        *cached
290    } else {
291        let stream_ref = ctx.new_ref();
292
293        chunk
294            .stream(
295                stream_ref,
296                &deflate_encode(page.page_stream().unwrap_or(b"")),
297            )
298            .filter(Filter::FlateDecode);
299        ctx.cached_content_streams.insert(page_idx, stream_ref);
300
301        stream_ref
302    };
303
304    let mut pdf_page = chunk.page(page_ref);
305
306    pdf_page
307        .media_box(convert_rect(&page.media_box()))
308        .crop_box(convert_rect(&page.crop_box()))
309        .rotate(match page.rotation() {
310            Rotation::None => 0,
311            Rotation::Horizontal => 90,
312            Rotation::Flipped => 180,
313            Rotation::FlippedHorizontal => 270,
314        })
315        .parent(ctx.page_tree_parent_ref)
316        .contents(stream_ref);
317
318    let raw_dict = page.raw();
319
320    if let Some(group) = raw_dict.get_raw::<Object>(GROUP) {
321        group.write_direct(pdf_page.insert(pdf_writer::Name(GROUP)), ctx);
322    }
323
324    serialize_resources(page.resources(), ctx, &mut pdf_page);
325
326    pdf_page.finish();
327
328    ctx.chunks.push(chunk);
329
330    Ok(())
331}
332
333fn write_xobject(
334    page: &hayro_syntax::page::Page,
335    xobj_ref: Ref,
336    ctx: &mut ExtractionContext,
337) -> Result<(), ExtractionError> {
338    let mut chunk = Chunk::new();
339    let encoded_stream = deflate_encode(page.page_stream().unwrap_or(b""));
340    let mut x_object = chunk.form_xobject(xobj_ref, &encoded_stream);
341    x_object.deref_mut().filter(Filter::FlateDecode);
342
343    let bbox = page.crop_box();
344    let initial_transform = page.initial_transform(false);
345
346    x_object.bbox(pdf_writer::Rect::new(
347        bbox.x0 as f32,
348        bbox.y0 as f32,
349        bbox.x1 as f32,
350        bbox.y1 as f32,
351    ));
352
353    let i = initial_transform.as_coeffs();
354    x_object.matrix([
355        i[0] as f32,
356        i[1] as f32,
357        i[2] as f32,
358        i[3] as f32,
359        i[4] as f32,
360        i[5] as f32,
361    ]);
362
363    serialize_resources(page.resources(), ctx, &mut x_object);
364
365    x_object.finish();
366    ctx.chunks.push(chunk);
367
368    Ok(())
369}
370
371fn serialize_resources(
372    resources: &Resources,
373    ctx: &mut ExtractionContext,
374    writer: &mut impl ResourcesExt,
375) {
376    let ext_g_states = collect_resources(resources, |r| r.ext_g_states.clone());
377    let shadings = collect_resources(resources, |r| r.shadings.clone());
378    let patterns = collect_resources(resources, |r| r.patterns.clone());
379    let x_objects = collect_resources(resources, |r| r.x_objects.clone());
380    let color_spaces = collect_resources(resources, |r| r.color_spaces.clone());
381    let fonts = collect_resources(resources, |r| r.fonts.clone());
382    let properties = collect_resources(resources, |r| r.properties.clone());
383
384    if !(ext_g_states.is_empty()
385        && shadings.is_empty()
386        && patterns.is_empty()
387        && x_objects.is_empty()
388        && color_spaces.is_empty()
389        && properties.is_empty()
390        && fonts.is_empty())
391    {
392        let mut resources = writer.resources();
393
394        macro_rules! write {
395            ($name:ident, $key:expr) => {
396                if !$name.is_empty() {
397                    let mut dict = resources.insert(Name($key)).dict();
398
399                    for (name, obj) in $name {
400                        obj.write_direct(dict.insert(Name(name.deref())), ctx);
401                    }
402                }
403            };
404        }
405
406        write!(ext_g_states, EXT_G_STATE);
407        write!(shadings, SHADING);
408        write!(patterns, PATTERN);
409        write!(x_objects, XOBJECT);
410        write!(color_spaces, COLORSPACE);
411        write!(fonts, FONT);
412        write!(properties, PROPERTIES);
413    }
414}
415
416fn collect_resources<'a>(
417    resources: &Resources<'a>,
418    get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
419) -> BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>> {
420    let mut map = BTreeMap::new();
421    collect_resources_inner(resources, get_dict, &mut map);
422    map
423}
424
425fn collect_resources_inner<'a>(
426    resources: &Resources<'a>,
427    mut get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
428    map: &mut BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>>,
429) {
430    // Process parents first, so that duplicates get overridden by the current dictionary.
431    // Since for inheritance, the current dictionary always has priority over entries in the
432    // parent dictionary.
433    if let Some(parent) = resources.parent() {
434        collect_resources_inner(parent, get_dict.clone(), map);
435    }
436
437    let dict = get_dict(resources);
438
439    for (name, object) in dict.entries() {
440        map.insert(name, object);
441    }
442}
443
444pub(crate) fn deflate_encode(data: &[u8]) -> Vec<u8> {
445    use std::io::Write;
446
447    const COMPRESSION_LEVEL: u8 = 6;
448    let mut e = ZlibEncoder::new(Vec::new(), Compression::new(COMPRESSION_LEVEL as u32));
449    e.write_all(data).unwrap();
450    e.finish().unwrap()
451}
452
453fn convert_rect(hy_rect: &hayro_syntax::object::Rect) -> pdf_writer::Rect {
454    Rect::new(
455        hy_rect.x0 as f32,
456        hy_rect.y0 as f32,
457        hy_rect.x1 as f32,
458        hy_rect.y1 as f32,
459    )
460}
461
// Abstraction over pdf-writer types that expose a resources dictionary writer
// (pages and form XObjects), so `serialize_resources` can target either.
trait ResourcesExt {
    fn resources(&mut self) -> pdf_writer::writers::Resources;
}
465
impl ResourcesExt for pdf_writer::writers::Page<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources {
        // `Self::resources` resolves to the *inherent* method on `Page`
        // (inherent impls take precedence over trait methods), so this
        // forwards rather than recursing into this trait method.
        Self::resources(self)
    }
}
471
impl ResourcesExt for pdf_writer::writers::FormXObject<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources {
        // Same pattern as the `Page` impl: `Self::resources` picks the
        // inherent method on `FormXObject`, not this trait method.
        Self::resources(self)
    }
}