1#![forbid(unsafe_code)]
9#![deny(missing_docs)]
10
11mod primitive;
12
13use crate::primitive::{WriteDirect, WriteIndirect};
14use flate2::Compression;
15use flate2::write::ZlibEncoder;
16use hayro_syntax::object::Dict;
17use hayro_syntax::object::Object;
18use hayro_syntax::object::dict::keys::{
19 COLORSPACE, EXT_G_STATE, FONT, GROUP, PATTERN, PROPERTIES, SHADING, XOBJECT,
20};
21use hayro_syntax::object::{MaybeRef, ObjRef};
22use hayro_syntax::page::{Resources, Rotation};
23use log::warn;
24use pdf_writer::{Chunk, Content, Filter, Finish, Name, Rect, Ref};
25use std::collections::{BTreeMap, HashMap, HashSet};
26use std::ops::Deref;
27use std::ops::DerefMut;
28
29pub use hayro_syntax::page::{Page, Pages};
30pub use hayro_syntax::{LoadPdfError, Pdf, PdfData, PdfVersion};
31
32pub fn extract<'a>(
34 pdf: &Pdf,
35 new_ref: Box<dyn FnMut() -> Ref + 'a>,
36 queries: &[ExtractionQuery],
37) -> Result<ExtractionResult, ExtractionError> {
38 let pages = pdf.pages();
39 let mut ctx = ExtractionContext::new(new_ref);
40
41 for query in queries {
42 let page = pages
43 .get(query.page_index)
44 .ok_or(ExtractionError::InvalidPageIndex(query.page_index))?;
45
46 let root_ref = ctx.new_ref();
47
48 let res = match query.query_type {
49 ExtractionQueryType::XObject => write_xobject(page, root_ref, &mut ctx),
50 ExtractionQueryType::Page => write_page(page, root_ref, query.page_index, &mut ctx),
51 };
52
53 ctx.root_refs.push(res.map(|_| root_ref));
54 }
55
56 write_dependencies(pdf, &mut ctx);
59
60 let mut global_chunk = Chunk::new();
61
62 for chunk in &ctx.chunks {
63 global_chunk.extend(chunk)
64 }
65
66 Ok(ExtractionResult {
67 chunk: global_chunk,
68 root_refs: ctx.root_refs,
69 page_tree_parent_ref: ctx.page_tree_parent_ref,
70 })
71}
72
/// The kind of object a page should be extracted as.
#[derive(Copy, Clone, Debug)]
pub enum ExtractionQueryType {
    /// Extract the page as a form XObject (with BBox and Matrix set from the
    /// page's crop box and initial transform).
    XObject,
    /// Extract the page as a stand-alone page object.
    Page,
}
82
/// A request to extract a single page, either as a page object or as a
/// form XObject.
#[derive(Copy, Clone, Debug)]
pub struct ExtractionQuery {
    // How the page should be written out.
    query_type: ExtractionQueryType,
    // Zero-based index of the page in the document's page list.
    page_index: usize,
}
89
90impl ExtractionQuery {
91 pub fn new_page(page_index: usize) -> Self {
93 Self {
94 query_type: ExtractionQueryType::Page,
95 page_index,
96 }
97 }
98
99 pub fn new_xobject(page_index: usize) -> Self {
101 Self {
102 query_type: ExtractionQueryType::XObject,
103 page_index,
104 }
105 }
106}
107
/// An error that can occur during extraction.
#[derive(Debug, Copy, Clone)]
pub enum ExtractionError {
    /// A query referenced a page index that does not exist in the document.
    InvalidPageIndex(usize),
}
114
/// The output of a successful [`extract`] call.
pub struct ExtractionResult {
    /// All serialized objects (roots plus their transitive dependencies).
    pub chunk: Chunk,
    /// One entry per query, in query order: the reference of the extracted
    /// root object, or the error for that particular query.
    pub root_refs: Vec<Result<Ref, ExtractionError>>,
    /// The reference used as the `/Parent` of extracted pages; the caller is
    /// expected to write a page-tree node under this reference.
    pub page_tree_parent_ref: Ref,
}
124
// Mutable state threaded through the whole extraction run.
struct ExtractionContext<'a> {
    // One chunk per written object; merged at the end of `extract`.
    chunks: Vec<Chunk>,
    // Source-PDF refs that have already been serialized.
    visited_objects: HashSet<ObjRef>,
    // Source-PDF refs discovered but not yet serialized (work queue).
    to_visit_refs: Vec<ObjRef>,
    // Per-query outcome, pushed in query order.
    root_refs: Vec<Result<Ref, ExtractionError>>,
    // Caller-supplied generator for fresh output references.
    new_ref: Box<dyn FnMut() -> Ref + 'a>,
    // Mapping from source-PDF refs to output refs.
    ref_map: HashMap<ObjRef, Ref>,
    // Content-stream ref per page index, so a page queried twice shares one
    // stream object.
    cached_content_streams: HashMap<usize, Ref>,
    // Pre-allocated `/Parent` reference for extracted pages.
    page_tree_parent_ref: Ref,
}
135
136impl<'a> ExtractionContext<'a> {
137 fn new(mut new_ref: Box<dyn FnMut() -> Ref + 'a>) -> Self {
138 let page_tree_parent_ref = new_ref();
139 Self {
140 chunks: vec![],
141 visited_objects: HashSet::new(),
142 to_visit_refs: Vec::new(),
143 new_ref,
144 ref_map: HashMap::new(),
145 cached_content_streams: HashMap::new(),
146 root_refs: Vec::new(),
147 page_tree_parent_ref,
148 }
149 }
150
151 pub(crate) fn map_ref(&mut self, ref_: ObjRef) -> Ref {
152 if let Some(ref_) = self.ref_map.get(&ref_) {
153 *ref_
154 } else {
155 let new_ref = self.new_ref();
156 self.ref_map.insert(ref_, new_ref);
157
158 new_ref
159 }
160 }
161
162 pub(crate) fn new_ref(&mut self) -> Ref {
163 (self.new_ref)()
164 }
165}
166
167fn write_dependencies(pdf: &Pdf, ctx: &mut ExtractionContext) {
168 while let Some(ref_) = ctx.to_visit_refs.pop() {
169 if ctx.visited_objects.contains(&ref_) {
171 continue;
172 }
173
174 let mut chunk = Chunk::new();
175 if let Some(object) = pdf.xref().get::<Object>(ref_.into()) {
176 let new_ref = ctx.map_ref(ref_);
177 object.write_indirect(&mut chunk, new_ref, ctx);
178 ctx.chunks.push(chunk);
179
180 ctx.visited_objects.insert(ref_);
181 } else {
182 warn!("failed to extract object with ref: {ref_:?}");
183 }
184 }
185}
186
187#[doc(hidden)]
190pub fn extract_pages_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
191 let mut pdf = pdf_writer::Pdf::new();
192 let mut next_ref = Ref::new(1);
193 let requests = page_indices
194 .iter()
195 .map(|i| ExtractionQuery {
196 query_type: ExtractionQueryType::Page,
197 page_index: *i,
198 })
199 .collect::<Vec<_>>();
200
201 let catalog_id = next_ref.bump();
202
203 let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
204 pdf.catalog(catalog_id)
205 .pages(extracted.page_tree_parent_ref);
206 let count = extracted.root_refs.len();
207 pdf.pages(extracted.page_tree_parent_ref)
208 .kids(extracted.root_refs.iter().map(|r| r.unwrap()))
209 .count(count as i32);
210 pdf.extend(&extracted.chunk);
211
212 pdf.finish()
213}
214
215#[doc(hidden)]
218pub fn extract_pages_as_xobject_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
219 let hayro_pages = hayro_pdf.pages();
220 let page_list = hayro_pages.as_ref();
221
222 let mut pdf = pdf_writer::Pdf::new();
223 let mut next_ref = Ref::new(1);
224
225 let catalog_id = next_ref.bump();
226 let requests = page_indices
227 .iter()
228 .map(|i| ExtractionQuery {
229 query_type: ExtractionQueryType::XObject,
230 page_index: *i,
231 })
232 .collect::<Vec<_>>();
233
234 let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
235
236 pdf.catalog(catalog_id)
237 .pages(extracted.page_tree_parent_ref);
238 let mut page_refs = vec![];
239
240 for (x_object_ref, page_idx) in extracted.root_refs.iter().zip(page_indices) {
241 let page = &page_list[*page_idx];
242 let render_dimensions = page.render_dimensions();
243
244 let mut content = Content::new();
245 content.x_object(Name(b"O1"));
246
247 let finished = content.finish();
248
249 let page_id = next_ref.bump();
250 let stream_id = next_ref.bump();
251 page_refs.push(page_id);
252
253 let mut page = pdf.page(page_id);
254 page.resources()
255 .x_objects()
256 .pair(Name(b"O1"), x_object_ref.unwrap());
257 page.media_box(Rect::new(
258 0.0,
259 0.0,
260 render_dimensions.0,
261 render_dimensions.1,
262 ));
263 page.parent(extracted.page_tree_parent_ref);
264 page.contents(stream_id);
265 page.finish();
266
267 pdf.stream(stream_id, finished.as_slice());
268 }
269
270 let count = extracted.root_refs.len();
271 pdf.pages(extracted.page_tree_parent_ref)
272 .kids(page_refs)
273 .count(count as i32);
274 pdf.extend(&extracted.chunk);
275
276 pdf.finish()
277}
278
/// Serialize `page` as a page object under `page_ref`, reusing a previously
/// written content stream if the same page index was already extracted.
fn write_page(
    page: &hayro_syntax::page::Page,
    page_ref: Ref,
    page_idx: usize,
    ctx: &mut ExtractionContext,
) -> Result<(), ExtractionError> {
    let mut chunk = Chunk::new();
    // One content stream per page index: the first query for a page writes
    // the (deflated) stream; later queries only reference it.
    let stream_ref = if let Some(cached) = ctx.cached_content_streams.get(&page_idx) {
        *cached
    } else {
        let stream_ref = ctx.new_ref();

        chunk
            .stream(
                stream_ref,
                &deflate_encode(page.page_stream().unwrap_or(b"")),
            )
            .filter(Filter::FlateDecode);
        ctx.cached_content_streams.insert(page_idx, stream_ref);

        stream_ref
    };

    let mut pdf_page = chunk.page(page_ref);

    pdf_page
        .media_box(convert_rect(&page.media_box()))
        .crop_box(convert_rect(&page.crop_box()))
        // Map the source page's rotation variant to degrees for /Rotate.
        .rotate(match page.rotation() {
            Rotation::None => 0,
            Rotation::Horizontal => 90,
            Rotation::Flipped => 180,
            Rotation::FlippedHorizontal => 270,
        })
        .parent(ctx.page_tree_parent_ref)
        .contents(stream_ref);

    let raw_dict = page.raw();

    // Carry over the page's transparency group entry verbatim, if present.
    if let Some(group) = raw_dict.get_raw::<Object>(GROUP) {
        group.write_direct(pdf_page.insert(pdf_writer::Name(GROUP)), ctx);
    }

    serialize_resources(page.resources(), ctx, &mut pdf_page);

    pdf_page.finish();

    ctx.chunks.push(chunk);

    Ok(())
}
332
333fn write_xobject(
334 page: &hayro_syntax::page::Page,
335 xobj_ref: Ref,
336 ctx: &mut ExtractionContext,
337) -> Result<(), ExtractionError> {
338 let mut chunk = Chunk::new();
339 let encoded_stream = deflate_encode(page.page_stream().unwrap_or(b""));
340 let mut x_object = chunk.form_xobject(xobj_ref, &encoded_stream);
341 x_object.deref_mut().filter(Filter::FlateDecode);
342
343 let bbox = page.crop_box();
344 let initial_transform = page.initial_transform(false);
345
346 x_object.bbox(pdf_writer::Rect::new(
347 bbox.x0 as f32,
348 bbox.y0 as f32,
349 bbox.x1 as f32,
350 bbox.y1 as f32,
351 ));
352
353 let i = initial_transform.as_coeffs();
354 x_object.matrix([
355 i[0] as f32,
356 i[1] as f32,
357 i[2] as f32,
358 i[3] as f32,
359 i[4] as f32,
360 i[5] as f32,
361 ]);
362
363 serialize_resources(page.resources(), ctx, &mut x_object);
364
365 x_object.finish();
366 ctx.chunks.push(chunk);
367
368 Ok(())
369}
370
/// Write the page's effective resource dictionary (each category flattened
/// across the resource-inheritance chain) into `writer`. Nothing is written
/// when every category is empty.
fn serialize_resources(
    resources: &Resources,
    ctx: &mut ExtractionContext,
    writer: &mut impl ResourcesExt,
) {
    // Flatten each category over the parent chain (child entries win).
    let ext_g_states = collect_resources(resources, |r| r.ext_g_states.clone());
    let shadings = collect_resources(resources, |r| r.shadings.clone());
    let patterns = collect_resources(resources, |r| r.patterns.clone());
    let x_objects = collect_resources(resources, |r| r.x_objects.clone());
    let color_spaces = collect_resources(resources, |r| r.color_spaces.clone());
    let fonts = collect_resources(resources, |r| r.fonts.clone());
    let properties = collect_resources(resources, |r| r.properties.clone());

    // Only emit a /Resources dictionary if at least one category has entries.
    if !(ext_g_states.is_empty()
        && shadings.is_empty()
        && patterns.is_empty()
        && x_objects.is_empty()
        && color_spaces.is_empty()
        && properties.is_empty()
        && fonts.is_empty())
    {
        let mut resources = writer.resources();

        // For each non-empty category, write a sub-dictionary with every
        // entry serialized in place (refs are queued via `ctx`).
        macro_rules! write {
            ($name:ident, $key:expr) => {
                if !$name.is_empty() {
                    let mut dict = resources.insert(Name($key)).dict();

                    for (name, obj) in $name {
                        obj.write_direct(dict.insert(Name(name.deref())), ctx);
                    }
                }
            };
        }

        write!(ext_g_states, EXT_G_STATE);
        write!(shadings, SHADING);
        write!(patterns, PATTERN);
        write!(x_objects, XOBJECT);
        write!(color_spaces, COLORSPACE);
        write!(fonts, FONT);
        write!(properties, PROPERTIES);
    }
}
415
416fn collect_resources<'a>(
417 resources: &Resources<'a>,
418 get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
419) -> BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>> {
420 let mut map = BTreeMap::new();
421 collect_resources_inner(resources, get_dict, &mut map);
422 map
423}
424
425fn collect_resources_inner<'a>(
426 resources: &Resources<'a>,
427 mut get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
428 map: &mut BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>>,
429) {
430 if let Some(parent) = resources.parent() {
434 collect_resources_inner(parent, get_dict.clone(), map);
435 }
436
437 let dict = get_dict(resources);
438
439 for (name, object) in dict.entries() {
440 map.insert(name, object);
441 }
442}
443
444pub(crate) fn deflate_encode(data: &[u8]) -> Vec<u8> {
445 use std::io::Write;
446
447 const COMPRESSION_LEVEL: u8 = 6;
448 let mut e = ZlibEncoder::new(Vec::new(), Compression::new(COMPRESSION_LEVEL as u32));
449 e.write_all(data).unwrap();
450 e.finish().unwrap()
451}
452
453fn convert_rect(hy_rect: &hayro_syntax::object::Rect) -> pdf_writer::Rect {
454 Rect::new(
455 hy_rect.x0 as f32,
456 hy_rect.y0 as f32,
457 hy_rect.x1 as f32,
458 hy_rect.y1 as f32,
459 )
460}
461
// Abstraction over the two pdf-writer types that can carry a /Resources
// dictionary, so `serialize_resources` can target either.
trait ResourcesExt {
    // Start writing this object's /Resources dictionary.
    fn resources(&mut self) -> pdf_writer::writers::Resources;
}
465
impl ResourcesExt for pdf_writer::writers::Page<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources {
        // Not infinite recursion: for `Self::resources`, Rust resolves to the
        // inherent `pdf_writer::writers::Page::resources` method, which takes
        // precedence over this trait method.
        Self::resources(self)
    }
}
471
impl ResourcesExt for pdf_writer::writers::FormXObject<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources {
        // Not infinite recursion: resolves to the inherent
        // `pdf_writer::writers::FormXObject::resources` method (inherent
        // impls take precedence over trait methods in path resolution).
        Self::resources(self)
    }
}