1#![forbid(unsafe_code)]
9#![deny(missing_docs)]
10
11mod primitive;
12
13use crate::primitive::{WriteDirect, WriteIndirect};
14use flate2::Compression;
15use flate2::write::ZlibEncoder;
16use hayro_syntax::Pdf;
17use hayro_syntax::object::Dict;
18use hayro_syntax::object::Object;
19use hayro_syntax::object::dict::keys::{
20 COLORSPACE, EXT_G_STATE, FONT, GROUP, PATTERN, PROPERTIES, SHADING, XOBJECT,
21};
22use hayro_syntax::object::{MaybeRef, ObjRef};
23use hayro_syntax::page::{Page, Resources, Rotation};
24use kurbo::Affine;
25use log::warn;
26use pdf_writer::{Chunk, Content, Filter, Finish, Name, Rect, Ref};
27use std::collections::{BTreeMap, HashMap, HashSet};
28use std::ops::Deref;
29use std::ops::DerefMut;
30
31pub use hayro_syntax;
32
33pub fn extract<'a>(
35 pdf: &Pdf,
36 new_ref: Box<dyn FnMut() -> Ref + 'a>,
37 queries: &[ExtractionQuery],
38) -> Result<ExtractionResult, ExtractionError> {
39 let pages = pdf.pages();
40 let mut ctx = ExtractionContext::new(new_ref, pdf);
41
42 for query in queries {
43 let page = pages
44 .get(query.page_index)
45 .ok_or(ExtractionError::InvalidPageIndex(query.page_index))?;
46
47 let root_ref = ctx.new_ref();
48
49 let res = match query.query_type {
50 ExtractionQueryType::XObject => write_xobject(page, root_ref, &mut ctx),
51 ExtractionQueryType::Page => write_page(page, root_ref, query.page_index, &mut ctx),
52 };
53
54 ctx.root_refs.push(res.map(|_| root_ref));
55 }
56
57 write_dependencies(pdf, &mut ctx);
60
61 let mut global_chunk = Chunk::new();
62
63 for chunk in &ctx.chunks {
64 global_chunk.extend(chunk);
65 }
66
67 Ok(ExtractionResult {
68 chunk: global_chunk,
69 root_refs: ctx.root_refs,
70 page_tree_parent_ref: ctx.page_tree_parent_ref,
71 })
72}
73
/// The kind of object a page should be extracted as.
///
/// The type is a plain tag, so it can be compared and hashed.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum ExtractionQueryType {
    /// Extract the page as a form XObject.
    XObject,
    /// Extract the page as a page object.
    Page,
}
83
/// A request to extract a single page of a document.
#[derive(Copy, Clone, Debug)]
pub struct ExtractionQuery {
    /// Whether the page is extracted as a page object or as a form XObject.
    query_type: ExtractionQueryType,
    /// Index of the page in the source document's page list.
    page_index: usize,
}
90
91impl ExtractionQuery {
92 pub fn new_page(page_index: usize) -> Self {
94 Self {
95 query_type: ExtractionQueryType::Page,
96 page_index,
97 }
98 }
99
100 pub fn new_xobject(page_index: usize) -> Self {
102 Self {
103 query_type: ExtractionQueryType::XObject,
104 page_index,
105 }
106 }
107}
108
/// An error that can occur while extracting pages.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ExtractionError {
    /// A query referred to a page index that does not exist in the document.
    /// The payload is the offending index.
    InvalidPageIndex(usize),
}

impl std::fmt::Display for ExtractionError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidPageIndex(index) => write!(f, "invalid page index: {index}"),
        }
    }
}

// Lets callers propagate the error with `?` into `Box<dyn Error>` and similar.
impl std::error::Error for ExtractionError {}
115
/// The outcome of a call to [`extract`].
pub struct ExtractionResult {
    /// The chunk containing every object written during extraction.
    pub chunk: Chunk,
    /// One entry per query, in query order: the reference of the page or form
    /// XObject written for that query, or the error for that query.
    pub root_refs: Vec<Result<Ref, ExtractionError>>,
    /// The reference that extracted pages name as their parent. `extract`
    /// itself does not write an object at this reference in the code shown
    /// here, so the caller is expected to write the page tree node there.
    pub page_tree_parent_ref: Ref,
}
125
/// Mutable state shared by all writers during a single extraction run.
struct ExtractionContext<'a> {
    /// Chunks written so far; merged into one chunk at the end of `extract`.
    chunks: Vec<Chunk>,
    /// Source references whose objects `write_dependencies` has already written.
    visited_objects: HashSet<ObjRef>,
    /// Stack of source references still to be written; drained by
    /// `write_dependencies`. Nothing in this part of the file pushes to it —
    /// presumably the writers in `primitive` do (TODO confirm).
    to_visit_refs: Vec<ObjRef>,
    /// Not read or written in this part of the file; presumably a cache used
    /// by the writers in `primitive` (TODO confirm).
    valid_ref_cache: HashMap<ObjRef, bool>,
    /// Per-query results, in query order.
    root_refs: Vec<Result<Ref, ExtractionError>>,
    /// The source document.
    pdf: &'a Pdf,
    /// Callback that hands out a reference for each newly written object.
    new_ref: Box<dyn FnMut() -> Ref + 'a>,
    /// Mapping from source references to the references used in the output.
    ref_map: HashMap<ObjRef, Ref>,
    /// Page index to the reference of that page's already-written content
    /// stream, so the stream is written only once per page.
    cached_content_streams: HashMap<usize, Ref>,
    /// Reference that written pages name as their parent.
    page_tree_parent_ref: Ref,
}
138
139impl<'a> ExtractionContext<'a> {
140 fn new(mut new_ref: Box<dyn FnMut() -> Ref + 'a>, pdf: &'a Pdf) -> Self {
141 let page_tree_parent_ref = new_ref();
142 Self {
143 chunks: vec![],
144 visited_objects: HashSet::new(),
145 to_visit_refs: Vec::new(),
146 valid_ref_cache: HashMap::new(),
147 pdf,
148 new_ref,
149 ref_map: HashMap::new(),
150 cached_content_streams: HashMap::new(),
151 root_refs: Vec::new(),
152 page_tree_parent_ref,
153 }
154 }
155
156 pub(crate) fn map_ref(&mut self, ref_: ObjRef) -> Ref {
157 if let Some(ref_) = self.ref_map.get(&ref_) {
158 *ref_
159 } else {
160 let new_ref = self.new_ref();
161 self.ref_map.insert(ref_, new_ref);
162
163 new_ref
164 }
165 }
166
167 pub(crate) fn new_ref(&mut self) -> Ref {
168 (self.new_ref)()
169 }
170}
171
172fn write_dependencies(pdf: &Pdf, ctx: &mut ExtractionContext<'_>) {
173 while let Some(ref_) = ctx.to_visit_refs.pop() {
174 if ctx.visited_objects.contains(&ref_) {
176 continue;
177 }
178
179 let mut chunk = Chunk::new();
180 if let Some(object) = pdf.xref().get::<Object<'_>>(ref_.into()) {
181 let new_ref = ctx.map_ref(ref_);
182 object.write_indirect(&mut chunk, new_ref, ctx);
183 ctx.chunks.push(chunk);
184
185 ctx.visited_objects.insert(ref_);
186 } else {
187 warn!("failed to extract object with ref: {ref_:?}");
188 }
189 }
190}
191
192#[doc(hidden)]
195pub fn extract_pages_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
196 let mut pdf = pdf_writer::Pdf::new();
197 let mut next_ref = Ref::new(1);
198 let requests = page_indices
199 .iter()
200 .map(|i| ExtractionQuery {
201 query_type: ExtractionQueryType::Page,
202 page_index: *i,
203 })
204 .collect::<Vec<_>>();
205
206 let catalog_id = next_ref.bump();
207
208 let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
209 pdf.catalog(catalog_id)
210 .pages(extracted.page_tree_parent_ref);
211 let count = extracted.root_refs.len();
212 pdf.pages(extracted.page_tree_parent_ref)
213 .kids(extracted.root_refs.iter().map(|r| r.unwrap()))
214 .count(count as i32);
215 pdf.extend(&extracted.chunk);
216
217 pdf.finish()
218}
219
220#[doc(hidden)]
223pub fn extract_pages_as_xobject_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
224 let hayro_pages = hayro_pdf.pages();
225 let page_list = hayro_pages.as_ref();
226
227 let mut pdf = pdf_writer::Pdf::new();
228 let mut next_ref = Ref::new(1);
229
230 let catalog_id = next_ref.bump();
231 let requests = page_indices
232 .iter()
233 .map(|i| ExtractionQuery {
234 query_type: ExtractionQueryType::XObject,
235 page_index: *i,
236 })
237 .collect::<Vec<_>>();
238
239 let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
240
241 pdf.catalog(catalog_id)
242 .pages(extracted.page_tree_parent_ref);
243 let mut page_refs = vec![];
244
245 for (x_object_ref, page_idx) in extracted.root_refs.iter().zip(page_indices) {
246 let page = &page_list[*page_idx];
247 let render_dimensions = page.render_dimensions();
248
249 let mut content = Content::new();
250 content.x_object(Name(b"O1"));
251
252 let finished = content.finish();
253
254 let page_id = next_ref.bump();
255 let stream_id = next_ref.bump();
256 page_refs.push(page_id);
257
258 let mut page = pdf.page(page_id);
259 page.resources()
260 .x_objects()
261 .pair(Name(b"O1"), x_object_ref.unwrap());
262 page.media_box(Rect::new(
263 0.0,
264 0.0,
265 render_dimensions.0,
266 render_dimensions.1,
267 ));
268 page.parent(extracted.page_tree_parent_ref);
269 page.contents(stream_id);
270 page.finish();
271
272 pdf.stream(stream_id, finished.as_slice());
273 }
274
275 let count = extracted.root_refs.len();
276 pdf.pages(extracted.page_tree_parent_ref)
277 .kids(page_refs)
278 .count(count as i32);
279 pdf.extend(&extracted.chunk);
280
281 pdf.finish()
282}
283
284fn write_page(
285 page: &Page<'_>,
286 page_ref: Ref,
287 page_idx: usize,
288 ctx: &mut ExtractionContext<'_>,
289) -> Result<(), ExtractionError> {
290 let mut chunk = Chunk::new();
291 let stream_ref = if let Some(cached) = ctx.cached_content_streams.get(&page_idx) {
294 *cached
295 } else {
296 let stream_ref = ctx.new_ref();
297
298 chunk
299 .stream(
300 stream_ref,
301 &deflate_encode(page.page_stream().unwrap_or(b"")),
302 )
303 .filter(Filter::FlateDecode);
304 ctx.cached_content_streams.insert(page_idx, stream_ref);
305
306 stream_ref
307 };
308
309 let mut pdf_page = chunk.page(page_ref);
310
311 pdf_page
312 .media_box(convert_rect(&page.media_box()))
313 .crop_box(convert_rect(&page.crop_box()))
314 .rotate(match page.rotation() {
315 Rotation::None => 0,
316 Rotation::Horizontal => 90,
317 Rotation::Flipped => 180,
318 Rotation::FlippedHorizontal => 270,
319 })
320 .parent(ctx.page_tree_parent_ref)
321 .contents(stream_ref);
322
323 let raw_dict = page.raw();
324
325 if let Some(group) = raw_dict.get_raw::<Object<'_>>(GROUP) {
326 group.write_direct(pdf_page.insert(Name(GROUP)), ctx);
327 }
328
329 serialize_resources(page.resources(), ctx, &mut pdf_page);
330
331 pdf_page.finish();
332
333 ctx.chunks.push(chunk);
334
335 Ok(())
336}
337
338fn write_xobject(
339 page: &Page<'_>,
340 xobj_ref: Ref,
341 ctx: &mut ExtractionContext<'_>,
342) -> Result<(), ExtractionError> {
343 let mut chunk = Chunk::new();
344 let encoded_stream = deflate_encode(page.page_stream().unwrap_or(b""));
345 let mut x_object = chunk.form_xobject(xobj_ref, &encoded_stream);
346 x_object.deref_mut().filter(Filter::FlateDecode);
347
348 let bbox = page.crop_box();
349 let initial_transform = page.initial_transform(false);
350
351 x_object.bbox(Rect::new(
352 bbox.x0 as f32,
353 bbox.y0 as f32,
354 bbox.x1 as f32,
355 bbox.y1 as f32,
356 ));
357
358 let i = initial_transform.as_coeffs();
359 x_object.matrix([
360 i[0] as f32,
361 i[1] as f32,
362 i[2] as f32,
363 i[3] as f32,
364 i[4] as f32,
365 i[5] as f32,
366 ]);
367
368 serialize_resources(page.resources(), ctx, &mut x_object);
369
370 x_object.finish();
371 ctx.chunks.push(chunk);
372
373 Ok(())
374}
375
376fn serialize_resources(
377 resources: &Resources<'_>,
378 ctx: &mut ExtractionContext<'_>,
379 writer: &mut impl ResourcesExt,
380) {
381 let ext_g_states = collect_resources(resources, |r| r.ext_g_states.clone());
382 let shadings = collect_resources(resources, |r| r.shadings.clone());
383 let patterns = collect_resources(resources, |r| r.patterns.clone());
384 let x_objects = collect_resources(resources, |r| r.x_objects.clone());
385 let color_spaces = collect_resources(resources, |r| r.color_spaces.clone());
386 let fonts = collect_resources(resources, |r| r.fonts.clone());
387 let properties = collect_resources(resources, |r| r.properties.clone());
388
389 if !(ext_g_states.is_empty()
390 && shadings.is_empty()
391 && patterns.is_empty()
392 && x_objects.is_empty()
393 && color_spaces.is_empty()
394 && properties.is_empty()
395 && fonts.is_empty())
396 {
397 let mut resources = writer.resources();
398
399 macro_rules! write {
400 ($name:ident, $key:expr) => {
401 if !$name.is_empty() {
402 let mut dict = resources.insert(Name($key)).dict();
403
404 for (name, obj) in $name {
405 obj.write_direct(dict.insert(Name(name.deref())), ctx);
406 }
407 }
408 };
409 }
410
411 write!(ext_g_states, EXT_G_STATE);
412 write!(shadings, SHADING);
413 write!(patterns, PATTERN);
414 write!(x_objects, XOBJECT);
415 write!(color_spaces, COLORSPACE);
416 write!(fonts, FONT);
417 write!(properties, PROPERTIES);
418 }
419}
420
421fn collect_resources<'a>(
422 resources: &Resources<'a>,
423 get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
424) -> BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>> {
425 let mut map = BTreeMap::new();
426 collect_resources_inner(resources, get_dict, &mut map);
427 map
428}
429
430fn collect_resources_inner<'a>(
431 resources: &Resources<'a>,
432 mut get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
433 map: &mut BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>>,
434) {
435 if let Some(parent) = resources.parent() {
439 collect_resources_inner(parent, get_dict.clone(), map);
440 }
441
442 let dict = get_dict(resources);
443
444 for (name, object) in dict.entries() {
445 map.insert(name, object);
446 }
447}
448
449pub(crate) fn deflate_encode(data: &[u8]) -> Vec<u8> {
450 use std::io::Write;
451
452 const COMPRESSION_LEVEL: u8 = 6;
453 let mut e = ZlibEncoder::new(Vec::new(), Compression::new(COMPRESSION_LEVEL as u32));
454 e.write_all(data).unwrap();
455 e.finish().unwrap()
456}
457
458fn convert_rect(hy_rect: &hayro_syntax::object::Rect) -> Rect {
459 Rect::new(
460 hy_rect.x0 as f32,
461 hy_rect.y0 as f32,
462 hy_rect.x1 as f32,
463 hy_rect.y1 as f32,
464 )
465}
466
/// Abstraction over `pdf_writer` writers that can start a resources
/// dictionary, so `serialize_resources` works for both pages and form
/// XObjects.
trait ResourcesExt {
    /// Start writing the resources dictionary of the underlying writer.
    fn resources(&mut self) -> pdf_writer::writers::Resources<'_>;
}

impl ResourcesExt for pdf_writer::writers::Page<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources<'_> {
        // Intended to forward to the writer's own `resources` method rather
        // than recurse into this trait method.
        Self::resources(self)
    }
}

impl ResourcesExt for pdf_writer::writers::FormXObject<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources<'_> {
        // Intended to forward to the writer's own `resources` method rather
        // than recurse into this trait method.
        Self::resources(self)
    }
}
482
483trait PageExt {
485 fn initial_transform(&self, invert_y: bool) -> Affine;
489}
490
491impl PageExt for Page<'_> {
492 fn initial_transform(&self, invert_y: bool) -> Affine {
493 let crop_box = self.intersected_crop_box();
494 let (_, base_height) = self.base_dimensions();
495 let (width, height) = self.render_dimensions();
496
497 let horizontal_t =
498 Affine::rotate(90.0_f64.to_radians()) * Affine::translate((0.0, -width as f64));
499 let flipped_horizontal_t =
500 Affine::translate((0.0, height as f64)) * Affine::rotate(-90.0_f64.to_radians());
501
502 let rotation_transform = match self.rotation() {
503 Rotation::None => Affine::IDENTITY,
504 Rotation::Horizontal => {
505 if invert_y {
506 horizontal_t
507 } else {
508 flipped_horizontal_t
509 }
510 }
511 Rotation::Flipped => {
512 Affine::scale(-1.0) * Affine::translate((-width as f64, -height as f64))
513 }
514 Rotation::FlippedHorizontal => {
515 if invert_y {
516 flipped_horizontal_t
517 } else {
518 horizontal_t
519 }
520 }
521 };
522
523 let inversion_transform = if invert_y {
524 Affine::new([1.0, 0.0, 0.0, -1.0, 0.0, base_height as f64])
525 } else {
526 Affine::IDENTITY
527 };
528
529 rotation_transform * inversion_transform * Affine::translate((-crop_box.x0, -crop_box.y0))
530 }
531}