1#![forbid(unsafe_code)]
9#![deny(missing_docs)]
10
11mod primitive;
12
13use crate::primitive::{WriteDirect, WriteIndirect};
14use flate2::Compression;
15use flate2::write::ZlibEncoder;
16use hayro_syntax::object::Dict;
17use hayro_syntax::object::Object;
18use hayro_syntax::object::dict::keys::{
19 COLORSPACE, EXT_G_STATE, FONT, GROUP, PATTERN, PROPERTIES, SHADING, XOBJECT,
20};
21use hayro_syntax::object::{MaybeRef, ObjRef};
22use hayro_syntax::page::{Resources, Rotation};
23use log::warn;
24use pdf_writer::{Chunk, Content, Filter, Finish, Name, Rect, Ref};
25use std::collections::{BTreeMap, HashMap, HashSet};
26use std::ops::Deref;
27use std::ops::DerefMut;
28
29pub use hayro_syntax::page::{Page, Pages};
30pub use hayro_syntax::{LoadPdfError, Pdf, PdfData, PdfVersion};
31
/// Extract objects from the given PDF document.
///
/// For each query, the referenced page is serialized either as a page object
/// or as a form XObject. `new_ref` must yield fresh, unused object references
/// for the output document; the first reference drawn from it is reserved for
/// the page tree parent (see [`ExtractionResult::page_tree_parent_ref`]).
///
/// # Errors
///
/// Returns [`ExtractionError::InvalidPageIndex`] if any query references a
/// page that does not exist in the document.
pub fn extract<'a>(
    pdf: &Pdf,
    new_ref: Box<dyn FnMut() -> Ref + 'a>,
    queries: &[ExtractionQuery],
) -> Result<ExtractionResult, ExtractionError> {
    let pages = pdf.pages();
    let mut ctx = ExtractionContext::new(new_ref, pdf);

    for query in queries {
        let page = pages
            .get(query.page_index)
            .ok_or(ExtractionError::InvalidPageIndex(query.page_index))?;

        // Each query gets its own root object in the output document.
        let root_ref = ctx.new_ref();

        let res = match query.query_type {
            ExtractionQueryType::XObject => write_xobject(page, root_ref, &mut ctx),
            ExtractionQueryType::Page => write_page(page, root_ref, query.page_index, &mut ctx),
        };

        ctx.root_refs.push(res.map(|_| root_ref));
    }

    // Serialize every indirect object that was referenced (transitively) by
    // the extracted content above.
    write_dependencies(pdf, &mut ctx);

    let mut global_chunk = Chunk::new();

    // Merge all per-object chunks into a single output chunk.
    for chunk in &ctx.chunks {
        global_chunk.extend(chunk)
    }

    Ok(ExtractionResult {
        chunk: global_chunk,
        root_refs: ctx.root_refs,
        page_tree_parent_ref: ctx.page_tree_parent_ref,
    })
}
72
/// The kind of object a page should be extracted as.
#[derive(Copy, Clone, Debug)]
pub enum ExtractionQueryType {
    /// Extract the page as a form XObject.
    XObject,
    /// Extract the page as a page object.
    Page,
}
82
/// A single extraction request against the source document.
#[derive(Copy, Clone, Debug)]
pub struct ExtractionQuery {
    /// How the page should be serialized (page object vs. form XObject).
    query_type: ExtractionQueryType,
    /// Zero-based index of the page in the source document.
    page_index: usize,
}
89
90impl ExtractionQuery {
91 pub fn new_page(page_index: usize) -> Self {
93 Self {
94 query_type: ExtractionQueryType::Page,
95 page_index,
96 }
97 }
98
99 pub fn new_xobject(page_index: usize) -> Self {
101 Self {
102 query_type: ExtractionQueryType::XObject,
103 page_index,
104 }
105 }
106}
107
/// An error that can occur during extraction.
#[derive(Debug, Copy, Clone)]
pub enum ExtractionError {
    /// A query referenced a page index that does not exist in the document.
    InvalidPageIndex(usize),
}
114
/// The result of an extraction run.
pub struct ExtractionResult {
    /// A chunk holding all serialized objects.
    pub chunk: Chunk,
    /// One entry per query, in query order: the reference of the extracted
    /// root object, or the error for that query.
    pub root_refs: Vec<Result<Ref, ExtractionError>>,
    /// The reference that extracted page objects use as their `/Parent`.
    /// The caller is responsible for writing a page tree node under this
    /// reference.
    pub page_tree_parent_ref: Ref,
}
124
/// Shared mutable state threaded through the whole extraction process.
struct ExtractionContext<'a> {
    /// One chunk per serialized object, merged at the end of `extract`.
    chunks: Vec<Chunk>,
    /// Source references that have already been serialized.
    visited_objects: HashSet<ObjRef>,
    /// Source references discovered during serialization that still need to
    /// be written out (drained by `write_dependencies`).
    to_visit_refs: Vec<ObjRef>,
    /// Cache of reference-validity checks.
    /// NOTE(review): not read anywhere in this file — presumably used by the
    /// `primitive` module; confirm before removing.
    valid_ref_cache: HashMap<ObjRef, bool>,
    /// One entry per query: the root reference of each extracted object.
    root_refs: Vec<Result<Ref, ExtractionError>>,
    /// The document objects are extracted from.
    pdf: &'a Pdf,
    /// Produces fresh references for the output document.
    new_ref: Box<dyn FnMut() -> Ref + 'a>,
    /// Maps source-document references to output-document references.
    ref_map: HashMap<ObjRef, Ref>,
    /// Deflate-encoded page content streams keyed by page index, so that
    /// extracting the same page twice reuses one stream object.
    cached_content_streams: HashMap<usize, Ref>,
    /// The reference reserved for the parent node of all extracted pages.
    page_tree_parent_ref: Ref,
}
137
138impl<'a> ExtractionContext<'a> {
139 fn new(mut new_ref: Box<dyn FnMut() -> Ref + 'a>, pdf: &'a Pdf) -> Self {
140 let page_tree_parent_ref = new_ref();
141 Self {
142 chunks: vec![],
143 visited_objects: HashSet::new(),
144 to_visit_refs: Vec::new(),
145 valid_ref_cache: HashMap::new(),
146 pdf,
147 new_ref,
148 ref_map: HashMap::new(),
149 cached_content_streams: HashMap::new(),
150 root_refs: Vec::new(),
151 page_tree_parent_ref,
152 }
153 }
154
155 pub(crate) fn map_ref(&mut self, ref_: ObjRef) -> Ref {
156 if let Some(ref_) = self.ref_map.get(&ref_) {
157 *ref_
158 } else {
159 let new_ref = self.new_ref();
160 self.ref_map.insert(ref_, new_ref);
161
162 new_ref
163 }
164 }
165
166 pub(crate) fn new_ref(&mut self) -> Ref {
167 (self.new_ref)()
168 }
169}
170
171fn write_dependencies(pdf: &Pdf, ctx: &mut ExtractionContext) {
172 while let Some(ref_) = ctx.to_visit_refs.pop() {
173 if ctx.visited_objects.contains(&ref_) {
175 continue;
176 }
177
178 let mut chunk = Chunk::new();
179 if let Some(object) = pdf.xref().get::<Object>(ref_.into()) {
180 let new_ref = ctx.map_ref(ref_);
181 object.write_indirect(&mut chunk, new_ref, ctx);
182 ctx.chunks.push(chunk);
183
184 ctx.visited_objects.insert(ref_);
185 } else {
186 warn!("failed to extract object with ref: {ref_:?}");
187 }
188 }
189}
190
/// Build a complete, standalone PDF containing the given pages of `hayro_pdf`.
///
/// Only intended for testing; panics if extraction fails.
#[doc(hidden)]
pub fn extract_pages_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
    let mut pdf = pdf_writer::Pdf::new();
    let mut next_ref = Ref::new(1);
    let requests = page_indices
        .iter()
        .map(|i| ExtractionQuery {
            query_type: ExtractionQueryType::Page,
            page_index: *i,
        })
        .collect::<Vec<_>>();

    let catalog_id = next_ref.bump();

    // `extract` draws its references (including the page tree parent) from
    // the same counter, so all references in the output are unique.
    let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();
    pdf.catalog(catalog_id)
        .pages(extracted.page_tree_parent_ref);
    let count = extracted.root_refs.len();
    // Write the page tree node that the extracted pages point at via /Parent.
    pdf.pages(extracted.page_tree_parent_ref)
        .kids(extracted.root_refs.iter().map(|r| r.unwrap()))
        .count(count as i32);
    pdf.extend(&extracted.chunk);

    pdf.finish()
}
218
/// Extract the given pages of `hayro_pdf` as form XObjects and build a new
/// PDF in which each page simply draws its corresponding XObject.
///
/// Only intended for testing; panics if extraction fails.
#[doc(hidden)]
pub fn extract_pages_as_xobject_to_pdf(hayro_pdf: &Pdf, page_indices: &[usize]) -> Vec<u8> {
    let hayro_pages = hayro_pdf.pages();
    let page_list = hayro_pages.as_ref();

    let mut pdf = pdf_writer::Pdf::new();
    let mut next_ref = Ref::new(1);

    let catalog_id = next_ref.bump();
    let requests = page_indices
        .iter()
        .map(|i| ExtractionQuery {
            query_type: ExtractionQueryType::XObject,
            page_index: *i,
        })
        .collect::<Vec<_>>();

    let extracted = extract(hayro_pdf, Box::new(|| next_ref.bump()), &requests).unwrap();

    pdf.catalog(catalog_id)
        .pages(extracted.page_tree_parent_ref);
    let mut page_refs = vec![];

    for (x_object_ref, page_idx) in extracted.root_refs.iter().zip(page_indices) {
        let page = &page_list[*page_idx];
        let render_dimensions = page.render_dimensions();

        // The page content just invokes the extracted XObject, which is
        // registered under the name `O1` in the page resources below.
        let mut content = Content::new();
        content.x_object(Name(b"O1"));

        let finished = content.finish();

        let page_id = next_ref.bump();
        let stream_id = next_ref.bump();
        page_refs.push(page_id);

        let mut page = pdf.page(page_id);
        page.resources()
            .x_objects()
            .pair(Name(b"O1"), x_object_ref.unwrap());
        page.media_box(Rect::new(
            0.0,
            0.0,
            render_dimensions.0,
            render_dimensions.1,
        ));
        page.parent(extracted.page_tree_parent_ref);
        page.contents(stream_id);
        page.finish();

        pdf.stream(stream_id, finished.as_slice());
    }

    let count = extracted.root_refs.len();
    // Write the page tree node that the wrapper pages point at via /Parent.
    pdf.pages(extracted.page_tree_parent_ref)
        .kids(page_refs)
        .count(count as i32);
    pdf.extend(&extracted.chunk);

    pdf.finish()
}
282
/// Serialize a page of the source document as a page object under `page_ref`.
///
/// The page's content stream is deflate-encoded and cached per page index, so
/// extracting the same page more than once reuses a single stream object.
fn write_page(
    page: &hayro_syntax::page::Page,
    page_ref: Ref,
    page_idx: usize,
    ctx: &mut ExtractionContext,
) -> Result<(), ExtractionError> {
    let mut chunk = Chunk::new();
    // Reuse the content stream if this page was already extracted before;
    // otherwise encode and write it now.
    let stream_ref = if let Some(cached) = ctx.cached_content_streams.get(&page_idx) {
        *cached
    } else {
        let stream_ref = ctx.new_ref();

        // A page without a content stream gets an empty (but valid) stream.
        chunk
            .stream(
                stream_ref,
                &deflate_encode(page.page_stream().unwrap_or(b"")),
            )
            .filter(Filter::FlateDecode);
        ctx.cached_content_streams.insert(page_idx, stream_ref);

        stream_ref
    };

    let mut pdf_page = chunk.page(page_ref);

    pdf_page
        .media_box(convert_rect(&page.media_box()))
        .crop_box(convert_rect(&page.crop_box()))
        .rotate(match page.rotation() {
            Rotation::None => 0,
            Rotation::Horizontal => 90,
            Rotation::Flipped => 180,
            Rotation::FlippedHorizontal => 270,
        })
        .parent(ctx.page_tree_parent_ref)
        .contents(stream_ref);

    let raw_dict = page.raw();

    // Carry over the page's /Group entry (transparency group) if present.
    if let Some(group) = raw_dict.get_raw::<Object>(GROUP) {
        group.write_direct(pdf_page.insert(pdf_writer::Name(GROUP)), ctx);
    }

    serialize_resources(page.resources(), ctx, &mut pdf_page);

    pdf_page.finish();

    ctx.chunks.push(chunk);

    Ok(())
}
336
/// Serialize a page of the source document as a form XObject under `xobj_ref`.
fn write_xobject(
    page: &hayro_syntax::page::Page,
    xobj_ref: Ref,
    ctx: &mut ExtractionContext,
) -> Result<(), ExtractionError> {
    let mut chunk = Chunk::new();
    // A page without a content stream yields an empty (but valid) XObject.
    let encoded_stream = deflate_encode(page.page_stream().unwrap_or(b""));
    let mut x_object = chunk.form_xobject(xobj_ref, &encoded_stream);
    // `filter` lives on the underlying stream writer, hence the deref.
    x_object.deref_mut().filter(Filter::FlateDecode);

    // The crop box becomes the XObject's /BBox.
    let bbox = page.crop_box();
    let initial_transform = page.initial_transform(false);

    x_object.bbox(pdf_writer::Rect::new(
        bbox.x0 as f32,
        bbox.y0 as f32,
        bbox.x1 as f32,
        bbox.y1 as f32,
    ));

    // Bake the page's initial transform into the XObject /Matrix.
    // NOTE(review): presumably this accounts for page rotation/origin —
    // confirm against `Page::initial_transform` in hayro-syntax.
    let i = initial_transform.as_coeffs();
    x_object.matrix([
        i[0] as f32,
        i[1] as f32,
        i[2] as f32,
        i[3] as f32,
        i[4] as f32,
        i[5] as f32,
    ]);

    serialize_resources(page.resources(), ctx, &mut x_object);

    x_object.finish();
    ctx.chunks.push(chunk);

    Ok(())
}
374
/// Write a page's resources into the given writer's `/Resources` dictionary.
///
/// Resources inherited from parent page tree nodes are flattened into one
/// dictionary per category; nothing is written if every category is empty.
fn serialize_resources(
    resources: &Resources,
    ctx: &mut ExtractionContext,
    writer: &mut impl ResourcesExt,
) {
    let ext_g_states = collect_resources(resources, |r| r.ext_g_states.clone());
    let shadings = collect_resources(resources, |r| r.shadings.clone());
    let patterns = collect_resources(resources, |r| r.patterns.clone());
    let x_objects = collect_resources(resources, |r| r.x_objects.clone());
    let color_spaces = collect_resources(resources, |r| r.color_spaces.clone());
    let fonts = collect_resources(resources, |r| r.fonts.clone());
    let properties = collect_resources(resources, |r| r.properties.clone());

    // Only emit a /Resources dictionary if at least one category is non-empty.
    if !(ext_g_states.is_empty()
        && shadings.is_empty()
        && patterns.is_empty()
        && x_objects.is_empty()
        && color_spaces.is_empty()
        && properties.is_empty()
        && fonts.is_empty())
    {
        let mut resources = writer.resources();

        // Writes one non-empty resource category as a sub-dictionary.
        // (Locally shadows `std::write!`, which is not used here.)
        macro_rules! write {
            ($name:ident, $key:expr) => {
                if !$name.is_empty() {
                    let mut dict = resources.insert(Name($key)).dict();

                    for (name, obj) in $name {
                        obj.write_direct(dict.insert(Name(name.deref())), ctx);
                    }
                }
            };
        }

        write!(ext_g_states, EXT_G_STATE);
        write!(shadings, SHADING);
        write!(patterns, PATTERN);
        write!(x_objects, XOBJECT);
        write!(color_spaces, COLORSPACE);
        write!(fonts, FONT);
        write!(properties, PROPERTIES);
    }
}
419
420fn collect_resources<'a>(
421 resources: &Resources<'a>,
422 get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
423) -> BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>> {
424 let mut map = BTreeMap::new();
425 collect_resources_inner(resources, get_dict, &mut map);
426 map
427}
428
429fn collect_resources_inner<'a>(
430 resources: &Resources<'a>,
431 mut get_dict: impl FnMut(&Resources<'a>) -> Dict<'a> + Clone,
432 map: &mut BTreeMap<hayro_syntax::object::Name<'a>, MaybeRef<Object<'a>>>,
433) {
434 if let Some(parent) = resources.parent() {
438 collect_resources_inner(parent, get_dict.clone(), map);
439 }
440
441 let dict = get_dict(resources);
442
443 for (name, object) in dict.entries() {
444 map.insert(name, object);
445 }
446}
447
448pub(crate) fn deflate_encode(data: &[u8]) -> Vec<u8> {
449 use std::io::Write;
450
451 const COMPRESSION_LEVEL: u8 = 6;
452 let mut e = ZlibEncoder::new(Vec::new(), Compression::new(COMPRESSION_LEVEL as u32));
453 e.write_all(data).unwrap();
454 e.finish().unwrap()
455}
456
457fn convert_rect(hy_rect: &hayro_syntax::object::Rect) -> pdf_writer::Rect {
458 Rect::new(
459 hy_rect.x0 as f32,
460 hy_rect.y0 as f32,
461 hy_rect.x1 as f32,
462 hy_rect.y1 as f32,
463 )
464}
465
/// Abstraction over `pdf-writer` writers that expose a `/Resources`
/// dictionary, so [`serialize_resources`] can serve both pages and XObjects.
trait ResourcesExt {
    /// Start writing the `/Resources` dictionary.
    fn resources(&mut self) -> pdf_writer::writers::Resources;
}
469
impl ResourcesExt for pdf_writer::writers::Page<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources {
        // Fully-qualified call delegates to the inherent `resources` method
        // (inherent methods take precedence over this same-named trait one).
        Self::resources(self)
    }
}
475
impl ResourcesExt for pdf_writer::writers::FormXObject<'_> {
    fn resources(&mut self) -> pdf_writer::writers::Resources {
        // Fully-qualified call delegates to the inherent `resources` method
        // (inherent methods take precedence over this same-named trait one).
        Self::resources(self)
    }
}