1use crate::error::{EngineError, Result};
5use crate::geometry::{self, PageGeometry};
6use crate::limits::ProcessingLimits;
7use crate::render::{self, ColorMode, RenderConfig, RenderOptions, RenderedPage};
8use crate::text::{TextBlock, TextExtractionDevice};
9use crate::thumbnail::ThumbnailOptions;
10
11use pdf_forms::parse::parse_acroform;
12use pdf_forms::tree::{FieldType, FieldValue};
13use pdf_render::pdf_interpret::PageExt;
14use pdf_render::pdf_interpret::{interpret_page, Context, InterpreterSettings};
15use pdf_render::pdf_syntax::object::dict::keys::{FIRST, NEXT, OUTLINES, TITLE};
16use pdf_render::pdf_syntax::object::Dict;
17use pdf_render::pdf_syntax::page::Page;
18use pdf_render::pdf_syntax::{Pdf, PdfLoadLimits};
19#[cfg(feature = "parallel")]
20use rayon::prelude::*;
21
22use kurbo::Rect;
23
/// Document-level metadata strings.
///
/// Every field is optional; the derived `Default` leaves all of them `None`.
#[derive(Debug, Clone, Default)]
pub struct DocumentInfo {
    /// Document title, if present.
    pub title: Option<String>,
    /// Document author, if present.
    pub author: Option<String>,
    /// Document subject, if present.
    pub subject: Option<String>,
    /// Document keywords, if present.
    pub keywords: Option<String>,
    /// Creator entry, if present.
    pub creator: Option<String>,
    /// Producer entry, if present.
    pub producer: Option<String>,
}
40
/// One entry of a document outline (bookmark tree).
#[derive(Debug, Clone)]
pub struct BookmarkItem {
    /// Display title of the entry.
    pub title: String,
    /// Target page of the entry, when known.
    /// NOTE(review): presumably a zero-based page index — confirm.
    pub page: Option<usize>,
    /// Nested entries below this one.
    pub children: Vec<BookmarkItem>,
}
51
/// A loaded PDF together with the interpreter settings used to process it.
pub struct PdfDocument {
    /// The parsed PDF.
    pdf: Pdf,
    /// Interpreter settings passed to rendering; text extraction uses a
    /// clone of these with `skip_signature_widgets` forced to `false`.
    settings: InterpreterSettings,
}
57
58impl PdfDocument {
59 pub fn open(data: impl Into<pdf_render::pdf_syntax::PdfData>) -> Result<Self> {
61 let pdf = Pdf::new(data).map_err(|e| match e {
62 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
63 EngineError::Encrypted(format!("{d:?}"))
64 }
65 _ => EngineError::InvalidPdf(format!("{e:?}")),
66 })?;
67 Ok(Self {
68 pdf,
69 settings: InterpreterSettings::default(),
70 })
71 }
72
73 pub fn open_with_processing_limits(
75 data: impl Into<pdf_render::pdf_syntax::PdfData>,
76 limits: ProcessingLimits,
77 ) -> Result<Self> {
78 let syntax_limits = PdfLoadLimits::new()
79 .max_object_depth(limits.max_object_depth)
80 .max_image_pixels(limits.max_image_pixels)
81 .max_stream_bytes(limits.max_stream_bytes);
82 let pdf = Pdf::new_with_limits(data, syntax_limits).map_err(|e| match e {
83 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
84 EngineError::Encrypted(format!("{d:?}"))
85 }
86 _ => EngineError::InvalidPdf(format!("{e:?}")),
87 })?;
88 let settings = InterpreterSettings {
89 max_operator_count: Some(limits.max_operator_count),
90 ..InterpreterSettings::default()
91 };
92 Ok(Self { pdf, settings })
93 }
94
95 pub fn open_with_password(
97 data: impl Into<pdf_render::pdf_syntax::PdfData>,
98 password: &str,
99 ) -> Result<Self> {
100 let pdf = Pdf::new_with_password(data, password).map_err(|e| match e {
101 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
102 EngineError::Encrypted(format!("{d:?}"))
103 }
104 _ => EngineError::InvalidPdf(format!("{e:?}")),
105 })?;
106 Ok(Self {
107 pdf,
108 settings: InterpreterSettings::default(),
109 })
110 }
111
112 pub fn open_with_password_and_processing_limits(
114 data: impl Into<pdf_render::pdf_syntax::PdfData>,
115 password: &str,
116 limits: ProcessingLimits,
117 ) -> Result<Self> {
118 let syntax_limits = PdfLoadLimits::new()
119 .max_object_depth(limits.max_object_depth)
120 .max_image_pixels(limits.max_image_pixels)
121 .max_stream_bytes(limits.max_stream_bytes);
122 let pdf = Pdf::new_with_password_and_limits(data, password, syntax_limits).map_err(
123 |e| match e {
124 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
125 EngineError::Encrypted(format!("{d:?}"))
126 }
127 _ => EngineError::InvalidPdf(format!("{e:?}")),
128 },
129 )?;
130 let settings = InterpreterSettings {
131 max_operator_count: Some(limits.max_operator_count),
132 ..InterpreterSettings::default()
133 };
134 Ok(Self { pdf, settings })
135 }
136
137 pub fn pdf(&self) -> &Pdf {
139 &self.pdf
140 }
141
142 pub fn set_settings(&mut self, settings: InterpreterSettings) {
144 self.settings = settings;
145 }
146
147 pub fn page_count(&self) -> usize {
149 self.pdf.pages().len()
150 }
151
152 pub fn page_geometry(&self, index: usize) -> Result<PageGeometry> {
154 let page = self.get_page(index)?;
155 Ok(geometry::extract_geometry(page))
156 }
157
158 pub fn render_page(&self, index: usize, options: &RenderOptions) -> Result<RenderedPage> {
166 #[cfg(feature = "xfa")]
167 if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
168 return flat_doc.render_page(index, options);
169 }
170 let page = self.get_page(index)?;
171 let (w, h) = page.render_dimensions();
175 if w <= 0.0 || h <= 0.0 {
176 return Err(EngineError::InvalidPageGeometry {
177 width: w,
178 height: h,
179 reason: "page has zero or negative dimensions".into(),
180 });
181 }
182 const MIN_PAGE_PT: f32 = 1.0;
185 if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
186 return Err(EngineError::InvalidPageGeometry {
187 width: w,
188 height: h,
189 reason: "page too small to render (< 1pt)".into(),
190 });
191 }
192 Ok(render::render_page(page, options, &self.settings))
193 }
194
195 pub fn render_page_with_config(
200 &self,
201 index: usize,
202 config: &RenderConfig,
203 ) -> Result<RenderedPage> {
204 #[cfg(feature = "xfa")]
205 if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
206 return flat_doc.render_page_with_config(index, config);
207 }
208 let page = self.get_page(index)?;
209 let (w, h) = page.render_dimensions();
210 if w <= 0.0 || h <= 0.0 {
211 return Err(EngineError::InvalidPageGeometry {
212 width: w,
213 height: h,
214 reason: "page has zero or negative dimensions".into(),
215 });
216 }
217 const MIN_PAGE_PT: f32 = 1.0;
218 if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
219 return Err(EngineError::InvalidPageGeometry {
220 width: w,
221 height: h,
222 reason: "page too small to render (< 1pt)".into(),
223 });
224 }
225 Ok(render::render_page_with_config(
226 page,
227 config,
228 &self.settings,
229 ))
230 }
231
232 pub fn render_page_cmyk(&self, index: usize, dpi: u32) -> Result<RenderedPage> {
234 self.render_page_with_config(
235 index,
236 &RenderConfig {
237 color_mode: ColorMode::PreserveCmyk,
238 dpi,
239 },
240 )
241 }
242
243 pub fn render_all(&self, options: &RenderOptions) -> Vec<RenderedPage> {
245 let pages = self.pdf.pages();
246 #[cfg(feature = "parallel")]
247 return (0..pages.len())
248 .into_par_iter()
249 .map(|i| render::render_page(&pages[i], options, &self.settings))
250 .collect();
251 #[cfg(not(feature = "parallel"))]
252 (0..pages.len())
253 .map(|i| render::render_page(&pages[i], options, &self.settings))
254 .collect()
255 }
256
257 pub fn render_all_with_config(&self, config: &RenderConfig) -> Vec<RenderedPage> {
259 let pages = self.pdf.pages();
260 #[cfg(feature = "parallel")]
261 return (0..pages.len())
262 .into_par_iter()
263 .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
264 .collect();
265 #[cfg(not(feature = "parallel"))]
266 (0..pages.len())
267 .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
268 .collect()
269 }
270
271 pub fn thumbnail(&self, index: usize, options: &ThumbnailOptions) -> Result<RenderedPage> {
273 let page = self.get_page(index)?;
274 Ok(render::render_thumbnail(
275 page,
276 options.max_dimension,
277 &self.settings,
278 ))
279 }
280
281 pub fn thumbnails_all(&self, options: &ThumbnailOptions) -> Vec<RenderedPage> {
283 let pages = self.pdf.pages();
284 #[cfg(feature = "parallel")]
285 return (0..pages.len())
286 .into_par_iter()
287 .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
288 .collect();
289 #[cfg(not(feature = "parallel"))]
290 (0..pages.len())
291 .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
292 .collect()
293 }
294
295 pub fn extract_text(&self, index: usize) -> Result<String> {
297 let page = self.get_page(index)?;
298 let mut device = TextExtractionDevice::new();
299 let mut ctx = self.create_context(page);
300 interpret_page(page, &mut ctx, &mut device);
301 Ok(device.into_text())
302 }
303
304 #[doc(hidden)]
306 pub fn extract_text_pages_reusing_settings<I>(&self, indices: I) -> Result<Vec<String>>
307 where
308 I: IntoIterator<Item = usize>,
309 {
310 let pages = self.pdf.pages();
311 let mut settings = self.text_extraction_settings();
312 let indices = indices.into_iter();
313 let (lower_bound, upper_bound) = indices.size_hint();
314 let mut texts = Vec::with_capacity(upper_bound.unwrap_or(lower_bound));
315
316 for index in indices {
317 let page = pages.get(index).ok_or(EngineError::PageOutOfRange {
318 index,
319 count: pages.len(),
320 })?;
321 let (text, next_settings) = Self::extract_text_with_settings(page, settings);
322 settings = next_settings;
323 texts.push(text);
324 }
325
326 Ok(texts)
327 }
328
329 pub fn extract_text_blocks(&self, index: usize) -> Result<Vec<TextBlock>> {
331 let page = self.get_page(index)?;
332 let mut device = TextExtractionDevice::new();
333 let mut ctx = self.create_context(page);
334 interpret_page(page, &mut ctx, &mut device);
335 Ok(device.into_blocks())
336 }
337
338 pub fn extract_all_text_blocks(&self) -> Vec<Vec<TextBlock>> {
340 let pages = self.pdf.pages();
341 let mut settings = self.text_extraction_settings();
342 let mut blocks = Vec::with_capacity(pages.len());
343
344 for page in pages.iter() {
345 let (page_blocks, next_settings) =
346 Self::extract_text_blocks_with_settings(page, settings);
347 settings = next_settings;
348 blocks.push(page_blocks);
349 }
350
351 blocks
352 }
353
354 pub fn extract_acroform_text(&self) -> String {
360 let Some(tree) = parse_acroform(&self.pdf) else {
361 return String::new();
362 };
363 let mut parts: Vec<String> = Vec::new();
364 for id in tree.all_ids() {
365 let node = tree.get(id);
366 if node.children.is_empty() {
367 let value_str = match &node.value {
369 Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.clone()),
370 Some(FieldValue::StringArray(arr)) => {
371 let joined = arr
372 .iter()
373 .filter(|s| !s.is_empty())
374 .cloned()
375 .collect::<Vec<_>>()
376 .join(", ");
377 if joined.is_empty() {
378 None
379 } else {
380 Some(joined)
381 }
382 }
383 _ => None,
384 };
385 let button_caption =
386 value_str.is_none() && tree.effective_field_type(id) == Some(FieldType::Button);
387 let extracted = value_str.or_else(|| {
388 button_caption.then(|| {
389 node.mk
390 .as_ref()
391 .and_then(|mk| mk.caption.as_ref())
392 .filter(|caption| !caption.is_empty())
393 .cloned()
394 })?
395 });
396 if let Some(s) = extracted {
397 parts.push(s);
398 }
399 }
400 }
401 parts.join("\n")
402 }
403
404 pub fn extract_all_text(&self) -> String {
407 let pages = self.pdf.pages();
408 let mut settings = self.text_extraction_settings();
409 let mut page_texts = Vec::with_capacity(pages.len());
410 for page in pages.iter() {
411 let (page_text, next_settings) = Self::extract_text_with_settings(page, settings);
412 settings = next_settings;
413 page_texts.push(page_text);
414 }
415
416 let mut text = join_page_texts(page_texts.iter().map(String::as_str));
417 let acroform = self.extract_acroform_text();
418 if !acroform.is_empty() {
419 if !text.is_empty() && !text.ends_with('\n') {
420 text.push('\n');
421 }
422 text.push_str(&acroform);
423 }
424 text
425 }
426
427 pub fn search_text(&self, query: &str) -> Vec<usize> {
429 let pages = self.pdf.pages();
430 let query_lower = query.to_lowercase();
431 #[cfg(feature = "parallel")]
432 let page_contains = |i: usize| -> Option<usize> {
433 let page = &pages[i];
434 let (text, _) = Self::extract_text_with_settings(page, self.text_extraction_settings());
435 if text.to_lowercase().contains(&query_lower) {
436 Some(i)
437 } else {
438 None
439 }
440 };
441 #[cfg(feature = "parallel")]
442 return (0..pages.len())
443 .into_par_iter()
444 .filter_map(page_contains)
445 .collect();
446 #[cfg(not(feature = "parallel"))]
447 {
448 let mut settings = self.text_extraction_settings();
449 let mut hits = Vec::new();
450 for (i, page) in pages.iter().enumerate() {
451 let (text, next_settings) = Self::extract_text_with_settings(page, settings);
452 settings = next_settings;
453 if text.to_lowercase().contains(&query_lower) {
454 hits.push(i);
455 }
456 }
457 hits
458 }
459 }
460
461 pub fn info(&self) -> DocumentInfo {
463 let meta = self.pdf.metadata();
464 DocumentInfo {
465 title: meta.title.as_ref().map(|b| bytes_to_string(b)),
466 author: meta.author.as_ref().map(|b| bytes_to_string(b)),
467 subject: meta.subject.as_ref().map(|b| bytes_to_string(b)),
468 keywords: meta.keywords.as_ref().map(|b| bytes_to_string(b)),
469 creator: meta.creator.as_ref().map(|b| bytes_to_string(b)),
470 producer: meta.producer.as_ref().map(|b| bytes_to_string(b)),
471 }
472 }
473
474 pub fn bookmarks(&self) -> Vec<BookmarkItem> {
476 let xref = self.pdf.xref();
477 let root_id = xref.root_id();
478 let catalog: Dict<'_> = match xref.get(root_id) {
479 Some(d) => d,
480 None => return Vec::new(),
481 };
482
483 let outlines: Dict<'_> = match catalog.get(OUTLINES) {
484 Some(d) => d,
485 None => return Vec::new(),
486 };
487
488 let first: Dict<'_> = match outlines.get(FIRST) {
489 Some(d) => d,
490 None => return Vec::new(),
491 };
492
493 parse_outline_items(&first)
494 }
495
496 pub fn ocr_page(
515 &self,
516 index: usize,
517 backend: &dyn crate::ocr::OcrBackend,
518 dpi: f64,
519 ) -> crate::error::Result<crate::ocr::OcrResult> {
520 let opts = crate::render::RenderOptions {
521 dpi,
522 ..Default::default()
523 };
524 let rendered = self.render_page(index, &opts)?;
525
526 let mut rgb = Vec::with_capacity((rendered.width * rendered.height * 3) as usize);
528 for chunk in rendered.pixels.chunks(4) {
529 rgb.push(chunk[0]);
530 rgb.push(chunk[1]);
531 rgb.push(chunk[2]);
532 }
533
534 backend
535 .recognize(&rgb, rendered.width, rendered.height)
536 .map_err(|e| crate::error::EngineError::RenderError(e.to_string()))
537 }
538
539 fn get_page(&self, index: usize) -> Result<&Page<'_>> {
540 let pages = self.pdf.pages();
541 if index >= pages.len() {
542 return Err(EngineError::PageOutOfRange {
543 index,
544 count: pages.len(),
545 });
546 }
547 Ok(&pages[index])
548 }
549
550 fn text_extraction_settings(&self) -> InterpreterSettings {
551 let mut settings = self.settings.clone();
552 settings.skip_signature_widgets = false;
555 settings
556 }
557
558 fn create_context<'a>(&self, page: &Page<'a>) -> Context<'a> {
559 Self::create_context_with_settings(page, self.text_extraction_settings())
560 }
561
562 fn create_context_with_settings<'a>(
563 page: &Page<'a>,
564 settings: InterpreterSettings,
565 ) -> Context<'a> {
566 let (w, h) = page.render_dimensions();
567 Context::new(
568 page.initial_transform(false),
569 Rect::new(0.0, 0.0, w as f64, h as f64),
570 page.xref(),
571 settings,
572 )
573 }
574
575 fn extract_text_with_settings<'a>(
576 page: &Page<'a>,
577 settings: InterpreterSettings,
578 ) -> (String, InterpreterSettings) {
579 let mut device = TextExtractionDevice::new();
580 let mut ctx = Self::create_context_with_settings(page, settings);
581 interpret_page(page, &mut ctx, &mut device);
582 let settings = ctx.into_settings();
583 (device.into_text(), settings)
584 }
585
586 fn extract_text_blocks_with_settings<'a>(
587 page: &Page<'a>,
588 settings: InterpreterSettings,
589 ) -> (Vec<TextBlock>, InterpreterSettings) {
590 let mut device = TextExtractionDevice::new();
591 let mut ctx = Self::create_context_with_settings(page, settings);
592 interpret_page(page, &mut ctx, &mut device);
593 let settings = ctx.into_settings();
594 (device.into_blocks(), settings)
595 }
596
597 #[cfg(feature = "xfa")]
598 fn open_flattened_xfa_for_render(&self) -> Option<Self> {
599 if !crate::xfa::has_xfa(self) {
600 return None;
601 }
602
603 let flat_bytes = crate::xfa::flatten(self).ok()?;
604 let mut flat_doc = Self::open(flat_bytes).ok()?;
605 flat_doc.settings = self.settings.clone();
606 Some(flat_doc)
607 }
608}
609
/// Concatenates per-page texts into one string, separating consecutive pages
/// with a form feed (`U+000C`).
///
/// When the page before a separator has text, newlines are appended until the
/// output ends with a blank line (`"\n\n"`) before the form feed is written.
/// A blank page contributes nothing but its separator, wherever it appears in
/// the document.
fn join_page_texts<I>(page_texts: I) -> String
where
    I: IntoIterator,
    I::Item: AsRef<str>,
{
    let mut text = String::new();
    // `None` until the first page has been appended; afterwards records
    // whether the most recently appended page had any text.
    let mut previous_had_text: Option<bool> = None;

    for page_text in page_texts {
        let page_text = page_text.as_ref();

        if let Some(had_text) = previous_had_text {
            // Pad based on the previous page only. Looking at the whole
            // accumulated output would insert newlines after an earlier form
            // feed whenever a blank page follows a non-leading position.
            if had_text {
                while !text.ends_with("\n\n") {
                    text.push('\n');
                }
            }
            text.push('\u{000C}');
        }

        text.push_str(page_text);
        previous_had_text = Some(!page_text.is_empty());
    }

    text
}
631
#[cfg(test)]
mod extract_all_text_tests {
    use super::join_page_texts;

    /// Two non-empty pages are separated by a blank line and a form feed.
    #[test]
    fn separates_nonempty_pages_like_pdftotext() {
        let joined = join_page_texts(["Page 1", "Page 2"]);
        let expected = "Page 1\n\n\u{000C}Page 2";
        assert_eq!(joined, expected);
    }

    /// A leading blank page contributes only its form feed.
    #[test]
    fn preserves_leading_blank_pages_without_extra_newlines() {
        let joined = join_page_texts(["", "Page 2"]);
        let expected = "\u{000C}Page 2";
        assert_eq!(joined, expected);
    }

    /// A page that already ends with a blank line gets no further padding.
    #[test]
    fn reuses_existing_blank_line_before_form_feed() {
        let joined = join_page_texts(["Page 1\n\n", "Page 2"]);
        let expected = "Page 1\n\n\u{000C}Page 2";
        assert_eq!(joined, expected);
    }
}
657
658fn parse_outline_items(item_dict: &Dict<'_>) -> Vec<BookmarkItem> {
660 let mut items = Vec::new();
661 let mut current: Option<Dict<'_>> = Some(item_dict.clone());
662
663 while let Some(dict) = current {
664 let title = dict
665 .get::<pdf_render::pdf_syntax::object::String>(TITLE)
666 .map(|s| bytes_to_string(s.as_bytes()))
667 .unwrap_or_default();
668
669 let children = match dict.get::<Dict<'_>>(FIRST) {
670 Some(child_dict) => parse_outline_items(&child_dict),
671 None => Vec::new(),
672 };
673
674 items.push(BookmarkItem {
675 title,
676 page: None, children,
678 });
679
680 current = dict.get::<Dict<'_>>(NEXT);
681 }
682
683 items
684}
685
/// Decodes a PDF text string into a Rust `String`.
///
/// Decoding rules, in order:
/// 1. A leading `FE FF` byte-order mark selects UTF-16BE; the payload is
///    decoded lossily and an odd trailing byte is ignored.
/// 2. A leading `EF BB BF` byte-order mark (UTF-8) is stripped so it does not
///    surface as a stray `U+FEFF` at the start of the result.
/// 3. The remaining bytes are used as-is when they are valid UTF-8.
/// 4. Otherwise every byte is mapped to the Unicode code point of the same
///    value (Latin-1).
fn bytes_to_string(bytes: &[u8]) -> String {
    if let Some(utf16_payload) = bytes.strip_prefix(b"\xFE\xFF") {
        let units: Vec<u16> = utf16_payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    let bytes = bytes.strip_prefix(b"\xEF\xBB\xBF").unwrap_or(bytes);

    match std::str::from_utf8(bytes) {
        Ok(s) => s.to_string(),
        Err(_) => bytes.iter().map(|&b| char::from(b)).collect(),
    }
}
709
710#[cfg(test)]
711mod tests {
712 use super::*;
713 use crate::render::{ColorMode, PixelFormat, RenderConfig, RenderOptions};
714 use lopdf::{Document as LoDocument, Object};
715 use std::path::PathBuf;
716
717 fn corpus_path(name: &str) -> PathBuf {
718 PathBuf::from(env!("CARGO_MANIFEST_DIR"))
719 .join("../../corpus")
720 .join(name)
721 }
722
723 fn normalize_text(text: &str) -> String {
724 text.split_whitespace().collect::<Vec<_>>().join(" ")
725 }
726
    /// Loads `data` with lopdf and removes the `ToUnicode` entry from every
    /// dictionary object that has `Subtype` `Type0`, an `Encoding` name of
    /// `Identity-H` or `Identity-V`, and a first descendant font whose
    /// `Subtype` is `CIDFontType2`.
    ///
    /// Returns the re-saved document bytes and the number of `ToUnicode`
    /// entries removed. Panics if the document cannot be loaded or saved.
    fn strip_type0_tounicode(data: &[u8]) -> (Vec<u8>, usize) {
        /// Returns a copy of the name stored under `key`, or `None` when the
        /// key is missing or holds a non-name object.
        fn get_name(dict: &lopdf::Dictionary, key: &[u8]) -> Option<Vec<u8>> {
            match dict.get(key).ok()? {
                Object::Name(name) => Some(name.clone()),
                _ => None,
            }
        }

        /// Checks whether the first `DescendantFonts` entry of `type0` is a
        /// reference to a dictionary whose `Subtype` is `CIDFontType2`.
        fn descendant_is_cidfont_type2(doc: &LoDocument, type0: &lopdf::Dictionary) -> bool {
            let Some(Object::Array(descendants)) = type0.get(b"DescendantFonts").ok() else {
                return false;
            };
            let Some(Object::Reference(desc_id)) = descendants.first() else {
                return false;
            };
            let Ok(Object::Dictionary(descendant)) = doc.get_object(*desc_id) else {
                return false;
            };
            matches!(
                descendant.get(b"Subtype").ok(),
                Some(Object::Name(name)) if name.as_slice() == b"CIDFontType2"
            )
        }

        let mut doc = LoDocument::load_mem(data).expect("load stripped-to-unicode fixture");
        // Snapshot the ids first so the object map can be mutated in the loop.
        let ids: Vec<_> = doc.objects.keys().copied().collect();
        let mut removed = 0usize;

        for id in ids {
            // Inspect with a shared borrow; the mutable lookup happens only
            // after all filters have passed.
            let Some(Object::Dictionary(dict)) = doc.objects.get(&id) else {
                continue;
            };
            if !matches!(
                dict.get(b"Subtype").ok(),
                Some(Object::Name(name)) if name.as_slice() == b"Type0"
            ) {
                continue;
            }
            if !matches!(
                get_name(dict, b"Encoding").as_deref(),
                Some(b"Identity-H") | Some(b"Identity-V")
            ) {
                continue;
            }
            if !descendant_is_cidfont_type2(&doc, dict) {
                continue;
            }

            if let Some(Object::Dictionary(type0)) = doc.objects.get_mut(&id) {
                if type0.has(b"ToUnicode") {
                    type0.remove(b"ToUnicode");
                    removed += 1;
                }
            }
        }

        let mut out = Vec::new();
        doc.save_to(&mut out)
            .expect("save stripped-to-unicode fixture");
        (out, removed)
    }
788
    /// Builds a one-page PDF (MediaBox `0 0 72 72`) whose content stream is
    /// `color_operator` followed by `0 0 72 72 re f`, and returns the saved
    /// bytes. Panics if saving fails.
    fn solid_fill_pdf_bytes(color_operator: &str) -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");

        // Ids for the page tree are reserved before the content stream is
        // added so the page and its parent can reference each other.
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        let content = format!("{color_operator}\n0 0 72 72 re f\n");
        let content_id = doc.add_object(Stream::new(dictionary! {}, content.into_bytes()));

        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Page".to_vec()),
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![
                    Object::Integer(0),
                    Object::Integer(0),
                    Object::Integer(72),
                    Object::Integer(72),
                ]),
                "Contents" => Object::Reference(content_id),
            }),
        );

        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );

        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Catalog".to_vec()),
                "Pages" => Object::Reference(pages_id),
            }),
        );

        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes).expect("save solid fill fixture");
        bytes
    }
838
    /// Builds a one-page PDF (MediaBox `0 0 72 72`) that fills the rectangle
    /// `0 0 36 72` after `1 0 0 rg` and the rectangle `36 0 36 72` after
    /// `1 0 0 0 k`, and returns the saved bytes. Panics if saving fails.
    fn mixed_rgb_cmyk_pdf_bytes() -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        // Left half uses the `rg` colour operator, right half uses `k`.
        let content = b"1 0 0 rg\n0 0 36 72 re f\n1 0 0 0 k\n36 0 36 72 re f\n".to_vec();
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
                "Contents" => Object::Reference(content_id),
            }),
        );
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes)
            .expect("save mixed rgb/cmyk fixture");
        bytes
    }
880
    /// Builds a one-page PDF (MediaBox `0 0 72 72`) that selects an
    /// `ExtGState` with `ca` = 0.5 and then fills the whole page after
    /// `1 0 0 0 k`; returns the saved bytes. Panics if saving fails.
    fn transparent_cmyk_pdf_bytes() -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        // Graphics state referenced from the page resources as `GS1`.
        let gs_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "ExtGState",
            "ca" => Object::Real(0.5),
        }));
        let content = b"/GS1 gs\n1 0 0 0 k\n0 0 72 72 re f\n".to_vec();
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
                "Resources" => dictionary! {
                    "ExtGState" => dictionary! {
                        "GS1" => Object::Reference(gs_id),
                    },
                },
                "Contents" => Object::Reference(content_id),
            }),
        );
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes)
            .expect("save transparent cmyk fixture");
        bytes
    }
931
    /// Builds a one-page PDF (MediaBox `0 0 2 1`) that draws a 2x1,
    /// 8-bits-per-component `DeviceCMYK` image XObject scaled to cover the
    /// page; returns the saved bytes. Panics if saving fails.
    fn cmyk_image_pdf_bytes() -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        // Two pixels of four components each: (255, 0, 0, 0) and (0, 255, 0, 0).
        let image_id = doc.add_object(Stream::new(
            dictionary! {
                "Type" => "XObject",
                "Subtype" => "Image",
                "Width" => Object::Integer(2),
                "Height" => Object::Integer(1),
                "BitsPerComponent" => Object::Integer(8),
                "ColorSpace" => "DeviceCMYK",
            },
            vec![255, 0, 0, 0, 0, 255, 0, 0],
        ));
        // Scale the unit image square to 2x1 before painting it.
        let content = b"q\n2 0 0 1 0 0 cm\n/Im1 Do\nQ\n".to_vec();
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 2.into(), 1.into()]),
                "Resources" => dictionary! {
                    "XObject" => dictionary! {
                        "Im1" => Object::Reference(image_id),
                    },
                },
                "Contents" => Object::Reference(content_id),
            }),
        );
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes).expect("save cmyk image fixture");
        bytes
    }
988
989 fn pixel_at(rendered: &RenderedPage, x: u32, y: u32) -> [u8; 4] {
990 let idx = ((y * rendered.width + x) * 4) as usize;
991 [
992 rendered.pixels[idx],
993 rendered.pixels[idx + 1],
994 rendered.pixels[idx + 2],
995 rendered.pixels[idx + 3],
996 ]
997 }
998
    /// Builds a one-page PDF (MediaBox `0 0 612 792`) that shows `text_bytes`
    /// with a `TrueType` font dictionary carrying only `Type`, `Subtype`,
    /// `Name`, `BaseFont` (`base_font`) and `Encoding` (`encoding`) — no font
    /// program is attached. Returns the saved bytes; panics if saving fails.
    fn non_embedded_truetype_pdf_bytes(
        base_font: &[u8],
        encoding: &[u8],
        text_bytes: &[u8],
    ) -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");

        let font_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "Font",
            "Subtype" => "TrueType",
            "Name" => Object::Name(b"F0".to_vec()),
            "BaseFont" => Object::Name(base_font.to_vec()),
            "Encoding" => Object::Name(encoding.to_vec()),
        }));

        let resources_id = doc.add_object(Object::Dictionary(dictionary! {
            "Font" => dictionary! { "F0" => Object::Reference(font_id) },
        }));

        let mut content = Vec::new();
        content.extend_from_slice(b"BT\n/F0 12 Tf\n100 700 Td\n(");
        // Backslash-escape the bytes that would otherwise end or alter the
        // literal string.
        for &b in text_bytes {
            match b {
                b'(' | b')' | b'\\' => {
                    content.push(b'\\');
                    content.push(b);
                }
                _ => content.push(b),
            }
        }
        content.extend_from_slice(b") Tj\nET\n");
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        // Reserve the page-tree id first so the page can name its parent.
        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "Page",
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
            "Resources" => Object::Reference(resources_id),
            "Contents" => Object::Reference(content_id),
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes).expect("save non-embedded fixture");
        bytes
    }
1064
    /// Builds a one-page PDF with an empty content stream and a single
    /// AcroForm field: a `Btn` widget annotation whose `MK` dictionary carries
    /// `caption` under `CA`. Returns the saved bytes; panics if saving fails.
    fn push_button_caption_pdf_bytes(caption: &[u8]) -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream, StringFormat};

        let mut doc = Document::with_version("1.4");

        // All ids are reserved up front because the objects reference each
        // other in both directions.
        let catalog_id = doc.new_object_id();
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        let acroform_id = doc.new_object_id();
        let content_id = doc.new_object_id();
        let widget_id = doc.new_object_id();

        doc.objects.insert(
            content_id,
            Object::Stream(Stream::new(dictionary! {}, Vec::new())),
        );
        doc.objects.insert(
            widget_id,
            Object::Dictionary(dictionary! {
                "Type" => "Annot",
                "Subtype" => "Widget",
                "FT" => "Btn",
                // NOTE(review): presumably the push-button field flag —
                // confirm against the PDF specification.
                "Ff" => Object::Integer(1 << 16),
                "T" => Object::String(b"button".to_vec(), StringFormat::Literal),
                "MK" => dictionary! {
                    "CA" => Object::String(caption.to_vec(), StringFormat::Literal),
                },
                "Rect" => Object::Array(vec![100.into(), 700.into(), 260.into(), 730.into()]),
                "P" => Object::Reference(page_id),
            }),
        );
        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
                "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
                "Contents" => Object::Reference(content_id),
            }),
        );
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        doc.objects.insert(
            acroform_id,
            Object::Dictionary(dictionary! {
                "Fields" => Object::Array(vec![Object::Reference(widget_id)]),
            }),
        );
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
                "AcroForm" => Object::Reference(acroform_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes)
            .expect("save push-button caption fixture");
        bytes
    }
1137
1138 #[test]
1139 fn extract_text_non_embedded_truetype_alias_resolves_via_winansi() {
1140 let bytes = non_embedded_truetype_pdf_bytes(
1146 b"TimesNewRoman",
1147 b"WinAnsiEncoding",
1148 b"UNITED STATES DISTRICT COURT",
1149 );
1150 let text = PdfDocument::open(bytes)
1151 .expect("open non-embedded TrueType fixture")
1152 .extract_text(0)
1153 .expect("extract non-embedded TrueType text");
1154 let norm = normalize_text(&text);
1155 assert!(
1156 norm.contains("UNITED STATES DISTRICT COURT"),
1157 "expected WinAnsi-decoded text, got: {norm:?}"
1158 );
1159 }
1160
1161 #[test]
1162 fn extract_text_non_embedded_truetype_unknown_name_still_decodes() {
1163 let bytes = non_embedded_truetype_pdf_bytes(
1170 b"OpaqueCustomXYZ",
1171 b"WinAnsiEncoding",
1172 b"Hello, world!",
1173 );
1174 let text = PdfDocument::open(bytes)
1175 .expect("open custom non-embedded fixture")
1176 .extract_text(0)
1177 .expect("extract custom non-embedded text");
1178 let norm = normalize_text(&text);
1179 assert!(
1180 norm.contains("Hello, world!"),
1181 "expected WinAnsi-decoded text, got: {norm:?}"
1182 );
1183 }
1184
1185 #[test]
1186 fn extract_acroform_text_includes_push_button_mk_caption() {
1187 let bytes = push_button_caption_pdf_bytes(b"Don't cry over spilt milk");
1188 let doc = PdfDocument::open(bytes).expect("open push-button caption fixture");
1189
1190 let page_text = doc.extract_text(0).expect("extract page text");
1191 assert!(
1192 normalize_text(&page_text).is_empty(),
1193 "expected empty page content stream, got: {page_text:?}"
1194 );
1195
1196 let acroform_text = doc.extract_acroform_text();
1197 assert_eq!(normalize_text(&acroform_text), "Don't cry over spilt milk");
1198
1199 let all_text = doc.extract_all_text();
1200 assert_eq!(normalize_text(&all_text), "Don't cry over spilt milk");
1201 }
1202
1203 #[test]
1204 fn bytes_to_string_utf8() {
1205 assert_eq!(bytes_to_string(b"hello"), "hello");
1206 }
1207
1208 #[test]
1209 fn bytes_to_string_latin1() {
1210 let bytes = &[0xC4, 0xD6, 0xDC]; let s = bytes_to_string(bytes);
1212 assert_eq!(s, "ÄÖÜ");
1213 }
1214
1215 #[test]
1216 fn bytes_to_string_utf16() {
1217 let bytes = &[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]; assert_eq!(bytes_to_string(bytes), "Hi");
1219 }
1220
1221 #[test]
1222 fn document_info_default() {
1223 let info = DocumentInfo::default();
1224 assert!(info.title.is_none());
1225 assert!(info.author.is_none());
1226 }
1227
1228 #[test]
1229 fn bookmark_item_children() {
1230 let item = BookmarkItem {
1231 title: "Root".into(),
1232 page: None,
1233 children: vec![BookmarkItem {
1234 title: "Child".into(),
1235 page: Some(0),
1236 children: Vec::new(),
1237 }],
1238 };
1239 assert_eq!(item.children.len(), 1);
1240 assert_eq!(item.children[0].title, "Child");
1241 }
1242
1243 #[test]
1244 fn extract_text_type0_without_tounicode_uses_font_program_fallback() {
1245 let original = std::fs::read(corpus_path("sf181.pdf")).expect("read sf181 fixture");
1246 let expected = PdfDocument::open(original.clone())
1247 .expect("open original sf181")
1248 .extract_text(0)
1249 .expect("extract original sf181 text");
1250 assert!(
1251 expected.contains("Guide to Personnel Data Standards"),
1252 "unexpected baseline extraction: {expected}"
1253 );
1254
1255 let (stripped, removed) = strip_type0_tounicode(&original);
1256 assert!(
1257 removed > 0,
1258 "expected to strip at least one Type0 ToUnicode"
1259 );
1260
1261 let actual = PdfDocument::open(stripped)
1262 .expect("open stripped sf181")
1263 .extract_text(0)
1264 .expect("extract stripped sf181 text");
1265
1266 let actual_norm = normalize_text(&actual);
1267 let expected_norm = normalize_text(&expected);
1268
1269 assert!(
1270 actual_norm.contains("Guide to Personnel Data Standards"),
1271 "missing main heading after stripping ToUnicode: {actual_norm}"
1272 );
1273 assert!(
1274 actual_norm.contains("Privacy Act Statement"),
1275 "missing body text after stripping ToUnicode: {actual_norm}"
1276 );
1277 assert!(
1278 actual_norm.len() + 32 >= expected_norm.len(),
1279 "too much text lost after stripping ToUnicode: expected {} chars, got {}",
1280 expected_norm.len(),
1281 actual_norm.len()
1282 );
1283 }
1284
1285 #[test]
1286 fn extract_text_identity_h_bogus_tounicode_recovers_via_identity_fallback() {
1287 let bytes =
1295 std::fs::read(corpus_path("PDFBOX-4322-3.pdf")).expect("read PDFBOX-4322-3 fixture");
1296 let doc = PdfDocument::open(bytes).expect("open PDFBOX-4322-3");
1297 let text = doc.extract_all_text();
1298
1299 let norm = normalize_text(&text);
1300 assert!(
1301 norm.contains("Transatlantic Council"),
1302 "expected Identity-H codes to resolve as Unicode: {norm}"
1303 );
1304 assert!(
1305 norm.contains("Boy Scouts of America"),
1306 "expected body text to be recovered: {norm}"
1307 );
1308 }
1309
1310 #[test]
1311 fn render_page_with_config_srgb_matches_legacy_render_page() {
1312 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open rgb fixture");
1313 let legacy = doc
1314 .render_page(
1315 0,
1316 &RenderOptions {
1317 dpi: 72.0,
1318 ..Default::default()
1319 },
1320 )
1321 .expect("legacy render succeeds");
1322 let configured = doc
1323 .render_page_with_config(
1324 0,
1325 &RenderConfig {
1326 color_mode: ColorMode::Srgb,
1327 dpi: 72,
1328 },
1329 )
1330 .expect("configured render succeeds");
1331
1332 assert_eq!(legacy.width, configured.width);
1333 assert_eq!(legacy.height, configured.height);
1334 assert_eq!(legacy.pixel_format, PixelFormat::Rgba8);
1335 assert_eq!(configured.pixel_format, PixelFormat::Rgba8);
1336 assert_eq!(legacy.pixels, configured.pixels);
1337 }
1338
1339 #[test]
1340 fn render_page_with_config_preserve_cmyk_returns_cmyk_buffer() {
1341 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1342 let rendered = doc
1343 .render_page_with_config(
1344 0,
1345 &RenderConfig {
1346 color_mode: ColorMode::PreserveCmyk,
1347 dpi: 72,
1348 },
1349 )
1350 .expect("cmyk render succeeds");
1351
1352 assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1353 assert_eq!(
1354 rendered.pixels.len(),
1355 rendered.width as usize * rendered.height as usize * 4
1356 );
1357 assert_eq!(
1358 pixel_at(&rendered, rendered.width / 2, rendered.height / 2),
1359 crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1360 );
1361 }
1362
1363 #[test]
1364 fn render_page_with_config_simulate_cmyk_does_not_panic_on_cmyk_pdf() {
1365 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1366 let rendered = doc
1367 .render_page_with_config(
1368 0,
1369 &RenderConfig {
1370 color_mode: ColorMode::SimulateCmyk,
1371 dpi: 72,
1372 },
1373 )
1374 .expect("simulate cmyk render succeeds");
1375
1376 assert_eq!(rendered.pixel_format, PixelFormat::Rgba8);
1377 assert!(!rendered.pixels.is_empty());
1378 }
1379
1380 #[test]
1381 fn render_page_with_config_preserve_cmyk_mixed_page_preserves_only_cmyk_region() {
1382 let doc = PdfDocument::open(mixed_rgb_cmyk_pdf_bytes()).expect("open mixed fixture");
1383 let rendered = doc
1384 .render_page_with_config(
1385 0,
1386 &RenderConfig {
1387 color_mode: ColorMode::PreserveCmyk,
1388 dpi: 72,
1389 },
1390 )
1391 .expect("mixed render succeeds");
1392
1393 assert_eq!(
1394 pixel_at(&rendered, 54, 36),
1395 crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1396 );
1397 assert_ne!(
1398 pixel_at(&rendered, 18, 36),
1399 crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1400 );
1401 }
1402
1403 #[test]
1404 fn render_page_with_config_preserve_cmyk_transparent_page_does_not_crash() {
1405 let doc =
1406 PdfDocument::open(transparent_cmyk_pdf_bytes()).expect("open transparent cmyk fixture");
1407 let rendered = doc
1408 .render_page_with_config(
1409 0,
1410 &RenderConfig {
1411 color_mode: ColorMode::PreserveCmyk,
1412 dpi: 72,
1413 },
1414 )
1415 .expect("transparent cmyk render succeeds");
1416
1417 assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1418 assert_eq!(
1419 rendered.pixels.len(),
1420 rendered.width as usize * rendered.height as usize * 4
1421 );
1422 }
1423
1424 #[test]
1425 fn render_page_with_config_preserve_cmyk_keeps_device_cmyk_image_bytes() {
1426 let doc = PdfDocument::open(cmyk_image_pdf_bytes()).expect("open cmyk image fixture");
1427 let rendered = doc
1428 .render_page_with_config(
1429 0,
1430 &RenderConfig {
1431 color_mode: ColorMode::PreserveCmyk,
1432 dpi: 72,
1433 },
1434 )
1435 .expect("cmyk image render succeeds");
1436
1437 assert_eq!(rendered.width, 2);
1438 assert_eq!(rendered.height, 1);
1439 assert_eq!(pixel_at(&rendered, 0, 0), [255, 0, 0, 0]);
1440 assert_eq!(pixel_at(&rendered, 1, 0), [0, 255, 0, 0]);
1441 }
1442}