1use crate::error::{EngineError, Result};
5use crate::geometry::{self, PageGeometry};
6use crate::render::{self, ColorMode, RenderConfig, RenderOptions, RenderedPage};
7use crate::text::{TextBlock, TextExtractionDevice};
8use crate::thumbnail::ThumbnailOptions;
9
10use pdf_forms::parse::parse_acroform;
11use pdf_forms::tree::{FieldType, FieldValue};
12use pdf_render::pdf_interpret::PageExt;
13use pdf_render::pdf_interpret::{interpret_page, Context, InterpreterSettings};
14use pdf_render::pdf_syntax::object::dict::keys::{FIRST, NEXT, OUTLINES, TITLE};
15use pdf_render::pdf_syntax::object::Dict;
16use pdf_render::pdf_syntax::page::Page;
17use pdf_render::pdf_syntax::Pdf;
18#[cfg(feature = "parallel")]
19use rayon::prelude::*;
20
21use kurbo::Rect;
22
/// Document-level metadata strings, as produced by [`PdfDocument::info`].
///
/// Every field is optional; `Default` yields a value with all fields `None`.
#[derive(Debug, Clone, Default)]
pub struct DocumentInfo {
    /// Document title, if the metadata carries one.
    pub title: Option<String>,
    /// Document author, if present.
    pub author: Option<String>,
    /// Document subject, if present.
    pub subject: Option<String>,
    /// Keywords string, if present (kept as a single undivided string).
    pub keywords: Option<String>,
    /// Creator entry, if present.
    pub creator: Option<String>,
    /// Producer entry, if present.
    pub producer: Option<String>,
}
39
/// One entry of the document outline (bookmark tree).
#[derive(Debug, Clone)]
pub struct BookmarkItem {
    /// Display title of the entry; empty when the outline item has no title.
    pub title: String,
    /// Target page of the entry, when known.
    ///
    /// NOTE(review): the outline parser in this file always stores `None`
    /// here; whether the index is meant to be 0-based should be confirmed
    /// against callers.
    pub page: Option<usize>,
    /// Nested entries below this one, in document order.
    pub children: Vec<BookmarkItem>,
}
50
/// A parsed PDF together with the interpreter settings used when its pages
/// are rendered or have their text extracted.
pub struct PdfDocument {
    /// The parsed PDF file.
    pdf: Pdf,
    /// Interpreter settings applied to rendering; text extraction uses a
    /// modified clone (see `text_extraction_settings`).
    settings: InterpreterSettings,
}
56
57impl PdfDocument {
58 pub fn open(data: impl Into<pdf_render::pdf_syntax::PdfData>) -> Result<Self> {
60 let pdf = Pdf::new(data).map_err(|e| match e {
61 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
62 EngineError::Encrypted(format!("{d:?}"))
63 }
64 _ => EngineError::InvalidPdf(format!("{e:?}")),
65 })?;
66 Ok(Self {
67 pdf,
68 settings: InterpreterSettings::default(),
69 })
70 }
71
72 pub fn open_with_password(
74 data: impl Into<pdf_render::pdf_syntax::PdfData>,
75 password: &str,
76 ) -> Result<Self> {
77 let pdf = Pdf::new_with_password(data, password).map_err(|e| match e {
78 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
79 EngineError::Encrypted(format!("{d:?}"))
80 }
81 _ => EngineError::InvalidPdf(format!("{e:?}")),
82 })?;
83 Ok(Self {
84 pdf,
85 settings: InterpreterSettings::default(),
86 })
87 }
88
    /// Borrows the underlying parsed [`Pdf`].
    pub fn pdf(&self) -> &Pdf {
        &self.pdf
    }
93
    /// Replaces the interpreter settings used by subsequent rendering and
    /// text-extraction calls.
    pub fn set_settings(&mut self, settings: InterpreterSettings) {
        self.settings = settings;
    }
98
    /// Number of pages in the document.
    pub fn page_count(&self) -> usize {
        self.pdf.pages().len()
    }
103
104 pub fn page_geometry(&self, index: usize) -> Result<PageGeometry> {
106 let page = self.get_page(index)?;
107 Ok(geometry::extract_geometry(page))
108 }
109
110 pub fn render_page(&self, index: usize, options: &RenderOptions) -> Result<RenderedPage> {
118 #[cfg(feature = "xfa")]
119 if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
120 return flat_doc.render_page(index, options);
121 }
122 let page = self.get_page(index)?;
123 let (w, h) = page.render_dimensions();
127 if w <= 0.0 || h <= 0.0 {
128 return Err(EngineError::InvalidPageGeometry {
129 width: w,
130 height: h,
131 reason: "page has zero or negative dimensions".into(),
132 });
133 }
134 const MIN_PAGE_PT: f32 = 1.0;
137 if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
138 return Err(EngineError::InvalidPageGeometry {
139 width: w,
140 height: h,
141 reason: "page too small to render (< 1pt)".into(),
142 });
143 }
144 Ok(render::render_page(page, options, &self.settings))
145 }
146
147 pub fn render_page_with_config(
152 &self,
153 index: usize,
154 config: &RenderConfig,
155 ) -> Result<RenderedPage> {
156 #[cfg(feature = "xfa")]
157 if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
158 return flat_doc.render_page_with_config(index, config);
159 }
160 let page = self.get_page(index)?;
161 let (w, h) = page.render_dimensions();
162 if w <= 0.0 || h <= 0.0 {
163 return Err(EngineError::InvalidPageGeometry {
164 width: w,
165 height: h,
166 reason: "page has zero or negative dimensions".into(),
167 });
168 }
169 const MIN_PAGE_PT: f32 = 1.0;
170 if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
171 return Err(EngineError::InvalidPageGeometry {
172 width: w,
173 height: h,
174 reason: "page too small to render (< 1pt)".into(),
175 });
176 }
177 Ok(render::render_page_with_config(
178 page,
179 config,
180 &self.settings,
181 ))
182 }
183
184 pub fn render_page_cmyk(&self, index: usize, dpi: u32) -> Result<RenderedPage> {
186 self.render_page_with_config(
187 index,
188 &RenderConfig {
189 color_mode: ColorMode::PreserveCmyk,
190 dpi,
191 },
192 )
193 }
194
195 pub fn render_all(&self, options: &RenderOptions) -> Vec<RenderedPage> {
197 let pages = self.pdf.pages();
198 #[cfg(feature = "parallel")]
199 return (0..pages.len())
200 .into_par_iter()
201 .map(|i| render::render_page(&pages[i], options, &self.settings))
202 .collect();
203 #[cfg(not(feature = "parallel"))]
204 (0..pages.len())
205 .map(|i| render::render_page(&pages[i], options, &self.settings))
206 .collect()
207 }
208
209 pub fn render_all_with_config(&self, config: &RenderConfig) -> Vec<RenderedPage> {
211 let pages = self.pdf.pages();
212 #[cfg(feature = "parallel")]
213 return (0..pages.len())
214 .into_par_iter()
215 .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
216 .collect();
217 #[cfg(not(feature = "parallel"))]
218 (0..pages.len())
219 .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
220 .collect()
221 }
222
223 pub fn thumbnail(&self, index: usize, options: &ThumbnailOptions) -> Result<RenderedPage> {
225 let page = self.get_page(index)?;
226 Ok(render::render_thumbnail(
227 page,
228 options.max_dimension,
229 &self.settings,
230 ))
231 }
232
233 pub fn thumbnails_all(&self, options: &ThumbnailOptions) -> Vec<RenderedPage> {
235 let pages = self.pdf.pages();
236 #[cfg(feature = "parallel")]
237 return (0..pages.len())
238 .into_par_iter()
239 .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
240 .collect();
241 #[cfg(not(feature = "parallel"))]
242 (0..pages.len())
243 .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
244 .collect()
245 }
246
247 pub fn extract_text(&self, index: usize) -> Result<String> {
249 let page = self.get_page(index)?;
250 let mut device = TextExtractionDevice::new();
251 let mut ctx = self.create_context(page);
252 interpret_page(page, &mut ctx, &mut device);
253 Ok(device.into_text())
254 }
255
256 #[doc(hidden)]
258 pub fn extract_text_pages_reusing_settings<I>(&self, indices: I) -> Result<Vec<String>>
259 where
260 I: IntoIterator<Item = usize>,
261 {
262 let pages = self.pdf.pages();
263 let mut settings = self.text_extraction_settings();
264 let indices = indices.into_iter();
265 let (lower_bound, upper_bound) = indices.size_hint();
266 let mut texts = Vec::with_capacity(upper_bound.unwrap_or(lower_bound));
267
268 for index in indices {
269 let page = pages.get(index).ok_or(EngineError::PageOutOfRange {
270 index,
271 count: pages.len(),
272 })?;
273 let (text, next_settings) = Self::extract_text_with_settings(page, settings);
274 settings = next_settings;
275 texts.push(text);
276 }
277
278 Ok(texts)
279 }
280
281 pub fn extract_text_blocks(&self, index: usize) -> Result<Vec<TextBlock>> {
283 let page = self.get_page(index)?;
284 let mut device = TextExtractionDevice::new();
285 let mut ctx = self.create_context(page);
286 interpret_page(page, &mut ctx, &mut device);
287 Ok(device.into_blocks())
288 }
289
290 pub fn extract_all_text_blocks(&self) -> Vec<Vec<TextBlock>> {
292 let pages = self.pdf.pages();
293 let mut settings = self.text_extraction_settings();
294 let mut blocks = Vec::with_capacity(pages.len());
295
296 for page in pages.iter() {
297 let (page_blocks, next_settings) =
298 Self::extract_text_blocks_with_settings(page, settings);
299 settings = next_settings;
300 blocks.push(page_blocks);
301 }
302
303 blocks
304 }
305
306 pub fn extract_acroform_text(&self) -> String {
312 let Some(tree) = parse_acroform(&self.pdf) else {
313 return String::new();
314 };
315 let mut parts: Vec<String> = Vec::new();
316 for id in tree.all_ids() {
317 let node = tree.get(id);
318 if node.children.is_empty() {
319 let value_str = match &node.value {
321 Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.clone()),
322 Some(FieldValue::StringArray(arr)) => {
323 let joined = arr
324 .iter()
325 .filter(|s| !s.is_empty())
326 .cloned()
327 .collect::<Vec<_>>()
328 .join(", ");
329 if joined.is_empty() {
330 None
331 } else {
332 Some(joined)
333 }
334 }
335 _ => None,
336 };
337 let button_caption =
338 value_str.is_none() && tree.effective_field_type(id) == Some(FieldType::Button);
339 let extracted = value_str.or_else(|| {
340 button_caption.then(|| {
341 node.mk
342 .as_ref()
343 .and_then(|mk| mk.caption.as_ref())
344 .filter(|caption| !caption.is_empty())
345 .cloned()
346 })?
347 });
348 if let Some(s) = extracted {
349 parts.push(s);
350 }
351 }
352 }
353 parts.join("\n")
354 }
355
356 pub fn extract_all_text(&self) -> String {
359 let pages = self.pdf.pages();
360 let mut settings = self.text_extraction_settings();
361 let mut page_texts = Vec::with_capacity(pages.len());
362 for page in pages.iter() {
363 let (page_text, next_settings) = Self::extract_text_with_settings(page, settings);
364 settings = next_settings;
365 page_texts.push(page_text);
366 }
367
368 let mut text = join_page_texts(page_texts.iter().map(String::as_str));
369 let acroform = self.extract_acroform_text();
370 if !acroform.is_empty() {
371 if !text.is_empty() && !text.ends_with('\n') {
372 text.push('\n');
373 }
374 text.push_str(&acroform);
375 }
376 text
377 }
378
379 pub fn search_text(&self, query: &str) -> Vec<usize> {
381 let pages = self.pdf.pages();
382 let query_lower = query.to_lowercase();
383 #[cfg(feature = "parallel")]
384 let page_contains = |i: usize| -> Option<usize> {
385 let page = &pages[i];
386 let (text, _) = Self::extract_text_with_settings(page, self.text_extraction_settings());
387 if text.to_lowercase().contains(&query_lower) {
388 Some(i)
389 } else {
390 None
391 }
392 };
393 #[cfg(feature = "parallel")]
394 return (0..pages.len())
395 .into_par_iter()
396 .filter_map(page_contains)
397 .collect();
398 #[cfg(not(feature = "parallel"))]
399 {
400 let mut settings = self.text_extraction_settings();
401 let mut hits = Vec::new();
402 for (i, page) in pages.iter().enumerate() {
403 let (text, next_settings) = Self::extract_text_with_settings(page, settings);
404 settings = next_settings;
405 if text.to_lowercase().contains(&query_lower) {
406 hits.push(i);
407 }
408 }
409 hits
410 }
411 }
412
413 pub fn info(&self) -> DocumentInfo {
415 let meta = self.pdf.metadata();
416 DocumentInfo {
417 title: meta.title.as_ref().map(|b| bytes_to_string(b)),
418 author: meta.author.as_ref().map(|b| bytes_to_string(b)),
419 subject: meta.subject.as_ref().map(|b| bytes_to_string(b)),
420 keywords: meta.keywords.as_ref().map(|b| bytes_to_string(b)),
421 creator: meta.creator.as_ref().map(|b| bytes_to_string(b)),
422 producer: meta.producer.as_ref().map(|b| bytes_to_string(b)),
423 }
424 }
425
426 pub fn bookmarks(&self) -> Vec<BookmarkItem> {
428 let xref = self.pdf.xref();
429 let root_id = xref.root_id();
430 let catalog: Dict<'_> = match xref.get(root_id) {
431 Some(d) => d,
432 None => return Vec::new(),
433 };
434
435 let outlines: Dict<'_> = match catalog.get(OUTLINES) {
436 Some(d) => d,
437 None => return Vec::new(),
438 };
439
440 let first: Dict<'_> = match outlines.get(FIRST) {
441 Some(d) => d,
442 None => return Vec::new(),
443 };
444
445 parse_outline_items(&first)
446 }
447
448 pub fn ocr_page(
467 &self,
468 index: usize,
469 backend: &dyn crate::ocr::OcrBackend,
470 dpi: f64,
471 ) -> crate::error::Result<crate::ocr::OcrResult> {
472 let opts = crate::render::RenderOptions {
473 dpi,
474 ..Default::default()
475 };
476 let rendered = self.render_page(index, &opts)?;
477
478 let mut rgb = Vec::with_capacity((rendered.width * rendered.height * 3) as usize);
480 for chunk in rendered.pixels.chunks(4) {
481 rgb.push(chunk[0]);
482 rgb.push(chunk[1]);
483 rgb.push(chunk[2]);
484 }
485
486 backend
487 .recognize(&rgb, rendered.width, rendered.height)
488 .map_err(|e| crate::error::EngineError::RenderError(e.to_string()))
489 }
490
491 fn get_page(&self, index: usize) -> Result<&Page<'_>> {
492 let pages = self.pdf.pages();
493 if index >= pages.len() {
494 return Err(EngineError::PageOutOfRange {
495 index,
496 count: pages.len(),
497 });
498 }
499 Ok(&pages[index])
500 }
501
    /// Returns a clone of the document settings adjusted for text extraction:
    /// `skip_signature_widgets` is forced to `false`.
    ///
    /// NOTE(review): presumably this is so text inside signature widgets is
    /// still extracted even when rendering skips them — confirm against the
    /// interpreter's handling of that flag.
    fn text_extraction_settings(&self) -> InterpreterSettings {
        let mut settings = self.settings.clone();
        settings.skip_signature_widgets = false;
        settings
    }
509
    /// Builds an interpreter [`Context`] for `page` using a fresh copy of the
    /// text-extraction settings.
    fn create_context<'a>(&self, page: &Page<'a>) -> Context<'a> {
        Self::create_context_with_settings(page, self.text_extraction_settings())
    }
513
514 fn create_context_with_settings<'a>(
515 page: &Page<'a>,
516 settings: InterpreterSettings,
517 ) -> Context<'a> {
518 let (w, h) = page.render_dimensions();
519 Context::new(
520 page.initial_transform(false),
521 Rect::new(0.0, 0.0, w as f64, h as f64),
522 page.xref(),
523 settings,
524 )
525 }
526
527 fn extract_text_with_settings<'a>(
528 page: &Page<'a>,
529 settings: InterpreterSettings,
530 ) -> (String, InterpreterSettings) {
531 let mut device = TextExtractionDevice::new();
532 let mut ctx = Self::create_context_with_settings(page, settings);
533 interpret_page(page, &mut ctx, &mut device);
534 let settings = ctx.into_settings();
535 (device.into_text(), settings)
536 }
537
538 fn extract_text_blocks_with_settings<'a>(
539 page: &Page<'a>,
540 settings: InterpreterSettings,
541 ) -> (Vec<TextBlock>, InterpreterSettings) {
542 let mut device = TextExtractionDevice::new();
543 let mut ctx = Self::create_context_with_settings(page, settings);
544 interpret_page(page, &mut ctx, &mut device);
545 let settings = ctx.into_settings();
546 (device.into_blocks(), settings)
547 }
548
549 #[cfg(feature = "xfa")]
550 fn open_flattened_xfa_for_render(&self) -> Option<Self> {
551 if !crate::xfa::has_xfa(self) {
552 return None;
553 }
554
555 let flat_bytes = crate::xfa::flatten(self).ok()?;
556 let mut flat_doc = Self::open(flat_bytes).ok()?;
557 flat_doc.settings = self.settings.clone();
558 Some(flat_doc)
559 }
560}
561
/// Concatenates per-page texts into one string with a form feed (U+000C)
/// between consecutive pages.
///
/// Before each form feed, newlines are appended until the accumulated text
/// ends with a blank line (`"\n\n"`); nothing is padded while the accumulated
/// text is still empty.
fn join_page_texts<I>(page_texts: I) -> String
where
    I: IntoIterator,
    I::Item: AsRef<str>,
{
    let mut joined = String::new();

    for (position, page_text) in page_texts.into_iter().enumerate() {
        if position > 0 {
            if !joined.is_empty() {
                while !joined.ends_with("\n\n") {
                    joined.push('\n');
                }
            }
            joined.push('\u{000C}');
        }
        joined.push_str(page_text.as_ref());
    }

    joined
}
583
#[cfg(test)]
mod extract_all_text_tests {
    use super::join_page_texts;

    /// Two non-empty pages are separated by a blank line and a form feed.
    #[test]
    fn separates_nonempty_pages_like_pdftotext() {
        let joined = join_page_texts(["Page 1", "Page 2"]);
        assert_eq!(joined, "Page 1\n\n\u{000C}Page 2");
    }

    /// An empty first page contributes only the form feed.
    #[test]
    fn preserves_leading_blank_pages_without_extra_newlines() {
        let joined = join_page_texts(["", "Page 2"]);
        assert_eq!(joined, "\u{000C}Page 2");
    }

    /// A page that already ends with a blank line gets no extra padding.
    #[test]
    fn reuses_existing_blank_line_before_form_feed() {
        let joined = join_page_texts(["Page 1\n\n", "Page 2"]);
        assert_eq!(joined, "Page 1\n\n\u{000C}Page 2");
    }
}
609
610fn parse_outline_items(item_dict: &Dict<'_>) -> Vec<BookmarkItem> {
612 let mut items = Vec::new();
613 let mut current: Option<Dict<'_>> = Some(item_dict.clone());
614
615 while let Some(dict) = current {
616 let title = dict
617 .get::<pdf_render::pdf_syntax::object::String>(TITLE)
618 .map(|s| bytes_to_string(s.as_bytes()))
619 .unwrap_or_default();
620
621 let children = match dict.get::<Dict<'_>>(FIRST) {
622 Some(child_dict) => parse_outline_items(&child_dict),
623 None => Vec::new(),
624 };
625
626 items.push(BookmarkItem {
627 title,
628 page: None, children,
630 });
631
632 current = dict.get::<Dict<'_>>(NEXT);
633 }
634
635 items
636}
637
/// Decodes a PDF string's raw bytes into a Rust `String`.
///
/// Decoding rules, in order:
/// 1. A leading `FE FF` marks big-endian UTF-16; the rest is decoded lossily
///    and a trailing odd byte is ignored.
/// 2. Otherwise a leading UTF-8 byte-order mark (`EF BB BF`) is dropped and
///    the bytes are used as UTF-8 if they are valid.
/// 3. Otherwise each byte of the original input is mapped to the Unicode
///    code point of the same value (Latin-1).
fn bytes_to_string(bytes: &[u8]) -> String {
    const UTF16_BE_BOM: [u8; 2] = [0xFE, 0xFF];
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    if let Some(payload) = bytes.strip_prefix(&UTF16_BE_BOM) {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    // The byte-order mark is an encoding marker, not content: without this
    // the decoded string would start with an invisible U+FEFF.
    let utf8_candidate = bytes.strip_prefix(&UTF8_BOM).unwrap_or(bytes);

    match std::str::from_utf8(utf8_candidate) {
        Ok(text) => text.to_owned(),
        // Not UTF-8 after all: fall back to Latin-1 over the full input.
        Err(_) => bytes.iter().map(|&byte| char::from(byte)).collect(),
    }
}
661
662#[cfg(test)]
663mod tests {
664 use super::*;
665 use crate::render::{ColorMode, PixelFormat, RenderConfig, RenderOptions};
666 use lopdf::{Document as LoDocument, Object};
667 use std::path::PathBuf;
668
669 fn corpus_path(name: &str) -> PathBuf {
670 PathBuf::from(env!("CARGO_MANIFEST_DIR"))
671 .join("../../corpus")
672 .join(name)
673 }
674
675 fn normalize_text(text: &str) -> String {
676 text.split_whitespace().collect::<Vec<_>>().join(" ")
677 }
678
    /// Re-serializes the PDF in `data` with the `ToUnicode` entry removed from
    /// selected composite fonts, returning the new bytes and the number of
    /// entries removed.
    ///
    /// A font dictionary is selected when its `Subtype` is `Type0`, its
    /// `Encoding` is the name `Identity-H` or `Identity-V`, and the first
    /// element of `DescendantFonts` is a reference to a dictionary whose
    /// `Subtype` is `CIDFontType2`.
    ///
    /// Panics if `data` cannot be loaded or the result cannot be saved.
    fn strip_type0_tounicode(data: &[u8]) -> (Vec<u8>, usize) {
        /// Returns the value of `key` when it is a name object.
        fn get_name(dict: &lopdf::Dictionary, key: &[u8]) -> Option<Vec<u8>> {
            match dict.get(key).ok()? {
                Object::Name(name) => Some(name.clone()),
                _ => None,
            }
        }

        /// True when the first descendant font of `type0` is an indirectly
        /// referenced dictionary with `Subtype` `CIDFontType2`.
        fn descendant_is_cidfont_type2(doc: &LoDocument, type0: &lopdf::Dictionary) -> bool {
            let Some(Object::Array(descendants)) = type0.get(b"DescendantFonts").ok() else {
                return false;
            };
            let Some(Object::Reference(desc_id)) = descendants.first() else {
                return false;
            };
            let Ok(Object::Dictionary(descendant)) = doc.get_object(*desc_id) else {
                return false;
            };
            matches!(
                descendant.get(b"Subtype").ok(),
                Some(Object::Name(name)) if name.as_slice() == b"CIDFontType2"
            )
        }

        let mut doc = LoDocument::load_mem(data).expect("load stripped-to-unicode fixture");
        // Snapshot the ids first so the object map can be mutated in the loop.
        let ids: Vec<_> = doc.objects.keys().copied().collect();
        let mut removed = 0usize;

        for id in ids {
            // Shared borrow for the checks; it ends before the mutable lookup
            // further down.
            let Some(Object::Dictionary(dict)) = doc.objects.get(&id) else {
                continue;
            };
            if !matches!(
                dict.get(b"Subtype").ok(),
                Some(Object::Name(name)) if name.as_slice() == b"Type0"
            ) {
                continue;
            }
            if !matches!(
                get_name(dict, b"Encoding").as_deref(),
                Some(b"Identity-H") | Some(b"Identity-V")
            ) {
                continue;
            }
            if !descendant_is_cidfont_type2(&doc, dict) {
                continue;
            }

            // Re-fetch mutably now that the font qualifies.
            if let Some(Object::Dictionary(type0)) = doc.objects.get_mut(&id) {
                if type0.has(b"ToUnicode") {
                    type0.remove(b"ToUnicode");
                    removed += 1;
                }
            }
        }

        let mut out = Vec::new();
        doc.save_to(&mut out)
            .expect("save stripped-to-unicode fixture");
        (out, removed)
    }
740
    /// Builds a one-page PDF (72 x 72 MediaBox) whose content stream is
    /// `color_operator` on its own line followed by a filled rectangle that
    /// covers the whole page.
    ///
    /// Panics if the document cannot be serialized.
    fn solid_fill_pdf_bytes(color_operator: &str) -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");

        // Ids are reserved up front so the page and its parent can reference
        // each other.
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        let content = format!("{color_operator}\n0 0 72 72 re f\n");
        let content_id = doc.add_object(Stream::new(dictionary! {}, content.into_bytes()));

        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Page".to_vec()),
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![
                    Object::Integer(0),
                    Object::Integer(0),
                    Object::Integer(72),
                    Object::Integer(72),
                ]),
                "Contents" => Object::Reference(content_id),
            }),
        );

        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );

        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Catalog".to_vec()),
                "Pages" => Object::Reference(pages_id),
            }),
        );

        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes).expect("save solid fill fixture");
        bytes
    }
790
    /// Builds a one-page PDF (72 x 72 MediaBox) that fills the left half
    /// (`0 0 36 72`) after `1 0 0 rg` and the right half (`36 0 36 72`) after
    /// `1 0 0 0 k`, i.e. one RGB-specified and one CMYK-specified fill on the
    /// same page.
    ///
    /// Panics if the document cannot be serialized.
    fn mixed_rgb_cmyk_pdf_bytes() -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        let content = b"1 0 0 rg\n0 0 36 72 re f\n1 0 0 0 k\n36 0 36 72 re f\n".to_vec();
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
                "Contents" => Object::Reference(content_id),
            }),
        );
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes)
            .expect("save mixed rgb/cmyk fixture");
        bytes
    }
832
    /// Builds a one-page PDF (72 x 72 MediaBox) that selects an `ExtGState`
    /// with `ca` = 0.5 and then fills the whole page after `1 0 0 0 k`, i.e. a
    /// half-transparent CMYK-specified fill.
    ///
    /// Panics if the document cannot be serialized.
    fn transparent_cmyk_pdf_bytes() -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        // Graphics state referenced as /GS1 from the page resources below.
        let gs_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "ExtGState",
            "ca" => Object::Real(0.5),
        }));
        let content = b"/GS1 gs\n1 0 0 0 k\n0 0 72 72 re f\n".to_vec();
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
                "Resources" => dictionary! {
                    "ExtGState" => dictionary! {
                        "GS1" => Object::Reference(gs_id),
                    },
                },
                "Contents" => Object::Reference(content_id),
            }),
        );
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes)
            .expect("save transparent cmyk fixture");
        bytes
    }
883
    /// Builds a one-page PDF (2 x 1 MediaBox) that draws a 2 x 1 pixel,
    /// 8-bit `DeviceCMYK` image scaled to cover the page.
    ///
    /// The image data is two 4-byte samples: `255 0 0 0` followed by
    /// `0 255 0 0`.
    ///
    /// Panics if the document cannot be serialized.
    fn cmyk_image_pdf_bytes() -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        let image_id = doc.add_object(Stream::new(
            dictionary! {
                "Type" => "XObject",
                "Subtype" => "Image",
                "Width" => Object::Integer(2),
                "Height" => Object::Integer(1),
                "BitsPerComponent" => Object::Integer(8),
                "ColorSpace" => "DeviceCMYK",
            },
            vec![255, 0, 0, 0, 0, 255, 0, 0],
        ));
        // Scale the unit image square to 2 x 1 and paint /Im1.
        let content = b"q\n2 0 0 1 0 0 cm\n/Im1 Do\nQ\n".to_vec();
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 2.into(), 1.into()]),
                "Resources" => dictionary! {
                    "XObject" => dictionary! {
                        "Im1" => Object::Reference(image_id),
                    },
                },
                "Contents" => Object::Reference(content_id),
            }),
        );
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes).expect("save cmyk image fixture");
        bytes
    }
940
941 fn pixel_at(rendered: &RenderedPage, x: u32, y: u32) -> [u8; 4] {
942 let idx = ((y * rendered.width + x) * 4) as usize;
943 [
944 rendered.pixels[idx],
945 rendered.pixels[idx + 1],
946 rendered.pixels[idx + 2],
947 rendered.pixels[idx + 3],
948 ]
949 }
950
    /// Builds a one-page PDF (612 x 792 MediaBox) that shows `text_bytes`
    /// with a TrueType font dictionary that has only `BaseFont` and
    /// `Encoding` names (no font descriptor or embedded font program).
    ///
    /// * `base_font` — value of the font's `BaseFont` name.
    /// * `encoding` — value of the font's `Encoding` name.
    /// * `text_bytes` — raw bytes of the shown string; `(`, `)` and `\` are
    ///   backslash-escaped in the content stream.
    ///
    /// Panics if the document cannot be serialized.
    fn non_embedded_truetype_pdf_bytes(
        base_font: &[u8],
        encoding: &[u8],
        text_bytes: &[u8],
    ) -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");

        let font_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "Font",
            "Subtype" => "TrueType",
            "Name" => Object::Name(b"F0".to_vec()),
            "BaseFont" => Object::Name(base_font.to_vec()),
            "Encoding" => Object::Name(encoding.to_vec()),
        }));

        let resources_id = doc.add_object(Object::Dictionary(dictionary! {
            "Font" => dictionary! { "F0" => Object::Reference(font_id) },
        }));

        // Text object: select /F0 at 12pt, move to (100, 700), show the
        // literal string.
        let mut content = Vec::new();
        content.extend_from_slice(b"BT\n/F0 12 Tf\n100 700 Td\n(");
        for &b in text_bytes {
            match b {
                // These bytes would otherwise terminate or corrupt the
                // literal string.
                b'(' | b')' | b'\\' => {
                    content.push(b'\\');
                    content.push(b);
                }
                _ => content.push(b),
            }
        }
        content.extend_from_slice(b") Tj\nET\n");
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        let pages_id = doc.new_object_id();
        let page_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "Page",
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
            "Resources" => Object::Reference(resources_id),
            "Contents" => Object::Reference(content_id),
        }));
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        }));
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes).expect("save non-embedded fixture");
        bytes
    }
1016
    /// Builds a one-page PDF (612 x 792 MediaBox) with an empty content
    /// stream and an AcroForm containing a single button field whose `MK`
    /// dictionary carries `caption` as its `CA` entry.
    ///
    /// The field dictionary is also the widget annotation listed in the
    /// page's `Annots`.
    ///
    /// Panics if the document cannot be serialized.
    fn push_button_caption_pdf_bytes(caption: &[u8]) -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream, StringFormat};

        let mut doc = Document::with_version("1.4");

        // All ids are reserved first because the objects reference each other.
        let catalog_id = doc.new_object_id();
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        let acroform_id = doc.new_object_id();
        let content_id = doc.new_object_id();
        let widget_id = doc.new_object_id();

        doc.objects.insert(
            content_id,
            Object::Stream(Stream::new(dictionary! {}, Vec::new())),
        );
        doc.objects.insert(
            widget_id,
            Object::Dictionary(dictionary! {
                "Type" => "Annot",
                "Subtype" => "Widget",
                "FT" => "Btn",
                // Field flag bit 17 (value 1 << 16) marks the button as a
                // push button.
                "Ff" => Object::Integer(1 << 16),
                "T" => Object::String(b"button".to_vec(), StringFormat::Literal),
                "MK" => dictionary! {
                    "CA" => Object::String(caption.to_vec(), StringFormat::Literal),
                },
                "Rect" => Object::Array(vec![100.into(), 700.into(), 260.into(), 730.into()]),
                "P" => Object::Reference(page_id),
            }),
        );
        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
                "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
                "Contents" => Object::Reference(content_id),
            }),
        );
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        doc.objects.insert(
            acroform_id,
            Object::Dictionary(dictionary! {
                "Fields" => Object::Array(vec![Object::Reference(widget_id)]),
            }),
        );
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
                "AcroForm" => Object::Reference(acroform_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes)
            .expect("save push-button caption fixture");
        bytes
    }
1089
1090 #[test]
1091 fn extract_text_non_embedded_truetype_alias_resolves_via_winansi() {
1092 let bytes = non_embedded_truetype_pdf_bytes(
1098 b"TimesNewRoman",
1099 b"WinAnsiEncoding",
1100 b"UNITED STATES DISTRICT COURT",
1101 );
1102 let text = PdfDocument::open(bytes)
1103 .expect("open non-embedded TrueType fixture")
1104 .extract_text(0)
1105 .expect("extract non-embedded TrueType text");
1106 let norm = normalize_text(&text);
1107 assert!(
1108 norm.contains("UNITED STATES DISTRICT COURT"),
1109 "expected WinAnsi-decoded text, got: {norm:?}"
1110 );
1111 }
1112
1113 #[test]
1114 fn extract_text_non_embedded_truetype_unknown_name_still_decodes() {
1115 let bytes = non_embedded_truetype_pdf_bytes(
1122 b"OpaqueCustomXYZ",
1123 b"WinAnsiEncoding",
1124 b"Hello, world!",
1125 );
1126 let text = PdfDocument::open(bytes)
1127 .expect("open custom non-embedded fixture")
1128 .extract_text(0)
1129 .expect("extract custom non-embedded text");
1130 let norm = normalize_text(&text);
1131 assert!(
1132 norm.contains("Hello, world!"),
1133 "expected WinAnsi-decoded text, got: {norm:?}"
1134 );
1135 }
1136
1137 #[test]
1138 fn extract_acroform_text_includes_push_button_mk_caption() {
1139 let bytes = push_button_caption_pdf_bytes(b"Don't cry over spilt milk");
1140 let doc = PdfDocument::open(bytes).expect("open push-button caption fixture");
1141
1142 let page_text = doc.extract_text(0).expect("extract page text");
1143 assert!(
1144 normalize_text(&page_text).is_empty(),
1145 "expected empty page content stream, got: {page_text:?}"
1146 );
1147
1148 let acroform_text = doc.extract_acroform_text();
1149 assert_eq!(normalize_text(&acroform_text), "Don't cry over spilt milk");
1150
1151 let all_text = doc.extract_all_text();
1152 assert_eq!(normalize_text(&all_text), "Don't cry over spilt milk");
1153 }
1154
1155 #[test]
1156 fn bytes_to_string_utf8() {
1157 assert_eq!(bytes_to_string(b"hello"), "hello");
1158 }
1159
1160 #[test]
1161 fn bytes_to_string_latin1() {
1162 let bytes = &[0xC4, 0xD6, 0xDC]; let s = bytes_to_string(bytes);
1164 assert_eq!(s, "ÄÖÜ");
1165 }
1166
1167 #[test]
1168 fn bytes_to_string_utf16() {
1169 let bytes = &[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]; assert_eq!(bytes_to_string(bytes), "Hi");
1171 }
1172
1173 #[test]
1174 fn document_info_default() {
1175 let info = DocumentInfo::default();
1176 assert!(info.title.is_none());
1177 assert!(info.author.is_none());
1178 }
1179
1180 #[test]
1181 fn bookmark_item_children() {
1182 let item = BookmarkItem {
1183 title: "Root".into(),
1184 page: None,
1185 children: vec![BookmarkItem {
1186 title: "Child".into(),
1187 page: Some(0),
1188 children: Vec::new(),
1189 }],
1190 };
1191 assert_eq!(item.children.len(), 1);
1192 assert_eq!(item.children[0].title, "Child");
1193 }
1194
1195 #[test]
1196 fn extract_text_type0_without_tounicode_uses_font_program_fallback() {
1197 let original = std::fs::read(corpus_path("sf181.pdf")).expect("read sf181 fixture");
1198 let expected = PdfDocument::open(original.clone())
1199 .expect("open original sf181")
1200 .extract_text(0)
1201 .expect("extract original sf181 text");
1202 assert!(
1203 expected.contains("Guide to Personnel Data Standards"),
1204 "unexpected baseline extraction: {expected}"
1205 );
1206
1207 let (stripped, removed) = strip_type0_tounicode(&original);
1208 assert!(
1209 removed > 0,
1210 "expected to strip at least one Type0 ToUnicode"
1211 );
1212
1213 let actual = PdfDocument::open(stripped)
1214 .expect("open stripped sf181")
1215 .extract_text(0)
1216 .expect("extract stripped sf181 text");
1217
1218 let actual_norm = normalize_text(&actual);
1219 let expected_norm = normalize_text(&expected);
1220
1221 assert!(
1222 actual_norm.contains("Guide to Personnel Data Standards"),
1223 "missing main heading after stripping ToUnicode: {actual_norm}"
1224 );
1225 assert!(
1226 actual_norm.contains("Privacy Act Statement"),
1227 "missing body text after stripping ToUnicode: {actual_norm}"
1228 );
1229 assert!(
1230 actual_norm.len() + 32 >= expected_norm.len(),
1231 "too much text lost after stripping ToUnicode: expected {} chars, got {}",
1232 expected_norm.len(),
1233 actual_norm.len()
1234 );
1235 }
1236
1237 #[test]
1238 fn extract_text_identity_h_bogus_tounicode_recovers_via_identity_fallback() {
1239 let bytes =
1247 std::fs::read(corpus_path("PDFBOX-4322-3.pdf")).expect("read PDFBOX-4322-3 fixture");
1248 let doc = PdfDocument::open(bytes).expect("open PDFBOX-4322-3");
1249 let text = doc.extract_all_text();
1250
1251 let norm = normalize_text(&text);
1252 assert!(
1253 norm.contains("Transatlantic Council"),
1254 "expected Identity-H codes to resolve as Unicode: {norm}"
1255 );
1256 assert!(
1257 norm.contains("Boy Scouts of America"),
1258 "expected body text to be recovered: {norm}"
1259 );
1260 }
1261
1262 #[test]
1263 fn render_page_with_config_srgb_matches_legacy_render_page() {
1264 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open rgb fixture");
1265 let legacy = doc
1266 .render_page(
1267 0,
1268 &RenderOptions {
1269 dpi: 72.0,
1270 ..Default::default()
1271 },
1272 )
1273 .expect("legacy render succeeds");
1274 let configured = doc
1275 .render_page_with_config(
1276 0,
1277 &RenderConfig {
1278 color_mode: ColorMode::Srgb,
1279 dpi: 72,
1280 },
1281 )
1282 .expect("configured render succeeds");
1283
1284 assert_eq!(legacy.width, configured.width);
1285 assert_eq!(legacy.height, configured.height);
1286 assert_eq!(legacy.pixel_format, PixelFormat::Rgba8);
1287 assert_eq!(configured.pixel_format, PixelFormat::Rgba8);
1288 assert_eq!(legacy.pixels, configured.pixels);
1289 }
1290
1291 #[test]
1292 fn render_page_with_config_preserve_cmyk_returns_cmyk_buffer() {
1293 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1294 let rendered = doc
1295 .render_page_with_config(
1296 0,
1297 &RenderConfig {
1298 color_mode: ColorMode::PreserveCmyk,
1299 dpi: 72,
1300 },
1301 )
1302 .expect("cmyk render succeeds");
1303
1304 assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1305 assert_eq!(
1306 rendered.pixels.len(),
1307 rendered.width as usize * rendered.height as usize * 4
1308 );
1309 assert_eq!(
1310 pixel_at(&rendered, rendered.width / 2, rendered.height / 2),
1311 crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1312 );
1313 }
1314
1315 #[test]
1316 fn render_page_with_config_simulate_cmyk_does_not_panic_on_cmyk_pdf() {
1317 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1318 let rendered = doc
1319 .render_page_with_config(
1320 0,
1321 &RenderConfig {
1322 color_mode: ColorMode::SimulateCmyk,
1323 dpi: 72,
1324 },
1325 )
1326 .expect("simulate cmyk render succeeds");
1327
1328 assert_eq!(rendered.pixel_format, PixelFormat::Rgba8);
1329 assert!(!rendered.pixels.is_empty());
1330 }
1331
1332 #[test]
1333 fn render_page_with_config_preserve_cmyk_mixed_page_preserves_only_cmyk_region() {
1334 let doc = PdfDocument::open(mixed_rgb_cmyk_pdf_bytes()).expect("open mixed fixture");
1335 let rendered = doc
1336 .render_page_with_config(
1337 0,
1338 &RenderConfig {
1339 color_mode: ColorMode::PreserveCmyk,
1340 dpi: 72,
1341 },
1342 )
1343 .expect("mixed render succeeds");
1344
1345 assert_eq!(
1346 pixel_at(&rendered, 54, 36),
1347 crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1348 );
1349 assert_ne!(
1350 pixel_at(&rendered, 18, 36),
1351 crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1352 );
1353 }
1354
1355 #[test]
1356 fn render_page_with_config_preserve_cmyk_transparent_page_does_not_crash() {
1357 let doc =
1358 PdfDocument::open(transparent_cmyk_pdf_bytes()).expect("open transparent cmyk fixture");
1359 let rendered = doc
1360 .render_page_with_config(
1361 0,
1362 &RenderConfig {
1363 color_mode: ColorMode::PreserveCmyk,
1364 dpi: 72,
1365 },
1366 )
1367 .expect("transparent cmyk render succeeds");
1368
1369 assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1370 assert_eq!(
1371 rendered.pixels.len(),
1372 rendered.width as usize * rendered.height as usize * 4
1373 );
1374 }
1375
1376 #[test]
1377 fn render_page_with_config_preserve_cmyk_keeps_device_cmyk_image_bytes() {
1378 let doc = PdfDocument::open(cmyk_image_pdf_bytes()).expect("open cmyk image fixture");
1379 let rendered = doc
1380 .render_page_with_config(
1381 0,
1382 &RenderConfig {
1383 color_mode: ColorMode::PreserveCmyk,
1384 dpi: 72,
1385 },
1386 )
1387 .expect("cmyk image render succeeds");
1388
1389 assert_eq!(rendered.width, 2);
1390 assert_eq!(rendered.height, 1);
1391 assert_eq!(pixel_at(&rendered, 0, 0), [255, 0, 0, 0]);
1392 assert_eq!(pixel_at(&rendered, 1, 0), [0, 255, 0, 0]);
1393 }
1394}