1use crate::error::{EngineError, Result};
5use crate::geometry::{self, PageGeometry};
6use crate::limits::{LimitError, ProcessingLimits};
7use std::sync::{Arc, Mutex};
8
/// Shared, lock-protected slot holding the first `(observed, limit)` pair reported by a
/// `StreamTooLarge` interpreter warning, or `None` when no such warning was seen.
type LimitSlot = Arc<Mutex<Option<(u64, u64)>>>;
14use crate::render::{self, ColorMode, RenderConfig, RenderOptions, RenderedPage};
15use crate::text::{TextBlock, TextExtractionDevice};
16use crate::thumbnail::ThumbnailOptions;
17
18use pdf_forms::parse::parse_acroform;
19use pdf_forms::tree::{FieldType, FieldValue};
20use pdf_render::pdf_interpret::PageExt;
21use pdf_render::pdf_interpret::{
22 interpret_page, Cache, Context, InterpreterSettings, InterpreterWarning,
23};
24use pdf_render::pdf_syntax::object::dict::keys::{FIRST, NEXT, OUTLINES, TITLE};
25use pdf_render::pdf_syntax::object::{Dict, ObjectIdentifier};
26use pdf_render::pdf_syntax::page::Page;
27use pdf_render::pdf_syntax::{Pdf, PdfLoadLimits};
28#[cfg(feature = "parallel")]
29use rayon::prelude::*;
30use std::collections::BTreeSet;
31
32use kurbo::Rect;
33
/// Document-level metadata strings, as produced by [`PdfDocument::info`].
///
/// Each field is `None` when the corresponding metadata entry is not present.
#[derive(Debug, Clone, Default)]
pub struct DocumentInfo {
    /// Document title.
    pub title: Option<String>,
    /// Document author.
    pub author: Option<String>,
    /// Document subject.
    pub subject: Option<String>,
    /// Keywords associated with the document.
    pub keywords: Option<String>,
    /// Metadata "creator" entry.
    pub creator: Option<String>,
    /// Metadata "producer" entry.
    pub producer: Option<String>,
}
50
/// One entry of the document outline (bookmark tree).
#[derive(Debug, Clone)]
pub struct BookmarkItem {
    /// Decoded `/Title` of the outline entry; empty when the entry has no title string.
    pub title: String,
    /// Destination page, when known.
    /// NOTE(review): presumably a page index; the outline parser in this file always
    /// stores `None` here.
    pub page: Option<usize>,
    /// Nested entries reached through this entry's `/First` link.
    pub children: Vec<BookmarkItem>,
}
61
/// A loaded PDF together with the interpreter settings used for rendering and text
/// extraction.
pub struct PdfDocument {
    /// The parsed PDF.
    pdf: Pdf,
    /// Interpreter settings applied to every render / extraction call.
    settings: InterpreterSettings,
}
67
68impl PdfDocument {
69 pub fn open(data: impl Into<pdf_render::pdf_syntax::PdfData>) -> Result<Self> {
71 let pdf = Pdf::new(data).map_err(|e| match e {
72 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
73 EngineError::Encrypted(format!("{d:?}"))
74 }
75 _ => EngineError::InvalidPdf(format!("{e:?}")),
76 })?;
77 let settings = InterpreterSettings {
78 shared_cache: Some(Cache::new()),
79 ..InterpreterSettings::default()
80 };
81 Ok(Self { pdf, settings })
82 }
83
84 pub fn open_with_processing_limits(
86 data: impl Into<pdf_render::pdf_syntax::PdfData>,
87 limits: ProcessingLimits,
88 ) -> Result<Self> {
89 let syntax_limits = PdfLoadLimits::new()
90 .max_object_depth(limits.max_object_depth)
91 .max_image_pixels(limits.max_image_pixels)
92 .max_stream_bytes(limits.max_stream_bytes);
93 let pdf = Pdf::new_with_limits(data, syntax_limits).map_err(|e| match e {
94 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
95 EngineError::Encrypted(format!("{d:?}"))
96 }
97 _ => EngineError::InvalidPdf(format!("{e:?}")),
98 })?;
99 let settings = InterpreterSettings {
100 max_operator_count: Some(limits.max_operator_count),
101 shared_cache: Some(Cache::new()),
102 ..InterpreterSettings::default()
103 };
104 Ok(Self { pdf, settings })
105 }
106
107 pub fn open_with_password(
109 data: impl Into<pdf_render::pdf_syntax::PdfData>,
110 password: &str,
111 ) -> Result<Self> {
112 let pdf = Pdf::new_with_password(data, password).map_err(|e| match e {
113 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
114 EngineError::Encrypted(format!("{d:?}"))
115 }
116 _ => EngineError::InvalidPdf(format!("{e:?}")),
117 })?;
118 let settings = InterpreterSettings {
119 shared_cache: Some(Cache::new()),
120 ..InterpreterSettings::default()
121 };
122 Ok(Self { pdf, settings })
123 }
124
125 pub fn open_with_password_and_processing_limits(
127 data: impl Into<pdf_render::pdf_syntax::PdfData>,
128 password: &str,
129 limits: ProcessingLimits,
130 ) -> Result<Self> {
131 let syntax_limits = PdfLoadLimits::new()
132 .max_object_depth(limits.max_object_depth)
133 .max_image_pixels(limits.max_image_pixels)
134 .max_stream_bytes(limits.max_stream_bytes);
135 let pdf = Pdf::new_with_password_and_limits(data, password, syntax_limits).map_err(
136 |e| match e {
137 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
138 EngineError::Encrypted(format!("{d:?}"))
139 }
140 _ => EngineError::InvalidPdf(format!("{e:?}")),
141 },
142 )?;
143 let settings = InterpreterSettings {
144 max_operator_count: Some(limits.max_operator_count),
145 shared_cache: Some(Cache::new()),
146 ..InterpreterSettings::default()
147 };
148 Ok(Self { pdf, settings })
149 }
150
/// Borrows the underlying parsed [`Pdf`].
pub fn pdf(&self) -> &Pdf {
    &self.pdf
}
155
/// Returns the loader's `LoadRecovery` report for this document, as provided by
/// `Pdf::load_recovery`.
pub fn load_recovery(&self) -> pdf_render::pdf_syntax::LoadRecovery {
    self.pdf.load_recovery()
}
161
/// Replaces the interpreter settings wholesale.
///
/// This also replaces the shared cache and operator limit that the `open*` constructors
/// configured, so pass them again in `settings` if they should be kept.
pub fn set_settings(&mut self, settings: InterpreterSettings) {
    self.settings = settings;
}
166
/// Installs `sink` as the interpreter warning callback, leaving all other settings
/// untouched.
pub fn set_warning_sink(&mut self, sink: pdf_render::pdf_interpret::WarningSinkFn) {
    self.settings.warning_sink = sink;
}
176
/// Number of pages in the document.
pub fn page_count(&self) -> usize {
    self.pdf.pages().len()
}
181
/// Extracts the geometry of page `index`.
///
/// # Errors
///
/// Returns [`EngineError::PageOutOfRange`] when `index` is not a valid page index.
pub fn page_geometry(&self, index: usize) -> Result<PageGeometry> {
    let page = self.get_page(index)?;
    Ok(geometry::extract_geometry(page))
}
187
188 pub fn render_page(&self, index: usize, options: &RenderOptions) -> Result<RenderedPage> {
196 #[cfg(feature = "xfa")]
197 if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
198 return flat_doc.render_page(index, options);
199 }
200 let page = self.get_page(index)?;
201 let (w, h) = page.render_dimensions();
205 if w <= 0.0 || h <= 0.0 {
206 return Err(EngineError::InvalidPageGeometry {
207 width: w,
208 height: h,
209 reason: "page has zero or negative dimensions".into(),
210 });
211 }
212 const MIN_PAGE_PT: f32 = 1.0;
215 if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
216 return Err(EngineError::InvalidPageGeometry {
217 width: w,
218 height: h,
219 reason: "page too small to render (< 1pt)".into(),
220 });
221 }
222 let (settings, slot) = Self::with_limit_collector(&self.settings);
223 let rendered = render::render_page(page, options, &settings);
224 Self::check_limit_slot(&slot)?;
225 Ok(rendered)
226 }
227
228 pub fn render_page_with_config(
233 &self,
234 index: usize,
235 config: &RenderConfig,
236 ) -> Result<RenderedPage> {
237 #[cfg(feature = "xfa")]
238 if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
239 return flat_doc.render_page_with_config(index, config);
240 }
241 let page = self.get_page(index)?;
242 let (w, h) = page.render_dimensions();
243 if w <= 0.0 || h <= 0.0 {
244 return Err(EngineError::InvalidPageGeometry {
245 width: w,
246 height: h,
247 reason: "page has zero or negative dimensions".into(),
248 });
249 }
250 const MIN_PAGE_PT: f32 = 1.0;
251 if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
252 return Err(EngineError::InvalidPageGeometry {
253 width: w,
254 height: h,
255 reason: "page too small to render (< 1pt)".into(),
256 });
257 }
258 let (settings, slot) = Self::with_limit_collector(&self.settings);
259 let rendered = render::render_page_with_config(page, config, &settings);
260 Self::check_limit_slot(&slot)?;
261 Ok(rendered)
262 }
263
264 pub fn render_page_cmyk(&self, index: usize, dpi: u32) -> Result<RenderedPage> {
266 self.render_page_with_config(
267 index,
268 &RenderConfig {
269 color_mode: ColorMode::PreserveCmyk,
270 dpi,
271 },
272 )
273 }
274
275 pub fn render_all(&self, options: &RenderOptions) -> Vec<RenderedPage> {
277 let pages = self.pdf.pages();
278 #[cfg(feature = "parallel")]
279 return (0..pages.len())
280 .into_par_iter()
281 .map(|i| render::render_page(&pages[i], options, &self.settings))
282 .collect();
283 #[cfg(not(feature = "parallel"))]
284 (0..pages.len())
285 .map(|i| render::render_page(&pages[i], options, &self.settings))
286 .collect()
287 }
288
289 pub fn render_all_with_config(&self, config: &RenderConfig) -> Vec<RenderedPage> {
291 let pages = self.pdf.pages();
292 #[cfg(feature = "parallel")]
293 return (0..pages.len())
294 .into_par_iter()
295 .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
296 .collect();
297 #[cfg(not(feature = "parallel"))]
298 (0..pages.len())
299 .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
300 .collect()
301 }
302
303 pub fn thumbnail(&self, index: usize, options: &ThumbnailOptions) -> Result<RenderedPage> {
305 let page = self.get_page(index)?;
306 Ok(render::render_thumbnail(
307 page,
308 options.max_dimension,
309 &self.settings,
310 ))
311 }
312
313 pub fn thumbnails_all(&self, options: &ThumbnailOptions) -> Vec<RenderedPage> {
315 let pages = self.pdf.pages();
316 #[cfg(feature = "parallel")]
317 return (0..pages.len())
318 .into_par_iter()
319 .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
320 .collect();
321 #[cfg(not(feature = "parallel"))]
322 (0..pages.len())
323 .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
324 .collect()
325 }
326
327 pub fn extract_text(&self, index: usize) -> Result<String> {
329 let page = self.get_page(index)?;
330 let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
331 let mut device = TextExtractionDevice::new();
332 let mut ctx = Self::create_context_with_settings(page, settings);
333 interpret_page(page, &mut ctx, &mut device);
334 Self::check_limit_slot(&slot)?;
335 Ok(device.into_text())
336 }
337
338 #[doc(hidden)]
340 pub fn extract_text_pages_reusing_settings<I>(&self, indices: I) -> Result<Vec<String>>
341 where
342 I: IntoIterator<Item = usize>,
343 {
344 let pages = self.pdf.pages();
345 let mut settings = self.text_extraction_settings();
346 let indices = indices.into_iter();
347 let (lower_bound, upper_bound) = indices.size_hint();
348 let mut texts = Vec::with_capacity(upper_bound.unwrap_or(lower_bound));
349
350 for index in indices {
351 let page = pages.get(index).ok_or(EngineError::PageOutOfRange {
352 index,
353 count: pages.len(),
354 })?;
355 let (text, next_settings) = Self::extract_text_with_settings(page, settings);
356 settings = next_settings;
357 texts.push(text);
358 }
359
360 Ok(texts)
361 }
362
363 pub fn extract_text_blocks(&self, index: usize) -> Result<Vec<TextBlock>> {
365 let page = self.get_page(index)?;
366 let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
367 let mut device = TextExtractionDevice::new();
368 let mut ctx = Self::create_context_with_settings(page, settings);
369 interpret_page(page, &mut ctx, &mut device);
370 Self::check_limit_slot(&slot)?;
371 Ok(device.into_blocks())
372 }
373
374 pub fn extract_all_text_blocks(&self) -> Vec<Vec<TextBlock>> {
376 let pages = self.pdf.pages();
377 let mut settings = self.text_extraction_settings();
378 let mut blocks = Vec::with_capacity(pages.len());
379
380 for page in pages.iter() {
381 let (page_blocks, next_settings) =
382 Self::extract_text_blocks_with_settings(page, settings);
383 settings = next_settings;
384 blocks.push(page_blocks);
385 }
386
387 blocks
388 }
389
390 pub fn extract_acroform_text(&self) -> String {
396 let Some(tree) = parse_acroform(&self.pdf) else {
397 return String::new();
398 };
399 let mut parts: Vec<String> = Vec::new();
400 for id in tree.all_ids() {
401 let node = tree.get(id);
402 if node.children.is_empty() {
403 let value_str = match &node.value {
405 Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.clone()),
406 Some(FieldValue::StringArray(arr)) => {
407 let joined = arr
408 .iter()
409 .filter(|s| !s.is_empty())
410 .cloned()
411 .collect::<Vec<_>>()
412 .join(", ");
413 if joined.is_empty() {
414 None
415 } else {
416 Some(joined)
417 }
418 }
419 _ => None,
420 };
421 let button_caption =
422 value_str.is_none() && tree.effective_field_type(id) == Some(FieldType::Button);
423 let extracted = value_str.or_else(|| {
424 button_caption.then(|| {
425 node.mk
426 .as_ref()
427 .and_then(|mk| mk.caption.as_ref())
428 .filter(|caption| !caption.is_empty())
429 .cloned()
430 })?
431 });
432 if let Some(s) = extracted {
433 parts.push(s);
434 }
435 }
436 }
437 parts.join("\n")
438 }
439
440 pub fn extract_all_text(&self) -> String {
443 let pages = self.pdf.pages();
444 let mut settings = self.text_extraction_settings();
445 let mut page_texts = Vec::with_capacity(pages.len());
446 for page in pages.iter() {
447 let (page_text, next_settings) = Self::extract_text_with_settings(page, settings);
448 settings = next_settings;
449 page_texts.push(page_text);
450 }
451
452 let mut text = join_page_texts(page_texts.iter().map(String::as_str));
453 let acroform = self.extract_acroform_text();
454 if !acroform.is_empty() {
455 if !text.is_empty() && !text.ends_with('\n') {
456 text.push('\n');
457 }
458 text.push_str(&acroform);
459 }
460 text
461 }
462
463 pub fn search_text(&self, query: &str) -> Vec<usize> {
465 let pages = self.pdf.pages();
466 let query_lower = query.to_lowercase();
467 #[cfg(feature = "parallel")]
468 let page_contains = |i: usize| -> Option<usize> {
469 let page = &pages[i];
470 let (text, _) = Self::extract_text_with_settings(page, self.text_extraction_settings());
471 if text.to_lowercase().contains(&query_lower) {
472 Some(i)
473 } else {
474 None
475 }
476 };
477 #[cfg(feature = "parallel")]
478 return (0..pages.len())
479 .into_par_iter()
480 .filter_map(page_contains)
481 .collect();
482 #[cfg(not(feature = "parallel"))]
483 {
484 let mut settings = self.text_extraction_settings();
485 let mut hits = Vec::new();
486 for (i, page) in pages.iter().enumerate() {
487 let (text, next_settings) = Self::extract_text_with_settings(page, settings);
488 settings = next_settings;
489 if text.to_lowercase().contains(&query_lower) {
490 hits.push(i);
491 }
492 }
493 hits
494 }
495 }
496
497 pub fn info(&self) -> DocumentInfo {
499 let meta = self.pdf.metadata();
500 DocumentInfo {
501 title: meta.title.as_ref().map(|b| bytes_to_string(b)),
502 author: meta.author.as_ref().map(|b| bytes_to_string(b)),
503 subject: meta.subject.as_ref().map(|b| bytes_to_string(b)),
504 keywords: meta.keywords.as_ref().map(|b| bytes_to_string(b)),
505 creator: meta.creator.as_ref().map(|b| bytes_to_string(b)),
506 producer: meta.producer.as_ref().map(|b| bytes_to_string(b)),
507 }
508 }
509
510 pub fn bookmarks(&self) -> Vec<BookmarkItem> {
512 let xref = self.pdf.xref();
513 let root_id = xref.root_id();
514 let catalog: Dict<'_> = match xref.get(root_id) {
515 Some(d) => d,
516 None => return Vec::new(),
517 };
518
519 let outlines: Dict<'_> = match catalog.get(OUTLINES) {
520 Some(d) => d,
521 None => return Vec::new(),
522 };
523
524 let first: Dict<'_> = match outlines.get(FIRST) {
525 Some(d) => d,
526 None => return Vec::new(),
527 };
528
529 let mut visited = BTreeSet::new();
533 parse_outline_items(&first, 0, &mut visited)
534 }
535
536 pub fn ocr_page(
555 &self,
556 index: usize,
557 backend: &dyn crate::ocr::OcrBackend,
558 dpi: f64,
559 ) -> crate::error::Result<crate::ocr::OcrResult> {
560 let opts = crate::render::RenderOptions {
561 dpi,
562 ..Default::default()
563 };
564 let rendered = self.render_page(index, &opts)?;
565
566 let mut rgb = Vec::with_capacity((rendered.width * rendered.height * 3) as usize);
568 for chunk in rendered.pixels.chunks(4) {
569 rgb.push(chunk[0]);
570 rgb.push(chunk[1]);
571 rgb.push(chunk[2]);
572 }
573
574 backend
575 .recognize(&rgb, rendered.width, rendered.height)
576 .map_err(|e| crate::error::EngineError::RenderError(e.to_string()))
577 }
578
579 fn with_limit_collector(settings: &InterpreterSettings) -> (InterpreterSettings, LimitSlot) {
586 let slot: LimitSlot = Arc::new(Mutex::new(None));
587 let slot_clone = Arc::clone(&slot);
588 let prev_sink = settings.warning_sink.clone();
589 let mut new_settings = settings.clone();
590 new_settings.warning_sink = Arc::new(move |w: InterpreterWarning| {
591 if let InterpreterWarning::StreamTooLarge { observed, limit } = w {
592 let mut guard = slot_clone.lock().unwrap_or_else(|e| e.into_inner());
593 if guard.is_none() {
594 *guard = Some((observed, limit));
595 }
596 }
597 prev_sink(w);
598 });
599 (new_settings, slot)
600 }
601
602 fn check_limit_slot(slot: &LimitSlot) -> Result<()> {
607 if let Some((observed, limit)) = *slot.lock().unwrap_or_else(|e| e.into_inner()) {
608 return Err(EngineError::LimitExceeded(LimitError::StreamTooLarge {
609 actual_bytes: observed,
610 limit_bytes: limit,
611 }));
612 }
613 Ok(())
614 }
615
616 fn get_page(&self, index: usize) -> Result<&Page<'_>> {
617 let pages = self.pdf.pages();
618 if index >= pages.len() {
619 return Err(EngineError::PageOutOfRange {
620 index,
621 count: pages.len(),
622 });
623 }
624 Ok(&pages[index])
625 }
626
627 fn text_extraction_settings(&self) -> InterpreterSettings {
628 let mut settings = self.settings.clone();
629 settings.skip_signature_widgets = false;
632 settings
633 }
634
635 fn create_context_with_settings<'a>(
636 page: &Page<'a>,
637 settings: InterpreterSettings,
638 ) -> Context<'a> {
639 let (w, h) = page.render_dimensions();
640 Context::new(
641 page.initial_transform(false),
642 Rect::new(0.0, 0.0, w as f64, h as f64),
643 page.xref(),
644 settings,
645 )
646 }
647
648 fn extract_text_with_settings<'a>(
649 page: &Page<'a>,
650 settings: InterpreterSettings,
651 ) -> (String, InterpreterSettings) {
652 let mut device = TextExtractionDevice::new();
653 let mut ctx = Self::create_context_with_settings(page, settings);
654 interpret_page(page, &mut ctx, &mut device);
655 let settings = ctx.into_settings();
656 (device.into_text(), settings)
657 }
658
659 fn extract_text_blocks_with_settings<'a>(
660 page: &Page<'a>,
661 settings: InterpreterSettings,
662 ) -> (Vec<TextBlock>, InterpreterSettings) {
663 let mut device = TextExtractionDevice::new();
664 let mut ctx = Self::create_context_with_settings(page, settings);
665 interpret_page(page, &mut ctx, &mut device);
666 let settings = ctx.into_settings();
667 (device.into_blocks(), settings)
668 }
669
/// For documents that contain XFA, flattens them and reopens the flattened bytes as a new
/// document that carries this document's settings.
///
/// Returns `None` when there is no XFA, or when flattening or reopening fails.
#[cfg(feature = "xfa")]
fn open_flattened_xfa_for_render(&self) -> Option<Self> {
    if !crate::xfa::has_xfa(self) {
        return None;
    }

    crate::xfa::flatten(self)
        .ok()
        .and_then(|flat_bytes| Self::open(flat_bytes).ok())
        .map(|mut flat_doc| {
            flat_doc.settings = self.settings.clone();
            flat_doc
        })
}
681}
682
/// Concatenates per-page texts, separating consecutive pages with a form feed (U+000C).
///
/// When the text accumulated so far ends in page content, it is padded with newlines until
/// it ends in a blank line (`"\n\n"`) before the form feed is added. No padding is added
/// when nothing has been written yet or when the buffer already ends in a form feed, so a
/// blank page contributes exactly one form feed wherever it appears.
fn join_page_texts<I>(page_texts: I) -> String
where
    I: IntoIterator,
    I::Item: AsRef<str>,
{
    const FORM_FEED: char = '\u{000C}';

    let mut text = String::new();
    for (position, page_text) in page_texts.into_iter().enumerate() {
        if position > 0 {
            // Pad only after real page content: padding after a form feed would put
            // stray blank lines in front of the next page.
            if !text.is_empty() && !text.ends_with(FORM_FEED) {
                while !text.ends_with("\n\n") {
                    text.push('\n');
                }
            }
            text.push(FORM_FEED);
        }
        text.push_str(page_text.as_ref());
    }

    text
}
704
/// Unit tests pinning the page-separator format produced by `join_page_texts`.
#[cfg(test)]
mod extract_all_text_tests {
    use super::join_page_texts;

    /// Two non-empty pages are separated by a blank line followed by a form feed.
    #[test]
    fn separates_nonempty_pages_like_pdftotext() {
        assert_eq!(
            join_page_texts(["Page 1", "Page 2"]),
            "Page 1\n\n\u{000C}Page 2"
        );
    }

    /// An empty first page contributes only the form feed, with no padding newlines.
    #[test]
    fn preserves_leading_blank_pages_without_extra_newlines() {
        assert_eq!(join_page_texts(["", "Page 2"]), "\u{000C}Page 2");
    }

    /// Page text that already ends in a blank line is not padded any further.
    #[test]
    fn reuses_existing_blank_line_before_form_feed() {
        assert_eq!(
            join_page_texts(["Page 1\n\n", "Page 2"]),
            "Page 1\n\n\u{000C}Page 2"
        );
    }
}
730
/// Maximum outline nesting depth `parse_outline_items` descends to; levels at or beyond
/// this depth are returned as empty.
const MAX_OUTLINE_DEPTH: usize = 100;
734
735fn parse_outline_items(
741 item_dict: &Dict<'_>,
742 depth: usize,
743 visited: &mut BTreeSet<ObjectIdentifier>,
744) -> Vec<BookmarkItem> {
745 let mut items = Vec::new();
746 if depth >= MAX_OUTLINE_DEPTH {
747 return items;
748 }
749 let mut current: Option<Dict<'_>> = Some(item_dict.clone());
750
751 while let Some(dict) = current {
752 if let Some(id) = dict.obj_id() {
755 if !visited.insert(id) {
756 break;
757 }
758 }
759
760 let title = dict
761 .get::<pdf_render::pdf_syntax::object::String>(TITLE)
762 .map(|s| bytes_to_string(s.as_bytes()))
763 .unwrap_or_default();
764
765 let children = match dict.get::<Dict<'_>>(FIRST) {
766 Some(child_dict) => parse_outline_items(&child_dict, depth + 1, visited),
767 None => Vec::new(),
768 };
769
770 items.push(BookmarkItem {
771 title,
772 page: None, children,
774 });
775
776 current = dict.get::<Dict<'_>>(NEXT);
777 }
778
779 items
780}
781
/// Decodes a PDF text string into a Rust `String`.
///
/// * Input starting with the UTF-16BE byte-order mark (`FE FF`) is decoded as UTF-16BE,
///   lossily; a trailing odd byte is ignored.
/// * Otherwise a leading UTF-8 byte-order mark (`EF BB BF`) is dropped and the rest is
///   decoded as UTF-8.
/// * If that is not valid UTF-8, every byte of the original input is mapped to the Unicode
///   code point with the same value.
fn bytes_to_string(bytes: &[u8]) -> String {
    if let Some(utf16_be) = bytes.strip_prefix(&[0xFE_u8, 0xFF]) {
        let units: Vec<u16> = utf16_be
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    // The UTF-8 marker is not part of the text; keeping it would leave a stray U+FEFF at
    // the start of titles and metadata values.
    let utf8_body = bytes.strip_prefix(&[0xEF_u8, 0xBB, 0xBF]).unwrap_or(bytes);

    match std::str::from_utf8(utf8_body) {
        Ok(s) => s.to_owned(),
        Err(_) => bytes.iter().map(|&b| char::from(b)).collect(),
    }
}
805
806#[cfg(test)]
807mod tests {
808 use super::*;
809 use crate::render::{ColorMode, PixelFormat, RenderConfig, RenderOptions};
810 use lopdf::{Document as LoDocument, Object};
811 use std::path::PathBuf;
812
813 fn corpus_path(name: &str) -> PathBuf {
814 PathBuf::from(env!("CARGO_MANIFEST_DIR"))
815 .join("../../corpus")
816 .join(name)
817 }
818
/// An outline whose `/Next` links form a cycle (object 5 -> 6 -> 5) must terminate and
/// yield no more than the two distinct entries.
#[test]
fn cyclic_outline_terminates_and_is_bounded() {
    /// Hand-assembles a minimal PDF with a classic xref table; byte offsets are recorded
    /// while writing so the table stays valid.
    fn cyclic_outline_pdf() -> Vec<u8> {
        // Objects 1..=6; object 6's /Next points back at object 5.
        let objs: [&[u8]; 6] = [
            b"<< /Type /Catalog /Pages 2 0 R /Outlines 4 0 R >>",
            b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
            b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 200] >>",
            b"<< /Type /Outlines /First 5 0 R >>",
            b"<< /Title (A) /Next 6 0 R >>",
            b"<< /Title (B) /Next 5 0 R >>",
        ];
        let mut buf = Vec::new();
        // Index 0 is the free-list head and stays 0.
        let mut offsets = [0usize; 7];
        buf.extend_from_slice(b"%PDF-1.7\n");
        for (i, body) in objs.iter().enumerate() {
            offsets[i + 1] = buf.len();
            buf.extend_from_slice(format!("{} 0 obj\n", i + 1).as_bytes());
            buf.extend_from_slice(body);
            buf.extend_from_slice(b"\nendobj\n");
        }
        let xref_off = buf.len();
        buf.extend_from_slice(b"xref\n0 7\n0000000000 65535 f \n");
        for o in &offsets[1..7] {
            buf.extend_from_slice(format!("{o:010} 00000 n \n").as_bytes());
        }
        buf.extend_from_slice(
            format!("trailer\n<< /Size 7 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF")
                .as_bytes(),
        );
        buf
    }

    /// Counts a bookmark plus all of its descendants.
    fn count(b: &BookmarkItem) -> usize {
        1 + b.children.iter().map(count).sum::<usize>()
    }

    let doc = PdfDocument::open(cyclic_outline_pdf()).expect("open cyclic-outline PDF");
    let bookmarks = doc.bookmarks();
    let total: usize = bookmarks.iter().map(count).sum();
    assert!(
        total <= 2,
        "cyclic /Next outline must not loop forever; got {total} items"
    );
}
865
/// Rendering the same page twice on a cache-enabled document, and once on a document
/// whose settings were reset to defaults, must produce identical pixels.
#[test]
fn shared_image_cache_is_render_neutral() {
    let path =
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../tests/corpus-mini/scanned.pdf");
    let data = std::fs::read(&path).expect("read scanned.pdf fixture");
    let cfg = RenderConfig::default();

    // `open` installs a shared cache: the first render populates it, the second reuses it.
    let doc_on = PdfDocument::open(data.clone()).expect("open cache-on");
    let miss = doc_on
        .render_page_with_config(0, &cfg)
        .expect("cold render (cache miss)");
    let hit = doc_on
        .render_page_with_config(0, &cfg)
        .expect("warm render (cache hit)");

    // Replacing the settings with defaults drops the shared cache installed by `open`.
    let mut doc_off = PdfDocument::open(data).expect("open cache-off");
    doc_off.set_settings(InterpreterSettings::default());
    let uncached = doc_off
        .render_page_with_config(0, &cfg)
        .expect("render with cache disabled");

    assert_eq!(
        (miss.width, miss.height),
        (uncached.width, uncached.height),
        "render dimensions must match"
    );
    assert_eq!(
        miss.pixels, hit.pixels,
        "a cache hit must return exactly the freshly-decoded render"
    );
    assert_eq!(
        miss.pixels, uncached.pixels,
        "cache-enabled render must equal cache-disabled render"
    );
}
909
/// Collapses every run of whitespace in `text` to a single space and trims both ends.
fn normalize_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
913
/// Loads `data` with lopdf, removes the `/ToUnicode` entry from every Type0 font that uses
/// an Identity-H / Identity-V encoding and whose first descendant is a `CIDFontType2`, and
/// re-serializes the document.
///
/// Returns the rewritten bytes and the number of `/ToUnicode` entries removed.
fn strip_type0_tounicode(data: &[u8]) -> (Vec<u8>, usize) {
    /// Returns the value of `key` when it is a name object.
    fn get_name(dict: &lopdf::Dictionary, key: &[u8]) -> Option<Vec<u8>> {
        match dict.get(key).ok()? {
            Object::Name(name) => Some(name.clone()),
            _ => None,
        }
    }

    /// True when the first `/DescendantFonts` entry is a reference to a dictionary whose
    /// `/Subtype` is `CIDFontType2`.
    fn descendant_is_cidfont_type2(doc: &LoDocument, type0: &lopdf::Dictionary) -> bool {
        let Some(Object::Array(descendants)) = type0.get(b"DescendantFonts").ok() else {
            return false;
        };
        let Some(Object::Reference(desc_id)) = descendants.first() else {
            return false;
        };
        let Ok(Object::Dictionary(descendant)) = doc.get_object(*desc_id) else {
            return false;
        };
        matches!(
            descendant.get(b"Subtype").ok(),
            Some(Object::Name(name)) if name.as_slice() == b"CIDFontType2"
        )
    }

    let mut doc = LoDocument::load_mem(data).expect("load stripped-to-unicode fixture");
    // Snapshot the ids first so the object map can be mutated inside the loop.
    let ids: Vec<_> = doc.objects.keys().copied().collect();
    let mut removed = 0usize;

    for id in ids {
        let Some(Object::Dictionary(dict)) = doc.objects.get(&id) else {
            continue;
        };
        if !matches!(
            dict.get(b"Subtype").ok(),
            Some(Object::Name(name)) if name.as_slice() == b"Type0"
        ) {
            continue;
        }
        if !matches!(
            get_name(dict, b"Encoding").as_deref(),
            Some(b"Identity-H") | Some(b"Identity-V")
        ) {
            continue;
        }
        if !descendant_is_cidfont_type2(&doc, dict) {
            continue;
        }

        // Re-fetch mutably: the shared borrow above was only needed for the checks.
        if let Some(Object::Dictionary(type0)) = doc.objects.get_mut(&id) {
            if type0.has(b"ToUnicode") {
                type0.remove(b"ToUnicode");
                removed += 1;
            }
        }
    }

    let mut out = Vec::new();
    doc.save_to(&mut out)
        .expect("save stripped-to-unicode fixture");
    (out, removed)
}
975
/// Builds a one-page 72x72pt PDF whose content stream is `color_operator` followed by a
/// filled rectangle covering the whole page.
fn solid_fill_pdf_bytes(color_operator: &str) -> Vec<u8> {
    use lopdf::{dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.4");

    // Ids are reserved up front so the page and the page tree can reference each other.
    let pages_id = doc.new_object_id();
    let page_id = doc.new_object_id();
    let content = format!("{color_operator}\n0 0 72 72 re f\n");
    let content_id = doc.add_object(Stream::new(dictionary! {}, content.into_bytes()));

    doc.objects.insert(
        page_id,
        Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Page".to_vec()),
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![
                Object::Integer(0),
                Object::Integer(0),
                Object::Integer(72),
                Object::Integer(72),
            ]),
            "Contents" => Object::Reference(content_id),
        }),
    );

    doc.objects.insert(
        pages_id,
        Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Pages".to_vec()),
            "Kids" => Object::Array(vec![Object::Reference(page_id)]),
            "Count" => Object::Integer(1),
        }),
    );

    let catalog_id = doc.new_object_id();
    doc.objects.insert(
        catalog_id,
        Object::Dictionary(dictionary! {
            "Type" => Object::Name(b"Catalog".to_vec()),
            "Pages" => Object::Reference(pages_id),
        }),
    );

    doc.trailer.set("Root", Object::Reference(catalog_id));

    let mut bytes = Vec::new();
    doc.save_to(&mut bytes).expect("save solid fill fixture");
    bytes
}
1025
/// Builds a one-page 72x72pt PDF that fills the left half (x 0..36) after an `rg` colour
/// operator (`1 0 0 rg`) and the right half (x 36..72) after a `k` colour operator
/// (`1 0 0 0 k`).
fn mixed_rgb_cmyk_pdf_bytes() -> Vec<u8> {
    use lopdf::{dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.4");
    let pages_id = doc.new_object_id();
    let page_id = doc.new_object_id();
    let content = b"1 0 0 rg\n0 0 36 72 re f\n1 0 0 0 k\n36 0 36 72 re f\n".to_vec();
    let content_id = doc.add_object(Stream::new(dictionary! {}, content));

    doc.objects.insert(
        page_id,
        Object::Dictionary(dictionary! {
            "Type" => "Page",
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
            "Contents" => Object::Reference(content_id),
        }),
    );
    doc.objects.insert(
        pages_id,
        Object::Dictionary(dictionary! {
            "Type" => "Pages",
            "Kids" => Object::Array(vec![Object::Reference(page_id)]),
            "Count" => Object::Integer(1),
        }),
    );
    let catalog_id = doc.new_object_id();
    doc.objects.insert(
        catalog_id,
        Object::Dictionary(dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        }),
    );
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let mut bytes = Vec::new();
    doc.save_to(&mut bytes)
        .expect("save mixed rgb/cmyk fixture");
    bytes
}
1067
/// Builds a one-page 72x72pt PDF that selects an ExtGState with `ca` = 0.5 and then fills
/// the whole page after a `1 0 0 0 k` colour operator.
fn transparent_cmyk_pdf_bytes() -> Vec<u8> {
    use lopdf::{dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.4");
    let pages_id = doc.new_object_id();
    let page_id = doc.new_object_id();
    // Graphics state referenced as /GS1 from the content stream.
    let gs_id = doc.add_object(Object::Dictionary(dictionary! {
        "Type" => "ExtGState",
        "ca" => Object::Real(0.5),
    }));
    let content = b"/GS1 gs\n1 0 0 0 k\n0 0 72 72 re f\n".to_vec();
    let content_id = doc.add_object(Stream::new(dictionary! {}, content));

    doc.objects.insert(
        page_id,
        Object::Dictionary(dictionary! {
            "Type" => "Page",
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
            "Resources" => dictionary! {
                "ExtGState" => dictionary! {
                    "GS1" => Object::Reference(gs_id),
                },
            },
            "Contents" => Object::Reference(content_id),
        }),
    );
    doc.objects.insert(
        pages_id,
        Object::Dictionary(dictionary! {
            "Type" => "Pages",
            "Kids" => Object::Array(vec![Object::Reference(page_id)]),
            "Count" => Object::Integer(1),
        }),
    );
    let catalog_id = doc.new_object_id();
    doc.objects.insert(
        catalog_id,
        Object::Dictionary(dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        }),
    );
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let mut bytes = Vec::new();
    doc.save_to(&mut bytes)
        .expect("save transparent cmyk fixture");
    bytes
}
1118
/// Builds a one-page 2x1pt PDF that draws a 2x1, 8-bit DeviceCMYK image scaled to fill the
/// page. The two pixels are (255, 0, 0, 0) and (0, 255, 0, 0).
fn cmyk_image_pdf_bytes() -> Vec<u8> {
    use lopdf::{dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.4");
    let pages_id = doc.new_object_id();
    let page_id = doc.new_object_id();
    let image_id = doc.add_object(Stream::new(
        dictionary! {
            "Type" => "XObject",
            "Subtype" => "Image",
            "Width" => Object::Integer(2),
            "Height" => Object::Integer(1),
            "BitsPerComponent" => Object::Integer(8),
            "ColorSpace" => "DeviceCMYK",
        },
        // Four components per pixel, two pixels.
        vec![255, 0, 0, 0, 0, 255, 0, 0],
    ));
    // `2 0 0 1 0 0 cm` scales the unit image square to 2x1 before `/Im1 Do` paints it.
    let content = b"q\n2 0 0 1 0 0 cm\n/Im1 Do\nQ\n".to_vec();
    let content_id = doc.add_object(Stream::new(dictionary! {}, content));

    doc.objects.insert(
        page_id,
        Object::Dictionary(dictionary! {
            "Type" => "Page",
            "Parent" => Object::Reference(pages_id),
            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 2.into(), 1.into()]),
            "Resources" => dictionary! {
                "XObject" => dictionary! {
                    "Im1" => Object::Reference(image_id),
                },
            },
            "Contents" => Object::Reference(content_id),
        }),
    );
    doc.objects.insert(
        pages_id,
        Object::Dictionary(dictionary! {
            "Type" => "Pages",
            "Kids" => Object::Array(vec![Object::Reference(page_id)]),
            "Count" => Object::Integer(1),
        }),
    );
    let catalog_id = doc.new_object_id();
    doc.objects.insert(
        catalog_id,
        Object::Dictionary(dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        }),
    );
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let mut bytes = Vec::new();
    doc.save_to(&mut bytes).expect("save cmyk image fixture");
    bytes
}
1175
1176 fn pixel_at(rendered: &RenderedPage, x: u32, y: u32) -> [u8; 4] {
1177 let idx = ((y * rendered.width + x) * 4) as usize;
1178 [
1179 rendered.pixels[idx],
1180 rendered.pixels[idx + 1],
1181 rendered.pixels[idx + 2],
1182 rendered.pixels[idx + 3],
1183 ]
1184 }
1185
/// Builds a one-page Letter-size (612x792pt) PDF that shows `text_bytes` with a TrueType
/// font dictionary carrying only `/BaseFont` and `/Encoding` (no font descriptor or font
/// program is attached).
///
/// * `base_font` — name written to `/BaseFont`.
/// * `encoding` — name written to `/Encoding`.
/// * `text_bytes` — raw string bytes; `(`, `)` and `\` are backslash-escaped.
fn non_embedded_truetype_pdf_bytes(
    base_font: &[u8],
    encoding: &[u8],
    text_bytes: &[u8],
) -> Vec<u8> {
    use lopdf::{dictionary, Document, Object, Stream};

    let mut doc = Document::with_version("1.4");

    let font_id = doc.add_object(Object::Dictionary(dictionary! {
        "Type" => "Font",
        "Subtype" => "TrueType",
        "Name" => Object::Name(b"F0".to_vec()),
        "BaseFont" => Object::Name(base_font.to_vec()),
        "Encoding" => Object::Name(encoding.to_vec()),
    }));

    let resources_id = doc.add_object(Object::Dictionary(dictionary! {
        "Font" => dictionary! { "F0" => Object::Reference(font_id) },
    }));

    // Content stream: select /F0 at 12pt, move to (100, 700), show the literal string.
    let mut content = Vec::new();
    content.extend_from_slice(b"BT\n/F0 12 Tf\n100 700 Td\n(");
    for &b in text_bytes {
        match b {
            // Keep the literal string well-formed.
            b'(' | b')' | b'\\' => {
                content.push(b'\\');
                content.push(b);
            }
            _ => content.push(b),
        }
    }
    content.extend_from_slice(b") Tj\nET\n");
    let content_id = doc.add_object(Stream::new(dictionary! {}, content));

    let pages_id = doc.new_object_id();
    let page_id = doc.add_object(Object::Dictionary(dictionary! {
        "Type" => "Page",
        "Parent" => Object::Reference(pages_id),
        "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
        "Resources" => Object::Reference(resources_id),
        "Contents" => Object::Reference(content_id),
    }));
    doc.objects.insert(
        pages_id,
        Object::Dictionary(dictionary! {
            "Type" => "Pages",
            "Kids" => Object::Array(vec![Object::Reference(page_id)]),
            "Count" => Object::Integer(1),
        }),
    );
    let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
        "Type" => "Catalog",
        "Pages" => Object::Reference(pages_id),
    }));
    doc.trailer.set("Root", Object::Reference(catalog_id));

    let mut bytes = Vec::new();
    doc.save_to(&mut bytes).expect("save non-embedded fixture");
    bytes
}
1251
1252 fn push_button_caption_pdf_bytes(caption: &[u8]) -> Vec<u8> {
1255 use lopdf::{dictionary, Document, Object, Stream, StringFormat};
1256
1257 let mut doc = Document::with_version("1.4");
1258
1259 let catalog_id = doc.new_object_id();
1260 let pages_id = doc.new_object_id();
1261 let page_id = doc.new_object_id();
1262 let acroform_id = doc.new_object_id();
1263 let content_id = doc.new_object_id();
1264 let widget_id = doc.new_object_id();
1265
1266 doc.objects.insert(
1267 content_id,
1268 Object::Stream(Stream::new(dictionary! {}, Vec::new())),
1269 );
1270 doc.objects.insert(
1271 widget_id,
1272 Object::Dictionary(dictionary! {
1273 "Type" => "Annot",
1274 "Subtype" => "Widget",
1275 "FT" => "Btn",
1276 "Ff" => Object::Integer(1 << 16),
1277 "T" => Object::String(b"button".to_vec(), StringFormat::Literal),
1278 "MK" => dictionary! {
1279 "CA" => Object::String(caption.to_vec(), StringFormat::Literal),
1280 },
1281 "Rect" => Object::Array(vec![100.into(), 700.into(), 260.into(), 730.into()]),
1282 "P" => Object::Reference(page_id),
1283 }),
1284 );
1285 doc.objects.insert(
1286 page_id,
1287 Object::Dictionary(dictionary! {
1288 "Type" => "Page",
1289 "Parent" => Object::Reference(pages_id),
1290 "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1291 "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
1292 "Contents" => Object::Reference(content_id),
1293 }),
1294 );
1295 doc.objects.insert(
1296 pages_id,
1297 Object::Dictionary(dictionary! {
1298 "Type" => "Pages",
1299 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1300 "Count" => Object::Integer(1),
1301 }),
1302 );
1303 doc.objects.insert(
1304 acroform_id,
1305 Object::Dictionary(dictionary! {
1306 "Fields" => Object::Array(vec![Object::Reference(widget_id)]),
1307 }),
1308 );
1309 doc.objects.insert(
1310 catalog_id,
1311 Object::Dictionary(dictionary! {
1312 "Type" => "Catalog",
1313 "Pages" => Object::Reference(pages_id),
1314 "AcroForm" => Object::Reference(acroform_id),
1315 }),
1316 );
1317 doc.trailer.set("Root", Object::Reference(catalog_id));
1318
1319 let mut bytes = Vec::new();
1320 doc.save_to(&mut bytes)
1321 .expect("save push-button caption fixture");
1322 bytes
1323 }
1324
1325 #[test]
1326 fn extract_text_non_embedded_truetype_alias_resolves_via_winansi() {
1327 let bytes = non_embedded_truetype_pdf_bytes(
1333 b"TimesNewRoman",
1334 b"WinAnsiEncoding",
1335 b"UNITED STATES DISTRICT COURT",
1336 );
1337 let text = PdfDocument::open(bytes)
1338 .expect("open non-embedded TrueType fixture")
1339 .extract_text(0)
1340 .expect("extract non-embedded TrueType text");
1341 let norm = normalize_text(&text);
1342 assert!(
1343 norm.contains("UNITED STATES DISTRICT COURT"),
1344 "expected WinAnsi-decoded text, got: {norm:?}"
1345 );
1346 }
1347
1348 #[test]
1349 fn extract_text_non_embedded_truetype_unknown_name_still_decodes() {
1350 let bytes = non_embedded_truetype_pdf_bytes(
1357 b"OpaqueCustomXYZ",
1358 b"WinAnsiEncoding",
1359 b"Hello, world!",
1360 );
1361 let text = PdfDocument::open(bytes)
1362 .expect("open custom non-embedded fixture")
1363 .extract_text(0)
1364 .expect("extract custom non-embedded text");
1365 let norm = normalize_text(&text);
1366 assert!(
1367 norm.contains("Hello, world!"),
1368 "expected WinAnsi-decoded text, got: {norm:?}"
1369 );
1370 }
1371
1372 #[test]
1373 fn extract_acroform_text_includes_push_button_mk_caption() {
1374 let bytes = push_button_caption_pdf_bytes(b"Don't cry over spilt milk");
1375 let doc = PdfDocument::open(bytes).expect("open push-button caption fixture");
1376
1377 let page_text = doc.extract_text(0).expect("extract page text");
1378 assert!(
1379 normalize_text(&page_text).is_empty(),
1380 "expected empty page content stream, got: {page_text:?}"
1381 );
1382
1383 let acroform_text = doc.extract_acroform_text();
1384 assert_eq!(normalize_text(&acroform_text), "Don't cry over spilt milk");
1385
1386 let all_text = doc.extract_all_text();
1387 assert_eq!(normalize_text(&all_text), "Don't cry over spilt milk");
1388 }
1389
1390 #[test]
1391 fn bytes_to_string_utf8() {
1392 assert_eq!(bytes_to_string(b"hello"), "hello");
1393 }
1394
1395 #[test]
1396 fn bytes_to_string_latin1() {
1397 let bytes = &[0xC4, 0xD6, 0xDC]; let s = bytes_to_string(bytes);
1399 assert_eq!(s, "ÄÖÜ");
1400 }
1401
1402 #[test]
1403 fn bytes_to_string_utf16() {
1404 let bytes = &[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]; assert_eq!(bytes_to_string(bytes), "Hi");
1406 }
1407
1408 #[test]
1409 fn document_info_default() {
1410 let info = DocumentInfo::default();
1411 assert!(info.title.is_none());
1412 assert!(info.author.is_none());
1413 }
1414
1415 #[test]
1416 fn bookmark_item_children() {
1417 let item = BookmarkItem {
1418 title: "Root".into(),
1419 page: None,
1420 children: vec![BookmarkItem {
1421 title: "Child".into(),
1422 page: Some(0),
1423 children: Vec::new(),
1424 }],
1425 };
1426 assert_eq!(item.children.len(), 1);
1427 assert_eq!(item.children[0].title, "Child");
1428 }
1429
1430 #[test]
1431 fn extract_text_type0_without_tounicode_uses_font_program_fallback() {
1432 let original = std::fs::read(corpus_path("sf181.pdf")).expect("read sf181 fixture");
1433 let expected = PdfDocument::open(original.clone())
1434 .expect("open original sf181")
1435 .extract_text(0)
1436 .expect("extract original sf181 text");
1437 assert!(
1438 expected.contains("Guide to Personnel Data Standards"),
1439 "unexpected baseline extraction: {expected}"
1440 );
1441
1442 let (stripped, removed) = strip_type0_tounicode(&original);
1443 assert!(
1444 removed > 0,
1445 "expected to strip at least one Type0 ToUnicode"
1446 );
1447
1448 let actual = PdfDocument::open(stripped)
1449 .expect("open stripped sf181")
1450 .extract_text(0)
1451 .expect("extract stripped sf181 text");
1452
1453 let actual_norm = normalize_text(&actual);
1454 let expected_norm = normalize_text(&expected);
1455
1456 assert!(
1457 actual_norm.contains("Guide to Personnel Data Standards"),
1458 "missing main heading after stripping ToUnicode: {actual_norm}"
1459 );
1460 assert!(
1461 actual_norm.contains("Privacy Act Statement"),
1462 "missing body text after stripping ToUnicode: {actual_norm}"
1463 );
1464 assert!(
1465 actual_norm.len() + 32 >= expected_norm.len(),
1466 "too much text lost after stripping ToUnicode: expected {} chars, got {}",
1467 expected_norm.len(),
1468 actual_norm.len()
1469 );
1470 }
1471
1472 #[test]
1473 fn extract_text_identity_h_bogus_tounicode_recovers_via_identity_fallback() {
1474 let bytes =
1482 std::fs::read(corpus_path("PDFBOX-4322-3.pdf")).expect("read PDFBOX-4322-3 fixture");
1483 let doc = PdfDocument::open(bytes).expect("open PDFBOX-4322-3");
1484 let text = doc.extract_all_text();
1485
1486 let norm = normalize_text(&text);
1487 assert!(
1488 norm.contains("Transatlantic Council"),
1489 "expected Identity-H codes to resolve as Unicode: {norm}"
1490 );
1491 assert!(
1492 norm.contains("Boy Scouts of America"),
1493 "expected body text to be recovered: {norm}"
1494 );
1495 }
1496
1497 #[test]
1498 fn render_max_pixels_none_is_unchanged_default_behavior() {
1499 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open fixture");
1501 let baseline = doc
1502 .render_page(
1503 0,
1504 &RenderOptions {
1505 dpi: 144.0,
1506 ..Default::default()
1507 },
1508 )
1509 .expect("baseline render");
1510 let explicit_none = doc
1511 .render_page(
1512 0,
1513 &RenderOptions {
1514 dpi: 144.0,
1515 max_pixels: None,
1516 ..Default::default()
1517 },
1518 )
1519 .expect("explicit-none render");
1520 assert_eq!(baseline.width, explicit_none.width);
1521 assert_eq!(baseline.height, explicit_none.height);
1522 assert_eq!(baseline.pixels, explicit_none.pixels);
1523 }
1524
1525 #[test]
1526 fn render_max_pixels_budget_clamps_resolution() {
1527 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open fixture");
1528 let full = doc
1529 .render_page(
1530 0,
1531 &RenderOptions {
1532 dpi: 288.0,
1533 ..Default::default()
1534 },
1535 )
1536 .expect("full render");
1537 let full_px = full.width * full.height;
1538 let budget = full_px / 4;
1540 let capped = doc
1541 .render_page(
1542 0,
1543 &RenderOptions {
1544 dpi: 288.0,
1545 max_pixels: Some(budget),
1546 ..Default::default()
1547 },
1548 )
1549 .expect("capped render");
1550 assert!(
1551 capped.width * capped.height <= full_px,
1552 "capped output must not exceed full output"
1553 );
1554 assert!(
1555 capped.width < full.width || capped.height < full.height,
1556 "budget below full pixel count must shrink at least one dimension"
1557 );
1558 }
1559
1560 #[test]
1561 fn render_max_pixels_large_budget_no_clamp() {
1562 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open fixture");
1564 let baseline = doc
1565 .render_page(
1566 0,
1567 &RenderOptions {
1568 dpi: 72.0,
1569 ..Default::default()
1570 },
1571 )
1572 .expect("baseline");
1573 let huge = doc
1574 .render_page(
1575 0,
1576 &RenderOptions {
1577 dpi: 72.0,
1578 max_pixels: Some(100_000_000),
1579 ..Default::default()
1580 },
1581 )
1582 .expect("huge-budget render");
1583 assert_eq!(baseline.width, huge.width);
1584 assert_eq!(baseline.height, huge.height);
1585 assert_eq!(baseline.pixels, huge.pixels);
1586 }
1587
1588 #[test]
1589 fn render_page_with_config_srgb_matches_legacy_render_page() {
1590 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open rgb fixture");
1591 let legacy = doc
1592 .render_page(
1593 0,
1594 &RenderOptions {
1595 dpi: 72.0,
1596 ..Default::default()
1597 },
1598 )
1599 .expect("legacy render succeeds");
1600 let configured = doc
1601 .render_page_with_config(
1602 0,
1603 &RenderConfig {
1604 color_mode: ColorMode::Srgb,
1605 dpi: 72,
1606 },
1607 )
1608 .expect("configured render succeeds");
1609
1610 assert_eq!(legacy.width, configured.width);
1611 assert_eq!(legacy.height, configured.height);
1612 assert_eq!(legacy.pixel_format, PixelFormat::Rgba8);
1613 assert_eq!(configured.pixel_format, PixelFormat::Rgba8);
1614 assert_eq!(legacy.pixels, configured.pixels);
1615 }
1616
1617 #[test]
1618 fn render_page_with_config_preserve_cmyk_returns_cmyk_buffer() {
1619 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1620 let rendered = doc
1621 .render_page_with_config(
1622 0,
1623 &RenderConfig {
1624 color_mode: ColorMode::PreserveCmyk,
1625 dpi: 72,
1626 },
1627 )
1628 .expect("cmyk render succeeds");
1629
1630 assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1631 assert_eq!(
1632 rendered.pixels.len(),
1633 rendered.width as usize * rendered.height as usize * 4
1634 );
1635 assert_eq!(
1636 pixel_at(&rendered, rendered.width / 2, rendered.height / 2),
1637 crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1638 );
1639 }
1640
1641 #[test]
1642 fn render_page_with_config_simulate_cmyk_does_not_panic_on_cmyk_pdf() {
1643 let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1644 let rendered = doc
1645 .render_page_with_config(
1646 0,
1647 &RenderConfig {
1648 color_mode: ColorMode::SimulateCmyk,
1649 dpi: 72,
1650 },
1651 )
1652 .expect("simulate cmyk render succeeds");
1653
1654 assert_eq!(rendered.pixel_format, PixelFormat::Rgba8);
1655 assert!(!rendered.pixels.is_empty());
1656 }
1657
1658 #[test]
1659 fn render_page_with_config_preserve_cmyk_mixed_page_preserves_only_cmyk_region() {
1660 let doc = PdfDocument::open(mixed_rgb_cmyk_pdf_bytes()).expect("open mixed fixture");
1661 let rendered = doc
1662 .render_page_with_config(
1663 0,
1664 &RenderConfig {
1665 color_mode: ColorMode::PreserveCmyk,
1666 dpi: 72,
1667 },
1668 )
1669 .expect("mixed render succeeds");
1670
1671 assert_eq!(
1672 pixel_at(&rendered, 54, 36),
1673 crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1674 );
1675 assert_ne!(
1676 pixel_at(&rendered, 18, 36),
1677 crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1678 );
1679 }
1680
1681 #[test]
1682 fn render_page_with_config_preserve_cmyk_transparent_page_does_not_crash() {
1683 let doc =
1684 PdfDocument::open(transparent_cmyk_pdf_bytes()).expect("open transparent cmyk fixture");
1685 let rendered = doc
1686 .render_page_with_config(
1687 0,
1688 &RenderConfig {
1689 color_mode: ColorMode::PreserveCmyk,
1690 dpi: 72,
1691 },
1692 )
1693 .expect("transparent cmyk render succeeds");
1694
1695 assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1696 assert_eq!(
1697 rendered.pixels.len(),
1698 rendered.width as usize * rendered.height as usize * 4
1699 );
1700 }
1701
1702 #[test]
1703 fn render_page_with_config_preserve_cmyk_keeps_device_cmyk_image_bytes() {
1704 let doc = PdfDocument::open(cmyk_image_pdf_bytes()).expect("open cmyk image fixture");
1705 let rendered = doc
1706 .render_page_with_config(
1707 0,
1708 &RenderConfig {
1709 color_mode: ColorMode::PreserveCmyk,
1710 dpi: 72,
1711 },
1712 )
1713 .expect("cmyk image render succeeds");
1714
1715 assert_eq!(rendered.width, 2);
1716 assert_eq!(rendered.height, 1);
1717 assert_eq!(pixel_at(&rendered, 0, 0), [255, 0, 0, 0]);
1718 assert_eq!(pixel_at(&rendered, 1, 0), [0, 255, 0, 0]);
1719 }
1720}
1721
/// Tests for the recovery flags exposed by `PdfDocument::load_recovery`.
#[cfg(test)]
mod load_recovery_tests {
    use super::PdfDocument;

    /// A file with no xref table and a `startxref` offset of 999999 must still
    /// open, and must report `xref_rebuilt`.
    #[test]
    fn broken_xref_sets_xref_rebuilt() {
        // The trailing `\` on each line joins the pieces into one byte string.
        let body: &[u8] = b"%PDF-1.7\n\
1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\
2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n\
3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>\nendobj\n\
trailer\n<< /Root 1 0 R /Size 4 >>\nstartxref\n999999\n%%EOF";

        let opened = PdfDocument::open(body.to_vec()).expect("recovers via xref rebuild");
        let recovery = opened.load_recovery();
        assert!(recovery.xref_rebuilt, "xref_rebuilt must be set");
    }

    /// A hand-assembled file whose xref entries and `startxref` record the
    /// real byte offsets must report neither `xref_rebuilt` nor
    /// `page_tree_rebuilt`.
    #[test]
    fn clean_document_reports_no_recovery() {
        let object_bodies: [&[u8]; 3] = [
            b"<< /Type /Catalog /Pages 2 0 R >>",
            b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
            b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ];

        // Write the header and objects, remembering where each object starts.
        let mut pdf = b"%PDF-1.7\n".to_vec();
        let mut offsets = Vec::with_capacity(object_bodies.len());
        for (index, dict) in object_bodies.iter().enumerate() {
            offsets.push(pdf.len());
            pdf.extend_from_slice(format!("{} 0 obj\n", index + 1).as_bytes());
            pdf.extend_from_slice(dict);
            pdf.extend_from_slice(b"\nendobj\n");
        }

        // Cross-reference table: the free entry for object 0, then one in-use
        // entry per object with its zero-padded 10-digit offset.
        let xref_off = pdf.len();
        pdf.extend_from_slice(b"xref\n0 4\n0000000000 65535 f \n");
        for o in &offsets {
            pdf.extend_from_slice(format!("{o:010} 00000 n \n").as_bytes());
        }
        pdf.extend_from_slice(
            format!("trailer\n<< /Root 1 0 R /Size 4 >>\nstartxref\n{xref_off}\n%%EOF").as_bytes(),
        );

        let doc = PdfDocument::open(pdf).expect("clean doc opens");
        let r = doc.load_recovery();
        assert!(
            !(r.xref_rebuilt || r.page_tree_rebuilt),
            "a clean document must report no recovery; got {r:?}"
        );
    }
}