1use crate::error::{EngineError, Result};
5use crate::geometry::{self, PageGeometry};
6use crate::limits::{LimitError, ProcessingLimits};
7use std::sync::{Arc, Mutex};
8
/// Shared, thread-safe slot holding the first `(observed, limit)` byte pair
/// reported by an `InterpreterWarning::StreamTooLarge` warning, or `None` while
/// no such warning has been seen.
type LimitSlot = Arc<Mutex<Option<(u64, u64)>>>;
14use crate::render::{self, ColorMode, RenderConfig, RenderOptions, RenderedPage};
15use crate::text::{TextBlock, TextExtractionDevice};
16use crate::thumbnail::ThumbnailOptions;
17
18use pdf_forms::parse::parse_acroform;
19use pdf_forms::tree::{FieldType, FieldValue};
20use pdf_render::pdf_interpret::PageExt;
21use pdf_render::pdf_interpret::{interpret_page, Context, InterpreterSettings, InterpreterWarning};
22use pdf_render::pdf_syntax::object::dict::keys::{FIRST, NEXT, OUTLINES, TITLE};
23use pdf_render::pdf_syntax::object::Dict;
24use pdf_render::pdf_syntax::page::Page;
25use pdf_render::pdf_syntax::{Pdf, PdfLoadLimits};
26#[cfg(feature = "parallel")]
27use rayon::prelude::*;
28
29use kurbo::Rect;
30
/// Document-level metadata strings read from a PDF.
///
/// Every field is `None` when the corresponding entry is absent. The type
/// derives `PartialEq`/`Eq` so callers and tests can compare metadata directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DocumentInfo {
    /// Document title.
    pub title: Option<String>,
    /// Document author.
    pub author: Option<String>,
    /// Document subject.
    pub subject: Option<String>,
    /// Keywords associated with the document.
    pub keywords: Option<String>,
    /// Value of the metadata `creator` entry.
    pub creator: Option<String>,
    /// Value of the metadata `producer` entry.
    pub producer: Option<String>,
}
47
/// One entry of the document outline (bookmark tree).
///
/// Derives `PartialEq`/`Eq` so whole outline trees can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BookmarkItem {
    /// Text shown for this outline entry.
    pub title: String,
    /// Target page index, when known.
    pub page: Option<usize>,
    /// Nested outline entries below this one.
    pub children: Vec<BookmarkItem>,
}
58
/// A loaded PDF together with the interpreter settings used when its pages are
/// rendered or interpreted for text extraction.
pub struct PdfDocument {
    /// The parsed PDF.
    pdf: Pdf,
    /// Interpreter settings handed to render and text-extraction calls.
    settings: InterpreterSettings,
}
64
65impl PdfDocument {
66 pub fn open(data: impl Into<pdf_render::pdf_syntax::PdfData>) -> Result<Self> {
68 let pdf = Pdf::new(data).map_err(|e| match e {
69 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
70 EngineError::Encrypted(format!("{d:?}"))
71 }
72 _ => EngineError::InvalidPdf(format!("{e:?}")),
73 })?;
74 Ok(Self {
75 pdf,
76 settings: InterpreterSettings::default(),
77 })
78 }
79
80 pub fn open_with_processing_limits(
82 data: impl Into<pdf_render::pdf_syntax::PdfData>,
83 limits: ProcessingLimits,
84 ) -> Result<Self> {
85 let syntax_limits = PdfLoadLimits::new()
86 .max_object_depth(limits.max_object_depth)
87 .max_image_pixels(limits.max_image_pixels)
88 .max_stream_bytes(limits.max_stream_bytes);
89 let pdf = Pdf::new_with_limits(data, syntax_limits).map_err(|e| match e {
90 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
91 EngineError::Encrypted(format!("{d:?}"))
92 }
93 _ => EngineError::InvalidPdf(format!("{e:?}")),
94 })?;
95 let settings = InterpreterSettings {
96 max_operator_count: Some(limits.max_operator_count),
97 ..InterpreterSettings::default()
98 };
99 Ok(Self { pdf, settings })
100 }
101
102 pub fn open_with_password(
104 data: impl Into<pdf_render::pdf_syntax::PdfData>,
105 password: &str,
106 ) -> Result<Self> {
107 let pdf = Pdf::new_with_password(data, password).map_err(|e| match e {
108 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
109 EngineError::Encrypted(format!("{d:?}"))
110 }
111 _ => EngineError::InvalidPdf(format!("{e:?}")),
112 })?;
113 Ok(Self {
114 pdf,
115 settings: InterpreterSettings::default(),
116 })
117 }
118
119 pub fn open_with_password_and_processing_limits(
121 data: impl Into<pdf_render::pdf_syntax::PdfData>,
122 password: &str,
123 limits: ProcessingLimits,
124 ) -> Result<Self> {
125 let syntax_limits = PdfLoadLimits::new()
126 .max_object_depth(limits.max_object_depth)
127 .max_image_pixels(limits.max_image_pixels)
128 .max_stream_bytes(limits.max_stream_bytes);
129 let pdf = Pdf::new_with_password_and_limits(data, password, syntax_limits).map_err(
130 |e| match e {
131 pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
132 EngineError::Encrypted(format!("{d:?}"))
133 }
134 _ => EngineError::InvalidPdf(format!("{e:?}")),
135 },
136 )?;
137 let settings = InterpreterSettings {
138 max_operator_count: Some(limits.max_operator_count),
139 ..InterpreterSettings::default()
140 };
141 Ok(Self { pdf, settings })
142 }
143
144 pub fn pdf(&self) -> &Pdf {
146 &self.pdf
147 }
148
149 pub fn set_settings(&mut self, settings: InterpreterSettings) {
151 self.settings = settings;
152 }
153
154 pub fn page_count(&self) -> usize {
156 self.pdf.pages().len()
157 }
158
159 pub fn page_geometry(&self, index: usize) -> Result<PageGeometry> {
161 let page = self.get_page(index)?;
162 Ok(geometry::extract_geometry(page))
163 }
164
165 pub fn render_page(&self, index: usize, options: &RenderOptions) -> Result<RenderedPage> {
173 #[cfg(feature = "xfa")]
174 if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
175 return flat_doc.render_page(index, options);
176 }
177 let page = self.get_page(index)?;
178 let (w, h) = page.render_dimensions();
182 if w <= 0.0 || h <= 0.0 {
183 return Err(EngineError::InvalidPageGeometry {
184 width: w,
185 height: h,
186 reason: "page has zero or negative dimensions".into(),
187 });
188 }
189 const MIN_PAGE_PT: f32 = 1.0;
192 if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
193 return Err(EngineError::InvalidPageGeometry {
194 width: w,
195 height: h,
196 reason: "page too small to render (< 1pt)".into(),
197 });
198 }
199 let (settings, slot) = Self::with_limit_collector(&self.settings);
200 let rendered = render::render_page(page, options, &settings);
201 Self::check_limit_slot(&slot)?;
202 Ok(rendered)
203 }
204
205 pub fn render_page_with_config(
210 &self,
211 index: usize,
212 config: &RenderConfig,
213 ) -> Result<RenderedPage> {
214 #[cfg(feature = "xfa")]
215 if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
216 return flat_doc.render_page_with_config(index, config);
217 }
218 let page = self.get_page(index)?;
219 let (w, h) = page.render_dimensions();
220 if w <= 0.0 || h <= 0.0 {
221 return Err(EngineError::InvalidPageGeometry {
222 width: w,
223 height: h,
224 reason: "page has zero or negative dimensions".into(),
225 });
226 }
227 const MIN_PAGE_PT: f32 = 1.0;
228 if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
229 return Err(EngineError::InvalidPageGeometry {
230 width: w,
231 height: h,
232 reason: "page too small to render (< 1pt)".into(),
233 });
234 }
235 let (settings, slot) = Self::with_limit_collector(&self.settings);
236 let rendered = render::render_page_with_config(page, config, &settings);
237 Self::check_limit_slot(&slot)?;
238 Ok(rendered)
239 }
240
241 pub fn render_page_cmyk(&self, index: usize, dpi: u32) -> Result<RenderedPage> {
243 self.render_page_with_config(
244 index,
245 &RenderConfig {
246 color_mode: ColorMode::PreserveCmyk,
247 dpi,
248 },
249 )
250 }
251
252 pub fn render_all(&self, options: &RenderOptions) -> Vec<RenderedPage> {
254 let pages = self.pdf.pages();
255 #[cfg(feature = "parallel")]
256 return (0..pages.len())
257 .into_par_iter()
258 .map(|i| render::render_page(&pages[i], options, &self.settings))
259 .collect();
260 #[cfg(not(feature = "parallel"))]
261 (0..pages.len())
262 .map(|i| render::render_page(&pages[i], options, &self.settings))
263 .collect()
264 }
265
266 pub fn render_all_with_config(&self, config: &RenderConfig) -> Vec<RenderedPage> {
268 let pages = self.pdf.pages();
269 #[cfg(feature = "parallel")]
270 return (0..pages.len())
271 .into_par_iter()
272 .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
273 .collect();
274 #[cfg(not(feature = "parallel"))]
275 (0..pages.len())
276 .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
277 .collect()
278 }
279
280 pub fn thumbnail(&self, index: usize, options: &ThumbnailOptions) -> Result<RenderedPage> {
282 let page = self.get_page(index)?;
283 Ok(render::render_thumbnail(
284 page,
285 options.max_dimension,
286 &self.settings,
287 ))
288 }
289
290 pub fn thumbnails_all(&self, options: &ThumbnailOptions) -> Vec<RenderedPage> {
292 let pages = self.pdf.pages();
293 #[cfg(feature = "parallel")]
294 return (0..pages.len())
295 .into_par_iter()
296 .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
297 .collect();
298 #[cfg(not(feature = "parallel"))]
299 (0..pages.len())
300 .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
301 .collect()
302 }
303
304 pub fn extract_text(&self, index: usize) -> Result<String> {
306 let page = self.get_page(index)?;
307 let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
308 let mut device = TextExtractionDevice::new();
309 let mut ctx = Self::create_context_with_settings(page, settings);
310 interpret_page(page, &mut ctx, &mut device);
311 Self::check_limit_slot(&slot)?;
312 Ok(device.into_text())
313 }
314
315 #[doc(hidden)]
317 pub fn extract_text_pages_reusing_settings<I>(&self, indices: I) -> Result<Vec<String>>
318 where
319 I: IntoIterator<Item = usize>,
320 {
321 let pages = self.pdf.pages();
322 let mut settings = self.text_extraction_settings();
323 let indices = indices.into_iter();
324 let (lower_bound, upper_bound) = indices.size_hint();
325 let mut texts = Vec::with_capacity(upper_bound.unwrap_or(lower_bound));
326
327 for index in indices {
328 let page = pages.get(index).ok_or(EngineError::PageOutOfRange {
329 index,
330 count: pages.len(),
331 })?;
332 let (text, next_settings) = Self::extract_text_with_settings(page, settings);
333 settings = next_settings;
334 texts.push(text);
335 }
336
337 Ok(texts)
338 }
339
340 pub fn extract_text_blocks(&self, index: usize) -> Result<Vec<TextBlock>> {
342 let page = self.get_page(index)?;
343 let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
344 let mut device = TextExtractionDevice::new();
345 let mut ctx = Self::create_context_with_settings(page, settings);
346 interpret_page(page, &mut ctx, &mut device);
347 Self::check_limit_slot(&slot)?;
348 Ok(device.into_blocks())
349 }
350
351 pub fn extract_all_text_blocks(&self) -> Vec<Vec<TextBlock>> {
353 let pages = self.pdf.pages();
354 let mut settings = self.text_extraction_settings();
355 let mut blocks = Vec::with_capacity(pages.len());
356
357 for page in pages.iter() {
358 let (page_blocks, next_settings) =
359 Self::extract_text_blocks_with_settings(page, settings);
360 settings = next_settings;
361 blocks.push(page_blocks);
362 }
363
364 blocks
365 }
366
367 pub fn extract_acroform_text(&self) -> String {
373 let Some(tree) = parse_acroform(&self.pdf) else {
374 return String::new();
375 };
376 let mut parts: Vec<String> = Vec::new();
377 for id in tree.all_ids() {
378 let node = tree.get(id);
379 if node.children.is_empty() {
380 let value_str = match &node.value {
382 Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.clone()),
383 Some(FieldValue::StringArray(arr)) => {
384 let joined = arr
385 .iter()
386 .filter(|s| !s.is_empty())
387 .cloned()
388 .collect::<Vec<_>>()
389 .join(", ");
390 if joined.is_empty() {
391 None
392 } else {
393 Some(joined)
394 }
395 }
396 _ => None,
397 };
398 let button_caption =
399 value_str.is_none() && tree.effective_field_type(id) == Some(FieldType::Button);
400 let extracted = value_str.or_else(|| {
401 button_caption.then(|| {
402 node.mk
403 .as_ref()
404 .and_then(|mk| mk.caption.as_ref())
405 .filter(|caption| !caption.is_empty())
406 .cloned()
407 })?
408 });
409 if let Some(s) = extracted {
410 parts.push(s);
411 }
412 }
413 }
414 parts.join("\n")
415 }
416
417 pub fn extract_all_text(&self) -> String {
430 let raw = self.extract_all_text_raw();
431
432 #[cfg(feature = "xfa")]
433 {
434 if let Some(flat_text) = self.extract_all_text_via_xfa_flatten() {
435 if Self::should_prefer_flat_extract(&raw, &flat_text) {
436 return flat_text;
437 }
438 }
439 }
440
441 raw
442 }
443
444 #[doc(hidden)]
448 pub fn extract_all_text_raw(&self) -> String {
449 let pages = self.pdf.pages();
450 let mut settings = self.text_extraction_settings();
451 let mut page_texts = Vec::with_capacity(pages.len());
452 for page in pages.iter() {
453 let (page_text, next_settings) = Self::extract_text_with_settings(page, settings);
454 settings = next_settings;
455 page_texts.push(page_text);
456 }
457
458 let mut text = join_page_texts(page_texts.iter().map(String::as_str));
459 let acroform = self.extract_acroform_text();
460 if !acroform.is_empty() {
461 if !text.is_empty() && !text.ends_with('\n') {
462 text.push('\n');
463 }
464 text.push_str(&acroform);
465 }
466 text
467 }
468
469 #[cfg(feature = "xfa")]
473 fn extract_all_text_via_xfa_flatten(&self) -> Option<String> {
474 let flat_doc = self.open_flattened_xfa_for_render()?;
475 Some(flat_doc.extract_all_text_raw())
479 }
480
481 #[cfg(feature = "xfa")]
501 fn should_prefer_flat_extract(raw: &str, flat: &str) -> bool {
502 if flat.is_empty() {
503 return false;
504 }
505 const ADOBE_PLACEHOLDER_MARKERS: [&str; 4] = [
510 "requires Adobe Reader",
511 "Please wait...",
512 "To view the full contents of this document",
513 "form is not supported with the current version of Acrobat",
514 ];
515 ADOBE_PLACEHOLDER_MARKERS
516 .iter()
517 .any(|marker| raw.contains(marker))
518 }
519
520 pub fn search_text(&self, query: &str) -> Vec<usize> {
522 let pages = self.pdf.pages();
523 let query_lower = query.to_lowercase();
524 #[cfg(feature = "parallel")]
525 let page_contains = |i: usize| -> Option<usize> {
526 let page = &pages[i];
527 let (text, _) = Self::extract_text_with_settings(page, self.text_extraction_settings());
528 if text.to_lowercase().contains(&query_lower) {
529 Some(i)
530 } else {
531 None
532 }
533 };
534 #[cfg(feature = "parallel")]
535 return (0..pages.len())
536 .into_par_iter()
537 .filter_map(page_contains)
538 .collect();
539 #[cfg(not(feature = "parallel"))]
540 {
541 let mut settings = self.text_extraction_settings();
542 let mut hits = Vec::new();
543 for (i, page) in pages.iter().enumerate() {
544 let (text, next_settings) = Self::extract_text_with_settings(page, settings);
545 settings = next_settings;
546 if text.to_lowercase().contains(&query_lower) {
547 hits.push(i);
548 }
549 }
550 hits
551 }
552 }
553
554 pub fn info(&self) -> DocumentInfo {
556 let meta = self.pdf.metadata();
557 DocumentInfo {
558 title: meta.title.as_ref().map(|b| bytes_to_string(b)),
559 author: meta.author.as_ref().map(|b| bytes_to_string(b)),
560 subject: meta.subject.as_ref().map(|b| bytes_to_string(b)),
561 keywords: meta.keywords.as_ref().map(|b| bytes_to_string(b)),
562 creator: meta.creator.as_ref().map(|b| bytes_to_string(b)),
563 producer: meta.producer.as_ref().map(|b| bytes_to_string(b)),
564 }
565 }
566
567 pub fn bookmarks(&self) -> Vec<BookmarkItem> {
569 let xref = self.pdf.xref();
570 let root_id = xref.root_id();
571 let catalog: Dict<'_> = match xref.get(root_id) {
572 Some(d) => d,
573 None => return Vec::new(),
574 };
575
576 let outlines: Dict<'_> = match catalog.get(OUTLINES) {
577 Some(d) => d,
578 None => return Vec::new(),
579 };
580
581 let first: Dict<'_> = match outlines.get(FIRST) {
582 Some(d) => d,
583 None => return Vec::new(),
584 };
585
586 parse_outline_items(&first)
587 }
588
589 pub fn ocr_page(
608 &self,
609 index: usize,
610 backend: &dyn crate::ocr::OcrBackend,
611 dpi: f64,
612 ) -> crate::error::Result<crate::ocr::OcrResult> {
613 let opts = crate::render::RenderOptions {
614 dpi,
615 ..Default::default()
616 };
617 let rendered = self.render_page(index, &opts)?;
618
619 let mut rgb = Vec::with_capacity((rendered.width * rendered.height * 3) as usize);
621 for chunk in rendered.pixels.chunks(4) {
622 rgb.push(chunk[0]);
623 rgb.push(chunk[1]);
624 rgb.push(chunk[2]);
625 }
626
627 backend
628 .recognize(&rgb, rendered.width, rendered.height)
629 .map_err(|e| crate::error::EngineError::RenderError(e.to_string()))
630 }
631
632 fn with_limit_collector(settings: &InterpreterSettings) -> (InterpreterSettings, LimitSlot) {
639 let slot: LimitSlot = Arc::new(Mutex::new(None));
640 let slot_clone = Arc::clone(&slot);
641 let prev_sink = settings.warning_sink.clone();
642 let mut new_settings = settings.clone();
643 new_settings.warning_sink = Arc::new(move |w: InterpreterWarning| {
644 if let InterpreterWarning::StreamTooLarge { observed, limit } = w {
645 let mut guard = slot_clone.lock().unwrap_or_else(|e| e.into_inner());
646 if guard.is_none() {
647 *guard = Some((observed, limit));
648 }
649 }
650 prev_sink(w);
651 });
652 (new_settings, slot)
653 }
654
655 fn check_limit_slot(slot: &LimitSlot) -> Result<()> {
660 if let Some((observed, limit)) = *slot.lock().unwrap_or_else(|e| e.into_inner()) {
661 return Err(EngineError::LimitExceeded(LimitError::StreamTooLarge {
662 actual_bytes: observed,
663 limit_bytes: limit,
664 }));
665 }
666 Ok(())
667 }
668
669 fn get_page(&self, index: usize) -> Result<&Page<'_>> {
670 let pages = self.pdf.pages();
671 if index >= pages.len() {
672 return Err(EngineError::PageOutOfRange {
673 index,
674 count: pages.len(),
675 });
676 }
677 Ok(&pages[index])
678 }
679
680 fn text_extraction_settings(&self) -> InterpreterSettings {
681 let mut settings = self.settings.clone();
682 settings.skip_signature_widgets = false;
685 settings
686 }
687
688 fn create_context_with_settings<'a>(
689 page: &Page<'a>,
690 settings: InterpreterSettings,
691 ) -> Context<'a> {
692 let (w, h) = page.render_dimensions();
693 Context::new(
694 page.initial_transform(false),
695 Rect::new(0.0, 0.0, w as f64, h as f64),
696 page.xref(),
697 settings,
698 )
699 }
700
701 fn extract_text_with_settings<'a>(
702 page: &Page<'a>,
703 settings: InterpreterSettings,
704 ) -> (String, InterpreterSettings) {
705 let mut device = TextExtractionDevice::new();
706 let mut ctx = Self::create_context_with_settings(page, settings);
707 interpret_page(page, &mut ctx, &mut device);
708 let settings = ctx.into_settings();
709 (device.into_text(), settings)
710 }
711
712 fn extract_text_blocks_with_settings<'a>(
713 page: &Page<'a>,
714 settings: InterpreterSettings,
715 ) -> (Vec<TextBlock>, InterpreterSettings) {
716 let mut device = TextExtractionDevice::new();
717 let mut ctx = Self::create_context_with_settings(page, settings);
718 interpret_page(page, &mut ctx, &mut device);
719 let settings = ctx.into_settings();
720 (device.into_blocks(), settings)
721 }
722
723 #[cfg(feature = "xfa")]
724 fn open_flattened_xfa_for_render(&self) -> Option<Self> {
725 if !crate::xfa::has_xfa(self) {
726 return None;
727 }
728
729 let flat_bytes = crate::xfa::flatten(self).ok()?;
730 let mut flat_doc = Self::open(flat_bytes).ok()?;
731 flat_doc.settings = self.settings.clone();
732 Some(flat_doc)
733 }
734}
735
/// Concatenates per-page texts into one string.
///
/// Every page after the first is preceded by a form feed (`U+000C`). When the
/// text accumulated so far is non-empty, it is first padded with newlines until
/// it ends in a blank line (`"\n\n"`).
fn join_page_texts<I>(page_texts: I) -> String
where
    I: IntoIterator,
    I::Item: AsRef<str>,
{
    let mut remaining = page_texts.into_iter();
    let mut joined = String::new();

    // The first page is copied verbatim, with no separator in front of it.
    match remaining.next() {
        Some(first) => joined.push_str(first.as_ref()),
        None => return joined,
    }

    for page in remaining {
        if !joined.is_empty() {
            while !joined.ends_with("\n\n") {
                joined.push('\n');
            }
        }
        joined.push('\u{000C}');
        joined.push_str(page.as_ref());
    }

    joined
}
757
#[cfg(test)]
mod extract_all_text_tests {
    use super::join_page_texts;

    /// Two non-empty pages are separated by a blank line plus a form feed.
    #[test]
    fn separates_nonempty_pages_like_pdftotext() {
        let joined = join_page_texts(["Page 1", "Page 2"]);
        assert_eq!(joined, "Page 1\n\n\u{000C}Page 2");
    }

    /// An empty leading page contributes only the form feed.
    #[test]
    fn preserves_leading_blank_pages_without_extra_newlines() {
        let joined = join_page_texts(["", "Page 2"]);
        assert_eq!(joined, "\u{000C}Page 2");
    }

    /// A page that already ends in a blank line gets no additional newlines.
    #[test]
    fn reuses_existing_blank_line_before_form_feed() {
        let joined = join_page_texts(["Page 1\n\n", "Page 2"]);
        assert_eq!(joined, "Page 1\n\n\u{000C}Page 2");
    }

    /// The flattened text wins only when the raw text contains an Adobe
    /// placeholder phrase and the flattened text is non-empty.
    #[cfg(feature = "xfa")]
    #[test]
    fn flat_extract_preferred_for_adobe_placeholder_only() {
        use crate::PdfDocument;

        const FLAT: &str = "rendered XFA content";

        let placeholder = "The document you are trying to load \
                           requires Adobe Reader 8 or higher.";
        let please_wait = "Please wait... If this message is not \
                           eventually replaced...";
        let to_view = "To view the full contents of this document, \
                       you need a later version of the PDF viewer.";
        let warning = "Warning: This form is not supported with the \
                       current version of Acrobat or Adobe Reader.";
        let long_raw = "X".repeat(2000);
        let long_flat = "Y".repeat(20000);

        // (raw text, flattened text, expected decision), in the original order.
        let cases: [(&str, &str, bool); 8] = [
            ("", "Some flat text", false),
            (placeholder, FLAT, true),
            (please_wait, FLAT, true),
            (to_view, FLAT, true),
            (warning, FLAT, true),
            (
                "Real form: Name: ___",
                "rendered version of the same form",
                false,
            ),
            (long_raw.as_str(), long_flat.as_str(), false),
            (placeholder, "", false),
        ];

        for (raw, flat, expected) in cases {
            assert_eq!(
                PdfDocument::should_prefer_flat_extract(raw, flat),
                expected
            );
        }
    }
}
847
848fn parse_outline_items(item_dict: &Dict<'_>) -> Vec<BookmarkItem> {
850 let mut items = Vec::new();
851 let mut current: Option<Dict<'_>> = Some(item_dict.clone());
852
853 while let Some(dict) = current {
854 let title = dict
855 .get::<pdf_render::pdf_syntax::object::String>(TITLE)
856 .map(|s| bytes_to_string(s.as_bytes()))
857 .unwrap_or_default();
858
859 let children = match dict.get::<Dict<'_>>(FIRST) {
860 Some(child_dict) => parse_outline_items(&child_dict),
861 None => Vec::new(),
862 };
863
864 items.push(BookmarkItem {
865 title,
866 page: None, children,
868 });
869
870 current = dict.get::<Dict<'_>>(NEXT);
871 }
872
873 items
874}
875
/// Decodes a PDF text string into a Rust `String`.
///
/// * A leading UTF-16BE byte-order mark (`FE FF`) selects UTF-16BE decoding;
///   invalid code units are replaced and a dangling odd trailing byte is
///   ignored.
/// * A leading UTF-8 byte-order mark (`EF BB BF`) is stripped so it does not
///   leak into the result as `U+FEFF`.
/// * Otherwise the bytes are taken as UTF-8 when valid, and as one character
///   per byte (Latin-1 style) when not.
fn bytes_to_string(bytes: &[u8]) -> String {
    const UTF16_BE_BOM: [u8; 2] = [0xFE, 0xFF];
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    if let Some(payload) = bytes.strip_prefix(&UTF16_BE_BOM) {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    let bytes = bytes.strip_prefix(&UTF8_BOM).unwrap_or(bytes);
    match std::str::from_utf8(bytes) {
        Ok(text) => text.to_owned(),
        // Not valid UTF-8: map each byte to the code point of the same value.
        Err(_) => bytes.iter().map(|&byte| char::from(byte)).collect(),
    }
}
899
900#[cfg(test)]
901mod tests {
902 use super::*;
903 use crate::render::{ColorMode, PixelFormat, RenderConfig, RenderOptions};
904 use lopdf::{Document as LoDocument, Object};
905 use std::path::PathBuf;
906
907 fn corpus_path(name: &str) -> PathBuf {
908 PathBuf::from(env!("CARGO_MANIFEST_DIR"))
909 .join("../../corpus")
910 .join(name)
911 }
912
913 fn normalize_text(text: &str) -> String {
914 text.split_whitespace().collect::<Vec<_>>().join(" ")
915 }
916
917 fn strip_type0_tounicode(data: &[u8]) -> (Vec<u8>, usize) {
918 fn get_name(dict: &lopdf::Dictionary, key: &[u8]) -> Option<Vec<u8>> {
919 match dict.get(key).ok()? {
920 Object::Name(name) => Some(name.clone()),
921 _ => None,
922 }
923 }
924
925 fn descendant_is_cidfont_type2(doc: &LoDocument, type0: &lopdf::Dictionary) -> bool {
926 let Some(Object::Array(descendants)) = type0.get(b"DescendantFonts").ok() else {
927 return false;
928 };
929 let Some(Object::Reference(desc_id)) = descendants.first() else {
930 return false;
931 };
932 let Ok(Object::Dictionary(descendant)) = doc.get_object(*desc_id) else {
933 return false;
934 };
935 matches!(
936 descendant.get(b"Subtype").ok(),
937 Some(Object::Name(name)) if name.as_slice() == b"CIDFontType2"
938 )
939 }
940
941 let mut doc = LoDocument::load_mem(data).expect("load stripped-to-unicode fixture");
942 let ids: Vec<_> = doc.objects.keys().copied().collect();
943 let mut removed = 0usize;
944
945 for id in ids {
946 let Some(Object::Dictionary(dict)) = doc.objects.get(&id) else {
947 continue;
948 };
949 if !matches!(
950 dict.get(b"Subtype").ok(),
951 Some(Object::Name(name)) if name.as_slice() == b"Type0"
952 ) {
953 continue;
954 }
955 if !matches!(
956 get_name(dict, b"Encoding").as_deref(),
957 Some(b"Identity-H") | Some(b"Identity-V")
958 ) {
959 continue;
960 }
961 if !descendant_is_cidfont_type2(&doc, dict) {
962 continue;
963 }
964
965 if let Some(Object::Dictionary(type0)) = doc.objects.get_mut(&id) {
966 if type0.has(b"ToUnicode") {
967 type0.remove(b"ToUnicode");
968 removed += 1;
969 }
970 }
971 }
972
973 let mut out = Vec::new();
974 doc.save_to(&mut out)
975 .expect("save stripped-to-unicode fixture");
976 (out, removed)
977 }
978
979 fn solid_fill_pdf_bytes(color_operator: &str) -> Vec<u8> {
980 use lopdf::{dictionary, Document, Object, Stream};
981
982 let mut doc = Document::with_version("1.4");
983
984 let pages_id = doc.new_object_id();
985 let page_id = doc.new_object_id();
986 let content = format!("{color_operator}\n0 0 72 72 re f\n");
987 let content_id = doc.add_object(Stream::new(dictionary! {}, content.into_bytes()));
988
989 doc.objects.insert(
990 page_id,
991 Object::Dictionary(dictionary! {
992 "Type" => Object::Name(b"Page".to_vec()),
993 "Parent" => Object::Reference(pages_id),
994 "MediaBox" => Object::Array(vec![
995 Object::Integer(0),
996 Object::Integer(0),
997 Object::Integer(72),
998 Object::Integer(72),
999 ]),
1000 "Contents" => Object::Reference(content_id),
1001 }),
1002 );
1003
1004 doc.objects.insert(
1005 pages_id,
1006 Object::Dictionary(dictionary! {
1007 "Type" => Object::Name(b"Pages".to_vec()),
1008 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1009 "Count" => Object::Integer(1),
1010 }),
1011 );
1012
1013 let catalog_id = doc.new_object_id();
1014 doc.objects.insert(
1015 catalog_id,
1016 Object::Dictionary(dictionary! {
1017 "Type" => Object::Name(b"Catalog".to_vec()),
1018 "Pages" => Object::Reference(pages_id),
1019 }),
1020 );
1021
1022 doc.trailer.set("Root", Object::Reference(catalog_id));
1023
1024 let mut bytes = Vec::new();
1025 doc.save_to(&mut bytes).expect("save solid fill fixture");
1026 bytes
1027 }
1028
1029 fn mixed_rgb_cmyk_pdf_bytes() -> Vec<u8> {
1030 use lopdf::{dictionary, Document, Object, Stream};
1031
1032 let mut doc = Document::with_version("1.4");
1033 let pages_id = doc.new_object_id();
1034 let page_id = doc.new_object_id();
1035 let content = b"1 0 0 rg\n0 0 36 72 re f\n1 0 0 0 k\n36 0 36 72 re f\n".to_vec();
1036 let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1037
1038 doc.objects.insert(
1039 page_id,
1040 Object::Dictionary(dictionary! {
1041 "Type" => "Page",
1042 "Parent" => Object::Reference(pages_id),
1043 "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
1044 "Contents" => Object::Reference(content_id),
1045 }),
1046 );
1047 doc.objects.insert(
1048 pages_id,
1049 Object::Dictionary(dictionary! {
1050 "Type" => "Pages",
1051 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1052 "Count" => Object::Integer(1),
1053 }),
1054 );
1055 let catalog_id = doc.new_object_id();
1056 doc.objects.insert(
1057 catalog_id,
1058 Object::Dictionary(dictionary! {
1059 "Type" => "Catalog",
1060 "Pages" => Object::Reference(pages_id),
1061 }),
1062 );
1063 doc.trailer.set("Root", Object::Reference(catalog_id));
1064
1065 let mut bytes = Vec::new();
1066 doc.save_to(&mut bytes)
1067 .expect("save mixed rgb/cmyk fixture");
1068 bytes
1069 }
1070
1071 fn transparent_cmyk_pdf_bytes() -> Vec<u8> {
1072 use lopdf::{dictionary, Document, Object, Stream};
1073
1074 let mut doc = Document::with_version("1.4");
1075 let pages_id = doc.new_object_id();
1076 let page_id = doc.new_object_id();
1077 let gs_id = doc.add_object(Object::Dictionary(dictionary! {
1078 "Type" => "ExtGState",
1079 "ca" => Object::Real(0.5),
1080 }));
1081 let content = b"/GS1 gs\n1 0 0 0 k\n0 0 72 72 re f\n".to_vec();
1082 let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1083
1084 doc.objects.insert(
1085 page_id,
1086 Object::Dictionary(dictionary! {
1087 "Type" => "Page",
1088 "Parent" => Object::Reference(pages_id),
1089 "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
1090 "Resources" => dictionary! {
1091 "ExtGState" => dictionary! {
1092 "GS1" => Object::Reference(gs_id),
1093 },
1094 },
1095 "Contents" => Object::Reference(content_id),
1096 }),
1097 );
1098 doc.objects.insert(
1099 pages_id,
1100 Object::Dictionary(dictionary! {
1101 "Type" => "Pages",
1102 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1103 "Count" => Object::Integer(1),
1104 }),
1105 );
1106 let catalog_id = doc.new_object_id();
1107 doc.objects.insert(
1108 catalog_id,
1109 Object::Dictionary(dictionary! {
1110 "Type" => "Catalog",
1111 "Pages" => Object::Reference(pages_id),
1112 }),
1113 );
1114 doc.trailer.set("Root", Object::Reference(catalog_id));
1115
1116 let mut bytes = Vec::new();
1117 doc.save_to(&mut bytes)
1118 .expect("save transparent cmyk fixture");
1119 bytes
1120 }
1121
1122 fn cmyk_image_pdf_bytes() -> Vec<u8> {
1123 use lopdf::{dictionary, Document, Object, Stream};
1124
1125 let mut doc = Document::with_version("1.4");
1126 let pages_id = doc.new_object_id();
1127 let page_id = doc.new_object_id();
1128 let image_id = doc.add_object(Stream::new(
1129 dictionary! {
1130 "Type" => "XObject",
1131 "Subtype" => "Image",
1132 "Width" => Object::Integer(2),
1133 "Height" => Object::Integer(1),
1134 "BitsPerComponent" => Object::Integer(8),
1135 "ColorSpace" => "DeviceCMYK",
1136 },
1137 vec![255, 0, 0, 0, 0, 255, 0, 0],
1138 ));
1139 let content = b"q\n2 0 0 1 0 0 cm\n/Im1 Do\nQ\n".to_vec();
1140 let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1141
1142 doc.objects.insert(
1143 page_id,
1144 Object::Dictionary(dictionary! {
1145 "Type" => "Page",
1146 "Parent" => Object::Reference(pages_id),
1147 "MediaBox" => Object::Array(vec![0.into(), 0.into(), 2.into(), 1.into()]),
1148 "Resources" => dictionary! {
1149 "XObject" => dictionary! {
1150 "Im1" => Object::Reference(image_id),
1151 },
1152 },
1153 "Contents" => Object::Reference(content_id),
1154 }),
1155 );
1156 doc.objects.insert(
1157 pages_id,
1158 Object::Dictionary(dictionary! {
1159 "Type" => "Pages",
1160 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1161 "Count" => Object::Integer(1),
1162 }),
1163 );
1164 let catalog_id = doc.new_object_id();
1165 doc.objects.insert(
1166 catalog_id,
1167 Object::Dictionary(dictionary! {
1168 "Type" => "Catalog",
1169 "Pages" => Object::Reference(pages_id),
1170 }),
1171 );
1172 doc.trailer.set("Root", Object::Reference(catalog_id));
1173
1174 let mut bytes = Vec::new();
1175 doc.save_to(&mut bytes).expect("save cmyk image fixture");
1176 bytes
1177 }
1178
1179 fn pixel_at(rendered: &RenderedPage, x: u32, y: u32) -> [u8; 4] {
1180 let idx = ((y * rendered.width + x) * 4) as usize;
1181 [
1182 rendered.pixels[idx],
1183 rendered.pixels[idx + 1],
1184 rendered.pixels[idx + 2],
1185 rendered.pixels[idx + 3],
1186 ]
1187 }
1188
1189 fn non_embedded_truetype_pdf_bytes(
1194 base_font: &[u8],
1195 encoding: &[u8],
1196 text_bytes: &[u8],
1197 ) -> Vec<u8> {
1198 use lopdf::{dictionary, Document, Object, Stream};
1199
1200 let mut doc = Document::with_version("1.4");
1201
1202 let font_id = doc.add_object(Object::Dictionary(dictionary! {
1203 "Type" => "Font",
1204 "Subtype" => "TrueType",
1205 "Name" => Object::Name(b"F0".to_vec()),
1206 "BaseFont" => Object::Name(base_font.to_vec()),
1207 "Encoding" => Object::Name(encoding.to_vec()),
1208 }));
1209
1210 let resources_id = doc.add_object(Object::Dictionary(dictionary! {
1211 "Font" => dictionary! { "F0" => Object::Reference(font_id) },
1212 }));
1213
1214 let mut content = Vec::new();
1215 content.extend_from_slice(b"BT\n/F0 12 Tf\n100 700 Td\n(");
1216 for &b in text_bytes {
1217 match b {
1218 b'(' | b')' | b'\\' => {
1219 content.push(b'\\');
1220 content.push(b);
1221 }
1222 _ => content.push(b),
1223 }
1224 }
1225 content.extend_from_slice(b") Tj\nET\n");
1226 let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1227
1228 let pages_id = doc.new_object_id();
1229 let page_id = doc.add_object(Object::Dictionary(dictionary! {
1230 "Type" => "Page",
1231 "Parent" => Object::Reference(pages_id),
1232 "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1233 "Resources" => Object::Reference(resources_id),
1234 "Contents" => Object::Reference(content_id),
1235 }));
1236 doc.objects.insert(
1237 pages_id,
1238 Object::Dictionary(dictionary! {
1239 "Type" => "Pages",
1240 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1241 "Count" => Object::Integer(1),
1242 }),
1243 );
1244 let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
1245 "Type" => "Catalog",
1246 "Pages" => Object::Reference(pages_id),
1247 }));
1248 doc.trailer.set("Root", Object::Reference(catalog_id));
1249
1250 let mut bytes = Vec::new();
1251 doc.save_to(&mut bytes).expect("save non-embedded fixture");
1252 bytes
1253 }
1254
1255 fn push_button_caption_pdf_bytes(caption: &[u8]) -> Vec<u8> {
1258 use lopdf::{dictionary, Document, Object, Stream, StringFormat};
1259
1260 let mut doc = Document::with_version("1.4");
1261
1262 let catalog_id = doc.new_object_id();
1263 let pages_id = doc.new_object_id();
1264 let page_id = doc.new_object_id();
1265 let acroform_id = doc.new_object_id();
1266 let content_id = doc.new_object_id();
1267 let widget_id = doc.new_object_id();
1268
1269 doc.objects.insert(
1270 content_id,
1271 Object::Stream(Stream::new(dictionary! {}, Vec::new())),
1272 );
1273 doc.objects.insert(
1274 widget_id,
1275 Object::Dictionary(dictionary! {
1276 "Type" => "Annot",
1277 "Subtype" => "Widget",
1278 "FT" => "Btn",
1279 "Ff" => Object::Integer(1 << 16),
1280 "T" => Object::String(b"button".to_vec(), StringFormat::Literal),
1281 "MK" => dictionary! {
1282 "CA" => Object::String(caption.to_vec(), StringFormat::Literal),
1283 },
1284 "Rect" => Object::Array(vec![100.into(), 700.into(), 260.into(), 730.into()]),
1285 "P" => Object::Reference(page_id),
1286 }),
1287 );
1288 doc.objects.insert(
1289 page_id,
1290 Object::Dictionary(dictionary! {
1291 "Type" => "Page",
1292 "Parent" => Object::Reference(pages_id),
1293 "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1294 "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
1295 "Contents" => Object::Reference(content_id),
1296 }),
1297 );
1298 doc.objects.insert(
1299 pages_id,
1300 Object::Dictionary(dictionary! {
1301 "Type" => "Pages",
1302 "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1303 "Count" => Object::Integer(1),
1304 }),
1305 );
1306 doc.objects.insert(
1307 acroform_id,
1308 Object::Dictionary(dictionary! {
1309 "Fields" => Object::Array(vec![Object::Reference(widget_id)]),
1310 }),
1311 );
1312 doc.objects.insert(
1313 catalog_id,
1314 Object::Dictionary(dictionary! {
1315 "Type" => "Catalog",
1316 "Pages" => Object::Reference(pages_id),
1317 "AcroForm" => Object::Reference(acroform_id),
1318 }),
1319 );
1320 doc.trailer.set("Root", Object::Reference(catalog_id));
1321
1322 let mut bytes = Vec::new();
1323 doc.save_to(&mut bytes)
1324 .expect("save push-button caption fixture");
1325 bytes
1326 }
1327
/// Text shown with a non-embedded TrueType font named `TimesNewRoman` and
/// `WinAnsiEncoding` must come back from `extract_text` as the original
/// ASCII string.
#[test]
fn extract_text_non_embedded_truetype_alias_resolves_via_winansi() {
    let fixture = non_embedded_truetype_pdf_bytes(
        b"TimesNewRoman",
        b"WinAnsiEncoding",
        b"UNITED STATES DISTRICT COURT",
    );

    let text = {
        let doc = PdfDocument::open(fixture).expect("open non-embedded TrueType fixture");
        doc.extract_text(0)
            .expect("extract non-embedded TrueType text")
    };

    let norm = normalize_text(&text);
    let found = norm.contains("UNITED STATES DISTRICT COURT");
    assert!(found, "expected WinAnsi-decoded text, got: {norm:?}");
}
1350
/// Same setup as the alias test, but with the arbitrary base-font name
/// `OpaqueCustomXYZ`: extraction must still yield the text decoded through
/// `WinAnsiEncoding`.
#[test]
fn extract_text_non_embedded_truetype_unknown_name_still_decodes() {
    let fixture =
        non_embedded_truetype_pdf_bytes(b"OpaqueCustomXYZ", b"WinAnsiEncoding", b"Hello, world!");

    let text = {
        let doc = PdfDocument::open(fixture).expect("open custom non-embedded fixture");
        doc.extract_text(0)
            .expect("extract custom non-embedded text")
    };

    let norm = normalize_text(&text);
    let found = norm.contains("Hello, world!");
    assert!(found, "expected WinAnsi-decoded text, got: {norm:?}");
}
1374
/// A button caption stored under `/MK /CA` must not appear in the page's
/// content-stream text, but must be returned by both
/// `extract_acroform_text` and `extract_all_text`.
#[test]
fn extract_acroform_text_includes_push_button_mk_caption() {
    let caption = "Don't cry over spilt milk";
    let fixture = push_button_caption_pdf_bytes(caption.as_bytes());
    let doc = PdfDocument::open(fixture).expect("open push-button caption fixture");

    // The fixture's content stream is empty, so page text must be empty too.
    let page_text = doc.extract_text(0).expect("extract page text");
    let page_is_blank = normalize_text(&page_text).is_empty();
    assert!(
        page_is_blank,
        "expected empty page content stream, got: {page_text:?}"
    );

    let form_text = doc.extract_acroform_text();
    assert_eq!(normalize_text(&form_text), caption);

    let combined_text = doc.extract_all_text();
    assert_eq!(normalize_text(&combined_text), caption);
}
1392
/// Plain ASCII input is returned unchanged.
#[test]
fn bytes_to_string_utf8() {
    let decoded = bytes_to_string(b"hello");
    assert_eq!(decoded, "hello");
}
1397
/// Bytes 0xC4 0xD6 0xDC (not valid UTF-8 on their own) must decode to the
/// corresponding Latin-1 characters.
#[test]
fn bytes_to_string_latin1() {
    assert_eq!(bytes_to_string(&[0xC4, 0xD6, 0xDC]), "ÄÖÜ");
}
1404
/// Input starting with the 0xFE 0xFF byte-order mark must be decoded as
/// big-endian UTF-16.
#[test]
fn bytes_to_string_utf16() {
    let with_bom = [0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69];
    let decoded = bytes_to_string(&with_bom);
    assert_eq!(decoded, "Hi");
}
1410
/// A default-constructed `DocumentInfo` has neither a title nor an author.
#[test]
fn document_info_default() {
    let DocumentInfo { title, author, .. } = DocumentInfo::default();
    assert!(title.is_none());
    assert!(author.is_none());
}
1417
/// A `BookmarkItem` keeps the nested children it was constructed with.
#[test]
fn bookmark_item_children() {
    let child = BookmarkItem {
        title: "Child".into(),
        page: Some(0),
        children: Vec::new(),
    };
    let root = BookmarkItem {
        title: "Root".into(),
        page: None,
        children: vec![child],
    };

    assert_eq!(root.children.len(), 1);
    assert_eq!(root.children[0].title, "Child");
}
1432
/// Removing the `ToUnicode` entries from the Type0 fonts of `sf181.pdf` must
/// not break text extraction.
///
/// The test first extracts page 0 from the untouched file as a baseline,
/// then strips the entries with `strip_type0_tounicode`, extracts again, and
/// requires that two known phrases survive and that the normalized text is
/// at most 32 bytes shorter than the baseline.
#[test]
fn extract_text_type0_without_tounicode_uses_font_program_fallback() {
    let original = std::fs::read(corpus_path("sf181.pdf")).expect("read sf181 fixture");

    // Baseline: extraction from the unmodified document.
    let expected = {
        let baseline_doc = PdfDocument::open(original.clone()).expect("open original sf181");
        baseline_doc
            .extract_text(0)
            .expect("extract original sf181 text")
    };
    let baseline_ok = expected.contains("Guide to Personnel Data Standards");
    assert!(baseline_ok, "unexpected baseline extraction: {expected}");

    // The fixture is only meaningful if something was actually stripped.
    let (stripped, removed) = strip_type0_tounicode(&original);
    let stripped_something = removed > 0;
    assert!(
        stripped_something,
        "expected to strip at least one Type0 ToUnicode"
    );

    let actual = {
        let stripped_doc = PdfDocument::open(stripped).expect("open stripped sf181");
        stripped_doc
            .extract_text(0)
            .expect("extract stripped sf181 text")
    };

    let actual_norm = normalize_text(&actual);
    let baseline_norm = normalize_text(&expected);

    let has_heading = actual_norm.contains("Guide to Personnel Data Standards");
    assert!(
        has_heading,
        "missing main heading after stripping ToUnicode: {actual_norm}"
    );

    let has_body = actual_norm.contains("Privacy Act Statement");
    assert!(
        has_body,
        "missing body text after stripping ToUnicode: {actual_norm}"
    );

    // Allow a small amount of loss, but not a wholesale drop in text.
    let tolerated_loss = 32;
    let long_enough = actual_norm.len() + tolerated_loss >= baseline_norm.len();
    assert!(
        long_enough,
        "too much text lost after stripping ToUnicode: expected {} chars, got {}",
        baseline_norm.len(),
        actual_norm.len()
    );
}
1474
/// Full-document extraction from `PDFBOX-4322-3.pdf` must contain two known
/// phrases.
///
/// NOTE(review): per the test name, the file presumably uses Identity-H fonts
/// with an unusable `ToUnicode` map; the fixture itself is not visible here.
#[test]
fn extract_text_identity_h_bogus_tounicode_recovers_via_identity_fallback() {
    let fixture_path = corpus_path("PDFBOX-4322-3.pdf");
    let fixture = std::fs::read(fixture_path).expect("read PDFBOX-4322-3 fixture");

    let doc = PdfDocument::open(fixture).expect("open PDFBOX-4322-3");
    let norm = normalize_text(&doc.extract_all_text());

    let has_council = norm.contains("Transatlantic Council");
    assert!(
        has_council,
        "expected Identity-H codes to resolve as Unicode: {norm}"
    );

    let has_body = norm.contains("Boy Scouts of America");
    assert!(has_body, "expected body text to be recovered: {norm}");
}
1499
/// Rendering through `render_page_with_config` with `ColorMode::Srgb` at
/// 72 dpi must give the same dimensions, pixel format and pixel bytes as
/// `render_page` at 72 dpi.
#[test]
fn render_page_with_config_srgb_matches_legacy_render_page() {
    let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open rgb fixture");

    let legacy_options = RenderOptions {
        dpi: 72.0,
        ..Default::default()
    };
    let legacy = doc
        .render_page(0, &legacy_options)
        .expect("legacy render succeeds");

    let srgb_config = RenderConfig {
        color_mode: ColorMode::Srgb,
        dpi: 72,
    };
    let configured = doc
        .render_page_with_config(0, &srgb_config)
        .expect("configured render succeeds");

    // Geometry first, then format, then the actual pixel data.
    assert_eq!(legacy.width, configured.width);
    assert_eq!(legacy.height, configured.height);
    assert_eq!(legacy.pixel_format, PixelFormat::Rgba8);
    assert_eq!(configured.pixel_format, PixelFormat::Rgba8);
    assert_eq!(legacy.pixels, configured.pixels);
}
1528
/// With `ColorMode::PreserveCmyk`, a page filled via `1 0 0 0 k` must render
/// to a `Cmyk8` buffer of four bytes per pixel whose centre pixel equals
/// `preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)`.
#[test]
fn render_page_with_config_preserve_cmyk_returns_cmyk_buffer() {
    let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");

    let cmyk_config = RenderConfig {
        color_mode: ColorMode::PreserveCmyk,
        dpi: 72,
    };
    let rendered = doc
        .render_page_with_config(0, &cmyk_config)
        .expect("cmyk render succeeds");

    assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);

    let expected_len = rendered.width as usize * rendered.height as usize * 4;
    assert_eq!(rendered.pixels.len(), expected_len);

    let centre = pixel_at(&rendered, rendered.width / 2, rendered.height / 2);
    assert_eq!(
        centre,
        crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
    );
}
1552
/// With `ColorMode::SimulateCmyk`, rendering a CMYK-filled page must succeed
/// and produce a non-empty `Rgba8` buffer.
#[test]
fn render_page_with_config_simulate_cmyk_does_not_panic_on_cmyk_pdf() {
    let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");

    let simulate_config = RenderConfig {
        color_mode: ColorMode::SimulateCmyk,
        dpi: 72,
    };
    let rendered = doc
        .render_page_with_config(0, &simulate_config)
        .expect("simulate cmyk render succeeds");

    assert_eq!(rendered.pixel_format, PixelFormat::Rgba8);
    let has_pixels = !rendered.pixels.is_empty();
    assert!(has_pixels);
}
1569
/// On the mixed RGB/CMYK fixture rendered with `ColorMode::PreserveCmyk`,
/// pixel (54, 36) must equal `preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)` while
/// pixel (18, 36) must differ from it.
///
/// NOTE(review): presumably (54, 36) falls in the CMYK-painted area and
/// (18, 36) in the RGB-painted area; the fixture builder is not visible here.
#[test]
fn render_page_with_config_preserve_cmyk_mixed_page_preserves_only_cmyk_region() {
    let doc = PdfDocument::open(mixed_rgb_cmyk_pdf_bytes()).expect("open mixed fixture");

    let cmyk_config = RenderConfig {
        color_mode: ColorMode::PreserveCmyk,
        dpi: 72,
    };
    let rendered = doc
        .render_page_with_config(0, &cmyk_config)
        .expect("mixed render succeeds");

    let preserved_sample = pixel_at(&rendered, 54, 36);
    assert_eq!(
        preserved_sample,
        crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
    );

    let other_sample = pixel_at(&rendered, 18, 36);
    assert_ne!(
        other_sample,
        crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
    );
}
1592
/// Rendering the transparent CMYK fixture with `ColorMode::PreserveCmyk`
/// must succeed and yield a `Cmyk8` buffer of exactly four bytes per pixel.
#[test]
fn render_page_with_config_preserve_cmyk_transparent_page_does_not_crash() {
    let fixture = transparent_cmyk_pdf_bytes();
    let doc = PdfDocument::open(fixture).expect("open transparent cmyk fixture");

    let cmyk_config = RenderConfig {
        color_mode: ColorMode::PreserveCmyk,
        dpi: 72,
    };
    let rendered = doc
        .render_page_with_config(0, &cmyk_config)
        .expect("transparent cmyk render succeeds");

    assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);

    let expected_len = rendered.width as usize * rendered.height as usize * 4;
    assert_eq!(rendered.pixels.len(), expected_len);
}
1613
/// Rendering the CMYK image fixture with `ColorMode::PreserveCmyk` at 72 dpi
/// must give a 2x1 result whose two pixels are `[255, 0, 0, 0]` and
/// `[0, 255, 0, 0]`.
#[test]
fn render_page_with_config_preserve_cmyk_keeps_device_cmyk_image_bytes() {
    let doc = PdfDocument::open(cmyk_image_pdf_bytes()).expect("open cmyk image fixture");

    let cmyk_config = RenderConfig {
        color_mode: ColorMode::PreserveCmyk,
        dpi: 72,
    };
    let rendered = doc
        .render_page_with_config(0, &cmyk_config)
        .expect("cmyk image render succeeds");

    assert_eq!(rendered.width, 2);
    assert_eq!(rendered.height, 1);

    let left_pixel = pixel_at(&rendered, 0, 0);
    assert_eq!(left_pixel, [255, 0, 0, 0]);

    let right_pixel = pixel_at(&rendered, 1, 0);
    assert_eq!(right_pixel, [0, 255, 0, 0]);
}
1632}