Skip to main content

pdf_engine/
document.rs

1//! Unified document facade — multi-page rendering, text extraction,
2//! metadata, bookmarks, and thumbnails.
3
4use crate::error::{EngineError, Result};
5use crate::geometry::{self, PageGeometry};
6use crate::limits::ProcessingLimits;
7use crate::render::{self, ColorMode, RenderConfig, RenderOptions, RenderedPage};
8use crate::text::{TextBlock, TextExtractionDevice};
9use crate::thumbnail::ThumbnailOptions;
10
11use pdf_forms::parse::parse_acroform;
12use pdf_forms::tree::{FieldType, FieldValue};
13use pdf_render::pdf_interpret::PageExt;
14use pdf_render::pdf_interpret::{interpret_page, Context, InterpreterSettings};
15use pdf_render::pdf_syntax::object::dict::keys::{FIRST, NEXT, OUTLINES, TITLE};
16use pdf_render::pdf_syntax::object::Dict;
17use pdf_render::pdf_syntax::page::Page;
18use pdf_render::pdf_syntax::{Pdf, PdfLoadLimits};
19#[cfg(feature = "parallel")]
20use rayon::prelude::*;
21
22use kurbo::Rect;
23
/// Document metadata extracted from the info dictionary.
///
/// Every field is optional: a field is `None` when the corresponding
/// metadata entry is not present. The type is comparable so callers and
/// tests can assert on a whole metadata record at once.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DocumentInfo {
    /// Document title.
    pub title: Option<String>,
    /// Name of the document author.
    pub author: Option<String>,
    /// Subject of the document.
    pub subject: Option<String>,
    /// Keywords associated with the document.
    pub keywords: Option<String>,
    /// Application that created the original document.
    pub creator: Option<String>,
    /// Application that produced the PDF.
    pub producer: Option<String>,
}
40
/// A bookmark / outline item.
///
/// Items form a tree: each item owns its nested children. The type is
/// comparable so outline trees can be checked for equality in one assertion.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BookmarkItem {
    /// Bookmark title.
    pub title: String,
    /// Target page index (0-based), if resolvable; `None` when the
    /// destination could not be (or was not) resolved.
    pub page: Option<usize>,
    /// Nested child bookmarks, in document order.
    pub children: Vec<BookmarkItem>,
}
51
/// High-level PDF document handle.
///
/// Pairs the parsed PDF with the interpreter settings that the rendering and
/// text-extraction methods pass down to the interpreter.
pub struct PdfDocument {
    // Parsed document; exposed read-only through `pdf()`.
    pdf: Pdf,
    // Interpreter configuration; replaced wholesale by `set_settings()`.
    settings: InterpreterSettings,
}
57
58impl PdfDocument {
59    /// Open a PDF from bytes.
60    pub fn open(data: impl Into<pdf_render::pdf_syntax::PdfData>) -> Result<Self> {
61        let pdf = Pdf::new(data).map_err(|e| match e {
62            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
63                EngineError::Encrypted(format!("{d:?}"))
64            }
65            _ => EngineError::InvalidPdf(format!("{e:?}")),
66        })?;
67        Ok(Self {
68            pdf,
69            settings: InterpreterSettings::default(),
70        })
71    }
72
73    /// Open a PDF from bytes with processing limits.
74    pub fn open_with_processing_limits(
75        data: impl Into<pdf_render::pdf_syntax::PdfData>,
76        limits: ProcessingLimits,
77    ) -> Result<Self> {
78        let syntax_limits = PdfLoadLimits::new()
79            .max_object_depth(limits.max_object_depth)
80            .max_image_pixels(limits.max_image_pixels)
81            .max_stream_bytes(limits.max_stream_bytes);
82        let pdf = Pdf::new_with_limits(data, syntax_limits).map_err(|e| match e {
83            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
84                EngineError::Encrypted(format!("{d:?}"))
85            }
86            _ => EngineError::InvalidPdf(format!("{e:?}")),
87        })?;
88        let settings = InterpreterSettings {
89            max_operator_count: Some(limits.max_operator_count),
90            ..InterpreterSettings::default()
91        };
92        Ok(Self { pdf, settings })
93    }
94
95    /// Open a password-protected PDF.
96    pub fn open_with_password(
97        data: impl Into<pdf_render::pdf_syntax::PdfData>,
98        password: &str,
99    ) -> Result<Self> {
100        let pdf = Pdf::new_with_password(data, password).map_err(|e| match e {
101            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
102                EngineError::Encrypted(format!("{d:?}"))
103            }
104            _ => EngineError::InvalidPdf(format!("{e:?}")),
105        })?;
106        Ok(Self {
107            pdf,
108            settings: InterpreterSettings::default(),
109        })
110    }
111
112    /// Open a password-protected PDF with processing limits.
113    pub fn open_with_password_and_processing_limits(
114        data: impl Into<pdf_render::pdf_syntax::PdfData>,
115        password: &str,
116        limits: ProcessingLimits,
117    ) -> Result<Self> {
118        let syntax_limits = PdfLoadLimits::new()
119            .max_object_depth(limits.max_object_depth)
120            .max_image_pixels(limits.max_image_pixels)
121            .max_stream_bytes(limits.max_stream_bytes);
122        let pdf = Pdf::new_with_password_and_limits(data, password, syntax_limits).map_err(
123            |e| match e {
124                pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
125                    EngineError::Encrypted(format!("{d:?}"))
126                }
127                _ => EngineError::InvalidPdf(format!("{e:?}")),
128            },
129        )?;
130        let settings = InterpreterSettings {
131            max_operator_count: Some(limits.max_operator_count),
132            ..InterpreterSettings::default()
133        };
134        Ok(Self { pdf, settings })
135    }
136
    /// Access the underlying parsed PDF.
    ///
    /// Returns a shared borrow; the document keeps ownership.
    pub fn pdf(&self) -> &Pdf {
        &self.pdf
    }
141
    /// Set interpreter settings (font resolver, cmap resolver, etc.).
    ///
    /// The previous settings are replaced as a whole, including any
    /// `max_operator_count` installed by the `*_processing_limits`
    /// constructors; carry that value over if the limit must stay in force.
    pub fn set_settings(&mut self, settings: InterpreterSettings) {
        self.settings = settings;
    }
146
    /// Number of pages.
    ///
    /// Valid page indices for the other methods are `0..page_count()`.
    pub fn page_count(&self) -> usize {
        self.pdf.pages().len()
    }
151
    /// Get the geometry of a page.
    ///
    /// # Errors
    ///
    /// Returns [`EngineError::PageOutOfRange`] when `index` is not a valid
    /// 0-based page index.
    pub fn page_geometry(&self, index: usize) -> Result<PageGeometry> {
        let page = self.get_page(index)?;
        Ok(geometry::extract_geometry(page))
    }
157
158    /// Render a single page.
159    ///
160    /// If the document contains an XFA template, it is automatically flattened
161    /// to static PDF content before rendering.  This prevents the "Please wait"
162    /// placeholder page that Adobe Reader would show when rendering an XFA PDF
163    /// with a conventional renderer. If flattening fails, rendering falls back
164    /// to the original document as a best-effort path.
165    pub fn render_page(&self, index: usize, options: &RenderOptions) -> Result<RenderedPage> {
166        #[cfg(feature = "xfa")]
167        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
168            return flat_doc.render_page(index, options);
169        }
170        let page = self.get_page(index)?;
171        // Pre-flight: reject pathologically small or zero-dimension pages before
172        // allocating any pixel buffer. Non-positive dimensions cause panics or
173        // zero-sized allocations inside the rasteriser.
174        let (w, h) = page.render_dimensions();
175        if w <= 0.0 || h <= 0.0 {
176            return Err(EngineError::InvalidPageGeometry {
177                width: w,
178                height: h,
179                reason: "page has zero or negative dimensions".into(),
180            });
181        }
182        // Also reject pages so small they produce zero pixels even at the
183        // minimum meaningful DPI (1 DPI). Below ~0.72pt at 1 DPI = 0 pixels.
184        const MIN_PAGE_PT: f32 = 1.0;
185        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
186            return Err(EngineError::InvalidPageGeometry {
187                width: w,
188                height: h,
189                reason: "page too small to render (< 1pt)".into(),
190            });
191        }
192        Ok(render::render_page(page, options, &self.settings))
193    }
194
195    /// Render a single page using the high-level render config.
196    ///
197    /// XFA documents are auto-flattened before rendering (same as `render_page`).
198    /// If flattening fails, rendering falls back to the original document.
199    pub fn render_page_with_config(
200        &self,
201        index: usize,
202        config: &RenderConfig,
203    ) -> Result<RenderedPage> {
204        #[cfg(feature = "xfa")]
205        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
206            return flat_doc.render_page_with_config(index, config);
207        }
208        let page = self.get_page(index)?;
209        let (w, h) = page.render_dimensions();
210        if w <= 0.0 || h <= 0.0 {
211            return Err(EngineError::InvalidPageGeometry {
212                width: w,
213                height: h,
214                reason: "page has zero or negative dimensions".into(),
215            });
216        }
217        const MIN_PAGE_PT: f32 = 1.0;
218        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
219            return Err(EngineError::InvalidPageGeometry {
220                width: w,
221                height: h,
222                reason: "page too small to render (< 1pt)".into(),
223            });
224        }
225        Ok(render::render_page_with_config(
226            page,
227            config,
228            &self.settings,
229        ))
230    }
231
232    /// Render a single page to a CMYK buffer.
233    pub fn render_page_cmyk(&self, index: usize, dpi: u32) -> Result<RenderedPage> {
234        self.render_page_with_config(
235            index,
236            &RenderConfig {
237                color_mode: ColorMode::PreserveCmyk,
238                dpi,
239            },
240        )
241    }
242
243    /// Render all pages, in parallel when the `parallel` feature is enabled.
244    pub fn render_all(&self, options: &RenderOptions) -> Vec<RenderedPage> {
245        let pages = self.pdf.pages();
246        #[cfg(feature = "parallel")]
247        return (0..pages.len())
248            .into_par_iter()
249            .map(|i| render::render_page(&pages[i], options, &self.settings))
250            .collect();
251        #[cfg(not(feature = "parallel"))]
252        (0..pages.len())
253            .map(|i| render::render_page(&pages[i], options, &self.settings))
254            .collect()
255    }
256
257    /// Render all pages using the high-level render config.
258    pub fn render_all_with_config(&self, config: &RenderConfig) -> Vec<RenderedPage> {
259        let pages = self.pdf.pages();
260        #[cfg(feature = "parallel")]
261        return (0..pages.len())
262            .into_par_iter()
263            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
264            .collect();
265        #[cfg(not(feature = "parallel"))]
266        (0..pages.len())
267            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
268            .collect()
269    }
270
271    /// Generate a thumbnail for a single page.
272    pub fn thumbnail(&self, index: usize, options: &ThumbnailOptions) -> Result<RenderedPage> {
273        let page = self.get_page(index)?;
274        Ok(render::render_thumbnail(
275            page,
276            options.max_dimension,
277            &self.settings,
278        ))
279    }
280
281    /// Generate thumbnails for all pages, in parallel when the `parallel` feature is enabled.
282    pub fn thumbnails_all(&self, options: &ThumbnailOptions) -> Vec<RenderedPage> {
283        let pages = self.pdf.pages();
284        #[cfg(feature = "parallel")]
285        return (0..pages.len())
286            .into_par_iter()
287            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
288            .collect();
289        #[cfg(not(feature = "parallel"))]
290        (0..pages.len())
291            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
292            .collect()
293    }
294
295    /// Extract text from a page as a single string.
296    pub fn extract_text(&self, index: usize) -> Result<String> {
297        let page = self.get_page(index)?;
298        let mut device = TextExtractionDevice::new();
299        let mut ctx = self.create_context(page);
300        interpret_page(page, &mut ctx, &mut device);
301        Ok(device.into_text())
302    }
303
304    /// Extract text from a sequence of pages while reusing the same settings object.
305    #[doc(hidden)]
306    pub fn extract_text_pages_reusing_settings<I>(&self, indices: I) -> Result<Vec<String>>
307    where
308        I: IntoIterator<Item = usize>,
309    {
310        let pages = self.pdf.pages();
311        let mut settings = self.text_extraction_settings();
312        let indices = indices.into_iter();
313        let (lower_bound, upper_bound) = indices.size_hint();
314        let mut texts = Vec::with_capacity(upper_bound.unwrap_or(lower_bound));
315
316        for index in indices {
317            let page = pages.get(index).ok_or(EngineError::PageOutOfRange {
318                index,
319                count: pages.len(),
320            })?;
321            let (text, next_settings) = Self::extract_text_with_settings(page, settings);
322            settings = next_settings;
323            texts.push(text);
324        }
325
326        Ok(texts)
327    }
328
329    /// Extract structured text blocks from a page.
330    pub fn extract_text_blocks(&self, index: usize) -> Result<Vec<TextBlock>> {
331        let page = self.get_page(index)?;
332        let mut device = TextExtractionDevice::new();
333        let mut ctx = self.create_context(page);
334        interpret_page(page, &mut ctx, &mut device);
335        Ok(device.into_blocks())
336    }
337
338    /// Extract structured text blocks from all pages, reusing interpreter settings.
339    pub fn extract_all_text_blocks(&self) -> Vec<Vec<TextBlock>> {
340        let pages = self.pdf.pages();
341        let mut settings = self.text_extraction_settings();
342        let mut blocks = Vec::with_capacity(pages.len());
343
344        for page in pages.iter() {
345            let (page_blocks, next_settings) =
346                Self::extract_text_blocks_with_settings(page, settings);
347            settings = next_settings;
348            blocks.push(page_blocks);
349        }
350
351        blocks
352    }
353
354    /// Extract text values from AcroForm fields, including push-button captions.
355    ///
356    /// Returns a single string concatenating all non-empty field values separated
357    /// by newlines. Useful when the document stores its readable content in form
358    /// field values rather than (or in addition to) page content streams.
359    pub fn extract_acroform_text(&self) -> String {
360        let Some(tree) = parse_acroform(&self.pdf) else {
361            return String::new();
362        };
363        let mut parts: Vec<String> = Vec::new();
364        for id in tree.all_ids() {
365            let node = tree.get(id);
366            if node.children.is_empty() {
367                // Terminal (widget) — collect text-like values.
368                let value_str = match &node.value {
369                    Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.clone()),
370                    Some(FieldValue::StringArray(arr)) => {
371                        let joined = arr
372                            .iter()
373                            .filter(|s| !s.is_empty())
374                            .cloned()
375                            .collect::<Vec<_>>()
376                            .join(", ");
377                        if joined.is_empty() {
378                            None
379                        } else {
380                            Some(joined)
381                        }
382                    }
383                    _ => None,
384                };
385                let button_caption =
386                    value_str.is_none() && tree.effective_field_type(id) == Some(FieldType::Button);
387                let extracted = value_str.or_else(|| {
388                    button_caption.then(|| {
389                        node.mk
390                            .as_ref()
391                            .and_then(|mk| mk.caption.as_ref())
392                            .filter(|caption| !caption.is_empty())
393                            .cloned()
394                    })?
395                });
396                if let Some(s) = extracted {
397                    parts.push(s);
398                }
399            }
400        }
401        parts.join("\n")
402    }
403
404    /// Extract all text from the document: page content streams plus AcroForm
405    /// field values.  Mirrors pdftotext behaviour.
406    pub fn extract_all_text(&self) -> String {
407        let pages = self.pdf.pages();
408        let mut settings = self.text_extraction_settings();
409        let mut page_texts = Vec::with_capacity(pages.len());
410        for page in pages.iter() {
411            let (page_text, next_settings) = Self::extract_text_with_settings(page, settings);
412            settings = next_settings;
413            page_texts.push(page_text);
414        }
415
416        let mut text = join_page_texts(page_texts.iter().map(String::as_str));
417        let acroform = self.extract_acroform_text();
418        if !acroform.is_empty() {
419            if !text.is_empty() && !text.ends_with('\n') {
420                text.push('\n');
421            }
422            text.push_str(&acroform);
423        }
424        text
425    }
426
427    /// Simple text search: returns page indices containing the query string.
428    pub fn search_text(&self, query: &str) -> Vec<usize> {
429        let pages = self.pdf.pages();
430        let query_lower = query.to_lowercase();
431        #[cfg(feature = "parallel")]
432        let page_contains = |i: usize| -> Option<usize> {
433            let page = &pages[i];
434            let (text, _) = Self::extract_text_with_settings(page, self.text_extraction_settings());
435            if text.to_lowercase().contains(&query_lower) {
436                Some(i)
437            } else {
438                None
439            }
440        };
441        #[cfg(feature = "parallel")]
442        return (0..pages.len())
443            .into_par_iter()
444            .filter_map(page_contains)
445            .collect();
446        #[cfg(not(feature = "parallel"))]
447        {
448            let mut settings = self.text_extraction_settings();
449            let mut hits = Vec::new();
450            for (i, page) in pages.iter().enumerate() {
451                let (text, next_settings) = Self::extract_text_with_settings(page, settings);
452                settings = next_settings;
453                if text.to_lowercase().contains(&query_lower) {
454                    hits.push(i);
455                }
456            }
457            hits
458        }
459    }
460
    /// Extract document metadata.
    ///
    /// Each present metadata entry is decoded with `bytes_to_string`; absent
    /// entries stay `None`.
    pub fn info(&self) -> DocumentInfo {
        let meta = self.pdf.metadata();
        DocumentInfo {
            title: meta.title.as_ref().map(|b| bytes_to_string(b)),
            author: meta.author.as_ref().map(|b| bytes_to_string(b)),
            subject: meta.subject.as_ref().map(|b| bytes_to_string(b)),
            keywords: meta.keywords.as_ref().map(|b| bytes_to_string(b)),
            creator: meta.creator.as_ref().map(|b| bytes_to_string(b)),
            producer: meta.producer.as_ref().map(|b| bytes_to_string(b)),
        }
    }
473
474    /// Extract document outline / bookmarks.
475    pub fn bookmarks(&self) -> Vec<BookmarkItem> {
476        let xref = self.pdf.xref();
477        let root_id = xref.root_id();
478        let catalog: Dict<'_> = match xref.get(root_id) {
479            Some(d) => d,
480            None => return Vec::new(),
481        };
482
483        let outlines: Dict<'_> = match catalog.get(OUTLINES) {
484            Some(d) => d,
485            None => return Vec::new(),
486        };
487
488        let first: Dict<'_> = match outlines.get(FIRST) {
489            Some(d) => d,
490            None => return Vec::new(),
491        };
492
493        parse_outline_items(&first)
494    }
495
496    /// Run OCR on a page and return the recognized text and word positions.
497    ///
498    /// The page is rendered at `dpi` (default 150) before recognition.
499    /// Pass any [`OcrBackend`] implementation; use [`OcrsBackend::try_default`]
500    /// to load the pure-Rust `ocrs` engine from the standard model paths.
501    ///
502    /// # Example
503    ///
504    /// ```no_run
505    /// # #[cfg(feature = "ocr")] {
506    /// use pdf_engine::{PdfDocument, OcrsBackend, RenderOptions};
507    ///
508    /// let doc = PdfDocument::open(std::fs::read("scan.pdf").unwrap()).unwrap();
509    /// let backend = OcrsBackend::try_default().unwrap();
510    /// let result = doc.ocr_page(0, &backend, 150.0_f64).unwrap();
511    /// println!("{}", result.text);
512    /// # }
513    /// ```
514    pub fn ocr_page(
515        &self,
516        index: usize,
517        backend: &dyn crate::ocr::OcrBackend,
518        dpi: f64,
519    ) -> crate::error::Result<crate::ocr::OcrResult> {
520        let opts = crate::render::RenderOptions {
521            dpi,
522            ..Default::default()
523        };
524        let rendered = self.render_page(index, &opts)?;
525
526        // Convert RGBA → RGB (ocrs expects RGB input).
527        let mut rgb = Vec::with_capacity((rendered.width * rendered.height * 3) as usize);
528        for chunk in rendered.pixels.chunks(4) {
529            rgb.push(chunk[0]);
530            rgb.push(chunk[1]);
531            rgb.push(chunk[2]);
532        }
533
534        backend
535            .recognize(&rgb, rendered.width, rendered.height)
536            .map_err(|e| crate::error::EngineError::RenderError(e.to_string()))
537    }
538
539    fn get_page(&self, index: usize) -> Result<&Page<'_>> {
540        let pages = self.pdf.pages();
541        if index >= pages.len() {
542            return Err(EngineError::PageOutOfRange {
543                index,
544                count: pages.len(),
545            });
546        }
547        Ok(&pages[index])
548    }
549
550    fn text_extraction_settings(&self) -> InterpreterSettings {
551        let mut settings = self.settings.clone();
552        // Text extraction should include signature widget appearance streams
553        // that rendering skips to match MuPDF visual output.
554        settings.skip_signature_widgets = false;
555        settings
556    }
557
    /// Create an interpreter context for `page` using the text-extraction
    /// variant of this document's settings.
    fn create_context<'a>(&self, page: &Page<'a>) -> Context<'a> {
        Self::create_context_with_settings(page, self.text_extraction_settings())
    }
561
562    fn create_context_with_settings<'a>(
563        page: &Page<'a>,
564        settings: InterpreterSettings,
565    ) -> Context<'a> {
566        let (w, h) = page.render_dimensions();
567        Context::new(
568            page.initial_transform(false),
569            Rect::new(0.0, 0.0, w as f64, h as f64),
570            page.xref(),
571            settings,
572        )
573    }
574
575    fn extract_text_with_settings<'a>(
576        page: &Page<'a>,
577        settings: InterpreterSettings,
578    ) -> (String, InterpreterSettings) {
579        let mut device = TextExtractionDevice::new();
580        let mut ctx = Self::create_context_with_settings(page, settings);
581        interpret_page(page, &mut ctx, &mut device);
582        let settings = ctx.into_settings();
583        (device.into_text(), settings)
584    }
585
586    fn extract_text_blocks_with_settings<'a>(
587        page: &Page<'a>,
588        settings: InterpreterSettings,
589    ) -> (Vec<TextBlock>, InterpreterSettings) {
590        let mut device = TextExtractionDevice::new();
591        let mut ctx = Self::create_context_with_settings(page, settings);
592        interpret_page(page, &mut ctx, &mut device);
593        let settings = ctx.into_settings();
594        (device.into_blocks(), settings)
595    }
596
597    #[cfg(feature = "xfa")]
598    fn open_flattened_xfa_for_render(&self) -> Option<Self> {
599        if !crate::xfa::has_xfa(self) {
600            return None;
601        }
602
603        let flat_bytes = crate::xfa::flatten(self).ok()?;
604        let mut flat_doc = Self::open(flat_bytes).ok()?;
605        flat_doc.settings = self.settings.clone();
606        Some(flat_doc)
607    }
608}
609
/// Join per-page texts into one string, separating pages with a form feed.
///
/// Before each form feed that follows a page with text, the output is padded
/// with newlines until it ends in a blank line (`"\n\n"`). A blank page
/// contributes only its form feed: no padding is inserted after it, whether
/// the blank page is leading or sits between other pages.
fn join_page_texts<I>(page_texts: I) -> String
where
    I: IntoIterator,
    I::Item: AsRef<str>,
{
    let mut text = String::new();
    // `None` until the first page has been appended; afterwards records
    // whether the most recently appended page was blank.
    let mut previous_blank: Option<bool> = None;

    for page_text in page_texts {
        let page_text = page_text.as_ref();
        if let Some(was_blank) = previous_blank {
            // Only pad after a page that actually contributed text. Padding
            // after a blank page would put stray newlines behind the previous
            // form feed.
            if !was_blank {
                while !text.ends_with("\n\n") {
                    text.push('\n');
                }
            }
            text.push('\u{000C}');
        }
        text.push_str(page_text);
        previous_blank = Some(page_text.is_empty());
    }

    text
}
631
// Unit tests pinning the page-joining format used by `extract_all_text`.
#[cfg(test)]
mod extract_all_text_tests {
    use super::join_page_texts;

    // Two non-empty pages: a blank line, then a form feed, then the next page.
    #[test]
    fn separates_nonempty_pages_like_pdftotext() {
        assert_eq!(
            join_page_texts(["Page 1", "Page 2"]),
            "Page 1\n\n\u{000C}Page 2"
        );
    }

    // A leading blank page contributes only the form feed.
    #[test]
    fn preserves_leading_blank_pages_without_extra_newlines() {
        assert_eq!(join_page_texts(["", "Page 2"]), "\u{000C}Page 2");
    }

    // A page already ending in a blank line is not padded any further.
    #[test]
    fn reuses_existing_blank_line_before_form_feed() {
        assert_eq!(
            join_page_texts(["Page 1\n\n", "Page 2"]),
            "Page 1\n\n\u{000C}Page 2"
        );
    }
}
657
658/// Walk the outline linked list (FIRST → NEXT chain).
659fn parse_outline_items(item_dict: &Dict<'_>) -> Vec<BookmarkItem> {
660    let mut items = Vec::new();
661    let mut current: Option<Dict<'_>> = Some(item_dict.clone());
662
663    while let Some(dict) = current {
664        let title = dict
665            .get::<pdf_render::pdf_syntax::object::String>(TITLE)
666            .map(|s| bytes_to_string(s.as_bytes()))
667            .unwrap_or_default();
668
669        let children = match dict.get::<Dict<'_>>(FIRST) {
670            Some(child_dict) => parse_outline_items(&child_dict),
671            None => Vec::new(),
672        };
673
674        items.push(BookmarkItem {
675            title,
676            page: None, // Destination resolution requires named-dest lookup — left for follow-up
677            children,
678        });
679
680        current = dict.get::<Dict<'_>>(NEXT);
681    }
682
683    items
684}
685
/// Convert PDF string bytes to a Rust String (UTF-8 with Latin-1 fallback).
///
/// Decoding rules, in order:
/// 1. A leading `FE FF` byte-order mark selects UTF-16BE; invalid code units
///    are replaced lossily and a trailing odd byte is ignored.
/// 2. Valid UTF-8 is used as-is, minus a leading UTF-8 byte-order mark
///    (`EF BB BF`), which marks the encoding and is not part of the text.
/// 3. Anything else is decoded byte-for-byte as Latin-1.
fn bytes_to_string(bytes: &[u8]) -> String {
    // UTF-16BE, announced by its byte-order mark.
    if let Some(payload) = bytes.strip_prefix(&[0xFE_u8, 0xFF]) {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    // Try UTF-8, fall back to Latin-1.
    match std::str::from_utf8(bytes) {
        // The BOM is only dropped when the whole string is valid UTF-8, so the
        // Latin-1 fallback still sees every original byte.
        Ok(text) => text.strip_prefix('\u{FEFF}').unwrap_or(text).to_owned(),
        Err(_) => bytes.iter().map(|&b| char::from(b)).collect(),
    }
}
709
710#[cfg(test)]
711mod tests {
712    use super::*;
713    use crate::render::{ColorMode, PixelFormat, RenderConfig, RenderOptions};
714    use lopdf::{Document as LoDocument, Object};
715    use std::path::PathBuf;
716
717    fn corpus_path(name: &str) -> PathBuf {
718        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
719            .join("../../corpus")
720            .join(name)
721    }
722
723    fn normalize_text(text: &str) -> String {
724        text.split_whitespace().collect::<Vec<_>>().join(" ")
725    }
726
    /// Test helper: remove the `ToUnicode` entry from every Type0 font that
    /// uses an `Identity-H` / `Identity-V` encoding and whose first descendant
    /// is a `CIDFontType2` font.
    ///
    /// Returns the re-serialized PDF bytes and the number of `ToUnicode`
    /// entries that were removed. Panics if `data` cannot be loaded or the
    /// modified document cannot be saved.
    fn strip_type0_tounicode(data: &[u8]) -> (Vec<u8>, usize) {
        // Read `key` from `dict` as a name object; `None` when the key is
        // missing or holds another object type.
        fn get_name(dict: &lopdf::Dictionary, key: &[u8]) -> Option<Vec<u8>> {
            match dict.get(key).ok()? {
                Object::Name(name) => Some(name.clone()),
                _ => None,
            }
        }

        // True when the first `DescendantFonts` entry is a reference to a
        // dictionary whose `Subtype` is `CIDFontType2`.
        fn descendant_is_cidfont_type2(doc: &LoDocument, type0: &lopdf::Dictionary) -> bool {
            let Some(Object::Array(descendants)) = type0.get(b"DescendantFonts").ok() else {
                return false;
            };
            let Some(Object::Reference(desc_id)) = descendants.first() else {
                return false;
            };
            let Ok(Object::Dictionary(descendant)) = doc.get_object(*desc_id) else {
                return false;
            };
            matches!(
                descendant.get(b"Subtype").ok(),
                Some(Object::Name(name)) if name.as_slice() == b"CIDFontType2"
            )
        }

        let mut doc = LoDocument::load_mem(data).expect("load stripped-to-unicode fixture");
        // Snapshot the ids first so the object map can be mutated in the loop.
        let ids: Vec<_> = doc.objects.keys().copied().collect();
        let mut removed = 0usize;

        for id in ids {
            let Some(Object::Dictionary(dict)) = doc.objects.get(&id) else {
                continue;
            };
            // Only Type0 (composite) fonts are candidates.
            if !matches!(
                dict.get(b"Subtype").ok(),
                Some(Object::Name(name)) if name.as_slice() == b"Type0"
            ) {
                continue;
            }
            // ...and only those with an identity encoding.
            if !matches!(
                get_name(dict, b"Encoding").as_deref(),
                Some(b"Identity-H") | Some(b"Identity-V")
            ) {
                continue;
            }
            if !descendant_is_cidfont_type2(&doc, dict) {
                continue;
            }

            // Re-borrow mutably now that the read-only checks are done.
            if let Some(Object::Dictionary(type0)) = doc.objects.get_mut(&id) {
                if type0.has(b"ToUnicode") {
                    type0.remove(b"ToUnicode");
                    removed += 1;
                }
            }
        }

        let mut out = Vec::new();
        doc.save_to(&mut out)
            .expect("save stripped-to-unicode fixture");
        (out, removed)
    }
788
789    fn solid_fill_pdf_bytes(color_operator: &str) -> Vec<u8> {
790        use lopdf::{dictionary, Document, Object, Stream};
791
792        let mut doc = Document::with_version("1.4");
793
794        let pages_id = doc.new_object_id();
795        let page_id = doc.new_object_id();
796        let content = format!("{color_operator}\n0 0 72 72 re f\n");
797        let content_id = doc.add_object(Stream::new(dictionary! {}, content.into_bytes()));
798
799        doc.objects.insert(
800            page_id,
801            Object::Dictionary(dictionary! {
802                "Type" => Object::Name(b"Page".to_vec()),
803                "Parent" => Object::Reference(pages_id),
804                "MediaBox" => Object::Array(vec![
805                    Object::Integer(0),
806                    Object::Integer(0),
807                    Object::Integer(72),
808                    Object::Integer(72),
809                ]),
810                "Contents" => Object::Reference(content_id),
811            }),
812        );
813
814        doc.objects.insert(
815            pages_id,
816            Object::Dictionary(dictionary! {
817                "Type" => Object::Name(b"Pages".to_vec()),
818                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
819                "Count" => Object::Integer(1),
820            }),
821        );
822
823        let catalog_id = doc.new_object_id();
824        doc.objects.insert(
825            catalog_id,
826            Object::Dictionary(dictionary! {
827                "Type" => Object::Name(b"Catalog".to_vec()),
828                "Pages" => Object::Reference(pages_id),
829            }),
830        );
831
832        doc.trailer.set("Root", Object::Reference(catalog_id));
833
834        let mut bytes = Vec::new();
835        doc.save_to(&mut bytes).expect("save solid fill fixture");
836        bytes
837    }
838
839    fn mixed_rgb_cmyk_pdf_bytes() -> Vec<u8> {
840        use lopdf::{dictionary, Document, Object, Stream};
841
842        let mut doc = Document::with_version("1.4");
843        let pages_id = doc.new_object_id();
844        let page_id = doc.new_object_id();
845        let content = b"1 0 0 rg\n0 0 36 72 re f\n1 0 0 0 k\n36 0 36 72 re f\n".to_vec();
846        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
847
848        doc.objects.insert(
849            page_id,
850            Object::Dictionary(dictionary! {
851                "Type" => "Page",
852                "Parent" => Object::Reference(pages_id),
853                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
854                "Contents" => Object::Reference(content_id),
855            }),
856        );
857        doc.objects.insert(
858            pages_id,
859            Object::Dictionary(dictionary! {
860                "Type" => "Pages",
861                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
862                "Count" => Object::Integer(1),
863            }),
864        );
865        let catalog_id = doc.new_object_id();
866        doc.objects.insert(
867            catalog_id,
868            Object::Dictionary(dictionary! {
869                "Type" => "Catalog",
870                "Pages" => Object::Reference(pages_id),
871            }),
872        );
873        doc.trailer.set("Root", Object::Reference(catalog_id));
874
875        let mut bytes = Vec::new();
876        doc.save_to(&mut bytes)
877            .expect("save mixed rgb/cmyk fixture");
878        bytes
879    }
880
881    fn transparent_cmyk_pdf_bytes() -> Vec<u8> {
882        use lopdf::{dictionary, Document, Object, Stream};
883
884        let mut doc = Document::with_version("1.4");
885        let pages_id = doc.new_object_id();
886        let page_id = doc.new_object_id();
887        let gs_id = doc.add_object(Object::Dictionary(dictionary! {
888            "Type" => "ExtGState",
889            "ca" => Object::Real(0.5),
890        }));
891        let content = b"/GS1 gs\n1 0 0 0 k\n0 0 72 72 re f\n".to_vec();
892        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
893
894        doc.objects.insert(
895            page_id,
896            Object::Dictionary(dictionary! {
897                "Type" => "Page",
898                "Parent" => Object::Reference(pages_id),
899                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
900                "Resources" => dictionary! {
901                    "ExtGState" => dictionary! {
902                        "GS1" => Object::Reference(gs_id),
903                    },
904                },
905                "Contents" => Object::Reference(content_id),
906            }),
907        );
908        doc.objects.insert(
909            pages_id,
910            Object::Dictionary(dictionary! {
911                "Type" => "Pages",
912                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
913                "Count" => Object::Integer(1),
914            }),
915        );
916        let catalog_id = doc.new_object_id();
917        doc.objects.insert(
918            catalog_id,
919            Object::Dictionary(dictionary! {
920                "Type" => "Catalog",
921                "Pages" => Object::Reference(pages_id),
922            }),
923        );
924        doc.trailer.set("Root", Object::Reference(catalog_id));
925
926        let mut bytes = Vec::new();
927        doc.save_to(&mut bytes)
928            .expect("save transparent cmyk fixture");
929        bytes
930    }
931
932    fn cmyk_image_pdf_bytes() -> Vec<u8> {
933        use lopdf::{dictionary, Document, Object, Stream};
934
935        let mut doc = Document::with_version("1.4");
936        let pages_id = doc.new_object_id();
937        let page_id = doc.new_object_id();
938        let image_id = doc.add_object(Stream::new(
939            dictionary! {
940                "Type" => "XObject",
941                "Subtype" => "Image",
942                "Width" => Object::Integer(2),
943                "Height" => Object::Integer(1),
944                "BitsPerComponent" => Object::Integer(8),
945                "ColorSpace" => "DeviceCMYK",
946            },
947            vec![255, 0, 0, 0, 0, 255, 0, 0],
948        ));
949        let content = b"q\n2 0 0 1 0 0 cm\n/Im1 Do\nQ\n".to_vec();
950        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
951
952        doc.objects.insert(
953            page_id,
954            Object::Dictionary(dictionary! {
955                "Type" => "Page",
956                "Parent" => Object::Reference(pages_id),
957                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 2.into(), 1.into()]),
958                "Resources" => dictionary! {
959                    "XObject" => dictionary! {
960                        "Im1" => Object::Reference(image_id),
961                    },
962                },
963                "Contents" => Object::Reference(content_id),
964            }),
965        );
966        doc.objects.insert(
967            pages_id,
968            Object::Dictionary(dictionary! {
969                "Type" => "Pages",
970                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
971                "Count" => Object::Integer(1),
972            }),
973        );
974        let catalog_id = doc.new_object_id();
975        doc.objects.insert(
976            catalog_id,
977            Object::Dictionary(dictionary! {
978                "Type" => "Catalog",
979                "Pages" => Object::Reference(pages_id),
980            }),
981        );
982        doc.trailer.set("Root", Object::Reference(catalog_id));
983
984        let mut bytes = Vec::new();
985        doc.save_to(&mut bytes).expect("save cmyk image fixture");
986        bytes
987    }
988
989    fn pixel_at(rendered: &RenderedPage, x: u32, y: u32) -> [u8; 4] {
990        let idx = ((y * rendered.width + x) * 4) as usize;
991        [
992            rendered.pixels[idx],
993            rendered.pixels[idx + 1],
994            rendered.pixels[idx + 2],
995            rendered.pixels[idx + 3],
996        ]
997    }
998
999    /// Build a minimal one-page PDF whose only font is a non-embedded TrueType
1000    /// reference (no `FontFile2`). The character codes in the content stream
1001    /// resolve through the declared `/Encoding`, exercising the same code path
1002    /// as corpus PDFs like `171_171940.pdf`.
1003    fn non_embedded_truetype_pdf_bytes(
1004        base_font: &[u8],
1005        encoding: &[u8],
1006        text_bytes: &[u8],
1007    ) -> Vec<u8> {
1008        use lopdf::{dictionary, Document, Object, Stream};
1009
1010        let mut doc = Document::with_version("1.4");
1011
1012        let font_id = doc.add_object(Object::Dictionary(dictionary! {
1013            "Type" => "Font",
1014            "Subtype" => "TrueType",
1015            "Name" => Object::Name(b"F0".to_vec()),
1016            "BaseFont" => Object::Name(base_font.to_vec()),
1017            "Encoding" => Object::Name(encoding.to_vec()),
1018        }));
1019
1020        let resources_id = doc.add_object(Object::Dictionary(dictionary! {
1021            "Font" => dictionary! { "F0" => Object::Reference(font_id) },
1022        }));
1023
1024        let mut content = Vec::new();
1025        content.extend_from_slice(b"BT\n/F0 12 Tf\n100 700 Td\n(");
1026        for &b in text_bytes {
1027            match b {
1028                b'(' | b')' | b'\\' => {
1029                    content.push(b'\\');
1030                    content.push(b);
1031                }
1032                _ => content.push(b),
1033            }
1034        }
1035        content.extend_from_slice(b") Tj\nET\n");
1036        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1037
1038        let pages_id = doc.new_object_id();
1039        let page_id = doc.add_object(Object::Dictionary(dictionary! {
1040            "Type" => "Page",
1041            "Parent" => Object::Reference(pages_id),
1042            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1043            "Resources" => Object::Reference(resources_id),
1044            "Contents" => Object::Reference(content_id),
1045        }));
1046        doc.objects.insert(
1047            pages_id,
1048            Object::Dictionary(dictionary! {
1049                "Type" => "Pages",
1050                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1051                "Count" => Object::Integer(1),
1052            }),
1053        );
1054        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
1055            "Type" => "Catalog",
1056            "Pages" => Object::Reference(pages_id),
1057        }));
1058        doc.trailer.set("Root", Object::Reference(catalog_id));
1059
1060        let mut bytes = Vec::new();
1061        doc.save_to(&mut bytes).expect("save non-embedded fixture");
1062        bytes
1063    }
1064
1065    /// Build a minimal AcroForm push button whose only human-readable text
1066    /// lives in the widget `/MK /CA` caption entry.
1067    fn push_button_caption_pdf_bytes(caption: &[u8]) -> Vec<u8> {
1068        use lopdf::{dictionary, Document, Object, Stream, StringFormat};
1069
1070        let mut doc = Document::with_version("1.4");
1071
1072        let catalog_id = doc.new_object_id();
1073        let pages_id = doc.new_object_id();
1074        let page_id = doc.new_object_id();
1075        let acroform_id = doc.new_object_id();
1076        let content_id = doc.new_object_id();
1077        let widget_id = doc.new_object_id();
1078
1079        doc.objects.insert(
1080            content_id,
1081            Object::Stream(Stream::new(dictionary! {}, Vec::new())),
1082        );
1083        doc.objects.insert(
1084            widget_id,
1085            Object::Dictionary(dictionary! {
1086                "Type" => "Annot",
1087                "Subtype" => "Widget",
1088                "FT" => "Btn",
1089                "Ff" => Object::Integer(1 << 16),
1090                "T" => Object::String(b"button".to_vec(), StringFormat::Literal),
1091                "MK" => dictionary! {
1092                    "CA" => Object::String(caption.to_vec(), StringFormat::Literal),
1093                },
1094                "Rect" => Object::Array(vec![100.into(), 700.into(), 260.into(), 730.into()]),
1095                "P" => Object::Reference(page_id),
1096            }),
1097        );
1098        doc.objects.insert(
1099            page_id,
1100            Object::Dictionary(dictionary! {
1101                "Type" => "Page",
1102                "Parent" => Object::Reference(pages_id),
1103                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1104                "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
1105                "Contents" => Object::Reference(content_id),
1106            }),
1107        );
1108        doc.objects.insert(
1109            pages_id,
1110            Object::Dictionary(dictionary! {
1111                "Type" => "Pages",
1112                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1113                "Count" => Object::Integer(1),
1114            }),
1115        );
1116        doc.objects.insert(
1117            acroform_id,
1118            Object::Dictionary(dictionary! {
1119                "Fields" => Object::Array(vec![Object::Reference(widget_id)]),
1120            }),
1121        );
1122        doc.objects.insert(
1123            catalog_id,
1124            Object::Dictionary(dictionary! {
1125                "Type" => "Catalog",
1126                "Pages" => Object::Reference(pages_id),
1127                "AcroForm" => Object::Reference(acroform_id),
1128            }),
1129        );
1130        doc.trailer.set("Root", Object::Reference(catalog_id));
1131
1132        let mut bytes = Vec::new();
1133        doc.save_to(&mut bytes)
1134            .expect("save push-button caption fixture");
1135        bytes
1136    }
1137
1138    #[test]
1139    fn extract_text_non_embedded_truetype_alias_resolves_via_winansi() {
1140        // Mirrors corpus PDF `171_171940.pdf`: TrueType font references
1141        // `TimesNewRoman` (resolves through the standard-font alias table)
1142        // with `WinAnsiEncoding` and no embedded font program. Extraction must
1143        // recover the text from the declared encoding even though no glyph
1144        // outlines are available.
1145        let bytes = non_embedded_truetype_pdf_bytes(
1146            b"TimesNewRoman",
1147            b"WinAnsiEncoding",
1148            b"UNITED STATES DISTRICT COURT",
1149        );
1150        let text = PdfDocument::open(bytes)
1151            .expect("open non-embedded TrueType fixture")
1152            .extract_text(0)
1153            .expect("extract non-embedded TrueType text");
1154        let norm = normalize_text(&text);
1155        assert!(
1156            norm.contains("UNITED STATES DISTRICT COURT"),
1157            "expected WinAnsi-decoded text, got: {norm:?}"
1158        );
1159    }
1160
1161    #[test]
1162    fn extract_text_non_embedded_truetype_unknown_name_still_decodes() {
1163        // Custom BaseFont that does not match any standard alias and lacks the
1164        // keywords used by the heuristic. The standard-font fallback (via
1165        // FallbackFontQuery) still picks Helvetica, but on hosts without the
1166        // embedded font assets that path returns None — the new TextOnly
1167        // branch is what keeps extraction non-empty in that case. Either way,
1168        // the WinAnsi-driven char map must produce the original prose.
1169        let bytes = non_embedded_truetype_pdf_bytes(
1170            b"OpaqueCustomXYZ",
1171            b"WinAnsiEncoding",
1172            b"Hello, world!",
1173        );
1174        let text = PdfDocument::open(bytes)
1175            .expect("open custom non-embedded fixture")
1176            .extract_text(0)
1177            .expect("extract custom non-embedded text");
1178        let norm = normalize_text(&text);
1179        assert!(
1180            norm.contains("Hello, world!"),
1181            "expected WinAnsi-decoded text, got: {norm:?}"
1182        );
1183    }
1184
1185    #[test]
1186    fn extract_acroform_text_includes_push_button_mk_caption() {
1187        let bytes = push_button_caption_pdf_bytes(b"Don't cry over spilt milk");
1188        let doc = PdfDocument::open(bytes).expect("open push-button caption fixture");
1189
1190        let page_text = doc.extract_text(0).expect("extract page text");
1191        assert!(
1192            normalize_text(&page_text).is_empty(),
1193            "expected empty page content stream, got: {page_text:?}"
1194        );
1195
1196        let acroform_text = doc.extract_acroform_text();
1197        assert_eq!(normalize_text(&acroform_text), "Don't cry over spilt milk");
1198
1199        let all_text = doc.extract_all_text();
1200        assert_eq!(normalize_text(&all_text), "Don't cry over spilt milk");
1201    }
1202
1203    #[test]
1204    fn bytes_to_string_utf8() {
1205        assert_eq!(bytes_to_string(b"hello"), "hello");
1206    }
1207
1208    #[test]
1209    fn bytes_to_string_latin1() {
1210        let bytes = &[0xC4, 0xD6, 0xDC]; // ÄÖÜ in Latin-1
1211        let s = bytes_to_string(bytes);
1212        assert_eq!(s, "ÄÖÜ");
1213    }
1214
1215    #[test]
1216    fn bytes_to_string_utf16() {
1217        let bytes = &[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]; // UTF-16 "Hi"
1218        assert_eq!(bytes_to_string(bytes), "Hi");
1219    }
1220
1221    #[test]
1222    fn document_info_default() {
1223        let info = DocumentInfo::default();
1224        assert!(info.title.is_none());
1225        assert!(info.author.is_none());
1226    }
1227
1228    #[test]
1229    fn bookmark_item_children() {
1230        let item = BookmarkItem {
1231            title: "Root".into(),
1232            page: None,
1233            children: vec![BookmarkItem {
1234                title: "Child".into(),
1235                page: Some(0),
1236                children: Vec::new(),
1237            }],
1238        };
1239        assert_eq!(item.children.len(), 1);
1240        assert_eq!(item.children[0].title, "Child");
1241    }
1242
1243    #[test]
1244    fn extract_text_type0_without_tounicode_uses_font_program_fallback() {
1245        let original = std::fs::read(corpus_path("sf181.pdf")).expect("read sf181 fixture");
1246        let expected = PdfDocument::open(original.clone())
1247            .expect("open original sf181")
1248            .extract_text(0)
1249            .expect("extract original sf181 text");
1250        assert!(
1251            expected.contains("Guide to Personnel Data Standards"),
1252            "unexpected baseline extraction: {expected}"
1253        );
1254
1255        let (stripped, removed) = strip_type0_tounicode(&original);
1256        assert!(
1257            removed > 0,
1258            "expected to strip at least one Type0 ToUnicode"
1259        );
1260
1261        let actual = PdfDocument::open(stripped)
1262            .expect("open stripped sf181")
1263            .extract_text(0)
1264            .expect("extract stripped sf181 text");
1265
1266        let actual_norm = normalize_text(&actual);
1267        let expected_norm = normalize_text(&expected);
1268
1269        assert!(
1270            actual_norm.contains("Guide to Personnel Data Standards"),
1271            "missing main heading after stripping ToUnicode: {actual_norm}"
1272        );
1273        assert!(
1274            actual_norm.contains("Privacy Act Statement"),
1275            "missing body text after stripping ToUnicode: {actual_norm}"
1276        );
1277        assert!(
1278            actual_norm.len() + 32 >= expected_norm.len(),
1279            "too much text lost after stripping ToUnicode: expected {} chars, got {}",
1280            expected_norm.len(),
1281            actual_norm.len()
1282        );
1283    }
1284
1285    #[test]
1286    fn extract_text_identity_h_bogus_tounicode_recovers_via_identity_fallback() {
1287        // PDFBOX-4322-3.pdf ships an Identity-H Type0 font whose `/ToUnicode`
1288        // stream is actually an Identity-H *encoding* CMap (only
1289        // `begincidrange <0000> <FFFF> 0`, no bf-mappings). The embedded
1290        // TrueType subset also has no `cmap` table, so both the ToUnicode
1291        // lookup and the reverse-cmap fallback fail. Previously this yielded
1292        // a 0-byte extraction because the character codes — which are Unicode
1293        // code points under Identity-H — were silently discarded.
1294        let bytes =
1295            std::fs::read(corpus_path("PDFBOX-4322-3.pdf")).expect("read PDFBOX-4322-3 fixture");
1296        let doc = PdfDocument::open(bytes).expect("open PDFBOX-4322-3");
1297        let text = doc.extract_all_text();
1298
1299        let norm = normalize_text(&text);
1300        assert!(
1301            norm.contains("Transatlantic Council"),
1302            "expected Identity-H codes to resolve as Unicode: {norm}"
1303        );
1304        assert!(
1305            norm.contains("Boy Scouts of America"),
1306            "expected body text to be recovered: {norm}"
1307        );
1308    }
1309
1310    #[test]
1311    fn render_page_with_config_srgb_matches_legacy_render_page() {
1312        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open rgb fixture");
1313        let legacy = doc
1314            .render_page(
1315                0,
1316                &RenderOptions {
1317                    dpi: 72.0,
1318                    ..Default::default()
1319                },
1320            )
1321            .expect("legacy render succeeds");
1322        let configured = doc
1323            .render_page_with_config(
1324                0,
1325                &RenderConfig {
1326                    color_mode: ColorMode::Srgb,
1327                    dpi: 72,
1328                },
1329            )
1330            .expect("configured render succeeds");
1331
1332        assert_eq!(legacy.width, configured.width);
1333        assert_eq!(legacy.height, configured.height);
1334        assert_eq!(legacy.pixel_format, PixelFormat::Rgba8);
1335        assert_eq!(configured.pixel_format, PixelFormat::Rgba8);
1336        assert_eq!(legacy.pixels, configured.pixels);
1337    }
1338
1339    #[test]
1340    fn render_page_with_config_preserve_cmyk_returns_cmyk_buffer() {
1341        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1342        let rendered = doc
1343            .render_page_with_config(
1344                0,
1345                &RenderConfig {
1346                    color_mode: ColorMode::PreserveCmyk,
1347                    dpi: 72,
1348                },
1349            )
1350            .expect("cmyk render succeeds");
1351
1352        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1353        assert_eq!(
1354            rendered.pixels.len(),
1355            rendered.width as usize * rendered.height as usize * 4
1356        );
1357        assert_eq!(
1358            pixel_at(&rendered, rendered.width / 2, rendered.height / 2),
1359            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1360        );
1361    }
1362
1363    #[test]
1364    fn render_page_with_config_simulate_cmyk_does_not_panic_on_cmyk_pdf() {
1365        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1366        let rendered = doc
1367            .render_page_with_config(
1368                0,
1369                &RenderConfig {
1370                    color_mode: ColorMode::SimulateCmyk,
1371                    dpi: 72,
1372                },
1373            )
1374            .expect("simulate cmyk render succeeds");
1375
1376        assert_eq!(rendered.pixel_format, PixelFormat::Rgba8);
1377        assert!(!rendered.pixels.is_empty());
1378    }
1379
1380    #[test]
1381    fn render_page_with_config_preserve_cmyk_mixed_page_preserves_only_cmyk_region() {
1382        let doc = PdfDocument::open(mixed_rgb_cmyk_pdf_bytes()).expect("open mixed fixture");
1383        let rendered = doc
1384            .render_page_with_config(
1385                0,
1386                &RenderConfig {
1387                    color_mode: ColorMode::PreserveCmyk,
1388                    dpi: 72,
1389                },
1390            )
1391            .expect("mixed render succeeds");
1392
1393        assert_eq!(
1394            pixel_at(&rendered, 54, 36),
1395            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1396        );
1397        assert_ne!(
1398            pixel_at(&rendered, 18, 36),
1399            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1400        );
1401    }
1402
1403    #[test]
1404    fn render_page_with_config_preserve_cmyk_transparent_page_does_not_crash() {
1405        let doc =
1406            PdfDocument::open(transparent_cmyk_pdf_bytes()).expect("open transparent cmyk fixture");
1407        let rendered = doc
1408            .render_page_with_config(
1409                0,
1410                &RenderConfig {
1411                    color_mode: ColorMode::PreserveCmyk,
1412                    dpi: 72,
1413                },
1414            )
1415            .expect("transparent cmyk render succeeds");
1416
1417        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1418        assert_eq!(
1419            rendered.pixels.len(),
1420            rendered.width as usize * rendered.height as usize * 4
1421        );
1422    }
1423
1424    #[test]
1425    fn render_page_with_config_preserve_cmyk_keeps_device_cmyk_image_bytes() {
1426        let doc = PdfDocument::open(cmyk_image_pdf_bytes()).expect("open cmyk image fixture");
1427        let rendered = doc
1428            .render_page_with_config(
1429                0,
1430                &RenderConfig {
1431                    color_mode: ColorMode::PreserveCmyk,
1432                    dpi: 72,
1433                },
1434            )
1435            .expect("cmyk image render succeeds");
1436
1437        assert_eq!(rendered.width, 2);
1438        assert_eq!(rendered.height, 1);
1439        assert_eq!(pixel_at(&rendered, 0, 0), [255, 0, 0, 0]);
1440        assert_eq!(pixel_at(&rendered, 1, 0), [0, 255, 0, 0]);
1441    }
1442}