Skip to main content

pdf_engine/
document.rs

1//! Unified document facade — multi-page rendering, text extraction,
2//! metadata, bookmarks, and thumbnails.
3
4use crate::error::{EngineError, Result};
5use crate::geometry::{self, PageGeometry};
6use crate::render::{self, ColorMode, RenderConfig, RenderOptions, RenderedPage};
7use crate::text::{TextBlock, TextExtractionDevice};
8use crate::thumbnail::ThumbnailOptions;
9
10use pdf_forms::parse::parse_acroform;
11use pdf_forms::tree::{FieldType, FieldValue};
12use pdf_render::pdf_interpret::PageExt;
13use pdf_render::pdf_interpret::{interpret_page, Context, InterpreterSettings};
14use pdf_render::pdf_syntax::object::dict::keys::{FIRST, NEXT, OUTLINES, TITLE};
15use pdf_render::pdf_syntax::object::Dict;
16use pdf_render::pdf_syntax::page::Page;
17use pdf_render::pdf_syntax::Pdf;
18#[cfg(feature = "parallel")]
19use rayon::prelude::*;
20
21use kurbo::Rect;
22
/// Document metadata extracted from the info dictionary.
///
/// Each field is optional: it is `None` whenever the parsed metadata does not
/// carry the corresponding entry. Values are decoded to Rust strings by
/// `PdfDocument::info`.
#[derive(Debug, Clone, Default)]
pub struct DocumentInfo {
    /// Document title.
    pub title: Option<String>,
    /// Author.
    pub author: Option<String>,
    /// Subject.
    pub subject: Option<String>,
    /// Keywords.
    pub keywords: Option<String>,
    /// Creator application.
    pub creator: Option<String>,
    /// Producer application.
    pub producer: Option<String>,
}
39
/// A bookmark / outline item.
///
/// Items form a tree: siblings are returned in a `Vec`, and each item owns
/// its nested entries in `children`.
#[derive(Debug, Clone)]
pub struct BookmarkItem {
    /// Bookmark title. Empty when the outline entry has no readable title.
    pub title: String,
    /// Target page index (0-based), if resolvable.
    ///
    /// The outline parser in this file does not resolve destinations yet and
    /// always stores `None` here.
    pub page: Option<usize>,
    /// Nested child bookmarks.
    pub children: Vec<BookmarkItem>,
}
50
/// High-level PDF document handle.
///
/// Owns the parsed PDF together with the interpreter settings that are used
/// for rendering and (with a small adjustment) for text extraction.
pub struct PdfDocument {
    // Parsed document; all page, metadata and xref access goes through it.
    pdf: Pdf,
    // Interpreter configuration; replaceable via `set_settings`.
    settings: InterpreterSettings,
}
56
57impl PdfDocument {
58    /// Open a PDF from bytes.
59    pub fn open(data: impl Into<pdf_render::pdf_syntax::PdfData>) -> Result<Self> {
60        let pdf = Pdf::new(data).map_err(|e| match e {
61            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
62                EngineError::Encrypted(format!("{d:?}"))
63            }
64            _ => EngineError::InvalidPdf(format!("{e:?}")),
65        })?;
66        Ok(Self {
67            pdf,
68            settings: InterpreterSettings::default(),
69        })
70    }
71
72    /// Open a password-protected PDF.
73    pub fn open_with_password(
74        data: impl Into<pdf_render::pdf_syntax::PdfData>,
75        password: &str,
76    ) -> Result<Self> {
77        let pdf = Pdf::new_with_password(data, password).map_err(|e| match e {
78            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
79                EngineError::Encrypted(format!("{d:?}"))
80            }
81            _ => EngineError::InvalidPdf(format!("{e:?}")),
82        })?;
83        Ok(Self {
84            pdf,
85            settings: InterpreterSettings::default(),
86        })
87    }
88
    /// Access the underlying parsed PDF.
    ///
    /// Returns a shared borrow of the parser-level document owned by this
    /// handle, for callers that need lower-level access than this facade
    /// offers.
    pub fn pdf(&self) -> &Pdf {
        &self.pdf
    }
93
    /// Set interpreter settings (font resolver, cmap resolver, etc.).
    ///
    /// Replaces the stored settings wholesale; subsequent render and text
    /// extraction calls on this handle use the new value.
    pub fn set_settings(&mut self, settings: InterpreterSettings) {
        self.settings = settings;
    }
98
    /// Number of pages.
    ///
    /// Valid page indices for the other methods are `0..page_count()`.
    pub fn page_count(&self) -> usize {
        self.pdf.pages().len()
    }
103
104    /// Get the geometry of a page.
105    pub fn page_geometry(&self, index: usize) -> Result<PageGeometry> {
106        let page = self.get_page(index)?;
107        Ok(geometry::extract_geometry(page))
108    }
109
110    /// Render a single page.
111    ///
112    /// If the document contains an XFA template, it is automatically flattened
113    /// to static PDF content before rendering.  This prevents the "Please wait"
114    /// placeholder page that Adobe Reader would show when rendering an XFA PDF
115    /// with a conventional renderer. If flattening fails, rendering falls back
116    /// to the original document as a best-effort path.
117    pub fn render_page(&self, index: usize, options: &RenderOptions) -> Result<RenderedPage> {
118        #[cfg(feature = "xfa")]
119        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
120            return flat_doc.render_page(index, options);
121        }
122        let page = self.get_page(index)?;
123        // Pre-flight: reject pathologically small or zero-dimension pages before
124        // allocating any pixel buffer. Non-positive dimensions cause panics or
125        // zero-sized allocations inside the rasteriser.
126        let (w, h) = page.render_dimensions();
127        if w <= 0.0 || h <= 0.0 {
128            return Err(EngineError::InvalidPageGeometry {
129                width: w,
130                height: h,
131                reason: "page has zero or negative dimensions".into(),
132            });
133        }
134        // Also reject pages so small they produce zero pixels even at the
135        // minimum meaningful DPI (1 DPI). Below ~0.72pt at 1 DPI = 0 pixels.
136        const MIN_PAGE_PT: f32 = 1.0;
137        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
138            return Err(EngineError::InvalidPageGeometry {
139                width: w,
140                height: h,
141                reason: "page too small to render (< 1pt)".into(),
142            });
143        }
144        Ok(render::render_page(page, options, &self.settings))
145    }
146
147    /// Render a single page using the high-level render config.
148    ///
149    /// XFA documents are auto-flattened before rendering (same as `render_page`).
150    /// If flattening fails, rendering falls back to the original document.
151    pub fn render_page_with_config(
152        &self,
153        index: usize,
154        config: &RenderConfig,
155    ) -> Result<RenderedPage> {
156        #[cfg(feature = "xfa")]
157        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
158            return flat_doc.render_page_with_config(index, config);
159        }
160        let page = self.get_page(index)?;
161        let (w, h) = page.render_dimensions();
162        if w <= 0.0 || h <= 0.0 {
163            return Err(EngineError::InvalidPageGeometry {
164                width: w,
165                height: h,
166                reason: "page has zero or negative dimensions".into(),
167            });
168        }
169        const MIN_PAGE_PT: f32 = 1.0;
170        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
171            return Err(EngineError::InvalidPageGeometry {
172                width: w,
173                height: h,
174                reason: "page too small to render (< 1pt)".into(),
175            });
176        }
177        Ok(render::render_page_with_config(
178            page,
179            config,
180            &self.settings,
181        ))
182    }
183
184    /// Render a single page to a CMYK buffer.
185    pub fn render_page_cmyk(&self, index: usize, dpi: u32) -> Result<RenderedPage> {
186        self.render_page_with_config(
187            index,
188            &RenderConfig {
189                color_mode: ColorMode::PreserveCmyk,
190                dpi,
191            },
192        )
193    }
194
195    /// Render all pages, in parallel when the `parallel` feature is enabled.
196    pub fn render_all(&self, options: &RenderOptions) -> Vec<RenderedPage> {
197        let pages = self.pdf.pages();
198        #[cfg(feature = "parallel")]
199        return (0..pages.len())
200            .into_par_iter()
201            .map(|i| render::render_page(&pages[i], options, &self.settings))
202            .collect();
203        #[cfg(not(feature = "parallel"))]
204        (0..pages.len())
205            .map(|i| render::render_page(&pages[i], options, &self.settings))
206            .collect()
207    }
208
209    /// Render all pages using the high-level render config.
210    pub fn render_all_with_config(&self, config: &RenderConfig) -> Vec<RenderedPage> {
211        let pages = self.pdf.pages();
212        #[cfg(feature = "parallel")]
213        return (0..pages.len())
214            .into_par_iter()
215            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
216            .collect();
217        #[cfg(not(feature = "parallel"))]
218        (0..pages.len())
219            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
220            .collect()
221    }
222
223    /// Generate a thumbnail for a single page.
224    pub fn thumbnail(&self, index: usize, options: &ThumbnailOptions) -> Result<RenderedPage> {
225        let page = self.get_page(index)?;
226        Ok(render::render_thumbnail(
227            page,
228            options.max_dimension,
229            &self.settings,
230        ))
231    }
232
233    /// Generate thumbnails for all pages, in parallel when the `parallel` feature is enabled.
234    pub fn thumbnails_all(&self, options: &ThumbnailOptions) -> Vec<RenderedPage> {
235        let pages = self.pdf.pages();
236        #[cfg(feature = "parallel")]
237        return (0..pages.len())
238            .into_par_iter()
239            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
240            .collect();
241        #[cfg(not(feature = "parallel"))]
242        (0..pages.len())
243            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
244            .collect()
245    }
246
247    /// Extract text from a page as a single string.
248    pub fn extract_text(&self, index: usize) -> Result<String> {
249        let page = self.get_page(index)?;
250        let mut device = TextExtractionDevice::new();
251        let mut ctx = self.create_context(page);
252        interpret_page(page, &mut ctx, &mut device);
253        Ok(device.into_text())
254    }
255
256    /// Extract text from a sequence of pages while reusing the same settings object.
257    #[doc(hidden)]
258    pub fn extract_text_pages_reusing_settings<I>(&self, indices: I) -> Result<Vec<String>>
259    where
260        I: IntoIterator<Item = usize>,
261    {
262        let pages = self.pdf.pages();
263        let mut settings = self.text_extraction_settings();
264        let indices = indices.into_iter();
265        let (lower_bound, upper_bound) = indices.size_hint();
266        let mut texts = Vec::with_capacity(upper_bound.unwrap_or(lower_bound));
267
268        for index in indices {
269            let page = pages.get(index).ok_or(EngineError::PageOutOfRange {
270                index,
271                count: pages.len(),
272            })?;
273            let (text, next_settings) = Self::extract_text_with_settings(page, settings);
274            settings = next_settings;
275            texts.push(text);
276        }
277
278        Ok(texts)
279    }
280
281    /// Extract structured text blocks from a page.
282    pub fn extract_text_blocks(&self, index: usize) -> Result<Vec<TextBlock>> {
283        let page = self.get_page(index)?;
284        let mut device = TextExtractionDevice::new();
285        let mut ctx = self.create_context(page);
286        interpret_page(page, &mut ctx, &mut device);
287        Ok(device.into_blocks())
288    }
289
290    /// Extract structured text blocks from all pages, reusing interpreter settings.
291    pub fn extract_all_text_blocks(&self) -> Vec<Vec<TextBlock>> {
292        let pages = self.pdf.pages();
293        let mut settings = self.text_extraction_settings();
294        let mut blocks = Vec::with_capacity(pages.len());
295
296        for page in pages.iter() {
297            let (page_blocks, next_settings) =
298                Self::extract_text_blocks_with_settings(page, settings);
299            settings = next_settings;
300            blocks.push(page_blocks);
301        }
302
303        blocks
304    }
305
306    /// Extract text values from AcroForm fields, including push-button captions.
307    ///
308    /// Returns a single string concatenating all non-empty field values separated
309    /// by newlines. Useful when the document stores its readable content in form
310    /// field values rather than (or in addition to) page content streams.
311    pub fn extract_acroform_text(&self) -> String {
312        let Some(tree) = parse_acroform(&self.pdf) else {
313            return String::new();
314        };
315        let mut parts: Vec<String> = Vec::new();
316        for id in tree.all_ids() {
317            let node = tree.get(id);
318            if node.children.is_empty() {
319                // Terminal (widget) — collect text-like values.
320                let value_str = match &node.value {
321                    Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.clone()),
322                    Some(FieldValue::StringArray(arr)) => {
323                        let joined = arr
324                            .iter()
325                            .filter(|s| !s.is_empty())
326                            .cloned()
327                            .collect::<Vec<_>>()
328                            .join(", ");
329                        if joined.is_empty() {
330                            None
331                        } else {
332                            Some(joined)
333                        }
334                    }
335                    _ => None,
336                };
337                let button_caption =
338                    value_str.is_none() && tree.effective_field_type(id) == Some(FieldType::Button);
339                let extracted = value_str.or_else(|| {
340                    button_caption.then(|| {
341                        node.mk
342                            .as_ref()
343                            .and_then(|mk| mk.caption.as_ref())
344                            .filter(|caption| !caption.is_empty())
345                            .cloned()
346                    })?
347                });
348                if let Some(s) = extracted {
349                    parts.push(s);
350                }
351            }
352        }
353        parts.join("\n")
354    }
355
356    /// Extract all text from the document: page content streams plus AcroForm
357    /// field values.  Mirrors pdftotext behaviour.
358    pub fn extract_all_text(&self) -> String {
359        let pages = self.pdf.pages();
360        let mut settings = self.text_extraction_settings();
361        let mut page_texts = Vec::with_capacity(pages.len());
362        for page in pages.iter() {
363            let (page_text, next_settings) = Self::extract_text_with_settings(page, settings);
364            settings = next_settings;
365            page_texts.push(page_text);
366        }
367
368        let mut text = join_page_texts(page_texts.iter().map(String::as_str));
369        let acroform = self.extract_acroform_text();
370        if !acroform.is_empty() {
371            if !text.is_empty() && !text.ends_with('\n') {
372                text.push('\n');
373            }
374            text.push_str(&acroform);
375        }
376        text
377    }
378
379    /// Simple text search: returns page indices containing the query string.
380    pub fn search_text(&self, query: &str) -> Vec<usize> {
381        let pages = self.pdf.pages();
382        let query_lower = query.to_lowercase();
383        #[cfg(feature = "parallel")]
384        let page_contains = |i: usize| -> Option<usize> {
385            let page = &pages[i];
386            let (text, _) = Self::extract_text_with_settings(page, self.text_extraction_settings());
387            if text.to_lowercase().contains(&query_lower) {
388                Some(i)
389            } else {
390                None
391            }
392        };
393        #[cfg(feature = "parallel")]
394        return (0..pages.len())
395            .into_par_iter()
396            .filter_map(page_contains)
397            .collect();
398        #[cfg(not(feature = "parallel"))]
399        {
400            let mut settings = self.text_extraction_settings();
401            let mut hits = Vec::new();
402            for (i, page) in pages.iter().enumerate() {
403                let (text, next_settings) = Self::extract_text_with_settings(page, settings);
404                settings = next_settings;
405                if text.to_lowercase().contains(&query_lower) {
406                    hits.push(i);
407                }
408            }
409            hits
410        }
411    }
412
413    /// Extract document metadata.
414    pub fn info(&self) -> DocumentInfo {
415        let meta = self.pdf.metadata();
416        DocumentInfo {
417            title: meta.title.as_ref().map(|b| bytes_to_string(b)),
418            author: meta.author.as_ref().map(|b| bytes_to_string(b)),
419            subject: meta.subject.as_ref().map(|b| bytes_to_string(b)),
420            keywords: meta.keywords.as_ref().map(|b| bytes_to_string(b)),
421            creator: meta.creator.as_ref().map(|b| bytes_to_string(b)),
422            producer: meta.producer.as_ref().map(|b| bytes_to_string(b)),
423        }
424    }
425
426    /// Extract document outline / bookmarks.
427    pub fn bookmarks(&self) -> Vec<BookmarkItem> {
428        let xref = self.pdf.xref();
429        let root_id = xref.root_id();
430        let catalog: Dict<'_> = match xref.get(root_id) {
431            Some(d) => d,
432            None => return Vec::new(),
433        };
434
435        let outlines: Dict<'_> = match catalog.get(OUTLINES) {
436            Some(d) => d,
437            None => return Vec::new(),
438        };
439
440        let first: Dict<'_> = match outlines.get(FIRST) {
441            Some(d) => d,
442            None => return Vec::new(),
443        };
444
445        parse_outline_items(&first)
446    }
447
448    /// Run OCR on a page and return the recognized text and word positions.
449    ///
450    /// The page is rendered at `dpi` (default 150) before recognition.
451    /// Pass any [`OcrBackend`] implementation; use [`OcrsBackend::try_default`]
452    /// to load the pure-Rust `ocrs` engine from the standard model paths.
453    ///
454    /// # Example
455    ///
456    /// ```no_run
457    /// # #[cfg(feature = "ocr")] {
458    /// use pdf_engine::{PdfDocument, OcrsBackend, RenderOptions};
459    ///
460    /// let doc = PdfDocument::open(std::fs::read("scan.pdf").unwrap()).unwrap();
461    /// let backend = OcrsBackend::try_default().unwrap();
462    /// let result = doc.ocr_page(0, &backend, 150.0_f64).unwrap();
463    /// println!("{}", result.text);
464    /// # }
465    /// ```
466    pub fn ocr_page(
467        &self,
468        index: usize,
469        backend: &dyn crate::ocr::OcrBackend,
470        dpi: f64,
471    ) -> crate::error::Result<crate::ocr::OcrResult> {
472        let opts = crate::render::RenderOptions {
473            dpi,
474            ..Default::default()
475        };
476        let rendered = self.render_page(index, &opts)?;
477
478        // Convert RGBA → RGB (ocrs expects RGB input).
479        let mut rgb = Vec::with_capacity((rendered.width * rendered.height * 3) as usize);
480        for chunk in rendered.pixels.chunks(4) {
481            rgb.push(chunk[0]);
482            rgb.push(chunk[1]);
483            rgb.push(chunk[2]);
484        }
485
486        backend
487            .recognize(&rgb, rendered.width, rendered.height)
488            .map_err(|e| crate::error::EngineError::RenderError(e.to_string()))
489    }
490
491    fn get_page(&self, index: usize) -> Result<&Page<'_>> {
492        let pages = self.pdf.pages();
493        if index >= pages.len() {
494            return Err(EngineError::PageOutOfRange {
495                index,
496                count: pages.len(),
497            });
498        }
499        Ok(&pages[index])
500    }
501
502    fn text_extraction_settings(&self) -> InterpreterSettings {
503        let mut settings = self.settings.clone();
504        // Text extraction should include signature widget appearance streams
505        // that rendering skips to match MuPDF visual output.
506        settings.skip_signature_widgets = false;
507        settings
508    }
509
    /// Build an interpreter context for `page` using this document's
    /// text-extraction settings (see `text_extraction_settings`).
    fn create_context<'a>(&self, page: &Page<'a>) -> Context<'a> {
        Self::create_context_with_settings(page, self.text_extraction_settings())
    }
513
514    fn create_context_with_settings<'a>(
515        page: &Page<'a>,
516        settings: InterpreterSettings,
517    ) -> Context<'a> {
518        let (w, h) = page.render_dimensions();
519        Context::new(
520            page.initial_transform(false),
521            Rect::new(0.0, 0.0, w as f64, h as f64),
522            page.xref(),
523            settings,
524        )
525    }
526
527    fn extract_text_with_settings<'a>(
528        page: &Page<'a>,
529        settings: InterpreterSettings,
530    ) -> (String, InterpreterSettings) {
531        let mut device = TextExtractionDevice::new();
532        let mut ctx = Self::create_context_with_settings(page, settings);
533        interpret_page(page, &mut ctx, &mut device);
534        let settings = ctx.into_settings();
535        (device.into_text(), settings)
536    }
537
538    fn extract_text_blocks_with_settings<'a>(
539        page: &Page<'a>,
540        settings: InterpreterSettings,
541    ) -> (Vec<TextBlock>, InterpreterSettings) {
542        let mut device = TextExtractionDevice::new();
543        let mut ctx = Self::create_context_with_settings(page, settings);
544        interpret_page(page, &mut ctx, &mut device);
545        let settings = ctx.into_settings();
546        (device.into_blocks(), settings)
547    }
548
549    #[cfg(feature = "xfa")]
550    fn open_flattened_xfa_for_render(&self) -> Option<Self> {
551        if !crate::xfa::has_xfa(self) {
552            return None;
553        }
554
555        let flat_bytes = crate::xfa::flatten(self).ok()?;
556        let mut flat_doc = Self::open(flat_bytes).ok()?;
557        flat_doc.settings = self.settings.clone();
558        Some(flat_doc)
559    }
560}
561
/// Concatenate per-page texts, separating consecutive pages with a form feed.
///
/// Before every page except the first, the text accumulated so far is padded
/// with newlines until it ends in a blank line (`"\n\n"`), unless it is still
/// empty, and then a form feed (U+000C) is appended. The page text follows
/// unchanged.
fn join_page_texts<I>(page_texts: I) -> String
where
    I: IntoIterator,
    I::Item: AsRef<str>,
{
    const FORM_FEED: char = '\u{000C}';

    let mut joined = String::new();

    for (position, page) in page_texts.into_iter().enumerate() {
        if position > 0 {
            if !joined.is_empty() {
                // '\n' is ASCII, so counting trailing bytes is UTF-8 safe.
                let trailing_newlines = joined
                    .bytes()
                    .rev()
                    .take(2)
                    .take_while(|&byte| byte == b'\n')
                    .count();
                for _ in trailing_newlines..2 {
                    joined.push('\n');
                }
            }
            joined.push(FORM_FEED);
        }
        joined.push_str(page.as_ref());
    }

    joined
}
583
#[cfg(test)]
mod extract_all_text_tests {
    //! Unit tests for the page-joining rules used by `extract_all_text`.

    use super::join_page_texts;

    /// Two non-empty pages are separated by a blank line and a form feed.
    #[test]
    fn separates_nonempty_pages_like_pdftotext() {
        let joined = join_page_texts(["Page 1", "Page 2"]);
        let expected = "Page 1\n\n\u{000C}Page 2";
        assert_eq!(joined, expected);
    }

    /// An empty first page contributes only the form feed, no newlines.
    #[test]
    fn preserves_leading_blank_pages_without_extra_newlines() {
        let joined = join_page_texts(["", "Page 2"]);
        let expected = "\u{000C}Page 2";
        assert_eq!(joined, expected);
    }

    /// A page that already ends in a blank line is not padded further.
    #[test]
    fn reuses_existing_blank_line_before_form_feed() {
        let joined = join_page_texts(["Page 1\n\n", "Page 2"]);
        let expected = "Page 1\n\n\u{000C}Page 2";
        assert_eq!(joined, expected);
    }
}
609
610/// Walk the outline linked list (FIRST → NEXT chain).
611fn parse_outline_items(item_dict: &Dict<'_>) -> Vec<BookmarkItem> {
612    let mut items = Vec::new();
613    let mut current: Option<Dict<'_>> = Some(item_dict.clone());
614
615    while let Some(dict) = current {
616        let title = dict
617            .get::<pdf_render::pdf_syntax::object::String>(TITLE)
618            .map(|s| bytes_to_string(s.as_bytes()))
619            .unwrap_or_default();
620
621        let children = match dict.get::<Dict<'_>>(FIRST) {
622            Some(child_dict) => parse_outline_items(&child_dict),
623            None => Vec::new(),
624        };
625
626        items.push(BookmarkItem {
627            title,
628            page: None, // Destination resolution requires named-dest lookup — left for follow-up
629            children,
630        });
631
632        current = dict.get::<Dict<'_>>(NEXT);
633    }
634
635    items
636}
637
/// Convert PDF string bytes to a Rust String.
///
/// Decoding rules, in order:
///
/// 1. A leading `FE FF` marks UTF-16BE; a leading `FF FE` (non-standard but
///    produced by some writers) marks UTF-16LE. The byte-order mark is
///    dropped, invalid code units are replaced with U+FFFD, and a dangling
///    final byte is ignored.
/// 2. Otherwise, if the bytes are valid UTF-8 they are used as such, with a
///    leading UTF-8 byte-order mark removed.
/// 3. Otherwise each byte is mapped to the Unicode code point of the same
///    value (Latin-1).
fn bytes_to_string(bytes: &[u8]) -> String {
    /// Decode a BOM-less UTF-16 payload with the given byte order.
    fn decode_utf16(payload: &[u8], to_unit: fn([u8; 2]) -> u16) -> String {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| to_unit([pair[0], pair[1]]))
            .collect();
        String::from_utf16_lossy(&units)
    }

    if let Some(payload) = bytes.strip_prefix(&[0xFE, 0xFF]) {
        return decode_utf16(payload, u16::from_be_bytes);
    }
    if let Some(payload) = bytes.strip_prefix(&[0xFF, 0xFE]) {
        return decode_utf16(payload, u16::from_le_bytes);
    }

    match std::str::from_utf8(bytes) {
        // PDF 2.0 text strings may start with a UTF-8 BOM; it is a marker,
        // not part of the text.
        Ok(text) => text.strip_prefix('\u{FEFF}').unwrap_or(text).to_string(),
        // Not UTF-8: fall back to Latin-1.
        Err(_) => bytes.iter().map(|&b| b as char).collect(),
    }
}
661
662#[cfg(test)]
663mod tests {
664    use super::*;
665    use crate::render::{ColorMode, PixelFormat, RenderConfig, RenderOptions};
666    use lopdf::{Document as LoDocument, Object};
667    use std::path::PathBuf;
668
669    fn corpus_path(name: &str) -> PathBuf {
670        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
671            .join("../../corpus")
672            .join(name)
673    }
674
675    fn normalize_text(text: &str) -> String {
676        text.split_whitespace().collect::<Vec<_>>().join(" ")
677    }
678
679    fn strip_type0_tounicode(data: &[u8]) -> (Vec<u8>, usize) {
680        fn get_name(dict: &lopdf::Dictionary, key: &[u8]) -> Option<Vec<u8>> {
681            match dict.get(key).ok()? {
682                Object::Name(name) => Some(name.clone()),
683                _ => None,
684            }
685        }
686
687        fn descendant_is_cidfont_type2(doc: &LoDocument, type0: &lopdf::Dictionary) -> bool {
688            let Some(Object::Array(descendants)) = type0.get(b"DescendantFonts").ok() else {
689                return false;
690            };
691            let Some(Object::Reference(desc_id)) = descendants.first() else {
692                return false;
693            };
694            let Ok(Object::Dictionary(descendant)) = doc.get_object(*desc_id) else {
695                return false;
696            };
697            matches!(
698                descendant.get(b"Subtype").ok(),
699                Some(Object::Name(name)) if name.as_slice() == b"CIDFontType2"
700            )
701        }
702
703        let mut doc = LoDocument::load_mem(data).expect("load stripped-to-unicode fixture");
704        let ids: Vec<_> = doc.objects.keys().copied().collect();
705        let mut removed = 0usize;
706
707        for id in ids {
708            let Some(Object::Dictionary(dict)) = doc.objects.get(&id) else {
709                continue;
710            };
711            if !matches!(
712                dict.get(b"Subtype").ok(),
713                Some(Object::Name(name)) if name.as_slice() == b"Type0"
714            ) {
715                continue;
716            }
717            if !matches!(
718                get_name(dict, b"Encoding").as_deref(),
719                Some(b"Identity-H") | Some(b"Identity-V")
720            ) {
721                continue;
722            }
723            if !descendant_is_cidfont_type2(&doc, dict) {
724                continue;
725            }
726
727            if let Some(Object::Dictionary(type0)) = doc.objects.get_mut(&id) {
728                if type0.has(b"ToUnicode") {
729                    type0.remove(b"ToUnicode");
730                    removed += 1;
731                }
732            }
733        }
734
735        let mut out = Vec::new();
736        doc.save_to(&mut out)
737            .expect("save stripped-to-unicode fixture");
738        (out, removed)
739    }
740
    /// Build a one-page PDF 1.4 fixture whose page is filled with one colour.
    ///
    /// `color_operator` is written verbatim as the first line of the content
    /// stream (it is expected to set the fill colour); the stream then fills
    /// a 72×72 rectangle at the origin, which matches the page's MediaBox.
    /// Returns the serialized document bytes; panics if saving fails.
    fn solid_fill_pdf_bytes(color_operator: &str) -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");

        // Ids are reserved up front so the page and its parent can refer to
        // each other before either dictionary is inserted.
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        let content = format!("{color_operator}\n0 0 72 72 re f\n");
        let content_id = doc.add_object(Stream::new(dictionary! {}, content.into_bytes()));

        // The single page: 72×72 MediaBox, content stream from above.
        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Page".to_vec()),
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![
                    Object::Integer(0),
                    Object::Integer(0),
                    Object::Integer(72),
                    Object::Integer(72),
                ]),
                "Contents" => Object::Reference(content_id),
            }),
        );

        // Page tree root listing that one page.
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Pages".to_vec()),
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );

        // Catalog pointing at the page tree, registered as the trailer root.
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => Object::Name(b"Catalog".to_vec()),
                "Pages" => Object::Reference(pages_id),
            }),
        );

        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes).expect("save solid fill fixture");
        bytes
    }
790
    /// Build a one-page PDF 1.4 fixture that mixes RGB and CMYK fills.
    ///
    /// The content stream sets the fill colour with `1 0 0 rg` and fills the
    /// left 36×72 half of the page, then sets it with `1 0 0 0 k` and fills
    /// the right 36×72 half. The page's MediaBox is 72×72.
    /// Returns the serialized document bytes; panics if saving fails.
    fn mixed_rgb_cmyk_pdf_bytes() -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        // Ids are reserved up front so the page and its parent can refer to
        // each other before either dictionary is inserted.
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        let content = b"1 0 0 rg\n0 0 36 72 re f\n1 0 0 0 k\n36 0 36 72 re f\n".to_vec();
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        // The single page: 72×72 MediaBox, content stream from above.
        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
                "Contents" => Object::Reference(content_id),
            }),
        );
        // Page tree root listing that one page.
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        // Catalog pointing at the page tree, registered as the trailer root.
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes)
            .expect("save mixed rgb/cmyk fixture");
        bytes
    }
832
    /// Build a one-page PDF 1.4 fixture with a half-transparent CMYK fill.
    ///
    /// The page resources expose an ExtGState named `GS1` whose `ca` entry is
    /// 0.5. The content stream selects `GS1`, sets the fill colour with
    /// `1 0 0 0 k`, and fills a 72×72 rectangle at the origin, which matches
    /// the page's MediaBox.
    /// Returns the serialized document bytes; panics if saving fails.
    fn transparent_cmyk_pdf_bytes() -> Vec<u8> {
        use lopdf::{dictionary, Document, Object, Stream};

        let mut doc = Document::with_version("1.4");
        // Ids are reserved up front so the page and its parent can refer to
        // each other before either dictionary is inserted.
        let pages_id = doc.new_object_id();
        let page_id = doc.new_object_id();
        // Graphics state carrying the 0.5 `ca` value referenced as /GS1.
        let gs_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "ExtGState",
            "ca" => Object::Real(0.5),
        }));
        let content = b"/GS1 gs\n1 0 0 0 k\n0 0 72 72 re f\n".to_vec();
        let content_id = doc.add_object(Stream::new(dictionary! {}, content));

        // The single page: 72×72 MediaBox, GS1 resource, content stream.
        doc.objects.insert(
            page_id,
            Object::Dictionary(dictionary! {
                "Type" => "Page",
                "Parent" => Object::Reference(pages_id),
                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
                "Resources" => dictionary! {
                    "ExtGState" => dictionary! {
                        "GS1" => Object::Reference(gs_id),
                    },
                },
                "Contents" => Object::Reference(content_id),
            }),
        );
        // Page tree root listing that one page.
        doc.objects.insert(
            pages_id,
            Object::Dictionary(dictionary! {
                "Type" => "Pages",
                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
                "Count" => Object::Integer(1),
            }),
        );
        // Catalog pointing at the page tree, registered as the trailer root.
        let catalog_id = doc.new_object_id();
        doc.objects.insert(
            catalog_id,
            Object::Dictionary(dictionary! {
                "Type" => "Catalog",
                "Pages" => Object::Reference(pages_id),
            }),
        );
        doc.trailer.set("Root", Object::Reference(catalog_id));

        let mut bytes = Vec::new();
        doc.save_to(&mut bytes)
            .expect("save transparent cmyk fixture");
        bytes
    }
883
884    fn cmyk_image_pdf_bytes() -> Vec<u8> {
885        use lopdf::{dictionary, Document, Object, Stream};
886
887        let mut doc = Document::with_version("1.4");
888        let pages_id = doc.new_object_id();
889        let page_id = doc.new_object_id();
890        let image_id = doc.add_object(Stream::new(
891            dictionary! {
892                "Type" => "XObject",
893                "Subtype" => "Image",
894                "Width" => Object::Integer(2),
895                "Height" => Object::Integer(1),
896                "BitsPerComponent" => Object::Integer(8),
897                "ColorSpace" => "DeviceCMYK",
898            },
899            vec![255, 0, 0, 0, 0, 255, 0, 0],
900        ));
901        let content = b"q\n2 0 0 1 0 0 cm\n/Im1 Do\nQ\n".to_vec();
902        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
903
904        doc.objects.insert(
905            page_id,
906            Object::Dictionary(dictionary! {
907                "Type" => "Page",
908                "Parent" => Object::Reference(pages_id),
909                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 2.into(), 1.into()]),
910                "Resources" => dictionary! {
911                    "XObject" => dictionary! {
912                        "Im1" => Object::Reference(image_id),
913                    },
914                },
915                "Contents" => Object::Reference(content_id),
916            }),
917        );
918        doc.objects.insert(
919            pages_id,
920            Object::Dictionary(dictionary! {
921                "Type" => "Pages",
922                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
923                "Count" => Object::Integer(1),
924            }),
925        );
926        let catalog_id = doc.new_object_id();
927        doc.objects.insert(
928            catalog_id,
929            Object::Dictionary(dictionary! {
930                "Type" => "Catalog",
931                "Pages" => Object::Reference(pages_id),
932            }),
933        );
934        doc.trailer.set("Root", Object::Reference(catalog_id));
935
936        let mut bytes = Vec::new();
937        doc.save_to(&mut bytes).expect("save cmyk image fixture");
938        bytes
939    }
940
941    fn pixel_at(rendered: &RenderedPage, x: u32, y: u32) -> [u8; 4] {
942        let idx = ((y * rendered.width + x) * 4) as usize;
943        [
944            rendered.pixels[idx],
945            rendered.pixels[idx + 1],
946            rendered.pixels[idx + 2],
947            rendered.pixels[idx + 3],
948        ]
949    }
950
951    /// Build a minimal one-page PDF whose only font is a non-embedded TrueType
952    /// reference (no `FontFile2`). The character codes in the content stream
953    /// resolve through the declared `/Encoding`, exercising the same code path
954    /// as corpus PDFs like `171_171940.pdf`.
955    fn non_embedded_truetype_pdf_bytes(
956        base_font: &[u8],
957        encoding: &[u8],
958        text_bytes: &[u8],
959    ) -> Vec<u8> {
960        use lopdf::{dictionary, Document, Object, Stream};
961
962        let mut doc = Document::with_version("1.4");
963
964        let font_id = doc.add_object(Object::Dictionary(dictionary! {
965            "Type" => "Font",
966            "Subtype" => "TrueType",
967            "Name" => Object::Name(b"F0".to_vec()),
968            "BaseFont" => Object::Name(base_font.to_vec()),
969            "Encoding" => Object::Name(encoding.to_vec()),
970        }));
971
972        let resources_id = doc.add_object(Object::Dictionary(dictionary! {
973            "Font" => dictionary! { "F0" => Object::Reference(font_id) },
974        }));
975
976        let mut content = Vec::new();
977        content.extend_from_slice(b"BT\n/F0 12 Tf\n100 700 Td\n(");
978        for &b in text_bytes {
979            match b {
980                b'(' | b')' | b'\\' => {
981                    content.push(b'\\');
982                    content.push(b);
983                }
984                _ => content.push(b),
985            }
986        }
987        content.extend_from_slice(b") Tj\nET\n");
988        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
989
990        let pages_id = doc.new_object_id();
991        let page_id = doc.add_object(Object::Dictionary(dictionary! {
992            "Type" => "Page",
993            "Parent" => Object::Reference(pages_id),
994            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
995            "Resources" => Object::Reference(resources_id),
996            "Contents" => Object::Reference(content_id),
997        }));
998        doc.objects.insert(
999            pages_id,
1000            Object::Dictionary(dictionary! {
1001                "Type" => "Pages",
1002                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1003                "Count" => Object::Integer(1),
1004            }),
1005        );
1006        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
1007            "Type" => "Catalog",
1008            "Pages" => Object::Reference(pages_id),
1009        }));
1010        doc.trailer.set("Root", Object::Reference(catalog_id));
1011
1012        let mut bytes = Vec::new();
1013        doc.save_to(&mut bytes).expect("save non-embedded fixture");
1014        bytes
1015    }
1016
1017    /// Build a minimal AcroForm push button whose only human-readable text
1018    /// lives in the widget `/MK /CA` caption entry.
1019    fn push_button_caption_pdf_bytes(caption: &[u8]) -> Vec<u8> {
1020        use lopdf::{dictionary, Document, Object, Stream, StringFormat};
1021
1022        let mut doc = Document::with_version("1.4");
1023
1024        let catalog_id = doc.new_object_id();
1025        let pages_id = doc.new_object_id();
1026        let page_id = doc.new_object_id();
1027        let acroform_id = doc.new_object_id();
1028        let content_id = doc.new_object_id();
1029        let widget_id = doc.new_object_id();
1030
1031        doc.objects.insert(
1032            content_id,
1033            Object::Stream(Stream::new(dictionary! {}, Vec::new())),
1034        );
1035        doc.objects.insert(
1036            widget_id,
1037            Object::Dictionary(dictionary! {
1038                "Type" => "Annot",
1039                "Subtype" => "Widget",
1040                "FT" => "Btn",
1041                "Ff" => Object::Integer(1 << 16),
1042                "T" => Object::String(b"button".to_vec(), StringFormat::Literal),
1043                "MK" => dictionary! {
1044                    "CA" => Object::String(caption.to_vec(), StringFormat::Literal),
1045                },
1046                "Rect" => Object::Array(vec![100.into(), 700.into(), 260.into(), 730.into()]),
1047                "P" => Object::Reference(page_id),
1048            }),
1049        );
1050        doc.objects.insert(
1051            page_id,
1052            Object::Dictionary(dictionary! {
1053                "Type" => "Page",
1054                "Parent" => Object::Reference(pages_id),
1055                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1056                "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
1057                "Contents" => Object::Reference(content_id),
1058            }),
1059        );
1060        doc.objects.insert(
1061            pages_id,
1062            Object::Dictionary(dictionary! {
1063                "Type" => "Pages",
1064                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1065                "Count" => Object::Integer(1),
1066            }),
1067        );
1068        doc.objects.insert(
1069            acroform_id,
1070            Object::Dictionary(dictionary! {
1071                "Fields" => Object::Array(vec![Object::Reference(widget_id)]),
1072            }),
1073        );
1074        doc.objects.insert(
1075            catalog_id,
1076            Object::Dictionary(dictionary! {
1077                "Type" => "Catalog",
1078                "Pages" => Object::Reference(pages_id),
1079                "AcroForm" => Object::Reference(acroform_id),
1080            }),
1081        );
1082        doc.trailer.set("Root", Object::Reference(catalog_id));
1083
1084        let mut bytes = Vec::new();
1085        doc.save_to(&mut bytes)
1086            .expect("save push-button caption fixture");
1087        bytes
1088    }
1089
1090    #[test]
1091    fn extract_text_non_embedded_truetype_alias_resolves_via_winansi() {
1092        // Mirrors corpus PDF `171_171940.pdf`: TrueType font references
1093        // `TimesNewRoman` (resolves through the standard-font alias table)
1094        // with `WinAnsiEncoding` and no embedded font program. Extraction must
1095        // recover the text from the declared encoding even though no glyph
1096        // outlines are available.
1097        let bytes = non_embedded_truetype_pdf_bytes(
1098            b"TimesNewRoman",
1099            b"WinAnsiEncoding",
1100            b"UNITED STATES DISTRICT COURT",
1101        );
1102        let text = PdfDocument::open(bytes)
1103            .expect("open non-embedded TrueType fixture")
1104            .extract_text(0)
1105            .expect("extract non-embedded TrueType text");
1106        let norm = normalize_text(&text);
1107        assert!(
1108            norm.contains("UNITED STATES DISTRICT COURT"),
1109            "expected WinAnsi-decoded text, got: {norm:?}"
1110        );
1111    }
1112
1113    #[test]
1114    fn extract_text_non_embedded_truetype_unknown_name_still_decodes() {
1115        // Custom BaseFont that does not match any standard alias and lacks the
1116        // keywords used by the heuristic. The standard-font fallback (via
1117        // FallbackFontQuery) still picks Helvetica, but on hosts without the
1118        // embedded font assets that path returns None — the new TextOnly
1119        // branch is what keeps extraction non-empty in that case. Either way,
1120        // the WinAnsi-driven char map must produce the original prose.
1121        let bytes = non_embedded_truetype_pdf_bytes(
1122            b"OpaqueCustomXYZ",
1123            b"WinAnsiEncoding",
1124            b"Hello, world!",
1125        );
1126        let text = PdfDocument::open(bytes)
1127            .expect("open custom non-embedded fixture")
1128            .extract_text(0)
1129            .expect("extract custom non-embedded text");
1130        let norm = normalize_text(&text);
1131        assert!(
1132            norm.contains("Hello, world!"),
1133            "expected WinAnsi-decoded text, got: {norm:?}"
1134        );
1135    }
1136
1137    #[test]
1138    fn extract_acroform_text_includes_push_button_mk_caption() {
1139        let bytes = push_button_caption_pdf_bytes(b"Don't cry over spilt milk");
1140        let doc = PdfDocument::open(bytes).expect("open push-button caption fixture");
1141
1142        let page_text = doc.extract_text(0).expect("extract page text");
1143        assert!(
1144            normalize_text(&page_text).is_empty(),
1145            "expected empty page content stream, got: {page_text:?}"
1146        );
1147
1148        let acroform_text = doc.extract_acroform_text();
1149        assert_eq!(normalize_text(&acroform_text), "Don't cry over spilt milk");
1150
1151        let all_text = doc.extract_all_text();
1152        assert_eq!(normalize_text(&all_text), "Don't cry over spilt milk");
1153    }
1154
1155    #[test]
1156    fn bytes_to_string_utf8() {
1157        assert_eq!(bytes_to_string(b"hello"), "hello");
1158    }
1159
1160    #[test]
1161    fn bytes_to_string_latin1() {
1162        let bytes = &[0xC4, 0xD6, 0xDC]; // ÄÖÜ in Latin-1
1163        let s = bytes_to_string(bytes);
1164        assert_eq!(s, "ÄÖÜ");
1165    }
1166
1167    #[test]
1168    fn bytes_to_string_utf16() {
1169        let bytes = &[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]; // UTF-16 "Hi"
1170        assert_eq!(bytes_to_string(bytes), "Hi");
1171    }
1172
1173    #[test]
1174    fn document_info_default() {
1175        let info = DocumentInfo::default();
1176        assert!(info.title.is_none());
1177        assert!(info.author.is_none());
1178    }
1179
1180    #[test]
1181    fn bookmark_item_children() {
1182        let item = BookmarkItem {
1183            title: "Root".into(),
1184            page: None,
1185            children: vec![BookmarkItem {
1186                title: "Child".into(),
1187                page: Some(0),
1188                children: Vec::new(),
1189            }],
1190        };
1191        assert_eq!(item.children.len(), 1);
1192        assert_eq!(item.children[0].title, "Child");
1193    }
1194
1195    #[test]
1196    fn extract_text_type0_without_tounicode_uses_font_program_fallback() {
1197        let original = std::fs::read(corpus_path("sf181.pdf")).expect("read sf181 fixture");
1198        let expected = PdfDocument::open(original.clone())
1199            .expect("open original sf181")
1200            .extract_text(0)
1201            .expect("extract original sf181 text");
1202        assert!(
1203            expected.contains("Guide to Personnel Data Standards"),
1204            "unexpected baseline extraction: {expected}"
1205        );
1206
1207        let (stripped, removed) = strip_type0_tounicode(&original);
1208        assert!(
1209            removed > 0,
1210            "expected to strip at least one Type0 ToUnicode"
1211        );
1212
1213        let actual = PdfDocument::open(stripped)
1214            .expect("open stripped sf181")
1215            .extract_text(0)
1216            .expect("extract stripped sf181 text");
1217
1218        let actual_norm = normalize_text(&actual);
1219        let expected_norm = normalize_text(&expected);
1220
1221        assert!(
1222            actual_norm.contains("Guide to Personnel Data Standards"),
1223            "missing main heading after stripping ToUnicode: {actual_norm}"
1224        );
1225        assert!(
1226            actual_norm.contains("Privacy Act Statement"),
1227            "missing body text after stripping ToUnicode: {actual_norm}"
1228        );
1229        assert!(
1230            actual_norm.len() + 32 >= expected_norm.len(),
1231            "too much text lost after stripping ToUnicode: expected {} chars, got {}",
1232            expected_norm.len(),
1233            actual_norm.len()
1234        );
1235    }
1236
1237    #[test]
1238    fn extract_text_identity_h_bogus_tounicode_recovers_via_identity_fallback() {
1239        // PDFBOX-4322-3.pdf ships an Identity-H Type0 font whose `/ToUnicode`
1240        // stream is actually an Identity-H *encoding* CMap (only
1241        // `begincidrange <0000> <FFFF> 0`, no bf-mappings). The embedded
1242        // TrueType subset also has no `cmap` table, so both the ToUnicode
1243        // lookup and the reverse-cmap fallback fail. Previously this yielded
1244        // a 0-byte extraction because the character codes — which are Unicode
1245        // code points under Identity-H — were silently discarded.
1246        let bytes =
1247            std::fs::read(corpus_path("PDFBOX-4322-3.pdf")).expect("read PDFBOX-4322-3 fixture");
1248        let doc = PdfDocument::open(bytes).expect("open PDFBOX-4322-3");
1249        let text = doc.extract_all_text();
1250
1251        let norm = normalize_text(&text);
1252        assert!(
1253            norm.contains("Transatlantic Council"),
1254            "expected Identity-H codes to resolve as Unicode: {norm}"
1255        );
1256        assert!(
1257            norm.contains("Boy Scouts of America"),
1258            "expected body text to be recovered: {norm}"
1259        );
1260    }
1261
1262    #[test]
1263    fn render_page_with_config_srgb_matches_legacy_render_page() {
1264        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open rgb fixture");
1265        let legacy = doc
1266            .render_page(
1267                0,
1268                &RenderOptions {
1269                    dpi: 72.0,
1270                    ..Default::default()
1271                },
1272            )
1273            .expect("legacy render succeeds");
1274        let configured = doc
1275            .render_page_with_config(
1276                0,
1277                &RenderConfig {
1278                    color_mode: ColorMode::Srgb,
1279                    dpi: 72,
1280                },
1281            )
1282            .expect("configured render succeeds");
1283
1284        assert_eq!(legacy.width, configured.width);
1285        assert_eq!(legacy.height, configured.height);
1286        assert_eq!(legacy.pixel_format, PixelFormat::Rgba8);
1287        assert_eq!(configured.pixel_format, PixelFormat::Rgba8);
1288        assert_eq!(legacy.pixels, configured.pixels);
1289    }
1290
1291    #[test]
1292    fn render_page_with_config_preserve_cmyk_returns_cmyk_buffer() {
1293        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1294        let rendered = doc
1295            .render_page_with_config(
1296                0,
1297                &RenderConfig {
1298                    color_mode: ColorMode::PreserveCmyk,
1299                    dpi: 72,
1300                },
1301            )
1302            .expect("cmyk render succeeds");
1303
1304        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1305        assert_eq!(
1306            rendered.pixels.len(),
1307            rendered.width as usize * rendered.height as usize * 4
1308        );
1309        assert_eq!(
1310            pixel_at(&rendered, rendered.width / 2, rendered.height / 2),
1311            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1312        );
1313    }
1314
1315    #[test]
1316    fn render_page_with_config_simulate_cmyk_does_not_panic_on_cmyk_pdf() {
1317        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1318        let rendered = doc
1319            .render_page_with_config(
1320                0,
1321                &RenderConfig {
1322                    color_mode: ColorMode::SimulateCmyk,
1323                    dpi: 72,
1324                },
1325            )
1326            .expect("simulate cmyk render succeeds");
1327
1328        assert_eq!(rendered.pixel_format, PixelFormat::Rgba8);
1329        assert!(!rendered.pixels.is_empty());
1330    }
1331
1332    #[test]
1333    fn render_page_with_config_preserve_cmyk_mixed_page_preserves_only_cmyk_region() {
1334        let doc = PdfDocument::open(mixed_rgb_cmyk_pdf_bytes()).expect("open mixed fixture");
1335        let rendered = doc
1336            .render_page_with_config(
1337                0,
1338                &RenderConfig {
1339                    color_mode: ColorMode::PreserveCmyk,
1340                    dpi: 72,
1341                },
1342            )
1343            .expect("mixed render succeeds");
1344
1345        assert_eq!(
1346            pixel_at(&rendered, 54, 36),
1347            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1348        );
1349        assert_ne!(
1350            pixel_at(&rendered, 18, 36),
1351            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1352        );
1353    }
1354
1355    #[test]
1356    fn render_page_with_config_preserve_cmyk_transparent_page_does_not_crash() {
1357        let doc =
1358            PdfDocument::open(transparent_cmyk_pdf_bytes()).expect("open transparent cmyk fixture");
1359        let rendered = doc
1360            .render_page_with_config(
1361                0,
1362                &RenderConfig {
1363                    color_mode: ColorMode::PreserveCmyk,
1364                    dpi: 72,
1365                },
1366            )
1367            .expect("transparent cmyk render succeeds");
1368
1369        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1370        assert_eq!(
1371            rendered.pixels.len(),
1372            rendered.width as usize * rendered.height as usize * 4
1373        );
1374    }
1375
1376    #[test]
1377    fn render_page_with_config_preserve_cmyk_keeps_device_cmyk_image_bytes() {
1378        let doc = PdfDocument::open(cmyk_image_pdf_bytes()).expect("open cmyk image fixture");
1379        let rendered = doc
1380            .render_page_with_config(
1381                0,
1382                &RenderConfig {
1383                    color_mode: ColorMode::PreserveCmyk,
1384                    dpi: 72,
1385                },
1386            )
1387            .expect("cmyk image render succeeds");
1388
1389        assert_eq!(rendered.width, 2);
1390        assert_eq!(rendered.height, 1);
1391        assert_eq!(pixel_at(&rendered, 0, 0), [255, 0, 0, 0]);
1392        assert_eq!(pixel_at(&rendered, 1, 0), [0, 255, 0, 0]);
1393    }
1394}