Skip to main content

pdf_engine/
document.rs

1//! Unified document facade — multi-page rendering, text extraction,
2//! metadata, bookmarks, and thumbnails.
3
4use crate::error::{EngineError, Result};
5use crate::geometry::{self, PageGeometry};
6use crate::limits::{LimitError, ProcessingLimits};
7use std::sync::{Arc, Mutex};
8
/// Shared slot used by the limit-warning collector.
///
/// Stores `Some((observed_bytes, limit_bytes))` when a
/// `StreamTooLarge` warning fires during rendering or extraction.
/// The slot is written by the sink installed in
/// `PdfDocument::with_limit_collector` (which keeps only the first
/// warning) and read back by `PdfDocument::check_limit_slot`.
type LimitSlot = Arc<Mutex<Option<(u64, u64)>>>;
14use crate::render::{self, ColorMode, RenderConfig, RenderOptions, RenderedPage};
15use crate::text::{TextBlock, TextExtractionDevice};
16use crate::thumbnail::ThumbnailOptions;
17
18use pdf_forms::parse::parse_acroform;
19use pdf_forms::tree::{FieldType, FieldValue};
20use pdf_render::pdf_interpret::PageExt;
21use pdf_render::pdf_interpret::{interpret_page, Context, InterpreterSettings, InterpreterWarning};
22use pdf_render::pdf_syntax::object::dict::keys::{FIRST, NEXT, OUTLINES, TITLE};
23use pdf_render::pdf_syntax::object::Dict;
24use pdf_render::pdf_syntax::page::Page;
25use pdf_render::pdf_syntax::{Pdf, PdfLoadLimits};
26#[cfg(feature = "parallel")]
27use rayon::prelude::*;
28
29use kurbo::Rect;
30
/// Document metadata extracted from the info dictionary.
///
/// Every field is optional: an entry is `None` when the corresponding
/// metadata value is absent. Values are compared field-by-field, so two
/// `DocumentInfo` instances can be checked for equality directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DocumentInfo {
    /// Document title.
    pub title: Option<String>,
    /// Author.
    pub author: Option<String>,
    /// Subject.
    pub subject: Option<String>,
    /// Keywords.
    pub keywords: Option<String>,
    /// Creator application.
    pub creator: Option<String>,
    /// Producer application.
    pub producer: Option<String>,
}
47
/// A bookmark / outline item.
///
/// Items form a tree: each item owns its nested `children`. Equality is
/// structural and recurses through the children.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BookmarkItem {
    /// Bookmark title.
    pub title: String,
    /// Target page index (0-based), if resolvable.
    pub page: Option<usize>,
    /// Nested child bookmarks.
    pub children: Vec<BookmarkItem>,
}
58
/// High-level PDF document handle.
///
/// Bundles the parsed PDF with the interpreter settings that are passed to
/// every render and text-extraction call made through this handle.
pub struct PdfDocument {
    /// The parsed PDF.
    pdf: Pdf,
    /// Interpreter settings used for rendering and (with adjustments) for
    /// text extraction.
    settings: InterpreterSettings,
}
64
65impl PdfDocument {
66    /// Open a PDF from bytes.
67    pub fn open(data: impl Into<pdf_render::pdf_syntax::PdfData>) -> Result<Self> {
68        let pdf = Pdf::new(data).map_err(|e| match e {
69            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
70                EngineError::Encrypted(format!("{d:?}"))
71            }
72            _ => EngineError::InvalidPdf(format!("{e:?}")),
73        })?;
74        Ok(Self {
75            pdf,
76            settings: InterpreterSettings::default(),
77        })
78    }
79
80    /// Open a PDF from bytes with processing limits.
81    pub fn open_with_processing_limits(
82        data: impl Into<pdf_render::pdf_syntax::PdfData>,
83        limits: ProcessingLimits,
84    ) -> Result<Self> {
85        let syntax_limits = PdfLoadLimits::new()
86            .max_object_depth(limits.max_object_depth)
87            .max_image_pixels(limits.max_image_pixels)
88            .max_stream_bytes(limits.max_stream_bytes);
89        let pdf = Pdf::new_with_limits(data, syntax_limits).map_err(|e| match e {
90            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
91                EngineError::Encrypted(format!("{d:?}"))
92            }
93            _ => EngineError::InvalidPdf(format!("{e:?}")),
94        })?;
95        let settings = InterpreterSettings {
96            max_operator_count: Some(limits.max_operator_count),
97            ..InterpreterSettings::default()
98        };
99        Ok(Self { pdf, settings })
100    }
101
102    /// Open a password-protected PDF.
103    pub fn open_with_password(
104        data: impl Into<pdf_render::pdf_syntax::PdfData>,
105        password: &str,
106    ) -> Result<Self> {
107        let pdf = Pdf::new_with_password(data, password).map_err(|e| match e {
108            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
109                EngineError::Encrypted(format!("{d:?}"))
110            }
111            _ => EngineError::InvalidPdf(format!("{e:?}")),
112        })?;
113        Ok(Self {
114            pdf,
115            settings: InterpreterSettings::default(),
116        })
117    }
118
119    /// Open a password-protected PDF with processing limits.
120    pub fn open_with_password_and_processing_limits(
121        data: impl Into<pdf_render::pdf_syntax::PdfData>,
122        password: &str,
123        limits: ProcessingLimits,
124    ) -> Result<Self> {
125        let syntax_limits = PdfLoadLimits::new()
126            .max_object_depth(limits.max_object_depth)
127            .max_image_pixels(limits.max_image_pixels)
128            .max_stream_bytes(limits.max_stream_bytes);
129        let pdf = Pdf::new_with_password_and_limits(data, password, syntax_limits).map_err(
130            |e| match e {
131                pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
132                    EngineError::Encrypted(format!("{d:?}"))
133                }
134                _ => EngineError::InvalidPdf(format!("{e:?}")),
135            },
136        )?;
137        let settings = InterpreterSettings {
138            max_operator_count: Some(limits.max_operator_count),
139            ..InterpreterSettings::default()
140        };
141        Ok(Self { pdf, settings })
142    }
143
144    /// Access the underlying parsed PDF.
145    pub fn pdf(&self) -> &Pdf {
146        &self.pdf
147    }
148
149    /// Set interpreter settings (font resolver, cmap resolver, etc.).
150    pub fn set_settings(&mut self, settings: InterpreterSettings) {
151        self.settings = settings;
152    }
153
154    /// Number of pages.
155    pub fn page_count(&self) -> usize {
156        self.pdf.pages().len()
157    }
158
159    /// Get the geometry of a page.
160    pub fn page_geometry(&self, index: usize) -> Result<PageGeometry> {
161        let page = self.get_page(index)?;
162        Ok(geometry::extract_geometry(page))
163    }
164
165    /// Render a single page.
166    ///
167    /// If the document contains an XFA template, it is automatically flattened
168    /// to static PDF content before rendering.  This prevents the "Please wait"
169    /// placeholder page that Adobe Reader would show when rendering an XFA PDF
170    /// with a conventional renderer. If flattening fails, rendering falls back
171    /// to the original document as a best-effort path.
172    pub fn render_page(&self, index: usize, options: &RenderOptions) -> Result<RenderedPage> {
173        #[cfg(feature = "xfa")]
174        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
175            return flat_doc.render_page(index, options);
176        }
177        let page = self.get_page(index)?;
178        // Pre-flight: reject pathologically small or zero-dimension pages before
179        // allocating any pixel buffer. Non-positive dimensions cause panics or
180        // zero-sized allocations inside the rasteriser.
181        let (w, h) = page.render_dimensions();
182        if w <= 0.0 || h <= 0.0 {
183            return Err(EngineError::InvalidPageGeometry {
184                width: w,
185                height: h,
186                reason: "page has zero or negative dimensions".into(),
187            });
188        }
189        // Also reject pages so small they produce zero pixels even at the
190        // minimum meaningful DPI (1 DPI). Below ~0.72pt at 1 DPI = 0 pixels.
191        const MIN_PAGE_PT: f32 = 1.0;
192        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
193            return Err(EngineError::InvalidPageGeometry {
194                width: w,
195                height: h,
196                reason: "page too small to render (< 1pt)".into(),
197            });
198        }
199        let (settings, slot) = Self::with_limit_collector(&self.settings);
200        let rendered = render::render_page(page, options, &settings);
201        Self::check_limit_slot(&slot)?;
202        Ok(rendered)
203    }
204
205    /// Render a single page using the high-level render config.
206    ///
207    /// XFA documents are auto-flattened before rendering (same as `render_page`).
208    /// If flattening fails, rendering falls back to the original document.
209    pub fn render_page_with_config(
210        &self,
211        index: usize,
212        config: &RenderConfig,
213    ) -> Result<RenderedPage> {
214        #[cfg(feature = "xfa")]
215        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
216            return flat_doc.render_page_with_config(index, config);
217        }
218        let page = self.get_page(index)?;
219        let (w, h) = page.render_dimensions();
220        if w <= 0.0 || h <= 0.0 {
221            return Err(EngineError::InvalidPageGeometry {
222                width: w,
223                height: h,
224                reason: "page has zero or negative dimensions".into(),
225            });
226        }
227        const MIN_PAGE_PT: f32 = 1.0;
228        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
229            return Err(EngineError::InvalidPageGeometry {
230                width: w,
231                height: h,
232                reason: "page too small to render (< 1pt)".into(),
233            });
234        }
235        let (settings, slot) = Self::with_limit_collector(&self.settings);
236        let rendered = render::render_page_with_config(page, config, &settings);
237        Self::check_limit_slot(&slot)?;
238        Ok(rendered)
239    }
240
241    /// Render a single page to a CMYK buffer.
242    pub fn render_page_cmyk(&self, index: usize, dpi: u32) -> Result<RenderedPage> {
243        self.render_page_with_config(
244            index,
245            &RenderConfig {
246                color_mode: ColorMode::PreserveCmyk,
247                dpi,
248            },
249        )
250    }
251
252    /// Render all pages, in parallel when the `parallel` feature is enabled.
253    pub fn render_all(&self, options: &RenderOptions) -> Vec<RenderedPage> {
254        let pages = self.pdf.pages();
255        #[cfg(feature = "parallel")]
256        return (0..pages.len())
257            .into_par_iter()
258            .map(|i| render::render_page(&pages[i], options, &self.settings))
259            .collect();
260        #[cfg(not(feature = "parallel"))]
261        (0..pages.len())
262            .map(|i| render::render_page(&pages[i], options, &self.settings))
263            .collect()
264    }
265
266    /// Render all pages using the high-level render config.
267    pub fn render_all_with_config(&self, config: &RenderConfig) -> Vec<RenderedPage> {
268        let pages = self.pdf.pages();
269        #[cfg(feature = "parallel")]
270        return (0..pages.len())
271            .into_par_iter()
272            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
273            .collect();
274        #[cfg(not(feature = "parallel"))]
275        (0..pages.len())
276            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
277            .collect()
278    }
279
280    /// Generate a thumbnail for a single page.
281    pub fn thumbnail(&self, index: usize, options: &ThumbnailOptions) -> Result<RenderedPage> {
282        let page = self.get_page(index)?;
283        Ok(render::render_thumbnail(
284            page,
285            options.max_dimension,
286            &self.settings,
287        ))
288    }
289
290    /// Generate thumbnails for all pages, in parallel when the `parallel` feature is enabled.
291    pub fn thumbnails_all(&self, options: &ThumbnailOptions) -> Vec<RenderedPage> {
292        let pages = self.pdf.pages();
293        #[cfg(feature = "parallel")]
294        return (0..pages.len())
295            .into_par_iter()
296            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
297            .collect();
298        #[cfg(not(feature = "parallel"))]
299        (0..pages.len())
300            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
301            .collect()
302    }
303
304    /// Extract text from a page as a single string.
305    pub fn extract_text(&self, index: usize) -> Result<String> {
306        let page = self.get_page(index)?;
307        let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
308        let mut device = TextExtractionDevice::new();
309        let mut ctx = Self::create_context_with_settings(page, settings);
310        interpret_page(page, &mut ctx, &mut device);
311        Self::check_limit_slot(&slot)?;
312        Ok(device.into_text())
313    }
314
315    /// Extract text from a sequence of pages while reusing the same settings object.
316    #[doc(hidden)]
317    pub fn extract_text_pages_reusing_settings<I>(&self, indices: I) -> Result<Vec<String>>
318    where
319        I: IntoIterator<Item = usize>,
320    {
321        let pages = self.pdf.pages();
322        let mut settings = self.text_extraction_settings();
323        let indices = indices.into_iter();
324        let (lower_bound, upper_bound) = indices.size_hint();
325        let mut texts = Vec::with_capacity(upper_bound.unwrap_or(lower_bound));
326
327        for index in indices {
328            let page = pages.get(index).ok_or(EngineError::PageOutOfRange {
329                index,
330                count: pages.len(),
331            })?;
332            let (text, next_settings) = Self::extract_text_with_settings(page, settings);
333            settings = next_settings;
334            texts.push(text);
335        }
336
337        Ok(texts)
338    }
339
340    /// Extract structured text blocks from a page.
341    pub fn extract_text_blocks(&self, index: usize) -> Result<Vec<TextBlock>> {
342        let page = self.get_page(index)?;
343        let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
344        let mut device = TextExtractionDevice::new();
345        let mut ctx = Self::create_context_with_settings(page, settings);
346        interpret_page(page, &mut ctx, &mut device);
347        Self::check_limit_slot(&slot)?;
348        Ok(device.into_blocks())
349    }
350
351    /// Extract structured text blocks from all pages, reusing interpreter settings.
352    pub fn extract_all_text_blocks(&self) -> Vec<Vec<TextBlock>> {
353        let pages = self.pdf.pages();
354        let mut settings = self.text_extraction_settings();
355        let mut blocks = Vec::with_capacity(pages.len());
356
357        for page in pages.iter() {
358            let (page_blocks, next_settings) =
359                Self::extract_text_blocks_with_settings(page, settings);
360            settings = next_settings;
361            blocks.push(page_blocks);
362        }
363
364        blocks
365    }
366
367    /// Extract text values from AcroForm fields, including push-button captions.
368    ///
369    /// Returns a single string concatenating all non-empty field values separated
370    /// by newlines. Useful when the document stores its readable content in form
371    /// field values rather than (or in addition to) page content streams.
372    pub fn extract_acroform_text(&self) -> String {
373        let Some(tree) = parse_acroform(&self.pdf) else {
374            return String::new();
375        };
376        let mut parts: Vec<String> = Vec::new();
377        for id in tree.all_ids() {
378            let node = tree.get(id);
379            if node.children.is_empty() {
380                // Terminal (widget) — collect text-like values.
381                let value_str = match &node.value {
382                    Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.clone()),
383                    Some(FieldValue::StringArray(arr)) => {
384                        let joined = arr
385                            .iter()
386                            .filter(|s| !s.is_empty())
387                            .cloned()
388                            .collect::<Vec<_>>()
389                            .join(", ");
390                        if joined.is_empty() {
391                            None
392                        } else {
393                            Some(joined)
394                        }
395                    }
396                    _ => None,
397                };
398                let button_caption =
399                    value_str.is_none() && tree.effective_field_type(id) == Some(FieldType::Button);
400                let extracted = value_str.or_else(|| {
401                    button_caption.then(|| {
402                        node.mk
403                            .as_ref()
404                            .and_then(|mk| mk.caption.as_ref())
405                            .filter(|caption| !caption.is_empty())
406                            .cloned()
407                    })?
408                });
409                if let Some(s) = extracted {
410                    parts.push(s);
411                }
412            }
413        }
414        parts.join("\n")
415    }
416
417    /// Extract all text from the document: page content streams plus AcroForm
418    /// field values.  Mirrors pdftotext behaviour.
419    pub fn extract_all_text(&self) -> String {
420        let pages = self.pdf.pages();
421        let mut settings = self.text_extraction_settings();
422        let mut page_texts = Vec::with_capacity(pages.len());
423        for page in pages.iter() {
424            let (page_text, next_settings) = Self::extract_text_with_settings(page, settings);
425            settings = next_settings;
426            page_texts.push(page_text);
427        }
428
429        let mut text = join_page_texts(page_texts.iter().map(String::as_str));
430        let acroform = self.extract_acroform_text();
431        if !acroform.is_empty() {
432            if !text.is_empty() && !text.ends_with('\n') {
433                text.push('\n');
434            }
435            text.push_str(&acroform);
436        }
437        text
438    }
439
440    /// Simple text search: returns page indices containing the query string.
441    pub fn search_text(&self, query: &str) -> Vec<usize> {
442        let pages = self.pdf.pages();
443        let query_lower = query.to_lowercase();
444        #[cfg(feature = "parallel")]
445        let page_contains = |i: usize| -> Option<usize> {
446            let page = &pages[i];
447            let (text, _) = Self::extract_text_with_settings(page, self.text_extraction_settings());
448            if text.to_lowercase().contains(&query_lower) {
449                Some(i)
450            } else {
451                None
452            }
453        };
454        #[cfg(feature = "parallel")]
455        return (0..pages.len())
456            .into_par_iter()
457            .filter_map(page_contains)
458            .collect();
459        #[cfg(not(feature = "parallel"))]
460        {
461            let mut settings = self.text_extraction_settings();
462            let mut hits = Vec::new();
463            for (i, page) in pages.iter().enumerate() {
464                let (text, next_settings) = Self::extract_text_with_settings(page, settings);
465                settings = next_settings;
466                if text.to_lowercase().contains(&query_lower) {
467                    hits.push(i);
468                }
469            }
470            hits
471        }
472    }
473
474    /// Extract document metadata.
475    pub fn info(&self) -> DocumentInfo {
476        let meta = self.pdf.metadata();
477        DocumentInfo {
478            title: meta.title.as_ref().map(|b| bytes_to_string(b)),
479            author: meta.author.as_ref().map(|b| bytes_to_string(b)),
480            subject: meta.subject.as_ref().map(|b| bytes_to_string(b)),
481            keywords: meta.keywords.as_ref().map(|b| bytes_to_string(b)),
482            creator: meta.creator.as_ref().map(|b| bytes_to_string(b)),
483            producer: meta.producer.as_ref().map(|b| bytes_to_string(b)),
484        }
485    }
486
487    /// Extract document outline / bookmarks.
488    pub fn bookmarks(&self) -> Vec<BookmarkItem> {
489        let xref = self.pdf.xref();
490        let root_id = xref.root_id();
491        let catalog: Dict<'_> = match xref.get(root_id) {
492            Some(d) => d,
493            None => return Vec::new(),
494        };
495
496        let outlines: Dict<'_> = match catalog.get(OUTLINES) {
497            Some(d) => d,
498            None => return Vec::new(),
499        };
500
501        let first: Dict<'_> = match outlines.get(FIRST) {
502            Some(d) => d,
503            None => return Vec::new(),
504        };
505
506        parse_outline_items(&first)
507    }
508
509    /// Run OCR on a page and return the recognized text and word positions.
510    ///
511    /// The page is rendered at `dpi` (default 150) before recognition.
512    /// Pass any [`OcrBackend`] implementation; use [`OcrsBackend::try_default`]
513    /// to load the pure-Rust `ocrs` engine from the standard model paths.
514    ///
515    /// # Example
516    ///
517    /// ```no_run
518    /// # #[cfg(feature = "ocr")] {
519    /// use pdf_engine::{PdfDocument, OcrsBackend, RenderOptions};
520    ///
521    /// let doc = PdfDocument::open(std::fs::read("scan.pdf").unwrap()).unwrap();
522    /// let backend = OcrsBackend::try_default().unwrap();
523    /// let result = doc.ocr_page(0, &backend, 150.0_f64).unwrap();
524    /// println!("{}", result.text);
525    /// # }
526    /// ```
527    pub fn ocr_page(
528        &self,
529        index: usize,
530        backend: &dyn crate::ocr::OcrBackend,
531        dpi: f64,
532    ) -> crate::error::Result<crate::ocr::OcrResult> {
533        let opts = crate::render::RenderOptions {
534            dpi,
535            ..Default::default()
536        };
537        let rendered = self.render_page(index, &opts)?;
538
539        // Convert RGBA → RGB (ocrs expects RGB input).
540        let mut rgb = Vec::with_capacity((rendered.width * rendered.height * 3) as usize);
541        for chunk in rendered.pixels.chunks(4) {
542            rgb.push(chunk[0]);
543            rgb.push(chunk[1]);
544            rgb.push(chunk[2]);
545        }
546
547        backend
548            .recognize(&rgb, rendered.width, rendered.height)
549            .map_err(|e| crate::error::EngineError::RenderError(e.to_string()))
550    }
551
552    /// Wrap `settings` with a warning sink that captures the first
553    /// `InterpreterWarning::StreamTooLarge` into a shared slot.
554    ///
555    /// The returned slot is checked by [`Self::check_limit_slot`] after
556    /// the operation completes. Any previously installed sink is still
557    /// called so no warnings are silently dropped.
558    fn with_limit_collector(settings: &InterpreterSettings) -> (InterpreterSettings, LimitSlot) {
559        let slot: LimitSlot = Arc::new(Mutex::new(None));
560        let slot_clone = Arc::clone(&slot);
561        let prev_sink = settings.warning_sink.clone();
562        let mut new_settings = settings.clone();
563        new_settings.warning_sink = Arc::new(move |w: InterpreterWarning| {
564            if let InterpreterWarning::StreamTooLarge { observed, limit } = w {
565                let mut guard = slot_clone.lock().unwrap_or_else(|e| e.into_inner());
566                if guard.is_none() {
567                    *guard = Some((observed, limit));
568                }
569            }
570            prev_sink(w);
571        });
572        (new_settings, slot)
573    }
574
575    /// Check the slot populated by [`Self::with_limit_collector`].
576    ///
577    /// Returns `Err(EngineError::LimitExceeded(...))` if a
578    /// `StreamTooLarge` warning was captured, `Ok(())` otherwise.
579    fn check_limit_slot(slot: &LimitSlot) -> Result<()> {
580        if let Some((observed, limit)) = *slot.lock().unwrap_or_else(|e| e.into_inner()) {
581            return Err(EngineError::LimitExceeded(LimitError::StreamTooLarge {
582                actual_bytes: observed,
583                limit_bytes: limit,
584            }));
585        }
586        Ok(())
587    }
588
589    fn get_page(&self, index: usize) -> Result<&Page<'_>> {
590        let pages = self.pdf.pages();
591        if index >= pages.len() {
592            return Err(EngineError::PageOutOfRange {
593                index,
594                count: pages.len(),
595            });
596        }
597        Ok(&pages[index])
598    }
599
600    fn text_extraction_settings(&self) -> InterpreterSettings {
601        let mut settings = self.settings.clone();
602        // Text extraction should include signature widget appearance streams
603        // that rendering skips to match MuPDF visual output.
604        settings.skip_signature_widgets = false;
605        settings
606    }
607
608    fn create_context_with_settings<'a>(
609        page: &Page<'a>,
610        settings: InterpreterSettings,
611    ) -> Context<'a> {
612        let (w, h) = page.render_dimensions();
613        Context::new(
614            page.initial_transform(false),
615            Rect::new(0.0, 0.0, w as f64, h as f64),
616            page.xref(),
617            settings,
618        )
619    }
620
621    fn extract_text_with_settings<'a>(
622        page: &Page<'a>,
623        settings: InterpreterSettings,
624    ) -> (String, InterpreterSettings) {
625        let mut device = TextExtractionDevice::new();
626        let mut ctx = Self::create_context_with_settings(page, settings);
627        interpret_page(page, &mut ctx, &mut device);
628        let settings = ctx.into_settings();
629        (device.into_text(), settings)
630    }
631
632    fn extract_text_blocks_with_settings<'a>(
633        page: &Page<'a>,
634        settings: InterpreterSettings,
635    ) -> (Vec<TextBlock>, InterpreterSettings) {
636        let mut device = TextExtractionDevice::new();
637        let mut ctx = Self::create_context_with_settings(page, settings);
638        interpret_page(page, &mut ctx, &mut device);
639        let settings = ctx.into_settings();
640        (device.into_blocks(), settings)
641    }
642
643    #[cfg(feature = "xfa")]
644    fn open_flattened_xfa_for_render(&self) -> Option<Self> {
645        if !crate::xfa::has_xfa(self) {
646            return None;
647        }
648
649        let flat_bytes = crate::xfa::flatten(self).ok()?;
650        let mut flat_doc = Self::open(flat_bytes).ok()?;
651        flat_doc.settings = self.settings.clone();
652        Some(flat_doc)
653    }
654}
655
/// Concatenate per-page texts into one string, separating pages with a form
/// feed (U+000C).
///
/// Before each separator the accumulated text is padded with newlines until
/// it ends in a blank line (`"\n\n"`); no padding is added while the
/// accumulated text is still empty.
fn join_page_texts<I>(page_texts: I) -> String
where
    I: IntoIterator,
    I::Item: AsRef<str>,
{
    let mut joined = String::new();

    for (position, page) in page_texts.into_iter().enumerate() {
        if position > 0 {
            if !joined.is_empty() {
                while !joined.ends_with("\n\n") {
                    joined.push('\n');
                }
            }
            joined.push('\u{000C}');
        }
        joined.push_str(page.as_ref());
    }

    joined
}
677
/// Unit tests for the page-joining rules used by `extract_all_text`.
#[cfg(test)]
mod extract_all_text_tests {
    use super::join_page_texts;

    /// Two non-empty pages are separated by a blank line and a form feed.
    #[test]
    fn separates_nonempty_pages_like_pdftotext() {
        let joined = join_page_texts(["Page 1", "Page 2"]);
        assert_eq!(joined, "Page 1\n\n\u{000C}Page 2");
    }

    /// A leading empty page contributes only the form feed.
    #[test]
    fn preserves_leading_blank_pages_without_extra_newlines() {
        let joined = join_page_texts(["", "Page 2"]);
        assert_eq!(joined, "\u{000C}Page 2");
    }

    /// A page that already ends in a blank line gets no extra padding.
    #[test]
    fn reuses_existing_blank_line_before_form_feed() {
        let joined = join_page_texts(["Page 1\n\n", "Page 2"]);
        assert_eq!(joined, "Page 1\n\n\u{000C}Page 2");
    }
}
703
704/// Walk the outline linked list (FIRST → NEXT chain).
705fn parse_outline_items(item_dict: &Dict<'_>) -> Vec<BookmarkItem> {
706    let mut items = Vec::new();
707    let mut current: Option<Dict<'_>> = Some(item_dict.clone());
708
709    while let Some(dict) = current {
710        let title = dict
711            .get::<pdf_render::pdf_syntax::object::String>(TITLE)
712            .map(|s| bytes_to_string(s.as_bytes()))
713            .unwrap_or_default();
714
715        let children = match dict.get::<Dict<'_>>(FIRST) {
716            Some(child_dict) => parse_outline_items(&child_dict),
717            None => Vec::new(),
718        };
719
720        items.push(BookmarkItem {
721            title,
722            page: None, // Destination resolution requires named-dest lookup — left for follow-up
723            children,
724        });
725
726        current = dict.get::<Dict<'_>>(NEXT);
727    }
728
729    items
730}
731
/// Convert PDF string bytes to a Rust `String`.
///
/// Decoding rules, in order:
/// 1. A leading `FE FF` byte-order mark selects UTF-16BE; the mark is
///    removed, a trailing odd byte is ignored, and invalid code units are
///    replaced with U+FFFD.
/// 2. Otherwise the bytes are tried as UTF-8; a leading UTF-8 byte-order
///    mark (`EF BB BF`) is removed so it does not end up in the text.
/// 3. If the bytes are not valid UTF-8, every byte is mapped to the Unicode
///    code point of the same value (Latin-1).
fn bytes_to_string(bytes: &[u8]) -> String {
    const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

    if let Some(payload) = bytes.strip_prefix(UTF16_BE_BOM) {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }

    // Try UTF-8 (without its optional BOM), fall back to Latin-1 on the
    // untouched input.
    let utf8_candidate = bytes.strip_prefix(UTF8_BOM).unwrap_or(bytes);
    match std::str::from_utf8(utf8_candidate) {
        Ok(s) => s.to_owned(),
        Err(_) => bytes.iter().map(|&b| char::from(b)).collect(),
    }
}
755
756#[cfg(test)]
757mod tests {
758    use super::*;
759    use crate::render::{ColorMode, PixelFormat, RenderConfig, RenderOptions};
760    use lopdf::{Document as LoDocument, Object};
761    use std::path::PathBuf;
762
763    fn corpus_path(name: &str) -> PathBuf {
764        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
765            .join("../../corpus")
766            .join(name)
767    }
768
769    fn normalize_text(text: &str) -> String {
770        text.split_whitespace().collect::<Vec<_>>().join(" ")
771    }
772
773    fn strip_type0_tounicode(data: &[u8]) -> (Vec<u8>, usize) {
774        fn get_name(dict: &lopdf::Dictionary, key: &[u8]) -> Option<Vec<u8>> {
775            match dict.get(key).ok()? {
776                Object::Name(name) => Some(name.clone()),
777                _ => None,
778            }
779        }
780
781        fn descendant_is_cidfont_type2(doc: &LoDocument, type0: &lopdf::Dictionary) -> bool {
782            let Some(Object::Array(descendants)) = type0.get(b"DescendantFonts").ok() else {
783                return false;
784            };
785            let Some(Object::Reference(desc_id)) = descendants.first() else {
786                return false;
787            };
788            let Ok(Object::Dictionary(descendant)) = doc.get_object(*desc_id) else {
789                return false;
790            };
791            matches!(
792                descendant.get(b"Subtype").ok(),
793                Some(Object::Name(name)) if name.as_slice() == b"CIDFontType2"
794            )
795        }
796
797        let mut doc = LoDocument::load_mem(data).expect("load stripped-to-unicode fixture");
798        let ids: Vec<_> = doc.objects.keys().copied().collect();
799        let mut removed = 0usize;
800
801        for id in ids {
802            let Some(Object::Dictionary(dict)) = doc.objects.get(&id) else {
803                continue;
804            };
805            if !matches!(
806                dict.get(b"Subtype").ok(),
807                Some(Object::Name(name)) if name.as_slice() == b"Type0"
808            ) {
809                continue;
810            }
811            if !matches!(
812                get_name(dict, b"Encoding").as_deref(),
813                Some(b"Identity-H") | Some(b"Identity-V")
814            ) {
815                continue;
816            }
817            if !descendant_is_cidfont_type2(&doc, dict) {
818                continue;
819            }
820
821            if let Some(Object::Dictionary(type0)) = doc.objects.get_mut(&id) {
822                if type0.has(b"ToUnicode") {
823                    type0.remove(b"ToUnicode");
824                    removed += 1;
825                }
826            }
827        }
828
829        let mut out = Vec::new();
830        doc.save_to(&mut out)
831            .expect("save stripped-to-unicode fixture");
832        (out, removed)
833    }
834
835    fn solid_fill_pdf_bytes(color_operator: &str) -> Vec<u8> {
836        use lopdf::{dictionary, Document, Object, Stream};
837
838        let mut doc = Document::with_version("1.4");
839
840        let pages_id = doc.new_object_id();
841        let page_id = doc.new_object_id();
842        let content = format!("{color_operator}\n0 0 72 72 re f\n");
843        let content_id = doc.add_object(Stream::new(dictionary! {}, content.into_bytes()));
844
845        doc.objects.insert(
846            page_id,
847            Object::Dictionary(dictionary! {
848                "Type" => Object::Name(b"Page".to_vec()),
849                "Parent" => Object::Reference(pages_id),
850                "MediaBox" => Object::Array(vec![
851                    Object::Integer(0),
852                    Object::Integer(0),
853                    Object::Integer(72),
854                    Object::Integer(72),
855                ]),
856                "Contents" => Object::Reference(content_id),
857            }),
858        );
859
860        doc.objects.insert(
861            pages_id,
862            Object::Dictionary(dictionary! {
863                "Type" => Object::Name(b"Pages".to_vec()),
864                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
865                "Count" => Object::Integer(1),
866            }),
867        );
868
869        let catalog_id = doc.new_object_id();
870        doc.objects.insert(
871            catalog_id,
872            Object::Dictionary(dictionary! {
873                "Type" => Object::Name(b"Catalog".to_vec()),
874                "Pages" => Object::Reference(pages_id),
875            }),
876        );
877
878        doc.trailer.set("Root", Object::Reference(catalog_id));
879
880        let mut bytes = Vec::new();
881        doc.save_to(&mut bytes).expect("save solid fill fixture");
882        bytes
883    }
884
885    fn mixed_rgb_cmyk_pdf_bytes() -> Vec<u8> {
886        use lopdf::{dictionary, Document, Object, Stream};
887
888        let mut doc = Document::with_version("1.4");
889        let pages_id = doc.new_object_id();
890        let page_id = doc.new_object_id();
891        let content = b"1 0 0 rg\n0 0 36 72 re f\n1 0 0 0 k\n36 0 36 72 re f\n".to_vec();
892        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
893
894        doc.objects.insert(
895            page_id,
896            Object::Dictionary(dictionary! {
897                "Type" => "Page",
898                "Parent" => Object::Reference(pages_id),
899                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
900                "Contents" => Object::Reference(content_id),
901            }),
902        );
903        doc.objects.insert(
904            pages_id,
905            Object::Dictionary(dictionary! {
906                "Type" => "Pages",
907                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
908                "Count" => Object::Integer(1),
909            }),
910        );
911        let catalog_id = doc.new_object_id();
912        doc.objects.insert(
913            catalog_id,
914            Object::Dictionary(dictionary! {
915                "Type" => "Catalog",
916                "Pages" => Object::Reference(pages_id),
917            }),
918        );
919        doc.trailer.set("Root", Object::Reference(catalog_id));
920
921        let mut bytes = Vec::new();
922        doc.save_to(&mut bytes)
923            .expect("save mixed rgb/cmyk fixture");
924        bytes
925    }
926
927    fn transparent_cmyk_pdf_bytes() -> Vec<u8> {
928        use lopdf::{dictionary, Document, Object, Stream};
929
930        let mut doc = Document::with_version("1.4");
931        let pages_id = doc.new_object_id();
932        let page_id = doc.new_object_id();
933        let gs_id = doc.add_object(Object::Dictionary(dictionary! {
934            "Type" => "ExtGState",
935            "ca" => Object::Real(0.5),
936        }));
937        let content = b"/GS1 gs\n1 0 0 0 k\n0 0 72 72 re f\n".to_vec();
938        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
939
940        doc.objects.insert(
941            page_id,
942            Object::Dictionary(dictionary! {
943                "Type" => "Page",
944                "Parent" => Object::Reference(pages_id),
945                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
946                "Resources" => dictionary! {
947                    "ExtGState" => dictionary! {
948                        "GS1" => Object::Reference(gs_id),
949                    },
950                },
951                "Contents" => Object::Reference(content_id),
952            }),
953        );
954        doc.objects.insert(
955            pages_id,
956            Object::Dictionary(dictionary! {
957                "Type" => "Pages",
958                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
959                "Count" => Object::Integer(1),
960            }),
961        );
962        let catalog_id = doc.new_object_id();
963        doc.objects.insert(
964            catalog_id,
965            Object::Dictionary(dictionary! {
966                "Type" => "Catalog",
967                "Pages" => Object::Reference(pages_id),
968            }),
969        );
970        doc.trailer.set("Root", Object::Reference(catalog_id));
971
972        let mut bytes = Vec::new();
973        doc.save_to(&mut bytes)
974            .expect("save transparent cmyk fixture");
975        bytes
976    }
977
978    fn cmyk_image_pdf_bytes() -> Vec<u8> {
979        use lopdf::{dictionary, Document, Object, Stream};
980
981        let mut doc = Document::with_version("1.4");
982        let pages_id = doc.new_object_id();
983        let page_id = doc.new_object_id();
984        let image_id = doc.add_object(Stream::new(
985            dictionary! {
986                "Type" => "XObject",
987                "Subtype" => "Image",
988                "Width" => Object::Integer(2),
989                "Height" => Object::Integer(1),
990                "BitsPerComponent" => Object::Integer(8),
991                "ColorSpace" => "DeviceCMYK",
992            },
993            vec![255, 0, 0, 0, 0, 255, 0, 0],
994        ));
995        let content = b"q\n2 0 0 1 0 0 cm\n/Im1 Do\nQ\n".to_vec();
996        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
997
998        doc.objects.insert(
999            page_id,
1000            Object::Dictionary(dictionary! {
1001                "Type" => "Page",
1002                "Parent" => Object::Reference(pages_id),
1003                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 2.into(), 1.into()]),
1004                "Resources" => dictionary! {
1005                    "XObject" => dictionary! {
1006                        "Im1" => Object::Reference(image_id),
1007                    },
1008                },
1009                "Contents" => Object::Reference(content_id),
1010            }),
1011        );
1012        doc.objects.insert(
1013            pages_id,
1014            Object::Dictionary(dictionary! {
1015                "Type" => "Pages",
1016                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1017                "Count" => Object::Integer(1),
1018            }),
1019        );
1020        let catalog_id = doc.new_object_id();
1021        doc.objects.insert(
1022            catalog_id,
1023            Object::Dictionary(dictionary! {
1024                "Type" => "Catalog",
1025                "Pages" => Object::Reference(pages_id),
1026            }),
1027        );
1028        doc.trailer.set("Root", Object::Reference(catalog_id));
1029
1030        let mut bytes = Vec::new();
1031        doc.save_to(&mut bytes).expect("save cmyk image fixture");
1032        bytes
1033    }
1034
1035    fn pixel_at(rendered: &RenderedPage, x: u32, y: u32) -> [u8; 4] {
1036        let idx = ((y * rendered.width + x) * 4) as usize;
1037        [
1038            rendered.pixels[idx],
1039            rendered.pixels[idx + 1],
1040            rendered.pixels[idx + 2],
1041            rendered.pixels[idx + 3],
1042        ]
1043    }
1044
1045    /// Build a minimal one-page PDF whose only font is a non-embedded TrueType
1046    /// reference (no `FontFile2`). The character codes in the content stream
1047    /// resolve through the declared `/Encoding`, exercising the same code path
1048    /// as corpus PDFs like `171_171940.pdf`.
1049    fn non_embedded_truetype_pdf_bytes(
1050        base_font: &[u8],
1051        encoding: &[u8],
1052        text_bytes: &[u8],
1053    ) -> Vec<u8> {
1054        use lopdf::{dictionary, Document, Object, Stream};
1055
1056        let mut doc = Document::with_version("1.4");
1057
1058        let font_id = doc.add_object(Object::Dictionary(dictionary! {
1059            "Type" => "Font",
1060            "Subtype" => "TrueType",
1061            "Name" => Object::Name(b"F0".to_vec()),
1062            "BaseFont" => Object::Name(base_font.to_vec()),
1063            "Encoding" => Object::Name(encoding.to_vec()),
1064        }));
1065
1066        let resources_id = doc.add_object(Object::Dictionary(dictionary! {
1067            "Font" => dictionary! { "F0" => Object::Reference(font_id) },
1068        }));
1069
1070        let mut content = Vec::new();
1071        content.extend_from_slice(b"BT\n/F0 12 Tf\n100 700 Td\n(");
1072        for &b in text_bytes {
1073            match b {
1074                b'(' | b')' | b'\\' => {
1075                    content.push(b'\\');
1076                    content.push(b);
1077                }
1078                _ => content.push(b),
1079            }
1080        }
1081        content.extend_from_slice(b") Tj\nET\n");
1082        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1083
1084        let pages_id = doc.new_object_id();
1085        let page_id = doc.add_object(Object::Dictionary(dictionary! {
1086            "Type" => "Page",
1087            "Parent" => Object::Reference(pages_id),
1088            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1089            "Resources" => Object::Reference(resources_id),
1090            "Contents" => Object::Reference(content_id),
1091        }));
1092        doc.objects.insert(
1093            pages_id,
1094            Object::Dictionary(dictionary! {
1095                "Type" => "Pages",
1096                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1097                "Count" => Object::Integer(1),
1098            }),
1099        );
1100        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
1101            "Type" => "Catalog",
1102            "Pages" => Object::Reference(pages_id),
1103        }));
1104        doc.trailer.set("Root", Object::Reference(catalog_id));
1105
1106        let mut bytes = Vec::new();
1107        doc.save_to(&mut bytes).expect("save non-embedded fixture");
1108        bytes
1109    }
1110
1111    /// Build a minimal AcroForm push button whose only human-readable text
1112    /// lives in the widget `/MK /CA` caption entry.
1113    fn push_button_caption_pdf_bytes(caption: &[u8]) -> Vec<u8> {
1114        use lopdf::{dictionary, Document, Object, Stream, StringFormat};
1115
1116        let mut doc = Document::with_version("1.4");
1117
1118        let catalog_id = doc.new_object_id();
1119        let pages_id = doc.new_object_id();
1120        let page_id = doc.new_object_id();
1121        let acroform_id = doc.new_object_id();
1122        let content_id = doc.new_object_id();
1123        let widget_id = doc.new_object_id();
1124
1125        doc.objects.insert(
1126            content_id,
1127            Object::Stream(Stream::new(dictionary! {}, Vec::new())),
1128        );
1129        doc.objects.insert(
1130            widget_id,
1131            Object::Dictionary(dictionary! {
1132                "Type" => "Annot",
1133                "Subtype" => "Widget",
1134                "FT" => "Btn",
1135                "Ff" => Object::Integer(1 << 16),
1136                "T" => Object::String(b"button".to_vec(), StringFormat::Literal),
1137                "MK" => dictionary! {
1138                    "CA" => Object::String(caption.to_vec(), StringFormat::Literal),
1139                },
1140                "Rect" => Object::Array(vec![100.into(), 700.into(), 260.into(), 730.into()]),
1141                "P" => Object::Reference(page_id),
1142            }),
1143        );
1144        doc.objects.insert(
1145            page_id,
1146            Object::Dictionary(dictionary! {
1147                "Type" => "Page",
1148                "Parent" => Object::Reference(pages_id),
1149                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1150                "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
1151                "Contents" => Object::Reference(content_id),
1152            }),
1153        );
1154        doc.objects.insert(
1155            pages_id,
1156            Object::Dictionary(dictionary! {
1157                "Type" => "Pages",
1158                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1159                "Count" => Object::Integer(1),
1160            }),
1161        );
1162        doc.objects.insert(
1163            acroform_id,
1164            Object::Dictionary(dictionary! {
1165                "Fields" => Object::Array(vec![Object::Reference(widget_id)]),
1166            }),
1167        );
1168        doc.objects.insert(
1169            catalog_id,
1170            Object::Dictionary(dictionary! {
1171                "Type" => "Catalog",
1172                "Pages" => Object::Reference(pages_id),
1173                "AcroForm" => Object::Reference(acroform_id),
1174            }),
1175        );
1176        doc.trailer.set("Root", Object::Reference(catalog_id));
1177
1178        let mut bytes = Vec::new();
1179        doc.save_to(&mut bytes)
1180            .expect("save push-button caption fixture");
1181        bytes
1182    }
1183
1184    #[test]
1185    fn extract_text_non_embedded_truetype_alias_resolves_via_winansi() {
1186        // Mirrors corpus PDF `171_171940.pdf`: TrueType font references
1187        // `TimesNewRoman` (resolves through the standard-font alias table)
1188        // with `WinAnsiEncoding` and no embedded font program. Extraction must
1189        // recover the text from the declared encoding even though no glyph
1190        // outlines are available.
1191        let bytes = non_embedded_truetype_pdf_bytes(
1192            b"TimesNewRoman",
1193            b"WinAnsiEncoding",
1194            b"UNITED STATES DISTRICT COURT",
1195        );
1196        let text = PdfDocument::open(bytes)
1197            .expect("open non-embedded TrueType fixture")
1198            .extract_text(0)
1199            .expect("extract non-embedded TrueType text");
1200        let norm = normalize_text(&text);
1201        assert!(
1202            norm.contains("UNITED STATES DISTRICT COURT"),
1203            "expected WinAnsi-decoded text, got: {norm:?}"
1204        );
1205    }
1206
1207    #[test]
1208    fn extract_text_non_embedded_truetype_unknown_name_still_decodes() {
1209        // Custom BaseFont that does not match any standard alias and lacks the
1210        // keywords used by the heuristic. The standard-font fallback (via
1211        // FallbackFontQuery) still picks Helvetica, but on hosts without the
1212        // embedded font assets that path returns None — the new TextOnly
1213        // branch is what keeps extraction non-empty in that case. Either way,
1214        // the WinAnsi-driven char map must produce the original prose.
1215        let bytes = non_embedded_truetype_pdf_bytes(
1216            b"OpaqueCustomXYZ",
1217            b"WinAnsiEncoding",
1218            b"Hello, world!",
1219        );
1220        let text = PdfDocument::open(bytes)
1221            .expect("open custom non-embedded fixture")
1222            .extract_text(0)
1223            .expect("extract custom non-embedded text");
1224        let norm = normalize_text(&text);
1225        assert!(
1226            norm.contains("Hello, world!"),
1227            "expected WinAnsi-decoded text, got: {norm:?}"
1228        );
1229    }
1230
1231    #[test]
1232    fn extract_acroform_text_includes_push_button_mk_caption() {
1233        let bytes = push_button_caption_pdf_bytes(b"Don't cry over spilt milk");
1234        let doc = PdfDocument::open(bytes).expect("open push-button caption fixture");
1235
1236        let page_text = doc.extract_text(0).expect("extract page text");
1237        assert!(
1238            normalize_text(&page_text).is_empty(),
1239            "expected empty page content stream, got: {page_text:?}"
1240        );
1241
1242        let acroform_text = doc.extract_acroform_text();
1243        assert_eq!(normalize_text(&acroform_text), "Don't cry over spilt milk");
1244
1245        let all_text = doc.extract_all_text();
1246        assert_eq!(normalize_text(&all_text), "Don't cry over spilt milk");
1247    }
1248
1249    #[test]
1250    fn bytes_to_string_utf8() {
1251        assert_eq!(bytes_to_string(b"hello"), "hello");
1252    }
1253
1254    #[test]
1255    fn bytes_to_string_latin1() {
1256        let bytes = &[0xC4, 0xD6, 0xDC]; // ÄÖÜ in Latin-1
1257        let s = bytes_to_string(bytes);
1258        assert_eq!(s, "ÄÖÜ");
1259    }
1260
1261    #[test]
1262    fn bytes_to_string_utf16() {
1263        let bytes = &[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]; // UTF-16 "Hi"
1264        assert_eq!(bytes_to_string(bytes), "Hi");
1265    }
1266
1267    #[test]
1268    fn document_info_default() {
1269        let info = DocumentInfo::default();
1270        assert!(info.title.is_none());
1271        assert!(info.author.is_none());
1272    }
1273
1274    #[test]
1275    fn bookmark_item_children() {
1276        let item = BookmarkItem {
1277            title: "Root".into(),
1278            page: None,
1279            children: vec![BookmarkItem {
1280                title: "Child".into(),
1281                page: Some(0),
1282                children: Vec::new(),
1283            }],
1284        };
1285        assert_eq!(item.children.len(), 1);
1286        assert_eq!(item.children[0].title, "Child");
1287    }
1288
1289    #[test]
1290    fn extract_text_type0_without_tounicode_uses_font_program_fallback() {
1291        let original = std::fs::read(corpus_path("sf181.pdf")).expect("read sf181 fixture");
1292        let expected = PdfDocument::open(original.clone())
1293            .expect("open original sf181")
1294            .extract_text(0)
1295            .expect("extract original sf181 text");
1296        assert!(
1297            expected.contains("Guide to Personnel Data Standards"),
1298            "unexpected baseline extraction: {expected}"
1299        );
1300
1301        let (stripped, removed) = strip_type0_tounicode(&original);
1302        assert!(
1303            removed > 0,
1304            "expected to strip at least one Type0 ToUnicode"
1305        );
1306
1307        let actual = PdfDocument::open(stripped)
1308            .expect("open stripped sf181")
1309            .extract_text(0)
1310            .expect("extract stripped sf181 text");
1311
1312        let actual_norm = normalize_text(&actual);
1313        let expected_norm = normalize_text(&expected);
1314
1315        assert!(
1316            actual_norm.contains("Guide to Personnel Data Standards"),
1317            "missing main heading after stripping ToUnicode: {actual_norm}"
1318        );
1319        assert!(
1320            actual_norm.contains("Privacy Act Statement"),
1321            "missing body text after stripping ToUnicode: {actual_norm}"
1322        );
1323        assert!(
1324            actual_norm.len() + 32 >= expected_norm.len(),
1325            "too much text lost after stripping ToUnicode: expected {} chars, got {}",
1326            expected_norm.len(),
1327            actual_norm.len()
1328        );
1329    }
1330
1331    #[test]
1332    fn extract_text_identity_h_bogus_tounicode_recovers_via_identity_fallback() {
1333        // PDFBOX-4322-3.pdf ships an Identity-H Type0 font whose `/ToUnicode`
1334        // stream is actually an Identity-H *encoding* CMap (only
1335        // `begincidrange <0000> <FFFF> 0`, no bf-mappings). The embedded
1336        // TrueType subset also has no `cmap` table, so both the ToUnicode
1337        // lookup and the reverse-cmap fallback fail. Previously this yielded
1338        // a 0-byte extraction because the character codes — which are Unicode
1339        // code points under Identity-H — were silently discarded.
1340        let bytes =
1341            std::fs::read(corpus_path("PDFBOX-4322-3.pdf")).expect("read PDFBOX-4322-3 fixture");
1342        let doc = PdfDocument::open(bytes).expect("open PDFBOX-4322-3");
1343        let text = doc.extract_all_text();
1344
1345        let norm = normalize_text(&text);
1346        assert!(
1347            norm.contains("Transatlantic Council"),
1348            "expected Identity-H codes to resolve as Unicode: {norm}"
1349        );
1350        assert!(
1351            norm.contains("Boy Scouts of America"),
1352            "expected body text to be recovered: {norm}"
1353        );
1354    }
1355
1356    #[test]
1357    fn render_max_pixels_none_is_unchanged_default_behavior() {
1358        // Default (max_pixels = None) must be byte-identical to an explicit no-budget render.
1359        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open fixture");
1360        let baseline = doc
1361            .render_page(
1362                0,
1363                &RenderOptions {
1364                    dpi: 144.0,
1365                    ..Default::default()
1366                },
1367            )
1368            .expect("baseline render");
1369        let explicit_none = doc
1370            .render_page(
1371                0,
1372                &RenderOptions {
1373                    dpi: 144.0,
1374                    max_pixels: None,
1375                    ..Default::default()
1376                },
1377            )
1378            .expect("explicit-none render");
1379        assert_eq!(baseline.width, explicit_none.width);
1380        assert_eq!(baseline.height, explicit_none.height);
1381        assert_eq!(baseline.pixels, explicit_none.pixels);
1382    }
1383
1384    #[test]
1385    fn render_max_pixels_budget_clamps_resolution() {
1386        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open fixture");
1387        let full = doc
1388            .render_page(
1389                0,
1390                &RenderOptions {
1391                    dpi: 288.0,
1392                    ..Default::default()
1393                },
1394            )
1395            .expect("full render");
1396        let full_px = full.width * full.height;
1397        // Budget well below the full pixel count must reduce output dimensions.
1398        let budget = full_px / 4;
1399        let capped = doc
1400            .render_page(
1401                0,
1402                &RenderOptions {
1403                    dpi: 288.0,
1404                    max_pixels: Some(budget),
1405                    ..Default::default()
1406                },
1407            )
1408            .expect("capped render");
1409        assert!(
1410            capped.width * capped.height <= full_px,
1411            "capped output must not exceed full output"
1412        );
1413        assert!(
1414            capped.width < full.width || capped.height < full.height,
1415            "budget below full pixel count must shrink at least one dimension"
1416        );
1417    }
1418
1419    #[test]
1420    fn render_max_pixels_large_budget_no_clamp() {
1421        // A budget larger than the rendered size must not change the output.
1422        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open fixture");
1423        let baseline = doc
1424            .render_page(
1425                0,
1426                &RenderOptions {
1427                    dpi: 72.0,
1428                    ..Default::default()
1429                },
1430            )
1431            .expect("baseline");
1432        let huge = doc
1433            .render_page(
1434                0,
1435                &RenderOptions {
1436                    dpi: 72.0,
1437                    max_pixels: Some(100_000_000),
1438                    ..Default::default()
1439                },
1440            )
1441            .expect("huge-budget render");
1442        assert_eq!(baseline.width, huge.width);
1443        assert_eq!(baseline.height, huge.height);
1444        assert_eq!(baseline.pixels, huge.pixels);
1445    }
1446
1447    #[test]
1448    fn render_page_with_config_srgb_matches_legacy_render_page() {
1449        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open rgb fixture");
1450        let legacy = doc
1451            .render_page(
1452                0,
1453                &RenderOptions {
1454                    dpi: 72.0,
1455                    ..Default::default()
1456                },
1457            )
1458            .expect("legacy render succeeds");
1459        let configured = doc
1460            .render_page_with_config(
1461                0,
1462                &RenderConfig {
1463                    color_mode: ColorMode::Srgb,
1464                    dpi: 72,
1465                },
1466            )
1467            .expect("configured render succeeds");
1468
1469        assert_eq!(legacy.width, configured.width);
1470        assert_eq!(legacy.height, configured.height);
1471        assert_eq!(legacy.pixel_format, PixelFormat::Rgba8);
1472        assert_eq!(configured.pixel_format, PixelFormat::Rgba8);
1473        assert_eq!(legacy.pixels, configured.pixels);
1474    }
1475
1476    #[test]
1477    fn render_page_with_config_preserve_cmyk_returns_cmyk_buffer() {
1478        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1479        let rendered = doc
1480            .render_page_with_config(
1481                0,
1482                &RenderConfig {
1483                    color_mode: ColorMode::PreserveCmyk,
1484                    dpi: 72,
1485                },
1486            )
1487            .expect("cmyk render succeeds");
1488
1489        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1490        assert_eq!(
1491            rendered.pixels.len(),
1492            rendered.width as usize * rendered.height as usize * 4
1493        );
1494        assert_eq!(
1495            pixel_at(&rendered, rendered.width / 2, rendered.height / 2),
1496            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1497        );
1498    }
1499
1500    #[test]
1501    fn render_page_with_config_simulate_cmyk_does_not_panic_on_cmyk_pdf() {
1502        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1503        let rendered = doc
1504            .render_page_with_config(
1505                0,
1506                &RenderConfig {
1507                    color_mode: ColorMode::SimulateCmyk,
1508                    dpi: 72,
1509                },
1510            )
1511            .expect("simulate cmyk render succeeds");
1512
1513        assert_eq!(rendered.pixel_format, PixelFormat::Rgba8);
1514        assert!(!rendered.pixels.is_empty());
1515    }
1516
1517    #[test]
1518    fn render_page_with_config_preserve_cmyk_mixed_page_preserves_only_cmyk_region() {
1519        let doc = PdfDocument::open(mixed_rgb_cmyk_pdf_bytes()).expect("open mixed fixture");
1520        let rendered = doc
1521            .render_page_with_config(
1522                0,
1523                &RenderConfig {
1524                    color_mode: ColorMode::PreserveCmyk,
1525                    dpi: 72,
1526                },
1527            )
1528            .expect("mixed render succeeds");
1529
1530        assert_eq!(
1531            pixel_at(&rendered, 54, 36),
1532            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1533        );
1534        assert_ne!(
1535            pixel_at(&rendered, 18, 36),
1536            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1537        );
1538    }
1539
1540    #[test]
1541    fn render_page_with_config_preserve_cmyk_transparent_page_does_not_crash() {
1542        let doc =
1543            PdfDocument::open(transparent_cmyk_pdf_bytes()).expect("open transparent cmyk fixture");
1544        let rendered = doc
1545            .render_page_with_config(
1546                0,
1547                &RenderConfig {
1548                    color_mode: ColorMode::PreserveCmyk,
1549                    dpi: 72,
1550                },
1551            )
1552            .expect("transparent cmyk render succeeds");
1553
1554        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1555        assert_eq!(
1556            rendered.pixels.len(),
1557            rendered.width as usize * rendered.height as usize * 4
1558        );
1559    }
1560
1561    #[test]
1562    fn render_page_with_config_preserve_cmyk_keeps_device_cmyk_image_bytes() {
1563        let doc = PdfDocument::open(cmyk_image_pdf_bytes()).expect("open cmyk image fixture");
1564        let rendered = doc
1565            .render_page_with_config(
1566                0,
1567                &RenderConfig {
1568                    color_mode: ColorMode::PreserveCmyk,
1569                    dpi: 72,
1570                },
1571            )
1572            .expect("cmyk image render succeeds");
1573
1574        assert_eq!(rendered.width, 2);
1575        assert_eq!(rendered.height, 1);
1576        assert_eq!(pixel_at(&rendered, 0, 0), [255, 0, 0, 0]);
1577        assert_eq!(pixel_at(&rendered, 1, 0), [0, 255, 0, 0]);
1578    }
1579}