pdf_engine/
document.rs

1//! Unified document facade — multi-page rendering, text extraction,
2//! metadata, bookmarks, and thumbnails.
3
4use crate::error::{EngineError, Result};
5use crate::geometry::{self, PageGeometry};
6use crate::limits::{LimitError, ProcessingLimits};
7use std::sync::{Arc, Mutex};
8
/// Shared slot used by the limit-warning collector.
///
/// Stores `Some((observed_bytes, limit_bytes))` when a
/// `StreamTooLarge` warning fires during rendering or extraction.
/// The collector only fills the slot while it is still `None`, so the
/// values describe the first such warning; later ones are not recorded here.
type LimitSlot = Arc<Mutex<Option<(u64, u64)>>>;
14use crate::render::{self, ColorMode, RenderConfig, RenderOptions, RenderedPage};
15use crate::text::{TextBlock, TextExtractionDevice};
16use crate::thumbnail::ThumbnailOptions;
17
18use pdf_forms::parse::parse_acroform;
19use pdf_forms::tree::{FieldType, FieldValue};
20use pdf_render::pdf_interpret::PageExt;
21use pdf_render::pdf_interpret::{interpret_page, Context, InterpreterSettings, InterpreterWarning};
22use pdf_render::pdf_syntax::object::dict::keys::{FIRST, NEXT, OUTLINES, TITLE};
23use pdf_render::pdf_syntax::object::Dict;
24use pdf_render::pdf_syntax::page::Page;
25use pdf_render::pdf_syntax::{Pdf, PdfLoadLimits};
26#[cfg(feature = "parallel")]
27use rayon::prelude::*;
28
29use kurbo::Rect;
30
/// Document metadata extracted from the info dictionary.
///
/// Every field is optional: [`PdfDocument::info`] leaves a field as `None`
/// when the parsed metadata carries no value for it.
#[derive(Debug, Clone, Default)]
pub struct DocumentInfo {
    /// Document title.
    pub title: Option<String>,
    /// Author.
    pub author: Option<String>,
    /// Subject.
    pub subject: Option<String>,
    /// Keywords.
    pub keywords: Option<String>,
    /// Creator application.
    pub creator: Option<String>,
    /// Producer application.
    pub producer: Option<String>,
}
47
/// A bookmark / outline item.
///
/// Items form a tree: each entry owns its nested entries in `children`.
#[derive(Debug, Clone)]
pub struct BookmarkItem {
    /// Bookmark title.
    pub title: String,
    /// Target page index (0-based), if resolvable.
    pub page: Option<usize>,
    /// Nested child bookmarks.
    pub children: Vec<BookmarkItem>,
}
58
/// High-level PDF document handle.
///
/// Bundles the parsed PDF with the interpreter settings that the rendering
/// and text-extraction methods pass to the interpreter.
pub struct PdfDocument {
    // Parsed document; all page, metadata and outline lookups go through it.
    pdf: Pdf,
    // Interpreter configuration; replaceable via `set_settings`.
    settings: InterpreterSettings,
}
64
65impl PdfDocument {
66    /// Open a PDF from bytes.
67    pub fn open(data: impl Into<pdf_render::pdf_syntax::PdfData>) -> Result<Self> {
68        let pdf = Pdf::new(data).map_err(|e| match e {
69            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
70                EngineError::Encrypted(format!("{d:?}"))
71            }
72            _ => EngineError::InvalidPdf(format!("{e:?}")),
73        })?;
74        Ok(Self {
75            pdf,
76            settings: InterpreterSettings::default(),
77        })
78    }
79
80    /// Open a PDF from bytes with processing limits.
81    pub fn open_with_processing_limits(
82        data: impl Into<pdf_render::pdf_syntax::PdfData>,
83        limits: ProcessingLimits,
84    ) -> Result<Self> {
85        let syntax_limits = PdfLoadLimits::new()
86            .max_object_depth(limits.max_object_depth)
87            .max_image_pixels(limits.max_image_pixels)
88            .max_stream_bytes(limits.max_stream_bytes);
89        let pdf = Pdf::new_with_limits(data, syntax_limits).map_err(|e| match e {
90            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
91                EngineError::Encrypted(format!("{d:?}"))
92            }
93            _ => EngineError::InvalidPdf(format!("{e:?}")),
94        })?;
95        let settings = InterpreterSettings {
96            max_operator_count: Some(limits.max_operator_count),
97            ..InterpreterSettings::default()
98        };
99        Ok(Self { pdf, settings })
100    }
101
102    /// Open a password-protected PDF.
103    pub fn open_with_password(
104        data: impl Into<pdf_render::pdf_syntax::PdfData>,
105        password: &str,
106    ) -> Result<Self> {
107        let pdf = Pdf::new_with_password(data, password).map_err(|e| match e {
108            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
109                EngineError::Encrypted(format!("{d:?}"))
110            }
111            _ => EngineError::InvalidPdf(format!("{e:?}")),
112        })?;
113        Ok(Self {
114            pdf,
115            settings: InterpreterSettings::default(),
116        })
117    }
118
119    /// Open a password-protected PDF with processing limits.
120    pub fn open_with_password_and_processing_limits(
121        data: impl Into<pdf_render::pdf_syntax::PdfData>,
122        password: &str,
123        limits: ProcessingLimits,
124    ) -> Result<Self> {
125        let syntax_limits = PdfLoadLimits::new()
126            .max_object_depth(limits.max_object_depth)
127            .max_image_pixels(limits.max_image_pixels)
128            .max_stream_bytes(limits.max_stream_bytes);
129        let pdf = Pdf::new_with_password_and_limits(data, password, syntax_limits).map_err(
130            |e| match e {
131                pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
132                    EngineError::Encrypted(format!("{d:?}"))
133                }
134                _ => EngineError::InvalidPdf(format!("{e:?}")),
135            },
136        )?;
137        let settings = InterpreterSettings {
138            max_operator_count: Some(limits.max_operator_count),
139            ..InterpreterSettings::default()
140        };
141        Ok(Self { pdf, settings })
142    }
143
144    /// Access the underlying parsed PDF.
145    pub fn pdf(&self) -> &Pdf {
146        &self.pdf
147    }
148
149    /// Set interpreter settings (font resolver, cmap resolver, etc.).
150    pub fn set_settings(&mut self, settings: InterpreterSettings) {
151        self.settings = settings;
152    }
153
154    /// Number of pages.
155    pub fn page_count(&self) -> usize {
156        self.pdf.pages().len()
157    }
158
159    /// Get the geometry of a page.
160    pub fn page_geometry(&self, index: usize) -> Result<PageGeometry> {
161        let page = self.get_page(index)?;
162        Ok(geometry::extract_geometry(page))
163    }
164
165    /// Render a single page.
166    ///
167    /// If the document contains an XFA template, it is automatically flattened
168    /// to static PDF content before rendering.  This prevents the "Please wait"
169    /// placeholder page that Adobe Reader would show when rendering an XFA PDF
170    /// with a conventional renderer. If flattening fails, rendering falls back
171    /// to the original document as a best-effort path.
172    pub fn render_page(&self, index: usize, options: &RenderOptions) -> Result<RenderedPage> {
173        #[cfg(feature = "xfa")]
174        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
175            return flat_doc.render_page(index, options);
176        }
177        let page = self.get_page(index)?;
178        // Pre-flight: reject pathologically small or zero-dimension pages before
179        // allocating any pixel buffer. Non-positive dimensions cause panics or
180        // zero-sized allocations inside the rasteriser.
181        let (w, h) = page.render_dimensions();
182        if w <= 0.0 || h <= 0.0 {
183            return Err(EngineError::InvalidPageGeometry {
184                width: w,
185                height: h,
186                reason: "page has zero or negative dimensions".into(),
187            });
188        }
189        // Also reject pages so small they produce zero pixels even at the
190        // minimum meaningful DPI (1 DPI). Below ~0.72pt at 1 DPI = 0 pixels.
191        const MIN_PAGE_PT: f32 = 1.0;
192        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
193            return Err(EngineError::InvalidPageGeometry {
194                width: w,
195                height: h,
196                reason: "page too small to render (< 1pt)".into(),
197            });
198        }
199        let (settings, slot) = Self::with_limit_collector(&self.settings);
200        let rendered = render::render_page(page, options, &settings);
201        Self::check_limit_slot(&slot)?;
202        Ok(rendered)
203    }
204
205    /// Render a single page using the high-level render config.
206    ///
207    /// XFA documents are auto-flattened before rendering (same as `render_page`).
208    /// If flattening fails, rendering falls back to the original document.
209    pub fn render_page_with_config(
210        &self,
211        index: usize,
212        config: &RenderConfig,
213    ) -> Result<RenderedPage> {
214        #[cfg(feature = "xfa")]
215        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
216            return flat_doc.render_page_with_config(index, config);
217        }
218        let page = self.get_page(index)?;
219        let (w, h) = page.render_dimensions();
220        if w <= 0.0 || h <= 0.0 {
221            return Err(EngineError::InvalidPageGeometry {
222                width: w,
223                height: h,
224                reason: "page has zero or negative dimensions".into(),
225            });
226        }
227        const MIN_PAGE_PT: f32 = 1.0;
228        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
229            return Err(EngineError::InvalidPageGeometry {
230                width: w,
231                height: h,
232                reason: "page too small to render (< 1pt)".into(),
233            });
234        }
235        let (settings, slot) = Self::with_limit_collector(&self.settings);
236        let rendered = render::render_page_with_config(page, config, &settings);
237        Self::check_limit_slot(&slot)?;
238        Ok(rendered)
239    }
240
241    /// Render a single page to a CMYK buffer.
242    pub fn render_page_cmyk(&self, index: usize, dpi: u32) -> Result<RenderedPage> {
243        self.render_page_with_config(
244            index,
245            &RenderConfig {
246                color_mode: ColorMode::PreserveCmyk,
247                dpi,
248            },
249        )
250    }
251
252    /// Render all pages, in parallel when the `parallel` feature is enabled.
253    pub fn render_all(&self, options: &RenderOptions) -> Vec<RenderedPage> {
254        let pages = self.pdf.pages();
255        #[cfg(feature = "parallel")]
256        return (0..pages.len())
257            .into_par_iter()
258            .map(|i| render::render_page(&pages[i], options, &self.settings))
259            .collect();
260        #[cfg(not(feature = "parallel"))]
261        (0..pages.len())
262            .map(|i| render::render_page(&pages[i], options, &self.settings))
263            .collect()
264    }
265
266    /// Render all pages using the high-level render config.
267    pub fn render_all_with_config(&self, config: &RenderConfig) -> Vec<RenderedPage> {
268        let pages = self.pdf.pages();
269        #[cfg(feature = "parallel")]
270        return (0..pages.len())
271            .into_par_iter()
272            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
273            .collect();
274        #[cfg(not(feature = "parallel"))]
275        (0..pages.len())
276            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
277            .collect()
278    }
279
280    /// Generate a thumbnail for a single page.
281    pub fn thumbnail(&self, index: usize, options: &ThumbnailOptions) -> Result<RenderedPage> {
282        let page = self.get_page(index)?;
283        Ok(render::render_thumbnail(
284            page,
285            options.max_dimension,
286            &self.settings,
287        ))
288    }
289
290    /// Generate thumbnails for all pages, in parallel when the `parallel` feature is enabled.
291    pub fn thumbnails_all(&self, options: &ThumbnailOptions) -> Vec<RenderedPage> {
292        let pages = self.pdf.pages();
293        #[cfg(feature = "parallel")]
294        return (0..pages.len())
295            .into_par_iter()
296            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
297            .collect();
298        #[cfg(not(feature = "parallel"))]
299        (0..pages.len())
300            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
301            .collect()
302    }
303
304    /// Extract text from a page as a single string.
305    pub fn extract_text(&self, index: usize) -> Result<String> {
306        let page = self.get_page(index)?;
307        let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
308        let mut device = TextExtractionDevice::new();
309        let mut ctx = Self::create_context_with_settings(page, settings);
310        interpret_page(page, &mut ctx, &mut device);
311        Self::check_limit_slot(&slot)?;
312        Ok(device.into_text())
313    }
314
315    /// Extract text from a sequence of pages while reusing the same settings object.
316    #[doc(hidden)]
317    pub fn extract_text_pages_reusing_settings<I>(&self, indices: I) -> Result<Vec<String>>
318    where
319        I: IntoIterator<Item = usize>,
320    {
321        let pages = self.pdf.pages();
322        let mut settings = self.text_extraction_settings();
323        let indices = indices.into_iter();
324        let (lower_bound, upper_bound) = indices.size_hint();
325        let mut texts = Vec::with_capacity(upper_bound.unwrap_or(lower_bound));
326
327        for index in indices {
328            let page = pages.get(index).ok_or(EngineError::PageOutOfRange {
329                index,
330                count: pages.len(),
331            })?;
332            let (text, next_settings) = Self::extract_text_with_settings(page, settings);
333            settings = next_settings;
334            texts.push(text);
335        }
336
337        Ok(texts)
338    }
339
340    /// Extract structured text blocks from a page.
341    pub fn extract_text_blocks(&self, index: usize) -> Result<Vec<TextBlock>> {
342        let page = self.get_page(index)?;
343        let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
344        let mut device = TextExtractionDevice::new();
345        let mut ctx = Self::create_context_with_settings(page, settings);
346        interpret_page(page, &mut ctx, &mut device);
347        Self::check_limit_slot(&slot)?;
348        Ok(device.into_blocks())
349    }
350
351    /// Extract structured text blocks from all pages, reusing interpreter settings.
352    pub fn extract_all_text_blocks(&self) -> Vec<Vec<TextBlock>> {
353        let pages = self.pdf.pages();
354        let mut settings = self.text_extraction_settings();
355        let mut blocks = Vec::with_capacity(pages.len());
356
357        for page in pages.iter() {
358            let (page_blocks, next_settings) =
359                Self::extract_text_blocks_with_settings(page, settings);
360            settings = next_settings;
361            blocks.push(page_blocks);
362        }
363
364        blocks
365    }
366
367    /// Extract text values from AcroForm fields, including push-button captions.
368    ///
369    /// Returns a single string concatenating all non-empty field values separated
370    /// by newlines. Useful when the document stores its readable content in form
371    /// field values rather than (or in addition to) page content streams.
372    pub fn extract_acroform_text(&self) -> String {
373        let Some(tree) = parse_acroform(&self.pdf) else {
374            return String::new();
375        };
376        let mut parts: Vec<String> = Vec::new();
377        for id in tree.all_ids() {
378            let node = tree.get(id);
379            if node.children.is_empty() {
380                // Terminal (widget) — collect text-like values.
381                let value_str = match &node.value {
382                    Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.clone()),
383                    Some(FieldValue::StringArray(arr)) => {
384                        let joined = arr
385                            .iter()
386                            .filter(|s| !s.is_empty())
387                            .cloned()
388                            .collect::<Vec<_>>()
389                            .join(", ");
390                        if joined.is_empty() {
391                            None
392                        } else {
393                            Some(joined)
394                        }
395                    }
396                    _ => None,
397                };
398                let button_caption =
399                    value_str.is_none() && tree.effective_field_type(id) == Some(FieldType::Button);
400                let extracted = value_str.or_else(|| {
401                    button_caption.then(|| {
402                        node.mk
403                            .as_ref()
404                            .and_then(|mk| mk.caption.as_ref())
405                            .filter(|caption| !caption.is_empty())
406                            .cloned()
407                    })?
408                });
409                if let Some(s) = extracted {
410                    parts.push(s);
411                }
412            }
413        }
414        parts.join("\n")
415    }
416
417    /// Extract all text from the document: page content streams plus AcroForm
418    /// field values.  Mirrors pdftotext behaviour.
419    ///
420    /// When the `xfa` feature is enabled and the document is an XFA form,
421    /// the raw page content stream typically contains only an Adobe-Reader
422    /// placeholder (`"The document you are trying to load requires Adobe
423    /// Reader 8 or higher…"`). For those cases we transparently re-extract
424    /// from the flattened representation so callers get the rendered form
425    /// content. The raw path is preferred whenever it produces non-trivially
426    /// more text than the flattened path, which preserves existing behaviour
427    /// for non-XFA PDFs and for XFA PDFs that already carry their content
428    /// as ordinary text operators.
429    pub fn extract_all_text(&self) -> String {
430        let raw = self.extract_all_text_raw();
431
432        #[cfg(feature = "xfa")]
433        {
434            if let Some(flat_text) = self.extract_all_text_via_xfa_flatten() {
435                if Self::should_prefer_flat_extract(&raw, &flat_text) {
436                    return flat_text;
437                }
438            }
439        }
440
441        raw
442    }
443
444    /// Inner extract that does NOT consult the XFA flatten path. Exposed
445    /// (`#[doc(hidden)]`) so internal code that explicitly wants the raw
446    /// path can opt out of the auto-routing.
447    #[doc(hidden)]
448    pub fn extract_all_text_raw(&self) -> String {
449        let pages = self.pdf.pages();
450        let mut settings = self.text_extraction_settings();
451        let mut page_texts = Vec::with_capacity(pages.len());
452        for page in pages.iter() {
453            let (page_text, next_settings) = Self::extract_text_with_settings(page, settings);
454            settings = next_settings;
455            page_texts.push(page_text);
456        }
457
458        let mut text = join_page_texts(page_texts.iter().map(String::as_str));
459        let acroform = self.extract_acroform_text();
460        if !acroform.is_empty() {
461            if !text.is_empty() && !text.ends_with('\n') {
462                text.push('\n');
463            }
464            text.push_str(&acroform);
465        }
466        text
467    }
468
469    /// Try to extract via the flattened XFA representation. Returns `None`
470    /// when the document is not an XFA form, the flatten step fails, or
471    /// the flattened doc cannot be reopened.
472    #[cfg(feature = "xfa")]
473    fn extract_all_text_via_xfa_flatten(&self) -> Option<String> {
474        let flat_doc = self.open_flattened_xfa_for_render()?;
475        // Recurse into the raw path on the flattened doc — never call
476        // `extract_all_text` here, otherwise we'd recurse forever if the
477        // flatten step somehow preserves the XFA marker.
478        Some(flat_doc.extract_all_text_raw())
479    }
480
481    /// Decide whether the flattened-XFA text path is preferable to the raw
482    /// path.
483    ///
484    /// The trigger is **strict**: only use flat text when the raw text
485    /// matches one of the well-known Adobe Reader / LiveCycle "viewer
486    /// required" placeholders. For any other XFA doc — including ones
487    /// whose raw text is empty — raw wins, preserving existing extraction
488    /// behaviour.
489    ///
490    /// Empirical history (2026-05-09, 281-doc text corpus):
491    ///   - looser triggers (`flat_len > 2× raw_len`) cost -0.15 char_f1
492    ///     because non-placeholder XFA docs re-extracted via flatten
493    ///     produced subtly different text scoring worse vs the oracle.
494    ///   - including "raw is empty → use flat" cost -0.13 char_f1 because
495    ///     several XFA docs have an empty-rendered oracle (form-feed only)
496    ///     and flat extracts calculation values that, while technically
497    ///     present, do not match the empty oracle.
498    /// Restricting to exact placeholder markers fixes the 35 MISSING_TEXT
499    /// docs without touching any other XFA doc.
500    #[cfg(feature = "xfa")]
501    fn should_prefer_flat_extract(raw: &str, flat: &str) -> bool {
502        if flat.is_empty() {
503            return false;
504        }
505        // Adobe Reader / LiveCycle / Designer placeholder phrasings.
506        // Each is shipped verbatim as the raw content stream when the
507        // PDF expects an XFA-aware viewer to render the form. Multiple
508        // phrasings appear across Adobe versions and locales:
509        const ADOBE_PLACEHOLDER_MARKERS: [&str; 4] = [
510            "requires Adobe Reader",
511            "Please wait...",
512            "To view the full contents of this document",
513            "form is not supported with the current version of Acrobat",
514        ];
515        ADOBE_PLACEHOLDER_MARKERS
516            .iter()
517            .any(|marker| raw.contains(marker))
518    }
519
520    /// Simple text search: returns page indices containing the query string.
521    pub fn search_text(&self, query: &str) -> Vec<usize> {
522        let pages = self.pdf.pages();
523        let query_lower = query.to_lowercase();
524        #[cfg(feature = "parallel")]
525        let page_contains = |i: usize| -> Option<usize> {
526            let page = &pages[i];
527            let (text, _) = Self::extract_text_with_settings(page, self.text_extraction_settings());
528            if text.to_lowercase().contains(&query_lower) {
529                Some(i)
530            } else {
531                None
532            }
533        };
534        #[cfg(feature = "parallel")]
535        return (0..pages.len())
536            .into_par_iter()
537            .filter_map(page_contains)
538            .collect();
539        #[cfg(not(feature = "parallel"))]
540        {
541            let mut settings = self.text_extraction_settings();
542            let mut hits = Vec::new();
543            for (i, page) in pages.iter().enumerate() {
544                let (text, next_settings) = Self::extract_text_with_settings(page, settings);
545                settings = next_settings;
546                if text.to_lowercase().contains(&query_lower) {
547                    hits.push(i);
548                }
549            }
550            hits
551        }
552    }
553
554    /// Extract document metadata.
555    pub fn info(&self) -> DocumentInfo {
556        let meta = self.pdf.metadata();
557        DocumentInfo {
558            title: meta.title.as_ref().map(|b| bytes_to_string(b)),
559            author: meta.author.as_ref().map(|b| bytes_to_string(b)),
560            subject: meta.subject.as_ref().map(|b| bytes_to_string(b)),
561            keywords: meta.keywords.as_ref().map(|b| bytes_to_string(b)),
562            creator: meta.creator.as_ref().map(|b| bytes_to_string(b)),
563            producer: meta.producer.as_ref().map(|b| bytes_to_string(b)),
564        }
565    }
566
567    /// Extract document outline / bookmarks.
568    pub fn bookmarks(&self) -> Vec<BookmarkItem> {
569        let xref = self.pdf.xref();
570        let root_id = xref.root_id();
571        let catalog: Dict<'_> = match xref.get(root_id) {
572            Some(d) => d,
573            None => return Vec::new(),
574        };
575
576        let outlines: Dict<'_> = match catalog.get(OUTLINES) {
577            Some(d) => d,
578            None => return Vec::new(),
579        };
580
581        let first: Dict<'_> = match outlines.get(FIRST) {
582            Some(d) => d,
583            None => return Vec::new(),
584        };
585
586        parse_outline_items(&first)
587    }
588
589    /// Run OCR on a page and return the recognized text and word positions.
590    ///
591    /// The page is rendered at `dpi` (default 150) before recognition.
592    /// Pass any [`OcrBackend`] implementation; use [`OcrsBackend::try_default`]
593    /// to load the pure-Rust `ocrs` engine from the standard model paths.
594    ///
595    /// # Example
596    ///
597    /// ```no_run
598    /// # #[cfg(feature = "ocr")] {
599    /// use pdf_engine::{PdfDocument, OcrsBackend, RenderOptions};
600    ///
601    /// let doc = PdfDocument::open(std::fs::read("scan.pdf").unwrap()).unwrap();
602    /// let backend = OcrsBackend::try_default().unwrap();
603    /// let result = doc.ocr_page(0, &backend, 150.0_f64).unwrap();
604    /// println!("{}", result.text);
605    /// # }
606    /// ```
607    pub fn ocr_page(
608        &self,
609        index: usize,
610        backend: &dyn crate::ocr::OcrBackend,
611        dpi: f64,
612    ) -> crate::error::Result<crate::ocr::OcrResult> {
613        let opts = crate::render::RenderOptions {
614            dpi,
615            ..Default::default()
616        };
617        let rendered = self.render_page(index, &opts)?;
618
619        // Convert RGBA → RGB (ocrs expects RGB input).
620        let mut rgb = Vec::with_capacity((rendered.width * rendered.height * 3) as usize);
621        for chunk in rendered.pixels.chunks(4) {
622            rgb.push(chunk[0]);
623            rgb.push(chunk[1]);
624            rgb.push(chunk[2]);
625        }
626
627        backend
628            .recognize(&rgb, rendered.width, rendered.height)
629            .map_err(|e| crate::error::EngineError::RenderError(e.to_string()))
630    }
631
632    /// Wrap `settings` with a warning sink that captures the first
633    /// `InterpreterWarning::StreamTooLarge` into a shared slot.
634    ///
635    /// The returned slot is checked by [`Self::check_limit_slot`] after
636    /// the operation completes. Any previously installed sink is still
637    /// called so no warnings are silently dropped.
638    fn with_limit_collector(settings: &InterpreterSettings) -> (InterpreterSettings, LimitSlot) {
639        let slot: LimitSlot = Arc::new(Mutex::new(None));
640        let slot_clone = Arc::clone(&slot);
641        let prev_sink = settings.warning_sink.clone();
642        let mut new_settings = settings.clone();
643        new_settings.warning_sink = Arc::new(move |w: InterpreterWarning| {
644            if let InterpreterWarning::StreamTooLarge { observed, limit } = w {
645                let mut guard = slot_clone.lock().unwrap_or_else(|e| e.into_inner());
646                if guard.is_none() {
647                    *guard = Some((observed, limit));
648                }
649            }
650            prev_sink(w);
651        });
652        (new_settings, slot)
653    }
654
655    /// Check the slot populated by [`Self::with_limit_collector`].
656    ///
657    /// Returns `Err(EngineError::LimitExceeded(...))` if a
658    /// `StreamTooLarge` warning was captured, `Ok(())` otherwise.
659    fn check_limit_slot(slot: &LimitSlot) -> Result<()> {
660        if let Some((observed, limit)) = *slot.lock().unwrap_or_else(|e| e.into_inner()) {
661            return Err(EngineError::LimitExceeded(LimitError::StreamTooLarge {
662                actual_bytes: observed,
663                limit_bytes: limit,
664            }));
665        }
666        Ok(())
667    }
668
669    fn get_page(&self, index: usize) -> Result<&Page<'_>> {
670        let pages = self.pdf.pages();
671        if index >= pages.len() {
672            return Err(EngineError::PageOutOfRange {
673                index,
674                count: pages.len(),
675            });
676        }
677        Ok(&pages[index])
678    }
679
680    fn text_extraction_settings(&self) -> InterpreterSettings {
681        let mut settings = self.settings.clone();
682        // Text extraction should include signature widget appearance streams
683        // that rendering skips to match MuPDF visual output.
684        settings.skip_signature_widgets = false;
685        settings
686    }
687
688    fn create_context_with_settings<'a>(
689        page: &Page<'a>,
690        settings: InterpreterSettings,
691    ) -> Context<'a> {
692        let (w, h) = page.render_dimensions();
693        Context::new(
694            page.initial_transform(false),
695            Rect::new(0.0, 0.0, w as f64, h as f64),
696            page.xref(),
697            settings,
698        )
699    }
700
701    fn extract_text_with_settings<'a>(
702        page: &Page<'a>,
703        settings: InterpreterSettings,
704    ) -> (String, InterpreterSettings) {
705        let mut device = TextExtractionDevice::new();
706        let mut ctx = Self::create_context_with_settings(page, settings);
707        interpret_page(page, &mut ctx, &mut device);
708        let settings = ctx.into_settings();
709        (device.into_text(), settings)
710    }
711
712    fn extract_text_blocks_with_settings<'a>(
713        page: &Page<'a>,
714        settings: InterpreterSettings,
715    ) -> (Vec<TextBlock>, InterpreterSettings) {
716        let mut device = TextExtractionDevice::new();
717        let mut ctx = Self::create_context_with_settings(page, settings);
718        interpret_page(page, &mut ctx, &mut device);
719        let settings = ctx.into_settings();
720        (device.into_blocks(), settings)
721    }
722
723    #[cfg(feature = "xfa")]
724    fn open_flattened_xfa_for_render(&self) -> Option<Self> {
725        if !crate::xfa::has_xfa(self) {
726            return None;
727        }
728
729        let flat_bytes = crate::xfa::flatten(self).ok()?;
730        let mut flat_doc = Self::open(flat_bytes).ok()?;
731        flat_doc.settings = self.settings.clone();
732        Some(flat_doc)
733    }
734}
735
/// Concatenate per-page texts, separating pages with a form feed (U+000C).
///
/// Before each form feed the accumulated text is padded with newlines until
/// it ends in a blank line (`"\n\n"`). The padding is skipped only while the
/// accumulated text is still empty, so leading blank pages contribute a bare
/// form feed. Once anything has been written, even just a form feed from an
/// earlier blank page, the padding is applied.
fn join_page_texts<I>(page_texts: I) -> String
where
    I: IntoIterator,
    I::Item: AsRef<str>,
{
    let mut joined = String::new();

    for (page_number, page_text) in page_texts.into_iter().enumerate() {
        if page_number > 0 {
            if !joined.is_empty() {
                while !joined.ends_with("\n\n") {
                    joined.push('\n');
                }
            }
            joined.push('\u{000C}');
        }
        joined.push_str(page_text.as_ref());
    }

    joined
}
757
#[cfg(test)]
mod extract_all_text_tests {
    use super::join_page_texts;

    #[test]
    fn separates_nonempty_pages_like_pdftotext() {
        // Two pages with text: blank line, then form feed, between them.
        let joined = join_page_texts(["Page 1", "Page 2"]);
        assert_eq!(joined, "Page 1\n\n\u{000C}Page 2");
    }

    #[test]
    fn preserves_leading_blank_pages_without_extra_newlines() {
        // An empty first page contributes nothing but the separator.
        let joined = join_page_texts(["", "Page 2"]);
        assert_eq!(joined, "\u{000C}Page 2");
    }

    #[test]
    fn reuses_existing_blank_line_before_form_feed() {
        // The first page already ends in "\n\n", so no padding is added.
        let joined = join_page_texts(["Page 1\n\n", "Page 2"]);
        assert_eq!(joined, "Page 1\n\n\u{000C}Page 2");
    }

    /// Pins the trigger that auto-routes XFA documents through their
    /// flattened representation (`should_prefer_flat_extract`).
    ///
    /// The rule is deliberately strict: the raw text must literally contain
    /// one of the well-known Adobe Reader placeholder phrasings. This strict
    /// version was introduced 2026-05-09 after two looser attempts regressed
    /// the 281-doc corpus:
    /// - the 2× / 200-char rule cost -0.15 char_f1_mean;
    /// - an empty-raw fallback cost another -0.13 by routing 51
    ///   form-feed-only XFA docs whose oracle is empty too.
    #[cfg(feature = "xfa")]
    #[test]
    fn flat_extract_preferred_for_adobe_placeholder_only() {
        use crate::PdfDocument;

        let prefers_flat =
            |raw: &str, flat: &str| PdfDocument::should_prefer_flat_extract(raw, flat);
        let rendered = "rendered XFA content";

        // Empty raw → keep raw (the oracle may also be empty for XFA forms
        // whose pdfRest output is just a form-feed).
        assert!(!prefers_flat("", "Some flat text"));

        // Adobe Reader placeholder → use flat.
        let adobe_reader_notice = "The document you are trying to load \
            requires Adobe Reader 8 or higher.";
        assert!(prefers_flat(adobe_reader_notice, rendered));

        // "Please wait..." variant → use flat.
        let please_wait_notice = "Please wait... If this message is not \
            eventually replaced...";
        assert!(prefers_flat(please_wait_notice, rendered));

        // "To view the full contents" variant → use flat.
        let full_contents_notice = "To view the full contents of this document, \
            you need a later version of the PDF viewer.";
        assert!(prefers_flat(full_contents_notice, rendered));

        // "Warning: This form is not supported" variant → use flat.
        let unsupported_notice = "Warning: This form is not supported with the \
            current version of Acrobat or Adobe Reader.";
        assert!(prefers_flat(unsupported_notice, rendered));

        // Genuine XFA-doc raw text that happens to be short → keep raw.
        assert!(!prefers_flat(
            "Real form: Name: ___",
            "rendered version of the same form"
        ));

        // Genuine XFA-doc raw text that is long → keep raw.
        let lengthy_raw = "X".repeat(2000);
        let lengthy_flat = "Y".repeat(20000);
        assert!(!prefers_flat(lengthy_raw.as_str(), lengthy_flat.as_str()));

        // Empty flat → never use flat, even for a placeholder.
        assert!(!prefers_flat(adobe_reader_notice, ""));
    }
}
847
848/// Walk the outline linked list (FIRST → NEXT chain).
849fn parse_outline_items(item_dict: &Dict<'_>) -> Vec<BookmarkItem> {
850    let mut items = Vec::new();
851    let mut current: Option<Dict<'_>> = Some(item_dict.clone());
852
853    while let Some(dict) = current {
854        let title = dict
855            .get::<pdf_render::pdf_syntax::object::String>(TITLE)
856            .map(|s| bytes_to_string(s.as_bytes()))
857            .unwrap_or_default();
858
859        let children = match dict.get::<Dict<'_>>(FIRST) {
860            Some(child_dict) => parse_outline_items(&child_dict),
861            None => Vec::new(),
862        };
863
864        items.push(BookmarkItem {
865            title,
866            page: None, // Destination resolution requires named-dest lookup — left for follow-up
867            children,
868        });
869
870        current = dict.get::<Dict<'_>>(NEXT);
871    }
872
873    items
874}
875
/// Convert PDF string bytes to a Rust `String`.
///
/// Decoding is chosen from the leading byte-order mark:
/// - `FE FF` — UTF-16 big-endian;
/// - `FF FE` — UTF-16 little-endian (tolerated for non-conforming producers);
/// - `EF BB BF` — UTF-8, with the mark stripped from the result.
///
/// Without a mark the bytes are tried as UTF-8 and, failing that, mapped
/// byte-for-byte as Latin-1. Unpaired UTF-16 surrogates become U+FFFD and a
/// dangling odd trailing byte is ignored. A UTF-8 mark followed by invalid
/// UTF-8 falls through to the unmarked handling of the whole input.
fn bytes_to_string(bytes: &[u8]) -> String {
    const UTF16_BE_BOM: [u8; 2] = [0xFE, 0xFF];
    const UTF16_LE_BOM: [u8; 2] = [0xFF, 0xFE];
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    /// Decode `payload` as UTF-16, building code units with `to_unit`.
    fn decode_utf16(payload: &[u8], to_unit: fn([u8; 2]) -> u16) -> String {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| to_unit([pair[0], pair[1]]))
            .collect();
        String::from_utf16_lossy(&units)
    }

    if let Some(payload) = bytes.strip_prefix(&UTF16_BE_BOM[..]) {
        return decode_utf16(payload, u16::from_be_bytes);
    }
    if let Some(payload) = bytes.strip_prefix(&UTF16_LE_BOM[..]) {
        return decode_utf16(payload, u16::from_le_bytes);
    }
    if let Some(payload) = bytes.strip_prefix(&UTF8_BOM[..]) {
        // Only trust the mark when the payload really is UTF-8.
        if let Ok(s) = std::str::from_utf8(payload) {
            return s.to_owned();
        }
    }

    // Try UTF-8, fall back to Latin-1.
    match std::str::from_utf8(bytes) {
        Ok(s) => s.to_owned(),
        Err(_) => bytes.iter().map(|&b| char::from(b)).collect(),
    }
}
899
900#[cfg(test)]
901mod tests {
902    use super::*;
903    use crate::render::{ColorMode, PixelFormat, RenderConfig, RenderOptions};
904    use lopdf::{Document as LoDocument, Object};
905    use std::path::PathBuf;
906
907    fn corpus_path(name: &str) -> PathBuf {
908        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
909            .join("../../corpus")
910            .join(name)
911    }
912
913    fn normalize_text(text: &str) -> String {
914        text.split_whitespace().collect::<Vec<_>>().join(" ")
915    }
916
917    fn strip_type0_tounicode(data: &[u8]) -> (Vec<u8>, usize) {
918        fn get_name(dict: &lopdf::Dictionary, key: &[u8]) -> Option<Vec<u8>> {
919            match dict.get(key).ok()? {
920                Object::Name(name) => Some(name.clone()),
921                _ => None,
922            }
923        }
924
925        fn descendant_is_cidfont_type2(doc: &LoDocument, type0: &lopdf::Dictionary) -> bool {
926            let Some(Object::Array(descendants)) = type0.get(b"DescendantFonts").ok() else {
927                return false;
928            };
929            let Some(Object::Reference(desc_id)) = descendants.first() else {
930                return false;
931            };
932            let Ok(Object::Dictionary(descendant)) = doc.get_object(*desc_id) else {
933                return false;
934            };
935            matches!(
936                descendant.get(b"Subtype").ok(),
937                Some(Object::Name(name)) if name.as_slice() == b"CIDFontType2"
938            )
939        }
940
941        let mut doc = LoDocument::load_mem(data).expect("load stripped-to-unicode fixture");
942        let ids: Vec<_> = doc.objects.keys().copied().collect();
943        let mut removed = 0usize;
944
945        for id in ids {
946            let Some(Object::Dictionary(dict)) = doc.objects.get(&id) else {
947                continue;
948            };
949            if !matches!(
950                dict.get(b"Subtype").ok(),
951                Some(Object::Name(name)) if name.as_slice() == b"Type0"
952            ) {
953                continue;
954            }
955            if !matches!(
956                get_name(dict, b"Encoding").as_deref(),
957                Some(b"Identity-H") | Some(b"Identity-V")
958            ) {
959                continue;
960            }
961            if !descendant_is_cidfont_type2(&doc, dict) {
962                continue;
963            }
964
965            if let Some(Object::Dictionary(type0)) = doc.objects.get_mut(&id) {
966                if type0.has(b"ToUnicode") {
967                    type0.remove(b"ToUnicode");
968                    removed += 1;
969                }
970            }
971        }
972
973        let mut out = Vec::new();
974        doc.save_to(&mut out)
975            .expect("save stripped-to-unicode fixture");
976        (out, removed)
977    }
978
979    fn solid_fill_pdf_bytes(color_operator: &str) -> Vec<u8> {
980        use lopdf::{dictionary, Document, Object, Stream};
981
982        let mut doc = Document::with_version("1.4");
983
984        let pages_id = doc.new_object_id();
985        let page_id = doc.new_object_id();
986        let content = format!("{color_operator}\n0 0 72 72 re f\n");
987        let content_id = doc.add_object(Stream::new(dictionary! {}, content.into_bytes()));
988
989        doc.objects.insert(
990            page_id,
991            Object::Dictionary(dictionary! {
992                "Type" => Object::Name(b"Page".to_vec()),
993                "Parent" => Object::Reference(pages_id),
994                "MediaBox" => Object::Array(vec![
995                    Object::Integer(0),
996                    Object::Integer(0),
997                    Object::Integer(72),
998                    Object::Integer(72),
999                ]),
1000                "Contents" => Object::Reference(content_id),
1001            }),
1002        );
1003
1004        doc.objects.insert(
1005            pages_id,
1006            Object::Dictionary(dictionary! {
1007                "Type" => Object::Name(b"Pages".to_vec()),
1008                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1009                "Count" => Object::Integer(1),
1010            }),
1011        );
1012
1013        let catalog_id = doc.new_object_id();
1014        doc.objects.insert(
1015            catalog_id,
1016            Object::Dictionary(dictionary! {
1017                "Type" => Object::Name(b"Catalog".to_vec()),
1018                "Pages" => Object::Reference(pages_id),
1019            }),
1020        );
1021
1022        doc.trailer.set("Root", Object::Reference(catalog_id));
1023
1024        let mut bytes = Vec::new();
1025        doc.save_to(&mut bytes).expect("save solid fill fixture");
1026        bytes
1027    }
1028
1029    fn mixed_rgb_cmyk_pdf_bytes() -> Vec<u8> {
1030        use lopdf::{dictionary, Document, Object, Stream};
1031
1032        let mut doc = Document::with_version("1.4");
1033        let pages_id = doc.new_object_id();
1034        let page_id = doc.new_object_id();
1035        let content = b"1 0 0 rg\n0 0 36 72 re f\n1 0 0 0 k\n36 0 36 72 re f\n".to_vec();
1036        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1037
1038        doc.objects.insert(
1039            page_id,
1040            Object::Dictionary(dictionary! {
1041                "Type" => "Page",
1042                "Parent" => Object::Reference(pages_id),
1043                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
1044                "Contents" => Object::Reference(content_id),
1045            }),
1046        );
1047        doc.objects.insert(
1048            pages_id,
1049            Object::Dictionary(dictionary! {
1050                "Type" => "Pages",
1051                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1052                "Count" => Object::Integer(1),
1053            }),
1054        );
1055        let catalog_id = doc.new_object_id();
1056        doc.objects.insert(
1057            catalog_id,
1058            Object::Dictionary(dictionary! {
1059                "Type" => "Catalog",
1060                "Pages" => Object::Reference(pages_id),
1061            }),
1062        );
1063        doc.trailer.set("Root", Object::Reference(catalog_id));
1064
1065        let mut bytes = Vec::new();
1066        doc.save_to(&mut bytes)
1067            .expect("save mixed rgb/cmyk fixture");
1068        bytes
1069    }
1070
1071    fn transparent_cmyk_pdf_bytes() -> Vec<u8> {
1072        use lopdf::{dictionary, Document, Object, Stream};
1073
1074        let mut doc = Document::with_version("1.4");
1075        let pages_id = doc.new_object_id();
1076        let page_id = doc.new_object_id();
1077        let gs_id = doc.add_object(Object::Dictionary(dictionary! {
1078            "Type" => "ExtGState",
1079            "ca" => Object::Real(0.5),
1080        }));
1081        let content = b"/GS1 gs\n1 0 0 0 k\n0 0 72 72 re f\n".to_vec();
1082        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1083
1084        doc.objects.insert(
1085            page_id,
1086            Object::Dictionary(dictionary! {
1087                "Type" => "Page",
1088                "Parent" => Object::Reference(pages_id),
1089                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
1090                "Resources" => dictionary! {
1091                    "ExtGState" => dictionary! {
1092                        "GS1" => Object::Reference(gs_id),
1093                    },
1094                },
1095                "Contents" => Object::Reference(content_id),
1096            }),
1097        );
1098        doc.objects.insert(
1099            pages_id,
1100            Object::Dictionary(dictionary! {
1101                "Type" => "Pages",
1102                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1103                "Count" => Object::Integer(1),
1104            }),
1105        );
1106        let catalog_id = doc.new_object_id();
1107        doc.objects.insert(
1108            catalog_id,
1109            Object::Dictionary(dictionary! {
1110                "Type" => "Catalog",
1111                "Pages" => Object::Reference(pages_id),
1112            }),
1113        );
1114        doc.trailer.set("Root", Object::Reference(catalog_id));
1115
1116        let mut bytes = Vec::new();
1117        doc.save_to(&mut bytes)
1118            .expect("save transparent cmyk fixture");
1119        bytes
1120    }
1121
1122    fn cmyk_image_pdf_bytes() -> Vec<u8> {
1123        use lopdf::{dictionary, Document, Object, Stream};
1124
1125        let mut doc = Document::with_version("1.4");
1126        let pages_id = doc.new_object_id();
1127        let page_id = doc.new_object_id();
1128        let image_id = doc.add_object(Stream::new(
1129            dictionary! {
1130                "Type" => "XObject",
1131                "Subtype" => "Image",
1132                "Width" => Object::Integer(2),
1133                "Height" => Object::Integer(1),
1134                "BitsPerComponent" => Object::Integer(8),
1135                "ColorSpace" => "DeviceCMYK",
1136            },
1137            vec![255, 0, 0, 0, 0, 255, 0, 0],
1138        ));
1139        let content = b"q\n2 0 0 1 0 0 cm\n/Im1 Do\nQ\n".to_vec();
1140        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1141
1142        doc.objects.insert(
1143            page_id,
1144            Object::Dictionary(dictionary! {
1145                "Type" => "Page",
1146                "Parent" => Object::Reference(pages_id),
1147                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 2.into(), 1.into()]),
1148                "Resources" => dictionary! {
1149                    "XObject" => dictionary! {
1150                        "Im1" => Object::Reference(image_id),
1151                    },
1152                },
1153                "Contents" => Object::Reference(content_id),
1154            }),
1155        );
1156        doc.objects.insert(
1157            pages_id,
1158            Object::Dictionary(dictionary! {
1159                "Type" => "Pages",
1160                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1161                "Count" => Object::Integer(1),
1162            }),
1163        );
1164        let catalog_id = doc.new_object_id();
1165        doc.objects.insert(
1166            catalog_id,
1167            Object::Dictionary(dictionary! {
1168                "Type" => "Catalog",
1169                "Pages" => Object::Reference(pages_id),
1170            }),
1171        );
1172        doc.trailer.set("Root", Object::Reference(catalog_id));
1173
1174        let mut bytes = Vec::new();
1175        doc.save_to(&mut bytes).expect("save cmyk image fixture");
1176        bytes
1177    }
1178
1179    fn pixel_at(rendered: &RenderedPage, x: u32, y: u32) -> [u8; 4] {
1180        let idx = ((y * rendered.width + x) * 4) as usize;
1181        [
1182            rendered.pixels[idx],
1183            rendered.pixels[idx + 1],
1184            rendered.pixels[idx + 2],
1185            rendered.pixels[idx + 3],
1186        ]
1187    }
1188
1189    /// Build a minimal one-page PDF whose only font is a non-embedded TrueType
1190    /// reference (no `FontFile2`). The character codes in the content stream
1191    /// resolve through the declared `/Encoding`, exercising the same code path
1192    /// as corpus PDFs like `171_171940.pdf`.
1193    fn non_embedded_truetype_pdf_bytes(
1194        base_font: &[u8],
1195        encoding: &[u8],
1196        text_bytes: &[u8],
1197    ) -> Vec<u8> {
1198        use lopdf::{dictionary, Document, Object, Stream};
1199
1200        let mut doc = Document::with_version("1.4");
1201
1202        let font_id = doc.add_object(Object::Dictionary(dictionary! {
1203            "Type" => "Font",
1204            "Subtype" => "TrueType",
1205            "Name" => Object::Name(b"F0".to_vec()),
1206            "BaseFont" => Object::Name(base_font.to_vec()),
1207            "Encoding" => Object::Name(encoding.to_vec()),
1208        }));
1209
1210        let resources_id = doc.add_object(Object::Dictionary(dictionary! {
1211            "Font" => dictionary! { "F0" => Object::Reference(font_id) },
1212        }));
1213
1214        let mut content = Vec::new();
1215        content.extend_from_slice(b"BT\n/F0 12 Tf\n100 700 Td\n(");
1216        for &b in text_bytes {
1217            match b {
1218                b'(' | b')' | b'\\' => {
1219                    content.push(b'\\');
1220                    content.push(b);
1221                }
1222                _ => content.push(b),
1223            }
1224        }
1225        content.extend_from_slice(b") Tj\nET\n");
1226        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1227
1228        let pages_id = doc.new_object_id();
1229        let page_id = doc.add_object(Object::Dictionary(dictionary! {
1230            "Type" => "Page",
1231            "Parent" => Object::Reference(pages_id),
1232            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1233            "Resources" => Object::Reference(resources_id),
1234            "Contents" => Object::Reference(content_id),
1235        }));
1236        doc.objects.insert(
1237            pages_id,
1238            Object::Dictionary(dictionary! {
1239                "Type" => "Pages",
1240                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1241                "Count" => Object::Integer(1),
1242            }),
1243        );
1244        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
1245            "Type" => "Catalog",
1246            "Pages" => Object::Reference(pages_id),
1247        }));
1248        doc.trailer.set("Root", Object::Reference(catalog_id));
1249
1250        let mut bytes = Vec::new();
1251        doc.save_to(&mut bytes).expect("save non-embedded fixture");
1252        bytes
1253    }
1254
1255    /// Build a minimal AcroForm push button whose only human-readable text
1256    /// lives in the widget `/MK /CA` caption entry.
1257    fn push_button_caption_pdf_bytes(caption: &[u8]) -> Vec<u8> {
1258        use lopdf::{dictionary, Document, Object, Stream, StringFormat};
1259
1260        let mut doc = Document::with_version("1.4");
1261
1262        let catalog_id = doc.new_object_id();
1263        let pages_id = doc.new_object_id();
1264        let page_id = doc.new_object_id();
1265        let acroform_id = doc.new_object_id();
1266        let content_id = doc.new_object_id();
1267        let widget_id = doc.new_object_id();
1268
1269        doc.objects.insert(
1270            content_id,
1271            Object::Stream(Stream::new(dictionary! {}, Vec::new())),
1272        );
1273        doc.objects.insert(
1274            widget_id,
1275            Object::Dictionary(dictionary! {
1276                "Type" => "Annot",
1277                "Subtype" => "Widget",
1278                "FT" => "Btn",
1279                "Ff" => Object::Integer(1 << 16),
1280                "T" => Object::String(b"button".to_vec(), StringFormat::Literal),
1281                "MK" => dictionary! {
1282                    "CA" => Object::String(caption.to_vec(), StringFormat::Literal),
1283                },
1284                "Rect" => Object::Array(vec![100.into(), 700.into(), 260.into(), 730.into()]),
1285                "P" => Object::Reference(page_id),
1286            }),
1287        );
1288        doc.objects.insert(
1289            page_id,
1290            Object::Dictionary(dictionary! {
1291                "Type" => "Page",
1292                "Parent" => Object::Reference(pages_id),
1293                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1294                "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
1295                "Contents" => Object::Reference(content_id),
1296            }),
1297        );
1298        doc.objects.insert(
1299            pages_id,
1300            Object::Dictionary(dictionary! {
1301                "Type" => "Pages",
1302                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1303                "Count" => Object::Integer(1),
1304            }),
1305        );
1306        doc.objects.insert(
1307            acroform_id,
1308            Object::Dictionary(dictionary! {
1309                "Fields" => Object::Array(vec![Object::Reference(widget_id)]),
1310            }),
1311        );
1312        doc.objects.insert(
1313            catalog_id,
1314            Object::Dictionary(dictionary! {
1315                "Type" => "Catalog",
1316                "Pages" => Object::Reference(pages_id),
1317                "AcroForm" => Object::Reference(acroform_id),
1318            }),
1319        );
1320        doc.trailer.set("Root", Object::Reference(catalog_id));
1321
1322        let mut bytes = Vec::new();
1323        doc.save_to(&mut bytes)
1324            .expect("save push-button caption fixture");
1325        bytes
1326    }
1327
1328    #[test]
1329    fn extract_text_non_embedded_truetype_alias_resolves_via_winansi() {
1330        // Mirrors corpus PDF `171_171940.pdf`: TrueType font references
1331        // `TimesNewRoman` (resolves through the standard-font alias table)
1332        // with `WinAnsiEncoding` and no embedded font program. Extraction must
1333        // recover the text from the declared encoding even though no glyph
1334        // outlines are available.
1335        let bytes = non_embedded_truetype_pdf_bytes(
1336            b"TimesNewRoman",
1337            b"WinAnsiEncoding",
1338            b"UNITED STATES DISTRICT COURT",
1339        );
1340        let text = PdfDocument::open(bytes)
1341            .expect("open non-embedded TrueType fixture")
1342            .extract_text(0)
1343            .expect("extract non-embedded TrueType text");
1344        let norm = normalize_text(&text);
1345        assert!(
1346            norm.contains("UNITED STATES DISTRICT COURT"),
1347            "expected WinAnsi-decoded text, got: {norm:?}"
1348        );
1349    }
1350
1351    #[test]
1352    fn extract_text_non_embedded_truetype_unknown_name_still_decodes() {
1353        // Custom BaseFont that does not match any standard alias and lacks the
1354        // keywords used by the heuristic. The standard-font fallback (via
1355        // FallbackFontQuery) still picks Helvetica, but on hosts without the
1356        // embedded font assets that path returns None — the new TextOnly
1357        // branch is what keeps extraction non-empty in that case. Either way,
1358        // the WinAnsi-driven char map must produce the original prose.
1359        let bytes = non_embedded_truetype_pdf_bytes(
1360            b"OpaqueCustomXYZ",
1361            b"WinAnsiEncoding",
1362            b"Hello, world!",
1363        );
1364        let text = PdfDocument::open(bytes)
1365            .expect("open custom non-embedded fixture")
1366            .extract_text(0)
1367            .expect("extract custom non-embedded text");
1368        let norm = normalize_text(&text);
1369        assert!(
1370            norm.contains("Hello, world!"),
1371            "expected WinAnsi-decoded text, got: {norm:?}"
1372        );
1373    }
1374
1375    #[test]
1376    fn extract_acroform_text_includes_push_button_mk_caption() {
1377        let bytes = push_button_caption_pdf_bytes(b"Don't cry over spilt milk");
1378        let doc = PdfDocument::open(bytes).expect("open push-button caption fixture");
1379
1380        let page_text = doc.extract_text(0).expect("extract page text");
1381        assert!(
1382            normalize_text(&page_text).is_empty(),
1383            "expected empty page content stream, got: {page_text:?}"
1384        );
1385
1386        let acroform_text = doc.extract_acroform_text();
1387        assert_eq!(normalize_text(&acroform_text), "Don't cry over spilt milk");
1388
1389        let all_text = doc.extract_all_text();
1390        assert_eq!(normalize_text(&all_text), "Don't cry over spilt milk");
1391    }
1392
1393    #[test]
1394    fn bytes_to_string_utf8() {
1395        assert_eq!(bytes_to_string(b"hello"), "hello");
1396    }
1397
1398    #[test]
1399    fn bytes_to_string_latin1() {
1400        let bytes = &[0xC4, 0xD6, 0xDC]; // ÄÖÜ in Latin-1
1401        let s = bytes_to_string(bytes);
1402        assert_eq!(s, "ÄÖÜ");
1403    }
1404
1405    #[test]
1406    fn bytes_to_string_utf16() {
1407        let bytes = &[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]; // UTF-16 "Hi"
1408        assert_eq!(bytes_to_string(bytes), "Hi");
1409    }
1410
1411    #[test]
1412    fn document_info_default() {
1413        let info = DocumentInfo::default();
1414        assert!(info.title.is_none());
1415        assert!(info.author.is_none());
1416    }
1417
1418    #[test]
1419    fn bookmark_item_children() {
1420        let item = BookmarkItem {
1421            title: "Root".into(),
1422            page: None,
1423            children: vec![BookmarkItem {
1424                title: "Child".into(),
1425                page: Some(0),
1426                children: Vec::new(),
1427            }],
1428        };
1429        assert_eq!(item.children.len(), 1);
1430        assert_eq!(item.children[0].title, "Child");
1431    }
1432
1433    #[test]
1434    fn extract_text_type0_without_tounicode_uses_font_program_fallback() {
1435        let original = std::fs::read(corpus_path("sf181.pdf")).expect("read sf181 fixture");
1436        let expected = PdfDocument::open(original.clone())
1437            .expect("open original sf181")
1438            .extract_text(0)
1439            .expect("extract original sf181 text");
1440        assert!(
1441            expected.contains("Guide to Personnel Data Standards"),
1442            "unexpected baseline extraction: {expected}"
1443        );
1444
1445        let (stripped, removed) = strip_type0_tounicode(&original);
1446        assert!(
1447            removed > 0,
1448            "expected to strip at least one Type0 ToUnicode"
1449        );
1450
1451        let actual = PdfDocument::open(stripped)
1452            .expect("open stripped sf181")
1453            .extract_text(0)
1454            .expect("extract stripped sf181 text");
1455
1456        let actual_norm = normalize_text(&actual);
1457        let expected_norm = normalize_text(&expected);
1458
1459        assert!(
1460            actual_norm.contains("Guide to Personnel Data Standards"),
1461            "missing main heading after stripping ToUnicode: {actual_norm}"
1462        );
1463        assert!(
1464            actual_norm.contains("Privacy Act Statement"),
1465            "missing body text after stripping ToUnicode: {actual_norm}"
1466        );
1467        assert!(
1468            actual_norm.len() + 32 >= expected_norm.len(),
1469            "too much text lost after stripping ToUnicode: expected {} chars, got {}",
1470            expected_norm.len(),
1471            actual_norm.len()
1472        );
1473    }
1474
1475    #[test]
1476    fn extract_text_identity_h_bogus_tounicode_recovers_via_identity_fallback() {
1477        // PDFBOX-4322-3.pdf ships an Identity-H Type0 font whose `/ToUnicode`
1478        // stream is actually an Identity-H *encoding* CMap (only
1479        // `begincidrange <0000> <FFFF> 0`, no bf-mappings). The embedded
1480        // TrueType subset also has no `cmap` table, so both the ToUnicode
1481        // lookup and the reverse-cmap fallback fail. Previously this yielded
1482        // a 0-byte extraction because the character codes — which are Unicode
1483        // code points under Identity-H — were silently discarded.
1484        let bytes =
1485            std::fs::read(corpus_path("PDFBOX-4322-3.pdf")).expect("read PDFBOX-4322-3 fixture");
1486        let doc = PdfDocument::open(bytes).expect("open PDFBOX-4322-3");
1487        let text = doc.extract_all_text();
1488
1489        let norm = normalize_text(&text);
1490        assert!(
1491            norm.contains("Transatlantic Council"),
1492            "expected Identity-H codes to resolve as Unicode: {norm}"
1493        );
1494        assert!(
1495            norm.contains("Boy Scouts of America"),
1496            "expected body text to be recovered: {norm}"
1497        );
1498    }
1499
1500    #[test]
1501    fn render_page_with_config_srgb_matches_legacy_render_page() {
1502        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open rgb fixture");
1503        let legacy = doc
1504            .render_page(
1505                0,
1506                &RenderOptions {
1507                    dpi: 72.0,
1508                    ..Default::default()
1509                },
1510            )
1511            .expect("legacy render succeeds");
1512        let configured = doc
1513            .render_page_with_config(
1514                0,
1515                &RenderConfig {
1516                    color_mode: ColorMode::Srgb,
1517                    dpi: 72,
1518                },
1519            )
1520            .expect("configured render succeeds");
1521
1522        assert_eq!(legacy.width, configured.width);
1523        assert_eq!(legacy.height, configured.height);
1524        assert_eq!(legacy.pixel_format, PixelFormat::Rgba8);
1525        assert_eq!(configured.pixel_format, PixelFormat::Rgba8);
1526        assert_eq!(legacy.pixels, configured.pixels);
1527    }
1528
1529    #[test]
1530    fn render_page_with_config_preserve_cmyk_returns_cmyk_buffer() {
1531        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1532        let rendered = doc
1533            .render_page_with_config(
1534                0,
1535                &RenderConfig {
1536                    color_mode: ColorMode::PreserveCmyk,
1537                    dpi: 72,
1538                },
1539            )
1540            .expect("cmyk render succeeds");
1541
1542        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1543        assert_eq!(
1544            rendered.pixels.len(),
1545            rendered.width as usize * rendered.height as usize * 4
1546        );
1547        assert_eq!(
1548            pixel_at(&rendered, rendered.width / 2, rendered.height / 2),
1549            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1550        );
1551    }
1552
1553    #[test]
1554    fn render_page_with_config_simulate_cmyk_does_not_panic_on_cmyk_pdf() {
1555        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1556        let rendered = doc
1557            .render_page_with_config(
1558                0,
1559                &RenderConfig {
1560                    color_mode: ColorMode::SimulateCmyk,
1561                    dpi: 72,
1562                },
1563            )
1564            .expect("simulate cmyk render succeeds");
1565
1566        assert_eq!(rendered.pixel_format, PixelFormat::Rgba8);
1567        assert!(!rendered.pixels.is_empty());
1568    }
1569
1570    #[test]
1571    fn render_page_with_config_preserve_cmyk_mixed_page_preserves_only_cmyk_region() {
1572        let doc = PdfDocument::open(mixed_rgb_cmyk_pdf_bytes()).expect("open mixed fixture");
1573        let rendered = doc
1574            .render_page_with_config(
1575                0,
1576                &RenderConfig {
1577                    color_mode: ColorMode::PreserveCmyk,
1578                    dpi: 72,
1579                },
1580            )
1581            .expect("mixed render succeeds");
1582
1583        assert_eq!(
1584            pixel_at(&rendered, 54, 36),
1585            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1586        );
1587        assert_ne!(
1588            pixel_at(&rendered, 18, 36),
1589            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1590        );
1591    }
1592
1593    #[test]
1594    fn render_page_with_config_preserve_cmyk_transparent_page_does_not_crash() {
1595        let doc =
1596            PdfDocument::open(transparent_cmyk_pdf_bytes()).expect("open transparent cmyk fixture");
1597        let rendered = doc
1598            .render_page_with_config(
1599                0,
1600                &RenderConfig {
1601                    color_mode: ColorMode::PreserveCmyk,
1602                    dpi: 72,
1603                },
1604            )
1605            .expect("transparent cmyk render succeeds");
1606
1607        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1608        assert_eq!(
1609            rendered.pixels.len(),
1610            rendered.width as usize * rendered.height as usize * 4
1611        );
1612    }
1613
1614    #[test]
1615    fn render_page_with_config_preserve_cmyk_keeps_device_cmyk_image_bytes() {
1616        let doc = PdfDocument::open(cmyk_image_pdf_bytes()).expect("open cmyk image fixture");
1617        let rendered = doc
1618            .render_page_with_config(
1619                0,
1620                &RenderConfig {
1621                    color_mode: ColorMode::PreserveCmyk,
1622                    dpi: 72,
1623                },
1624            )
1625            .expect("cmyk image render succeeds");
1626
1627        assert_eq!(rendered.width, 2);
1628        assert_eq!(rendered.height, 1);
1629        assert_eq!(pixel_at(&rendered, 0, 0), [255, 0, 0, 0]);
1630        assert_eq!(pixel_at(&rendered, 1, 0), [0, 255, 0, 0]);
1631    }
1632}
pdf_engine/document.rs

pdf_engine/
document.rs