Skip to main content

pdf_engine/
document.rs

1//! Unified document facade — multi-page rendering, text extraction,
2//! metadata, bookmarks, and thumbnails.
3
4use crate::error::{EngineError, Result};
5use crate::geometry::{self, PageGeometry};
6use crate::limits::{LimitError, ProcessingLimits};
7use std::sync::{Arc, Mutex};
8
/// Shared slot used by the limit-warning collector.
///
/// Stores `Some((observed_bytes, limit_bytes))` when a
/// `StreamTooLarge` warning fires during rendering or extraction.
/// The slot starts out as `None`; only the first such warning is kept.
type LimitSlot = Arc<Mutex<Option<(u64, u64)>>>;
14use crate::render::{self, ColorMode, RenderConfig, RenderOptions, RenderedPage};
15use crate::text::{TextBlock, TextExtractionDevice};
16use crate::thumbnail::ThumbnailOptions;
17
18use pdf_forms::parse::parse_acroform;
19use pdf_forms::tree::{FieldType, FieldValue};
20use pdf_render::pdf_interpret::PageExt;
21use pdf_render::pdf_interpret::{
22    interpret_page, Cache, Context, InterpreterSettings, InterpreterWarning,
23};
24use pdf_render::pdf_syntax::object::dict::keys::{FIRST, NEXT, OUTLINES, TITLE};
25use pdf_render::pdf_syntax::object::{Dict, ObjectIdentifier};
26use pdf_render::pdf_syntax::page::Page;
27use pdf_render::pdf_syntax::{Pdf, PdfLoadLimits};
28#[cfg(feature = "parallel")]
29use rayon::prelude::*;
30use std::collections::BTreeSet;
31
32use kurbo::Rect;
33
/// Document metadata extracted from the info dictionary.
///
/// Every field is optional: [`PdfDocument::info`] fills a field only when the
/// parsed metadata carries the corresponding entry, and decodes the raw bytes
/// into a Rust `String`.
#[derive(Debug, Clone, Default)]
pub struct DocumentInfo {
    /// Document title.
    pub title: Option<String>,
    /// Author.
    pub author: Option<String>,
    /// Subject.
    pub subject: Option<String>,
    /// Keywords.
    pub keywords: Option<String>,
    /// Creator application.
    pub creator: Option<String>,
    /// Producer application.
    pub producer: Option<String>,
}
50
/// A bookmark / outline item.
///
/// Items form a tree: siblings are returned in a `Vec`, and each item owns
/// its nested entries in `children`.
#[derive(Debug, Clone)]
pub struct BookmarkItem {
    /// Bookmark title (empty when the outline entry has no title).
    pub title: String,
    /// Target page index (0-based), if resolvable.
    ///
    /// The outline walker in this file does not resolve destinations yet and
    /// always stores `None` here.
    pub page: Option<usize>,
    /// Nested child bookmarks.
    pub children: Vec<BookmarkItem>,
}
61
/// High-level PDF document handle.
///
/// Bundles the parsed document with the interpreter settings used for every
/// render and text-extraction call made through this handle.
pub struct PdfDocument {
    /// The parsed PDF.
    pdf: Pdf,
    /// Interpreter settings (cache, limits, warning sink, resolvers) applied
    /// to rendering and extraction.
    settings: InterpreterSettings,
}
67
68impl PdfDocument {
69    /// Open a PDF from bytes.
70    pub fn open(data: impl Into<pdf_render::pdf_syntax::PdfData>) -> Result<Self> {
71        let pdf = Pdf::new(data).map_err(|e| match e {
72            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
73                EngineError::Encrypted(format!("{d:?}"))
74            }
75            _ => EngineError::InvalidPdf(format!("{e:?}")),
76        })?;
77        let settings = InterpreterSettings {
78            shared_cache: Some(Cache::new()),
79            ..InterpreterSettings::default()
80        };
81        Ok(Self { pdf, settings })
82    }
83
84    /// Open a PDF from bytes with processing limits.
85    pub fn open_with_processing_limits(
86        data: impl Into<pdf_render::pdf_syntax::PdfData>,
87        limits: ProcessingLimits,
88    ) -> Result<Self> {
89        let syntax_limits = PdfLoadLimits::new()
90            .max_object_depth(limits.max_object_depth)
91            .max_image_pixels(limits.max_image_pixels)
92            .max_stream_bytes(limits.max_stream_bytes);
93        let pdf = Pdf::new_with_limits(data, syntax_limits).map_err(|e| match e {
94            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
95                EngineError::Encrypted(format!("{d:?}"))
96            }
97            _ => EngineError::InvalidPdf(format!("{e:?}")),
98        })?;
99        let settings = InterpreterSettings {
100            max_operator_count: Some(limits.max_operator_count),
101            shared_cache: Some(Cache::new()),
102            ..InterpreterSettings::default()
103        };
104        Ok(Self { pdf, settings })
105    }
106
107    /// Open a password-protected PDF.
108    pub fn open_with_password(
109        data: impl Into<pdf_render::pdf_syntax::PdfData>,
110        password: &str,
111    ) -> Result<Self> {
112        let pdf = Pdf::new_with_password(data, password).map_err(|e| match e {
113            pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
114                EngineError::Encrypted(format!("{d:?}"))
115            }
116            _ => EngineError::InvalidPdf(format!("{e:?}")),
117        })?;
118        let settings = InterpreterSettings {
119            shared_cache: Some(Cache::new()),
120            ..InterpreterSettings::default()
121        };
122        Ok(Self { pdf, settings })
123    }
124
125    /// Open a password-protected PDF with processing limits.
126    pub fn open_with_password_and_processing_limits(
127        data: impl Into<pdf_render::pdf_syntax::PdfData>,
128        password: &str,
129        limits: ProcessingLimits,
130    ) -> Result<Self> {
131        let syntax_limits = PdfLoadLimits::new()
132            .max_object_depth(limits.max_object_depth)
133            .max_image_pixels(limits.max_image_pixels)
134            .max_stream_bytes(limits.max_stream_bytes);
135        let pdf = Pdf::new_with_password_and_limits(data, password, syntax_limits).map_err(
136            |e| match e {
137                pdf_render::pdf_syntax::LoadPdfError::Decryption(d) => {
138                    EngineError::Encrypted(format!("{d:?}"))
139                }
140                _ => EngineError::InvalidPdf(format!("{e:?}")),
141            },
142        )?;
143        let settings = InterpreterSettings {
144            max_operator_count: Some(limits.max_operator_count),
145            shared_cache: Some(Cache::new()),
146            ..InterpreterSettings::default()
147        };
148        Ok(Self { pdf, settings })
149    }
150
    /// Access the underlying parsed PDF.
    ///
    /// Returns a shared borrow; the document handle keeps ownership.
    pub fn pdf(&self) -> &Pdf {
        &self.pdf
    }
155
    /// Structural recovery applied while loading this document (xref / page-tree
    /// rebuild). All-`false` for a document that parsed cleanly.
    ///
    /// Forwards directly to the parsed PDF's own `load_recovery()`.
    pub fn load_recovery(&self) -> pdf_render::pdf_syntax::LoadRecovery {
        self.pdf.load_recovery()
    }
161
    /// Set interpreter settings (font resolver, cmap resolver, etc.).
    ///
    /// The stored settings are replaced wholesale, so anything configured when
    /// the document was opened (the shared cache, and the operator-count limit
    /// for the `*_processing_limits` constructors) is kept only if `settings`
    /// carries it as well.
    pub fn set_settings(&mut self, settings: InterpreterSettings) {
        self.settings = settings;
    }
166
    /// Install a warning sink that receives [`InterpreterWarning`]s raised
    /// during subsequent render / text-extraction operations.
    ///
    /// Replaces any previously installed sink; all other settings (cache,
    /// limits, resolvers) are preserved. The sink is preserved across the
    /// extraction-settings clone, so it covers both rendering and extraction.
    ///
    /// Only the `warning_sink` field of the stored settings is overwritten.
    pub fn set_warning_sink(&mut self, sink: pdf_render::pdf_interpret::WarningSinkFn) {
        self.settings.warning_sink = sink;
    }
176
    /// Number of pages.
    ///
    /// Valid page indices for the other methods are `0..page_count()`.
    pub fn page_count(&self) -> usize {
        self.pdf.pages().len()
    }
181
182    /// Get the geometry of a page.
183    pub fn page_geometry(&self, index: usize) -> Result<PageGeometry> {
184        let page = self.get_page(index)?;
185        Ok(geometry::extract_geometry(page))
186    }
187
188    /// Render a single page.
189    ///
190    /// If the document contains an XFA template, it is automatically flattened
191    /// to static PDF content before rendering.  This prevents the "Please wait"
192    /// placeholder page that Adobe Reader would show when rendering an XFA PDF
193    /// with a conventional renderer. If flattening fails, rendering falls back
194    /// to the original document as a best-effort path.
195    pub fn render_page(&self, index: usize, options: &RenderOptions) -> Result<RenderedPage> {
196        #[cfg(feature = "xfa")]
197        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
198            return flat_doc.render_page(index, options);
199        }
200        let page = self.get_page(index)?;
201        // Pre-flight: reject pathologically small or zero-dimension pages before
202        // allocating any pixel buffer. Non-positive dimensions cause panics or
203        // zero-sized allocations inside the rasteriser.
204        let (w, h) = page.render_dimensions();
205        if w <= 0.0 || h <= 0.0 {
206            return Err(EngineError::InvalidPageGeometry {
207                width: w,
208                height: h,
209                reason: "page has zero or negative dimensions".into(),
210            });
211        }
212        // Also reject pages so small they produce zero pixels even at the
213        // minimum meaningful DPI (1 DPI). Below ~0.72pt at 1 DPI = 0 pixels.
214        const MIN_PAGE_PT: f32 = 1.0;
215        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
216            return Err(EngineError::InvalidPageGeometry {
217                width: w,
218                height: h,
219                reason: "page too small to render (< 1pt)".into(),
220            });
221        }
222        let (settings, slot) = Self::with_limit_collector(&self.settings);
223        let rendered = render::render_page(page, options, &settings);
224        Self::check_limit_slot(&slot)?;
225        Ok(rendered)
226    }
227
228    /// Render a single page using the high-level render config.
229    ///
230    /// XFA documents are auto-flattened before rendering (same as `render_page`).
231    /// If flattening fails, rendering falls back to the original document.
232    pub fn render_page_with_config(
233        &self,
234        index: usize,
235        config: &RenderConfig,
236    ) -> Result<RenderedPage> {
237        #[cfg(feature = "xfa")]
238        if let Some(flat_doc) = self.open_flattened_xfa_for_render() {
239            return flat_doc.render_page_with_config(index, config);
240        }
241        let page = self.get_page(index)?;
242        let (w, h) = page.render_dimensions();
243        if w <= 0.0 || h <= 0.0 {
244            return Err(EngineError::InvalidPageGeometry {
245                width: w,
246                height: h,
247                reason: "page has zero or negative dimensions".into(),
248            });
249        }
250        const MIN_PAGE_PT: f32 = 1.0;
251        if w < MIN_PAGE_PT || h < MIN_PAGE_PT {
252            return Err(EngineError::InvalidPageGeometry {
253                width: w,
254                height: h,
255                reason: "page too small to render (< 1pt)".into(),
256            });
257        }
258        let (settings, slot) = Self::with_limit_collector(&self.settings);
259        let rendered = render::render_page_with_config(page, config, &settings);
260        Self::check_limit_slot(&slot)?;
261        Ok(rendered)
262    }
263
264    /// Render a single page to a CMYK buffer.
265    pub fn render_page_cmyk(&self, index: usize, dpi: u32) -> Result<RenderedPage> {
266        self.render_page_with_config(
267            index,
268            &RenderConfig {
269                color_mode: ColorMode::PreserveCmyk,
270                dpi,
271            },
272        )
273    }
274
275    /// Render all pages, in parallel when the `parallel` feature is enabled.
276    pub fn render_all(&self, options: &RenderOptions) -> Vec<RenderedPage> {
277        let pages = self.pdf.pages();
278        #[cfg(feature = "parallel")]
279        return (0..pages.len())
280            .into_par_iter()
281            .map(|i| render::render_page(&pages[i], options, &self.settings))
282            .collect();
283        #[cfg(not(feature = "parallel"))]
284        (0..pages.len())
285            .map(|i| render::render_page(&pages[i], options, &self.settings))
286            .collect()
287    }
288
289    /// Render all pages using the high-level render config.
290    pub fn render_all_with_config(&self, config: &RenderConfig) -> Vec<RenderedPage> {
291        let pages = self.pdf.pages();
292        #[cfg(feature = "parallel")]
293        return (0..pages.len())
294            .into_par_iter()
295            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
296            .collect();
297        #[cfg(not(feature = "parallel"))]
298        (0..pages.len())
299            .map(|i| render::render_page_with_config(&pages[i], config, &self.settings))
300            .collect()
301    }
302
303    /// Generate a thumbnail for a single page.
304    pub fn thumbnail(&self, index: usize, options: &ThumbnailOptions) -> Result<RenderedPage> {
305        let page = self.get_page(index)?;
306        Ok(render::render_thumbnail(
307            page,
308            options.max_dimension,
309            &self.settings,
310        ))
311    }
312
313    /// Generate thumbnails for all pages, in parallel when the `parallel` feature is enabled.
314    pub fn thumbnails_all(&self, options: &ThumbnailOptions) -> Vec<RenderedPage> {
315        let pages = self.pdf.pages();
316        #[cfg(feature = "parallel")]
317        return (0..pages.len())
318            .into_par_iter()
319            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
320            .collect();
321        #[cfg(not(feature = "parallel"))]
322        (0..pages.len())
323            .map(|i| render::render_thumbnail(&pages[i], options.max_dimension, &self.settings))
324            .collect()
325    }
326
327    /// Extract text from a page as a single string.
328    pub fn extract_text(&self, index: usize) -> Result<String> {
329        let page = self.get_page(index)?;
330        let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
331        let mut device = TextExtractionDevice::new();
332        let mut ctx = Self::create_context_with_settings(page, settings);
333        interpret_page(page, &mut ctx, &mut device);
334        Self::check_limit_slot(&slot)?;
335        Ok(device.into_text())
336    }
337
338    /// Extract text from a sequence of pages while reusing the same settings object.
339    #[doc(hidden)]
340    pub fn extract_text_pages_reusing_settings<I>(&self, indices: I) -> Result<Vec<String>>
341    where
342        I: IntoIterator<Item = usize>,
343    {
344        let pages = self.pdf.pages();
345        let mut settings = self.text_extraction_settings();
346        let indices = indices.into_iter();
347        let (lower_bound, upper_bound) = indices.size_hint();
348        let mut texts = Vec::with_capacity(upper_bound.unwrap_or(lower_bound));
349
350        for index in indices {
351            let page = pages.get(index).ok_or(EngineError::PageOutOfRange {
352                index,
353                count: pages.len(),
354            })?;
355            let (text, next_settings) = Self::extract_text_with_settings(page, settings);
356            settings = next_settings;
357            texts.push(text);
358        }
359
360        Ok(texts)
361    }
362
363    /// Extract structured text blocks from a page.
364    pub fn extract_text_blocks(&self, index: usize) -> Result<Vec<TextBlock>> {
365        let page = self.get_page(index)?;
366        let (settings, slot) = Self::with_limit_collector(&self.text_extraction_settings());
367        let mut device = TextExtractionDevice::new();
368        let mut ctx = Self::create_context_with_settings(page, settings);
369        interpret_page(page, &mut ctx, &mut device);
370        Self::check_limit_slot(&slot)?;
371        Ok(device.into_blocks())
372    }
373
374    /// Extract structured text blocks from all pages, reusing interpreter settings.
375    pub fn extract_all_text_blocks(&self) -> Vec<Vec<TextBlock>> {
376        let pages = self.pdf.pages();
377        let mut settings = self.text_extraction_settings();
378        let mut blocks = Vec::with_capacity(pages.len());
379
380        for page in pages.iter() {
381            let (page_blocks, next_settings) =
382                Self::extract_text_blocks_with_settings(page, settings);
383            settings = next_settings;
384            blocks.push(page_blocks);
385        }
386
387        blocks
388    }
389
390    /// Extract text values from AcroForm fields, including push-button captions.
391    ///
392    /// Returns a single string concatenating all non-empty field values separated
393    /// by newlines. Useful when the document stores its readable content in form
394    /// field values rather than (or in addition to) page content streams.
395    pub fn extract_acroform_text(&self) -> String {
396        let Some(tree) = parse_acroform(&self.pdf) else {
397            return String::new();
398        };
399        let mut parts: Vec<String> = Vec::new();
400        for id in tree.all_ids() {
401            let node = tree.get(id);
402            if node.children.is_empty() {
403                // Terminal (widget) — collect text-like values.
404                let value_str = match &node.value {
405                    Some(FieldValue::Text(s)) if !s.is_empty() => Some(s.clone()),
406                    Some(FieldValue::StringArray(arr)) => {
407                        let joined = arr
408                            .iter()
409                            .filter(|s| !s.is_empty())
410                            .cloned()
411                            .collect::<Vec<_>>()
412                            .join(", ");
413                        if joined.is_empty() {
414                            None
415                        } else {
416                            Some(joined)
417                        }
418                    }
419                    _ => None,
420                };
421                let button_caption =
422                    value_str.is_none() && tree.effective_field_type(id) == Some(FieldType::Button);
423                let extracted = value_str.or_else(|| {
424                    button_caption.then(|| {
425                        node.mk
426                            .as_ref()
427                            .and_then(|mk| mk.caption.as_ref())
428                            .filter(|caption| !caption.is_empty())
429                            .cloned()
430                    })?
431                });
432                if let Some(s) = extracted {
433                    parts.push(s);
434                }
435            }
436        }
437        parts.join("\n")
438    }
439
440    /// Extract all text from the document: page content streams plus AcroForm
441    /// field values.  Mirrors pdftotext behaviour.
442    pub fn extract_all_text(&self) -> String {
443        let pages = self.pdf.pages();
444        let mut settings = self.text_extraction_settings();
445        let mut page_texts = Vec::with_capacity(pages.len());
446        for page in pages.iter() {
447            let (page_text, next_settings) = Self::extract_text_with_settings(page, settings);
448            settings = next_settings;
449            page_texts.push(page_text);
450        }
451
452        let mut text = join_page_texts(page_texts.iter().map(String::as_str));
453        let acroform = self.extract_acroform_text();
454        if !acroform.is_empty() {
455            if !text.is_empty() && !text.ends_with('\n') {
456                text.push('\n');
457            }
458            text.push_str(&acroform);
459        }
460        text
461    }
462
463    /// Simple text search: returns page indices containing the query string.
464    pub fn search_text(&self, query: &str) -> Vec<usize> {
465        let pages = self.pdf.pages();
466        let query_lower = query.to_lowercase();
467        #[cfg(feature = "parallel")]
468        let page_contains = |i: usize| -> Option<usize> {
469            let page = &pages[i];
470            let (text, _) = Self::extract_text_with_settings(page, self.text_extraction_settings());
471            if text.to_lowercase().contains(&query_lower) {
472                Some(i)
473            } else {
474                None
475            }
476        };
477        #[cfg(feature = "parallel")]
478        return (0..pages.len())
479            .into_par_iter()
480            .filter_map(page_contains)
481            .collect();
482        #[cfg(not(feature = "parallel"))]
483        {
484            let mut settings = self.text_extraction_settings();
485            let mut hits = Vec::new();
486            for (i, page) in pages.iter().enumerate() {
487                let (text, next_settings) = Self::extract_text_with_settings(page, settings);
488                settings = next_settings;
489                if text.to_lowercase().contains(&query_lower) {
490                    hits.push(i);
491                }
492            }
493            hits
494        }
495    }
496
497    /// Extract document metadata.
498    pub fn info(&self) -> DocumentInfo {
499        let meta = self.pdf.metadata();
500        DocumentInfo {
501            title: meta.title.as_ref().map(|b| bytes_to_string(b)),
502            author: meta.author.as_ref().map(|b| bytes_to_string(b)),
503            subject: meta.subject.as_ref().map(|b| bytes_to_string(b)),
504            keywords: meta.keywords.as_ref().map(|b| bytes_to_string(b)),
505            creator: meta.creator.as_ref().map(|b| bytes_to_string(b)),
506            producer: meta.producer.as_ref().map(|b| bytes_to_string(b)),
507        }
508    }
509
510    /// Extract document outline / bookmarks.
511    pub fn bookmarks(&self) -> Vec<BookmarkItem> {
512        let xref = self.pdf.xref();
513        let root_id = xref.root_id();
514        let catalog: Dict<'_> = match xref.get(root_id) {
515            Some(d) => d,
516            None => return Vec::new(),
517        };
518
519        let outlines: Dict<'_> = match catalog.get(OUTLINES) {
520            Some(d) => d,
521            None => return Vec::new(),
522        };
523
524        let first: Dict<'_> = match outlines.get(FIRST) {
525            Some(d) => d,
526            None => return Vec::new(),
527        };
528
529        // `visited` breaks cyclic /Next loops and /First back-references;
530        // `depth` bounds deep nesting so a malformed outline cannot hang or
531        // overflow the stack.
532        let mut visited = BTreeSet::new();
533        parse_outline_items(&first, 0, &mut visited)
534    }
535
536    /// Run OCR on a page and return the recognized text and word positions.
537    ///
538    /// The page is rendered at `dpi` (default 150) before recognition.
539    /// Pass any [`OcrBackend`] implementation; use [`OcrsBackend::try_default`]
540    /// to load the pure-Rust `ocrs` engine from the standard model paths.
541    ///
542    /// # Example
543    ///
544    /// ```no_run
545    /// # #[cfg(feature = "ocr")] {
546    /// use pdf_engine::{PdfDocument, OcrsBackend, RenderOptions};
547    ///
548    /// let doc = PdfDocument::open(std::fs::read("scan.pdf").unwrap()).unwrap();
549    /// let backend = OcrsBackend::try_default().unwrap();
550    /// let result = doc.ocr_page(0, &backend, 150.0_f64).unwrap();
551    /// println!("{}", result.text);
552    /// # }
553    /// ```
554    pub fn ocr_page(
555        &self,
556        index: usize,
557        backend: &dyn crate::ocr::OcrBackend,
558        dpi: f64,
559    ) -> crate::error::Result<crate::ocr::OcrResult> {
560        let opts = crate::render::RenderOptions {
561            dpi,
562            ..Default::default()
563        };
564        let rendered = self.render_page(index, &opts)?;
565
566        // Convert RGBA → RGB (ocrs expects RGB input).
567        let mut rgb = Vec::with_capacity((rendered.width * rendered.height * 3) as usize);
568        for chunk in rendered.pixels.chunks(4) {
569            rgb.push(chunk[0]);
570            rgb.push(chunk[1]);
571            rgb.push(chunk[2]);
572        }
573
574        backend
575            .recognize(&rgb, rendered.width, rendered.height)
576            .map_err(|e| crate::error::EngineError::RenderError(e.to_string()))
577    }
578
579    /// Wrap `settings` with a warning sink that captures the first
580    /// `InterpreterWarning::StreamTooLarge` into a shared slot.
581    ///
582    /// The returned slot is checked by [`Self::check_limit_slot`] after
583    /// the operation completes. Any previously installed sink is still
584    /// called so no warnings are silently dropped.
585    fn with_limit_collector(settings: &InterpreterSettings) -> (InterpreterSettings, LimitSlot) {
586        let slot: LimitSlot = Arc::new(Mutex::new(None));
587        let slot_clone = Arc::clone(&slot);
588        let prev_sink = settings.warning_sink.clone();
589        let mut new_settings = settings.clone();
590        new_settings.warning_sink = Arc::new(move |w: InterpreterWarning| {
591            if let InterpreterWarning::StreamTooLarge { observed, limit } = w {
592                let mut guard = slot_clone.lock().unwrap_or_else(|e| e.into_inner());
593                if guard.is_none() {
594                    *guard = Some((observed, limit));
595                }
596            }
597            prev_sink(w);
598        });
599        (new_settings, slot)
600    }
601
602    /// Check the slot populated by [`Self::with_limit_collector`].
603    ///
604    /// Returns `Err(EngineError::LimitExceeded(...))` if a
605    /// `StreamTooLarge` warning was captured, `Ok(())` otherwise.
606    fn check_limit_slot(slot: &LimitSlot) -> Result<()> {
607        if let Some((observed, limit)) = *slot.lock().unwrap_or_else(|e| e.into_inner()) {
608            return Err(EngineError::LimitExceeded(LimitError::StreamTooLarge {
609                actual_bytes: observed,
610                limit_bytes: limit,
611            }));
612        }
613        Ok(())
614    }
615
616    fn get_page(&self, index: usize) -> Result<&Page<'_>> {
617        let pages = self.pdf.pages();
618        if index >= pages.len() {
619            return Err(EngineError::PageOutOfRange {
620                index,
621                count: pages.len(),
622            });
623        }
624        Ok(&pages[index])
625    }
626
627    fn text_extraction_settings(&self) -> InterpreterSettings {
628        let mut settings = self.settings.clone();
629        // Text extraction should include signature widget appearance streams
630        // that rendering skips to match MuPDF visual output.
631        settings.skip_signature_widgets = false;
632        settings
633    }
634
635    fn create_context_with_settings<'a>(
636        page: &Page<'a>,
637        settings: InterpreterSettings,
638    ) -> Context<'a> {
639        let (w, h) = page.render_dimensions();
640        Context::new(
641            page.initial_transform(false),
642            Rect::new(0.0, 0.0, w as f64, h as f64),
643            page.xref(),
644            settings,
645        )
646    }
647
648    fn extract_text_with_settings<'a>(
649        page: &Page<'a>,
650        settings: InterpreterSettings,
651    ) -> (String, InterpreterSettings) {
652        let mut device = TextExtractionDevice::new();
653        let mut ctx = Self::create_context_with_settings(page, settings);
654        interpret_page(page, &mut ctx, &mut device);
655        let settings = ctx.into_settings();
656        (device.into_text(), settings)
657    }
658
659    fn extract_text_blocks_with_settings<'a>(
660        page: &Page<'a>,
661        settings: InterpreterSettings,
662    ) -> (Vec<TextBlock>, InterpreterSettings) {
663        let mut device = TextExtractionDevice::new();
664        let mut ctx = Self::create_context_with_settings(page, settings);
665        interpret_page(page, &mut ctx, &mut device);
666        let settings = ctx.into_settings();
667        (device.into_blocks(), settings)
668    }
669
670    #[cfg(feature = "xfa")]
671    fn open_flattened_xfa_for_render(&self) -> Option<Self> {
672        if !crate::xfa::has_xfa(self) {
673            return None;
674        }
675
676        let flat_bytes = crate::xfa::flatten(self).ok()?;
677        let mut flat_doc = Self::open(flat_bytes).ok()?;
678        flat_doc.settings = self.settings.clone();
679        Some(flat_doc)
680    }
681}
682
/// Join per-page texts into one string, separating pages pdftotext-style.
///
/// Between two pages a form feed (`U+000C`) is inserted. When the preceding
/// page has text, that text is first padded with newlines until it ends in a
/// blank line (`"\n\n"`). A blank preceding page contributes only its form
/// feed, wherever it sits in the document — no newlines are invented for it.
fn join_page_texts<I>(page_texts: I) -> String
where
    I: IntoIterator,
    I::Item: AsRef<str>,
{
    let mut text = String::new();
    // `None` until the first page has been appended; afterwards records
    // whether the most recently appended page was blank.
    let mut previous_was_blank: Option<bool> = None;

    for page_text in page_texts {
        let page_text = page_text.as_ref();
        if let Some(was_blank) = previous_was_blank {
            if !was_blank {
                // The previous page is non-empty, so `text` is non-empty and
                // this loop stops after at most two pushes.
                while !text.ends_with("\n\n") {
                    text.push('\n');
                }
            }
            text.push('\u{000C}');
        }
        text.push_str(page_text);
        previous_was_blank = Some(page_text.is_empty());
    }

    text
}
704
// Unit tests for the page-joining helper used by `extract_all_text`.
#[cfg(test)]
mod extract_all_text_tests {
    use super::join_page_texts;

    /// Two non-empty pages are separated by a blank line plus a form feed.
    #[test]
    fn separates_nonempty_pages_like_pdftotext() {
        assert_eq!(
            join_page_texts(["Page 1", "Page 2"]),
            "Page 1\n\n\u{000C}Page 2"
        );
    }

    /// An empty first page contributes only the form feed, no newlines.
    #[test]
    fn preserves_leading_blank_pages_without_extra_newlines() {
        assert_eq!(join_page_texts(["", "Page 2"]), "\u{000C}Page 2");
    }

    /// A page that already ends in a blank line is not padded further.
    #[test]
    fn reuses_existing_blank_line_before_form_feed() {
        assert_eq!(
            join_page_texts(["Page 1\n\n", "Page 2"]),
            "Page 1\n\n\u{000C}Page 2"
        );
    }
}
730
731/// Maximum outline (bookmark) nesting depth. Bounds recursion on adversarial
732/// `/First` chains; real outlines are far shallower.
733const MAX_OUTLINE_DEPTH: usize = 100;
734
735/// Walk the outline linked list (FIRST → NEXT chain).
736///
737/// `visited` (object ids of already-seen items) breaks cyclic `/Next` loops and
738/// `/First` back-references; `depth` bounds deeply nested `/First` chains. Both
739/// protect against malformed/adversarial outlines hanging or overflowing.
740fn parse_outline_items(
741    item_dict: &Dict<'_>,
742    depth: usize,
743    visited: &mut BTreeSet<ObjectIdentifier>,
744) -> Vec<BookmarkItem> {
745    let mut items = Vec::new();
746    if depth >= MAX_OUTLINE_DEPTH {
747        return items;
748    }
749    let mut current: Option<Dict<'_>> = Some(item_dict.clone());
750
751    while let Some(dict) = current {
752        // Stop if we re-enter an item: covers a /Next chain that loops back and
753        // a /First child that references an ancestor.
754        if let Some(id) = dict.obj_id() {
755            if !visited.insert(id) {
756                break;
757            }
758        }
759
760        let title = dict
761            .get::<pdf_render::pdf_syntax::object::String>(TITLE)
762            .map(|s| bytes_to_string(s.as_bytes()))
763            .unwrap_or_default();
764
765        let children = match dict.get::<Dict<'_>>(FIRST) {
766            Some(child_dict) => parse_outline_items(&child_dict, depth + 1, visited),
767            None => Vec::new(),
768        };
769
770        items.push(BookmarkItem {
771            title,
772            page: None, // Destination resolution requires named-dest lookup — left for follow-up
773            children,
774        });
775
776        current = dict.get::<Dict<'_>>(NEXT);
777    }
778
779    items
780}
781
/// Convert PDF string bytes to a Rust `String`.
///
/// Decoding rules, tried in order:
///
/// 1. `FE FF` byte order mark → UTF-16BE (lossy; a trailing odd byte is dropped).
/// 2. `FF FE` byte order mark → UTF-16LE, same rules. Not sanctioned by the
///    PDF specification but produced by some writers.
/// 3. Valid UTF-8, with a leading `EF BB BF` marker (PDF 2.0) stripped.
/// 4. Otherwise every byte is mapped to the code point of the same value
///    (Latin-1).
fn bytes_to_string(bytes: &[u8]) -> String {
    const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
    const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
    const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];

    /// Decode a BOM-less UTF-16 payload, ignoring a dangling final byte.
    fn decode_utf16(payload: &[u8], to_unit: fn([u8; 2]) -> u16) -> String {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| to_unit([pair[0], pair[1]]))
            .collect();
        String::from_utf16_lossy(&units)
    }

    if let Some(payload) = bytes.strip_prefix(UTF16_BE_BOM) {
        return decode_utf16(payload, u16::from_be_bytes);
    }
    if let Some(payload) = bytes.strip_prefix(UTF16_LE_BOM) {
        return decode_utf16(payload, u16::from_le_bytes);
    }

    // Try UTF-8 (without its marker, if any); fall back to Latin-1 over the
    // untouched input so the marker is only dropped when it really was one.
    let utf8_candidate = bytes.strip_prefix(UTF8_BOM).unwrap_or(bytes);
    match std::str::from_utf8(utf8_candidate) {
        Ok(text) => text.to_owned(),
        Err(_) => bytes.iter().map(|&b| char::from(b)).collect(),
    }
}
805
806#[cfg(test)]
807mod tests {
808    use super::*;
809    use crate::render::{ColorMode, PixelFormat, RenderConfig, RenderOptions};
810    use lopdf::{Document as LoDocument, Object};
811    use std::path::PathBuf;
812
813    fn corpus_path(name: &str) -> PathBuf {
814        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
815            .join("../../corpus")
816            .join(name)
817    }
818
819    /// A cyclic outline (`/Next` chain that loops back) must not hang the
820    /// bookmark walker; the cycle guard breaks it into a finite list.
821    #[test]
822    fn cyclic_outline_terminates_and_is_bounded() {
823        fn cyclic_outline_pdf() -> Vec<u8> {
824            let objs: [&[u8]; 6] = [
825                b"<< /Type /Catalog /Pages 2 0 R /Outlines 4 0 R >>",
826                b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
827                b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 200 200] >>",
828                b"<< /Type /Outlines /First 5 0 R >>",
829                b"<< /Title (A) /Next 6 0 R >>",
830                b"<< /Title (B) /Next 5 0 R >>", // /Next back to A -> cycle
831            ];
832            let mut buf = Vec::new();
833            let mut offsets = [0usize; 7];
834            buf.extend_from_slice(b"%PDF-1.7\n");
835            for (i, body) in objs.iter().enumerate() {
836                offsets[i + 1] = buf.len();
837                buf.extend_from_slice(format!("{} 0 obj\n", i + 1).as_bytes());
838                buf.extend_from_slice(body);
839                buf.extend_from_slice(b"\nendobj\n");
840            }
841            let xref_off = buf.len();
842            buf.extend_from_slice(b"xref\n0 7\n0000000000 65535 f \n");
843            for o in &offsets[1..7] {
844                buf.extend_from_slice(format!("{o:010} 00000 n \n").as_bytes());
845            }
846            buf.extend_from_slice(
847                format!("trailer\n<< /Size 7 /Root 1 0 R >>\nstartxref\n{xref_off}\n%%EOF")
848                    .as_bytes(),
849            );
850            buf
851        }
852
853        fn count(b: &BookmarkItem) -> usize {
854            1 + b.children.iter().map(count).sum::<usize>()
855        }
856
857        let doc = PdfDocument::open(cyclic_outline_pdf()).expect("open cyclic-outline PDF");
858        let bookmarks = doc.bookmarks();
859        let total: usize = bookmarks.iter().map(count).sum();
860        assert!(
861            total <= 2,
862            "cyclic /Next outline must not loop forever; got {total} items"
863        );
864    }
865
866    /// Cache-on vs cache-off: the shared decoded-image cache is a pure
867    /// performance optimisation, so rendering with it enabled must produce
868    /// pixel-identical output to rendering with it disabled. Exercised against a
869    /// committed image-bearing fixture.
870    #[test]
871    fn shared_image_cache_is_render_neutral() {
872        let path =
873            PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../tests/corpus-mini/scanned.pdf");
874        let data = std::fs::read(&path).expect("read scanned.pdf fixture");
875        let cfg = RenderConfig::default();
876
877        // Cache ON: `PdfDocument::open` installs a shared cache, so the second
878        // render of the same page is a cache hit.
879        let doc_on = PdfDocument::open(data.clone()).expect("open cache-on");
880        let miss = doc_on
881            .render_page_with_config(0, &cfg)
882            .expect("cold render (cache miss)");
883        let hit = doc_on
884            .render_page_with_config(0, &cfg)
885            .expect("warm render (cache hit)");
886
887        // Cache OFF: default settings carry `shared_cache: None`, so every render
888        // decodes images from scratch.
889        let mut doc_off = PdfDocument::open(data).expect("open cache-off");
890        doc_off.set_settings(InterpreterSettings::default());
891        let uncached = doc_off
892            .render_page_with_config(0, &cfg)
893            .expect("render with cache disabled");
894
895        assert_eq!(
896            (miss.width, miss.height),
897            (uncached.width, uncached.height),
898            "render dimensions must match"
899        );
900        assert_eq!(
901            miss.pixels, hit.pixels,
902            "a cache hit must return exactly the freshly-decoded render"
903        );
904        assert_eq!(
905            miss.pixels, uncached.pixels,
906            "cache-enabled render must equal cache-disabled render"
907        );
908    }
909
910    fn normalize_text(text: &str) -> String {
911        text.split_whitespace().collect::<Vec<_>>().join(" ")
912    }
913
914    fn strip_type0_tounicode(data: &[u8]) -> (Vec<u8>, usize) {
915        fn get_name(dict: &lopdf::Dictionary, key: &[u8]) -> Option<Vec<u8>> {
916            match dict.get(key).ok()? {
917                Object::Name(name) => Some(name.clone()),
918                _ => None,
919            }
920        }
921
922        fn descendant_is_cidfont_type2(doc: &LoDocument, type0: &lopdf::Dictionary) -> bool {
923            let Some(Object::Array(descendants)) = type0.get(b"DescendantFonts").ok() else {
924                return false;
925            };
926            let Some(Object::Reference(desc_id)) = descendants.first() else {
927                return false;
928            };
929            let Ok(Object::Dictionary(descendant)) = doc.get_object(*desc_id) else {
930                return false;
931            };
932            matches!(
933                descendant.get(b"Subtype").ok(),
934                Some(Object::Name(name)) if name.as_slice() == b"CIDFontType2"
935            )
936        }
937
938        let mut doc = LoDocument::load_mem(data).expect("load stripped-to-unicode fixture");
939        let ids: Vec<_> = doc.objects.keys().copied().collect();
940        let mut removed = 0usize;
941
942        for id in ids {
943            let Some(Object::Dictionary(dict)) = doc.objects.get(&id) else {
944                continue;
945            };
946            if !matches!(
947                dict.get(b"Subtype").ok(),
948                Some(Object::Name(name)) if name.as_slice() == b"Type0"
949            ) {
950                continue;
951            }
952            if !matches!(
953                get_name(dict, b"Encoding").as_deref(),
954                Some(b"Identity-H") | Some(b"Identity-V")
955            ) {
956                continue;
957            }
958            if !descendant_is_cidfont_type2(&doc, dict) {
959                continue;
960            }
961
962            if let Some(Object::Dictionary(type0)) = doc.objects.get_mut(&id) {
963                if type0.has(b"ToUnicode") {
964                    type0.remove(b"ToUnicode");
965                    removed += 1;
966                }
967            }
968        }
969
970        let mut out = Vec::new();
971        doc.save_to(&mut out)
972            .expect("save stripped-to-unicode fixture");
973        (out, removed)
974    }
975
976    fn solid_fill_pdf_bytes(color_operator: &str) -> Vec<u8> {
977        use lopdf::{dictionary, Document, Object, Stream};
978
979        let mut doc = Document::with_version("1.4");
980
981        let pages_id = doc.new_object_id();
982        let page_id = doc.new_object_id();
983        let content = format!("{color_operator}\n0 0 72 72 re f\n");
984        let content_id = doc.add_object(Stream::new(dictionary! {}, content.into_bytes()));
985
986        doc.objects.insert(
987            page_id,
988            Object::Dictionary(dictionary! {
989                "Type" => Object::Name(b"Page".to_vec()),
990                "Parent" => Object::Reference(pages_id),
991                "MediaBox" => Object::Array(vec![
992                    Object::Integer(0),
993                    Object::Integer(0),
994                    Object::Integer(72),
995                    Object::Integer(72),
996                ]),
997                "Contents" => Object::Reference(content_id),
998            }),
999        );
1000
1001        doc.objects.insert(
1002            pages_id,
1003            Object::Dictionary(dictionary! {
1004                "Type" => Object::Name(b"Pages".to_vec()),
1005                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1006                "Count" => Object::Integer(1),
1007            }),
1008        );
1009
1010        let catalog_id = doc.new_object_id();
1011        doc.objects.insert(
1012            catalog_id,
1013            Object::Dictionary(dictionary! {
1014                "Type" => Object::Name(b"Catalog".to_vec()),
1015                "Pages" => Object::Reference(pages_id),
1016            }),
1017        );
1018
1019        doc.trailer.set("Root", Object::Reference(catalog_id));
1020
1021        let mut bytes = Vec::new();
1022        doc.save_to(&mut bytes).expect("save solid fill fixture");
1023        bytes
1024    }
1025
1026    fn mixed_rgb_cmyk_pdf_bytes() -> Vec<u8> {
1027        use lopdf::{dictionary, Document, Object, Stream};
1028
1029        let mut doc = Document::with_version("1.4");
1030        let pages_id = doc.new_object_id();
1031        let page_id = doc.new_object_id();
1032        let content = b"1 0 0 rg\n0 0 36 72 re f\n1 0 0 0 k\n36 0 36 72 re f\n".to_vec();
1033        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1034
1035        doc.objects.insert(
1036            page_id,
1037            Object::Dictionary(dictionary! {
1038                "Type" => "Page",
1039                "Parent" => Object::Reference(pages_id),
1040                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
1041                "Contents" => Object::Reference(content_id),
1042            }),
1043        );
1044        doc.objects.insert(
1045            pages_id,
1046            Object::Dictionary(dictionary! {
1047                "Type" => "Pages",
1048                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1049                "Count" => Object::Integer(1),
1050            }),
1051        );
1052        let catalog_id = doc.new_object_id();
1053        doc.objects.insert(
1054            catalog_id,
1055            Object::Dictionary(dictionary! {
1056                "Type" => "Catalog",
1057                "Pages" => Object::Reference(pages_id),
1058            }),
1059        );
1060        doc.trailer.set("Root", Object::Reference(catalog_id));
1061
1062        let mut bytes = Vec::new();
1063        doc.save_to(&mut bytes)
1064            .expect("save mixed rgb/cmyk fixture");
1065        bytes
1066    }
1067
1068    fn transparent_cmyk_pdf_bytes() -> Vec<u8> {
1069        use lopdf::{dictionary, Document, Object, Stream};
1070
1071        let mut doc = Document::with_version("1.4");
1072        let pages_id = doc.new_object_id();
1073        let page_id = doc.new_object_id();
1074        let gs_id = doc.add_object(Object::Dictionary(dictionary! {
1075            "Type" => "ExtGState",
1076            "ca" => Object::Real(0.5),
1077        }));
1078        let content = b"/GS1 gs\n1 0 0 0 k\n0 0 72 72 re f\n".to_vec();
1079        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1080
1081        doc.objects.insert(
1082            page_id,
1083            Object::Dictionary(dictionary! {
1084                "Type" => "Page",
1085                "Parent" => Object::Reference(pages_id),
1086                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 72.into(), 72.into()]),
1087                "Resources" => dictionary! {
1088                    "ExtGState" => dictionary! {
1089                        "GS1" => Object::Reference(gs_id),
1090                    },
1091                },
1092                "Contents" => Object::Reference(content_id),
1093            }),
1094        );
1095        doc.objects.insert(
1096            pages_id,
1097            Object::Dictionary(dictionary! {
1098                "Type" => "Pages",
1099                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1100                "Count" => Object::Integer(1),
1101            }),
1102        );
1103        let catalog_id = doc.new_object_id();
1104        doc.objects.insert(
1105            catalog_id,
1106            Object::Dictionary(dictionary! {
1107                "Type" => "Catalog",
1108                "Pages" => Object::Reference(pages_id),
1109            }),
1110        );
1111        doc.trailer.set("Root", Object::Reference(catalog_id));
1112
1113        let mut bytes = Vec::new();
1114        doc.save_to(&mut bytes)
1115            .expect("save transparent cmyk fixture");
1116        bytes
1117    }
1118
1119    fn cmyk_image_pdf_bytes() -> Vec<u8> {
1120        use lopdf::{dictionary, Document, Object, Stream};
1121
1122        let mut doc = Document::with_version("1.4");
1123        let pages_id = doc.new_object_id();
1124        let page_id = doc.new_object_id();
1125        let image_id = doc.add_object(Stream::new(
1126            dictionary! {
1127                "Type" => "XObject",
1128                "Subtype" => "Image",
1129                "Width" => Object::Integer(2),
1130                "Height" => Object::Integer(1),
1131                "BitsPerComponent" => Object::Integer(8),
1132                "ColorSpace" => "DeviceCMYK",
1133            },
1134            vec![255, 0, 0, 0, 0, 255, 0, 0],
1135        ));
1136        let content = b"q\n2 0 0 1 0 0 cm\n/Im1 Do\nQ\n".to_vec();
1137        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1138
1139        doc.objects.insert(
1140            page_id,
1141            Object::Dictionary(dictionary! {
1142                "Type" => "Page",
1143                "Parent" => Object::Reference(pages_id),
1144                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 2.into(), 1.into()]),
1145                "Resources" => dictionary! {
1146                    "XObject" => dictionary! {
1147                        "Im1" => Object::Reference(image_id),
1148                    },
1149                },
1150                "Contents" => Object::Reference(content_id),
1151            }),
1152        );
1153        doc.objects.insert(
1154            pages_id,
1155            Object::Dictionary(dictionary! {
1156                "Type" => "Pages",
1157                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1158                "Count" => Object::Integer(1),
1159            }),
1160        );
1161        let catalog_id = doc.new_object_id();
1162        doc.objects.insert(
1163            catalog_id,
1164            Object::Dictionary(dictionary! {
1165                "Type" => "Catalog",
1166                "Pages" => Object::Reference(pages_id),
1167            }),
1168        );
1169        doc.trailer.set("Root", Object::Reference(catalog_id));
1170
1171        let mut bytes = Vec::new();
1172        doc.save_to(&mut bytes).expect("save cmyk image fixture");
1173        bytes
1174    }
1175
1176    fn pixel_at(rendered: &RenderedPage, x: u32, y: u32) -> [u8; 4] {
1177        let idx = ((y * rendered.width + x) * 4) as usize;
1178        [
1179            rendered.pixels[idx],
1180            rendered.pixels[idx + 1],
1181            rendered.pixels[idx + 2],
1182            rendered.pixels[idx + 3],
1183        ]
1184    }
1185
1186    /// Build a minimal one-page PDF whose only font is a non-embedded TrueType
1187    /// reference (no `FontFile2`). The character codes in the content stream
1188    /// resolve through the declared `/Encoding`, exercising the same code path
1189    /// as corpus PDFs like `171_171940.pdf`.
1190    fn non_embedded_truetype_pdf_bytes(
1191        base_font: &[u8],
1192        encoding: &[u8],
1193        text_bytes: &[u8],
1194    ) -> Vec<u8> {
1195        use lopdf::{dictionary, Document, Object, Stream};
1196
1197        let mut doc = Document::with_version("1.4");
1198
1199        let font_id = doc.add_object(Object::Dictionary(dictionary! {
1200            "Type" => "Font",
1201            "Subtype" => "TrueType",
1202            "Name" => Object::Name(b"F0".to_vec()),
1203            "BaseFont" => Object::Name(base_font.to_vec()),
1204            "Encoding" => Object::Name(encoding.to_vec()),
1205        }));
1206
1207        let resources_id = doc.add_object(Object::Dictionary(dictionary! {
1208            "Font" => dictionary! { "F0" => Object::Reference(font_id) },
1209        }));
1210
1211        let mut content = Vec::new();
1212        content.extend_from_slice(b"BT\n/F0 12 Tf\n100 700 Td\n(");
1213        for &b in text_bytes {
1214            match b {
1215                b'(' | b')' | b'\\' => {
1216                    content.push(b'\\');
1217                    content.push(b);
1218                }
1219                _ => content.push(b),
1220            }
1221        }
1222        content.extend_from_slice(b") Tj\nET\n");
1223        let content_id = doc.add_object(Stream::new(dictionary! {}, content));
1224
1225        let pages_id = doc.new_object_id();
1226        let page_id = doc.add_object(Object::Dictionary(dictionary! {
1227            "Type" => "Page",
1228            "Parent" => Object::Reference(pages_id),
1229            "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1230            "Resources" => Object::Reference(resources_id),
1231            "Contents" => Object::Reference(content_id),
1232        }));
1233        doc.objects.insert(
1234            pages_id,
1235            Object::Dictionary(dictionary! {
1236                "Type" => "Pages",
1237                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1238                "Count" => Object::Integer(1),
1239            }),
1240        );
1241        let catalog_id = doc.add_object(Object::Dictionary(dictionary! {
1242            "Type" => "Catalog",
1243            "Pages" => Object::Reference(pages_id),
1244        }));
1245        doc.trailer.set("Root", Object::Reference(catalog_id));
1246
1247        let mut bytes = Vec::new();
1248        doc.save_to(&mut bytes).expect("save non-embedded fixture");
1249        bytes
1250    }
1251
1252    /// Build a minimal AcroForm push button whose only human-readable text
1253    /// lives in the widget `/MK /CA` caption entry.
1254    fn push_button_caption_pdf_bytes(caption: &[u8]) -> Vec<u8> {
1255        use lopdf::{dictionary, Document, Object, Stream, StringFormat};
1256
1257        let mut doc = Document::with_version("1.4");
1258
1259        let catalog_id = doc.new_object_id();
1260        let pages_id = doc.new_object_id();
1261        let page_id = doc.new_object_id();
1262        let acroform_id = doc.new_object_id();
1263        let content_id = doc.new_object_id();
1264        let widget_id = doc.new_object_id();
1265
1266        doc.objects.insert(
1267            content_id,
1268            Object::Stream(Stream::new(dictionary! {}, Vec::new())),
1269        );
1270        doc.objects.insert(
1271            widget_id,
1272            Object::Dictionary(dictionary! {
1273                "Type" => "Annot",
1274                "Subtype" => "Widget",
1275                "FT" => "Btn",
1276                "Ff" => Object::Integer(1 << 16),
1277                "T" => Object::String(b"button".to_vec(), StringFormat::Literal),
1278                "MK" => dictionary! {
1279                    "CA" => Object::String(caption.to_vec(), StringFormat::Literal),
1280                },
1281                "Rect" => Object::Array(vec![100.into(), 700.into(), 260.into(), 730.into()]),
1282                "P" => Object::Reference(page_id),
1283            }),
1284        );
1285        doc.objects.insert(
1286            page_id,
1287            Object::Dictionary(dictionary! {
1288                "Type" => "Page",
1289                "Parent" => Object::Reference(pages_id),
1290                "MediaBox" => Object::Array(vec![0.into(), 0.into(), 612.into(), 792.into()]),
1291                "Annots" => Object::Array(vec![Object::Reference(widget_id)]),
1292                "Contents" => Object::Reference(content_id),
1293            }),
1294        );
1295        doc.objects.insert(
1296            pages_id,
1297            Object::Dictionary(dictionary! {
1298                "Type" => "Pages",
1299                "Kids" => Object::Array(vec![Object::Reference(page_id)]),
1300                "Count" => Object::Integer(1),
1301            }),
1302        );
1303        doc.objects.insert(
1304            acroform_id,
1305            Object::Dictionary(dictionary! {
1306                "Fields" => Object::Array(vec![Object::Reference(widget_id)]),
1307            }),
1308        );
1309        doc.objects.insert(
1310            catalog_id,
1311            Object::Dictionary(dictionary! {
1312                "Type" => "Catalog",
1313                "Pages" => Object::Reference(pages_id),
1314                "AcroForm" => Object::Reference(acroform_id),
1315            }),
1316        );
1317        doc.trailer.set("Root", Object::Reference(catalog_id));
1318
1319        let mut bytes = Vec::new();
1320        doc.save_to(&mut bytes)
1321            .expect("save push-button caption fixture");
1322        bytes
1323    }
1324
1325    #[test]
1326    fn extract_text_non_embedded_truetype_alias_resolves_via_winansi() {
1327        // Mirrors corpus PDF `171_171940.pdf`: TrueType font references
1328        // `TimesNewRoman` (resolves through the standard-font alias table)
1329        // with `WinAnsiEncoding` and no embedded font program. Extraction must
1330        // recover the text from the declared encoding even though no glyph
1331        // outlines are available.
1332        let bytes = non_embedded_truetype_pdf_bytes(
1333            b"TimesNewRoman",
1334            b"WinAnsiEncoding",
1335            b"UNITED STATES DISTRICT COURT",
1336        );
1337        let text = PdfDocument::open(bytes)
1338            .expect("open non-embedded TrueType fixture")
1339            .extract_text(0)
1340            .expect("extract non-embedded TrueType text");
1341        let norm = normalize_text(&text);
1342        assert!(
1343            norm.contains("UNITED STATES DISTRICT COURT"),
1344            "expected WinAnsi-decoded text, got: {norm:?}"
1345        );
1346    }
1347
1348    #[test]
1349    fn extract_text_non_embedded_truetype_unknown_name_still_decodes() {
1350        // Custom BaseFont that does not match any standard alias and lacks the
1351        // keywords used by the heuristic. The standard-font fallback (via
1352        // FallbackFontQuery) still picks Helvetica, but on hosts without the
1353        // embedded font assets that path returns None — the new TextOnly
1354        // branch is what keeps extraction non-empty in that case. Either way,
1355        // the WinAnsi-driven char map must produce the original prose.
1356        let bytes = non_embedded_truetype_pdf_bytes(
1357            b"OpaqueCustomXYZ",
1358            b"WinAnsiEncoding",
1359            b"Hello, world!",
1360        );
1361        let text = PdfDocument::open(bytes)
1362            .expect("open custom non-embedded fixture")
1363            .extract_text(0)
1364            .expect("extract custom non-embedded text");
1365        let norm = normalize_text(&text);
1366        assert!(
1367            norm.contains("Hello, world!"),
1368            "expected WinAnsi-decoded text, got: {norm:?}"
1369        );
1370    }
1371
1372    #[test]
1373    fn extract_acroform_text_includes_push_button_mk_caption() {
1374        let bytes = push_button_caption_pdf_bytes(b"Don't cry over spilt milk");
1375        let doc = PdfDocument::open(bytes).expect("open push-button caption fixture");
1376
1377        let page_text = doc.extract_text(0).expect("extract page text");
1378        assert!(
1379            normalize_text(&page_text).is_empty(),
1380            "expected empty page content stream, got: {page_text:?}"
1381        );
1382
1383        let acroform_text = doc.extract_acroform_text();
1384        assert_eq!(normalize_text(&acroform_text), "Don't cry over spilt milk");
1385
1386        let all_text = doc.extract_all_text();
1387        assert_eq!(normalize_text(&all_text), "Don't cry over spilt milk");
1388    }
1389
1390    #[test]
1391    fn bytes_to_string_utf8() {
1392        assert_eq!(bytes_to_string(b"hello"), "hello");
1393    }
1394
1395    #[test]
1396    fn bytes_to_string_latin1() {
1397        let bytes = &[0xC4, 0xD6, 0xDC]; // ÄÖÜ in Latin-1
1398        let s = bytes_to_string(bytes);
1399        assert_eq!(s, "ÄÖÜ");
1400    }
1401
1402    #[test]
1403    fn bytes_to_string_utf16() {
1404        let bytes = &[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]; // UTF-16 "Hi"
1405        assert_eq!(bytes_to_string(bytes), "Hi");
1406    }
1407
1408    #[test]
1409    fn document_info_default() {
1410        let info = DocumentInfo::default();
1411        assert!(info.title.is_none());
1412        assert!(info.author.is_none());
1413    }
1414
1415    #[test]
1416    fn bookmark_item_children() {
1417        let item = BookmarkItem {
1418            title: "Root".into(),
1419            page: None,
1420            children: vec![BookmarkItem {
1421                title: "Child".into(),
1422                page: Some(0),
1423                children: Vec::new(),
1424            }],
1425        };
1426        assert_eq!(item.children.len(), 1);
1427        assert_eq!(item.children[0].title, "Child");
1428    }
1429
1430    #[test]
1431    fn extract_text_type0_without_tounicode_uses_font_program_fallback() {
1432        let original = std::fs::read(corpus_path("sf181.pdf")).expect("read sf181 fixture");
1433        let expected = PdfDocument::open(original.clone())
1434            .expect("open original sf181")
1435            .extract_text(0)
1436            .expect("extract original sf181 text");
1437        assert!(
1438            expected.contains("Guide to Personnel Data Standards"),
1439            "unexpected baseline extraction: {expected}"
1440        );
1441
1442        let (stripped, removed) = strip_type0_tounicode(&original);
1443        assert!(
1444            removed > 0,
1445            "expected to strip at least one Type0 ToUnicode"
1446        );
1447
1448        let actual = PdfDocument::open(stripped)
1449            .expect("open stripped sf181")
1450            .extract_text(0)
1451            .expect("extract stripped sf181 text");
1452
1453        let actual_norm = normalize_text(&actual);
1454        let expected_norm = normalize_text(&expected);
1455
1456        assert!(
1457            actual_norm.contains("Guide to Personnel Data Standards"),
1458            "missing main heading after stripping ToUnicode: {actual_norm}"
1459        );
1460        assert!(
1461            actual_norm.contains("Privacy Act Statement"),
1462            "missing body text after stripping ToUnicode: {actual_norm}"
1463        );
1464        assert!(
1465            actual_norm.len() + 32 >= expected_norm.len(),
1466            "too much text lost after stripping ToUnicode: expected {} chars, got {}",
1467            expected_norm.len(),
1468            actual_norm.len()
1469        );
1470    }
1471
1472    #[test]
1473    fn extract_text_identity_h_bogus_tounicode_recovers_via_identity_fallback() {
1474        // PDFBOX-4322-3.pdf ships an Identity-H Type0 font whose `/ToUnicode`
1475        // stream is actually an Identity-H *encoding* CMap (only
1476        // `begincidrange <0000> <FFFF> 0`, no bf-mappings). The embedded
1477        // TrueType subset also has no `cmap` table, so both the ToUnicode
1478        // lookup and the reverse-cmap fallback fail. Previously this yielded
1479        // a 0-byte extraction because the character codes — which are Unicode
1480        // code points under Identity-H — were silently discarded.
1481        let bytes =
1482            std::fs::read(corpus_path("PDFBOX-4322-3.pdf")).expect("read PDFBOX-4322-3 fixture");
1483        let doc = PdfDocument::open(bytes).expect("open PDFBOX-4322-3");
1484        let text = doc.extract_all_text();
1485
1486        let norm = normalize_text(&text);
1487        assert!(
1488            norm.contains("Transatlantic Council"),
1489            "expected Identity-H codes to resolve as Unicode: {norm}"
1490        );
1491        assert!(
1492            norm.contains("Boy Scouts of America"),
1493            "expected body text to be recovered: {norm}"
1494        );
1495    }
1496
1497    #[test]
1498    fn render_max_pixels_none_is_unchanged_default_behavior() {
1499        // Default (max_pixels = None) must be byte-identical to an explicit no-budget render.
1500        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open fixture");
1501        let baseline = doc
1502            .render_page(
1503                0,
1504                &RenderOptions {
1505                    dpi: 144.0,
1506                    ..Default::default()
1507                },
1508            )
1509            .expect("baseline render");
1510        let explicit_none = doc
1511            .render_page(
1512                0,
1513                &RenderOptions {
1514                    dpi: 144.0,
1515                    max_pixels: None,
1516                    ..Default::default()
1517                },
1518            )
1519            .expect("explicit-none render");
1520        assert_eq!(baseline.width, explicit_none.width);
1521        assert_eq!(baseline.height, explicit_none.height);
1522        assert_eq!(baseline.pixels, explicit_none.pixels);
1523    }
1524
1525    #[test]
1526    fn render_max_pixels_budget_clamps_resolution() {
1527        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open fixture");
1528        let full = doc
1529            .render_page(
1530                0,
1531                &RenderOptions {
1532                    dpi: 288.0,
1533                    ..Default::default()
1534                },
1535            )
1536            .expect("full render");
1537        let full_px = full.width * full.height;
1538        // Budget well below the full pixel count must reduce output dimensions.
1539        let budget = full_px / 4;
1540        let capped = doc
1541            .render_page(
1542                0,
1543                &RenderOptions {
1544                    dpi: 288.0,
1545                    max_pixels: Some(budget),
1546                    ..Default::default()
1547                },
1548            )
1549            .expect("capped render");
1550        assert!(
1551            capped.width * capped.height <= full_px,
1552            "capped output must not exceed full output"
1553        );
1554        assert!(
1555            capped.width < full.width || capped.height < full.height,
1556            "budget below full pixel count must shrink at least one dimension"
1557        );
1558    }
1559
1560    #[test]
1561    fn render_max_pixels_large_budget_no_clamp() {
1562        // A budget larger than the rendered size must not change the output.
1563        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open fixture");
1564        let baseline = doc
1565            .render_page(
1566                0,
1567                &RenderOptions {
1568                    dpi: 72.0,
1569                    ..Default::default()
1570                },
1571            )
1572            .expect("baseline");
1573        let huge = doc
1574            .render_page(
1575                0,
1576                &RenderOptions {
1577                    dpi: 72.0,
1578                    max_pixels: Some(100_000_000),
1579                    ..Default::default()
1580                },
1581            )
1582            .expect("huge-budget render");
1583        assert_eq!(baseline.width, huge.width);
1584        assert_eq!(baseline.height, huge.height);
1585        assert_eq!(baseline.pixels, huge.pixels);
1586    }
1587
1588    #[test]
1589    fn render_page_with_config_srgb_matches_legacy_render_page() {
1590        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 rg")).expect("open rgb fixture");
1591        let legacy = doc
1592            .render_page(
1593                0,
1594                &RenderOptions {
1595                    dpi: 72.0,
1596                    ..Default::default()
1597                },
1598            )
1599            .expect("legacy render succeeds");
1600        let configured = doc
1601            .render_page_with_config(
1602                0,
1603                &RenderConfig {
1604                    color_mode: ColorMode::Srgb,
1605                    dpi: 72,
1606                },
1607            )
1608            .expect("configured render succeeds");
1609
1610        assert_eq!(legacy.width, configured.width);
1611        assert_eq!(legacy.height, configured.height);
1612        assert_eq!(legacy.pixel_format, PixelFormat::Rgba8);
1613        assert_eq!(configured.pixel_format, PixelFormat::Rgba8);
1614        assert_eq!(legacy.pixels, configured.pixels);
1615    }
1616
1617    #[test]
1618    fn render_page_with_config_preserve_cmyk_returns_cmyk_buffer() {
1619        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1620        let rendered = doc
1621            .render_page_with_config(
1622                0,
1623                &RenderConfig {
1624                    color_mode: ColorMode::PreserveCmyk,
1625                    dpi: 72,
1626                },
1627            )
1628            .expect("cmyk render succeeds");
1629
1630        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1631        assert_eq!(
1632            rendered.pixels.len(),
1633            rendered.width as usize * rendered.height as usize * 4
1634        );
1635        assert_eq!(
1636            pixel_at(&rendered, rendered.width / 2, rendered.height / 2),
1637            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1638        );
1639    }
1640
1641    #[test]
1642    fn render_page_with_config_simulate_cmyk_does_not_panic_on_cmyk_pdf() {
1643        let doc = PdfDocument::open(solid_fill_pdf_bytes("1 0 0 0 k")).expect("open cmyk fixture");
1644        let rendered = doc
1645            .render_page_with_config(
1646                0,
1647                &RenderConfig {
1648                    color_mode: ColorMode::SimulateCmyk,
1649                    dpi: 72,
1650                },
1651            )
1652            .expect("simulate cmyk render succeeds");
1653
1654        assert_eq!(rendered.pixel_format, PixelFormat::Rgba8);
1655        assert!(!rendered.pixels.is_empty());
1656    }
1657
1658    #[test]
1659    fn render_page_with_config_preserve_cmyk_mixed_page_preserves_only_cmyk_region() {
1660        let doc = PdfDocument::open(mixed_rgb_cmyk_pdf_bytes()).expect("open mixed fixture");
1661        let rendered = doc
1662            .render_page_with_config(
1663                0,
1664                &RenderConfig {
1665                    color_mode: ColorMode::PreserveCmyk,
1666                    dpi: 72,
1667                },
1668            )
1669            .expect("mixed render succeeds");
1670
1671        assert_eq!(
1672            pixel_at(&rendered, 54, 36),
1673            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1674        );
1675        assert_ne!(
1676            pixel_at(&rendered, 18, 36),
1677            crate::color::preserve_device_cmyk(1.0, 0.0, 0.0, 0.0)
1678        );
1679    }
1680
1681    #[test]
1682    fn render_page_with_config_preserve_cmyk_transparent_page_does_not_crash() {
1683        let doc =
1684            PdfDocument::open(transparent_cmyk_pdf_bytes()).expect("open transparent cmyk fixture");
1685        let rendered = doc
1686            .render_page_with_config(
1687                0,
1688                &RenderConfig {
1689                    color_mode: ColorMode::PreserveCmyk,
1690                    dpi: 72,
1691                },
1692            )
1693            .expect("transparent cmyk render succeeds");
1694
1695        assert_eq!(rendered.pixel_format, PixelFormat::Cmyk8);
1696        assert_eq!(
1697            rendered.pixels.len(),
1698            rendered.width as usize * rendered.height as usize * 4
1699        );
1700    }
1701
1702    #[test]
1703    fn render_page_with_config_preserve_cmyk_keeps_device_cmyk_image_bytes() {
1704        let doc = PdfDocument::open(cmyk_image_pdf_bytes()).expect("open cmyk image fixture");
1705        let rendered = doc
1706            .render_page_with_config(
1707                0,
1708                &RenderConfig {
1709                    color_mode: ColorMode::PreserveCmyk,
1710                    dpi: 72,
1711                },
1712            )
1713            .expect("cmyk image render succeeds");
1714
1715        assert_eq!(rendered.width, 2);
1716        assert_eq!(rendered.height, 1);
1717        assert_eq!(pixel_at(&rendered, 0, 0), [255, 0, 0, 0]);
1718        assert_eq!(pixel_at(&rendered, 1, 0), [0, 255, 0, 0]);
1719    }
1720}
1721
#[cfg(test)]
mod load_recovery_tests {
    use super::PdfDocument;

    /// Three objects and a trailer, but no xref table; `startxref` names the
    /// offset 999999, which lies far beyond the end of these bytes.
    const BROKEN_XREF_PDF: &[u8] = b"%PDF-1.7\n\
1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n\
2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n\
3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>\nendobj\n\
trailer\n<< /Root 1 0 R /Size 4 >>\nstartxref\n999999\n%%EOF";

    /// Assembles a one-page PDF whose xref table records the actual byte
    /// offset of every object and whose `startxref` points at that table.
    fn well_formed_single_page_pdf() -> Vec<u8> {
        let object_bodies: [&[u8]; 3] = [
            b"<< /Type /Catalog /Pages 2 0 R >>",
            b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
            b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 100 100] >>",
        ];

        let mut pdf: Vec<u8> = b"%PDF-1.7\n".to_vec();

        // Write each object, remembering where it starts (objects are numbered from 1).
        let mut object_offsets: Vec<usize> = Vec::with_capacity(object_bodies.len());
        for (number, body) in (1..).zip(object_bodies) {
            object_offsets.push(pdf.len());
            pdf.extend_from_slice(format!("{} 0 obj\n", number).as_bytes());
            pdf.extend_from_slice(body);
            pdf.extend_from_slice(b"\nendobj\n");
        }

        // Cross-reference table: the free entry for object 0, then one
        // in-use entry per object with a zero-padded 10-digit offset.
        let xref_off = pdf.len();
        pdf.extend_from_slice(b"xref\n0 4\n0000000000 65535 f \n");
        pdf.extend(
            object_offsets
                .iter()
                .flat_map(|o| format!("{o:010} 00000 n \n").into_bytes()),
        );

        // Trailer pointing back at the table just written.
        let trailer = format!("trailer\n<< /Root 1 0 R /Size 4 >>\nstartxref\n{xref_off}\n%%EOF");
        pdf.extend_from_slice(trailer.as_bytes());
        pdf
    }

    /// A document whose `startxref` points nowhere must still open, and the
    /// load-recovery report must flag that the xref was rebuilt.
    #[test]
    fn broken_xref_sets_xref_rebuilt() {
        let doc = PdfDocument::open(BROKEN_XREF_PDF.to_vec()).expect("recovers via xref rebuild");
        let recovery = doc.load_recovery();
        assert!(recovery.xref_rebuilt, "xref_rebuilt must be set");
    }

    /// A well-formed document with a correct xref must report neither an
    /// xref rebuild nor a page-tree rebuild.
    #[test]
    fn clean_document_reports_no_recovery() {
        let doc = PdfDocument::open(well_formed_single_page_pdf()).expect("clean doc opens");
        let r = doc.load_recovery();
        assert!(
            !(r.xref_rebuilt || r.page_tree_rebuilt),
            "a clean document must report no recovery; got {r:?}"
        );
    }
}
1771}