// crw_pdf/lib.rs

1// Vendored from firecrawl/pdf-inspector — suppress upstream clippy warnings
2#![allow(clippy::useless_vec, clippy::vec_init_then_push, unused_variables)]
3
4//! Smart PDF detection and text extraction using lopdf
5//!
6//! # Quick start
7//!
8//! ```no_run
9//! // Full processing (detect + extract + markdown) with defaults
10//! let result = crw_pdf::process_pdf("document.pdf").unwrap();
11//! println!("type: {:?}, pages: {}", result.pdf_type, result.page_count);
12//! if let Some(md) = &result.markdown {
13//!     println!("{md}");
14//! }
15//!
16//! // Fast metadata-only detection (no text extraction)
17//! let info = crw_pdf::detect_pdf("document.pdf").unwrap();
18//! println!("type: {:?}, pages: {}", info.pdf_type, info.page_count);
19//!
20//! // Custom options via builder
21//! use crw_pdf::{PdfOptions, ProcessMode};
22//! let result = crw_pdf::process_pdf_with_options(
23//!     "document.pdf",
24//!     PdfOptions::new().mode(ProcessMode::Analyze),
25//! ).unwrap();
26//! ```
27
28pub mod adobe_korea1;
29pub mod detector;
30pub mod extractor;
31pub mod glyph_names;
32pub mod markdown;
33pub mod process_mode;
34pub mod structure_tree;
35pub mod tables;
36pub mod text_utils;
37pub mod tounicode;
38pub mod types;
39
40pub use detector::{
41    detect_pdf_type, detect_pdf_type_mem, detect_pdf_type_mem_with_config,
42    detect_pdf_type_with_config, DetectionConfig, PdfType, PdfTypeResult, ScanStrategy,
43};
44pub use extractor::{extract_text, extract_text_with_positions, extract_text_with_positions_pages};
45pub use markdown::{
46    to_markdown, to_markdown_from_items, to_markdown_from_items_with_rects, MarkdownOptions,
47};
48pub use process_mode::ProcessMode;
49pub use types::{LayoutComplexity, PdfLine, PdfRect, TextItem};
50
51use lopdf::Document;
52use std::collections::HashSet;
53use std::path::Path;
54use tounicode::FontCMaps;
55
56// =========================================================================
57// Result type
58// =========================================================================
59
/// High-level PDF processing result.
///
/// Returned by [`process_pdf`], [`detect_pdf`], and the `*_with_options`
/// variants.  Which fields are populated depends on the [`ProcessMode`]
/// used: detect-only runs leave `markdown` as `None` and `layout` at its
/// default value.
#[derive(Debug)]
pub struct PdfProcessResult {
    /// The detected PDF type.
    pub pdf_type: PdfType,
    /// Markdown output (populated in [`ProcessMode::Full`], `None` otherwise).
    pub markdown: Option<String>,
    /// Page count.
    pub page_count: u32,
    /// Processing time in milliseconds.
    pub processing_time_ms: u64,
    /// 1-indexed page numbers that need OCR.
    pub pages_needing_ocr: Vec<u32>,
    /// Title from PDF metadata (if available).
    pub title: Option<String>,
    /// Detection confidence score (0.0–1.0).
    pub confidence: f32,
    /// Layout complexity analysis (tables, multi-column detection).
    pub layout: LayoutComplexity,
    /// `true` when broken font encodings are detected (garbled text,
    /// replacement characters). Clients should fall back to OCR.
    pub has_encoding_issues: bool,
}
83
84// =========================================================================
85// Options builder
86// =========================================================================
87
/// Configuration for [`process_pdf_with_options`] and friends.
///
/// Use the builder methods to customise behaviour:
///
/// ```
/// use crw_pdf::{PdfOptions, ProcessMode};
///
/// let opts = PdfOptions::new()
///     .mode(ProcessMode::Analyze)
///     .pages([1, 3, 5]);
/// ```
///
/// All fields are public, so the struct can also be built directly with
/// struct-update syntax (`PdfOptions { mode, ..PdfOptions::default() }`).
#[derive(Debug, Clone)]
pub struct PdfOptions {
    /// How far the pipeline should run (default: [`ProcessMode::Full`]).
    pub mode: ProcessMode,
    /// Detection configuration.
    pub detection: DetectionConfig,
    /// Markdown formatting options (only used in [`ProcessMode::Full`]).
    pub markdown: MarkdownOptions,
    /// Optional set of 1-indexed pages to process.  `None` = all pages.
    pub page_filter: Option<HashSet<u32>>,
}
110
111impl Default for PdfOptions {
112    fn default() -> Self {
113        Self {
114            mode: ProcessMode::Full,
115            detection: DetectionConfig::default(),
116            markdown: MarkdownOptions::default(),
117            page_filter: None,
118        }
119    }
120}
121
122impl PdfOptions {
123    /// Create options with all defaults ([`ProcessMode::Full`]).
124    pub fn new() -> Self {
125        Self::default()
126    }
127
128    /// Shorthand for detect-only options.
129    pub fn detect_only() -> Self {
130        Self {
131            mode: ProcessMode::DetectOnly,
132            ..Self::default()
133        }
134    }
135
136    /// Set the processing mode.
137    pub fn mode(mut self, mode: ProcessMode) -> Self {
138        self.mode = mode;
139        self
140    }
141
142    /// Set detection configuration.
143    pub fn detection(mut self, config: DetectionConfig) -> Self {
144        self.detection = config;
145        self
146    }
147
148    /// Set markdown formatting options.
149    pub fn markdown(mut self, options: MarkdownOptions) -> Self {
150        self.markdown = options;
151        self
152    }
153
154    /// Limit processing to specific 1-indexed pages.
155    pub fn pages(mut self, pages: impl IntoIterator<Item = u32>) -> Self {
156        self.page_filter = Some(pages.into_iter().collect());
157        self
158    }
159}
160
161// =========================================================================
162// Public convenience functions
163// =========================================================================
164
165/// Process a PDF file with full extraction (detect → extract → markdown).
166///
167/// This is the most common entry point.  Equivalent to
168/// `process_pdf_with_options(path, PdfOptions::new())`.
169pub fn process_pdf<P: AsRef<Path>>(path: P) -> Result<PdfProcessResult, PdfError> {
170    process_pdf_with_options(path, PdfOptions::new())
171}
172
173/// Fast metadata-only detection — no text extraction or markdown generation.
174///
175/// Equivalent to `process_pdf_with_options(path, PdfOptions::detect_only())`.
176pub fn detect_pdf<P: AsRef<Path>>(path: P) -> Result<PdfProcessResult, PdfError> {
177    process_pdf_with_options(path, PdfOptions::detect_only())
178}
179
180/// Process a PDF file with custom options.
181///
182/// The document is loaded **once** and shared between detection and extraction.
183pub fn process_pdf_with_options<P: AsRef<Path>>(
184    path: P,
185    options: PdfOptions,
186) -> Result<PdfProcessResult, PdfError> {
187    let start = std::time::Instant::now();
188    validate_pdf_file(&path)?;
189
190    // Load the document once — shared by detection AND extraction.
191    let (doc, page_count) = load_document_from_path(&path)?;
192
193    process_document(doc, page_count, options, start)
194}
195
196/// Process a PDF from a memory buffer with full extraction.
197pub fn process_pdf_mem(buffer: &[u8]) -> Result<PdfProcessResult, PdfError> {
198    process_pdf_mem_with_options(buffer, PdfOptions::new())
199}
200
201/// Fast metadata-only detection from a memory buffer.
202pub fn detect_pdf_mem(buffer: &[u8]) -> Result<PdfProcessResult, PdfError> {
203    process_pdf_mem_with_options(buffer, PdfOptions::detect_only())
204}
205
206/// Process a PDF from a memory buffer with custom options.
207///
208/// The buffer is parsed **once** and shared between detection and extraction.
209pub fn process_pdf_mem_with_options(
210    buffer: &[u8],
211    options: PdfOptions,
212) -> Result<PdfProcessResult, PdfError> {
213    let start = std::time::Instant::now();
214    validate_pdf_bytes(buffer)?;
215
216    let (doc, page_count) = load_document_from_mem(buffer)?;
217
218    process_document(doc, page_count, options, start)
219}
220
221// =========================================================================
222// Deprecated compat shims
223// =========================================================================
224
225/// Process a PDF file with custom detection and markdown configuration.
226#[deprecated(since = "0.2.0", note = "Use process_pdf_with_options instead")]
227pub fn process_pdf_with_config<P: AsRef<Path>>(
228    path: P,
229    config: DetectionConfig,
230    markdown_options: MarkdownOptions,
231) -> Result<PdfProcessResult, PdfError> {
232    process_pdf_with_options(
233        path,
234        PdfOptions::new()
235            .detection(config)
236            .markdown(markdown_options),
237    )
238}
239
240/// Process a PDF file with custom configuration and optional page filter.
241#[deprecated(since = "0.2.0", note = "Use process_pdf_with_options instead")]
242pub fn process_pdf_with_config_pages<P: AsRef<Path>>(
243    path: P,
244    config: DetectionConfig,
245    markdown_options: MarkdownOptions,
246    page_filter: Option<&HashSet<u32>>,
247) -> Result<PdfProcessResult, PdfError> {
248    let mut opts = PdfOptions::new()
249        .detection(config)
250        .markdown(markdown_options);
251    opts.page_filter = page_filter.cloned();
252    process_pdf_with_options(path, opts)
253}
254
255/// Process PDF from memory buffer with custom detection and markdown configuration.
256#[deprecated(since = "0.2.0", note = "Use process_pdf_mem_with_options instead")]
257pub fn process_pdf_mem_with_config(
258    buffer: &[u8],
259    config: DetectionConfig,
260    markdown_options: MarkdownOptions,
261) -> Result<PdfProcessResult, PdfError> {
262    process_pdf_mem_with_options(
263        buffer,
264        PdfOptions::new()
265            .detection(config)
266            .markdown(markdown_options),
267    )
268}
269
270// =========================================================================
271// Internal: single-load document pipeline
272// =========================================================================
273
274/// Load a PDF from disk, returning the parsed document and page count.
275///
276/// `Document::load_metadata` for page count + `Document::load` for content
277/// are combined here, but lopdf loads the full doc in `load()` so we extract
278/// page count from it directly to avoid the metadata-only round-trip.
279fn load_document_from_path<P: AsRef<Path>>(path: P) -> Result<(Document, u32), PdfError> {
280    let buffer = std::fs::read(&path)?;
281    load_document_from_mem(&buffer)
282}
283
284/// Load a PDF from a memory buffer.
285fn load_document_from_mem(buffer: &[u8]) -> Result<(Document, u32), PdfError> {
286    // Fix malformed struct element names before parsing. Some PDF generators
287    // write bare names (/S Code) instead of proper PDF names (/S /Code), which
288    // causes lopdf to silently drop the entire object.
289    let fixed = structure_tree::fix_bare_struct_names(buffer);
290    let buf = fixed.as_ref();
291
292    let doc = match Document::load_mem(buf) {
293        Ok(d) => d,
294        Err(ref e) if is_encrypted_lopdf_error(e) => Document::load_mem_with_password(buf, "")?,
295        Err(e) => return Err(e.into()),
296    };
297    let page_count = doc.get_pages().len() as u32;
298    Ok((doc, page_count))
299}
300
/// Core processing pipeline operating on a pre-loaded document.
///
/// Pipeline stages:
/// 1. Detection (type, confidence, OCR-flagged pages, title).
/// 2. Early return for `DetectOnly` mode and for Scanned/ImageBased PDFs.
/// 3. Positioned text extraction (with an invisible-text retry for Mixed PDFs).
/// 4. Structure-tree parsing for tagged PDFs.
/// 5. Markdown generation + layout analysis.
/// 6. Garbage-text heuristics that may reclassify the PDF, drop unreliable
///    markdown, or expand the OCR page list.
fn process_document(
    doc: Document,
    page_count: u32,
    options: PdfOptions,
    start: std::time::Instant,
) -> Result<PdfProcessResult, PdfError> {
    // Step 1 — Detection (cheap: scans content streams for text operators)
    let detection = detector::detect_from_document(&doc, page_count, &options.detection)?;
    let pdf_type = detection.pdf_type;
    let pages_needing_ocr = detection.pages_needing_ocr;
    let title = detection.title;
    let confidence = detection.confidence;

    // DetectOnly → return immediately
    if options.mode == ProcessMode::DetectOnly {
        return Ok(PdfProcessResult {
            pdf_type,
            markdown: None,
            page_count,
            processing_time_ms: start.elapsed().as_millis() as u64,
            pages_needing_ocr,
            title,
            confidence,
            layout: LayoutComplexity::default(),
            has_encoding_issues: false,
        });
    }

    // Scanned / ImageBased → nothing to extract
    if matches!(pdf_type, PdfType::Scanned | PdfType::ImageBased) {
        return Ok(PdfProcessResult {
            pdf_type,
            markdown: None,
            page_count,
            processing_time_ms: start.elapsed().as_millis() as u64,
            pages_needing_ocr,
            title,
            confidence,
            layout: LayoutComplexity::default(),
            has_encoding_issues: false,
        });
    }

    // Step 2 — Extraction (reuses the already-loaded document)
    let extracted = {
        let font_cmaps = FontCMaps::from_doc(&doc);
        let result = extractor::extract_positioned_text_from_doc(
            &doc,
            &font_cmaps,
            options.page_filter.as_ref(),
        );

        // For Mixed/template PDFs: if normal extraction produces garbage text
        // (mostly non-alphanumeric), retry with invisible (Tr=3) text included.
        // This unlocks OCR text layers behind scanned images.
        if pdf_type == PdfType::Mixed {
            if let Ok((ref items, _, _)) = result.as_ref().map(|(e, _, _)| e) {
                // Sample only the first 200 items — enough to judge quality
                // without concatenating the whole document.
                let sample: String = items.iter().take(200).map(|i| i.text.as_str()).collect();
                if is_garbage_text(&sample) || sample.trim().is_empty() {
                    extractor::extract_positioned_text_include_invisible(
                        &doc,
                        &font_cmaps,
                        options.page_filter.as_ref(),
                    )
                } else {
                    result
                }
            } else {
                // Normal extraction failed — try invisible as fallback
                extractor::extract_positioned_text_include_invisible(
                    &doc,
                    &font_cmaps,
                    options.page_filter.as_ref(),
                )
            }
        } else {
            result
        }
    };

    // For Mixed PDFs, extraction failure is non-fatal
    let extracted = if pdf_type == PdfType::Mixed {
        extracted.ok()
    } else {
        Some(extracted?)
    };

    // Parse structure tree for tagged PDFs (reuses the loaded document)
    let (struct_roles, struct_tables) = structure_tree::StructTree::from_doc(&doc)
        .map(|tree| {
            let page_ids = doc.get_pages();
            let roles = tree.mcid_to_roles(&page_ids);
            let tables = tree.extract_tables(&page_ids);
            if !roles.is_empty() {
                log::debug!(
                    "structure tree: {} pages with MCID roles, {} total MCIDs, {} tagged tables",
                    roles.len(),
                    tree.mcid_count(),
                    tables.len()
                );
            }
            let roles = if roles.is_empty() { None } else { Some(roles) };
            (roles, tables)
        })
        .unwrap_or((None, Vec::new()));

    // Step 3 — Markdown generation + layout analysis (skipped entirely when
    // extraction failed on a Mixed PDF).
    let (markdown, layout, has_encoding_issues, gid_pages) = match extracted {
        Some(((items, rects, lines), page_thresholds, gid_encoded_pages)) => {
            // For TextBased PDFs with pages flagged for OCR (Identity-H or
            // Type3 fonts without ToUnicode), check whether the CID-as-Unicode
            // passthrough actually produced readable text.  If a page's text
            // is garbage, strip its items so we don't emit mojibake.
            // Only applies to TextBased — for Mixed PDFs, OCR flags come from
            // template images rather than font encoding issues.
            let (items, rects, lines) =
                if pages_needing_ocr.is_empty() || pdf_type != PdfType::TextBased {
                    (items, rects, lines)
                } else {
                    let ocr_set: std::collections::HashSet<u32> =
                        pages_needing_ocr.iter().copied().collect();
                    // Collect text per OCR-flagged page and check quality
                    let mut garbage_pages: std::collections::HashSet<u32> =
                        std::collections::HashSet::new();
                    for &pg in &ocr_set {
                        let page_text: String = items
                            .iter()
                            .filter(|i| i.page == pg)
                            .map(|i| i.text.as_str())
                            .collect();
                        if is_cid_garbage(&page_text) {
                            garbage_pages.insert(pg);
                        }
                    }
                    if garbage_pages.is_empty() {
                        (items, rects, lines)
                    } else {
                        log::debug!(
                            "suppressing garbage text from OCR-flagged pages: {:?}",
                            garbage_pages
                        );
                        // Drop items/rects/lines from every garbage page.
                        let items: Vec<_> = items
                            .into_iter()
                            .filter(|i| !garbage_pages.contains(&i.page))
                            .collect();
                        let rects: Vec<_> = rects
                            .into_iter()
                            .filter(|r| !garbage_pages.contains(&r.page))
                            .collect();
                        let lines: Vec<_> = lines
                            .into_iter()
                            .filter(|l| !garbage_pages.contains(&l.page))
                            .collect();
                        (items, rects, lines)
                    }
                };

            let layout = compute_layout_complexity(&items, &rects, &lines);

            // Analyze mode stops after layout analysis — no markdown.
            let md = if options.mode == ProcessMode::Analyze {
                None
            } else {
                Some(markdown::to_markdown_from_items_with_rects_and_lines(
                    items,
                    options.markdown,
                    &rects,
                    &lines,
                    &page_thresholds,
                    struct_roles.as_ref(),
                    &struct_tables,
                ))
            };

            let enc = md.as_ref().is_some_and(|m| detect_encoding_issues(m));
            (md, layout, enc, gid_encoded_pages)
        }
        None => (
            None,
            LayoutComplexity::default(),
            false,
            std::collections::HashSet::new(),
        ),
    };

    // If the extracted text is predominantly garbage (non-alphanumeric) and
    // the PDF is image-backed (Mixed/template), upgrade to Scanned — the text
    // layer comes from a bad OCR pass, and callers should use proper OCR.
    let (pdf_type, markdown, confidence) =
        if pdf_type == PdfType::Mixed && markdown.as_ref().is_some_and(|m| is_garbage_text(m)) {
            (PdfType::Scanned, None, 0.95)
        } else {
            (pdf_type, markdown, confidence)
        };

    // If a TextBased PDF produces garbage text, the fonts are undecodable
    // (e.g. Identity-H without ToUnicode for non-Latin scripts like Cyrillic).
    // Drop the useless markdown and flag all pages for OCR.
    let (markdown, has_encoding_issues, force_ocr_all) = if pdf_type == PdfType::TextBased
        && markdown.as_ref().is_some_and(|m| is_garbage_text(m))
    {
        log::debug!("TextBased PDF has garbage text — flagging all pages for OCR");
        (None, true, true)
    } else {
        (markdown, has_encoding_issues, false)
    };

    // Add pages with gid-encoded fonts (unresolvable encoding) to OCR list.
    // When ALL pages have gid-encoded fonts, suppress unreliable markdown.
    let all_gid = !gid_pages.is_empty() && gid_pages.len() as u32 >= page_count;
    let mut pages_needing_ocr = pages_needing_ocr;
    if force_ocr_all {
        pages_needing_ocr = (1..=page_count).collect();
    }
    if !gid_pages.is_empty() {
        log::debug!("pages with gid-encoded fonts (need OCR): {:?}", gid_pages);
        for page in gid_pages {
            if !pages_needing_ocr.contains(&page) {
                pages_needing_ocr.push(page);
            }
        }
        pages_needing_ocr.sort_unstable();
    }

    // Detect sparse extraction: when a TEXT-BASED PDF produces very few
    // characters per page, the text is likely embedded in images/forms
    // that need OCR.  Flag all pages for OCR in this case.
    // Only check when markdown was actually generated (not in Analyze mode).
    if pdf_type == PdfType::TextBased
        && page_count > 0
        && pages_needing_ocr.is_empty()
        && markdown.is_some()
    {
        let md_len = markdown.as_ref().map_or(0, |m| m.len());
        let chars_per_page = md_len as f32 / page_count as f32;
        if chars_per_page < 50.0 && md_len < 500 {
            log::debug!(
                "sparse extraction: {:.0} chars/page — recommending OCR for all {} pages",
                chars_per_page,
                page_count
            );
            pages_needing_ocr = (1..=page_count).collect();
        }
    }

    let markdown = if all_gid {
        log::debug!(
            "all {} pages have gid-encoded fonts — suppressing markdown output",
            page_count
        );
        None
    } else {
        markdown
    };

    // Assemble the final result with all heuristic adjustments applied.
    Ok(PdfProcessResult {
        pdf_type,
        markdown,
        page_count,
        processing_time_ms: start.elapsed().as_millis() as u64,
        pages_needing_ocr,
        title,
        confidence,
        layout,
        has_encoding_issues,
    })
}
567
568// =========================================================================
569// Internal helpers
570// =========================================================================
571
/// Detect broken font encodings in extracted markdown text.
///
/// Two independent signals:
/// 1. **U+FFFD**: any replacement character indicates decode failures.
/// 2. **Dollar-as-space**: pattern like `Word$Word$Word` where `$` acts as a
///    word separator due to broken ToUnicode CMaps. Triggers when either:
///    - more than 50% of `$` sit between letters (clear substitution), OR
///    - more than 20 letter-dollar-letter occurrences (even if some `$` are
///      trailing/leading separators, 20+ is far beyond normal financial text).
fn detect_encoding_issues(markdown: &str) -> bool {
    // Signal 1: replacement characters from failed decodes.
    if markdown.contains('\u{FFFD}') {
        return true;
    }

    // Signal 2: `$` standing in for word separators.  Fewer than 11 dollars
    // is within the range of ordinary financial text.
    let total_dollars = markdown.matches('$').count();
    if total_dollars <= 10 {
        return false;
    }
    let letter_dollar_letter = markdown
        .as_bytes()
        .windows(3)
        .filter(|w| w[1] == b'$' && w[0].is_ascii_alphabetic() && w[2].is_ascii_alphabetic())
        .count();
    letter_dollar_letter > 20 || letter_dollar_letter * 2 > total_dollars
}
607
/// Check if extracted text is predominantly garbage (non-alphanumeric).
///
/// Broken font encodings produce text like "----1-.-.-.___  --.-. .._ I_---."
/// where most characters are punctuation/symbols. Real text in any language
/// has >50% alphanumeric characters.  Whitespace and the markdown syntax
/// characters this crate itself emits (`#`, `*`, `|`, `-`) are excluded from
/// the count; fewer than 50 counted characters is never considered garbage.
fn is_garbage_text(markdown: &str) -> bool {
    let counted = markdown
        .chars()
        .filter(|c| !c.is_whitespace())
        .filter(|c| !matches!(c, '#' | '*' | '|' | '-' | '\n'));
    let (alphanum, other) = counted.fold((0usize, 0usize), |(a, o), ch| {
        if ch.is_alphanumeric() {
            (a + 1, o)
        } else {
            (a, o + 1)
        }
    });
    let total = alphanum + other;
    total >= 50 && alphanum * 2 < total
}
633
634/// Detect garbage from failed CID-to-Unicode mapping on Identity-H fonts.
635///
636/// When CID values don't correspond to Unicode codepoints, the raw bytes often
637/// produce characters in the C1 control range (U+0080–U+009F) or Private Use
638/// Area, mixed with random Latin Extended characters.  Valid text in any
639/// language almost never contains C1 controls.  We also fall back to the
640/// general `is_garbage_text` check for non-alphanumeric-heavy patterns.
641fn is_cid_garbage(text: &str) -> bool {
642    if is_garbage_text(text) {
643        return true;
644    }
645    let mut total = 0usize;
646    let mut c1_control = 0usize;
647    let mut high_latin = 0usize;
648    for ch in text.chars() {
649        if ch.is_whitespace() {
650            continue;
651        }
652        total += 1;
653        // C1 control characters (U+0080–U+009F) — almost never in real text
654        if ('\u{0080}'..='\u{009F}').contains(&ch) {
655            c1_control += 1;
656        }
657        // High Latin-1 (U+00A0–U+00FF) — legitimate in Western European text
658        // but when combined with ASCII in CID passthrough, indicates mojibake
659        // from CID values being misinterpreted as Latin-1 characters.
660        if ('\u{00A0}'..='\u{00FF}').contains(&ch) {
661            high_latin += 1;
662        }
663    }
664    if total < 5 {
665        return false;
666    }
667    // If ≥5% of non-whitespace chars are C1 controls, it's garbage
668    if c1_control * 20 >= total {
669        return true;
670    }
671    // If ≥40% of non-whitespace chars are high Latin-1 AND the text has few
672    // ASCII letters, it's likely CID-as-Latin-1 mojibake (Japanese/CJK PDFs
673    // where CID values 0x80-0xFF become accented Latin characters).
674    let ascii_letters = text.chars().filter(|c| c.is_ascii_alphabetic()).count();
675    high_latin * 5 >= total * 2 && ascii_letters * 3 < total
676}
677
678/// Analyse extracted items and rects for layout complexity.
679fn compute_layout_complexity(
680    items: &[types::TextItem],
681    rects: &[types::PdfRect],
682    lines: &[types::PdfLine],
683) -> LayoutComplexity {
684    use markdown::analysis::calculate_font_stats_from_items;
685
686    // --- Collect unique pages ---
687    let mut seen_pages: Vec<u32> = items.iter().map(|i| i.page).collect();
688    seen_pages.sort();
689    seen_pages.dedup();
690
691    let font_stats = calculate_font_stats_from_items(items);
692    let base_size = font_stats.most_common_size;
693
694    // --- Tables: use rect-based → line-based → heuristic detectors per page,
695    //     with side-by-side band splitting ---
696    let mut pages_with_tables: Vec<u32> = Vec::new();
697    for &page in &seen_pages {
698        let page_items: Vec<&types::TextItem> = items.iter().filter(|i| i.page == page).collect();
699
700        // Check for side-by-side layout
701        let owned_items: Vec<types::TextItem> = page_items.iter().map(|i| (*i).clone()).collect();
702        let bands = markdown::split_side_by_side(&owned_items);
703
704        let band_ranges: Vec<(f32, f32)> = if bands.is_empty() {
705            // Single region — use sentinel range that includes everything
706            vec![(f32::MIN, f32::MAX)]
707        } else {
708            bands
709        };
710
711        let mut found_table = false;
712        for &(x_lo, x_hi) in &band_ranges {
713            let margin = 2.0;
714            let band_items: Vec<types::TextItem> = owned_items
715                .iter()
716                .filter(|item| {
717                    x_lo == f32::MIN || (item.x >= x_lo - margin && item.x < x_hi + margin)
718                })
719                .cloned()
720                .collect();
721
722            let band_rects: Vec<types::PdfRect> = if x_lo == f32::MIN {
723                rects.iter().filter(|r| r.page == page).cloned().collect()
724            } else {
725                markdown::filter_rects_to_band(rects, page, x_lo, x_hi)
726            };
727
728            let band_lines: Vec<types::PdfLine> = if x_lo == f32::MIN {
729                lines.iter().filter(|l| l.page == page).cloned().collect()
730            } else {
731                markdown::filter_lines_to_band(lines, page, x_lo, x_hi)
732            };
733
734            let (rect_tables, _) = tables::detect_tables_from_rects(&band_items, &band_rects, page);
735            if !rect_tables.is_empty() {
736                found_table = true;
737                break;
738            }
739            let line_tables = tables::detect_tables_from_lines(&band_items, &band_lines, page);
740            if !line_tables.is_empty() {
741                found_table = true;
742                break;
743            }
744            // Heuristic fallback for borderless tables
745            let heuristic_tables = tables::detect_tables(&band_items, base_size, false);
746            if !heuristic_tables.is_empty() {
747                found_table = true;
748                break;
749            }
750        }
751        if found_table {
752            pages_with_tables.push(page);
753        }
754    }
755
756    let mut pages_with_columns: Vec<u32> = Vec::new();
757    for page in seen_pages {
758        let cols = extractor::detect_columns(items, page, pages_with_tables.contains(&page));
759        if cols.len() >= 2 {
760            pages_with_columns.push(page);
761        }
762    }
763
764    let is_complex = !pages_with_tables.is_empty() || !pages_with_columns.is_empty();
765
766    LayoutComplexity {
767        is_complex,
768        pages_with_tables,
769        pages_with_columns,
770    }
771}
772
/// Errors produced by this crate's loading, detection, and extraction APIs.
#[derive(Debug, thiserror::Error)]
pub enum PdfError {
    /// Underlying filesystem/stream I/O failure.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// A lopdf failure not covered by a more specific variant (stringified).
    #[error("PDF parsing error: {0}")]
    Parse(String),
    /// The document is encrypted and could not be opened
    /// (see the `From<lopdf::Error>` mapping below for which lopdf errors
    /// collapse into this variant).
    #[error("PDF is encrypted")]
    Encrypted,
    /// Structural damage: broken xref, bad offsets, or mismatched objects.
    #[error("Invalid PDF structure")]
    InvalidStructure,
    /// The input bytes are not a PDF at all; the payload describes what the
    /// file appears to be instead.
    #[error("Not a PDF: {0}")]
    NotAPdf(String),
}
786
// Map lopdf's fine-grained error taxonomy onto this crate's coarser
// `PdfError` variants.  Arm order matters: the guarded arms must come
// before the broader catch-alls.
impl From<lopdf::Error> for PdfError {
    fn from(e: lopdf::Error) -> Self {
        match e {
            // Pass I/O errors through unchanged.
            lopdf::Error::IO(io_err) => PdfError::Io(io_err),
            // All encryption/password-related failures collapse to Encrypted.
            lopdf::Error::Decryption(_)
            | lopdf::Error::InvalidPassword
            | lopdf::Error::AlreadyEncrypted
            | lopdf::Error::UnsupportedSecurityHandler(_) => PdfError::Encrypted,
            // lopdf sometimes reports encryption via Unimplemented("…encrypted…").
            lopdf::Error::Unimplemented(msg) if msg.contains("encrypted") => PdfError::Encrypted,
            // A bad header means the bytes were never a PDF in the first place.
            lopdf::Error::Parse(ref pe) if pe.to_string().contains("invalid file header") => {
                PdfError::NotAPdf("invalid PDF file header".to_string())
            }
            // Xref/object-addressing failures indicate structural damage.
            lopdf::Error::MissingXrefEntry
            | lopdf::Error::Xref(_)
            | lopdf::Error::IndirectObject { .. }
            | lopdf::Error::ObjectIdMismatch
            | lopdf::Error::InvalidObjectStream(_)
            | lopdf::Error::InvalidOffset(_) => PdfError::InvalidStructure,
            // Everything else becomes a stringly-typed parse error.
            other => PdfError::Parse(other.to_string()),
        }
    }
}
809
810/// Check whether a `lopdf::Error` represents an encryption-related failure.
811pub(crate) fn is_encrypted_lopdf_error(e: &lopdf::Error) -> bool {
812    matches!(
813        e,
814        lopdf::Error::Decryption(_)
815            | lopdf::Error::InvalidPassword
816            | lopdf::Error::AlreadyEncrypted
817            | lopdf::Error::UnsupportedSecurityHandler(_)
818    ) || matches!(e, lopdf::Error::Unimplemented(msg) if msg.contains("encrypted"))
819}
820
821// ---------------------------------------------------------------------------
822// PDF validation helpers
823// ---------------------------------------------------------------------------
824
/// Strip UTF-8 BOM and leading ASCII whitespace from a byte slice.
fn strip_bom_and_whitespace(bytes: &[u8]) -> &[u8] {
    // Drop the BOM first (if present), then skip any run of ASCII whitespace.
    let rest = bytes.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(bytes);
    match rest.iter().position(|b| !b.is_ascii_whitespace()) {
        Some(first_content) => &rest[first_content..],
        // All-whitespace input trims down to the empty slice.
        None => &[],
    }
}
838
/// Case-insensitive prefix check on byte slices.
fn starts_with_ci(haystack: &[u8], needle: &[u8]) -> bool {
    // `get` yields `None` when the haystack is shorter than the needle.
    haystack
        .get(..needle.len())
        .map_or(false, |prefix| prefix.eq_ignore_ascii_case(needle))
}
849
850/// Try to identify what kind of file the bytes represent.
851fn detect_file_type_hint(bytes: &[u8]) -> String {
852    if bytes.is_empty() {
853        return "file is empty".to_string();
854    }
855
856    let trimmed = strip_bom_and_whitespace(bytes);
857
858    // HTML
859    if starts_with_ci(trimmed, b"<!doctype html")
860        || starts_with_ci(trimmed, b"<html")
861        || starts_with_ci(trimmed, b"<head")
862        || starts_with_ci(trimmed, b"<body")
863    {
864        return "file appears to be HTML".to_string();
865    }
866
867    // XML (but not HTML)
868    if trimmed.starts_with(b"<?xml") || trimmed.starts_with(b"<") {
869        if starts_with_ci(trimmed, b"<?xml") {
870            return "file appears to be XML".to_string();
871        }
872        if trimmed.starts_with(b"<") && !trimmed.starts_with(b"<%") {
873            return "file appears to be XML".to_string();
874        }
875    }
876
877    // JSON
878    if trimmed.starts_with(b"{") || trimmed.starts_with(b"[") {
879        return "file appears to be JSON".to_string();
880    }
881
882    // PNG
883    if bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
884        return "file appears to be a PNG image".to_string();
885    }
886
887    // JPEG
888    if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) {
889        return "file appears to be a JPEG image".to_string();
890    }
891
892    // ZIP / Office documents
893    if bytes.starts_with(&[0x50, 0x4B, 0x03, 0x04]) {
894        return "file appears to be a ZIP archive (possibly an Office document)".to_string();
895    }
896
897    // If it looks like mostly printable ASCII/UTF-8, call it plain text
898    let sample = &bytes[..bytes.len().min(512)];
899    let printable = sample
900        .iter()
901        .filter(|&&b| b.is_ascii_graphic() || b.is_ascii_whitespace())
902        .count();
903    if printable > sample.len() * 3 / 4 {
904        return "file appears to be plain text".to_string();
905    }
906
907    "file is not a PDF".to_string()
908}
909
910/// Validate that a byte buffer looks like a PDF (has `%PDF-` magic).
911///
912/// Scans the first 1024 bytes, allowing for a UTF-8 BOM and leading whitespace.
913pub(crate) fn validate_pdf_bytes(buffer: &[u8]) -> Result<(), PdfError> {
914    if buffer.is_empty() {
915        return Err(PdfError::NotAPdf(detect_file_type_hint(buffer)));
916    }
917
918    let header = &buffer[..buffer.len().min(1024)];
919    let trimmed = strip_bom_and_whitespace(header);
920
921    if trimmed.starts_with(b"%PDF-") {
922        Ok(())
923    } else {
924        Err(PdfError::NotAPdf(detect_file_type_hint(buffer)))
925    }
926}
927
928/// Validate that a file on disk looks like a PDF.
929///
930/// Reads only the first 1024 bytes and delegates to [`validate_pdf_bytes`].
931pub(crate) fn validate_pdf_file<P: AsRef<Path>>(path: P) -> Result<(), PdfError> {
932    use std::io::Read;
933    let mut file = std::fs::File::open(path)?;
934    let mut buf = [0u8; 1024];
935    let n = file.read(&mut buf)?;
936    validate_pdf_bytes(&buf[..n])
937}
938
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_encoding_issues_fffd() {
        // U+FFFD replacement characters indicate a failed decode.
        let text = "Some text with \u{FFFD} replacement";
        assert!(detect_encoding_issues(text));
    }

    #[test]
    fn test_detect_encoding_issues_dollar_as_space() {
        // Simulates broken CMap: "$Workshop$on$Chest$Wall$Deformities$and$..."
        assert!(detect_encoding_issues(
            "Last$advanced$Book$Programm$3th$Workshop$on$Chest$Wall$Deformities$and$More"
        ));
    }

    #[test]
    fn test_detect_encoding_issues_financial_text() {
        // Legitimate dollar signs in financial text should NOT trigger
        let financial = "Revenue was $100M in Q1, up from $90M. Costs: $50M, $30M, $20M, $15M, $12M, $8M, $5M, $3M, $2M, $1M, $500K.";
        assert!(!detect_encoding_issues(financial));
    }

    #[test]
    fn test_detect_encoding_issues_clean_text() {
        let clean = "Normal markdown text with no issues.";
        assert!(!detect_encoding_issues(clean));
    }

    #[test]
    fn test_detect_encoding_issues_few_dollars() {
        // Under threshold of 10 total dollars — should not trigger
        assert!(!detect_encoding_issues("a$b c$d e$f"));
    }

    #[test]
    fn test_garbage_text_detection() {
        // Simulates garbage output from Identity-H fonts without ToUnicode.
        // Needs >= 50 non-whitespace chars and < 50% alphanumeric.
        let garbage = ",&<X ~%5&8-!A ~*(!,-!U (/#!U X ~#/=U 9/%*(!U !(  X \
                       (%U-(-/ V %&((8-#&&< *,(6--< %5&8-!( (,(/! #/<5U X \
                       º&( >/5 /5&(#(8-!5 *,(6--( *,%@/-A W";
        assert!(is_garbage_text(garbage));

        // Normal prose and non-Latin scripts must both pass as non-garbage.
        let normal = "This is a normal paragraph with words and sentences that contains enough characters to pass the threshold.";
        let cyrillic =
            "Роботизированные технологии комплексы для производства металлургических предприятий";
        assert!(!is_garbage_text(normal));
        assert!(!is_garbage_text(cyrillic));
    }

    #[test]
    fn test_cid_garbage_detection() {
        // Simulates CID garbage from Identity-H fonts: Latin Extended chars
        // mixed with C1 control characters (U+0080–U+009F).
        let cid_garbage = "Ë>íÓ\tý\r\u{0088}æ&Ït\u{0094}äí;\ný;wAL¢©èåD\rü£\
                           qq\u{0096}¶Í Æ\réá; Ô 7G\u{008B}ý;èÕç¢ £ ý;C";
        assert!(
            is_cid_garbage(cid_garbage),
            "CID garbage with C1 controls should be detected"
        );

        // CJK passthrough text (CID-as-Unicode) must NOT be flagged.
        let korean = "본 가격표는 국내 거주 중인 외국인을 위한 한국어 가격표의 비공식 번역본입니다";
        let japanese = "羽田空港新飛行経路に係る航空機騒音の測定結果";
        assert!(
            !is_cid_garbage(korean),
            "Valid Korean text should not be flagged as garbage"
        );
        assert!(
            !is_cid_garbage(japanese),
            "Valid Japanese text should not be flagged as garbage"
        );
    }
}