Skip to main content

orbok_extract/
pdf.rs

1//! PDF text extraction via lopdf (RFC-022 §6).
2//!
3//! ## RFC-022 evaluation
4//!
5//! Two backends were evaluated:
6//!
7//! | Backend | Language | Japanese | License | Notes |
8//! |---|---|---|---|---|
9//! | **lopdf** (selected) | Rust | UTF-8 text only | MIT | Fast, pure Rust, page-level |
10//! | pdfium | Rust binding | Full Unicode | Apache 2.0 | Requires native library |
11//!
12//! **Selected: lopdf** for v0.7. Reasons: pure Rust (no FFI), compiles
13//! everywhere, adequate for text-heavy PDFs. Limitation: scanned or
14//! image-only PDFs produce no text (location_quality = Unknown).
15//!
16//! pdfium is tracked as a future backend for richer PDF support once
17//! the native dependency packaging is solved (RFC-022 deferred).
18//!
19//! ## Security (RFC-015 §14)
20//!
21//! PDF parsing is treated as hostile input. All errors are caught and
22//! returned as `OrbokError::Extraction` with category
23//! `ParserError` or `EncryptedDocument`. Panics from lopdf's parser are
24//! caught via `std::panic::catch_unwind` in the extraction driver
25//! (RFC-005 §13 isolation requirement).
26//!
27//! ## Location quality
28//!
29//! lopdf reports text at page granularity. All segments carry
30//! `LocationQuality::PageOnly`. Line-level offsets are not available;
31//! UI must not show line numbers for PDF results.
32//!
33//! ## Japanese
34//!
35//! UTF-8 encoded PDFs (common in modern Japanese documents) extract
36//! correctly. Legacy SJIS/EUC PDFs may produce garbled text; the
37//! extractor does not attempt character-encoding conversion in v0.7.
38
39use crate::normalize::normalize_document as normalize_text;
40use crate::types::{
41    DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
42};
43use orbok_core::versions::NORMALIZATION_VERSION;
44use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
45use orbok_fs::ValidatedPath;
46
/// PDF extractor using lopdf (RFC-022).
///
/// Stateless unit struct: all behavior lives in its [`DocumentExtractor`]
/// implementation, which handles the `pdf` extension and emits segments
/// tagged with `LocationQuality::PageOnly`.
pub struct PdfExtractor;
49
/// Identifier returned by `PdfExtractor::name` and recorded in
/// `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "pdf-lopdf";
/// Version tag returned by `PdfExtractor::version` and recorded in
/// `ExtractOutput::extractor_version`.
const EXTRACTOR_VERSION: &str = "v1";
52
53impl DocumentExtractor for PdfExtractor {
54    fn name(&self) -> &'static str {
55        EXTRACTOR_NAME
56    }
57
58    fn version(&self) -> &'static str {
59        EXTRACTOR_VERSION
60    }
61
62    fn supported_extensions(&self) -> &'static [&'static str] {
63        &["pdf"]
64    }
65
66    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
67        let doc = lopdf::Document::load(&path.canonical).map_err(|e| {
68            let category =
69                if e.to_string().contains("password") || e.to_string().contains("encrypt") {
70                    ErrorCategory::EncryptedDocument
71                } else {
72                    ErrorCategory::ParserError
73                };
74            OrbokError::Extraction {
75                category,
76                message: format!("lopdf: {e}"),
77            }
78        })?;
79
80        let mut segments = Vec::new();
81        let mut total_chars = 0u64;
82        let pages: Vec<(u32, u16)> = doc.page_iter().collect();
83        let total_pages = pages.len() as u32;
84
85        for (page_idx, (obj_id, _gen_id)) in pages.iter().enumerate() {
86            let page_num = (page_idx + 1) as u32;
87            let text = extract_page_text(&doc, *obj_id, page_num)?;
88            if text.trim().is_empty() {
89                continue;
90            }
91            let normalized = normalize_text(&text);
92            if normalized.trim().is_empty() {
93                continue;
94            }
95            total_chars += normalized.len() as u64;
96            segments.push(ExtractedSegment {
97                kind: SegmentKind::Other,
98                text: normalized,
99                line_start: page_num,
100                line_end: page_num,
101                heading_path: Some(format!("Page {page_num}")),
102                location_quality: LocationQuality::PageOnly,
103            });
104        }
105
106        if segments.is_empty() {
107            tracing::debug!(
108                path = %path.canonical.display(),
109                pages = total_pages,
110                "PDF produced no text — may be scanned/image-only"
111            );
112        }
113
114        Ok(ExtractOutput {
115            extractor_name: EXTRACTOR_NAME.to_string(),
116            extractor_version: EXTRACTOR_VERSION.to_string(),
117            normalization_version: NORMALIZATION_VERSION.to_string(),
118            segments,
119            char_count: total_chars,
120        })
121    }
122}
123
124/// Extract text from one page, returning an empty string on any error.
125///
126/// lopdf's `extract_text` returns a `Result<String>`. Errors are
127/// swallowed per RFC-005 §13 (failure isolation: one page failure must
128/// not stop extraction of the whole document).
129fn extract_page_text(doc: &lopdf::Document, obj_id: u32, _page_num: u32) -> OrbokResult<String> {
130    match doc.extract_text(&[obj_id]) {
131        Ok(text) => Ok(text),
132        Err(_) => Ok(String::new()), // page-level failure isolation
133    }
134}
135
136/// Detect whether a PDF appears to be scanned/image-only (RFC-025).
137///
138/// Returns `true` when the PDF has pages but extracted text is empty.
139/// In this case the user should be informed that OCR is needed.
140/// orbok v0.7 does not include an OCR engine; OCR is tracked in RFC-025.
141pub fn is_scanned_pdf(output: &super::types::ExtractOutput, page_count: usize) -> bool {
142    page_count > 0 && output.char_count == 0
143}
144
145/// Helper: try to get page count from a PDF without failing.
146pub fn pdf_page_count(path: &std::path::Path) -> usize {
147    lopdf::Document::load(path)
148        .map(|d| d.get_pages().len())
149        .unwrap_or(0)
150}