Skip to main content

orbok_extract/
pdf.rs

1//! PDF text extraction via lopdf (RFC-022 §6).
2//!
3//! ## RFC-022 evaluation
4//!
5//! Two backends were evaluated:
6//!
7//! | Backend | Language | Japanese | License | Notes |
8//! |---|---|---|---|---|
9//! | **lopdf** (selected) | Rust | UTF-8 text only | MIT | Fast, pure Rust, page-level |
10//! | pdfium | Rust binding | Full Unicode | Apache 2.0 | Requires native library |
11//!
12//! **Selected: lopdf** for v0.7. Reasons: pure Rust (no FFI), compiles
13//! everywhere, adequate for text-heavy PDFs. Limitation: scanned or
14//! image-only PDFs produce no text (location_quality = Unknown).
15//!
16//! pdfium is tracked as a future backend for richer PDF support once
17//! the native dependency packaging is solved (RFC-022 deferred).
18//!
19//! ## Security (RFC-015 §14)
20//!
21//! PDF parsing is treated as hostile input. All errors are caught and
22//! returned as `OrbokError::Extraction` with category
23//! `ParserError` or `EncryptedDocument`. Panics from lopdf's parser are
24//! caught via `std::panic::catch_unwind` in the extraction driver
25//! (RFC-005 §13 isolation requirement).
26//!
27//! ## Location quality
28//!
29//! lopdf reports text at page granularity. All segments carry
30//! `LocationQuality::PageOnly`. Line-level offsets are not available;
31//! UI must not show line numbers for PDF results.
32//!
33//! ## Japanese
34//!
35//! UTF-8 encoded PDFs (common in modern Japanese documents) extract
36//! correctly. Legacy SJIS/EUC PDFs may produce garbled text; the
37//! extractor does not attempt character-encoding conversion in v0.7.
38
39use crate::normalize::normalize_document as normalize_text;
40use orbok_core::versions::NORMALIZATION_VERSION;
41use crate::types::{
42    DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
43};
44use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
45use orbok_fs::ValidatedPath;
46
/// PDF extractor using lopdf (RFC-022).
///
/// Stateless unit type: all behavior lives in its `DocumentExtractor`
/// implementation, which emits one page-granular segment per page that
/// yields non-empty text.
pub struct PdfExtractor;
49
/// Extractor identifier returned by `PdfExtractor::name` and copied into
/// `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "pdf-lopdf";
/// Extractor version returned by `PdfExtractor::version` and copied into
/// `ExtractOutput::extractor_version`.
const EXTRACTOR_VERSION: &str = "v1";
52
53impl DocumentExtractor for PdfExtractor {
54    fn name(&self) -> &'static str {
55        EXTRACTOR_NAME
56    }
57
58    fn version(&self) -> &'static str {
59        EXTRACTOR_VERSION
60    }
61
62    fn supported_extensions(&self) -> &'static [&'static str] {
63        &["pdf"]
64    }
65
66    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
67        let doc = lopdf::Document::load(&path.canonical).map_err(|e| {
68            let category = if e.to_string().contains("password")
69                || e.to_string().contains("encrypt")
70            {
71                ErrorCategory::EncryptedDocument
72            } else {
73                ErrorCategory::ParserError
74            };
75            OrbokError::Extraction {
76                category,
77                message: format!("lopdf: {e}"),
78            }
79        })?;
80
81        let mut segments = Vec::new();
82        let mut total_chars = 0u64;
83        let pages: Vec<(u32, u16)> = doc.page_iter().collect();
84        let total_pages = pages.len() as u32;
85
86        for (page_idx, (obj_id, _gen_id)) in pages.iter().enumerate() {
87            let page_num = (page_idx + 1) as u32;
88            let text = extract_page_text(&doc, *obj_id, page_num)?;
89            if text.trim().is_empty() {
90                continue;
91            }
92            let normalized = normalize_text(&text);
93            if normalized.trim().is_empty() {
94                continue;
95            }
96            total_chars += normalized.len() as u64;
97            segments.push(ExtractedSegment {
98                kind: SegmentKind::Other,
99                text: normalized,
100                line_start: page_num,
101                line_end: page_num,
102                heading_path: Some(format!("Page {page_num}")),
103                location_quality: LocationQuality::PageOnly,
104            });
105        }
106
107        if segments.is_empty() {
108            tracing::debug!(
109                path = %path.canonical.display(),
110                pages = total_pages,
111                "PDF produced no text — may be scanned/image-only"
112            );
113        }
114
115        Ok(ExtractOutput {
116            extractor_name: EXTRACTOR_NAME.to_string(),
117            extractor_version: EXTRACTOR_VERSION.to_string(),
118            normalization_version: NORMALIZATION_VERSION.to_string(),
119            segments,
120            char_count: total_chars,
121        })
122    }
123}
124
125/// Extract text from one page, returning an empty string on any error.
126///
127/// lopdf's `extract_text` returns a `Result<String>`. Errors are
128/// swallowed per RFC-005 §13 (failure isolation: one page failure must
129/// not stop extraction of the whole document).
130fn extract_page_text(
131    doc: &lopdf::Document,
132    obj_id: u32,
133    _page_num: u32,
134) -> OrbokResult<String> {
135    match doc.extract_text(&[obj_id]) {
136        Ok(text) => Ok(text),
137        Err(_) => Ok(String::new()), // page-level failure isolation
138    }
139}
140
141/// Detect whether a PDF appears to be scanned/image-only (RFC-025).
142///
143/// Returns `true` when the PDF has pages but extracted text is empty.
144/// In this case the user should be informed that OCR is needed.
145/// orbok v0.7 does not include an OCR engine; OCR is tracked in RFC-025.
146pub fn is_scanned_pdf(output: &super::types::ExtractOutput, page_count: usize) -> bool {
147    page_count > 0 && output.char_count == 0
148}
149
150/// Helper: try to get page count from a PDF without failing.
151pub fn pdf_page_count(path: &std::path::Path) -> usize {
152    lopdf::Document::load(path)
153        .map(|d| d.get_pages().len())
154        .unwrap_or(0)
155}