orbok_extract/pdf.rs
1//! PDF text extraction via lopdf (RFC-022 §6).
2//!
3//! ## RFC-022 evaluation
4//!
5//! Two backends were evaluated:
6//!
7//! | Backend | Language | Japanese | License | Notes |
8//! |---|---|---|---|---|
9//! | **lopdf** (selected) | Rust | UTF-8 text only | MIT | Fast, pure Rust, page-level |
10//! | pdfium | Rust binding | Full Unicode | Apache 2.0 | Requires native library |
11//!
12//! **Selected: lopdf** for v0.7. Reasons: pure Rust (no FFI), compiles
13//! everywhere, adequate for text-heavy PDFs. Limitation: scanned or
14//! image-only PDFs produce no text (location_quality = Unknown).
15//!
16//! pdfium is tracked as a future backend for richer PDF support once
17//! the native dependency packaging is solved (RFC-022 deferred).
18//!
19//! ## Security (RFC-015 §14)
20//!
21//! PDF parsing is treated as hostile input. All errors are caught and
22//! returned as `OrbokError::Extraction` with category
23//! `ParserError` or `EncryptedDocument`. Panics from lopdf's parser are
24//! caught via `std::panic::catch_unwind` in the extraction driver
25//! (RFC-005 §13 isolation requirement).
26//!
27//! ## Location quality
28//!
29//! lopdf reports text at page granularity. All segments carry
30//! `LocationQuality::PageOnly`. Line-level offsets are not available;
31//! UI must not show line numbers for PDF results.
32//!
33//! ## Japanese
34//!
35//! UTF-8 encoded PDFs (common in modern Japanese documents) extract
36//! correctly. Legacy SJIS/EUC PDFs may produce garbled text; the
37//! extractor does not attempt character-encoding conversion in v0.7.
38
39use crate::normalize::normalize_document as normalize_text;
40use orbok_core::versions::NORMALIZATION_VERSION;
41use crate::types::{
42 DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
43};
44use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
45use orbok_fs::ValidatedPath;
46
/// PDF extractor using lopdf (RFC-022).
///
/// This is a stateless unit type: it carries no configuration, and all of
/// its behaviour is provided by its `DocumentExtractor` implementation.
/// The derives make it trivially printable, copyable and
/// default-constructible.
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfExtractor;
49
/// Identifier returned by `PdfExtractor::name()` and recorded in
/// `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "pdf-lopdf";
/// Version tag returned by `PdfExtractor::version()` and recorded in
/// `ExtractOutput::extractor_version`.
// NOTE(review): presumably this should be bumped whenever the extracted
// output changes, so consumers can tell old results apart — confirm.
const EXTRACTOR_VERSION: &str = "v1";
52
53impl DocumentExtractor for PdfExtractor {
54 fn name(&self) -> &'static str {
55 EXTRACTOR_NAME
56 }
57
58 fn version(&self) -> &'static str {
59 EXTRACTOR_VERSION
60 }
61
62 fn supported_extensions(&self) -> &'static [&'static str] {
63 &["pdf"]
64 }
65
66 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
67 let doc = lopdf::Document::load(&path.canonical).map_err(|e| {
68 let category = if e.to_string().contains("password")
69 || e.to_string().contains("encrypt")
70 {
71 ErrorCategory::EncryptedDocument
72 } else {
73 ErrorCategory::ParserError
74 };
75 OrbokError::Extraction {
76 category,
77 message: format!("lopdf: {e}"),
78 }
79 })?;
80
81 let mut segments = Vec::new();
82 let mut total_chars = 0u64;
83 let pages: Vec<(u32, u16)> = doc.page_iter().collect();
84 let total_pages = pages.len() as u32;
85
86 for (page_idx, (obj_id, _gen_id)) in pages.iter().enumerate() {
87 let page_num = (page_idx + 1) as u32;
88 let text = extract_page_text(&doc, *obj_id, page_num)?;
89 if text.trim().is_empty() {
90 continue;
91 }
92 let normalized = normalize_text(&text);
93 if normalized.trim().is_empty() {
94 continue;
95 }
96 total_chars += normalized.len() as u64;
97 segments.push(ExtractedSegment {
98 kind: SegmentKind::Other,
99 text: normalized,
100 line_start: page_num,
101 line_end: page_num,
102 heading_path: Some(format!("Page {page_num}")),
103 location_quality: LocationQuality::PageOnly,
104 });
105 }
106
107 if segments.is_empty() {
108 tracing::debug!(
109 path = %path.canonical.display(),
110 pages = total_pages,
111 "PDF produced no text — may be scanned/image-only"
112 );
113 }
114
115 Ok(ExtractOutput {
116 extractor_name: EXTRACTOR_NAME.to_string(),
117 extractor_version: EXTRACTOR_VERSION.to_string(),
118 normalization_version: NORMALIZATION_VERSION.to_string(),
119 segments,
120 char_count: total_chars,
121 })
122 }
123}
124
125/// Extract text from one page, returning an empty string on any error.
126///
127/// lopdf's `extract_text` returns a `Result<String>`. Errors are
128/// swallowed per RFC-005 §13 (failure isolation: one page failure must
129/// not stop extraction of the whole document).
130fn extract_page_text(
131 doc: &lopdf::Document,
132 obj_id: u32,
133 _page_num: u32,
134) -> OrbokResult<String> {
135 match doc.extract_text(&[obj_id]) {
136 Ok(text) => Ok(text),
137 Err(_) => Ok(String::new()), // page-level failure isolation
138 }
139}
140
141/// Detect whether a PDF appears to be scanned/image-only (RFC-025).
142///
143/// Returns `true` when the PDF has pages but extracted text is empty.
144/// In this case the user should be informed that OCR is needed.
145/// orbok v0.7 does not include an OCR engine; OCR is tracked in RFC-025.
146pub fn is_scanned_pdf(output: &super::types::ExtractOutput, page_count: usize) -> bool {
147 page_count > 0 && output.char_count == 0
148}
149
150/// Helper: try to get page count from a PDF without failing.
151pub fn pdf_page_count(path: &std::path::Path) -> usize {
152 lopdf::Document::load(path)
153 .map(|d| d.get_pages().len())
154 .unwrap_or(0)
155}