orbok_extract/pdf.rs
1//! PDF text extraction via lopdf (RFC-022 §6).
2//!
3//! ## RFC-022 evaluation
4//!
5//! Two backends were evaluated:
6//!
7//! | Backend | Language | Japanese | License | Notes |
8//! |---|---|---|---|---|
9//! | **lopdf** (selected) | Rust | UTF-8 text only | MIT | Fast, pure Rust, page-level |
10//! | pdfium | Rust binding | Full Unicode | Apache 2.0 | Requires native library |
11//!
12//! **Selected: lopdf** for v0.7. Reasons: pure Rust (no FFI), compiles
13//! everywhere, adequate for text-heavy PDFs. Limitation: scanned or
14//! image-only PDFs produce no text (location_quality = Unknown).
15//!
16//! pdfium is tracked as a future backend for richer PDF support once
17//! the native dependency packaging is solved (RFC-022 deferred).
18//!
19//! ## Security (RFC-015 §14)
20//!
//! PDF parsing is treated as hostile input. Errors raised while loading
//! the document are caught and returned as `OrbokError::Extraction` with
//! category `ParserError` or `EncryptedDocument`; a failure on an
//! individual page is isolated and yields no text for that page instead
//! of an error. Panics from lopdf's parser are caught via
//! `std::panic::catch_unwind` in the extraction driver
//! (RFC-005 §13 isolation requirement).
26//!
27//! ## Location quality
28//!
29//! lopdf reports text at page granularity. All segments carry
30//! `LocationQuality::PageOnly`. Line-level offsets are not available;
31//! UI must not show line numbers for PDF results.
32//!
33//! ## Japanese
34//!
35//! UTF-8 encoded PDFs (common in modern Japanese documents) extract
36//! correctly. Legacy SJIS/EUC PDFs may produce garbled text; the
37//! extractor does not attempt character-encoding conversion in v0.7.
38
39use crate::normalize::normalize_document as normalize_text;
40use crate::types::{
41 DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
42};
43use orbok_core::versions::NORMALIZATION_VERSION;
44use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
45use orbok_fs::ValidatedPath;
46
/// PDF extractor using lopdf (RFC-022).
///
/// This is a stateless unit type: it has no fields, so every value is
/// interchangeable. The standard derives make it printable in diagnostics
/// and constructible via `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfExtractor;
49
/// Identifier of this extractor; returned by `name()` and copied into the
/// `extractor_name` field of every extraction result.
const EXTRACTOR_NAME: &str = "pdf-lopdf";
/// Version tag of this extractor; returned by `version()` and copied into
/// the `extractor_version` field of every extraction result.
const EXTRACTOR_VERSION: &str = "v1";
52
53impl DocumentExtractor for PdfExtractor {
54 fn name(&self) -> &'static str {
55 EXTRACTOR_NAME
56 }
57
58 fn version(&self) -> &'static str {
59 EXTRACTOR_VERSION
60 }
61
62 fn supported_extensions(&self) -> &'static [&'static str] {
63 &["pdf"]
64 }
65
66 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
67 let doc = lopdf::Document::load(&path.canonical).map_err(|e| {
68 let category =
69 if e.to_string().contains("password") || e.to_string().contains("encrypt") {
70 ErrorCategory::EncryptedDocument
71 } else {
72 ErrorCategory::ParserError
73 };
74 OrbokError::Extraction {
75 category,
76 message: format!("lopdf: {e}"),
77 }
78 })?;
79
80 let mut segments = Vec::new();
81 let mut total_chars = 0u64;
82 let pages: Vec<(u32, u16)> = doc.page_iter().collect();
83 let total_pages = pages.len() as u32;
84
85 for (page_idx, (obj_id, _gen_id)) in pages.iter().enumerate() {
86 let page_num = (page_idx + 1) as u32;
87 let text = extract_page_text(&doc, *obj_id, page_num)?;
88 if text.trim().is_empty() {
89 continue;
90 }
91 let normalized = normalize_text(&text);
92 if normalized.trim().is_empty() {
93 continue;
94 }
95 total_chars += normalized.len() as u64;
96 segments.push(ExtractedSegment {
97 kind: SegmentKind::Other,
98 text: normalized,
99 line_start: page_num,
100 line_end: page_num,
101 heading_path: Some(format!("Page {page_num}")),
102 location_quality: LocationQuality::PageOnly,
103 });
104 }
105
106 if segments.is_empty() {
107 tracing::debug!(
108 path = %path.canonical.display(),
109 pages = total_pages,
110 "PDF produced no text — may be scanned/image-only"
111 );
112 }
113
114 Ok(ExtractOutput {
115 extractor_name: EXTRACTOR_NAME.to_string(),
116 extractor_version: EXTRACTOR_VERSION.to_string(),
117 normalization_version: NORMALIZATION_VERSION.to_string(),
118 segments,
119 char_count: total_chars,
120 })
121 }
122}
123
124/// Extract text from one page, returning an empty string on any error.
125///
126/// lopdf's `extract_text` returns a `Result<String>`. Errors are
127/// swallowed per RFC-005 §13 (failure isolation: one page failure must
128/// not stop extraction of the whole document).
129fn extract_page_text(doc: &lopdf::Document, obj_id: u32, _page_num: u32) -> OrbokResult<String> {
130 match doc.extract_text(&[obj_id]) {
131 Ok(text) => Ok(text),
132 Err(_) => Ok(String::new()), // page-level failure isolation
133 }
134}
135
136/// Detect whether a PDF appears to be scanned/image-only (RFC-025).
137///
138/// Returns `true` when the PDF has pages but extracted text is empty.
139/// In this case the user should be informed that OCR is needed.
140/// orbok v0.7 does not include an OCR engine; OCR is tracked in RFC-025.
141pub fn is_scanned_pdf(output: &super::types::ExtractOutput, page_count: usize) -> bool {
142 page_count > 0 && output.char_count == 0
143}
144
145/// Helper: try to get page count from a PDF without failing.
146pub fn pdf_page_count(path: &std::path::Path) -> usize {
147 lopdf::Document::load(path)
148 .map(|d| d.get_pages().len())
149 .unwrap_or(0)
150}