Skip to main content

orbok_extract/
pdf.rs

1//! PDF text extraction via lopdf (RFC-022 §6; RFC-044 §16.5 hardening).
2//!
3//! ## RFC-022 evaluation
4//!
5//! | Backend | Language | Japanese | License | Notes |
6//! |---|---|---|---|---|
7//! | **lopdf** (selected) | Rust | UTF-8 text only | MIT | Fast, pure Rust, page-level |
8//! | pdfium | Rust binding | Full Unicode | Apache 2.0 | Requires native library |
9//!
10//! Selected: lopdf for v0.7. Pure Rust, no FFI, adequate for text-heavy
11//! PDFs. Limitation: scanned / image-only PDFs produce no text.
12//!
13//! ## Security (RFC-015 §14)
14//!
15//! PDF parsing is treated as hostile input. Panics from lopdf are caught
16//! in `ExtractorRegistry::extract_safely` (RFC-044 §11). All errors are
17//! returned as typed `OrbokError::Extraction`.
18//!
19//! ## Location quality
20//!
21//! lopdf reports text at page granularity. All segments carry
22//! `LocationKind::Pages` and `LocationQuality::PageOnly`.
23//! UI must not label these as "line N".
24
25use crate::normalize::normalize_document as normalize_text;
26use crate::types::{
27    DocumentExtractor, ExtractContext, ExtractOutput, ExtractWarning, ExtractedSegment,
28    LocationKind, LocationQuality, SegmentKind, read_error_category,
29};
30use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
31use orbok_fs::ValidatedPath;
32
33const EXTRACTOR_NAME: &str = "pdf-lopdf";
34const EXTRACTOR_VERSION: &str = "v1";
35
/// PDF text extractor backed by `lopdf`.
///
/// A field-less unit struct: all behaviour lives in its
/// `DocumentExtractor` implementation, and every extraction call works only
/// from the path and `ExtractContext` it is given.
pub struct PdfExtractor;
37
38impl DocumentExtractor for PdfExtractor {
39    fn name(&self) -> &'static str {
40        EXTRACTOR_NAME
41    }
42
43    fn version(&self) -> &'static str {
44        EXTRACTOR_VERSION
45    }
46
47    fn supported_extensions(&self) -> &'static [&'static str] {
48        &["pdf"]
49    }
50
51    fn extract_with_context(
52        &self,
53        path: &ValidatedPath,
54        context: &ExtractContext,
55    ) -> OrbokResult<ExtractOutput> {
56        let limits = &context.limits;
57        let mut warnings = Vec::new();
58
59        // RFC-044 §9.5: check file size before loading PDF.
60        let meta = std::fs::metadata(&path.canonical).map_err(|e| OrbokError::Extraction {
61            category: read_error_category(&e),
62            message: e.to_string(),
63        })?;
64        if meta.len() > limits.max_file_bytes {
65            return Err(OrbokError::Extraction {
66                category: ErrorCategory::FileTooLarge,
67                message: format!(
68                    "PDF is {} bytes, limit is {}",
69                    meta.len(),
70                    limits.max_file_bytes
71                ),
72            });
73        }
74
75        let doc = lopdf::Document::load(&path.canonical).map_err(|e| {
76            let category =
77                if e.to_string().contains("password") || e.to_string().contains("encrypt") {
78                    ErrorCategory::EncryptedDocument
79                } else {
80                    ErrorCategory::ParserError
81                };
82            OrbokError::Extraction {
83                category,
84                message: format!("lopdf: {e}"),
85            }
86        })?;
87
88        let pages: Vec<(u32, u16)> = doc.page_iter().collect();
89        let total_pages = pages.len();
90
91        // RFC-044 §9.5: page count limit.
92        let pages_to_process = if total_pages > limits.max_pdf_pages {
93            warnings.push(ExtractWarning::SizeLimitReached {
94                limit_name: "max_pdf_pages".into(),
95            });
96            &pages[..limits.max_pdf_pages]
97        } else {
98            &pages[..]
99        };
100
101        let mut segments = Vec::new();
102        let mut total_chars = 0u64;
103        let mut unreadable_pages = Vec::new();
104
105        for (page_idx, (obj_id, _gen_id)) in pages_to_process.iter().enumerate() {
106            let page_num = (page_idx + 1) as u32;
107
108            // RFC-044 §9.5: extracted char limit.
109            if total_chars >= limits.max_extracted_chars {
110                warnings.push(ExtractWarning::SizeLimitReached {
111                    limit_name: "max_extracted_chars".into(),
112                });
113                break;
114            }
115
116            match doc.extract_text(&[*obj_id]) {
117                Ok(text) => {
118                    if text.trim().is_empty() {
119                        continue;
120                    }
121                    let normalized = normalize_text(&text);
122                    if normalized.trim().is_empty() {
123                        continue;
124                    }
125                    let page_chars = normalized.len() as u64;
126                    total_chars += page_chars;
127                    segments.push(ExtractedSegment {
128                        kind: SegmentKind::Other,
129                        text: normalized,
130                        line_start: page_num,
131                        line_end: page_num,
132                        location_kind: LocationKind::Pages,
133                        heading_path: Some(format!("Page {page_num}")),
134                        location_quality: LocationQuality::PageOnly,
135                    });
136                }
137                Err(_) => {
138                    // Page-level failure: record and continue (RFC-005 §13).
139                    unreadable_pages.push(page_num);
140                }
141            }
142        }
143
144        // Emit warnings for unreadable pages.
145        if !unreadable_pages.is_empty() {
146            warnings.push(ExtractWarning::SomePagesUnreadable {
147                pages: unreadable_pages,
148            });
149        }
150
151        // Detect scanned/image-only PDF (RFC-025).
152        if total_pages > 0 && total_chars == 0 {
153            tracing::debug!(
154                path = %path.canonical.display(),
155                pages = total_pages,
156                "PDF produced no text — may be scanned/image-only"
157            );
158            warnings.push(ExtractWarning::PossiblyScannedPdf);
159        }
160
161        Ok(ExtractOutput {
162            extractor_name: EXTRACTOR_NAME.to_string(),
163            extractor_version: EXTRACTOR_VERSION.to_string(),
164            normalization_version: NORMALIZATION_VERSION.to_string(),
165            segments,
166            char_count: total_chars,
167            warnings,
168        })
169    }
170
171    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
172        self.extract_with_context(path, &ExtractContext::default())
173    }
174}
175
176/// Detect whether a PDF appears to be scanned/image-only (RFC-025).
177pub fn is_scanned_pdf(output: &ExtractOutput, page_count: usize) -> bool {
178    page_count > 0 && output.char_count == 0
179}
180
181/// Helper: try to get page count from a PDF without failing.
182pub fn pdf_page_count(path: &std::path::Path) -> usize {
183    lopdf::Document::load(path)
184        .map(|d| d.get_pages().len())
185        .unwrap_or(0)
186}