1use crate::normalize::normalize_document as normalize_text;
26use crate::types::{
27 DocumentExtractor, ExtractContext, ExtractOutput, ExtractWarning, ExtractedSegment,
28 LocationKind, LocationQuality, SegmentKind, read_error_category,
29};
30use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
31use orbok_fs::ValidatedPath;
32
/// Identifier reported by [`PdfExtractor`] through `name()` and in its output.
const EXTRACTOR_NAME: &str = "pdf-lopdf";
/// Version tag reported by [`PdfExtractor`] through `version()` and in its output.
const EXTRACTOR_VERSION: &str = "v1";

/// Stateless PDF text extractor backed by the `lopdf` crate.
///
/// The type carries no data, so it is cheap to copy and can be built with
/// `PdfExtractor` or `PdfExtractor::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfExtractor;
37
38impl DocumentExtractor for PdfExtractor {
39 fn name(&self) -> &'static str {
40 EXTRACTOR_NAME
41 }
42
43 fn version(&self) -> &'static str {
44 EXTRACTOR_VERSION
45 }
46
47 fn supported_extensions(&self) -> &'static [&'static str] {
48 &["pdf"]
49 }
50
51 fn extract_with_context(
52 &self,
53 path: &ValidatedPath,
54 context: &ExtractContext,
55 ) -> OrbokResult<ExtractOutput> {
56 let limits = &context.limits;
57 let mut warnings = Vec::new();
58
59 let meta = std::fs::metadata(&path.canonical).map_err(|e| OrbokError::Extraction {
61 category: read_error_category(&e),
62 message: e.to_string(),
63 })?;
64 if meta.len() > limits.max_file_bytes {
65 return Err(OrbokError::Extraction {
66 category: ErrorCategory::FileTooLarge,
67 message: format!(
68 "PDF is {} bytes, limit is {}",
69 meta.len(),
70 limits.max_file_bytes
71 ),
72 });
73 }
74
75 let doc = lopdf::Document::load(&path.canonical).map_err(|e| {
76 let category =
77 if e.to_string().contains("password") || e.to_string().contains("encrypt") {
78 ErrorCategory::EncryptedDocument
79 } else {
80 ErrorCategory::ParserError
81 };
82 OrbokError::Extraction {
83 category,
84 message: format!("lopdf: {e}"),
85 }
86 })?;
87
88 let pages: Vec<(u32, u16)> = doc.page_iter().collect();
89 let total_pages = pages.len();
90
91 let pages_to_process = if total_pages > limits.max_pdf_pages {
93 warnings.push(ExtractWarning::SizeLimitReached {
94 limit_name: "max_pdf_pages".into(),
95 });
96 &pages[..limits.max_pdf_pages]
97 } else {
98 &pages[..]
99 };
100
101 let mut segments = Vec::new();
102 let mut total_chars = 0u64;
103 let mut unreadable_pages = Vec::new();
104
105 for (page_idx, (obj_id, _gen_id)) in pages_to_process.iter().enumerate() {
106 let page_num = (page_idx + 1) as u32;
107
108 if total_chars >= limits.max_extracted_chars {
110 warnings.push(ExtractWarning::SizeLimitReached {
111 limit_name: "max_extracted_chars".into(),
112 });
113 break;
114 }
115
116 match doc.extract_text(&[*obj_id]) {
117 Ok(text) => {
118 if text.trim().is_empty() {
119 continue;
120 }
121 let normalized = normalize_text(&text);
122 if normalized.trim().is_empty() {
123 continue;
124 }
125 let page_chars = normalized.len() as u64;
126 total_chars += page_chars;
127 segments.push(ExtractedSegment {
128 kind: SegmentKind::Other,
129 text: normalized,
130 line_start: page_num,
131 line_end: page_num,
132 location_kind: LocationKind::Pages,
133 heading_path: Some(format!("Page {page_num}")),
134 location_quality: LocationQuality::PageOnly,
135 });
136 }
137 Err(_) => {
138 unreadable_pages.push(page_num);
140 }
141 }
142 }
143
144 if !unreadable_pages.is_empty() {
146 warnings.push(ExtractWarning::SomePagesUnreadable {
147 pages: unreadable_pages,
148 });
149 }
150
151 if total_pages > 0 && total_chars == 0 {
153 tracing::debug!(
154 path = %path.canonical.display(),
155 pages = total_pages,
156 "PDF produced no text — may be scanned/image-only"
157 );
158 warnings.push(ExtractWarning::PossiblyScannedPdf);
159 }
160
161 Ok(ExtractOutput {
162 extractor_name: EXTRACTOR_NAME.to_string(),
163 extractor_version: EXTRACTOR_VERSION.to_string(),
164 normalization_version: NORMALIZATION_VERSION.to_string(),
165 segments,
166 char_count: total_chars,
167 warnings,
168 })
169 }
170
171 fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
172 self.extract_with_context(path, &ExtractContext::default())
173 }
174}
175
176pub fn is_scanned_pdf(output: &ExtractOutput, page_count: usize) -> bool {
178 page_count > 0 && output.char_count == 0
179}
180
181pub fn pdf_page_count(path: &std::path::Path) -> usize {
183 lopdf::Document::load(path)
184 .map(|d| d.get_pages().len())
185 .unwrap_or(0)
186}