use crate::normalize::normalize_document as normalize_text;
use crate::types::{
DocumentExtractor, ExtractContext, ExtractOutput, ExtractWarning, ExtractedSegment,
LocationKind, LocationQuality, SegmentKind, read_error_category,
};
use orbok_core::{ErrorCategory, OrbokError, OrbokResult, versions::NORMALIZATION_VERSION};
use orbok_fs::ValidatedPath;
/// Identifier returned by `PdfExtractor::name` and recorded in
/// `ExtractOutput::extractor_name`.
const EXTRACTOR_NAME: &str = "pdf-lopdf";
/// Version tag returned by `PdfExtractor::version` and recorded in
/// `ExtractOutput::extractor_version`.
const EXTRACTOR_VERSION: &str = "v1";
/// Stateless text extractor for `pdf` files, backed by the `lopdf` crate.
///
/// All behavior lives in its `DocumentExtractor` implementation.
pub struct PdfExtractor;
impl DocumentExtractor for PdfExtractor {
    /// Returns the extractor identifier (`EXTRACTOR_NAME`).
    fn name(&self) -> &'static str {
        EXTRACTOR_NAME
    }

    /// Returns the extractor version tag (`EXTRACTOR_VERSION`).
    fn version(&self) -> &'static str {
        EXTRACTOR_VERSION
    }

    /// Lists the file extensions this extractor handles: only `pdf`.
    fn supported_extensions(&self) -> &'static [&'static str] {
        &["pdf"]
    }

    /// Extracts text from the PDF at `path`, one segment per non-empty page.
    ///
    /// Processing is bounded by `context.limits`:
    /// * `max_file_bytes` — files larger than this are rejected up front.
    /// * `max_pdf_pages` — only the first that many pages are processed.
    /// * `max_extracted_chars` — checked before each page; once the running
    ///   total reaches it, the remaining pages are skipped.
    ///
    /// Non-fatal conditions are reported through `ExtractOutput::warnings`:
    /// `SizeLimitReached` (for `max_pdf_pages` / `max_extracted_chars`),
    /// `SomePagesUnreadable` (pages whose text extraction failed) and
    /// `PossiblyScannedPdf` (the document has pages but yielded no text).
    ///
    /// # Errors
    ///
    /// Returns `OrbokError::Extraction` when:
    /// * file metadata cannot be read (category from `read_error_category`);
    /// * the file exceeds `max_file_bytes` (`ErrorCategory::FileTooLarge`);
    /// * `lopdf` fails to load the document — `EncryptedDocument` if the
    ///   error text mentions "password" or "encrypt", otherwise `ParserError`.
    fn extract_with_context(
        &self,
        path: &ValidatedPath,
        context: &ExtractContext,
    ) -> OrbokResult<ExtractOutput> {
        let limits = &context.limits;
        let mut warnings = Vec::new();

        // Reject oversized files before handing them to the parser.
        let meta = std::fs::metadata(&path.canonical).map_err(|e| OrbokError::Extraction {
            category: read_error_category(&e),
            message: e.to_string(),
        })?;
        if meta.len() > limits.max_file_bytes {
            return Err(OrbokError::Extraction {
                category: ErrorCategory::FileTooLarge,
                message: format!(
                    "PDF is {} bytes, limit is {}",
                    meta.len(),
                    limits.max_file_bytes
                ),
            });
        }

        let doc = lopdf::Document::load(&path.canonical).map_err(|e| {
            // Format the error once; it is used both for classification and
            // for the reported message.
            let detail = e.to_string();
            let category = if detail.contains("password") || detail.contains("encrypt") {
                ErrorCategory::EncryptedDocument
            } else {
                ErrorCategory::ParserError
            };
            OrbokError::Extraction {
                category,
                message: format!("lopdf: {detail}"),
            }
        })?;

        // `extract_text` is addressed by 1-based page *number*, not by the
        // page's PDF object id, so work from the page-number keys of
        // `get_pages()` (the same map `extract_text` resolves against).
        let page_numbers: Vec<u32> = doc.get_pages().into_keys().collect();
        let total_pages = page_numbers.len();
        let pages_to_process = if total_pages > limits.max_pdf_pages {
            warnings.push(ExtractWarning::SizeLimitReached {
                limit_name: "max_pdf_pages".into(),
            });
            &page_numbers[..limits.max_pdf_pages]
        } else {
            &page_numbers[..]
        };

        let mut segments = Vec::new();
        let mut total_chars = 0u64;
        let mut unreadable_pages = Vec::new();

        for &page_num in pages_to_process {
            if total_chars >= limits.max_extracted_chars {
                warnings.push(ExtractWarning::SizeLimitReached {
                    limit_name: "max_extracted_chars".into(),
                });
                break;
            }

            match doc.extract_text(&[page_num]) {
                Ok(text) => {
                    if text.trim().is_empty() {
                        continue;
                    }
                    let normalized = normalize_text(&text);
                    if normalized.trim().is_empty() {
                        continue;
                    }
                    // NOTE(review): `len()` is the UTF-8 byte length, not a
                    // count of `char`s — confirm this matches how other
                    // extractors fill `char_count` / `max_extracted_chars`.
                    let page_chars = normalized.len() as u64;
                    total_chars += page_chars;
                    segments.push(ExtractedSegment {
                        kind: SegmentKind::Other,
                        text: normalized,
                        // Page-located segments carry the page number in the
                        // line range fields.
                        line_start: page_num,
                        line_end: page_num,
                        location_kind: LocationKind::Pages,
                        heading_path: Some(format!("Page {page_num}")),
                        location_quality: LocationQuality::PageOnly,
                    });
                }
                // A single bad page must not abort the whole document;
                // record it and keep going.
                Err(_) => {
                    unreadable_pages.push(page_num);
                }
            }
        }

        if !unreadable_pages.is_empty() {
            warnings.push(ExtractWarning::SomePagesUnreadable {
                pages: unreadable_pages,
            });
        }

        if total_pages > 0 && total_chars == 0 {
            tracing::debug!(
                path = %path.canonical.display(),
                pages = total_pages,
                "PDF produced no text — may be scanned/image-only"
            );
            warnings.push(ExtractWarning::PossiblyScannedPdf);
        }

        Ok(ExtractOutput {
            extractor_name: EXTRACTOR_NAME.to_string(),
            extractor_version: EXTRACTOR_VERSION.to_string(),
            normalization_version: NORMALIZATION_VERSION.to_string(),
            segments,
            char_count: total_chars,
            warnings,
        })
    }

    /// Extracts text from the PDF at `path` using `ExtractContext::default()`.
    ///
    /// See `extract_with_context` for limits, warnings and errors.
    fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
        self.extract_with_context(path, &ExtractContext::default())
    }
}
/// Reports whether an extraction result looks like a scanned (image-only) PDF.
///
/// Returns `true` only when the document has at least one page
/// (`page_count > 0`) and `output.char_count` is zero.
pub fn is_scanned_pdf(output: &ExtractOutput, page_count: usize) -> bool {
    // A document with no pages is never treated as scanned.
    if page_count == 0 {
        return false;
    }
    output.char_count == 0
}
/// Returns the number of pages in the PDF at `path`.
///
/// Best-effort: if `lopdf` cannot load the file for any reason, the error is
/// discarded and `0` is returned.
pub fn pdf_page_count(path: &std::path::Path) -> usize {
    match lopdf::Document::load(path) {
        Ok(document) => document.get_pages().len(),
        Err(_) => 0,
    }
}