use crate::normalize::normalize_document as normalize_text;
use crate::types::{
DocumentExtractor, ExtractOutput, ExtractedSegment, LocationQuality, SegmentKind,
};
use orbok_core::versions::NORMALIZATION_VERSION;
use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
use orbok_fs::ValidatedPath;
/// Stateless extractor for PDF files; all behavior lives in its
/// `DocumentExtractor` implementation.
pub struct PdfExtractor;
/// Identifier returned by `PdfExtractor::name` and recorded in every `ExtractOutput`.
const EXTRACTOR_NAME: &str = "pdf-lopdf";
/// Revision tag returned by `PdfExtractor::version` and recorded in every `ExtractOutput`.
const EXTRACTOR_VERSION: &str = "v1";
impl DocumentExtractor for PdfExtractor {
fn name(&self) -> &'static str {
EXTRACTOR_NAME
}
fn version(&self) -> &'static str {
EXTRACTOR_VERSION
}
fn supported_extensions(&self) -> &'static [&'static str] {
&["pdf"]
}
fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
let doc = lopdf::Document::load(&path.canonical).map_err(|e| {
let category =
if e.to_string().contains("password") || e.to_string().contains("encrypt") {
ErrorCategory::EncryptedDocument
} else {
ErrorCategory::ParserError
};
OrbokError::Extraction {
category,
message: format!("lopdf: {e}"),
}
})?;
let mut segments = Vec::new();
let mut total_chars = 0u64;
let pages: Vec<(u32, u16)> = doc.page_iter().collect();
let total_pages = pages.len() as u32;
for (page_idx, (obj_id, _gen_id)) in pages.iter().enumerate() {
let page_num = (page_idx + 1) as u32;
let text = extract_page_text(&doc, *obj_id, page_num)?;
if text.trim().is_empty() {
continue;
}
let normalized = normalize_text(&text);
if normalized.trim().is_empty() {
continue;
}
total_chars += normalized.len() as u64;
segments.push(ExtractedSegment {
kind: SegmentKind::Other,
text: normalized,
line_start: page_num,
line_end: page_num,
heading_path: Some(format!("Page {page_num}")),
location_quality: LocationQuality::PageOnly,
});
}
if segments.is_empty() {
tracing::debug!(
path = %path.canonical.display(),
pages = total_pages,
"PDF produced no text — may be scanned/image-only"
);
}
Ok(ExtractOutput {
extractor_name: EXTRACTOR_NAME.to_string(),
extractor_version: EXTRACTOR_VERSION.to_string(),
normalization_version: NORMALIZATION_VERSION.to_string(),
segments,
char_count: total_chars,
})
}
}
fn extract_page_text(doc: &lopdf::Document, obj_id: u32, _page_num: u32) -> OrbokResult<String> {
match doc.extract_text(&[obj_id]) {
Ok(text) => Ok(text),
Err(_) => Ok(String::new()), }
}
/// Reports whether an extraction result looks like a scanned (image-only) PDF.
///
/// Returns `true` when the document has at least one page (`page_count > 0`) and
/// `output.char_count` is zero; returns `false` otherwise.
pub fn is_scanned_pdf(output: &super::types::ExtractOutput, page_count: usize) -> bool {
    let has_pages = page_count != 0;
    let has_text = output.char_count != 0;
    has_pages && !has_text
}
/// Returns the number of pages in the PDF at `path`.
///
/// A document that fails to load yields `0`, so callers cannot tell a load failure
/// apart from a document with no pages.
pub fn pdf_page_count(path: &std::path::Path) -> usize {
    match lopdf::Document::load(path) {
        Ok(document) => document.get_pages().len(),
        Err(_) => 0,
    }
}