use crate::{Document, Error, Extractor, Result};
use pdfium_render::prelude::*;
use std::fmt::Write as _;
use std::path::Path;
/// PDF extractor that reads page text through a bound `Pdfium` instance and can
/// optionally hand blank pages to a second extractor for OCR.
pub struct PdfiumExtractor {
/// Pdfium handle used to open every document this extractor processes.
pdfium: Pdfium,
/// Extractor run on rendered page images when a page yields only whitespace text; `None` disables the fallback.
ocr_fallback: Option<Box<dyn Extractor>>,
/// Factor handed to `PdfRenderConfig::scale_page_by_factor` when rendering pages to PNG (constructors set `2.0`).
ocr_render_scale: f32,
}
impl PdfiumExtractor {
pub fn new() -> Result<Self> {
let bindings = Pdfium::bind_to_system_library().map_err(|e| Error::MissingDependency {
name: "libpdfium".into(),
details: format!("could not load from system library path: {e}"),
})?;
Ok(Self {
pdfium: Pdfium::new(bindings),
ocr_fallback: None,
ocr_render_scale: 2.0,
})
}
/// Builds an extractor bound to the pdfium library located in `library_dir`.
///
/// The platform-specific library file name is resolved inside `library_dir`
/// before binding. The OCR fallback starts out disabled and the render scale
/// starts at `2.0`.
///
/// # Errors
///
/// Returns [`Error::MissingDependency`] when the library cannot be bound from that location.
pub fn with_library_path(library_dir: &str) -> Result<Self> {
    let library_file = Pdfium::pdfium_platform_library_name_at_path(library_dir);
    let pdfium = Pdfium::bind_to_library(library_file)
        .map(Pdfium::new)
        .map_err(|e| Error::MissingDependency {
            name: "libpdfium".into(),
            details: format!("could not load from {library_dir}: {e}"),
        })?;
    Ok(Self {
        pdfium,
        ocr_fallback: None,
        ocr_render_scale: 2.0,
    })
}
#[must_use]
pub fn with_ocr_fallback(mut self, ocr: Box<dyn Extractor>) -> Self {
self.ocr_fallback = Some(ocr);
self
}
#[must_use]
pub fn with_ocr_render_scale(mut self, scale: f32) -> Self {
self.ocr_render_scale = scale;
self
}
/// Renders every page of the PDF at `path` to a PNG file inside `out_dir`.
///
/// Files are named `page-NNNN.png`, where `NNNN` is the one-based page number
/// zero-padded to four digits. `out_dir` is used as given; this method does
/// not create it. The returned paths are in page order.
///
/// # Errors
///
/// Returns [`Error::ParseError`] when `path` is not valid UTF-8, the PDF cannot
/// be opened, or a page fails to render, convert, or save. Page numbers inside
/// these messages are zero-based.
pub fn render_pages_to_pngs(
    &self,
    path: &Path,
    out_dir: &Path,
) -> Result<Vec<std::path::PathBuf>> {
    self.render_matching_pages(path, out_dir, |_| true)
}

/// Renders only the pages listed in `indices` to PNG files inside `out_dir`.
///
/// `indices` holds zero-based page positions. Duplicates are collapsed, and an
/// index that matches no page in the document is skipped without error, so
/// the result may be shorter than `indices`. File naming and `out_dir`
/// handling match [`Self::render_pages_to_pngs`]; the returned paths are in
/// ascending page order regardless of the order of `indices`.
///
/// # Errors
///
/// Returns [`Error::ParseError`] when `path` is not valid UTF-8, the PDF cannot
/// be opened, or a selected page fails to render, convert, or save.
pub fn render_pages_subset_to_pngs(
    &self,
    path: &Path,
    indices: &[usize],
    out_dir: &Path,
) -> Result<Vec<std::path::PathBuf>> {
    // Sorted and deduplicated so membership can be answered by binary search.
    let mut wanted: Vec<usize> = indices.to_vec();
    wanted.sort_unstable();
    wanted.dedup();
    self.render_matching_pages(path, out_dir, |idx| wanted.binary_search(&idx).is_ok())
}

/// Shared render loop: opens `path`, then renders and saves each page whose
/// zero-based index satisfies `wants_page`.
fn render_matching_pages(
    &self,
    path: &Path,
    out_dir: &Path,
    wants_page: impl Fn(usize) -> bool,
) -> Result<Vec<std::path::PathBuf>> {
    let path_str = path.to_str().ok_or_else(|| {
        Error::ParseError(format!("PDF path is not valid UTF-8: {}", path.display()))
    })?;
    let doc = self
        .pdfium
        .load_pdf_from_file(path_str, None)
        .map_err(|e| Error::ParseError(format!("pdfium failed to open {path_str}: {e}")))?;
    let render_config = PdfRenderConfig::new().scale_page_by_factor(self.ocr_render_scale);
    let mut pngs = Vec::new();
    for (idx, page) in doc.pages().iter().enumerate() {
        if !wants_page(idx) {
            continue;
        }
        let bitmap = page
            .render_with_config(&render_config)
            .map_err(|e| Error::ParseError(format!("page {idx} render failed: {e}")))?;
        let image = bitmap
            .as_image()
            .map_err(|e| Error::ParseError(format!("page {idx} bitmap → image failed: {e}")))?;
        // File names are one-based even though `idx` is zero-based.
        let png_path = out_dir.join(format!("page-{:04}.png", idx + 1));
        image.save(&png_path).map_err(|e| {
            Error::ParseError(format!(
                "failed to write rendered page {idx} to {}: {e}",
                png_path.display()
            ))
        })?;
        pngs.push(png_path);
    }
    Ok(pngs)
}
/// Builds a [`Document`] from an already-opened PDF.
///
/// The text of each page is collected in page order and joined with a blank
/// line (`"\n\n"`) between consecutive pages; title and metadata come from
/// [`Self::extract_metadata`].
///
/// # Errors
///
/// Returns [`Error::ParseError`] for the first page whose text cannot be read.
fn extract_from_document(doc: &PdfDocument) -> Result<Document> {
    let page_texts = doc
        .pages()
        .iter()
        .enumerate()
        .map(|(idx, page)| -> Result<String> {
            let text = page.text().map_err(|e| {
                Error::ParseError(format!("page {idx} text extraction failed: {e}"))
            })?;
            Ok(text.all())
        })
        .collect::<Result<Vec<String>>>()?;
    let (title, metadata) = Self::extract_metadata(doc);
    Ok(Document {
        markdown: page_texts.join("\n\n"),
        title,
        metadata,
    })
}
/// Collects the document's metadata tags into a string map and picks out the title.
///
/// Tags whose value is empty or whitespace-only are ignored. Each remaining
/// tag is stored under a fixed lowercase key (`title`, `author`, `subject`,
/// `keywords`, `creator`, `producer`, `created_at`, `modified_at`). The title
/// value is additionally returned as the first tuple element.
fn extract_metadata(
    doc: &PdfDocument,
) -> (Option<String>, std::collections::HashMap<String, String>) {
    let mut title: Option<String> = None;
    let mut metadata = std::collections::HashMap::new();
    for tag in doc.metadata().iter() {
        let raw = tag.value();
        if raw.trim().is_empty() {
            continue;
        }
        let key = match tag.tag_type() {
            PdfDocumentMetadataTagType::Title => "title",
            PdfDocumentMetadataTagType::Author => "author",
            PdfDocumentMetadataTagType::Subject => "subject",
            PdfDocumentMetadataTagType::Keywords => "keywords",
            PdfDocumentMetadataTagType::Creator => "creator",
            PdfDocumentMetadataTagType::Producer => "producer",
            PdfDocumentMetadataTagType::CreationDate => "created_at",
            PdfDocumentMetadataTagType::ModificationDate => "modified_at",
        };
        let owned = raw.to_string();
        // Only the Title tag maps to "title", so the key identifies it.
        if key == "title" {
            title = Some(owned.clone());
        }
        metadata.insert(key.to_string(), owned);
    }
    (title, metadata)
}
}
impl Extractor for PdfiumExtractor {
/// File extensions this extractor handles: only `pdf`.
fn extensions(&self) -> &[&'static str] {
    const PDF_EXTENSIONS: &[&str] = &["pdf"];
    PDF_EXTENSIONS
}
/// Identifier of this backend; also the first hop recorded in `extractor_chain` metadata.
fn name(&self) -> &'static str {
    const BACKEND_NAME: &str = "pdfium-render";
    BACKEND_NAME
}
fn extract(&self, path: &Path) -> Result<Document> {
let path_str = path.to_str().ok_or_else(|| {
Error::ParseError(format!("PDF path is not valid UTF-8: {}", path.display()))
})?;
let pdf_doc = self
.pdfium
.load_pdf_from_file(path_str, None)
.map_err(|e| Error::ParseError(format!("pdfium failed to open {path_str}: {e}")))?;
let page_count = usize::try_from(pdf_doc.pages().len()).unwrap_or(0);
let mut pages: Vec<String> = Vec::with_capacity(page_count);
for (idx, page) in pdf_doc.pages().iter().enumerate() {
let text = page.text().map_err(|e| {
Error::ParseError(format!("page {idx} text extraction failed: {e}"))
})?;
pages.push(text.all());
}
let (title, mut metadata) = Self::extract_metadata(&pdf_doc);
drop(pdf_doc);
let empty_indices: Vec<usize> = pages
.iter()
.enumerate()
.filter(|(_, t)| t.trim().is_empty())
.map(|(i, _)| i)
.collect();
let any_ocred = !empty_indices.is_empty() && self.ocr_fallback.is_some();
if any_ocred {
let ocr = self.ocr_fallback.as_ref().unwrap().as_ref();
let temp = tempfile::tempdir().map_err(|e| {
Error::ParseError(format!(
"could not create tempdir for PDF→OCR fallback: {e}"
))
})?;
let pngs = self.render_pages_subset_to_pngs(path, &empty_indices, temp.path())?;
for (vec_idx, &page_idx) in empty_indices.iter().enumerate() {
let png = &pngs[vec_idx];
let page_doc = ocr.extract(png).map_err(|e| {
Error::ParseError(format!(
"OCR failed on rendered page {} ({}): {e}",
page_idx + 1,
png.display()
))
})?;
pages[page_idx] = page_doc.markdown;
}
metadata.insert(
"extractor_chain".into(),
format!("pdfium-render → {}", ocr.name()),
);
metadata.insert("pages_ocred".into(), empty_indices.len().to_string());
}
let markdown = if any_ocred {
let mut out = String::new();
for (idx, page_text) in pages.iter().enumerate() {
let trimmed = page_text.trim();
if trimmed.is_empty() {
continue;
}
if !out.is_empty() {
out.push_str("\n\n");
}
let _ = write!(out, "## Page {}\n\n{trimmed}", idx + 1);
}
out
} else {
let mut out = String::new();
for (idx, page_text) in pages.iter().enumerate() {
if idx > 0 {
out.push_str("\n\n");
}
out.push_str(page_text);
}
out
};
Ok(Document {
markdown,
title,
metadata,
})
}
fn extract_bytes(&self, bytes: &[u8], _ext: &str) -> Result<Document> {
let doc = self
.pdfium
.load_pdf_from_byte_slice(bytes, None)
.map_err(|e| Error::ParseError(format!("pdfium failed to open byte slice: {e}")))?;
Self::extract_from_document(&doc)
}
}
// Unit tests for the pdfium-backed extractor. Tests that need libpdfium or
// fixture files are marked `#[ignore]` and must be run explicitly.
#[cfg(test)]
mod tests {
use super::*;
// Stand-in `Extractor` that returns the same `extensions`/`name` literals as
// `PdfiumExtractor` without needing libpdfium.
// NOTE(review): the two trait-surface tests below assert on this fake, not on
// `PdfiumExtractor` itself, so they do not catch a change to the real impl.
struct FakePdf;
impl Extractor for FakePdf {
fn extensions(&self) -> &[&'static str] {
&["pdf"]
}
fn extract(&self, _: &std::path::Path) -> Result<Document> {
unreachable!("FakePdf only used for trait-surface tests")
}
fn name(&self) -> &'static str {
"pdfium-render"
}
}
// The extension list is exactly `["pdf"]`.
#[test]
fn extensions_is_pdf_only() {
assert_eq!(FakePdf.extensions(), &["pdf"]);
}
// The backend name is `"pdfium-render"`.
#[test]
fn name_identifies_backend() {
assert_eq!(FakePdf.name(), "pdfium-render");
}
// End-to-end: a real PDF fixture yields non-empty markdown.
#[test]
#[ignore = "requires libpdfium on the system library path"]
fn extracts_text_from_a_real_pdf() {
let extractor = PdfiumExtractor::new().expect("libpdfium not available");
let doc = extractor
.extract(std::path::Path::new("tests/fixtures/hello.pdf"))
.expect("extraction failed");
assert!(
!doc.markdown.is_empty(),
"expected non-empty markdown from hello.pdf"
);
}
// End-to-end: the title is populated and mirrored under the `title` metadata key.
#[test]
#[ignore = "requires libpdfium AND a PDF with metadata at tests/fixtures/with-metadata.pdf"]
fn surfaces_pdf_metadata_and_title() {
let extractor = PdfiumExtractor::new().expect("libpdfium not available");
let doc = extractor
.extract(std::path::Path::new("tests/fixtures/with-metadata.pdf"))
.expect("extraction failed");
assert!(
doc.title.is_some(),
"expected Document.title to be populated from /Title; got {doc:?}"
);
assert_eq!(
doc.metadata.get("title").map(String::as_str),
doc.title.as_deref(),
"metadata['title'] should mirror Document.title"
);
}
// Compiled only with the `ocr-platform` feature on macOS or Windows.
// Checks that a PDF mixing text pages and blank pages reports at least one
// OCR'd page and uses the `## Page N` heading layout.
#[cfg(all(
feature = "ocr-platform",
any(target_os = "macos", target_os = "windows")
))]
#[test]
#[ignore = "requires libpdfium AND a mixed-content PDF in tests/fixtures/mixed-content.pdf"]
fn mixed_content_pdf_ocrs_only_empty_pages() {
// Exactly one of these bindings is compiled, chosen by target OS.
#[cfg(target_os = "macos")]
let ocr: Box<dyn Extractor> = Box::new(crate::ocr_macos::VisionOcrExtractor::new());
#[cfg(target_os = "windows")]
let ocr: Box<dyn Extractor> = Box::new(crate::ocr_windows::WindowsOcrExtractor::new());
let extractor = PdfiumExtractor::new()
.expect("libpdfium not available")
.with_ocr_fallback(ocr);
let doc = extractor
.extract(std::path::Path::new("tests/fixtures/mixed-content.pdf"))
.expect("extraction failed");
// `pages_ocred` is only inserted when the fallback actually ran.
let pages_ocred: usize = doc
.metadata
.get("pages_ocred")
.and_then(|s| s.parse().ok())
.expect("pages_ocred metadata should be set when OCR fallback fires");
assert!(
pages_ocred >= 1,
"expected at least one OCR'd page in a mixed-content PDF"
);
assert!(
doc.markdown.contains("## Page "),
"expected `## Page N` heading in mixed-content output"
);
}
// Compiled only with the `ocr-platform` feature on macOS or Windows.
// Checks that a scanned PDF produces text via the fallback and that
// `extractor_chain` records the hop to the platform OCR backend.
#[cfg(all(
feature = "ocr-platform",
any(target_os = "macos", target_os = "windows")
))]
#[test]
#[ignore = "requires libpdfium AND a scanned PDF in tests/fixtures/scanned.pdf"]
fn scanned_pdf_routes_through_ocr_fallback() {
// Exactly one of these bindings is compiled, chosen by target OS.
#[cfg(target_os = "macos")]
let ocr: Box<dyn Extractor> = Box::new(crate::ocr_macos::VisionOcrExtractor::new());
#[cfg(target_os = "windows")]
let ocr: Box<dyn Extractor> = Box::new(crate::ocr_windows::WindowsOcrExtractor::new());
let extractor = PdfiumExtractor::new()
.expect("libpdfium not available")
.with_ocr_fallback(ocr);
let doc = extractor
.extract(std::path::Path::new("tests/fixtures/scanned.pdf"))
.expect("extraction failed");
assert!(
!doc.markdown.is_empty(),
"expected non-empty markdown from scanned.pdf via OCR fallback"
);
// A missing key becomes "" so the assertion below reports it as a mismatch.
let chain = doc
.metadata
.get("extractor_chain")
.map_or("", String::as_str);
assert!(
chain == "pdfium-render → vision-macos" || chain == "pdfium-render → ocr-windows",
"expected extractor_chain to record the fallback hop, got {chain:?}"
);
}
// Runs without libpdfium: binding from a path that does not exist must
// surface as `Error::MissingDependency`.
#[test]
fn missing_libpdfium_returns_typed_error() {
let result = PdfiumExtractor::with_library_path("/nonexistent-path-that-cannot-exist");
assert!(matches!(result, Err(Error::MissingDependency { .. })));
}
}