use crate::{Document, Error, Extractor, Result};
use pdfium_render::prelude::*;
use std::path::Path;
/// PDF text extractor backed by the `pdfium_render` crate.
///
/// Wraps the [`Pdfium`] handle through which every extraction call in this
/// file loads its document.
pub struct PdfiumExtractor {
/// Handle built via `Pdfium::new` from the bindings resolved by one of the constructors.
pdfium: Pdfium,
}
impl PdfiumExtractor {
pub fn new() -> Result<Self> {
let bindings = Pdfium::bind_to_system_library().map_err(|e| Error::MissingDependency {
name: "libpdfium".into(),
details: format!("could not load from system library path: {e}"),
})?;
Ok(Self {
pdfium: Pdfium::new(bindings),
})
}
pub fn with_library_path(library_dir: &str) -> Result<Self> {
let bindings =
Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(library_dir))
.map_err(|e| Error::MissingDependency {
name: "libpdfium".into(),
details: format!("could not load from {library_dir}: {e}"),
})?;
Ok(Self {
pdfium: Pdfium::new(bindings),
})
}
fn extract_from_document(doc: &PdfDocument) -> Result<Document> {
let mut markdown = String::new();
for (idx, page) in doc.pages().iter().enumerate() {
if idx > 0 {
markdown.push_str("\n\n");
}
let text = page.text().map_err(|e| {
Error::ParseError(format!("page {idx} text extraction failed: {e}"))
})?;
markdown.push_str(&text.all());
}
Ok(Document {
markdown,
title: None,
metadata: std::collections::HashMap::new(),
})
}
}
impl Extractor for PdfiumExtractor {
fn extensions(&self) -> &[&'static str] {
&["pdf"]
}
fn name(&self) -> &'static str {
"pdfium-render"
}
fn extract(&self, path: &Path) -> Result<Document> {
let path_str = path.to_str().ok_or_else(|| {
Error::ParseError(format!("PDF path is not valid UTF-8: {}", path.display()))
})?;
let doc = self
.pdfium
.load_pdf_from_file(path_str, None)
.map_err(|e| Error::ParseError(format!("pdfium failed to open {path_str}: {e}")))?;
Self::extract_from_document(&doc)
}
fn extract_bytes(&self, bytes: &[u8], _ext: &str) -> Result<Document> {
let doc = self
.pdfium
.load_pdf_from_byte_slice(bytes, None)
.map_err(|e| Error::ParseError(format!("pdfium failed to open byte slice: {e}")))?;
Self::extract_from_document(&doc)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Stand-in `Extractor` that needs no pdfium library.
    ///
    /// Its `name` and `extensions` values mirror those of `PdfiumExtractor`;
    /// `extract` must never be called.
    struct FakePdf;

    impl Extractor for FakePdf {
        fn name(&self) -> &'static str {
            "pdfium-render"
        }

        fn extensions(&self) -> &[&'static str] {
            &["pdf"]
        }

        fn extract(&self, _: &Path) -> Result<Document> {
            unreachable!("FakePdf only used for trait-surface tests")
        }
    }

    // NOTE(review): the next two tests run against `FakePdf`, not against
    // `PdfiumExtractor` itself — presumably because the real extractor cannot
    // be constructed without libpdfium; confirm this is intended.

    #[test]
    fn extensions_is_pdf_only() {
        let advertised = FakePdf.extensions();
        assert_eq!(advertised, &["pdf"]);
    }

    #[test]
    fn name_identifies_backend() {
        let backend = FakePdf.name();
        assert_eq!(backend, "pdfium-render");
    }

    /// End-to-end check against a fixture file; ignored by default because it
    /// needs a loadable pdfium library.
    #[test]
    #[ignore = "requires libpdfium on the system library path"]
    fn extracts_text_from_a_real_pdf() {
        let fixture = Path::new("tests/fixtures/hello.pdf");
        let extractor = PdfiumExtractor::new().expect("libpdfium not available");
        let doc = extractor.extract(fixture).expect("extraction failed");
        assert!(
            !doc.markdown.is_empty(),
            "expected non-empty markdown from hello.pdf"
        );
    }

    /// Binding from a directory that does not exist must surface as the typed
    /// `MissingDependency` error.
    #[test]
    fn missing_libpdfium_returns_typed_error() {
        let result = PdfiumExtractor::with_library_path("/nonexistent-path-that-cannot-exist");
        assert!(matches!(result, Err(Error::MissingDependency { .. })));
    }
}