use std::path::Path;
use std::process::Command;
use std::sync::atomic::{AtomicBool, Ordering};
use thiserror::Error;
/// Errors returned by the PDF text-extraction helpers in this module.
#[derive(Debug, Error)]
pub enum PdfExtractError {
    /// An extraction backend was invoked but failed; the payload carries the
    /// backend's own error message (or the tool's stderr output).
    #[error("PDF extraction failed: {0}")]
    ExtractionFailed(String),
    /// The given path does not exist or is not a regular file; the payload
    /// describes which check failed and for which path.
    #[error("File not found or not a valid PDF: {0}")]
    InvalidFile(String),
    /// An underlying I/O error. `#[from]` lets `?` convert a
    /// `std::io::Error` into this variant.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// No extraction method produced any text. `extract_text` returns this
    /// after every backend it tried either failed or yielded only whitespace.
    #[error("No text extraction method available")]
    NotAvailable,
}
/// Identifies which backend produced the text returned by `extract_text`.
///
/// The enum carries no data, so it is `Copy` and can be compared, hashed and
/// used as a map/set key without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ExtractionMethod {
    /// Text came from the `extract_with_poppler` path.
    Poppler,
    /// Text came from the pure-Rust `lopdf` backend.
    Lopdf,
    /// Text came from a direct invocation of the `pdftotext` command.
    Pdftotext,
    /// OCR via tesseract. Not produced by any code path visible in this file.
    Tesseract,
    /// No method; used as a placeholder when no extraction has been performed.
    None,
}
/// Returns `true` when the external program `name` can be launched.
///
/// The program is started with a single `--version` argument and all of its
/// standard streams are discarded. Only the ability to spawn the process is
/// checked; the exit status is deliberately ignored, because not every tool
/// accepts `--version` (poppler's `pdftotext` documents `-v` instead and
/// exits non-zero for arguments it does not understand), and a non-zero exit
/// still proves the binary is installed and executable.
///
/// Returns `false` if the program cannot be found or executed.
fn is_external_available(name: &str) -> bool {
    Command::new(name)
        .arg("--version")
        // Nothing is read from or written to the probe, so avoid both
        // buffering its output and letting it block on inherited stdin.
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::null())
        .stderr(std::process::Stdio::null())
        .status()
        .is_ok()
}
/// Reports whether the `pdftotext` command-line tool is available.
///
/// The probe (`is_external_available("pdftotext")`) runs at most once per
/// process and its result is cached for all later calls.
///
/// `std::sync::Once` guarantees that the closure runs exactly once and that
/// its write is visible to every thread once `call_once` returns, so callers
/// can never observe a "checked" state together with a stale result.
pub fn has_poppler() -> bool {
    static PROBE: std::sync::Once = std::sync::Once::new();
    static AVAILABLE: AtomicBool = AtomicBool::new(false);

    PROBE.call_once(|| {
        AVAILABLE.store(is_external_available("pdftotext"), Ordering::Relaxed);
    });
    // `call_once` already establishes the happens-before edge with the store
    // above, so a relaxed load is sufficient here.
    AVAILABLE.load(Ordering::Relaxed)
}
/// Reports whether the `tesseract` command-line tool is available.
///
/// The probe (`is_external_available("tesseract")`) runs at most once per
/// process and its result is cached for all later calls.
///
/// `std::sync::Once` guarantees that the closure runs exactly once and that
/// its write is visible to every thread once `call_once` returns, so callers
/// can never observe a "checked" state together with a stale result.
pub fn has_tesseract() -> bool {
    static PROBE: std::sync::Once = std::sync::Once::new();
    static AVAILABLE: AtomicBool = AtomicBool::new(false);

    PROBE.call_once(|| {
        AVAILABLE.store(is_external_available("tesseract"), Ordering::Relaxed);
    });
    // `call_once` already establishes the happens-before edge with the store
    // above, so a relaxed load is sufficient here.
    AVAILABLE.load(Ordering::Relaxed)
}
/// Prints installation hints to stderr for any missing external tool.
///
/// Probes `pdftotext` and `tesseract` afresh (bypassing the cached checks)
/// and prints nothing when both are present. Otherwise it lists each missing
/// tool with a platform-specific install hint, followed by a note describing
/// which extraction path remains usable.
#[allow(dead_code)]
pub fn print_tool_instructions() {
    let poppler_found = is_external_available("pdftotext");
    let tesseract_found = is_external_available("tesseract");

    // Nothing to report when every tool is installed.
    if poppler_found && tesseract_found {
        return;
    }

    eprintln!("\nPDF extraction tools info:");

    if !poppler_found {
        let install_hint = if cfg!(windows) {
            " Install from: https://github.com/oschwartz10612/poppler-windows/releases/"
        } else {
            " Install with: brew install poppler (macOS) or apt install poppler-utils (Linux)"
        };
        eprintln!(" - pdftotext: NOT FOUND");
        eprintln!("{}", install_hint);
    }

    if !tesseract_found {
        let install_hint = if cfg!(windows) {
            " Install from: https://github.com/UB-Mannheim/tesseract/wiki"
        } else {
            " Install with: brew install tesseract (macOS) or apt install tesseract-ocr (Linux)"
        };
        eprintln!(" - tesseract OCR: NOT FOUND");
        eprintln!("{}", install_hint);
    }

    // At least one tool is missing at this point, so "poppler present"
    // implies that tesseract is the missing one.
    if poppler_found {
        eprintln!("\nNote: Basic PDF text extraction will work via poppler.");
        eprintln!("OCR is only needed for scanned/image-based PDFs.");
    } else {
        eprintln!("\nNote: Falling back to pure Rust lopdf for basic PDF extraction.");
        eprintln!("Quality may be reduced for complex PDFs.");
    }
}
/// Snapshot of which extraction backends are usable, as reported by
/// `get_extraction_info`.
#[derive(Debug, Clone)]
pub struct ExtractionInfo {
    /// Extraction method associated with this snapshot;
    /// `get_extraction_info` always sets it to `ExtractionMethod::None`.
    pub method: ExtractionMethod,
    /// Result of `has_poppler()` (the `pdftotext` tool probe).
    pub has_poppler: bool,
    /// Result of `has_tesseract()` (the `tesseract` tool probe).
    pub has_tesseract: bool,
    /// Whether the lopdf backend is usable; `get_extraction_info` always
    /// reports `true`.
    pub has_lopdf: bool,
}
/// Collects the availability of each extraction backend.
///
/// The external-tool flags come from the cached `has_poppler` and
/// `has_tesseract` checks. `has_lopdf` is always `true` and `method` is
/// always `ExtractionMethod::None`, since no extraction has taken place.
pub fn get_extraction_info() -> ExtractionInfo {
    let poppler_present = has_poppler();
    let tesseract_present = has_tesseract();

    ExtractionInfo {
        method: ExtractionMethod::None,
        has_poppler: poppler_present,
        has_tesseract: tesseract_present,
        has_lopdf: true,
    }
}
/// Runs `pdftotext <path> -` and returns whatever it wrote to stdout.
///
/// # Errors
///
/// Returns `PdfExtractError::ExtractionFailed` when the process cannot be
/// started (carrying the spawn error text) or when it exits unsuccessfully
/// (carrying its stderr output). Both streams are decoded lossily as UTF-8.
fn extract_with_pdftotext(path: &Path) -> Result<String, PdfExtractError> {
    let mut command = Command::new("pdftotext");
    // "-" as the output file makes pdftotext write the text to stdout.
    command.arg(path).arg("-");

    let finished = match command.output() {
        Ok(result) => result,
        Err(spawn_err) => {
            return Err(PdfExtractError::ExtractionFailed(spawn_err.to_string()));
        }
    };

    if finished.status.success() {
        Ok(String::from_utf8_lossy(&finished.stdout).into_owned())
    } else {
        let message = String::from_utf8_lossy(&finished.stderr).into_owned();
        Err(PdfExtractError::ExtractionFailed(message))
    }
}
/// Extracts text with the `pdf_extract` crate, falling back to the
/// `pdftotext` command when the crate yields only whitespace.
///
/// # Errors
///
/// Returns `PdfExtractError::ExtractionFailed` when `pdf_extract` reports an
/// error (no fallback is attempted in that case), or whatever error
/// `extract_with_pdftotext` returns when the fallback is used.
fn extract_with_poppler(path: &Path) -> Result<String, PdfExtractError> {
    let extracted = pdf_extract::extract_text(path)
        .map_err(|err| PdfExtractError::ExtractionFailed(err.to_string()))?;

    if !extracted.trim().is_empty() {
        return Ok(extracted);
    }

    tracing::debug!("pdf-extract returned empty, trying pdftotext");
    extract_with_pdftotext(path)
}
/// Extracts the text of every page using the pure-Rust `lopdf` crate.
///
/// # Errors
///
/// Returns `PdfExtractError::ExtractionFailed` when the document cannot be
/// loaded or when lopdf fails to extract text from the requested pages.
fn extract_with_lopdf(path: &Path) -> Result<String, PdfExtractError> {
    let doc = lopdf::Document::load(path)
        .map_err(|e| PdfExtractError::ExtractionFailed(e.to_string()))?;

    // Ask lopdf for the page numbers it actually knows about instead of
    // synthesising `1..=len` through a truncating `as u32` cast.
    // NOTE(review): assumes the keys of `get_pages()` are the page numbers
    // that `extract_text` expects — confirm against the lopdf documentation.
    let pages: Vec<u32> = doc.get_pages().keys().copied().collect();

    doc.extract_text(&pages)
        .map_err(|e| PdfExtractError::ExtractionFailed(e.to_string()))
}
pub fn extract_text(path: &Path) -> Result<(String, ExtractionMethod), PdfExtractError> {
if !path.exists() {
return Err(PdfExtractError::InvalidFile(format!(
"File not found: {}",
path.display()
)));
}
if !path.is_file() {
return Err(PdfExtractError::InvalidFile(format!(
"Not a file: {}",
path.display()
)));
}
if has_poppler() {
match extract_with_poppler(path) {
Ok(text) => {
if !text.trim().is_empty() {
return Ok((text, ExtractionMethod::Poppler));
}
tracing::debug!(
"Poppler returned empty text for {}, trying fallback",
path.display()
);
}
Err(e) => {
tracing::debug!("Poppler extraction failed: {}, trying fallback", e);
}
}
match extract_with_pdftotext(path) {
Ok(text) if !text.trim().is_empty() => return Ok((text, ExtractionMethod::Pdftotext)),
_ => {}
}
}
match extract_with_lopdf(path) {
Ok(text) if !text.trim().is_empty() => return Ok((text, ExtractionMethod::Lopdf)),
Ok(_) => {
tracing::debug!("lopdf returned empty text for {}", path.display());
}
Err(e) => {
tracing::debug!("lopdf extraction failed: {}", e);
}
}
if has_tesseract() {
tracing::debug!(
"All text extraction failed, {} might be a scanned PDF. \
Consider using tesseract for OCR.",
path.display()
);
}
Err(PdfExtractError::NotAvailable)
}
/// Convenience wrapper around `extract_text` that returns only the text and
/// discards the information about which backend produced it.
///
/// # Errors
///
/// Propagates any error returned by `extract_text` unchanged.
pub fn extract_text_simple(path: &Path) -> Result<String, PdfExtractError> {
    let (text, _method) = extract_text(path)?;
    Ok(text)
}
#[allow(dead_code)]
pub fn extract_multiple<'a, P>(paths: P) -> Vec<Result<(String, ExtractionMethod), PdfExtractError>>
where
P: IntoIterator<Item = &'a Path>,
{
paths.into_iter().map(extract_text).collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The info snapshot always reports lopdf as usable and carries the
    /// placeholder method; external tool availability depends on the host.
    #[test]
    fn test_extraction_info() {
        let info = get_extraction_info();
        assert!(info.has_lopdf);
        assert_eq!(info.method, ExtractionMethod::None);
        println!("Poppler available: {}", info.has_poppler);
        println!("Tesseract available: {}", info.has_tesseract);
    }

    /// A missing path is rejected up front with `InvalidFile`, before any
    /// backend is tried.
    #[test]
    fn test_extract_nonexistent_file() {
        let result = extract_text(Path::new("/nonexistent/file.pdf"));
        assert!(matches!(result, Err(PdfExtractError::InvalidFile(_))));
    }

    /// The simple wrapper surfaces the same `InvalidFile` error.
    #[test]
    fn test_extract_simple_nonexistent() {
        let result = extract_text_simple(Path::new("/nonexistent/file.pdf"));
        assert!(matches!(result, Err(PdfExtractError::InvalidFile(_))));
    }
}