use anyhow::{Context, Result};
use std::path::Path;
pub fn extract_pdf_text(path: &Path) -> Result<String> {
let bytes =
std::fs::read(path).with_context(|| format!("reading PDF file: {}", path.display()))?;
let text = pdf_extract::extract_text_from_mem(&bytes)
.with_context(|| format!("extracting text from PDF: {}", path.display()))?;
Ok(text)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A missing file must fail at the read stage.
    #[test]
    fn test_extract_nonexistent_pdf() {
        // Build the path inside a fresh temp dir so the test does not depend on
        // the state of a shared, platform-specific location such as `/tmp`.
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("nonexistent_test_file.pdf");

        let err = extract_pdf_text(&path).unwrap_err();

        // The outermost context identifies which step failed.
        assert!(
            err.to_string().starts_with("reading PDF file: "),
            "unexpected error: {:#}",
            err
        );
    }

    /// A readable file whose contents are not a PDF must fail at the extraction stage.
    #[test]
    fn test_extract_invalid_pdf() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("fake.pdf");
        std::fs::write(&path, b"this is not a PDF").unwrap();

        let err = extract_pdf_text(&path).unwrap_err();

        // Reading succeeded, so the failure must come from the extraction step.
        assert!(
            err.to_string().starts_with("extracting text from PDF: "),
            "unexpected error: {:#}",
            err
        );
    }

    /// Feeds a small hand-written PDF through the extractor.
    ///
    /// NOTE(review): an `Err` result is deliberately tolerated here, and so is
    /// empty text, so this test only guards against *unexpected* text being
    /// returned. Presumably the hand-written document is not guaranteed to be
    /// accepted by the parser; confirm before tightening the assertion.
    #[test]
    fn test_extract_minimal_pdf() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("minimal.pdf");
        // The literal below is part of the fixture; its bytes (including line
        // breaks and the lack of indentation) must not be reformatted.
        let pdf_bytes = b"%PDF-1.0
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]
/Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT /F1 12 Tf 100 700 Td (Hello World) Tj ET
endstream
endobj
5 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000266 00000 n
0000000360 00000 n
trailer
<< /Size 6 /Root 1 0 R >>
startxref
441
%%EOF";
        std::fs::write(&path, pdf_bytes).unwrap();

        let result = extract_pdf_text(&path);

        // Only inspect the text when extraction succeeded; see the note above.
        if let Ok(text) = result {
            assert!(
                text.contains("Hello") || text.contains("World") || text.is_empty(),
                "Unexpected text: {}",
                text
            );
        }
    }
}