leann-core 0.2.3

LEANN is a revolutionary vector database that democratizes personal AI. Transform your laptop into a powerful RAG system that can index and search through millions of documents while using 97% less storage than traditional solutions without accuracy loss.
Documentation
//! PDF text extraction using the `pdf-extract` crate.

use anyhow::{Context, Result};
use std::path::Path;

/// Extract all text content from a PDF file.
///
/// Concatenates text from all pages, separated by newlines.
/// Returns an error if the file cannot be opened or parsed.
pub fn extract_pdf_text(path: &Path) -> Result<String> {
    let bytes =
        std::fs::read(path).with_context(|| format!("reading PDF file: {}", path.display()))?;
    let text = pdf_extract::extract_text_from_mem(&bytes)
        .with_context(|| format!("extracting text from PDF: {}", path.display()))?;
    Ok(text)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// A path that does not exist must yield an error from the read step.
    #[test]
    fn test_extract_nonexistent_pdf() {
        // Build the path inside a fresh temporary directory: the file is then
        // guaranteed to be absent, and the test no longer relies on a
        // Unix-style `/tmp` being present.
        let dir = tempfile::tempdir().unwrap();
        let path: PathBuf = dir.path().join("nonexistent_test_file.pdf");

        let err = extract_pdf_text(&path).unwrap_err();
        // The outermost context identifies which stage failed; a missing file
        // must be reported by the read, not by PDF parsing.
        assert!(
            err.to_string().contains("reading PDF file"),
            "Unexpected error: {}",
            err
        );
    }

    /// Content that is not a PDF must be rejected with an error.
    #[test]
    fn test_extract_invalid_pdf() {
        let scratch = tempfile::tempdir().unwrap();
        let bogus = scratch.path().join("fake.pdf");
        // The file carries a `.pdf` name but holds plain text only.
        std::fs::write(&bogus, b"this is not a PDF").unwrap();

        assert!(extract_pdf_text(&bogus).is_err());
    }

    /// Best-effort check against a hand-written, single-page PDF.
    ///
    /// Extraction is allowed to fail; the test only constrains the text when
    /// extraction succeeds.
    #[test]
    fn test_extract_minimal_pdf() {
        let scratch = tempfile::tempdir().unwrap();
        let pdf_path = scratch.path().join("minimal.pdf");

        // Hand-crafted PDF 1.0 document: one page whose content stream draws
        // the string "Hello World" in Helvetica.
        let minimal_pdf = b"%PDF-1.0
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj

2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj

3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792]
   /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>
endobj

4 0 obj
<< /Length 44 >>
stream
BT /F1 12 Tf 100 700 Td (Hello World) Tj ET
endstream
endobj

5 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj

xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000266 00000 n
0000000360 00000 n

trailer
<< /Size 6 /Root 1 0 R >>
startxref
441
%%EOF";

        std::fs::write(&pdf_path, minimal_pdf).unwrap();

        // Whether pdf-extract accepts this hand-made file may vary by version,
        // so an `Err` is tolerated; the call itself must simply not panic.
        if let Ok(extracted) = extract_pdf_text(&pdf_path) {
            // On success the output must either be empty or include at least
            // one of the two words drawn on the page.
            let has_hello = extracted.contains("Hello");
            let has_world = extracted.contains("World");
            assert!(
                has_hello || has_world || extracted.is_empty(),
                "Unexpected text: {}",
                extracted
            );
        }
    }
}