Skip to main content

research_master/utils/
pdf.rs

//! PDF text extraction utilities with fallback support.
//!
//! This module extracts text from PDFs and falls back automatically:
//! 1. If the poppler `pdftotext` tool is detected, it first tries the
//!    `pdf-extract` crate and then the `pdftotext` binary (best quality).
//! 2. Otherwise, or if those yield no text, it falls back to pure Rust lopdf.
//!
//! For scanned/image-based PDFs, tesseract OCR availability is detected, but
//! this module does not run OCR itself; it only reports that OCR may help.
8
9use std::path::Path;
10use std::process::Command;
11use std::sync::atomic::{AtomicBool, Ordering};
12use thiserror::Error;
13
14/// Errors that can occur during PDF extraction
15#[derive(Debug, Error)]
16pub enum PdfExtractError {
17    #[error("PDF extraction failed: {0}")]
18    ExtractionFailed(String),
19
20    #[error("File not found or not a valid PDF: {0}")]
21    InvalidFile(String),
22
23    #[error("IO error: {0}")]
24    Io(#[from] std::io::Error),
25
26    #[error("No text extraction method available")]
27    NotAvailable,
28}
29
/// Identifies which backend produced (or would produce) extracted PDF text.
///
/// The type is a plain fieldless enum, so it derives `Eq` and `Hash` in
/// addition to `PartialEq`; this lets callers use it as a map/set key (for
/// example to count how many documents each backend handled).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ExtractionMethod {
    /// Text came from the primary "poppler" extraction path (best quality)
    Poppler,
    /// Text came from the pure Rust lopdf backend
    Lopdf,
    /// Text came from running the external `pdftotext` binary directly
    Pdftotext,
    /// Text came from tesseract OCR
    Tesseract,
    /// No method available
    None,
}
44
/// Reports whether the external program `name` can be started and answers a
/// version query successfully.
///
/// The program is run with `--version` first and, if that does not exit with a
/// success status, with `-v`. The second probe matters for tools such as
/// poppler's `pdftotext`, whose documented version flag is `-v`; probing only
/// with `--version` can report such a tool as missing even though it is
/// installed.
///
/// Returns `false` when the process cannot be spawned at all (for example
/// because no such binary exists) or when neither probe exits successfully.
/// Output of the probed program is captured and discarded.
fn is_external_available(name: &str) -> bool {
    const VERSION_FLAGS: [&str; 2] = ["--version", "-v"];

    for flag in VERSION_FLAGS.iter() {
        match Command::new(name).arg(flag).output() {
            Ok(probe) if probe.status.success() => return true,
            // The tool ran but rejected this flag; try the next spelling.
            Ok(_) => {}
            // The tool could not be started, so a different flag cannot help.
            Err(_) => return false,
        }
    }

    false
}
53
54/// Check if poppler utilities are available
55pub fn has_poppler() -> bool {
56    static POPPLER_CHECK: AtomicBool = AtomicBool::new(false);
57    static HAS_CHECKED: AtomicBool = AtomicBool::new(false);
58
59    if HAS_CHECKED.load(Ordering::Relaxed) {
60        return POPPLER_CHECK.load(Ordering::Relaxed);
61    }
62
63    let available = is_external_available("pdftotext");
64    POPPLER_CHECK.store(available, Ordering::Relaxed);
65    HAS_CHECKED.store(true, Ordering::Relaxed);
66
67    available
68}
69
70/// Check if tesseract OCR is available
71pub fn has_tesseract() -> bool {
72    static TESSERACT_CHECK: AtomicBool = AtomicBool::new(false);
73    static HAS_CHECKED: AtomicBool = AtomicBool::new(false);
74
75    if HAS_CHECKED.load(Ordering::Relaxed) {
76        return TESSERACT_CHECK.load(Ordering::Relaxed);
77    }
78
79    let available = is_external_available("tesseract");
80    TESSERACT_CHECK.store(available, Ordering::Relaxed);
81    HAS_CHECKED.store(true, Ordering::Relaxed);
82
83    available
84}
85
86/// Print installation instructions for PDF extraction tools
87#[allow(dead_code)]
88pub fn print_tool_instructions() {
89    let has_poppler = is_external_available("pdftotext");
90    let has_tesseract = is_external_available("tesseract");
91
92    if !has_poppler || !has_tesseract {
93        eprintln!("\nPDF extraction tools info:");
94
95        if !has_poppler {
96            eprintln!("  - pdftotext: NOT FOUND");
97            #[cfg(windows)]
98            eprintln!(
99                "    Install from: https://github.com/oschwartz10612/poppler-windows/releases/"
100            );
101            #[cfg(not(windows))]
102            eprintln!("    Install with: brew install poppler (macOS) or apt install poppler-utils (Linux)");
103        }
104
105        if !has_tesseract {
106            eprintln!("  - tesseract OCR: NOT FOUND");
107            #[cfg(windows)]
108            eprintln!("    Install from: https://github.com/UB-Mannheim/tesseract/wiki");
109            #[cfg(not(windows))]
110            eprintln!("    Install with: brew install tesseract (macOS) or apt install tesseract-ocr (Linux)");
111        }
112
113        if has_poppler && !has_tesseract {
114            eprintln!("\nNote: Basic PDF text extraction will work via poppler.");
115            eprintln!("OCR is only needed for scanned/image-based PDFs.");
116        } else if !has_poppler {
117            eprintln!("\nNote: Falling back to pure Rust lopdf for basic PDF extraction.");
118            eprintln!("Quality may be reduced for complex PDFs.");
119        }
120    }
121}
122
123/// Get the best available extraction method with metadata
124#[derive(Debug, Clone)]
125pub struct ExtractionInfo {
126    pub method: ExtractionMethod,
127    pub has_poppler: bool,
128    pub has_tesseract: bool,
129    pub has_lopdf: bool,
130}
131
132/// Get information about available PDF extraction methods
133pub fn get_extraction_info() -> ExtractionInfo {
134    ExtractionInfo {
135        method: ExtractionMethod::None,
136        has_poppler: has_poppler(),
137        has_tesseract: has_tesseract(),
138        has_lopdf: true, // lopdf is always available as a Rust crate
139    }
140}
141
142/// Try to extract text using pdftotext external binary
143fn extract_with_pdftotext(path: &Path) -> Result<String, PdfExtractError> {
144    let output = Command::new("pdftotext")
145        .arg(path)
146        .arg("-")
147        .output()
148        .map_err(|e| PdfExtractError::ExtractionFailed(e.to_string()))?;
149
150    if !output.status.success() {
151        return Err(PdfExtractError::ExtractionFailed(
152            String::from_utf8_lossy(&output.stderr).to_string(),
153        ));
154    }
155
156    Ok(String::from_utf8_lossy(&output.stdout).to_string())
157}
158
159/// Try to extract text using poppler via pdf-extract crate
160fn extract_with_poppler(path: &Path) -> Result<String, PdfExtractError> {
161    match pdf_extract::extract_text(path) {
162        Ok(text) if text.trim().is_empty() => {
163            // Try pdftotext as fallback within poppler
164            tracing::debug!("pdf-extract returned empty, trying pdftotext");
165            extract_with_pdftotext(path)
166        }
167        Ok(text) => Ok(text),
168        Err(e) => Err(PdfExtractError::ExtractionFailed(e.to_string())),
169    }
170}
171
172/// Try to extract text using pure Rust lopdf
173fn extract_with_lopdf(path: &Path) -> Result<String, PdfExtractError> {
174    let doc = lopdf::Document::load(path)
175        .map_err(|e| PdfExtractError::ExtractionFailed(e.to_string()))?;
176
177    let pages: Vec<u32> = (1..=doc.get_pages().len() as u32).collect();
178    let text = doc
179        .extract_text(&pages)
180        .map_err(|e| PdfExtractError::ExtractionFailed(e.to_string()))?;
181
182    Ok(text)
183}
184
185/// Extract text from a PDF file using the best available method.
186///
187/// Returns the extracted text content and the method used.
188///
189/// # Arguments
190///
191/// * `path` - Path to the PDF file
192///
193/// # Returns
194///
195/// A tuple of (extracted text, extraction method used)
196pub fn extract_text(path: &Path) -> Result<(String, ExtractionMethod), PdfExtractError> {
197    // Check file exists and is a file
198    if !path.exists() {
199        return Err(PdfExtractError::InvalidFile(format!(
200            "File not found: {}",
201            path.display()
202        )));
203    }
204
205    if !path.is_file() {
206        return Err(PdfExtractError::InvalidFile(format!(
207            "Not a file: {}",
208            path.display()
209        )));
210    }
211
212    // Priority 1: Try poppler libraries first (best quality)
213    if has_poppler() {
214        match extract_with_poppler(path) {
215            Ok(text) => {
216                if !text.trim().is_empty() {
217                    return Ok((text, ExtractionMethod::Poppler));
218                }
219                // Empty text from poppler, continue to fallback
220                tracing::debug!(
221                    "Poppler returned empty text for {}, trying fallback",
222                    path.display()
223                );
224            }
225            Err(e) => {
226                tracing::debug!("Poppler extraction failed: {}, trying fallback", e);
227            }
228        }
229
230        // Try pdftotext directly as secondary poppler method
231        match extract_with_pdftotext(path) {
232            Ok(text) if !text.trim().is_empty() => return Ok((text, ExtractionMethod::Pdftotext)),
233            _ => {}
234        }
235    }
236
237    // Priority 2: Try pure Rust lopdf
238    match extract_with_lopdf(path) {
239        Ok(text) if !text.trim().is_empty() => return Ok((text, ExtractionMethod::Lopdf)),
240        Ok(_) => {
241            tracing::debug!("lopdf returned empty text for {}", path.display());
242        }
243        Err(e) => {
244            tracing::debug!("lopdf extraction failed: {}", e);
245        }
246    }
247
248    // Priority 3: If tesseract is available, try OCR
249    if has_tesseract() {
250        tracing::debug!(
251            "All text extraction failed, {} might be a scanned PDF. \
252             Consider using tesseract for OCR.",
253            path.display()
254        );
255    }
256
257    Err(PdfExtractError::NotAvailable)
258}
259
260/// Extract text from a PDF file (legacy interface, discards method info)
261pub fn extract_text_simple(path: &Path) -> Result<String, PdfExtractError> {
262    extract_text(path).map(|(text, _)| text)
263}
264
265/// Extract text from multiple PDF files and combine results.
266#[allow(dead_code)]
267pub fn extract_multiple<'a, P>(paths: P) -> Vec<Result<(String, ExtractionMethod), PdfExtractError>>
268where
269    P: IntoIterator<Item = &'a Path>,
270{
271    paths.into_iter().map(extract_text).collect()
272}
273
#[cfg(test)]
mod tests {
    use super::*;

    /// lopdf is compiled into the crate, so it must always be reported as
    /// available; the external tools depend on the host and are only printed.
    #[test]
    fn test_extraction_info() {
        let info = get_extraction_info();
        // Should have at least lopdf available
        assert!(info.has_lopdf);
        // Poppler and tesseract depend on system installation
        println!("Poppler available: {}", info.has_poppler);
        println!("Tesseract available: {}", info.has_tesseract);
    }

    /// A missing file must be rejected up front as `InvalidFile`, not merely
    /// fail for some other reason further down the backend chain.
    #[test]
    fn test_extract_nonexistent_file() {
        let result = extract_text(Path::new("/nonexistent/file.pdf"));
        assert!(matches!(result, Err(PdfExtractError::InvalidFile(_))));
    }

    /// The legacy wrapper must pass the same `InvalidFile` error through.
    #[test]
    fn test_extract_simple_nonexistent() {
        let result = extract_text_simple(Path::new("/nonexistent/file.pdf"));
        assert!(matches!(result, Err(PdfExtractError::InvalidFile(_))));
    }
}