research_master/utils/
pdf.rs1use std::path::Path;
10use std::process::Command;
11use std::sync::atomic::{AtomicBool, Ordering};
12use thiserror::Error;
13
/// Errors produced while extracting text from a PDF file.
///
/// The `Display` text of each variant comes from its `#[error]` attribute.
#[derive(Debug, Error)]
pub enum PdfExtractError {
    /// An extraction backend was invoked but failed. The payload is the
    /// backend's error text, or the captured stderr when `pdftotext` exits
    /// unsuccessfully.
    #[error("PDF extraction failed: {0}")]
    ExtractionFailed(String),

    /// The given path does not exist or is not a regular file. The payload is
    /// a human-readable description that includes the path.
    #[error("File not found or not a valid PDF: {0}")]
    InvalidFile(String),

    /// An underlying I/O error. `#[from]` provides the conversion from
    /// `std::io::Error`, so `?` can be used on I/O results.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Returned by `extract_text` when none of the backends it tried yielded
    /// non-empty text.
    #[error("No text extraction method available")]
    NotAvailable,
}
29
/// Identifies which backend produced (or would produce) extracted text.
///
/// The enum is a plain tag with no payload, so it is `Copy` and can be used
/// as a key in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ExtractionMethod {
    /// Text came from the `extract_with_poppler` path.
    Poppler,
    /// Text came from the pure-Rust `lopdf` backend.
    Lopdf,
    /// Text came from running the external `pdftotext` program directly.
    Pdftotext,
    /// OCR via tesseract. No code in this file produces this variant.
    Tesseract,
    /// No method selected; used as the placeholder in `get_extraction_info`.
    None,
}
44
/// Reports whether the external program `name` can be launched.
///
/// The program is started with a single `--version` argument and its output
/// is captured and discarded. Only the ability to start the process is
/// considered; the exit status is deliberately ignored. Not every tool
/// accepts `--version` (poppler's `pdftotext` documents `-v` instead), and
/// requiring a zero exit status would report such a tool as missing even
/// though it is installed.
///
/// Returns `false` when the process cannot be spawned, e.g. because the
/// program is not on `PATH`.
fn is_external_available(name: &str) -> bool {
    Command::new(name).arg("--version").output().is_ok()
}
53
54pub fn has_poppler() -> bool {
56 static POPPLER_CHECK: AtomicBool = AtomicBool::new(false);
57 static HAS_CHECKED: AtomicBool = AtomicBool::new(false);
58
59 if HAS_CHECKED.load(Ordering::Relaxed) {
60 return POPPLER_CHECK.load(Ordering::Relaxed);
61 }
62
63 let available = is_external_available("pdftotext");
64 POPPLER_CHECK.store(available, Ordering::Relaxed);
65 HAS_CHECKED.store(true, Ordering::Relaxed);
66
67 available
68}
69
70pub fn has_tesseract() -> bool {
72 static TESSERACT_CHECK: AtomicBool = AtomicBool::new(false);
73 static HAS_CHECKED: AtomicBool = AtomicBool::new(false);
74
75 if HAS_CHECKED.load(Ordering::Relaxed) {
76 return TESSERACT_CHECK.load(Ordering::Relaxed);
77 }
78
79 let available = is_external_available("tesseract");
80 TESSERACT_CHECK.store(available, Ordering::Relaxed);
81 HAS_CHECKED.store(true, Ordering::Relaxed);
82
83 available
84}
85
86#[allow(dead_code)]
88pub fn print_tool_instructions() {
89 let has_poppler = is_external_available("pdftotext");
90 let has_tesseract = is_external_available("tesseract");
91
92 if !has_poppler || !has_tesseract {
93 eprintln!("\nPDF extraction tools info:");
94
95 if !has_poppler {
96 eprintln!(" - pdftotext: NOT FOUND");
97 #[cfg(windows)]
98 eprintln!(
99 " Install from: https://github.com/oschwartz10612/poppler-windows/releases/"
100 );
101 #[cfg(not(windows))]
102 eprintln!(" Install with: brew install poppler (macOS) or apt install poppler-utils (Linux)");
103 }
104
105 if !has_tesseract {
106 eprintln!(" - tesseract OCR: NOT FOUND");
107 #[cfg(windows)]
108 eprintln!(" Install from: https://github.com/UB-Mannheim/tesseract/wiki");
109 #[cfg(not(windows))]
110 eprintln!(" Install with: brew install tesseract (macOS) or apt install tesseract-ocr (Linux)");
111 }
112
113 if has_poppler && !has_tesseract {
114 eprintln!("\nNote: Basic PDF text extraction will work via poppler.");
115 eprintln!("OCR is only needed for scanned/image-based PDFs.");
116 } else if !has_poppler {
117 eprintln!("\nNote: Falling back to pure Rust lopdf for basic PDF extraction.");
118 eprintln!("Quality may be reduced for complex PDFs.");
119 }
120 }
121}
122
/// Snapshot of which extraction backends are available.
#[derive(Debug, Clone)]
pub struct ExtractionInfo {
    /// Selected extraction method; `get_extraction_info` sets this to
    /// `ExtractionMethod::None`.
    pub method: ExtractionMethod,
    /// Result of `has_poppler()` when the snapshot was taken.
    pub has_poppler: bool,
    /// Result of `has_tesseract()` when the snapshot was taken.
    pub has_tesseract: bool,
    /// Whether the lopdf backend is usable; `get_extraction_info` always
    /// reports `true`.
    pub has_lopdf: bool,
}
131
132pub fn get_extraction_info() -> ExtractionInfo {
134 ExtractionInfo {
135 method: ExtractionMethod::None,
136 has_poppler: has_poppler(),
137 has_tesseract: has_tesseract(),
138 has_lopdf: true, }
140}
141
142fn extract_with_pdftotext(path: &Path) -> Result<String, PdfExtractError> {
144 let output = Command::new("pdftotext")
145 .arg(path)
146 .arg("-")
147 .output()
148 .map_err(|e| PdfExtractError::ExtractionFailed(e.to_string()))?;
149
150 if !output.status.success() {
151 return Err(PdfExtractError::ExtractionFailed(
152 String::from_utf8_lossy(&output.stderr).to_string(),
153 ));
154 }
155
156 Ok(String::from_utf8_lossy(&output.stdout).to_string())
157}
158
159fn extract_with_poppler(path: &Path) -> Result<String, PdfExtractError> {
161 match pdf_extract::extract_text(path) {
162 Ok(text) if text.trim().is_empty() => {
163 tracing::debug!("pdf-extract returned empty, trying pdftotext");
165 extract_with_pdftotext(path)
166 }
167 Ok(text) => Ok(text),
168 Err(e) => Err(PdfExtractError::ExtractionFailed(e.to_string())),
169 }
170}
171
172fn extract_with_lopdf(path: &Path) -> Result<String, PdfExtractError> {
174 let doc = lopdf::Document::load(path)
175 .map_err(|e| PdfExtractError::ExtractionFailed(e.to_string()))?;
176
177 let pages: Vec<u32> = (1..=doc.get_pages().len() as u32).collect();
178 let text = doc
179 .extract_text(&pages)
180 .map_err(|e| PdfExtractError::ExtractionFailed(e.to_string()))?;
181
182 Ok(text)
183}
184
185pub fn extract_text(path: &Path) -> Result<(String, ExtractionMethod), PdfExtractError> {
197 if !path.exists() {
199 return Err(PdfExtractError::InvalidFile(format!(
200 "File not found: {}",
201 path.display()
202 )));
203 }
204
205 if !path.is_file() {
206 return Err(PdfExtractError::InvalidFile(format!(
207 "Not a file: {}",
208 path.display()
209 )));
210 }
211
212 if has_poppler() {
214 match extract_with_poppler(path) {
215 Ok(text) => {
216 if !text.trim().is_empty() {
217 return Ok((text, ExtractionMethod::Poppler));
218 }
219 tracing::debug!(
221 "Poppler returned empty text for {}, trying fallback",
222 path.display()
223 );
224 }
225 Err(e) => {
226 tracing::debug!("Poppler extraction failed: {}, trying fallback", e);
227 }
228 }
229
230 match extract_with_pdftotext(path) {
232 Ok(text) if !text.trim().is_empty() => return Ok((text, ExtractionMethod::Pdftotext)),
233 _ => {}
234 }
235 }
236
237 match extract_with_lopdf(path) {
239 Ok(text) if !text.trim().is_empty() => return Ok((text, ExtractionMethod::Lopdf)),
240 Ok(_) => {
241 tracing::debug!("lopdf returned empty text for {}", path.display());
242 }
243 Err(e) => {
244 tracing::debug!("lopdf extraction failed: {}", e);
245 }
246 }
247
248 if has_tesseract() {
250 tracing::debug!(
251 "All text extraction failed, {} might be a scanned PDF. \
252 Consider using tesseract for OCR.",
253 path.display()
254 );
255 }
256
257 Err(PdfExtractError::NotAvailable)
258}
259
260pub fn extract_text_simple(path: &Path) -> Result<String, PdfExtractError> {
262 extract_text(path).map(|(text, _)| text)
263}
264
265#[allow(dead_code)]
267pub fn extract_multiple<'a, P>(paths: P) -> Vec<Result<(String, ExtractionMethod), PdfExtractError>>
268where
269 P: IntoIterator<Item = &'a Path>,
270{
271 paths.into_iter().map(extract_text).collect()
272}
273
#[cfg(test)]
mod tests {
    use super::*;

    /// lopdf is always reported as available; the external tools depend on
    /// the machine, so their status is only printed.
    #[test]
    fn test_extraction_info() {
        let info = get_extraction_info();
        assert!(info.has_lopdf);
        println!("Poppler available: {}", info.has_poppler);
        println!("Tesseract available: {}", info.has_tesseract);
    }

    /// A missing path must be rejected as `InvalidFile` before any backend
    /// is tried, not merely fail with some error.
    #[test]
    fn test_extract_nonexistent_file() {
        let result = extract_text(Path::new("/nonexistent/file.pdf"));
        assert!(matches!(result, Err(PdfExtractError::InvalidFile(_))));
    }

    /// The simple wrapper propagates the same `InvalidFile` error.
    #[test]
    fn test_extract_simple_nonexistent() {
        let result = extract_text_simple(Path::new("/nonexistent/file.pdf"));
        assert!(matches!(result, Err(PdfExtractError::InvalidFile(_))));
    }
}