vela_protocol/ingest.rs
1//! PDF text extraction utility used by the modern Scout agent.
2//!
3//! The pre-v0.22 file-based ingestion command (`vela ingest --pdf/--csv/...`)
4//! lived here together with this function in v0.0–v0.35. With the agent
5//! inbox (Scout, Notes Compiler, Code Analyst, Datasets) fully replacing
6//! that path in v0.32+, the legacy command was removed in v0.36 and the
7//! file collapsed to this single utility.
8//!
9//! Kept under `vela_protocol::ingest::extract_pdf_text` for backward
10//! compatibility with Scout's import; the function is otherwise unrelated
11//! to ingestion and could move to `sources.rs` in a later refactor.
12
13use std::path::Path;
14
/// Extract plain text from a PDF.
///
/// Tries `pdftotext` (poppler-utils) first, asking it to write to stdout
/// (`-`). If the tool is missing, exits unsuccessfully, or prints only
/// whitespace, falls back to a crude scan of the raw file bytes that keeps
/// long runs of printable ASCII. The fallback does not understand PDF
/// structure, so its output is best-effort.
///
/// # Errors
///
/// Returns `Err` with a human-readable message when `pdftotext` produced no
/// usable text and either
/// - the file cannot be read (`"Failed to read PDF file: ..."`), or
/// - the raw-byte scan finds no non-whitespace text.
pub fn extract_pdf_text(path: &Path) -> Result<String, String> {
    // A spawn failure (e.g. poppler not installed) and a non-zero exit status
    // are both treated as "no result" so the fallback still gets its chance.
    let pdftotext_output = std::process::Command::new("pdftotext")
        .arg(path)
        .arg("-")
        .output()
        .ok()
        .filter(|output| output.status.success());

    if let Some(output) = pdftotext_output {
        // Lossy decoding: invalid UTF-8 in the tool's output is replaced
        // rather than turned into an error.
        let text = String::from_utf8_lossy(&output.stdout).into_owned();
        if !text.trim().is_empty() {
            return Ok(text);
        }
    }

    // Fallback: read raw bytes and extract printable text runs.
    let bytes = std::fs::read(path).map_err(|e| format!("Failed to read PDF file: {e}"))?;
    let text = extract_ascii_runs(&bytes);

    if text.trim().is_empty() {
        return Err(
            "Could not extract text from PDF. Install pdftotext for better results.".into(),
        );
    }
    Ok(text)
}

/// Minimum length, in bytes, of a printable run kept by the raw-byte fallback.
const MIN_TEXT_RUN_LEN: usize = 20;

/// Collect every run of printable ASCII (graphic characters, space, `\n`,
/// `\t`) that is at least [`MIN_TEXT_RUN_LEN`] bytes long.
///
/// A run that was ended by a non-text byte is followed by `'\n'` in the
/// output; a run that reaches the end of `bytes` is appended without one.
fn extract_ascii_runs(bytes: &[u8]) -> String {
    let is_text_byte = |b: &u8| b.is_ascii_graphic() || matches!(*b, b' ' | b'\n' | b'\t');

    let mut text = String::new();
    // Splitting on non-text bytes yields the candidate runs; adjacent
    // delimiters produce empty segments, which the length check discards.
    let mut runs = bytes.split(|b| !is_text_byte(b)).peekable();
    while let Some(run) = runs.next() {
        if run.len() < MIN_TEXT_RUN_LEN {
            continue;
        }
        // Every byte in `run` is ASCII, so a byte-to-char conversion is exact.
        text.extend(run.iter().map(|&b| char::from(b)));
        // Another segment after this one means the run was cut off by a
        // non-text byte rather than by the end of the input.
        if runs.peek().is_some() {
            text.push('\n');
        }
    }
    text
}
62
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// A path that does not exist must be reported as `Err`: `pdftotext`
    /// cannot produce text for it and the raw-byte fallback cannot read it.
    #[test]
    fn extract_pdf_text_handles_missing_file() {
        let missing = Path::new("/nonexistent/file.pdf");
        let outcome = extract_pdf_text(missing);
        assert!(outcome.is_err());
    }
}
73}