Skip to main content

vela_protocol/
ingest.rs

1//! PDF text extraction utility used by the modern Scout agent.
2//!
3//! The pre-v0.22 file-based ingestion command (`vela ingest --pdf/--csv/...`)
4//! lived here together with this function in v0.0–v0.35. With the agent
5//! inbox (Scout, Notes Compiler, Code Analyst, Datasets) fully replacing
6//! that path in v0.32+, the legacy command was removed in v0.36 and the
7//! file collapsed to this single utility.
8//!
9//! Kept under `vela_protocol::ingest::extract_pdf_text` for backward
10//! compatibility with Scout's import; the function is otherwise unrelated
11//! to ingestion and could move to `sources.rs` in a later refactor.
12
13use std::path::Path;
14
/// Extract plain text from a PDF.
///
/// Two strategies are tried in order:
///
/// 1. Run `pdftotext` (poppler-utils) with `-` as the output argument so the
///    text is written to stdout. The result is used only when the process
///    could be spawned, exited successfully, and printed something other
///    than whitespace.
/// 2. Otherwise read the raw file bytes and keep every run of printable
///    ASCII that is at least 20 characters long. This is crude, but it
///    keeps the function usable in environments without poppler.
///
/// # Errors
///
/// Returns `Err` when the fallback cannot read `path`, or when neither
/// strategy produces non-whitespace text.
pub fn extract_pdf_text(path: &Path) -> Result<String, String> {
    // Any failure here (tool missing, non-zero exit, blank output) is
    // deliberately non-fatal: it just routes us to the byte-scan fallback.
    let poppler_text = std::process::Command::new("pdftotext")
        .arg(path)
        .arg("-")
        .output()
        .ok()
        .filter(|output| output.status.success())
        .map(|output| String::from_utf8_lossy(&output.stdout).into_owned())
        .filter(|text| !text.trim().is_empty());
    if let Some(text) = poppler_text {
        return Ok(text);
    }

    // Fallback: read raw bytes and extract printable text runs.
    let bytes = std::fs::read(path).map_err(|e| format!("Failed to read PDF file: {e}"))?;
    let text = extract_ascii_runs(&bytes);

    if text.trim().is_empty() {
        return Err(
            "Could not extract text from PDF. Install pdftotext for better results.".into(),
        );
    }
    Ok(text)
}

/// Collect the printable-ASCII runs of `bytes` that are long enough to be
/// plausible text, one run per line.
///
/// A run is a maximal sequence of ASCII graphic characters, spaces, tabs and
/// line breaks. CRLF and bare CR line endings are normalised to `\n` so that
/// a file's line-ending convention does not change which text survives the
/// length filter: if `\r` terminated the run, every short line of a
/// CRLF-delimited file would be discarded individually.
fn extract_ascii_runs(bytes: &[u8]) -> String {
    /// Runs shorter than this are treated as binary noise and dropped.
    const MIN_RUN_LEN: usize = 20;

    let mut text = String::new();
    let mut run = String::new();
    let mut iter = bytes.iter().copied().peekable();

    while let Some(b) = iter.next() {
        if b == b'\r' {
            // For CRLF the following `\n` is pushed on the next iteration;
            // a bare CR becomes a line break of its own.
            if iter.peek() != Some(&b'\n') {
                run.push('\n');
            }
        } else if b.is_ascii_graphic() || matches!(b, b' ' | b'\n' | b'\t') {
            run.push(char::from(b));
        } else {
            if run.len() >= MIN_RUN_LEN {
                text.push_str(&run);
                text.push('\n');
            }
            run.clear();
        }
    }
    // A run that reaches end-of-file is kept without a trailing separator.
    if run.len() >= MIN_RUN_LEN {
        text.push_str(&run);
    }
    text
}
62
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// A path that does not exist must surface as an `Err` rather than a
    /// panic or an empty `Ok`.
    #[test]
    fn extract_pdf_text_handles_missing_file() {
        let missing = Path::new("/nonexistent/file.pdf");
        let outcome = extract_pdf_text(missing);
        assert!(outcome.is_err());
    }
}