lean_ctx/core/web/
pdf.rs

//! PDF → text extraction for the research context layer.
//!
//! Delegates to the `pdf-extract` crate. Because PDF parsers can panic on
//! malformed or unusual input — and `ctx_url_read` accepts arbitrary
//! agent-supplied URLs — extraction is wrapped in [`std::panic::catch_unwind`]
//! so a bad document yields an error instead of taking down the handler.
7
8/// Extract and normalize the text content of a PDF byte buffer.
9pub fn extract_text(bytes: &[u8]) -> Result<String, String> {
10    if !looks_like_pdf(bytes) {
11        return Err("response is not a PDF (missing %PDF header)".to_string());
12    }
13
14    let outcome = std::panic::catch_unwind(|| pdf_extract::extract_text_from_mem(bytes));
15    match outcome {
16        Ok(Ok(text)) => {
17            let normalized = normalize(&text);
18            if normalized.trim().is_empty() {
19                Err("PDF contained no extractable text (likely scanned/image-only)".to_string())
20            } else {
21                Ok(normalized)
22            }
23        }
24        Ok(Err(e)) => Err(format!("PDF text extraction failed: {e}")),
25        Err(_) => Err("PDF text extraction panicked (malformed or unsupported PDF)".to_string()),
26    }
27}
28
/// Report whether the `%PDF-` marker occurs within the first 1024 bytes.
///
/// The marker does not have to sit at offset 0, so a preamble (BOM,
/// whitespace, ...) in front of it is tolerated as long as the whole marker
/// still fits inside that 1024-byte prefix. Inputs shorter than the marker
/// never match.
pub fn looks_like_pdf(bytes: &[u8]) -> bool {
    const MAGIC: &[u8] = b"%PDF-";
    const SCAN_LIMIT: usize = 1024;

    // Fall back to the whole buffer when it is shorter than the scan limit.
    let prefix = bytes.get(..SCAN_LIMIT).unwrap_or(bytes);
    prefix.windows(MAGIC.len()).any(|chunk| chunk == MAGIC)
}
34
/// Tidy the raw text emitted by `pdf-extract`.
///
/// Every line is trimmed on both ends, each run of consecutive blank lines is
/// reduced to a single empty line, and leading/trailing whitespace of the
/// overall result is removed.
fn normalize(text: &str) -> String {
    let mut buf = String::with_capacity(text.len());
    let mut prev_blank = false;
    for content in text.lines().map(str::trim) {
        let blank = content.is_empty();
        // Only the first blank line of a run survives.
        if !(blank && prev_blank) {
            buf.push_str(content);
            buf.push('\n');
        }
        prev_blank = blank;
    }
    buf.trim().to_string()
}
56
#[cfg(test)]
mod tests {
    use super::*;

    /// Input without the `%PDF-` marker must be refused.
    #[test]
    fn rejects_non_pdf_bytes() {
        let outcome = extract_text(b"<html>not a pdf</html>");
        assert!(outcome.is_err());
    }

    /// The header check accepts a PDF preamble and rejects ordinary text.
    #[test]
    fn detects_pdf_header() {
        let pdf: &[u8] = b"%PDF-1.7\n...";
        let not_pdf: &[u8] = b"plain text";
        assert!(looks_like_pdf(pdf));
        assert!(!looks_like_pdf(not_pdf));
    }

    /// Blank-line runs collapse to one empty line and lines are trimmed.
    #[test]
    fn normalize_collapses_blank_runs() {
        let cases = [
            ("a\n\n\n\nb\n\n", "a\n\nb"),
            ("  x  \n   \n y ", "x\n\ny"),
        ];
        for (raw, expected) in cases.iter() {
            assert_eq!(normalize(raw), *expected);
        }
    }
}
lean_ctx/core/web/pdf.rs

lean_ctx/core/web/
pdf.rs