1pub fn extract_text(bytes: &[u8]) -> Result<String, String> {
10 if !looks_like_pdf(bytes) {
11 return Err("response is not a PDF (missing %PDF header)".to_string());
12 }
13
14 let outcome = std::panic::catch_unwind(|| pdf_extract::extract_text_from_mem(bytes));
15 match outcome {
16 Ok(Ok(text)) => {
17 let normalized = normalize(&text);
18 if normalized.trim().is_empty() {
19 Err("PDF contained no extractable text (likely scanned/image-only)".to_string())
20 } else {
21 Ok(normalized)
22 }
23 }
24 Ok(Err(e)) => Err(format!("PDF text extraction failed: {e}")),
25 Err(_) => Err("PDF text extraction panicked (malformed or unsupported PDF)".to_string()),
26 }
27}
28
/// Reports whether `bytes` contains the `%PDF-` marker near its start.
///
/// Only the first 1024 bytes are inspected, and the marker must lie entirely
/// inside that prefix. The marker does not have to sit at offset 0, so bytes
/// preceding it are tolerated. Inputs shorter than the marker yield `false`.
pub fn looks_like_pdf(bytes: &[u8]) -> bool {
    const MAGIC: &[u8] = b"%PDF-";
    const SCAN_LIMIT: usize = 1024;

    let prefix = &bytes[..bytes.len().min(SCAN_LIMIT)];
    // Try every start offset in the prefix; `starts_with` is false whenever
    // fewer than `MAGIC.len()` bytes remain, so tails near the end never match.
    (0..prefix.len()).any(|start| prefix[start..].starts_with(MAGIC))
}
34
/// Tidies extracted text line by line.
///
/// Every line is stripped of leading and trailing whitespace, consecutive
/// blank lines are collapsed into a single blank line, and the final result
/// is trimmed so it neither starts nor ends with whitespace.
fn normalize(text: &str) -> String {
    let mut buf = String::with_capacity(text.len());
    let mut prev_blank = false;

    for content in text.lines().map(str::trim) {
        let is_blank = content.is_empty();
        // Emit the line unless it is a second-or-later blank in a run.
        // For a blank line `content` is empty, so only the newline is written.
        if !(is_blank && prev_blank) {
            buf.push_str(content);
            buf.push('\n');
        }
        prev_blank = is_blank;
    }

    buf.trim().to_owned()
}
56
#[cfg(test)]
mod tests {
    use super::*;

    /// Bytes without a `%PDF-` marker must make `extract_text` return an error.
    #[test]
    fn rejects_non_pdf_bytes() {
        let html: &[u8] = b"<html>not a pdf</html>";
        let result = extract_text(html);
        assert!(result.is_err());
    }

    /// The marker is recognised when present and not reported when absent.
    #[test]
    fn detects_pdf_header() {
        let with_marker: &[u8] = b"%PDF-1.7\n...";
        let without_marker: &[u8] = b"plain text";
        assert!(looks_like_pdf(with_marker));
        assert!(!looks_like_pdf(without_marker));
    }

    /// Runs of blank lines collapse to one, and lines/results are trimmed.
    #[test]
    fn normalize_collapses_blank_runs() {
        let cases = [
            ("a\n\n\n\nb\n\n", "a\n\nb"),
            (" x \n \n y ", "x\n\ny"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize(input), expected);
        }
    }
}