Skip to main content

kbolt_core/ingest/
pdf.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
5use crate::Result;
6
7pub struct PdfExtractor;
8
9impl Extractor for PdfExtractor {
10    fn supports(&self) -> &[&str] {
11        &["pdf"]
12    }
13
14    fn profile_key(&self) -> &'static str {
15        "pdf"
16    }
17
18    fn version(&self) -> u32 {
19        2
20    }
21
22    fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
23        let text = extract_pdf_text(bytes)?;
24        let blocks = paragraph_blocks(text.as_str());
25        if blocks.is_empty() {
26            return Err(kbolt_types::KboltError::InvalidInput(
27                "pdf text extraction produced no text; scanned or image-only PDFs need OCR"
28                    .to_string(),
29            )
30            .into());
31        }
32
33        Ok(ExtractedDocument {
34            blocks,
35            metadata: HashMap::new(),
36            title: None,
37        })
38    }
39}
40
41fn extract_pdf_text(bytes: &[u8]) -> Result<String> {
42    match std::panic::catch_unwind(|| pdf_extract::extract_text_from_mem(bytes)) {
43        Ok(Ok(text)) => Ok(text),
44        Ok(Err(err)) => Err(kbolt_types::KboltError::InvalidInput(format!(
45            "pdf text extraction failed: {err}"
46        ))
47        .into()),
48        Err(_) => Err(kbolt_types::KboltError::InvalidInput(
49            "pdf text extraction failed: parser panicked".to_string(),
50        )
51        .into()),
52    }
53}
54
55fn paragraph_blocks(text: &str) -> Vec<ExtractedBlock> {
56    let mut blocks = Vec::new();
57    let mut current = String::new();
58    let mut next_offset = 0usize;
59
60    for line in text.lines() {
61        let trimmed = line.trim_end();
62        if trimmed.trim().is_empty() {
63            push_paragraph(&mut blocks, &mut current, &mut next_offset);
64            continue;
65        }
66
67        if !current.is_empty() {
68            current.push('\n');
69        }
70        current.push_str(trimmed);
71    }
72
73    push_paragraph(&mut blocks, &mut current, &mut next_offset);
74    blocks
75}
76
77fn push_paragraph(blocks: &mut Vec<ExtractedBlock>, current: &mut String, next_offset: &mut usize) {
78    let text = current.trim().to_string();
79    current.clear();
80    if text.is_empty() {
81        return;
82    }
83
84    let offset = *next_offset;
85    let length = text.len();
86    *next_offset = next_offset.saturating_add(length).saturating_add(2);
87    blocks.push(ExtractedBlock {
88        text,
89        offset,
90        length,
91        kind: BlockKind::Paragraph,
92        heading_path: Vec::new(),
93        attrs: HashMap::new(),
94    });
95}
96
/// Builds a minimal single-page PDF 1.4 document whose content stream shows
/// `text` in Helvetica.
///
/// Backslashes and parentheses in `text` are escaped for a PDF literal string,
/// and every `'\n'` ends the current `Tj` string and starts a new one after a
/// `T*` operator. The result contains five objects (catalog, pages, page,
/// font, content stream), a cross-reference table with their byte offsets,
/// and a trailer.
#[cfg(test)]
pub(crate) fn simple_pdf_fixture(text: &str) -> Vec<u8> {
    // Single pass over the characters; the inserted escape sequences are never
    // rescanned, which matches applying the four replacements one after another.
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '(' => escaped.push_str("\\("),
            ')' => escaped.push_str("\\)"),
            '\n' => escaped.push_str(") Tj T* ("),
            other => escaped.push(other),
        }
    }

    let stream = format!("BT /F1 12 Tf 72 720 Td 14 TL ({escaped}) Tj ET");
    let objects = [
        "<< /Type /Catalog /Pages 2 0 R >>".to_string(),
        "<< /Type /Pages /Kids [3 0 R] /Count 1 >>".to_string(),
        "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>".to_string(),
        "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>".to_string(),
        format!("<< /Length {} >>\nstream\n{}\nendstream", stream.len(), stream),
    ];
    let entry_count = objects.len() + 1;

    let mut pdf = Vec::new();
    pdf.extend_from_slice(b"%PDF-1.4\n");

    // Record each object's xref row while writing it; the table starts with the free entry.
    let mut xref_rows = String::from("0000000000 65535 f \n");
    for (index, object) in objects.iter().enumerate() {
        let offset = pdf.len();
        xref_rows.push_str(&format!("{offset:010} 00000 n \n"));
        pdf.extend_from_slice(format!("{} 0 obj\n{}\nendobj\n", index + 1, object).as_bytes());
    }

    let xref_offset = pdf.len();
    pdf.extend_from_slice(format!("xref\n0 {}\n", entry_count).as_bytes());
    pdf.extend_from_slice(xref_rows.as_bytes());

    let trailer = format!(
        "trailer\n<< /Size {} /Root 1 0 R >>\nstartxref\n{xref_offset}\n%%EOF\n",
        entry_count
    );
    pdf.extend_from_slice(trailer.as_bytes());
    pdf
}
135
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::Extractor;
    use crate::ingest::pdf::{simple_pdf_fixture, PdfExtractor};

    /// A two-line text PDF comes back as one paragraph holding both lines.
    #[test]
    fn extracts_digital_pdf_text_into_paragraphs() {
        let extractor = PdfExtractor;
        assert_eq!(extractor.profile_key(), "pdf");

        let pdf_bytes = simple_pdf_fixture("Alpha pdf target.\nSecond line.");
        let document = extractor
            .extract(Path::new("papers/guide.pdf"), &pdf_bytes)
            .expect("extract pdf");

        assert_eq!(document.blocks.len(), 1);
        let paragraph = &document.blocks[0].text;
        assert!(paragraph.contains("Alpha pdf target."));
        assert!(paragraph.contains("Second line."));
    }

    /// Bytes that are not a PDF surface as an extraction failure.
    #[test]
    fn rejects_invalid_pdf_bytes() {
        let failure = PdfExtractor
            .extract(Path::new("papers/bad.pdf"), b"not a pdf")
            .expect_err("invalid pdf should fail");

        let message = failure.to_string();
        assert!(message.contains("pdf text extraction failed"));
    }

    /// A well-formed PDF whose text layer is empty is rejected.
    #[test]
    fn rejects_pdf_when_extraction_produces_no_text() {
        let blank_pdf = simple_pdf_fixture("");
        let failure = PdfExtractor
            .extract(Path::new("papers/scan.pdf"), &blank_pdf)
            .expect_err("empty extracted text should fail");

        let message = failure.to_string();
        assert!(message.contains("produced no text"));
    }
}
177}