1use std::collections::HashMap;
2use std::path::Path;
3
4use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument, Extractor};
5use crate::Result;
6
/// Extractor for PDF files.
///
/// The type carries no state; its `Extractor` implementation pulls the text
/// out of the PDF bytes and splits it into paragraph blocks.
#[derive(Debug, Clone, Copy, Default)]
pub struct PdfExtractor;
8
9impl Extractor for PdfExtractor {
10 fn supports(&self) -> &[&str] {
11 &["pdf"]
12 }
13
14 fn profile_key(&self) -> &'static str {
15 "pdf"
16 }
17
18 fn version(&self) -> u32 {
19 2
20 }
21
22 fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
23 let text = extract_pdf_text(bytes)?;
24 let blocks = paragraph_blocks(text.as_str());
25 if blocks.is_empty() {
26 return Err(kbolt_types::KboltError::InvalidInput(
27 "pdf text extraction produced no text; scanned or image-only PDFs need OCR"
28 .to_string(),
29 )
30 .into());
31 }
32
33 Ok(ExtractedDocument {
34 blocks,
35 metadata: HashMap::new(),
36 title: None,
37 })
38 }
39}
40
41fn extract_pdf_text(bytes: &[u8]) -> Result<String> {
42 match std::panic::catch_unwind(|| pdf_extract::extract_text_from_mem(bytes)) {
43 Ok(Ok(text)) => Ok(text),
44 Ok(Err(err)) => Err(kbolt_types::KboltError::InvalidInput(format!(
45 "pdf text extraction failed: {err}"
46 ))
47 .into()),
48 Err(_) => Err(kbolt_types::KboltError::InvalidInput(
49 "pdf text extraction failed: parser panicked".to_string(),
50 )
51 .into()),
52 }
53}
54
55fn paragraph_blocks(text: &str) -> Vec<ExtractedBlock> {
56 let mut blocks = Vec::new();
57 let mut current = String::new();
58 let mut next_offset = 0usize;
59
60 for line in text.lines() {
61 let trimmed = line.trim_end();
62 if trimmed.trim().is_empty() {
63 push_paragraph(&mut blocks, &mut current, &mut next_offset);
64 continue;
65 }
66
67 if !current.is_empty() {
68 current.push('\n');
69 }
70 current.push_str(trimmed);
71 }
72
73 push_paragraph(&mut blocks, &mut current, &mut next_offset);
74 blocks
75}
76
77fn push_paragraph(blocks: &mut Vec<ExtractedBlock>, current: &mut String, next_offset: &mut usize) {
78 let text = current.trim().to_string();
79 current.clear();
80 if text.is_empty() {
81 return;
82 }
83
84 let offset = *next_offset;
85 let length = text.len();
86 *next_offset = next_offset.saturating_add(length).saturating_add(2);
87 blocks.push(ExtractedBlock {
88 text,
89 offset,
90 length,
91 kind: BlockKind::Paragraph,
92 heading_path: Vec::new(),
93 attrs: HashMap::new(),
94 });
95}
96
/// Builds a minimal single-page PDF whose content stream shows `text`.
///
/// Backslashes and parentheses in `text` are escaped for a PDF string
/// literal. Each `\n` closes the current string, emits `T*` and opens a new
/// string, so every input line gets its own `Tj` operator. The file has five
/// objects (catalog, page tree, page, Helvetica font, content stream)
/// followed by an xref table and trailer computed from the byte offsets
/// actually written.
#[cfg(test)]
pub(crate) fn simple_pdf_fixture(text: &str) -> Vec<u8> {
    let mut shown = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '\\' => shown.push_str("\\\\"),
            '(' => shown.push_str("\\("),
            ')' => shown.push_str("\\)"),
            '\n' => shown.push_str(") Tj T* ("),
            other => shown.push(other),
        }
    }

    let content = format!("BT /F1 12 Tf 72 720 Td 14 TL ({shown}) Tj ET");
    // `/Length` counts only the stream data, not the newline before `endstream`.
    let content_object = format!(
        "<< /Length {} >>\nstream\n{}\nendstream",
        content.len(),
        content
    );
    let bodies: [&str; 5] = [
        "<< /Type /Catalog /Pages 2 0 R >>",
        "<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
        "<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>",
        "<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>",
        content_object.as_str(),
    ];

    let mut out = Vec::from(&b"%PDF-1.4\n"[..]);
    let mut starts = Vec::with_capacity(bodies.len());
    for (number, body) in (1..).zip(bodies.iter()) {
        // Record where each object begins so the xref table can point at it.
        starts.push(out.len());
        out.extend_from_slice(format!("{number} 0 obj\n{body}\nendobj\n").as_bytes());
    }

    // One xref entry per object plus the mandatory free entry for object 0.
    let entry_count = bodies.len() + 1;
    let xref_start = out.len();
    let mut tail = format!("xref\n0 {entry_count}\n0000000000 65535 f \n");
    for start in &starts {
        tail.push_str(&format!("{start:010} 00000 n \n"));
    }
    tail.push_str(&format!(
        "trailer\n<< /Size {entry_count} /Root 1 0 R >>\nstartxref\n{xref_start}\n%%EOF\n"
    ));
    out.extend_from_slice(tail.as_bytes());
    out
}
135
#[cfg(test)]
mod tests {
    use std::path::Path;

    use crate::ingest::extract::Extractor;
    use crate::ingest::pdf::{simple_pdf_fixture, PdfExtractor};

    /// Two adjacent lines with no blank line between them form one paragraph.
    #[test]
    fn extracts_digital_pdf_text_into_paragraphs() {
        let extractor = PdfExtractor;
        assert_eq!(extractor.profile_key(), "pdf");

        let fixture = simple_pdf_fixture("Alpha pdf target.\nSecond line.");
        let doc = extractor
            .extract(Path::new("papers/guide.pdf"), &fixture)
            .expect("extract pdf");

        assert_eq!(doc.blocks.len(), 1);
        let body = &doc.blocks[0].text;
        assert!(body.contains("Alpha pdf target."));
        assert!(body.contains("Second line."));
    }

    /// Bytes that are not a PDF surface as an extraction failure.
    #[test]
    fn rejects_invalid_pdf_bytes() {
        let failure = PdfExtractor
            .extract(Path::new("papers/bad.pdf"), b"not a pdf")
            .expect_err("invalid pdf should fail");

        let message = failure.to_string();
        assert!(message.contains("pdf text extraction failed"));
    }

    /// A well-formed PDF with no text is rejected rather than returned empty.
    #[test]
    fn rejects_pdf_when_extraction_produces_no_text() {
        let blank = simple_pdf_fixture("");
        let failure = PdfExtractor
            .extract(Path::new("papers/scan.pdf"), &blank)
            .expect_err("empty extracted text should fail");

        let message = failure.to_string();
        assert!(message.contains("produced no text"));
    }
}