//! PDF ingestion (reasonkit/ingestion/pdf.rs): extracts text and metadata
//! from PDF files into `Document`s.

use crate::{Document, DocumentType, Error, Metadata, Result, Source, SourceType};
6use chrono::Utc;
7use lopdf::Document as PdfDocument;
8use std::path::Path;
9use tracing::{debug, info, warn};
10
/// Ingests PDF files into `Document`s by extracting page text and,
/// optionally, the PDF Info-dictionary metadata.
#[derive(Debug, Clone)]
pub struct PdfIngester {
    // When true, `ingest` reads title/author/subject/keywords from the
    // PDF Info dictionary; otherwise `Metadata::default()` is used.
    extract_metadata: bool,
}
16
17impl PdfIngester {
18 pub fn new() -> Self {
20 Self {
21 extract_metadata: true,
22 }
23 }
24
25 pub fn ingest(&self, path: &Path) -> Result<Document> {
27 info!("Ingesting PDF: {:?}", path);
28
29 let pdf_doc = PdfDocument::load(path)
30 .map_err(|e| Error::pdf(format!("Failed to load PDF: {}", e)))?;
31
32 let mut full_text = String::new();
33 let page_count = pdf_doc.get_pages().len();
34
35 debug!("PDF has {} pages", page_count);
36
37 for (page_num, _) in pdf_doc.get_pages() {
39 match self.extract_page_text(&pdf_doc, page_num) {
40 Ok(text) => {
41 if !text.is_empty() {
42 full_text.push_str(&text);
43 full_text.push('\n');
44 }
45 }
46 Err(e) => {
47 warn!("Failed to extract text from page {}: {}", page_num, e);
48 }
49 }
50 }
51
52 let cleaned_text = self.clean_text(&full_text);
54
55 let metadata = if self.extract_metadata {
57 self.extract_metadata(&pdf_doc, path)
58 } else {
59 Metadata::default()
60 };
61
62 let source_type = self.detect_source_type(path);
64 let arxiv_id = self.extract_arxiv_id(path);
65
66 let source = Source {
67 source_type,
68 url: None,
69 path: Some(path.to_string_lossy().to_string()),
70 arxiv_id,
71 github_repo: None,
72 retrieved_at: Utc::now(),
73 version: None,
74 };
75
76 let mut doc = Document::new(DocumentType::Paper, source).with_content(cleaned_text);
77
78 doc.metadata = metadata;
79
80 info!(
81 "Extracted {} chars from {} pages",
82 doc.content.char_count, page_count
83 );
84
85 Ok(doc)
86 }
87
88 fn extract_page_text(&self, doc: &PdfDocument, page_num: u32) -> Result<String> {
90 let page_id = doc
91 .page_iter()
92 .nth((page_num - 1) as usize)
93 .ok_or_else(|| Error::pdf(format!("Page {} not found", page_num)))?;
94
95 let content = doc
96 .get_page_content(page_id)
97 .map_err(|e| Error::pdf(format!("Failed to get page content: {}", e)))?;
98
99 let text = self.parse_content_stream(&content, doc);
101
102 Ok(text)
103 }
104
105 fn parse_content_stream(&self, content: &[u8], _doc: &PdfDocument) -> String {
107 let mut text = String::new();
108 let content_str = String::from_utf8_lossy(content);
109
110 let mut in_text = false;
113 let mut current_text = String::new();
114
115 for line in content_str.lines() {
116 let line = line.trim();
117
118 if line == "BT" {
120 in_text = true;
121 continue;
122 }
123 if line == "ET" {
124 if !current_text.is_empty() {
125 text.push_str(¤t_text);
126 text.push(' ');
127 current_text.clear();
128 }
129 in_text = false;
130 continue;
131 }
132
133 if in_text {
134 if let Some(text_content) = self.extract_text_from_operator(line) {
136 current_text.push_str(&text_content);
137 }
138 }
139 }
140
141 text
142 }
143
144 fn extract_text_from_operator(&self, line: &str) -> Option<String> {
146 let line = line.trim();
147
148 if line.ends_with("Tj") {
150 if let Some(start) = line.find('(') {
151 if let Some(end) = line.rfind(')') {
152 let text = &line[start + 1..end];
153 return Some(self.decode_pdf_string(text));
154 }
155 }
156 }
157
158 if line.ends_with("TJ") {
160 let mut result = String::new();
161 let mut in_string = false;
162 let mut current = String::new();
163
164 for c in line.chars() {
165 match c {
166 '(' => {
167 in_string = true;
168 current.clear();
169 }
170 ')' => {
171 if in_string {
172 result.push_str(&self.decode_pdf_string(¤t));
173 in_string = false;
174 }
175 }
176 _ if in_string => {
177 current.push(c);
178 }
179 _ => {}
180 }
181 }
182
183 if !result.is_empty() {
184 return Some(result);
185 }
186 }
187
188 None
189 }
190
191 fn decode_pdf_string(&self, s: &str) -> String {
193 let mut result = String::new();
194 let mut chars = s.chars().peekable();
195
196 while let Some(c) = chars.next() {
197 if c == '\\' {
198 match chars.next() {
199 Some('n') => result.push('\n'),
200 Some('r') => result.push('\r'),
201 Some('t') => result.push('\t'),
202 Some('\\') => result.push('\\'),
203 Some('(') => result.push('('),
204 Some(')') => result.push(')'),
205 Some(d) if d.is_ascii_digit() => {
206 let mut octal = String::from(d);
208 while octal.len() < 3 {
209 if let Some(&next) = chars.peek() {
210 if next.is_ascii_digit() {
211 octal.push(chars.next().unwrap());
212 } else {
213 break;
214 }
215 } else {
216 break;
217 }
218 }
219 if let Ok(code) = u8::from_str_radix(&octal, 8) {
220 result.push(code as char);
221 }
222 }
223 Some(other) => result.push(other),
224 None => {}
225 }
226 } else {
227 result.push(c);
228 }
229 }
230
231 result
232 }
233
234 fn clean_text(&self, text: &str) -> String {
236 let mut cleaned = String::new();
238 let mut prev_was_space = false;
239
240 for c in text.chars() {
241 if c.is_whitespace() {
242 if !prev_was_space {
243 cleaned.push(' ');
244 prev_was_space = true;
245 }
246 } else {
247 cleaned.push(c);
248 prev_was_space = false;
249 }
250 }
251
252 cleaned = cleaned.replace("\u{0000}", "");
254 cleaned = cleaned.replace("\u{FEFF}", ""); cleaned.trim().to_string()
257 }
258
259 fn extract_metadata(&self, doc: &PdfDocument, path: &Path) -> Metadata {
261 let mut metadata = Metadata::default();
262
263 let pdf_to_string = |obj: &lopdf::Object| -> Option<String> {
265 match obj {
266 lopdf::Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(),
267 lopdf::Object::Name(bytes) => String::from_utf8(bytes.clone()).ok(),
268 _ => None,
269 }
270 };
271
272 if let Ok(info) = doc.trailer.get(b"Info") {
274 if let Ok(info_ref) = info.as_reference() {
275 if let Ok(info_dict) = doc.get_dictionary(info_ref) {
276 if let Ok(title) = info_dict.get(b"Title") {
278 metadata.title = pdf_to_string(title);
279 }
280
281 if let Ok(author) = info_dict.get(b"Author") {
283 if let Some(author_str) = pdf_to_string(author) {
284 metadata.authors.push(crate::Author {
285 name: author_str,
286 affiliation: None,
287 email: None,
288 });
289 }
290 }
291
292 if let Ok(subject) = info_dict.get(b"Subject") {
294 if let Some(abstract_text) = pdf_to_string(subject) {
295 metadata.abstract_text = Some(abstract_text);
296 }
297 }
298
299 if let Ok(keywords) = info_dict.get(b"Keywords") {
301 if let Some(keywords_str) = pdf_to_string(keywords) {
302 metadata.tags = keywords_str
303 .split(',')
304 .map(|s| s.trim().to_string())
305 .filter(|s| !s.is_empty())
306 .collect();
307 }
308 }
309 }
310 }
311 }
312
313 if metadata.title.is_none() {
315 metadata.title = path
316 .file_stem()
317 .and_then(|s| s.to_str())
318 .map(|s| s.replace('_', " "));
319 }
320
321 metadata
322 }
323
324 fn detect_source_type(&self, path: &Path) -> SourceType {
326 let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
327
328 if filename.contains("arxiv") || filename.starts_with("2") {
329 SourceType::Arxiv
330 } else {
331 SourceType::Local
332 }
333 }
334
335 fn extract_arxiv_id(&self, path: &Path) -> Option<String> {
337 let filename = path.file_stem().and_then(|s| s.to_str())?;
338
339 let re = regex::Regex::new(r"(\d{4}\.\d{4,5})").ok()?;
341
342 re.captures(filename)
343 .and_then(|caps| caps.get(1))
344 .map(|m| m.as_str().to_string())
345 }
346}
347
348impl Default for PdfIngester {
349 fn default() -> Self {
350 Self::new()
351 }
352}
353
354impl super::Ingester for PdfIngester {
355 fn ingest(&self, path: &Path) -> Result<Document> {
356 PdfIngester::ingest(self, path)
357 }
358
359 fn can_handle(&self, path: &Path) -> bool {
360 path.extension()
361 .and_then(|e| e.to_str())
362 .map(|s| s.to_lowercase() == "pdf")
363 .unwrap_or(false)
364 }
365}
366
#[cfg(test)]
mod tests {
    use super::*;

    /// Literal-string escape sequences are resolved correctly.
    #[test]
    fn test_decode_pdf_string() {
        let sut = PdfIngester::new();

        let cases = [
            ("hello", "hello"),
            ("hello\\nworld", "hello\nworld"),
            ("test\\(paren\\)", "test(paren)"),
        ];
        for (input, expected) in cases {
            assert_eq!(sut.decode_pdf_string(input), expected);
        }
    }

    /// Modern arXiv ids are found in file stems; absent ids yield None.
    #[test]
    fn test_extract_arxiv_id() {
        let sut = PdfIngester::new();

        assert_eq!(
            sut.extract_arxiv_id(Path::new("/data/papers/arxiv_2401.18059.pdf")),
            Some("2401.18059".to_string())
        );
        assert_eq!(
            sut.extract_arxiv_id(Path::new("/data/papers/cot_2201.11903.pdf")),
            Some("2201.11903".to_string())
        );
        assert_eq!(
            sut.extract_arxiv_id(Path::new("/data/papers/random_paper.pdf")),
            None
        );
    }

    /// Whitespace runs collapse to single spaces and the ends are trimmed.
    #[test]
    fn test_clean_text() {
        let sut = PdfIngester::new();
        assert_eq!(
            sut.clean_text(" hello world \n\n test "),
            "hello world test"
        );
    }
}
407}