reasonkit/ingestion/
pdf.rs

1//! PDF ingestion module using lopdf
2//!
3//! Extracts text content from PDF files for indexing in the knowledge base.
4
5use crate::{Document, DocumentType, Error, Metadata, Result, Source, SourceType};
6use chrono::Utc;
7use lopdf::Document as PdfDocument;
8use std::path::Path;
9use tracing::{debug, info, warn};
10
/// PDF document ingester using lopdf.
///
/// Holds the configuration used when turning a PDF file into a knowledge-base
/// document.
#[derive(Debug, Clone)]
pub struct PdfIngester {
    /// Whether to extract metadata from PDF
    extract_metadata: bool,
}
16
17impl PdfIngester {
18    /// Create a new PDF ingester
19    pub fn new() -> Self {
20        Self {
21            extract_metadata: true,
22        }
23    }
24
25    /// Ingest a PDF file and extract text content
26    pub fn ingest(&self, path: &Path) -> Result<Document> {
27        info!("Ingesting PDF: {:?}", path);
28
29        let pdf_doc = PdfDocument::load(path)
30            .map_err(|e| Error::pdf(format!("Failed to load PDF: {}", e)))?;
31
32        let mut full_text = String::new();
33        let page_count = pdf_doc.get_pages().len();
34
35        debug!("PDF has {} pages", page_count);
36
37        // Extract text from each page
38        for (page_num, _) in pdf_doc.get_pages() {
39            match self.extract_page_text(&pdf_doc, page_num) {
40                Ok(text) => {
41                    if !text.is_empty() {
42                        full_text.push_str(&text);
43                        full_text.push('\n');
44                    }
45                }
46                Err(e) => {
47                    warn!("Failed to extract text from page {}: {}", page_num, e);
48                }
49            }
50        }
51
52        // Clean up the extracted text
53        let cleaned_text = self.clean_text(&full_text);
54
55        // Extract metadata if enabled
56        let metadata = if self.extract_metadata {
57            self.extract_metadata(&pdf_doc, path)
58        } else {
59            Metadata::default()
60        };
61
62        // Determine source type based on filename
63        let source_type = self.detect_source_type(path);
64        let arxiv_id = self.extract_arxiv_id(path);
65
66        let source = Source {
67            source_type,
68            url: None,
69            path: Some(path.to_string_lossy().to_string()),
70            arxiv_id,
71            github_repo: None,
72            retrieved_at: Utc::now(),
73            version: None,
74        };
75
76        let mut doc = Document::new(DocumentType::Paper, source).with_content(cleaned_text);
77
78        doc.metadata = metadata;
79
80        info!(
81            "Extracted {} chars from {} pages",
82            doc.content.char_count, page_count
83        );
84
85        Ok(doc)
86    }
87
88    /// Extract text from a single page
89    fn extract_page_text(&self, doc: &PdfDocument, page_num: u32) -> Result<String> {
90        let page_id = doc
91            .page_iter()
92            .nth((page_num - 1) as usize)
93            .ok_or_else(|| Error::pdf(format!("Page {} not found", page_num)))?;
94
95        let content = doc
96            .get_page_content(page_id)
97            .map_err(|e| Error::pdf(format!("Failed to get page content: {}", e)))?;
98
99        // Parse content stream and extract text
100        let text = self.parse_content_stream(&content, doc);
101
102        Ok(text)
103    }
104
105    /// Parse PDF content stream to extract text
106    fn parse_content_stream(&self, content: &[u8], _doc: &PdfDocument) -> String {
107        let mut text = String::new();
108        let content_str = String::from_utf8_lossy(content);
109
110        // Simple text extraction - look for text operators
111        // This is a simplified approach; full implementation would parse the content stream properly
112        let mut in_text = false;
113        let mut current_text = String::new();
114
115        for line in content_str.lines() {
116            let line = line.trim();
117
118            // BT = Begin Text, ET = End Text
119            if line == "BT" {
120                in_text = true;
121                continue;
122            }
123            if line == "ET" {
124                if !current_text.is_empty() {
125                    text.push_str(&current_text);
126                    text.push(' ');
127                    current_text.clear();
128                }
129                in_text = false;
130                continue;
131            }
132
133            if in_text {
134                // Look for text showing operators: Tj, TJ, ', "
135                if let Some(text_content) = self.extract_text_from_operator(line) {
136                    current_text.push_str(&text_content);
137                }
138            }
139        }
140
141        text
142    }
143
144    /// Extract text from PDF text operators
145    fn extract_text_from_operator(&self, line: &str) -> Option<String> {
146        let line = line.trim();
147
148        // Tj operator: (text) Tj
149        if line.ends_with("Tj") {
150            if let Some(start) = line.find('(') {
151                if let Some(end) = line.rfind(')') {
152                    let text = &line[start + 1..end];
153                    return Some(self.decode_pdf_string(text));
154                }
155            }
156        }
157
158        // TJ operator: [(text) num (text)] TJ
159        if line.ends_with("TJ") {
160            let mut result = String::new();
161            let mut in_string = false;
162            let mut current = String::new();
163
164            for c in line.chars() {
165                match c {
166                    '(' => {
167                        in_string = true;
168                        current.clear();
169                    }
170                    ')' => {
171                        if in_string {
172                            result.push_str(&self.decode_pdf_string(&current));
173                            in_string = false;
174                        }
175                    }
176                    _ if in_string => {
177                        current.push(c);
178                    }
179                    _ => {}
180                }
181            }
182
183            if !result.is_empty() {
184                return Some(result);
185            }
186        }
187
188        None
189    }
190
191    /// Decode PDF string escapes
192    fn decode_pdf_string(&self, s: &str) -> String {
193        let mut result = String::new();
194        let mut chars = s.chars().peekable();
195
196        while let Some(c) = chars.next() {
197            if c == '\\' {
198                match chars.next() {
199                    Some('n') => result.push('\n'),
200                    Some('r') => result.push('\r'),
201                    Some('t') => result.push('\t'),
202                    Some('\\') => result.push('\\'),
203                    Some('(') => result.push('('),
204                    Some(')') => result.push(')'),
205                    Some(d) if d.is_ascii_digit() => {
206                        // Octal escape
207                        let mut octal = String::from(d);
208                        while octal.len() < 3 {
209                            if let Some(&next) = chars.peek() {
210                                if next.is_ascii_digit() {
211                                    octal.push(chars.next().unwrap());
212                                } else {
213                                    break;
214                                }
215                            } else {
216                                break;
217                            }
218                        }
219                        if let Ok(code) = u8::from_str_radix(&octal, 8) {
220                            result.push(code as char);
221                        }
222                    }
223                    Some(other) => result.push(other),
224                    None => {}
225                }
226            } else {
227                result.push(c);
228            }
229        }
230
231        result
232    }
233
234    /// Clean extracted text
235    fn clean_text(&self, text: &str) -> String {
236        // Remove excessive whitespace
237        let mut cleaned = String::new();
238        let mut prev_was_space = false;
239
240        for c in text.chars() {
241            if c.is_whitespace() {
242                if !prev_was_space {
243                    cleaned.push(' ');
244                    prev_was_space = true;
245                }
246            } else {
247                cleaned.push(c);
248                prev_was_space = false;
249            }
250        }
251
252        // Remove common PDF artifacts
253        cleaned = cleaned.replace("\u{0000}", "");
254        cleaned = cleaned.replace("\u{FEFF}", ""); // BOM
255
256        cleaned.trim().to_string()
257    }
258
259    /// Extract metadata from PDF
260    fn extract_metadata(&self, doc: &PdfDocument, path: &Path) -> Metadata {
261        let mut metadata = Metadata::default();
262
263        // Helper to convert PDF string to Rust string
264        let pdf_to_string = |obj: &lopdf::Object| -> Option<String> {
265            match obj {
266                lopdf::Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(),
267                lopdf::Object::Name(bytes) => String::from_utf8(bytes.clone()).ok(),
268                _ => None,
269            }
270        };
271
272        // Try to get document info dictionary
273        if let Ok(info) = doc.trailer.get(b"Info") {
274            if let Ok(info_ref) = info.as_reference() {
275                if let Ok(info_dict) = doc.get_dictionary(info_ref) {
276                    // Title
277                    if let Ok(title) = info_dict.get(b"Title") {
278                        metadata.title = pdf_to_string(title);
279                    }
280
281                    // Author - convert to Author struct
282                    if let Ok(author) = info_dict.get(b"Author") {
283                        if let Some(author_str) = pdf_to_string(author) {
284                            metadata.authors.push(crate::Author {
285                                name: author_str,
286                                affiliation: None,
287                                email: None,
288                            });
289                        }
290                    }
291
292                    // Subject -> store as abstract
293                    if let Ok(subject) = info_dict.get(b"Subject") {
294                        if let Some(abstract_text) = pdf_to_string(subject) {
295                            metadata.abstract_text = Some(abstract_text);
296                        }
297                    }
298
299                    // Keywords -> store as tags
300                    if let Ok(keywords) = info_dict.get(b"Keywords") {
301                        if let Some(keywords_str) = pdf_to_string(keywords) {
302                            metadata.tags = keywords_str
303                                .split(',')
304                                .map(|s| s.trim().to_string())
305                                .filter(|s| !s.is_empty())
306                                .collect();
307                        }
308                    }
309                }
310            }
311        }
312
313        // Fall back to filename for title if not found
314        if metadata.title.is_none() {
315            metadata.title = path
316                .file_stem()
317                .and_then(|s| s.to_str())
318                .map(|s| s.replace('_', " "));
319        }
320
321        metadata
322    }
323
324    /// Detect source type from filename
325    fn detect_source_type(&self, path: &Path) -> SourceType {
326        let filename = path.file_name().and_then(|s| s.to_str()).unwrap_or("");
327
328        if filename.contains("arxiv") || filename.starts_with("2") {
329            SourceType::Arxiv
330        } else {
331            SourceType::Local
332        }
333    }
334
335    /// Extract arXiv ID from filename
336    fn extract_arxiv_id(&self, path: &Path) -> Option<String> {
337        let filename = path.file_stem().and_then(|s| s.to_str())?;
338
339        // Pattern: anything_XXXX.XXXXX or arxiv_XXXX.XXXXX
340        let re = regex::Regex::new(r"(\d{4}\.\d{4,5})").ok()?;
341
342        re.captures(filename)
343            .and_then(|caps| caps.get(1))
344            .map(|m| m.as_str().to_string())
345    }
346}
347
348impl Default for PdfIngester {
349    fn default() -> Self {
350        Self::new()
351    }
352}
353
354impl super::Ingester for PdfIngester {
355    fn ingest(&self, path: &Path) -> Result<Document> {
356        PdfIngester::ingest(self, path)
357    }
358
359    fn can_handle(&self, path: &Path) -> bool {
360        path.extension()
361            .and_then(|e| e.to_str())
362            .map(|s| s.to_lowercase() == "pdf")
363            .unwrap_or(false)
364    }
365}
366
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text passes through; backslash escapes are decoded.
    #[test]
    fn test_decode_pdf_string() {
        let ingester = PdfIngester::new();
        let cases = [
            ("hello", "hello"),
            ("hello\\nworld", "hello\nworld"),
            ("test\\(paren\\)", "test(paren)"),
        ];

        for &(raw, expected) in &cases {
            assert_eq!(ingester.decode_pdf_string(raw), expected);
        }
    }

    /// The identifier is taken from the file stem; names without one give `None`.
    #[test]
    fn test_extract_arxiv_id() {
        let ingester = PdfIngester::new();
        let cases = [
            ("/data/papers/arxiv_2401.18059.pdf", Some("2401.18059")),
            ("/data/papers/cot_2201.11903.pdf", Some("2201.11903")),
            ("/data/papers/random_paper.pdf", None),
        ];

        for &(file, expected) in &cases {
            let found = ingester.extract_arxiv_id(Path::new(file));
            assert_eq!(found, expected.map(|id| id.to_string()));
        }
    }

    /// Whitespace runs (including newlines) collapse to one space and the ends are trimmed.
    #[test]
    fn test_clean_text() {
        let ingester = PdfIngester::new();
        let cleaned = ingester.clean_text("  hello   world  \n\n  test  ");

        assert_eq!(cleaned, "hello world test");
    }
}
reasonkit/ingestion/pdf.rs

reasonkit/ingestion/
pdf.rs