reasonkit/ingestion/
mod.rs

//! Document ingestion module for ReasonKit Core
//!
//! Provides functionality to ingest documents from various formats:
//! - PDF (via lopdf)
//! - Markdown (via pulldown-cmark)
//! - HTML (via scraper)
//! - JSON/JSONL (via serde)
//! - Plain text (`.txt`, read verbatim)
8
9pub mod pdf;
10
11use crate::{Document, DocumentType, Error, Metadata, Result, Source, SourceType};
12use chrono::Utc;
13use std::path::Path;
14
15/// Trait for document ingesters
16pub trait Ingester {
17    /// Ingest a document from a file path
18    fn ingest(&self, path: &Path) -> Result<Document>;
19
20    /// Check if this ingester can handle the given file
21    fn can_handle(&self, path: &Path) -> bool;
22}
23
24/// Main document ingester that delegates to format-specific ingesters
25pub struct DocumentIngester {
26    pdf_ingester: pdf::PdfIngester,
27}
28
29impl DocumentIngester {
30    /// Create a new document ingester
31    pub fn new() -> Self {
32        Self {
33            pdf_ingester: pdf::PdfIngester::new(),
34        }
35    }
36
37    /// Ingest a document from a file path, auto-detecting format
38    pub fn ingest(&self, path: &Path) -> Result<Document> {
39        let extension = path
40            .extension()
41            .and_then(|e| e.to_str())
42            .map(|s| s.to_lowercase());
43
44        match extension.as_deref() {
45            Some("pdf") => self.pdf_ingester.ingest(path),
46            Some("md" | "markdown") => self.ingest_markdown(path),
47            Some("html" | "htm") => self.ingest_html(path),
48            Some("json") => self.ingest_json(path),
49            Some("jsonl") => self.ingest_jsonl(path),
50            Some("txt") => self.ingest_text(path),
51            _ => Err(Error::Config(format!(
52                "Unsupported file format: {:?}",
53                path
54            ))),
55        }
56    }
57
58    /// Ingest a markdown file
59    fn ingest_markdown(&self, path: &Path) -> Result<Document> {
60        use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
61        use std::fs;
62
63        let content = fs::read_to_string(path)?;
64
65        // Parse markdown to extract text and metadata
66        let mut options = Options::empty();
67        options.insert(Options::ENABLE_TABLES);
68        options.insert(Options::ENABLE_FOOTNOTES);
69
70        let parser = Parser::new_ext(&content, options);
71        let mut text = String::new();
72        let mut title: Option<String> = None;
73        let mut in_heading = false;
74
75        for event in parser {
76            match event {
77                Event::Start(Tag::Heading {
78                    level: pulldown_cmark::HeadingLevel::H1,
79                    ..
80                }) => {
81                    in_heading = true;
82                }
83                Event::End(TagEnd::Heading(pulldown_cmark::HeadingLevel::H1)) => {
84                    in_heading = false;
85                }
86                Event::Text(t) => {
87                    if in_heading && title.is_none() {
88                        title = Some(t.to_string());
89                    }
90                    text.push_str(&t);
91                    text.push(' ');
92                }
93                Event::SoftBreak | Event::HardBreak => {
94                    text.push('\n');
95                }
96                _ => {}
97            }
98        }
99
100        let source = Source {
101            source_type: SourceType::Local,
102            url: None,
103            path: Some(path.to_string_lossy().to_string()),
104            arxiv_id: None,
105            github_repo: None,
106            retrieved_at: Utc::now(),
107            version: None,
108        };
109
110        let mut doc = Document::new(DocumentType::Documentation, source)
111            .with_content(text.trim().to_string());
112
113        doc.metadata = Metadata {
114            title,
115            ..Default::default()
116        };
117
118        Ok(doc)
119    }
120
121    /// Ingest an HTML file
122    fn ingest_html(&self, path: &Path) -> Result<Document> {
123        use scraper::{Html, Selector};
124        use std::fs;
125
126        let content = fs::read_to_string(path)?;
127        let document = Html::parse_document(&content);
128
129        // Extract title
130        let title_selector = Selector::parse("title").unwrap();
131        let title = document
132            .select(&title_selector)
133            .next()
134            .map(|e| e.text().collect::<String>());
135
136        // Extract body text
137        let body_selector = Selector::parse("body").unwrap();
138        let text = document
139            .select(&body_selector)
140            .next()
141            .map(|e| e.text().collect::<Vec<_>>().join(" "))
142            .unwrap_or_default();
143
144        let source = Source {
145            source_type: SourceType::Local,
146            url: None,
147            path: Some(path.to_string_lossy().to_string()),
148            arxiv_id: None,
149            github_repo: None,
150            retrieved_at: Utc::now(),
151            version: None,
152        };
153
154        let mut doc = Document::new(DocumentType::Documentation, source)
155            .with_content(text.trim().to_string());
156
157        doc.metadata = Metadata {
158            title,
159            ..Default::default()
160        };
161
162        Ok(doc)
163    }
164
165    /// Ingest a JSON file
166    fn ingest_json(&self, path: &Path) -> Result<Document> {
167        use std::fs;
168
169        let content = fs::read_to_string(path)?;
170
171        // Try to parse as a Document first
172        if let Ok(doc) = serde_json::from_str::<Document>(&content) {
173            return Ok(doc);
174        }
175
176        // Otherwise, treat as raw content
177        let source = Source {
178            source_type: SourceType::Local,
179            url: None,
180            path: Some(path.to_string_lossy().to_string()),
181            arxiv_id: None,
182            github_repo: None,
183            retrieved_at: Utc::now(),
184            version: None,
185        };
186
187        Ok(Document::new(DocumentType::Note, source).with_content(content))
188    }
189
190    /// Ingest a JSONL file (one document per line)
191    fn ingest_jsonl(&self, path: &Path) -> Result<Document> {
192        use std::fs;
193        use std::io::{BufRead, BufReader};
194
195        let file = fs::File::open(path)?;
196        let reader = BufReader::new(file);
197
198        let mut all_content = String::new();
199
200        for line in reader.lines() {
201            let line = line?;
202            if !line.trim().is_empty() {
203                // Try to extract content field if it exists
204                if let Ok(json) = serde_json::from_str::<serde_json::Value>(&line) {
205                    if let Some(content) = json.get("content").and_then(|c| c.as_str()) {
206                        all_content.push_str(content);
207                        all_content.push('\n');
208                    } else {
209                        all_content.push_str(&line);
210                        all_content.push('\n');
211                    }
212                }
213            }
214        }
215
216        let source = Source {
217            source_type: SourceType::Local,
218            url: None,
219            path: Some(path.to_string_lossy().to_string()),
220            arxiv_id: None,
221            github_repo: None,
222            retrieved_at: Utc::now(),
223            version: None,
224        };
225
226        Ok(Document::new(DocumentType::Documentation, source)
227            .with_content(all_content.trim().to_string()))
228    }
229
230    /// Ingest a plain text file
231    fn ingest_text(&self, path: &Path) -> Result<Document> {
232        use std::fs;
233
234        let content = fs::read_to_string(path)?;
235
236        let source = Source {
237            source_type: SourceType::Local,
238            url: None,
239            path: Some(path.to_string_lossy().to_string()),
240            arxiv_id: None,
241            github_repo: None,
242            retrieved_at: Utc::now(),
243            version: None,
244        };
245
246        Ok(Document::new(DocumentType::Note, source).with_content(content))
247    }
248}
249
250impl Default for DocumentIngester {
251    fn default() -> Self {
252        Self::new()
253    }
254}
255
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Write `body` plus a trailing newline to a temporary file ending in
    /// `suffix`, then run it through a fresh `DocumentIngester`.
    fn ingest_temp(suffix: &str, body: &str) -> Document {
        let mut tmp = NamedTempFile::with_suffix(suffix).unwrap();
        writeln!(tmp, "{}", body).unwrap();

        // `tmp` stays alive until ingestion has finished reading the file.
        DocumentIngester::new().ingest(tmp.path()).unwrap()
    }

    #[test]
    fn test_markdown_ingestion() {
        let doc = ingest_temp(".md", "# Test Title\n\nThis is test content.");

        for needle in ["Test Title", "test content"] {
            assert!(doc.content.raw.contains(needle));
        }
    }

    #[test]
    fn test_text_ingestion() {
        let doc = ingest_temp(".txt", "Plain text content");

        assert!(doc.content.raw.contains("Plain text"));
    }
}