reasonkit/ingestion/
mod.rs1pub mod pdf;
10
11use crate::{Document, DocumentType, Error, Metadata, Result, Source, SourceType};
12use chrono::Utc;
13use std::path::Path;
14
15pub trait Ingester {
17 fn ingest(&self, path: &Path) -> Result<Document>;
19
20 fn can_handle(&self, path: &Path) -> bool;
22}
23
24pub struct DocumentIngester {
26 pdf_ingester: pdf::PdfIngester,
27}
28
29impl DocumentIngester {
30 pub fn new() -> Self {
32 Self {
33 pdf_ingester: pdf::PdfIngester::new(),
34 }
35 }
36
37 pub fn ingest(&self, path: &Path) -> Result<Document> {
39 let extension = path
40 .extension()
41 .and_then(|e| e.to_str())
42 .map(|s| s.to_lowercase());
43
44 match extension.as_deref() {
45 Some("pdf") => self.pdf_ingester.ingest(path),
46 Some("md" | "markdown") => self.ingest_markdown(path),
47 Some("html" | "htm") => self.ingest_html(path),
48 Some("json") => self.ingest_json(path),
49 Some("jsonl") => self.ingest_jsonl(path),
50 Some("txt") => self.ingest_text(path),
51 _ => Err(Error::Config(format!(
52 "Unsupported file format: {:?}",
53 path
54 ))),
55 }
56 }
57
58 fn ingest_markdown(&self, path: &Path) -> Result<Document> {
60 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
61 use std::fs;
62
63 let content = fs::read_to_string(path)?;
64
65 let mut options = Options::empty();
67 options.insert(Options::ENABLE_TABLES);
68 options.insert(Options::ENABLE_FOOTNOTES);
69
70 let parser = Parser::new_ext(&content, options);
71 let mut text = String::new();
72 let mut title: Option<String> = None;
73 let mut in_heading = false;
74
75 for event in parser {
76 match event {
77 Event::Start(Tag::Heading {
78 level: pulldown_cmark::HeadingLevel::H1,
79 ..
80 }) => {
81 in_heading = true;
82 }
83 Event::End(TagEnd::Heading(pulldown_cmark::HeadingLevel::H1)) => {
84 in_heading = false;
85 }
86 Event::Text(t) => {
87 if in_heading && title.is_none() {
88 title = Some(t.to_string());
89 }
90 text.push_str(&t);
91 text.push(' ');
92 }
93 Event::SoftBreak | Event::HardBreak => {
94 text.push('\n');
95 }
96 _ => {}
97 }
98 }
99
100 let source = Source {
101 source_type: SourceType::Local,
102 url: None,
103 path: Some(path.to_string_lossy().to_string()),
104 arxiv_id: None,
105 github_repo: None,
106 retrieved_at: Utc::now(),
107 version: None,
108 };
109
110 let mut doc = Document::new(DocumentType::Documentation, source)
111 .with_content(text.trim().to_string());
112
113 doc.metadata = Metadata {
114 title,
115 ..Default::default()
116 };
117
118 Ok(doc)
119 }
120
121 fn ingest_html(&self, path: &Path) -> Result<Document> {
123 use scraper::{Html, Selector};
124 use std::fs;
125
126 let content = fs::read_to_string(path)?;
127 let document = Html::parse_document(&content);
128
129 let title_selector = Selector::parse("title").unwrap();
131 let title = document
132 .select(&title_selector)
133 .next()
134 .map(|e| e.text().collect::<String>());
135
136 let body_selector = Selector::parse("body").unwrap();
138 let text = document
139 .select(&body_selector)
140 .next()
141 .map(|e| e.text().collect::<Vec<_>>().join(" "))
142 .unwrap_or_default();
143
144 let source = Source {
145 source_type: SourceType::Local,
146 url: None,
147 path: Some(path.to_string_lossy().to_string()),
148 arxiv_id: None,
149 github_repo: None,
150 retrieved_at: Utc::now(),
151 version: None,
152 };
153
154 let mut doc = Document::new(DocumentType::Documentation, source)
155 .with_content(text.trim().to_string());
156
157 doc.metadata = Metadata {
158 title,
159 ..Default::default()
160 };
161
162 Ok(doc)
163 }
164
165 fn ingest_json(&self, path: &Path) -> Result<Document> {
167 use std::fs;
168
169 let content = fs::read_to_string(path)?;
170
171 if let Ok(doc) = serde_json::from_str::<Document>(&content) {
173 return Ok(doc);
174 }
175
176 let source = Source {
178 source_type: SourceType::Local,
179 url: None,
180 path: Some(path.to_string_lossy().to_string()),
181 arxiv_id: None,
182 github_repo: None,
183 retrieved_at: Utc::now(),
184 version: None,
185 };
186
187 Ok(Document::new(DocumentType::Note, source).with_content(content))
188 }
189
190 fn ingest_jsonl(&self, path: &Path) -> Result<Document> {
192 use std::fs;
193 use std::io::{BufRead, BufReader};
194
195 let file = fs::File::open(path)?;
196 let reader = BufReader::new(file);
197
198 let mut all_content = String::new();
199
200 for line in reader.lines() {
201 let line = line?;
202 if !line.trim().is_empty() {
203 if let Ok(json) = serde_json::from_str::<serde_json::Value>(&line) {
205 if let Some(content) = json.get("content").and_then(|c| c.as_str()) {
206 all_content.push_str(content);
207 all_content.push('\n');
208 } else {
209 all_content.push_str(&line);
210 all_content.push('\n');
211 }
212 }
213 }
214 }
215
216 let source = Source {
217 source_type: SourceType::Local,
218 url: None,
219 path: Some(path.to_string_lossy().to_string()),
220 arxiv_id: None,
221 github_repo: None,
222 retrieved_at: Utc::now(),
223 version: None,
224 };
225
226 Ok(Document::new(DocumentType::Documentation, source)
227 .with_content(all_content.trim().to_string()))
228 }
229
230 fn ingest_text(&self, path: &Path) -> Result<Document> {
232 use std::fs;
233
234 let content = fs::read_to_string(path)?;
235
236 let source = Source {
237 source_type: SourceType::Local,
238 url: None,
239 path: Some(path.to_string_lossy().to_string()),
240 arxiv_id: None,
241 github_repo: None,
242 retrieved_at: Utc::now(),
243 version: None,
244 };
245
246 Ok(Document::new(DocumentType::Note, source).with_content(content))
247 }
248}
249
250impl Default for DocumentIngester {
251 fn default() -> Self {
252 Self::new()
253 }
254}
255
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// A Markdown file's heading and paragraph text both end up in the
    /// ingested content.
    #[test]
    fn test_markdown_ingestion() {
        let mut md_file = NamedTempFile::with_suffix(".md").unwrap();
        writeln!(md_file, "# Test Title\n\nThis is test content.").unwrap();

        let doc = DocumentIngester::new().ingest(md_file.path()).unwrap();

        for expected in ["Test Title", "test content"] {
            assert!(doc.content.raw.contains(expected));
        }
    }

    /// A `.txt` file's contents are carried through to the ingested content.
    #[test]
    fn test_text_ingestion() {
        let mut txt_file = NamedTempFile::with_suffix(".txt").unwrap();
        writeln!(txt_file, "Plain text content").unwrap();

        let doc = DocumentIngester::new().ingest(txt_file.path()).unwrap();

        assert!(doc.content.raw.contains("Plain text"));
    }
}