pub mod pdf;
use crate::{Document, DocumentType, Error, Metadata, Result, Source, SourceType};
use chrono::Utc;
use rayon::prelude::*;
use std::path::{Path, PathBuf};
/// Converts a file on disk into a normalized [`Document`].
///
/// Implementors handle one or more concrete file formats; the
/// [`pdf::PdfIngester`] field of [`DocumentIngester`] is one example.
pub trait Ingester {
    /// Reads and parses the file at `path` into a [`Document`].
    ///
    /// # Errors
    /// Returns an error if the file cannot be read or parsed.
    fn ingest(&self, path: &Path) -> Result<Document>;
    /// Returns `true` if this ingester recognizes the file at `path`
    /// (presumably by extension or content sniffing — implementor-defined).
    fn can_handle(&self, path: &Path) -> bool;
}
/// Extension-dispatching document ingester.
///
/// Routes a file to a format-specific parser based on its extension:
/// PDF, Markdown, HTML, JSON, JSON Lines, or plain text. Batch
/// ingestion runs in parallel via rayon.
pub struct DocumentIngester {
    // Dedicated backend for PDF files; every other format is handled
    // by an inline method on this struct.
    pdf_ingester: pdf::PdfIngester,
}
impl DocumentIngester {
pub fn new() -> Self {
Self {
pdf_ingester: pdf::PdfIngester::new(),
}
}
pub fn ingest_batch(&self, paths: &[PathBuf]) -> Vec<Result<Document>> {
paths.par_iter().map(|path| self.ingest(path)).collect()
}
pub fn ingest(&self, path: &Path) -> Result<Document> {
let extension = path
.extension()
.and_then(|e| e.to_str())
.map(|s| s.to_lowercase());
match extension.as_deref() {
Some("pdf") => self.pdf_ingester.ingest(path),
Some("md" | "markdown") => self.ingest_markdown(path),
Some("html" | "htm") => self.ingest_html(path),
Some("json") => self.ingest_json(path),
Some("jsonl") => self.ingest_jsonl(path),
Some("txt") => self.ingest_text(path),
_ => Err(Error::Config(format!(
"Unsupported file format: {:?}",
path
))),
}
}
fn ingest_markdown(&self, path: &Path) -> Result<Document> {
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
use std::fs;
let content = fs::read_to_string(path)?;
let mut options = Options::empty();
options.insert(Options::ENABLE_TABLES);
options.insert(Options::ENABLE_FOOTNOTES);
let parser = Parser::new_ext(&content, options);
let mut text = String::new();
let mut title: Option<String> = None;
let mut in_heading = false;
for event in parser {
match event {
Event::Start(Tag::Heading {
level: pulldown_cmark::HeadingLevel::H1,
..
}) => {
in_heading = true;
}
Event::End(TagEnd::Heading(pulldown_cmark::HeadingLevel::H1)) => {
in_heading = false;
}
Event::Text(t) => {
if in_heading && title.is_none() {
title = Some(t.to_string());
}
text.push_str(&t);
text.push(' ');
}
Event::SoftBreak | Event::HardBreak => {
text.push('\n');
}
_ => {}
}
}
let source = Source {
source_type: SourceType::Local,
url: None,
path: Some(path.to_string_lossy().to_string()),
arxiv_id: None,
github_repo: None,
retrieved_at: Utc::now(),
version: None,
};
let mut doc = Document::new(DocumentType::Documentation, source)
.with_content(text.trim().to_string());
doc.metadata = Metadata {
title,
..Default::default()
};
Ok(doc)
}
fn ingest_html(&self, path: &Path) -> Result<Document> {
use scraper::{Html, Selector};
use std::fs;
let content = fs::read_to_string(path)?;
let document = Html::parse_document(&content);
let title_selector = Selector::parse("title").unwrap();
let title = document
.select(&title_selector)
.next()
.map(|e| e.text().collect::<String>());
let body_selector = Selector::parse("body").unwrap();
let text = document
.select(&body_selector)
.next()
.map(|e| e.text().collect::<Vec<_>>().join(" "))
.unwrap_or_default();
let source = Source {
source_type: SourceType::Local,
url: None,
path: Some(path.to_string_lossy().to_string()),
arxiv_id: None,
github_repo: None,
retrieved_at: Utc::now(),
version: None,
};
let mut doc = Document::new(DocumentType::Documentation, source)
.with_content(text.trim().to_string());
doc.metadata = Metadata {
title,
..Default::default()
};
Ok(doc)
}
fn ingest_json(&self, path: &Path) -> Result<Document> {
use std::fs;
let content = fs::read_to_string(path)?;
if let Ok(doc) = serde_json::from_str::<Document>(&content) {
return Ok(doc);
}
let source = Source {
source_type: SourceType::Local,
url: None,
path: Some(path.to_string_lossy().to_string()),
arxiv_id: None,
github_repo: None,
retrieved_at: Utc::now(),
version: None,
};
Ok(Document::new(DocumentType::Note, source).with_content(content))
}
fn ingest_jsonl(&self, path: &Path) -> Result<Document> {
use std::fs;
use std::io::{BufRead, BufReader};
let file = fs::File::open(path)?;
let reader = BufReader::new(file);
let lines: Vec<String> = reader.lines().collect::<std::result::Result<_, _>>()?;
let content_parts: Vec<String> = lines
.par_iter()
.filter(|line| !line.trim().is_empty())
.map(|line| {
if let Ok(json) = serde_json::from_str::<serde_json::Value>(line) {
if let Some(content) = json.get("content").and_then(|c| c.as_str()) {
return content.to_string();
}
}
line.clone()
})
.collect();
let all_content = content_parts.join("\n");
let source = Source {
source_type: SourceType::Local,
url: None,
path: Some(path.to_string_lossy().to_string()),
arxiv_id: None,
github_repo: None,
retrieved_at: Utc::now(),
version: None,
};
Ok(Document::new(DocumentType::Documentation, source)
.with_content(all_content.trim().to_string()))
}
fn ingest_text(&self, path: &Path) -> Result<Document> {
use std::fs;
let content = fs::read_to_string(path)?;
let source = Source {
source_type: SourceType::Local,
url: None,
path: Some(path.to_string_lossy().to_string()),
arxiv_id: None,
github_repo: None,
retrieved_at: Utc::now(),
version: None,
};
Ok(Document::new(DocumentType::Note, source).with_content(content))
}
}
impl Default for DocumentIngester {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Round-trips a small Markdown file and checks that both the H1
    /// title and the body text survive extraction.
    #[test]
    fn test_markdown_ingestion() {
        let mut file = NamedTempFile::with_suffix(".md").unwrap();
        writeln!(file, "# Test Title\n\nThis is test content.").unwrap();
        let doc = DocumentIngester::new().ingest(file.path()).unwrap();
        for needle in ["Test Title", "test content"] {
            assert!(doc.content.raw.contains(needle));
        }
    }

    /// A plain-text file should be ingested verbatim.
    #[test]
    fn test_text_ingestion() {
        let mut file = NamedTempFile::with_suffix(".txt").unwrap();
        writeln!(file, "Plain text content").unwrap();
        let doc = DocumentIngester::new().ingest(file.path()).unwrap();
        assert!(doc.content.raw.contains("Plain text"));
    }
}