pub mod csv;
pub mod json;
pub mod markdown;
pub mod pdf;
pub mod text;
use crate::retriever_engine::Document;
use crate::types::Layer3Result;
use async_trait::async_trait;
use std::path::PathBuf;
pub use csv::CsvLoader;
pub use json::JsonLoader;
pub use markdown::MarkdownLoader;
pub use pdf::PdfLoader;
pub use text::TextLoader;
/// A loader that turns the content at a filesystem path into [`Document`]s.
///
/// Implementors must be `Send + Sync`; [`LoaderRegistry`] stores them as boxed
/// trait objects and picks one per path via [`DocumentLoader::supports`].
#[async_trait]
pub trait DocumentLoader: Send + Sync {
/// Asynchronously loads `path` and returns it as a single [`Document`].
async fn load(&self, path: PathBuf) -> Layer3Result<Document>;
/// Asynchronously loads `path` and returns it as several [`Document`]s.
///
/// NOTE(review): the splitting strategy is up to each implementor and is not
/// visible from this trait — confirm against the concrete loaders.
async fn load_and_split(&self, path: PathBuf) -> Layer3Result<Vec<Document>>;
/// Returns `true` when this loader is able to handle `path`.
///
/// `LoaderRegistry::get_loader` uses this to select the first matching loader.
fn supports(&self, path: &std::path::Path) -> bool;
/// Returns the file extensions associated with this loader.
///
/// NOTE(review): presumably without a leading dot (e.g. `"csv"`) — confirm
/// against the implementations.
fn extensions(&self) -> &[&str];
}
/// Extension of [`DocumentLoader`] for loading many documents in one call.
#[async_trait]
pub trait BatchLoader: DocumentLoader {
/// Asynchronously loads documents from the directory `dir`.
///
/// NOTE(review): `recursive` presumably controls whether subdirectories are
/// descended into — confirm against the implementations.
async fn load_directory(&self, dir: PathBuf, recursive: bool) -> Layer3Result<Vec<Document>>;
/// Asynchronously loads every path in `paths`, returning each loaded
/// [`Document`] paired with a `PathBuf` (presumably the path it came from).
async fn load_batch(&self, paths: Vec<PathBuf>) -> Layer3Result<Vec<(PathBuf, Document)>>;
}
/// An ordered collection of [`DocumentLoader`]s used to pick a loader for a path.
pub struct LoaderRegistry {
// Loaders in registration order; lookup returns the first one that supports a path.
loaders: Vec<Box<dyn DocumentLoader>>,
}
impl LoaderRegistry {
    /// Creates an empty registry with no loaders registered.
    pub fn new() -> Self {
        Self {
            loaders: Vec::new(),
        }
    }

    /// Appends `loader` to the registry.
    ///
    /// Loaders are consulted in registration order, so an earlier loader wins
    /// when several of them support the same path.
    pub fn register(&mut self, loader: Box<dyn DocumentLoader>) {
        self.loaders.push(loader);
    }

    /// Returns the first registered loader whose `supports` accepts `path`,
    /// or `None` when no registered loader does.
    pub fn get_loader(&self, path: &std::path::Path) -> Option<&dyn DocumentLoader> {
        self.loaders
            .iter()
            .find(|candidate| candidate.supports(path))
            .map(|candidate| candidate.as_ref())
    }

    /// Asynchronously loads `path` with the first loader that supports it.
    ///
    /// This is the non-blocking counterpart of [`LoaderRegistry::load`] and is
    /// the variant to use from async code, since it awaits the loader instead
    /// of blocking the current thread.
    ///
    /// # Errors
    ///
    /// Returns an error when no registered loader supports `path`, or when the
    /// selected loader itself fails.
    pub async fn load_async(&self, path: PathBuf) -> Layer3Result<Document> {
        let loader = self
            .get_loader(&path)
            .ok_or_else(|| anyhow::anyhow!("No loader for: {:?}", path))?;
        loader.load(path).await
    }

    /// Loads `path` with the first loader that supports it, blocking the
    /// calling thread until the load has finished.
    ///
    /// The future is driven with `futures::executor::block_on`, so this is
    /// meant for synchronous callers. Inside async code prefer
    /// [`LoaderRegistry::load_async`], which does not block the executor thread.
    ///
    /// # Errors
    ///
    /// Returns an error when no registered loader supports `path`, or when the
    /// selected loader itself fails.
    pub fn load(&self, path: PathBuf) -> Layer3Result<Document> {
        futures::executor::block_on(self.load_async(path))
    }
}
impl Default for LoaderRegistry {
fn default() -> Self {
Self::new()
}
}
/// Options that tune how a document is loaded.
///
/// The derived `Default` leaves both `Option` fields as `None`, sets
/// `extract_metadata` to `false`, and leaves `parse_options` empty.
#[derive(Debug, Clone, Default)]
pub struct LoadOptions {
/// Name of the text encoding to use, if any.
/// NOTE(review): accepted names and the `None` fallback are loader-defined — confirm.
pub encoding: Option<String>,
/// Optional upper bound on input size.
/// NOTE(review): presumably measured in bytes — confirm against the loaders.
pub max_size: Option<u64>,
/// Whether metadata should be extracted along with the content.
pub extract_metadata: bool,
/// Free-form, loader-specific options expressed as a JSON object.
pub parse_options: serde_json::Map<String, serde_json::Value>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed registry holds no loaders.
    #[test]
    fn test_loader_registry() {
        let fresh = LoaderRegistry::new();
        assert_eq!(fresh.loaders.len(), 0);
    }
}