Skip to main content

sh_layer3/document_loaders/
mod.rs

1//! # Document Loaders
2//!
3//! 文档加载器:支持多种格式文档的加载。
4
5pub mod csv;
6pub mod json;
7pub mod markdown;
8pub mod pdf;
9pub mod text;
10
11use crate::retriever_engine::Document;
12use crate::types::Layer3Result;
13use async_trait::async_trait;
14use std::path::PathBuf;
15
16// Re-export loaders
17pub use csv::CsvLoader;
18pub use json::JsonLoader;
19pub use markdown::MarkdownLoader;
20pub use pdf::PdfLoader;
21pub use text::TextLoader;
22
23/// 文档加载器 trait
24///
25/// 定义文档加载的通用接口。
26#[async_trait]
27pub trait DocumentLoader: Send + Sync {
28    /// 加载文档
29    async fn load(&self, path: PathBuf) -> Layer3Result<Document>;
30
31    /// 加载并分块
32    async fn load_and_split(&self, path: PathBuf) -> Layer3Result<Vec<Document>>;
33
34    /// 检查是否支持该文件类型
35    fn supports(&self, path: &std::path::Path) -> bool;
36
37    /// 获取支持的扩展名列表
38    fn extensions(&self) -> &[&str];
39}
40
41/// 批量加载器 trait
42#[async_trait]
43pub trait BatchLoader: DocumentLoader {
44    /// 加载目录下所有文档
45    async fn load_directory(&self, dir: PathBuf, recursive: bool) -> Layer3Result<Vec<Document>>;
46
47    /// 批量加载文件
48    async fn load_batch(&self, paths: Vec<PathBuf>) -> Layer3Result<Vec<(PathBuf, Document)>>;
49}
50
51/// 文档加载器注册表
52pub struct LoaderRegistry {
53    loaders: Vec<Box<dyn DocumentLoader>>,
54}
55
56impl LoaderRegistry {
57    pub fn new() -> Self {
58        Self {
59            loaders: Vec::new(),
60        }
61    }
62
63    pub fn register(&mut self, loader: Box<dyn DocumentLoader>) {
64        self.loaders.push(loader);
65    }
66
67    pub fn get_loader(&self, path: &std::path::Path) -> Option<&dyn DocumentLoader> {
68        self.loaders
69            .iter()
70            .find(|l| l.supports(path))
71            .map(|l| l.as_ref())
72    }
73
74    pub fn load(&self, path: PathBuf) -> Layer3Result<Document> {
75        let loader = self
76            .get_loader(&path)
77            .ok_or_else(|| anyhow::anyhow!("No loader for: {:?}", path))?;
78
79        // 需要异步调用,这里简化处理
80        futures::executor::block_on(loader.load(path))
81    }
82}
83
84impl Default for LoaderRegistry {
85    fn default() -> Self {
86        Self::new()
87    }
88}
89
90/// 加载选项
91#[derive(Debug, Clone, Default)]
92pub struct LoadOptions {
93    /// 编码
94    pub encoding: Option<String>,
95    /// 最大文件大小(字节)
96    pub max_size: Option<u64>,
97    /// 是否提取元数据
98    pub extract_metadata: bool,
99    /// 自定义解析选项
100    pub parse_options: serde_json::Map<String, serde_json::Value>,
101}
102
#[cfg(test)]
mod tests {
    use super::*;

    /// A registry created with `new` starts out with no loaders.
    #[test]
    fn test_loader_registry() {
        let fresh = LoaderRegistry::new();
        assert_eq!(fresh.loaders.len(), 0);
    }
}