Skip to main content

abu_rag/document/
mod.rs

1mod model;
2use std::path::Path;
3
4pub use model::*;
5mod chunk;
6pub use chunk::*;
7mod load;
8pub use load::*;
9
/// Pairs a document loader with a chunker so a file can be turned into
/// chunks in a single call.
///
/// `L` is the loader type and `C` is the chunker type. The struct itself
/// places no trait bounds on them; the bounds live on the `impl` block that
/// provides the processing methods.
///
/// `Debug` and `Clone` are derived, so they are available whenever both
/// `L` and `C` implement them.
#[derive(Debug, Clone)]
pub struct DocumentProcessor<L, C> {
    /// Component used to read a document from a path.
    loader: L,
    /// Component used to split a loaded document into chunks.
    chunker: C,
}
14
15impl<L, C> DocumentProcessor<L, C>
16where
17    L: DocumentLoad,
18    C: DocumentChunk,
19{
20    pub fn new(loader: L, chunker: C) -> Self {
21        Self { loader, chunker }
22    }
23
24    pub fn process<P: AsRef<Path>>(&self, path: P) -> Result<Vec<Chunk>, DocumentError> {
25        let doc = self.loader.load(path)
26            .map_err(|e| DocumentError::Load(Box::new(e)))?;
27        let chunks = self.chunker.chunk(&doc)
28            .map_err(|e| DocumentError::Chunk(Box::new(e)))?;
29        Ok(chunks)
30    }
31}
32
33#[derive(Debug, thiserror::Error)]
34pub enum DocumentError {
35    #[error("load error: {0}")]
36    Load(Box<dyn std::error::Error + 'static + Send + Sync>),
37
38    #[error("chunk error: {0}")]
39    Chunk(Box<dyn std::error::Error + 'static + Send + Sync>),
40}
41
#[cfg(test)]
mod tests {
    use super::*;
    use std::env::temp_dir;
    use std::path::PathBuf;

    /// Deletes the wrapped file when dropped, so the fixture is cleaned up
    /// even if an assertion panics partway through a test.
    struct TempFile(PathBuf);

    impl Drop for TempFile {
        fn drop(&mut self) {
            // Best-effort cleanup: a failure to delete must not mask the
            // real test outcome (and must not panic during unwinding).
            let _ = std::fs::remove_file(&self.0);
        }
    }

    #[test]
    fn test_processor_end_to_end() {
        // 1. Create a fixture file in the system temp directory. The process
        //    id is part of the name so concurrent runs of the test suite do
        //    not overwrite or delete each other's fixture.
        let fixture = TempFile(
            temp_dir().join(format!("rag_test_doc_{}.txt", std::process::id())),
        );
        // Same bytes the previous `writeln!` produced: the text plus a
        // trailing newline.
        std::fs::write(&fixture.0, "Hello Rust!\n\nThis is a RAG test.\n").unwrap();

        // 2. Build the processor through its public constructor.
        let processor = DocumentProcessor::new(TextLoader, ParagraphChunker);

        // 3. Run the pipeline.
        let result = processor.process(&fixture.0);
        assert!(result.is_ok(), "文档处理应该成功");

        let chunks = result.unwrap();
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "Hello Rust!");
        // The file ends with a newline, so the second paragraph keeps it.
        assert_eq!(chunks[1].text, "This is a RAG test.\n");

        // 4. The fixture is removed when `fixture` goes out of scope.
    }

    #[test]
    fn test_processor_file_not_found() {
        let processor = DocumentProcessor::new(TextLoader, FixedChunker::without_overlap(10));

        let result = processor.process(Path::new("not_exist_file_12345.txt"));

        // A missing file must surface as a load-stage error.
        assert!(matches!(result, Err(DocumentError::Load(_))));
    }
}
89}