1mod model;
2use std::path::Path;
3
4pub use model::*;
5mod chunk;
6pub use chunk::*;
7mod load;
8pub use load::*;
9
/// Pairs a document loader with a chunker so a file can be turned into
/// chunks in a single call.
///
/// The struct itself places no bounds on `L` and `C`; the bounds live on the
/// `impl` blocks that need them. `Debug` and `Clone` are available whenever
/// both type parameters implement them.
#[derive(Debug, Clone)]
pub struct DocumentProcessor<L, C> {
    /// Component used to read a document from a path.
    loader: L,
    /// Component used to split a loaded document into chunks.
    chunker: C,
}
14
15impl<L, C> DocumentProcessor<L, C>
16where
17 L: DocumentLoad,
18 C: DocumentChunk,
19{
20 pub fn new(loader: L, chunker: C) -> Self {
21 Self { loader, chunker }
22 }
23
24 pub fn process<P: AsRef<Path>>(&self, path: P) -> Result<Vec<Chunk>, DocumentError> {
25 let doc = self.loader.load(path)
26 .map_err(|e| DocumentError::Load(Box::new(e)))?;
27 let chunks = self.chunker.chunk(&doc)
28 .map_err(|e| DocumentError::Chunk(Box::new(e)))?;
29 Ok(chunks)
30 }
31}
32
33#[derive(Debug, thiserror::Error)]
34pub enum DocumentError {
35 #[error("load error: {0}")]
36 Load(Box<dyn std::error::Error + 'static + Send + Sync>),
37
38 #[error("chunk error: {0}")]
39 Chunk(Box<dyn std::error::Error + 'static + Send + Sync>),
40}
41
#[cfg(test)]
mod tests {
    use super::*;
    use std::env::temp_dir;
    use std::path::PathBuf;

    /// Removes the wrapped file when dropped, so a failing assertion cannot
    /// leave the fixture behind in the temp directory.
    struct TempFile(PathBuf);

    impl Drop for TempFile {
        fn drop(&mut self) {
            // Best effort: the file may never have been created.
            let _ = std::fs::remove_file(&self.0);
        }
    }

    /// Writes a two-paragraph file, runs it through a text loader and a
    /// paragraph chunker, and checks the resulting chunks.
    #[test]
    fn test_processor_end_to_end() {
        // The process id keeps concurrent runs of the suite from sharing
        // (and deleting) each other's fixture file.
        let file_name = format!("rag_test_doc_{}.txt", std::process::id());
        let fixture = TempFile(temp_dir().join(file_name));

        // The trailing newline is intentional: the expected text of the last
        // chunk below ends with it. `fs::write` also closes the file before
        // the processor reads it.
        std::fs::write(&fixture.0, "Hello Rust!\n\nThis is a RAG test.\n").unwrap();

        let processor = DocumentProcessor::new(TextLoader, ParagraphChunker);

        // `expect` reports the underlying error on failure, not just that the
        // result was an `Err`.
        let chunks = processor.process(&fixture.0).expect("文档处理应该成功");

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "Hello Rust!");
        assert_eq!(chunks[1].text, "This is a RAG test.\n");
    }

    /// A missing input file must surface as a load-stage error.
    #[test]
    fn test_processor_file_not_found() {
        let processor = DocumentProcessor::new(TextLoader, FixedChunker::without_overlap(10));

        let result = processor.process(Path::new("not_exist_file_12345.txt"));

        assert!(matches!(result, Err(DocumentError::Load(_))));
    }
}