zeph_memory/document/loader/
text.rs1use std::collections::HashMap;
5use std::path::Path;
6use std::pin::Pin;
7
8use super::super::{
9 DEFAULT_MAX_FILE_SIZE, Document, DocumentError, DocumentLoader, DocumentMetadata,
10};
11
/// Loader for plain-text and Markdown files.
///
/// Files whose size exceeds `max_file_size` bytes are rejected by
/// [`DocumentLoader::load`] instead of being read.
#[derive(Debug, Clone)]
pub struct TextLoader {
    /// Upper bound, in bytes, on the size of a file this loader will read.
    pub max_file_size: u64,
}
15
16impl Default for TextLoader {
17 fn default() -> Self {
18 Self {
19 max_file_size: DEFAULT_MAX_FILE_SIZE,
20 }
21 }
22}
23
24impl DocumentLoader for TextLoader {
25 fn load(
26 &self,
27 path: &Path,
28 ) -> Pin<Box<dyn std::future::Future<Output = Result<Vec<Document>, DocumentError>> + Send + '_>>
29 {
30 let path = path.to_path_buf();
31 let max_size = self.max_file_size;
32 Box::pin(async move {
33 let path = std::fs::canonicalize(&path)?;
34
35 let meta = tokio::fs::metadata(&path).await?;
36 if meta.len() > max_size {
37 return Err(DocumentError::FileTooLarge(meta.len()));
38 }
39
40 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
41
42 let content_type = match ext {
43 "md" | "markdown" => "text/markdown",
44 _ => "text/plain",
45 };
46
47 let content = tokio::fs::read_to_string(&path).await?;
48
49 Ok(vec![Document {
50 content,
51 metadata: DocumentMetadata {
52 source: path.display().to_string(),
53 content_type: content_type.to_owned(),
54 extra: HashMap::new(),
55 },
56 }])
57 })
58 }
59
60 fn supported_extensions(&self) -> &[&str] {
61 &["txt", "md", "markdown"]
62 }
63}
64
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a fresh temporary directory containing a file `name` with the
    /// given `body`. The returned guard must be kept alive for as long as the
    /// file is needed, because dropping it removes the directory.
    fn fixture(name: &str, body: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let file = dir.path().join(name);
        std::fs::write(&file, body).unwrap();
        (dir, file)
    }

    #[tokio::test]
    async fn load_text_file() {
        let (_guard, file) = fixture("test.txt", "hello world");

        let docs = TextLoader::default().load(&file).await.unwrap();
        assert_eq!(docs.len(), 1);
        let doc = &docs[0];
        assert_eq!(doc.content, "hello world");
        assert_eq!(doc.metadata.content_type, "text/plain");
    }

    #[tokio::test]
    async fn load_markdown_file() {
        let (_guard, file) = fixture("readme.md", "# Title");

        let docs = TextLoader::default().load(&file).await.unwrap();
        assert_eq!(docs[0].metadata.content_type, "text/markdown");
    }

    #[tokio::test]
    async fn load_nonexistent_file() {
        let missing = Path::new("/nonexistent/file.txt");
        let outcome = TextLoader::default().load(missing).await;
        assert!(outcome.is_err());
    }

    #[tokio::test]
    async fn load_empty_file() {
        let (_guard, file) = fixture("empty.txt", "");

        let docs = TextLoader::default().load(&file).await.unwrap();
        assert_eq!(docs.len(), 1);
        assert!(docs[0].content.is_empty());
    }

    #[tokio::test]
    async fn load_markdown_extension_variant() {
        let (_guard, file) = fixture("doc.markdown", "content");

        let docs = TextLoader::default().load(&file).await.unwrap();
        assert_eq!(docs[0].metadata.content_type, "text/markdown");
    }

    #[tokio::test]
    async fn unknown_extension_treated_as_plain_text() {
        let (_guard, file) = fixture("data.csv", "a,b,c");

        let docs = TextLoader::default().load(&file).await.unwrap();
        assert_eq!(docs[0].metadata.content_type, "text/plain");
    }

    #[test]
    fn supported_extensions_list() {
        let loader = TextLoader::default();
        let exts = loader.supported_extensions();
        for expected in ["txt", "md", "markdown"] {
            assert!(exts.contains(&expected));
        }
    }

    #[tokio::test]
    async fn metadata_source_is_canonical() {
        let (_guard, file) = fixture("test.txt", "data");

        let docs = TextLoader::default().load(&file).await.unwrap();
        let expected = std::fs::canonicalize(&file).unwrap().display().to_string();
        assert_eq!(docs[0].metadata.source, expected);
    }

    #[tokio::test]
    async fn file_too_large_rejected() {
        let (_guard, file) = fixture("big.txt", "x");

        // A zero-byte limit makes even a one-byte file exceed the maximum.
        let loader = TextLoader { max_file_size: 0 };
        let outcome = loader.load(&file).await;
        assert!(matches!(outcome, Err(DocumentError::FileTooLarge(_))));
    }
}