Skip to main content

zeph_memory/document/loader/
text.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4use std::collections::HashMap;
5use std::path::Path;
6use std::pin::Pin;
7
8use super::super::{
9    DEFAULT_MAX_FILE_SIZE, Document, DocumentError, DocumentLoader, DocumentMetadata,
10};
11
/// Document loader for plain-text and Markdown files.
///
/// The loader refuses to read files whose on-disk size exceeds
/// [`max_file_size`](Self::max_file_size).
#[derive(Debug, Clone)]
pub struct TextLoader {
    /// Largest file size, in bytes, that `load` will accept. The limit is compared
    /// against the length reported by the file's metadata; anything larger is
    /// rejected with `DocumentError::FileTooLarge`.
    pub max_file_size: u64,
}
15
16impl Default for TextLoader {
17    fn default() -> Self {
18        Self {
19            max_file_size: DEFAULT_MAX_FILE_SIZE,
20        }
21    }
22}
23
24impl DocumentLoader for TextLoader {
25    fn load(
26        &self,
27        path: &Path,
28    ) -> Pin<Box<dyn std::future::Future<Output = Result<Vec<Document>, DocumentError>> + Send + '_>>
29    {
30        let path = path.to_path_buf();
31        let max_size = self.max_file_size;
32        Box::pin(async move {
33            let path = std::fs::canonicalize(&path)?;
34
35            let meta = tokio::fs::metadata(&path).await?;
36            if meta.len() > max_size {
37                return Err(DocumentError::FileTooLarge(meta.len()));
38            }
39
40            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
41
42            let content_type = match ext {
43                "md" | "markdown" => "text/markdown",
44                _ => "text/plain",
45            };
46
47            let content = tokio::fs::read_to_string(&path).await?;
48
49            Ok(vec![Document {
50                content,
51                metadata: DocumentMetadata {
52                    source: path.display().to_string(),
53                    content_type: content_type.to_owned(),
54                    extra: HashMap::new(),
55                },
56            }])
57        })
58    }
59
60    fn supported_extensions(&self) -> &[&str] {
61        &["txt", "md", "markdown"]
62    }
63}
64
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a fresh temporary directory holding one file named `name` with
    /// `body` as its contents.
    ///
    /// The directory guard is returned alongside the file path; the caller must
    /// keep it bound for as long as the file is needed, because dropping the
    /// guard removes the directory.
    fn fixture(name: &str, body: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let guard = tempfile::tempdir().unwrap();
        let target = guard.path().join(name);
        std::fs::write(&target, body).unwrap();
        (guard, target)
    }

    // A `.txt` file yields exactly one plain-text document with the file's contents.
    #[tokio::test]
    async fn load_text_file() {
        let (_guard, target) = fixture("test.txt", "hello world");

        let loaded = TextLoader::default().load(&target).await.unwrap();
        assert_eq!(loaded.len(), 1);
        let doc = &loaded[0];
        assert_eq!(doc.content, "hello world");
        assert_eq!(doc.metadata.content_type, "text/plain");
    }

    // The `md` extension is reported as Markdown.
    #[tokio::test]
    async fn load_markdown_file() {
        let (_guard, target) = fixture("readme.md", "# Title");

        let loaded = TextLoader::default().load(&target).await.unwrap();
        assert_eq!(loaded[0].metadata.content_type, "text/markdown");
    }

    // A path that does not exist surfaces as an error instead of a panic.
    #[tokio::test]
    async fn load_nonexistent_file() {
        let missing = Path::new("/nonexistent/file.txt");
        let outcome = TextLoader::default().load(missing).await;
        assert!(outcome.is_err());
    }

    // An empty file still produces one document, with empty content.
    #[tokio::test]
    async fn load_empty_file() {
        let (_guard, target) = fixture("empty.txt", "");

        let loaded = TextLoader::default().load(&target).await.unwrap();
        assert_eq!(loaded.len(), 1);
        assert!(loaded[0].content.is_empty());
    }

    // The long `markdown` extension is treated the same as `md`.
    #[tokio::test]
    async fn load_markdown_extension_variant() {
        let (_guard, target) = fixture("doc.markdown", "content");

        let loaded = TextLoader::default().load(&target).await.unwrap();
        assert_eq!(loaded[0].metadata.content_type, "text/markdown");
    }

    // Extensions the loader does not recognise fall back to plain text.
    #[tokio::test]
    async fn unknown_extension_treated_as_plain_text() {
        let (_guard, target) = fixture("data.csv", "a,b,c");

        let loaded = TextLoader::default().load(&target).await.unwrap();
        assert_eq!(loaded[0].metadata.content_type, "text/plain");
    }

    // All three advertised extensions are present in the list.
    #[test]
    fn supported_extensions_list() {
        let loader = TextLoader::default();
        let advertised = loader.supported_extensions();
        assert!(advertised.contains(&"txt"));
        assert!(advertised.contains(&"md"));
        assert!(advertised.contains(&"markdown"));
    }

    // `metadata.source` is the canonicalized form of the path that was loaded.
    #[tokio::test]
    async fn metadata_source_is_canonical() {
        let (_guard, target) = fixture("test.txt", "data");

        let loaded = TextLoader::default().load(&target).await.unwrap();
        let expected = std::fs::canonicalize(&target).unwrap();
        assert_eq!(loaded[0].metadata.source, expected.display().to_string());
    }

    // With a zero-byte limit, even a one-byte file is rejected as too large.
    #[tokio::test]
    async fn file_too_large_rejected() {
        let (_guard, target) = fixture("big.txt", "x");

        let strict = TextLoader { max_file_size: 0 };
        let outcome = strict.load(&target).await;
        assert!(matches!(outcome, Err(DocumentError::FileTooLarge(_))));
    }
}