agentroot_core/providers/
file.rs1use super::{ProviderConfig, SourceItem, SourceProvider};
6use crate::db::hash_content;
7use crate::error::Result;
8use crate::index::extract_title;
9use glob::Pattern;
10use std::path::Path;
11use walkdir::{DirEntry, WalkDir};
12
/// Directory names that `FileProvider::list_items` always hands to `should_skip`.
///
/// Entries are compared by exact name against directory entries only; a regular file that
/// happens to share one of these names is not affected by this list.
const EXCLUDE_DIRS: &[&str] = &[
    // Dependency trees, version-control metadata and caches.
    "node_modules",
    ".git",
    ".cache",
    "vendor",
    // Conventional build-output and language-tooling directories.
    "dist",
    "build",
    "__pycache__",
    ".venv",
    "target",
];
25
/// Source provider that reads documents from the local filesystem.
///
/// The type is a stateless unit struct: everything a scan needs (base directory, glob pattern,
/// options) is supplied per call, so values are trivially copyable and interchangeable.
#[derive(Debug, Clone, Copy, Default)]
pub struct FileProvider;

impl FileProvider {
    /// Creates a provider. Equivalent to [`FileProvider::default`].
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
41
42#[async_trait::async_trait]
43impl SourceProvider for FileProvider {
44 fn provider_type(&self) -> &'static str {
45 "file"
46 }
47
48 async fn list_items(&self, config: &ProviderConfig) -> Result<Vec<SourceItem>> {
49 let root = Path::new(&config.base_path);
50 let pattern = Pattern::new(&config.pattern)?;
51
52 let exclude_hidden = config
53 .get_option("exclude_hidden")
54 .and_then(|v| v.parse::<bool>().ok())
55 .unwrap_or(true);
56
57 let follow_symlinks = config
58 .get_option("follow_symlinks")
59 .and_then(|v| v.parse::<bool>().ok())
60 .unwrap_or(true);
61
62 let exclude_dirs: Vec<String> = EXCLUDE_DIRS.iter().map(|s| s.to_string()).collect();
63
64 let mut items = Vec::new();
65
66 let walker = WalkDir::new(root)
67 .follow_links(follow_symlinks)
68 .into_iter()
69 .filter_entry(|e| !should_skip(e, &exclude_dirs, exclude_hidden));
70
71 for entry in walker {
72 let entry = entry?;
73 if !entry.file_type().is_file() {
74 continue;
75 }
76
77 let path = entry.path();
78 let relative = path
79 .strip_prefix(root)
80 .map(|p| p.to_string_lossy().to_string())
81 .unwrap_or_else(|_| path.to_string_lossy().to_string());
82
83 if pattern.matches(&relative) {
84 let content = std::fs::read_to_string(path)?;
85 let title = extract_title(&content, &relative);
86 let hash = hash_content(&content);
87
88 items.push(
89 SourceItem::new(relative, title, content, hash, "file".to_string())
90 .with_metadata("absolute_path".to_string(), path.display().to_string()),
91 );
92 }
93 }
94
95 Ok(items)
96 }
97
98 async fn fetch_item(&self, uri: &str) -> Result<SourceItem> {
99 let path = Path::new(uri);
100 let content = std::fs::read_to_string(path)?;
101 let title = extract_title(&content, uri);
102 let hash = hash_content(&content);
103
104 Ok(
105 SourceItem::new(uri.to_string(), title, content, hash, "file".to_string())
106 .with_metadata("absolute_path".to_string(), path.display().to_string()),
107 )
108 }
109}
110
111fn should_skip(entry: &DirEntry, exclude_dirs: &[String], exclude_hidden: bool) -> bool {
112 let name = entry.file_name().to_string_lossy();
113
114 if exclude_hidden && name.starts_with('.') {
115 return true;
116 }
117
118 if entry.file_type().is_dir() && exclude_dirs.iter().any(|d| name == *d) {
119 return true;
120 }
121
122 false
123}
124
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Builds a config that matches every Markdown file below `base`.
    ///
    /// Hidden-entry filtering is switched off so the outcome cannot depend on the name of the
    /// temporary directory (`tempfile` names are typically dot-prefixed).
    fn markdown_config(base: &Path) -> ProviderConfig {
        ProviderConfig::new(base.to_string_lossy().to_string(), "**/*.md".to_string())
            .with_option("exclude_hidden".to_string(), "false".to_string())
    }

    /// The provider reports the `"file"` type tag.
    #[test]
    fn test_file_provider_type() {
        assert_eq!(FileProvider::new().provider_type(), "file");
    }

    /// Only files matching the glob are listed, and URIs are relative to the base directory.
    #[tokio::test]
    async fn test_file_provider_list_items() {
        let workspace = TempDir::new().unwrap();
        let dir = workspace.path();

        let fixtures = [
            ("test1.md", "# Test 1"),
            ("test2.md", "# Test 2"),
            ("ignore.txt", "ignore"),
        ];
        for (file_name, body) in &fixtures {
            fs::write(dir.join(file_name), body).unwrap();
        }

        let listed = FileProvider::new()
            .list_items(&markdown_config(dir))
            .await
            .unwrap();

        assert_eq!(listed.len(), 2);
        for expected in &["test1.md", "test2.md"] {
            assert!(listed.iter().any(|item| item.uri == *expected));
        }
    }

    /// Fetching a single file yields its content, the extracted title and the `"file"` type.
    #[tokio::test]
    async fn test_file_provider_fetch_item() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.md");
        fs::write(&target, "# Test Content").unwrap();

        let fetched = FileProvider::new()
            .fetch_item(target.to_str().unwrap())
            .await
            .unwrap();

        assert_eq!(fetched.content, "# Test Content");
        assert_eq!(fetched.title, "Test Content");
        assert_eq!(fetched.source_type, "file");
    }

    /// Items listed by the provider can be stored in the database and are counted against
    /// their collection.
    #[tokio::test]
    async fn test_file_provider_database_integration() {
        use crate::{db::hash_content, Database};
        use chrono::Utc;

        let workspace = TempDir::new().unwrap();
        let dir = workspace.path();

        fs::write(dir.join("doc1.md"), "# Document 1\nContent for doc 1").unwrap();
        fs::write(dir.join("doc2.md"), "# Document 2\nContent for doc 2").unwrap();

        let db = Database::open_in_memory().unwrap();
        db.initialize().unwrap();
        db.add_collection("test", &dir.to_string_lossy(), "**/*.md", "file", None)
            .unwrap();

        let listed = FileProvider::new()
            .list_items(&markdown_config(dir))
            .await
            .unwrap();
        assert_eq!(listed.len(), 2, "Should find 2 .md files");

        for doc in &listed {
            let digest = hash_content(&doc.content);
            db.insert_content(&digest, &doc.content).unwrap();

            // The same timestamp is passed for both timestamp arguments.
            let timestamp = Utc::now().to_rfc3339();
            db.insert_document(
                "test",
                &doc.uri,
                &doc.title,
                &digest,
                &timestamp,
                &timestamp,
                &doc.source_type,
                Some(&doc.uri),
            )
            .unwrap();
        }

        let collections = db.list_collections().unwrap();
        assert_eq!(collections.len(), 1);

        let stored = &collections[0];
        assert_eq!(stored.name, "test");
        assert_eq!(stored.provider_type, "file");
        assert_eq!(stored.document_count, 2, "document_count should be 2");
        assert_eq!(stored.provider_config, None);
    }
}