directory_indexer/indexing/
files.rs1use log::{debug, warn};
2use std::path::{Path, PathBuf};
3use walkdir::WalkDir;
4
5use crate::{
6 error::{IndexerError, Result},
7 utils::{chunk_text, detect_file_type, normalize_path, should_ignore_file, FileType},
8};
9
/// Snapshot of a single file produced by [`FileScanner::scan_directory`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfo {
    /// Normalized path of the file; falls back to the lossy string form of the
    /// raw path when normalization fails.
    pub path: String,
    /// File size in bytes, as reported by the file's metadata.
    pub size: u64,
    /// Last-modified time in whole seconds since the Unix epoch.
    pub modified_time: u64,
    /// Hash of the file as returned by `crate::utils::calculate_file_hash`.
    pub hash: String,
    /// Normalized scan root followed by each directory between the root and the file.
    pub parent_dirs: Vec<String>,
    /// File contents, or `None` when the file exceeded the size limit or could
    /// not be read as a string.
    pub content: Option<String>,
    /// Human-readable reason why `content` is `None`, if it is.
    pub errors: Option<String>,
}
20
/// Walks a directory tree and collects a [`FileInfo`] for every file that is
/// not excluded by the ignore patterns.
#[derive(Debug, Clone)]
pub struct FileScanner {
    /// Files larger than this many bytes are recorded without their content.
    max_file_size: u64,
    /// Patterns handed to `should_ignore_file` to decide which files to skip.
    ignore_patterns: Vec<String>,
}
25
26impl Default for FileScanner {
27 fn default() -> Self {
28 Self::new()
29 }
30}
31
32impl FileScanner {
33 pub fn new() -> Self {
34 Self {
35 max_file_size: 10 * 1024 * 1024, ignore_patterns: vec![
37 ".git".to_string(),
38 "node_modules".to_string(),
39 "target".to_string(),
40 ],
41 }
42 }
43
44 pub fn with_ignore_patterns(ignore_patterns: Vec<String>) -> Self {
45 Self {
46 max_file_size: 10 * 1024 * 1024,
47 ignore_patterns,
48 }
49 }
50
51 pub fn with_max_size(max_size: u64) -> Self {
52 Self {
53 max_file_size: max_size,
54 ignore_patterns: vec![],
55 }
56 }
57
58 pub async fn scan_directory(&self, dir_path: &Path) -> Result<Vec<FileInfo>> {
59 let mut files = Vec::new();
60
61 for entry in WalkDir::new(dir_path).follow_links(false) {
62 let entry = entry.map_err(|e| {
63 IndexerError::file_processing(format!("Error walking directory: {e}"))
64 })?;
65
66 let path = entry.path();
67
68 if path.is_dir() {
70 continue;
71 }
72
73 if should_ignore_file(path, &self.ignore_patterns) {
75 debug!("Ignoring file due to patterns: {path:?}");
76 continue;
77 }
78
79 let metadata = tokio::fs::metadata(path).await?;
81 let size = metadata.len();
82
83 let modified_time = metadata
84 .modified()?
85 .duration_since(std::time::UNIX_EPOCH)
86 .map_err(|e| IndexerError::file_processing(format!("Invalid modified time: {e}")))?
87 .as_secs();
88
89 let hash = crate::utils::calculate_file_hash(path)?;
91
92 let parent_dirs = self.extract_parent_directories(path, dir_path);
94
95 let (content, errors) = if size > self.max_file_size {
97 (
98 None,
99 Some(format!(
100 "File too large: {size} bytes (max: {})",
101 self.max_file_size
102 )),
103 )
104 } else {
105 match tokio::fs::read_to_string(path).await {
106 Ok(content) => (Some(content), None),
107 Err(e) => (None, Some(format!("Failed to read file: {e}"))),
108 }
109 };
110
111 let normalized_path = match normalize_path(path) {
113 Ok(p) => p,
114 Err(e) => {
115 warn!("Failed to normalize path {path:?}: {e}");
116 path.to_string_lossy().to_string()
117 }
118 };
119
120 files.push(FileInfo {
121 path: normalized_path,
122 size,
123 modified_time,
124 hash,
125 parent_dirs,
126 content,
127 errors,
128 });
129 }
130
131 Ok(files)
132 }
133
134 fn extract_parent_directories(&self, file_path: &Path, root_dir: &Path) -> Vec<String> {
135 let mut parent_dirs = Vec::new();
136
137 if let Ok(normalized_root) = normalize_path(root_dir) {
139 parent_dirs.push(normalized_root);
140 }
141
142 if let Ok(relative_path) = file_path.strip_prefix(root_dir) {
144 let mut current = root_dir.to_path_buf();
145 for component in relative_path.parent().unwrap_or(Path::new("")).components() {
146 current = current.join(component);
147 if let Ok(normalized_current) = normalize_path(¤t) {
148 parent_dirs.push(normalized_current);
149 }
150 }
151 }
152
153 parent_dirs
154 }
155}
156
/// Discovers files in a directory tree and turns individual files into
/// chunked [`ProcessedFile`] values.
#[derive(Debug, Clone)]
pub struct FileProcessor {
    /// Files larger than this many bytes are skipped by `walk_directory`.
    max_file_size: u64,
    /// Patterns handed to `should_ignore_file` to decide which files to skip.
    ignore_patterns: Vec<String>,
    /// Chunk size passed to `chunk_text`.
    chunk_size: usize,
    /// Overlap passed to `chunk_text`.
    overlap: usize,
}
163
164#[derive(Debug, Clone)]
165pub struct ProcessedFile {
166 pub path: PathBuf,
167 pub content: String,
168 pub chunks: Vec<String>,
169 pub file_type: Option<FileType>,
170 pub size: u64,
171 pub hash: String,
172}
173
174#[derive(Debug, Clone)]
175pub struct FileMetadata {
176 pub path: PathBuf,
177 pub size: u64,
178 pub modified_time: u64,
179 pub file_type: Option<FileType>,
180}
181
182impl FileProcessor {
183 pub fn new(
184 max_file_size: u64,
185 ignore_patterns: Vec<String>,
186 chunk_size: usize,
187 overlap: usize,
188 ) -> Self {
189 Self {
190 max_file_size,
191 ignore_patterns,
192 chunk_size,
193 overlap,
194 }
195 }
196
197 pub async fn walk_directory(&self, dir_path: &Path) -> Result<Vec<FileMetadata>> {
198 let mut files = Vec::new();
199
200 for entry in WalkDir::new(dir_path).follow_links(false) {
201 let entry = entry.map_err(|e| {
202 IndexerError::file_processing(format!("Error walking directory: {e}"))
203 })?;
204
205 let path = entry.path();
206
207 if path.is_dir() {
209 continue;
210 }
211
212 if should_ignore_file(path, &self.ignore_patterns) {
214 debug!("Ignoring file due to patterns: {path:?}");
215 continue;
216 }
217
218 let metadata = tokio::fs::metadata(path).await?;
220 let size = metadata.len();
221
222 if size > self.max_file_size {
224 warn!("Skipping large file ({size} bytes): {path:?}");
225 continue;
226 }
227
228 let modified_time = metadata
229 .modified()?
230 .duration_since(std::time::UNIX_EPOCH)
231 .map_err(|e| IndexerError::file_processing(format!("Invalid modified time: {e}")))?
232 .as_secs();
233
234 let file_type = detect_file_type(path);
235
236 files.push(FileMetadata {
237 path: path.to_path_buf(),
238 size,
239 modified_time,
240 file_type,
241 });
242 }
243
244 Ok(files)
245 }
246
247 pub async fn process_file(&self, path: &Path) -> Result<ProcessedFile> {
248 debug!("Processing file: {path:?}");
249
250 let content = tokio::fs::read_to_string(path)
252 .await
253 .map_err(|e| IndexerError::file_processing(format!("Failed to read file: {e}")))?;
254
255 let chunks = chunk_text(&content, self.chunk_size, self.overlap);
257
258 let metadata = tokio::fs::metadata(path).await?;
260 let size = metadata.len();
261 let file_type = detect_file_type(path);
262
263 let hash = crate::utils::calculate_file_hash(path)?;
265
266 Ok(ProcessedFile {
267 path: path.to_path_buf(),
268 content,
269 chunks,
270 file_type,
271 size,
272 hash,
273 })
274 }
275
276 pub fn should_process_file(&self, file_type: &Option<FileType>) -> bool {
277 match file_type {
279 Some(FileType::Text)
280 | Some(FileType::Code)
281 | Some(FileType::Data)
282 | Some(FileType::Markup)
283 | Some(FileType::Config) => true,
284 None => false,
285 }
286 }
287
288 pub fn extract_parent_directories(
289 &self,
290 file_path: &Path,
291 root_dirs: &[PathBuf],
292 ) -> Vec<String> {
293 let mut parent_dirs = Vec::new();
294
295 for root in root_dirs {
296 if let Ok(relative_path) = file_path.strip_prefix(root) {
297 if let Some(parent) = relative_path.parent() {
298 if let Ok(normalized_parent) = normalize_path(root.join(parent)) {
299 parent_dirs.push(normalized_parent);
300 }
301 }
302 if let Ok(normalized_root) = normalize_path(root) {
303 parent_dirs.push(normalized_root);
304 }
305 break;
306 }
307 }
308
309 parent_dirs
310 }
311}
312
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Every file written into a fresh temporary directory is reported by the walk.
    #[tokio::test]
    async fn test_walk_directory() {
        let sandbox = TempDir::new().unwrap();
        let root = sandbox.path();

        for (name, body) in [("test.txt", "test content"), ("test.md", "# Test")] {
            fs::write(root.join(name), body).unwrap();
        }

        let found = FileProcessor::new(1024 * 1024, vec![], 512, 50)
            .walk_directory(root)
            .await
            .unwrap();

        assert_eq!(found.len(), 2);
    }

    /// A small text file yields non-empty content, at least one chunk and the
    /// `Text` file type.
    #[tokio::test]
    async fn test_process_file() {
        let sandbox = TempDir::new().unwrap();
        let target = sandbox.path().join("test.txt");
        fs::write(
            &target,
            "This is a test file content that should be chunked.",
        )
        .unwrap();

        let result = FileProcessor::new(1024 * 1024, vec![], 20, 5)
            .process_file(&target)
            .await
            .unwrap();

        assert!(!result.content.is_empty());
        assert!(!result.chunks.is_empty());
        assert_eq!(result.file_type, Some(FileType::Text));
    }
}