Skip to main content

directory_indexer/indexing/
files.rs

1use log::{debug, warn};
2use std::path::{Path, PathBuf};
3use walkdir::WalkDir;
4
5use crate::{
6    error::{IndexerError, Result},
7    utils::{chunk_text, detect_file_type, normalize_path, should_ignore_file, FileType},
8};
9
/// Snapshot of a single file as produced by `FileScanner::scan_directory`.
#[derive(Debug, Clone)]
pub struct FileInfo {
    /// Path as returned by `normalize_path`; when normalization fails this is
    /// the lossy string form of the raw path instead.
    pub path: String,
    /// File length in bytes, taken from the filesystem metadata.
    pub size: u64,
    /// Last-modified time in whole seconds since the Unix epoch.
    pub modified_time: u64,
    /// Value returned by `crate::utils::calculate_file_hash` for this file.
    pub hash: String,
    /// Normalized scan root followed by every directory between the root and
    /// the file, outermost first. Directories that fail to normalize are omitted.
    pub parent_dirs: Vec<String>,
    /// File text. `None` when the file is larger than the scanner's size limit
    /// or could not be read as a string.
    pub content: Option<String>,
    /// Explanation of why `content` is `None` (too large, or read failure);
    /// `None` when the content was read successfully.
    pub errors: Option<String>,
}
20
/// Walks a directory tree and collects one `FileInfo` per file found.
pub struct FileScanner {
    /// Size threshold in bytes; larger files are still reported but without content.
    max_file_size: u64,
    /// Patterns handed to `should_ignore_file`; files that match are skipped.
    ignore_patterns: Vec<String>,
}
25
/// `FileScanner::default()` is the same as `FileScanner::new()`.
impl Default for FileScanner {
    /// Delegates to `FileScanner::new`.
    fn default() -> Self {
        Self::new()
    }
}
31
32impl FileScanner {
33    pub fn new() -> Self {
34        Self {
35            max_file_size: 10 * 1024 * 1024, // 10MB default
36            ignore_patterns: vec![
37                ".git".to_string(),
38                "node_modules".to_string(),
39                "target".to_string(),
40            ],
41        }
42    }
43
44    pub fn with_ignore_patterns(ignore_patterns: Vec<String>) -> Self {
45        Self {
46            max_file_size: 10 * 1024 * 1024,
47            ignore_patterns,
48        }
49    }
50
51    pub fn with_max_size(max_size: u64) -> Self {
52        Self {
53            max_file_size: max_size,
54            ignore_patterns: vec![],
55        }
56    }
57
58    pub async fn scan_directory(&self, dir_path: &Path) -> Result<Vec<FileInfo>> {
59        let mut files = Vec::new();
60
61        for entry in WalkDir::new(dir_path).follow_links(false) {
62            let entry = entry.map_err(|e| {
63                IndexerError::file_processing(format!("Error walking directory: {e}"))
64            })?;
65
66            let path = entry.path();
67
68            // Skip directories
69            if path.is_dir() {
70                continue;
71            }
72
73            // Apply ignore patterns
74            if should_ignore_file(path, &self.ignore_patterns) {
75                debug!("Ignoring file due to patterns: {path:?}");
76                continue;
77            }
78
79            // Get file metadata
80            let metadata = tokio::fs::metadata(path).await?;
81            let size = metadata.len();
82
83            let modified_time = metadata
84                .modified()?
85                .duration_since(std::time::UNIX_EPOCH)
86                .map_err(|e| IndexerError::file_processing(format!("Invalid modified time: {e}")))?
87                .as_secs();
88
89            // Calculate hash
90            let hash = crate::utils::calculate_file_hash(path)?;
91
92            // Extract parent directories
93            let parent_dirs = self.extract_parent_directories(path, dir_path);
94
95            // Check file size and read content if appropriate
96            let (content, errors) = if size > self.max_file_size {
97                (
98                    None,
99                    Some(format!(
100                        "File too large: {size} bytes (max: {})",
101                        self.max_file_size
102                    )),
103                )
104            } else {
105                match tokio::fs::read_to_string(path).await {
106                    Ok(content) => (Some(content), None),
107                    Err(e) => (None, Some(format!("Failed to read file: {e}"))),
108                }
109            };
110
111            // Normalize path for consistent cross-platform storage
112            let normalized_path = match normalize_path(path) {
113                Ok(p) => p,
114                Err(e) => {
115                    warn!("Failed to normalize path {path:?}: {e}");
116                    path.to_string_lossy().to_string()
117                }
118            };
119
120            files.push(FileInfo {
121                path: normalized_path,
122                size,
123                modified_time,
124                hash,
125                parent_dirs,
126                content,
127                errors,
128            });
129        }
130
131        Ok(files)
132    }
133
134    fn extract_parent_directories(&self, file_path: &Path, root_dir: &Path) -> Vec<String> {
135        let mut parent_dirs = Vec::new();
136
137        // Add the root directory (normalized)
138        if let Ok(normalized_root) = normalize_path(root_dir) {
139            parent_dirs.push(normalized_root);
140        }
141
142        // Add all parent directories between root and file
143        if let Ok(relative_path) = file_path.strip_prefix(root_dir) {
144            let mut current = root_dir.to_path_buf();
145            for component in relative_path.parent().unwrap_or(Path::new("")).components() {
146                current = current.join(component);
147                if let Ok(normalized_current) = normalize_path(&current) {
148                    parent_dirs.push(normalized_current);
149                }
150            }
151        }
152
153        parent_dirs
154    }
155}
156
/// Discovers files under a directory and turns individual files into chunked
/// `ProcessedFile` values.
pub struct FileProcessor {
    /// Size threshold in bytes; `walk_directory` skips files larger than this.
    max_file_size: u64,
    /// Patterns handed to `should_ignore_file`; files that match are skipped.
    ignore_patterns: Vec<String>,
    /// Chunk size passed to `chunk_text` (unit is defined by that helper).
    chunk_size: usize,
    /// Overlap passed to `chunk_text` (unit is defined by that helper).
    overlap: usize,
}
163
/// Result of `FileProcessor::process_file` for one file.
#[derive(Debug, Clone)]
pub struct ProcessedFile {
    /// Path the file was read from, exactly as passed to `process_file`.
    pub path: PathBuf,
    /// Entire file text.
    pub content: String,
    /// Pieces of `content` produced by `chunk_text`.
    pub chunks: Vec<String>,
    /// Result of `detect_file_type` for the path.
    pub file_type: Option<FileType>,
    /// File length in bytes, taken from the filesystem metadata.
    pub size: u64,
    /// Value returned by `crate::utils::calculate_file_hash` for this file.
    pub hash: String,
}
173
/// Lightweight description of a file found by `FileProcessor::walk_directory`;
/// the file's content is not read.
#[derive(Debug, Clone)]
pub struct FileMetadata {
    /// Path of the file as yielded by the directory walk.
    pub path: PathBuf,
    /// File length in bytes, taken from the filesystem metadata.
    pub size: u64,
    /// Last-modified time in whole seconds since the Unix epoch.
    pub modified_time: u64,
    /// Result of `detect_file_type` for the path.
    pub file_type: Option<FileType>,
}
181
182impl FileProcessor {
183    pub fn new(
184        max_file_size: u64,
185        ignore_patterns: Vec<String>,
186        chunk_size: usize,
187        overlap: usize,
188    ) -> Self {
189        Self {
190            max_file_size,
191            ignore_patterns,
192            chunk_size,
193            overlap,
194        }
195    }
196
197    pub async fn walk_directory(&self, dir_path: &Path) -> Result<Vec<FileMetadata>> {
198        let mut files = Vec::new();
199
200        for entry in WalkDir::new(dir_path).follow_links(false) {
201            let entry = entry.map_err(|e| {
202                IndexerError::file_processing(format!("Error walking directory: {e}"))
203            })?;
204
205            let path = entry.path();
206
207            // Skip directories
208            if path.is_dir() {
209                continue;
210            }
211
212            // Apply ignore patterns
213            if should_ignore_file(path, &self.ignore_patterns) {
214                debug!("Ignoring file due to patterns: {path:?}");
215                continue;
216            }
217
218            // Get file metadata
219            let metadata = tokio::fs::metadata(path).await?;
220            let size = metadata.len();
221
222            // Skip files that are too large
223            if size > self.max_file_size {
224                warn!("Skipping large file ({size} bytes): {path:?}");
225                continue;
226            }
227
228            let modified_time = metadata
229                .modified()?
230                .duration_since(std::time::UNIX_EPOCH)
231                .map_err(|e| IndexerError::file_processing(format!("Invalid modified time: {e}")))?
232                .as_secs();
233
234            let file_type = detect_file_type(path);
235
236            files.push(FileMetadata {
237                path: path.to_path_buf(),
238                size,
239                modified_time,
240                file_type,
241            });
242        }
243
244        Ok(files)
245    }
246
247    pub async fn process_file(&self, path: &Path) -> Result<ProcessedFile> {
248        debug!("Processing file: {path:?}");
249
250        // Read file content
251        let content = tokio::fs::read_to_string(path)
252            .await
253            .map_err(|e| IndexerError::file_processing(format!("Failed to read file: {e}")))?;
254
255        // Chunk the content
256        let chunks = chunk_text(&content, self.chunk_size, self.overlap);
257
258        // Get file metadata
259        let metadata = tokio::fs::metadata(path).await?;
260        let size = metadata.len();
261        let file_type = detect_file_type(path);
262
263        // Calculate hash
264        let hash = crate::utils::calculate_file_hash(path)?;
265
266        Ok(ProcessedFile {
267            path: path.to_path_buf(),
268            content,
269            chunks,
270            file_type,
271            size,
272            hash,
273        })
274    }
275
276    pub fn should_process_file(&self, file_type: &Option<FileType>) -> bool {
277        // Only process text-based files for now
278        match file_type {
279            Some(FileType::Text)
280            | Some(FileType::Code)
281            | Some(FileType::Data)
282            | Some(FileType::Markup)
283            | Some(FileType::Config) => true,
284            None => false,
285        }
286    }
287
288    pub fn extract_parent_directories(
289        &self,
290        file_path: &Path,
291        root_dirs: &[PathBuf],
292    ) -> Vec<String> {
293        let mut parent_dirs = Vec::new();
294
295        for root in root_dirs {
296            if let Ok(relative_path) = file_path.strip_prefix(root) {
297                if let Some(parent) = relative_path.parent() {
298                    if let Ok(normalized_parent) = normalize_path(root.join(parent)) {
299                        parent_dirs.push(normalized_parent);
300                    }
301                }
302                if let Ok(normalized_root) = normalize_path(root) {
303                    parent_dirs.push(normalized_root);
304                }
305                break;
306            }
307        }
308
309        parent_dirs
310    }
311}
312
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Walking a directory that holds two small files reports both of them.
    #[tokio::test]
    async fn test_walk_directory() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        // Populate the temporary directory with the fixture files.
        for (name, body) in [("test.txt", "test content"), ("test.md", "# Test")] {
            fs::write(root.join(name), body).unwrap();
        }

        let processor = FileProcessor::new(1024 * 1024, Vec::new(), 512, 50);
        let discovered = processor.walk_directory(root).await.unwrap();

        assert_eq!(discovered.len(), 2);
    }

    /// Processing a `.txt` file yields its content, at least one chunk, and
    /// the `Text` file type.
    #[tokio::test]
    async fn test_process_file() {
        let workspace = TempDir::new().unwrap();
        let target = workspace.path().join("test.txt");
        let body = "This is a test file content that should be chunked.";

        fs::write(&target, body).unwrap();

        let processor = FileProcessor::new(1024 * 1024, Vec::new(), 20, 5);
        let outcome = processor.process_file(&target).await.unwrap();

        assert!(!outcome.content.is_empty());
        assert!(!outcome.chunks.is_empty());
        assert_eq!(outcome.file_type, Some(FileType::Text));
    }
}
353}