aurora_semantic/ignore/
mod.rs

1//! Ignore rules module for filtering files during indexing.
2//!
3//! This module provides functionality to filter files based on:
4//! - .gitignore patterns
5//! - .ignore files
6//! - Custom patterns
7//! - File extensions
8//! - Directory names
9//! - File size limits
10
11use std::path::{Path, PathBuf};
12
13use ignore::gitignore::{Gitignore, GitignoreBuilder};
14use ignore::WalkBuilder;
15
16use crate::config::IgnoreConfig;
17use crate::error::Result;
18
19/// File filter for determining which files to index.
20pub struct FileFilter {
21    config: IgnoreConfig,
22    gitignore: Option<Gitignore>,
23    custom_patterns: Vec<glob::Pattern>,
24    /// Root path of the workspace (for computing relative paths).
25    root_path: Option<PathBuf>,
26    /// Normalized excluded file paths (relative to workspace root).
27    excluded_files: Vec<PathBuf>,
28    /// Normalized excluded directory paths (relative to workspace root).
29    excluded_directory_paths: Vec<PathBuf>,
30}
31
32impl FileFilter {
33    /// Create a new file filter with the given configuration.
34    pub fn new(config: IgnoreConfig) -> Self {
35        // Normalize excluded paths for cross-platform matching
36        let excluded_files: Vec<PathBuf> = config.excluded_files.iter()
37            .map(|p| Self::normalize_path(p))
38            .collect();
39        let excluded_directory_paths: Vec<PathBuf> = config.excluded_directories.iter()
40            .map(|p| Self::normalize_path(p))
41            .collect();
42        
43        Self {
44            config,
45            gitignore: None,
46            custom_patterns: Vec::new(),
47            root_path: None,
48            excluded_files,
49            excluded_directory_paths,
50        }
51    }
52
53    /// Create a file filter for a specific workspace.
54    pub fn for_workspace(root: &Path, config: IgnoreConfig) -> Result<Self> {
55        let mut filter = Self::new(config);
56        filter.root_path = Some(root.to_path_buf());
57        filter.load_gitignore(root)?;
58        filter.compile_patterns()?;
59        Ok(filter)
60    }
61
62    /// Normalize a path for cross-platform comparison.
63    /// Converts backslashes to forward slashes and removes trailing slashes.
64    fn normalize_path(path: &Path) -> PathBuf {
65        let path_str = path.to_string_lossy().replace('\\', "/");
66        let trimmed = path_str.trim_end_matches('/');
67        PathBuf::from(trimmed)
68    }
69
70    /// Get the relative path from the workspace root, normalized for comparison.
71    fn get_relative_path(&self, path: &Path) -> PathBuf {
72        let relative = if let Some(ref root) = self.root_path {
73            path.strip_prefix(root).unwrap_or(path)
74        } else {
75            path
76        };
77        Self::normalize_path(relative)
78    }
79
80    /// Load .gitignore file from workspace root.
81    fn load_gitignore(&mut self, root: &Path) -> Result<()> {
82        if !self.config.use_gitignore {
83            return Ok(());
84        }
85
86        let gitignore_path = root.join(".gitignore");
87        if gitignore_path.exists() {
88            let mut builder = GitignoreBuilder::new(root);
89            builder.add(&gitignore_path);
90
91            // Also check for global gitignore
92            if let Some(home) = dirs::home_dir() {
93                let global_gitignore = home.join(".gitignore_global");
94                if global_gitignore.exists() {
95                    builder.add(&global_gitignore);
96                }
97            }
98
99            self.gitignore = builder.build().ok();
100        }
101
102        Ok(())
103    }
104
105    /// Compile custom glob patterns.
106    fn compile_patterns(&mut self) -> Result<()> {
107        for pattern in &self.config.patterns {
108            if let Ok(compiled) = glob::Pattern::new(pattern) {
109                self.custom_patterns.push(compiled);
110            }
111        }
112        Ok(())
113    }
114
115    /// Check if a file should be indexed.
116    pub fn should_index(&self, path: &Path, file_size: u64) -> bool {
117        // Check file size limit
118        if file_size > self.config.max_file_size {
119            return false;
120        }
121
122        // Get filename for pattern matching
123        let filename = path.file_name()
124            .and_then(|n| n.to_str())
125            .map(|s| s.to_lowercase())
126            .unwrap_or_default();
127
128        // Check for specific ignored filenames (lock files, etc.)
129        // These are critical files that should NEVER be indexed
130        const IGNORED_FILENAMES: &[&str] = &[
131            "pnpm-lock.yaml", "package-lock.json", "yarn.lock", "bun.lockb",
132            "cargo.lock", "gemfile.lock", "composer.lock", "poetry.lock",
133            "pipfile.lock", "pubspec.lock", "packages.lock.json",
134            "shrinkwrap.yaml", "npm-shrinkwrap.json",
135            ".ds_store", "thumbs.db", "desktop.ini",
136            ".gitignore", ".gitattributes", ".gitmodules",
137            ".npmrc", ".yarnrc", ".nvmrc", ".node-version",
138            ".env", ".env.local", ".env.development", ".env.production",
139        ];
140        
141        if IGNORED_FILENAMES.iter().any(|&f| filename == f) {
142            return false;
143        }
144
145        // Check extension
146        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
147            let ext_lower = ext.to_lowercase();
148            if self.config.ignored_extensions.iter().any(|e| e == &ext_lower) {
149                return false;
150            }
151        }
152
153        // Check if in ignored directory
154        for component in path.components() {
155            if let Some(name) = component.as_os_str().to_str() {
156                if self.config.ignored_directories.iter().any(|d| d == name) {
157                    return false;
158                }
159            }
160        }
161
162        // Check gitignore
163        if let Some(ref gitignore) = self.gitignore {
164            if gitignore.matched(path, path.is_dir()).is_ignore() {
165                return false;
166            }
167        }
168
169        // Check custom patterns
170        let path_str = path.to_string_lossy();
171        for pattern in &self.custom_patterns {
172            if pattern.matches(&path_str) {
173                return false;
174            }
175        }
176
177        // Check explicitly excluded file paths
178        let relative_path = self.get_relative_path(path);
179        if self.excluded_files.iter().any(|excluded| {
180            *excluded == relative_path
181        }) {
182            return false;
183        }
184
185        // Check if file is inside an explicitly excluded directory
186        for excluded_dir in &self.excluded_directory_paths {
187            if relative_path.starts_with(excluded_dir) {
188                return false;
189            }
190        }
191
192        true
193    }
194
195    /// Check if a directory should be traversed.
196    pub fn should_traverse(&self, path: &Path) -> bool {
197        // Check if directory is in ignored list
198        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
199            if self.config.ignored_directories.iter().any(|d| d == name) {
200                return false;
201            }
202        }
203
204        // Check gitignore
205        if let Some(ref gitignore) = self.gitignore {
206            if gitignore.matched(path, true).is_ignore() {
207                return false;
208            }
209        }
210
211        // Check explicitly excluded directory paths
212        let relative_path = self.get_relative_path(path);
213        for excluded_dir in &self.excluded_directory_paths {
214            if relative_path == *excluded_dir || relative_path.starts_with(excluded_dir) {
215                return false;
216            }
217        }
218
219        true
220    }
221}
222
223/// Walk a directory and yield files that should be indexed.
224pub struct FileWalker {
225    filter: FileFilter,
226    root: PathBuf,
227}
228
229impl FileWalker {
230    /// Create a new file walker.
231    pub fn new(root: PathBuf, config: IgnoreConfig) -> Result<Self> {
232        let filter = FileFilter::for_workspace(&root, config)?;
233        Ok(Self { filter, root })
234    }
235
236    /// Walk the directory and collect files to index.
237    pub fn walk(&self) -> Result<Vec<PathBuf>> {
238        let mut files = Vec::new();
239
240        let walker = WalkBuilder::new(&self.root)
241            .hidden(false) // Show hidden files
242            .git_ignore(self.filter.config.use_gitignore)
243            .git_global(self.filter.config.use_gitignore)
244            .git_exclude(self.filter.config.use_gitignore)
245            .ignore(self.filter.config.use_ignore_files)
246            .build();
247
248        for entry in walker.filter_map(|e| e.ok()) {
249            let path = entry.path();
250
251            // Skip directories
252            if path.is_dir() {
253                continue;
254            }
255
256            // Get file size
257            let metadata = match std::fs::metadata(path) {
258                Ok(m) => m,
259                Err(_) => continue,
260            };
261
262            // Apply filter
263            if self.filter.should_index(path, metadata.len()) {
264                files.push(path.to_path_buf());
265            }
266        }
267
268        Ok(files)
269    }
270
271    /// Walk the directory with a callback for each file.
272    pub fn walk_with_callback<F>(&self, mut callback: F) -> Result<()>
273    where
274        F: FnMut(&Path, u64) -> bool, // Return false to stop
275    {
276        let walker = WalkBuilder::new(&self.root)
277            .hidden(false)
278            .git_ignore(self.filter.config.use_gitignore)
279            .git_global(self.filter.config.use_gitignore)
280            .git_exclude(self.filter.config.use_gitignore)
281            .ignore(self.filter.config.use_ignore_files)
282            .build();
283
284        for entry in walker.filter_map(|e| e.ok()) {
285            let path = entry.path();
286
287            if path.is_dir() {
288                continue;
289            }
290
291            let metadata = match std::fs::metadata(path) {
292                Ok(m) => m,
293                Err(_) => continue,
294            };
295
296            if self.filter.should_index(path, metadata.len()) {
297                if !callback(path, metadata.len()) {
298                    break;
299                }
300            }
301        }
302
303        Ok(())
304    }
305}
306
307/// Scan a workspace and count files without loading them.
308#[allow(dead_code)]
309pub fn scan_workspace(root: &Path, config: &IgnoreConfig) -> Result<WorkspaceScan> {
310    let filter = FileFilter::for_workspace(root, config.clone())?;
311
312    let walker = WalkBuilder::new(root)
313        .hidden(false)
314        .git_ignore(config.use_gitignore)
315        .git_global(config.use_gitignore)
316        .git_exclude(config.use_gitignore)
317        .ignore(config.use_ignore_files)
318        .build();
319
320    let mut scan = WorkspaceScan {
321        file_count: 0,
322        total_bytes: 0,
323        by_extension: std::collections::HashMap::new(),
324    };
325
326    for entry in walker.filter_map(|e| e.ok()) {
327        let path = entry.path();
328
329        if path.is_dir() {
330            continue;
331        }
332
333        let metadata = match std::fs::metadata(path) {
334            Ok(m) => m,
335            Err(_) => continue,
336        };
337
338        if filter.should_index(path, metadata.len()) {
339            scan.file_count += 1;
340            scan.total_bytes += metadata.len();
341
342            if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
343                *scan.by_extension.entry(ext.to_lowercase()).or_insert(0) += 1;
344            }
345        }
346    }
347
348    Ok(scan)
349}
350
/// Result of scanning a workspace.
///
/// Plain data: it can be cloned, compared, and created empty through
/// `Default` (all counters zero, no extensions recorded).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[allow(dead_code)]
pub struct WorkspaceScan {
    /// Number of files to index.
    pub file_count: usize,
    /// Total size of files in bytes.
    pub total_bytes: u64,
    /// File count by extension.
    pub by_extension: std::collections::HashMap<String, usize>,
}
362
#[cfg(test)]
mod tests {
    use super::*;

    /// File size used where the size is irrelevant to the assertion.
    const ANY_SIZE: u64 = 100;

    /// Filter built from the default configuration, without a workspace.
    fn default_filter() -> FileFilter {
        FileFilter::new(IgnoreConfig::default())
    }

    /// Filter bound to the fictional `/project` workspace.
    fn project_filter(config: IgnoreConfig) -> FileFilter {
        FileFilter::for_workspace(Path::new("/project"), config).unwrap()
    }

    #[test]
    fn test_file_filter_extension() {
        let filter = default_filter();

        // Binary artifacts are rejected.
        for binary in &["test.exe", "test.dll"] {
            assert!(!filter.should_index(Path::new(binary), ANY_SIZE));
        }

        // Source files are accepted.
        for source in &["test.rs", "test.py"] {
            assert!(filter.should_index(Path::new(source), ANY_SIZE));
        }
    }

    #[test]
    fn test_file_filter_size() {
        let filter = FileFilter::new(IgnoreConfig {
            max_file_size: 1000,
            ..Default::default()
        });

        assert!(filter.should_index(Path::new("small.rs"), 500));
        assert!(!filter.should_index(Path::new("large.rs"), 2000));
    }

    #[test]
    fn test_file_filter_directory() {
        let filter = default_filter();

        for ignored in &["node_modules/package/index.js", "target/debug/main.rs"] {
            assert!(!filter.should_index(Path::new(ignored), ANY_SIZE));
        }
        assert!(filter.should_index(Path::new("src/main.rs"), ANY_SIZE));
    }

    #[test]
    fn test_file_filter_excluded_paths() {
        let filter = project_filter(IgnoreConfig {
            excluded_files: vec![PathBuf::from("src/generated.rs")],
            excluded_directories: vec![PathBuf::from("vendor/libs")],
            ..Default::default()
        });

        // The explicitly excluded file is rejected...
        assert!(!filter.should_index(Path::new("/project/src/generated.rs"), ANY_SIZE));
        // ...while its siblings are still accepted.
        assert!(filter.should_index(Path::new("/project/src/main.rs"), ANY_SIZE));

        // Files below an excluded directory are rejected.
        assert!(!filter.should_index(Path::new("/project/vendor/libs/helper.rs"), ANY_SIZE));

        // The excluded directory is not traversed; other directories are.
        assert!(!filter.should_traverse(Path::new("/project/vendor/libs")));
        assert!(filter.should_traverse(Path::new("/project/src")));
    }

    #[test]
    fn test_file_filter_aurora_excluded_by_default() {
        let filter = project_filter(IgnoreConfig::default());

        // The Aurora index directory is excluded by the default configuration.
        assert!(!filter.should_traverse(Path::new("/project/.aurora")));
        assert!(!filter.should_index(Path::new("/project/.aurora/index.bin"), ANY_SIZE));
    }
}