aurora_semantic/ignore/
mod.rs

1//! Ignore rules module for filtering files during indexing.
2//!
3//! This module provides functionality to filter files based on:
4//! - .gitignore patterns
5//! - .ignore files
6//! - Custom patterns
7//! - File extensions
8//! - Directory names
9//! - File size limits
10
11use std::path::{Path, PathBuf};
12
13use ignore::gitignore::{Gitignore, GitignoreBuilder};
14use ignore::WalkBuilder;
15
16use crate::config::IgnoreConfig;
17use crate::error::Result;
18
/// File filter for determining which files to index.
///
/// A file is rejected when it exceeds the configured size limit, has an
/// ignored extension, contains an ignored directory name among its path
/// components, matches the loaded gitignore rules, or matches one of the
/// compiled custom glob patterns.
pub struct FileFilter {
    /// Settings consulted on every check: size limit, ignored extensions and
    /// directory names, the gitignore toggle and the raw custom patterns.
    config: IgnoreConfig,
    /// Gitignore matcher; `None` while no gitignore rules have been loaded.
    gitignore: Option<Gitignore>,
    /// Glob patterns compiled from `config.patterns`; empty until compiled.
    custom_patterns: Vec<glob::Pattern>,
}
25
26impl FileFilter {
27    /// Create a new file filter with the given configuration.
28    pub fn new(config: IgnoreConfig) -> Self {
29        Self {
30            config,
31            gitignore: None,
32            custom_patterns: Vec::new(),
33        }
34    }
35
36    /// Create a file filter for a specific workspace.
37    pub fn for_workspace(root: &Path, config: IgnoreConfig) -> Result<Self> {
38        let mut filter = Self::new(config);
39        filter.load_gitignore(root)?;
40        filter.compile_patterns()?;
41        Ok(filter)
42    }
43
44    /// Load .gitignore file from workspace root.
45    fn load_gitignore(&mut self, root: &Path) -> Result<()> {
46        if !self.config.use_gitignore {
47            return Ok(());
48        }
49
50        let gitignore_path = root.join(".gitignore");
51        if gitignore_path.exists() {
52            let mut builder = GitignoreBuilder::new(root);
53            builder.add(&gitignore_path);
54
55            // Also check for global gitignore
56            if let Some(home) = dirs::home_dir() {
57                let global_gitignore = home.join(".gitignore_global");
58                if global_gitignore.exists() {
59                    builder.add(&global_gitignore);
60                }
61            }
62
63            self.gitignore = builder.build().ok();
64        }
65
66        Ok(())
67    }
68
69    /// Compile custom glob patterns.
70    fn compile_patterns(&mut self) -> Result<()> {
71        for pattern in &self.config.patterns {
72            if let Ok(compiled) = glob::Pattern::new(pattern) {
73                self.custom_patterns.push(compiled);
74            }
75        }
76        Ok(())
77    }
78
79    /// Check if a file should be indexed.
80    pub fn should_index(&self, path: &Path, file_size: u64) -> bool {
81        // Check file size limit
82        if file_size > self.config.max_file_size {
83            return false;
84        }
85
86        // Check extension
87        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
88            let ext_lower = ext.to_lowercase();
89            if self.config.ignored_extensions.iter().any(|e| e == &ext_lower) {
90                return false;
91            }
92        }
93
94        // Check if in ignored directory
95        for component in path.components() {
96            if let Some(name) = component.as_os_str().to_str() {
97                if self.config.ignored_directories.iter().any(|d| d == name) {
98                    return false;
99                }
100            }
101        }
102
103        // Check gitignore
104        if let Some(ref gitignore) = self.gitignore {
105            if gitignore.matched(path, path.is_dir()).is_ignore() {
106                return false;
107            }
108        }
109
110        // Check custom patterns
111        let path_str = path.to_string_lossy();
112        for pattern in &self.custom_patterns {
113            if pattern.matches(&path_str) {
114                return false;
115            }
116        }
117
118        true
119    }
120
121    /// Check if a directory should be traversed.
122    pub fn should_traverse(&self, path: &Path) -> bool {
123        // Check if directory is in ignored list
124        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
125            if self.config.ignored_directories.iter().any(|d| d == name) {
126                return false;
127            }
128        }
129
130        // Check gitignore
131        if let Some(ref gitignore) = self.gitignore {
132            if gitignore.matched(path, true).is_ignore() {
133                return false;
134            }
135        }
136
137        true
138    }
139}
140
/// Walks a directory tree and reports the files that should be indexed.
///
/// Traversal is delegated to `ignore::WalkBuilder`; every regular file it
/// yields is additionally checked with the stored [`FileFilter`].
pub struct FileWalker {
    /// Filter applied to every file found during traversal.
    filter: FileFilter,
    /// Directory at which traversal starts.
    root: PathBuf,
}
146
147impl FileWalker {
148    /// Create a new file walker.
149    pub fn new(root: PathBuf, config: IgnoreConfig) -> Result<Self> {
150        let filter = FileFilter::for_workspace(&root, config)?;
151        Ok(Self { filter, root })
152    }
153
154    /// Walk the directory and collect files to index.
155    pub fn walk(&self) -> Result<Vec<PathBuf>> {
156        let mut files = Vec::new();
157
158        let walker = WalkBuilder::new(&self.root)
159            .hidden(false) // Show hidden files
160            .git_ignore(self.filter.config.use_gitignore)
161            .git_global(self.filter.config.use_gitignore)
162            .git_exclude(self.filter.config.use_gitignore)
163            .ignore(self.filter.config.use_ignore_files)
164            .build();
165
166        for entry in walker.filter_map(|e| e.ok()) {
167            let path = entry.path();
168
169            // Skip directories
170            if path.is_dir() {
171                continue;
172            }
173
174            // Get file size
175            let metadata = match std::fs::metadata(path) {
176                Ok(m) => m,
177                Err(_) => continue,
178            };
179
180            // Apply filter
181            if self.filter.should_index(path, metadata.len()) {
182                files.push(path.to_path_buf());
183            }
184        }
185
186        Ok(files)
187    }
188
189    /// Walk the directory with a callback for each file.
190    pub fn walk_with_callback<F>(&self, mut callback: F) -> Result<()>
191    where
192        F: FnMut(&Path, u64) -> bool, // Return false to stop
193    {
194        let walker = WalkBuilder::new(&self.root)
195            .hidden(false)
196            .git_ignore(self.filter.config.use_gitignore)
197            .git_global(self.filter.config.use_gitignore)
198            .git_exclude(self.filter.config.use_gitignore)
199            .ignore(self.filter.config.use_ignore_files)
200            .build();
201
202        for entry in walker.filter_map(|e| e.ok()) {
203            let path = entry.path();
204
205            if path.is_dir() {
206                continue;
207            }
208
209            let metadata = match std::fs::metadata(path) {
210                Ok(m) => m,
211                Err(_) => continue,
212            };
213
214            if self.filter.should_index(path, metadata.len()) {
215                if !callback(path, metadata.len()) {
216                    break;
217                }
218            }
219        }
220
221        Ok(())
222    }
223}
224
225/// Scan a workspace and count files without loading them.
226#[allow(dead_code)]
227pub fn scan_workspace(root: &Path, config: &IgnoreConfig) -> Result<WorkspaceScan> {
228    let filter = FileFilter::for_workspace(root, config.clone())?;
229
230    let walker = WalkBuilder::new(root)
231        .hidden(false)
232        .git_ignore(config.use_gitignore)
233        .git_global(config.use_gitignore)
234        .git_exclude(config.use_gitignore)
235        .ignore(config.use_ignore_files)
236        .build();
237
238    let mut scan = WorkspaceScan {
239        file_count: 0,
240        total_bytes: 0,
241        by_extension: std::collections::HashMap::new(),
242    };
243
244    for entry in walker.filter_map(|e| e.ok()) {
245        let path = entry.path();
246
247        if path.is_dir() {
248            continue;
249        }
250
251        let metadata = match std::fs::metadata(path) {
252            Ok(m) => m,
253            Err(_) => continue,
254        };
255
256        if filter.should_index(path, metadata.len()) {
257            scan.file_count += 1;
258            scan.total_bytes += metadata.len();
259
260            if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
261                *scan.by_extension.entry(ext.to_lowercase()).or_insert(0) += 1;
262            }
263        }
264    }
265
266    Ok(scan)
267}
268
/// Result of scanning a workspace.
///
/// `Default` yields an empty scan (zero files, zero bytes, no extensions),
/// and the equality and clone derives make scans easy to compare and copy.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
#[allow(dead_code)]
pub struct WorkspaceScan {
    /// Number of files to index.
    pub file_count: usize,
    /// Total size of files in bytes.
    pub total_bytes: u64,
    /// File count by lowercased extension; files without an extension are
    /// not represented here.
    pub by_extension: std::collections::HashMap<String, usize>,
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Size used where the size limit is not the property under test.
    const SMALL: u64 = 100;

    /// Filter built from the default configuration; no gitignore rules or
    /// custom patterns are loaded.
    fn default_filter() -> FileFilter {
        FileFilter::new(IgnoreConfig::default())
    }

    #[test]
    fn test_file_filter_extension() {
        let filter = default_filter();

        // Binary artefacts are rejected by extension.
        for binary in ["test.exe", "test.dll"] {
            assert!(!filter.should_index(Path::new(binary), SMALL));
        }

        // Source files are accepted.
        for source in ["test.rs", "test.py"] {
            assert!(filter.should_index(Path::new(source), SMALL));
        }
    }

    #[test]
    fn test_file_filter_size() {
        let filter = FileFilter::new(IgnoreConfig {
            max_file_size: 1000,
            ..Default::default()
        });

        // Below the limit is accepted, above it is rejected.
        assert!(filter.should_index(Path::new("small.rs"), 500));
        assert!(!filter.should_index(Path::new("large.rs"), 2000));
    }

    #[test]
    fn test_file_filter_directory() {
        let filter = default_filter();

        // Anything below an ignored directory name is rejected.
        for ignored in ["node_modules/package/index.js", "target/debug/main.rs"] {
            assert!(!filter.should_index(Path::new(ignored), SMALL));
        }

        assert!(filter.should_index(Path::new("src/main.rs"), SMALL));
    }
}
320}