Skip to main content

kaish_glob/
ignore.rs

//! Gitignore-style pattern matching.
//!
//! Implements gitignore semantics for filtering files and directories during walks.
4
5use std::path::Path;
6
7use crate::WalkerFs;
8use crate::glob::glob_match;
9
/// Upper bound, in bytes, on the size of a `.gitignore` file (1 MiB).
///
/// Anything larger is treated as corrupt or adversarial and is rejected
/// instead of being parsed.
const MAX_GITIGNORE_SIZE: usize = 1024 * 1024;
14
15/// A compiled ignore rule from a gitignore file.
16#[derive(Debug, Clone)]
17struct IgnoreRule {
18    /// The pattern to match.
19    pattern: String,
20    /// True if this rule negates (starts with !).
21    negated: bool,
22    /// True if this rule only matches directories (ends with /).
23    dir_only: bool,
24    /// True if this pattern is anchored (contains / not at end).
25    anchored: bool,
26}
27
28impl IgnoreRule {
29    fn parse(line: &str) -> Option<Self> {
30        let line = line.trim();
31
32        // Skip empty lines and comments
33        if line.is_empty() || line.starts_with('#') {
34            return None;
35        }
36
37        let mut pattern = line.to_string();
38        let mut negated = false;
39        let mut dir_only = false;
40
41        // Check for negation
42        if let Some(stripped) = pattern.strip_prefix('!') {
43            negated = true;
44            pattern = stripped.to_string();
45        }
46
47        // Check for directory-only pattern
48        if let Some(stripped) = pattern.strip_suffix('/') {
49            dir_only = true;
50            pattern = stripped.to_string();
51        }
52
53        // Check if anchored (contains / that's not at the end)
54        let anchored = pattern.contains('/');
55
56        // Remove leading /
57        if let Some(stripped) = pattern.strip_prefix('/') {
58            pattern = stripped.to_string();
59        }
60
61        Some(IgnoreRule {
62            pattern,
63            negated,
64            dir_only,
65            anchored,
66        })
67    }
68
69    fn matches(&self, path: &Path, is_dir: bool) -> bool {
70        // Dir-only rules only match directories
71        if self.dir_only && !is_dir {
72            return false;
73        }
74
75        let path_str = path.to_string_lossy();
76
77        if self.anchored {
78            // Anchored patterns match from the root
79            self.glob_match_path(&path_str)
80        } else {
81            // Non-anchored patterns can match anywhere
82            // Try matching the full path
83            if self.glob_match_path(&path_str) {
84                return true;
85            }
86
87            // Try matching just the filename
88            if let Some(name) = path.file_name() {
89                let name_str = name.to_string_lossy();
90                if glob_match(&self.pattern, &name_str) {
91                    return true;
92                }
93            }
94
95            false
96        }
97    }
98
99    fn glob_match_path(&self, path: &str) -> bool {
100        // Handle ** in patterns by converting to a match that works
101        if self.pattern.contains("**") {
102            self.match_with_globstar(path)
103        } else {
104            glob_match(&self.pattern, path)
105        }
106    }
107
108    fn match_with_globstar(&self, path: &str) -> bool {
109        // Split pattern by **
110        let parts: Vec<&str> = self.pattern.split("**").collect();
111
112        if parts.len() == 2 {
113            let prefix = parts[0].trim_end_matches('/');
114            let suffix = parts[1].trim_start_matches('/');
115
116            // Check prefix
117            let remaining = if prefix.is_empty() {
118                path
119            } else if let Some(rest) = path.strip_prefix(prefix) {
120                rest.trim_start_matches('/')
121            } else {
122                return false;
123            };
124
125            // Check suffix
126            if suffix.is_empty() {
127                return true;
128            }
129
130            // Try matching suffix against any tail of the path
131            for (i, _) in remaining.char_indices() {
132                let tail = &remaining[i..];
133                if glob_match(suffix, tail) {
134                    return true;
135                }
136            }
137
138            // Also try matching just the suffix
139            glob_match(suffix, remaining)
140        } else {
141            // Complex pattern with multiple **. Replacing ** with * is safe here
142            // because glob_match's * already crosses / boundaries (unlike POSIX
143            // path-aware globbing). This flattens O(n^k) multi-globstar into a
144            // single wildcard pass while preserving match semantics.
145            glob_match(&self.pattern.replace("**", "*"), path)
146        }
147    }
148}
149
150/// Filter for gitignore-style patterns.
151#[derive(Debug, Clone, Default)]
152pub struct IgnoreFilter {
153    rules: Vec<IgnoreRule>,
154}
155
156impl IgnoreFilter {
157    /// Create an empty ignore filter.
158    pub fn new() -> Self {
159        Self::default()
160    }
161
162    /// Create a filter with default ignores for common build artifacts.
163    pub fn with_defaults() -> Self {
164        let mut filter = Self::new();
165
166        // Always ignore .git
167        filter.add_rule(".git");
168
169        // Common build/dependency directories
170        filter.add_rule("node_modules");
171        filter.add_rule("target"); // Rust
172        filter.add_rule("__pycache__");
173        filter.add_rule(".venv");
174        filter.add_rule("venv");
175        filter.add_rule("dist");
176        filter.add_rule("build");
177        filter.add_rule(".next"); // Next.js
178
179        filter
180    }
181
182    /// Load patterns from a gitignore file via `WalkerFs`.
183    ///
184    /// Rejects files larger than 1 MiB to prevent memory exhaustion.
185    pub async fn from_gitignore(
186        path: &Path,
187        fs: &impl WalkerFs,
188    ) -> Result<Self, crate::WalkerError> {
189        let content = fs.read_file(path).await?;
190        if content.len() > MAX_GITIGNORE_SIZE {
191            return Err(crate::WalkerError::Io(format!(
192                "{}: gitignore too large ({} bytes, max {})",
193                path.display(),
194                content.len(),
195                MAX_GITIGNORE_SIZE,
196            )));
197        }
198        let text = String::from_utf8_lossy(&content);
199
200        let mut filter = Self::new();
201        for line in text.lines() {
202            if let Some(rule) = IgnoreRule::parse(line) {
203                filter.rules.push(rule);
204            }
205        }
206
207        Ok(filter)
208    }
209
210    /// Add a rule from a pattern string.
211    pub fn add_rule(&mut self, pattern: &str) {
212        if let Some(rule) = IgnoreRule::parse(pattern) {
213            self.rules.push(rule);
214        }
215    }
216
217    /// Check if a path should be ignored.
218    ///
219    /// Returns true if the path matches any non-negated rule
220    /// and doesn't match a later negated rule.
221    pub fn is_ignored(&self, path: &Path, is_dir: bool) -> bool {
222        let mut ignored = false;
223
224        for rule in &self.rules {
225            if rule.matches(path, is_dir) {
226                ignored = !rule.negated;
227            }
228        }
229
230        ignored
231    }
232
233    /// Check if a path component (single name) should be ignored.
234    ///
235    /// This is for quick filtering during directory traversal.
236    pub fn is_name_ignored(&self, name: &str, is_dir: bool) -> bool {
237        self.is_ignored(Path::new(name), is_dir)
238    }
239
240    /// Merge another filter's rules into this one.
241    ///
242    /// The other filter's rules are added after (and thus take precedence over)
243    /// this filter's rules.
244    pub fn merge(&mut self, other: &IgnoreFilter) {
245        self.rules.extend(other.rules.iter().cloned());
246    }
247
248    /// Create a new filter by merging this filter with another.
249    ///
250    /// Returns a new filter with the combined rules (other's rules take precedence).
251    pub fn merged_with(&self, other: &IgnoreFilter) -> IgnoreFilter {
252        let mut merged = self.clone();
253        merged.merge(other);
254        merged
255    }
256}
257
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a filter by adding `patterns` in order.
    fn filter_of(patterns: &[&str]) -> IgnoreFilter {
        let mut filter = IgnoreFilter::new();
        for &pattern in patterns {
            filter.add_rule(pattern);
        }
        filter
    }

    /// Checks `is_ignored` against each `(path, is_dir, expected)` case.
    fn check(filter: &IgnoreFilter, cases: &[(&str, bool, bool)]) {
        for &(path, is_dir, expected) in cases {
            assert_eq!(
                filter.is_ignored(Path::new(path), is_dir),
                expected,
                "path={path:?} is_dir={is_dir}"
            );
        }
    }

    #[test]
    fn test_simple_patterns() {
        let filter = filter_of(&["*.log", "temp/"]);
        check(
            &filter,
            &[
                ("app.log", false, true),
                ("debug.log", false, true),
                ("app.txt", false, false),
                ("temp", true, true),
                ("temp", false, false), // dir-only
            ],
        );
    }

    #[test]
    fn test_negation() {
        let filter = filter_of(&["*.log", "!important.log"]);
        check(
            &filter,
            &[("debug.log", false, true), ("important.log", false, false)],
        );
    }

    #[test]
    fn test_anchored_patterns() {
        let filter = filter_of(&["/root.txt", "anywhere.txt"]);
        check(
            &filter,
            &[
                ("root.txt", false, true),
                ("sub/root.txt", false, false),
                ("anywhere.txt", false, true),
                ("sub/anywhere.txt", false, true),
            ],
        );
    }

    #[test]
    fn test_directory_patterns() {
        let filter = filter_of(&["build/"]);
        check(
            &filter,
            &[
                ("build", true, true),
                ("build", false, false), // file named "build"
            ],
        );
    }

    #[test]
    fn test_globstar() {
        let filter = filter_of(&["**/*.log"]);
        check(
            &filter,
            &[
                ("app.log", false, true),
                ("logs/app.log", false, true),
                ("var/logs/app.log", false, true),
            ],
        );
    }

    #[test]
    fn test_defaults() {
        let filter = IgnoreFilter::with_defaults();
        for name in [".git", "node_modules", "target", "__pycache__"].iter() {
            assert!(filter.is_ignored(Path::new(name), true), "name={name:?}");
        }
    }

    #[test]
    fn test_comments_and_empty() {
        let filter = filter_of(&["# comment", "", "  ", "valid.txt"]);

        // Only the real pattern produces a rule.
        assert_eq!(filter.rules.len(), 1);
        assert!(filter.is_ignored(Path::new("valid.txt"), false));
    }

    #[test]
    fn test_path_patterns() {
        let filter = filter_of(&["logs/*.log"]);
        check(
            &filter,
            &[
                ("logs/app.log", false, true),
                ("other/app.log", false, false),
                ("app.log", false, false),
            ],
        );
    }

    mod async_tests {
        use super::*;
        use crate::{WalkerDirEntry, WalkerError, WalkerFs};
        use std::collections::HashMap;
        use std::path::PathBuf;

        /// Path under which the in-memory gitignore is stored.
        const GITIGNORE: &str = "/.gitignore";

        /// Placeholder directory entry; never actually produced by `list_dir`.
        struct MemEntry;
        impl WalkerDirEntry for MemEntry {
            fn name(&self) -> &str {
                ""
            }
            fn is_dir(&self) -> bool {
                false
            }
            fn is_file(&self) -> bool {
                true
            }
            fn is_symlink(&self) -> bool {
                false
            }
        }

        /// Minimal in-memory FS for testing gitignore loading.
        struct SingleFileFs(HashMap<PathBuf, Vec<u8>>);

        #[async_trait::async_trait]
        impl WalkerFs for SingleFileFs {
            type DirEntry = MemEntry;

            async fn list_dir(&self, _: &Path) -> Result<Vec<MemEntry>, WalkerError> {
                Ok(Vec::new())
            }

            async fn read_file(&self, path: &Path) -> Result<Vec<u8>, WalkerError> {
                match self.0.get(path) {
                    Some(bytes) => Ok(bytes.clone()),
                    None => Err(WalkerError::NotFound(path.display().to_string())),
                }
            }

            async fn is_dir(&self, _: &Path) -> bool {
                false
            }

            async fn exists(&self, path: &Path) -> bool {
                self.0.contains_key(path)
            }
        }

        /// Returns an FS whose only file is `GITIGNORE` with the given content.
        fn gitignore_fs(content: Vec<u8>) -> SingleFileFs {
            let mut files = HashMap::new();
            files.insert(PathBuf::from(GITIGNORE), content);
            SingleFileFs(files)
        }

        #[tokio::test]
        async fn test_oversized_gitignore_rejected() {
            let fs = gitignore_fs(vec![b'#'; super::MAX_GITIGNORE_SIZE + 1]);

            let result = IgnoreFilter::from_gitignore(Path::new(GITIGNORE), &fs).await;
            assert!(result.is_err());
            let err = result.unwrap_err().to_string();
            assert!(err.contains("too large"), "expected 'too large' in: {err}");
        }

        #[tokio::test]
        async fn test_normal_gitignore_accepted() {
            let fs = gitignore_fs(b"*.log\n# comment\ntarget/\n".to_vec());

            let filter = IgnoreFilter::from_gitignore(Path::new(GITIGNORE), &fs)
                .await
                .unwrap();
            assert!(filter.is_ignored(Path::new("app.log"), false));
            assert!(filter.is_ignored(Path::new("target"), true));
        }
    }
}