Skip to main content

mkit_core/
ignore.rs

1//! `.mkitignore` / `.gitignore` glob patterns.
2//!
3//! Matching is **path-relative** (git-faithful), not basename-only: callers
4//! pass a path relative to the repository root and the matcher honors the
5//! gitignore anchoring rules below.
6//!
7//! Grammar (the implemented v1 subset of `gitignore`):
8//! - One pattern per line; blank lines and `#`-prefixed lines are skipped
9//!   (`\#` escapes a leading `#` to a literal).
10//! - A leading `!` negates (last match wins, gitignore semantics); `\!`
11//!   escapes a leading `!` to a literal.
12//! - A trailing `/` makes the pattern match directories only.
13//! - **Anchoring:** a pattern containing a `/` anywhere other than a trailing
14//!   one is *anchored* to the repo root (a leading `/` is one way; `foo/bar`
15//!   is another). A pattern with no such `/` matches at **any depth** (as if
16//!   prefixed with `**/`), so `*.log` matches `a/b/c.log`.
17//! - `*` matches any run of non-`/`; `?` matches a single non-`/`;
18//!   `[abc]` / `[a-z]` / `[!abc]` are character classes (single non-`/`).
19//!   `\` escapes the next character to a literal.
20//! - `**` as a whole path segment crosses `/`: leading `**/` and middle
21//!   `/**/` match zero or more directories; a trailing `/**` matches one or
22//!   more (everything *inside* a directory). A `**` not isolated by slashes
23//!   behaves like a single `*`.
24//! - Trailing unescaped spaces are trimmed.
25//!
26//! `.mkit` and `.git` are *always* ignored (matched on the path's basename,
27//! ASCII case-insensitively, so `.MKIT` can't bypass on case-insensitive
28//! filesystems — the Git CVE-2021-21300 family).
29//!
30//! Both `.gitignore` and `.mkitignore` are read from the repository root;
31//! `.gitignore` patterns are applied first and `.mkitignore` last, so a
32//! repo's own `.mkitignore` takes precedence under last-match-wins.
33//!
34//! **Deferred (documented non-goals for this pass):** nested per-directory
35//! ignore files (only the root is read), global excludes
36//! (`core.excludesFile`), and escaped trailing spaces.
37
38use std::fs;
39use std::io;
40use std::path::Path;
41
/// Largest ignore file, in bytes, that [`load`] will accept (1 MiB).
pub const MAX_IGNORE_FILE_BYTES: u64 = 1 << 20;
44
45/// Errors returned by [`load`].
46#[derive(Debug, thiserror::Error)]
47pub enum IgnoreError {
48    /// An ignore file exceeded [`MAX_IGNORE_FILE_BYTES`].
49    #[error("ignore file too large (>{MAX_IGNORE_FILE_BYTES} bytes)")]
50    FileTooLarge,
51    /// Underlying I/O failure.
52    #[error(transparent)]
53    Io(#[from] io::Error),
54}
55
/// A single `/`-delimited component of a compiled pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Segment {
    /// A bare `**` component: the only segment kind allowed to span `/`
    /// boundaries (i.e. to stand for several path components).
    DoubleStar,
    /// Any other component: a glob made of `*`, `?`, `[...]` and literal
    /// text that is matched against exactly one path component.
    Glob(String),
}
64
65/// A single ignore pattern with its modifiers.
66#[derive(Debug, Clone, PartialEq, Eq)]
67pub struct Pattern {
68    /// Cleaned glob body, with `!`, the leading `/`, and the trailing `/`
69    /// already stripped (e.g. `*.log`, `build`, `src/gen`).
70    pub pattern: String,
71    /// `true` if the pattern was prefixed with `!` (un-ignore).
72    pub negated: bool,
73    /// `true` if the pattern ended with `/` (directory-only).
74    pub dir_only: bool,
75    /// `true` if the pattern is anchored to the repo root (contained a
76    /// non-trailing `/`); otherwise it matches at any depth.
77    pub anchored: bool,
78    /// Effective match segments. Non-anchored patterns carry a leading
79    /// [`Segment::DoubleStar`] so they match at any depth.
80    segments: Vec<Segment>,
81}
82
83impl Pattern {
84    /// Match this pattern against a slice of path segments.
85    fn matches(&self, path: &[&str]) -> bool {
86        match_segments(&self.segments, path)
87    }
88}
89
90/// Parsed ignore-list (the merged `.gitignore` + `.mkitignore` patterns).
91#[derive(Debug, Default, Clone)]
92pub struct IgnoreList {
93    patterns: Vec<Pattern>,
94}
95
96impl IgnoreList {
97    /// Construct an empty list (matches nothing user-defined; the
98    /// hard-coded `.mkit` / `.git` ignores still apply).
99    #[must_use]
100    pub const fn new() -> Self {
101        Self {
102            patterns: Vec::new(),
103        }
104    }
105
106    /// Borrow the parsed patterns.
107    #[must_use]
108    pub fn patterns(&self) -> &[Pattern] {
109        &self.patterns
110    }
111
112    /// Returns `true` if `rel_path` (relative to the repo root, `/`-separated)
113    /// should be ignored. `is_dir` controls whether directory-only patterns
114    /// apply.
115    ///
116    /// Callers walk top-down and skip ignored directories without descending,
117    /// which is what gives directory-only patterns (`build/`) their
118    /// "everything inside is ignored too" behavior — the contents are simply
119    /// never visited.
120    #[must_use]
121    pub fn is_ignored(&self, rel_path: &str, is_dir: bool) -> bool {
122        let trimmed = rel_path.trim_matches('/');
123        // Hard-coded ignores key off the basename at any depth.
124        let base = trimmed.rsplit('/').next().unwrap_or(trimmed);
125        if base.eq_ignore_ascii_case(".mkit") || base.eq_ignore_ascii_case(".git") {
126            return true;
127        }
128        let path: Vec<&str> = trimmed.split('/').filter(|s| !s.is_empty()).collect();
129        if path.is_empty() {
130            return false;
131        }
132        // Walk patterns in order; last match wins.
133        let mut ignored = false;
134        for p in &self.patterns {
135            if p.dir_only && !is_dir {
136                continue;
137            }
138            if p.matches(&path) {
139                ignored = !p.negated;
140            }
141        }
142        ignored
143    }
144
145    /// Like [`is_ignored`](Self::is_ignored), but also returns `true` when any
146    /// **ancestor directory** of `rel_path` is ignored — git treats
147    /// everything under an excluded directory as excluded.
148    ///
149    /// Use this for one-shot path tests (an explicit `add <path>`, the restore
150    /// safety gate) where there is no top-down walk to carry the
151    /// ancestor-ignored bit. Walkers that descend top-down should instead
152    /// thread their own ancestor flag (cheaper) and skip ignored directories.
153    #[must_use]
154    pub fn is_ignored_with_ancestors(&self, rel_path: &str, is_dir: bool) -> bool {
155        let trimmed = rel_path.trim_matches('/');
156        if trimmed.is_empty() {
157            return false;
158        }
159        let segs: Vec<&str> = trimmed.split('/').filter(|s| !s.is_empty()).collect();
160        // Each strict ancestor is a directory; if one is ignored, so is this.
161        for i in 1..segs.len() {
162            if self.is_ignored(&segs[..i].join("/"), true) {
163                return true;
164            }
165        }
166        self.is_ignored(trimmed, is_dir)
167    }
168}
169
170/// Load and merge `.gitignore` then `.mkitignore` from `dir`. Returns an
171/// empty list if neither file is present.
172///
173/// `.gitignore` patterns come first so a repo's own `.mkitignore` wins under
174/// last-match-wins.
175///
176/// # Errors
177/// - [`IgnoreError::FileTooLarge`] if either file exceeds 1 MiB.
178/// - [`IgnoreError::Io`] for other filesystem failures.
179pub fn load(dir: &Path) -> Result<IgnoreList, IgnoreError> {
180    let mut patterns = Vec::new();
181    for name in [".gitignore", ".mkitignore"] {
182        if let Some(list) = load_one(&dir.join(name))? {
183            patterns.extend(list.patterns);
184        }
185    }
186    Ok(IgnoreList { patterns })
187}
188
189/// Read and parse a single ignore file, or `None` if it is absent.
190fn load_one(path: &Path) -> Result<Option<IgnoreList>, IgnoreError> {
191    let meta = match fs::metadata(path) {
192        Ok(m) => m,
193        Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(None),
194        Err(e) => return Err(IgnoreError::Io(e)),
195    };
196    if meta.len() > MAX_IGNORE_FILE_BYTES {
197        return Err(IgnoreError::FileTooLarge);
198    }
199    Ok(Some(parse(&fs::read_to_string(path)?)))
200}
201
202/// Parse ignore-file content into a list of patterns. Never fails:
203/// malformed-looking lines are silently skipped.
204#[must_use]
205pub fn parse(content: &str) -> IgnoreList {
206    let mut patterns = Vec::new();
207    for raw in content.split('\n') {
208        // Strip a single trailing `\r` for Windows-style line endings.
209        let line = raw.strip_suffix('\r').unwrap_or(raw);
210        if let Some(p) = parse_line(line) {
211            patterns.push(p);
212        }
213    }
214    IgnoreList { patterns }
215}
216
217/// Parse one cleaned line into a [`Pattern`], or `None` for blanks/comments.
218fn parse_line(line: &str) -> Option<Pattern> {
219    if line.is_empty() || line.starts_with('#') {
220        // Blank, or a leading unescaped `#` comment.
221        return None;
222    }
223    // Trim trailing unescaped spaces (escaped trailing spaces are deferred).
224    let line = line.trim_end_matches(' ');
225    if line.is_empty() {
226        return None;
227    }
228    // Negation (`!`) and leading-escape (`\!`, `\#`) handling.
229    let (negated, body) = if let Some(rest) = line.strip_prefix('!') {
230        (true, rest.to_string())
231    } else if let Some(rest) = line.strip_prefix("\\!") {
232        (false, format!("!{rest}"))
233    } else if let Some(rest) = line.strip_prefix("\\#") {
234        (false, format!("#{rest}"))
235    } else {
236        (false, line.to_string())
237    };
238    finish_pattern(&body, negated)
239}
240
241/// Build a [`Pattern`] from a (possibly negated) body after `#`/`!` handling.
242fn finish_pattern(body: &str, negated: bool) -> Option<Pattern> {
243    // Trailing `/` ⇒ directory-only.
244    let (dir_only, body) = match body.strip_suffix('/') {
245        Some(rest) => (true, rest),
246        None => (false, body),
247    };
248    if body.is_empty() {
249        return None;
250    }
251
252    // Anchored if there is a `/` anywhere (after the trailing one was
253    // stripped). A leading `/` is stripped; it only signals anchoring.
254    let anchored = body.contains('/');
255    let core = body.strip_prefix('/').unwrap_or(body);
256    if core.is_empty() {
257        return None;
258    }
259
260    let mut segments: Vec<Segment> = core
261        .split('/')
262        .filter(|s| !s.is_empty())
263        .map(|s| {
264            if s == "**" {
265                Segment::DoubleStar
266            } else {
267                Segment::Glob(s.to_string())
268            }
269        })
270        .collect();
271    if segments.is_empty() {
272        return None;
273    }
274    // A non-anchored pattern matches at any depth (as if prefixed `**/`).
275    if !anchored {
276        segments.insert(0, Segment::DoubleStar);
277    }
278
279    Some(Pattern {
280        pattern: core.to_string(),
281        negated,
282        dir_only,
283        anchored,
284        segments,
285    })
286}
287
288/// Match a segment list against a path-segment slice (entire-path match).
289fn match_segments(pat: &[Segment], path: &[&str]) -> bool {
290    match pat.split_first() {
291        None => path.is_empty(),
292        Some((Segment::DoubleStar, rest)) => {
293            if rest.is_empty() {
294                // A trailing `**` matches one or more remaining segments
295                // (git's "everything inside" — not the directory itself).
296                return !path.is_empty();
297            }
298            // Match zero or more leading segments, then the rest.
299            (0..=path.len()).any(|k| match_segments(rest, &path[k..]))
300        }
301        Some((Segment::Glob(g), rest)) => match path.split_first() {
302            Some((first, tail)) => {
303                segment_match(g.as_bytes(), first.as_bytes()) && match_segments(rest, tail)
304            }
305            None => false,
306        },
307    }
308}
309
310/// Match a single path segment against a single-level glob (`*`/`?`/`[...]`,
311/// `\` escapes, literals). Never crosses `/` (segments contain none).
312fn segment_match(p: &[u8], s: &[u8]) -> bool {
313    match p.split_first() {
314        None => s.is_empty(),
315        Some((&b'*', mut rest)) => {
316            while rest.first() == Some(&b'*') {
317                rest = &rest[1..];
318            }
319            (0..=s.len()).any(|i| segment_match(rest, &s[i..]))
320        }
321        Some((&b'?', rest)) => !s.is_empty() && segment_match(rest, &s[1..]),
322        Some((&b'[', _)) => match match_class(p, s.first().copied()) {
323            Some((matched, plen)) => matched && segment_match(&p[plen..], &s[1..]),
324            // Malformed class (no closing `]`) ⇒ literal `[`.
325            None => s.first() == Some(&b'[') && segment_match(&p[1..], &s[1..]),
326        },
327        Some((&b'\\', rest)) => match rest.split_first() {
328            Some((&c, rest2)) => s.first() == Some(&c) && segment_match(rest2, &s[1..]),
329            None => s.first() == Some(&b'\\') && s.len() == 1,
330        },
331        Some((&c, rest)) => s.first() == Some(&c) && segment_match(rest, &s[1..]),
332    }
333}
334
/// Evaluate the `[...]` character class at the front of `p` against `ch`.
///
/// Returns `Some((matched, consumed))`, where `consumed` is the length of the
/// class in pattern bytes including both brackets, or `None` if the class is
/// never closed by a `]`.
///
/// Class syntax:
/// - A leading `!` or `^` negates the class.
/// - `]` directly after the opening bracket (or after the negation mark) is a
///   literal member rather than the terminator.
/// - `a-z` is an inclusive byte range; a `-` immediately before the closing
///   `]` is a literal `-`.
/// - `\x` is the literal byte `x`, consistent with escapes outside classes
///   and with git's matcher: `[\]]` matches `]`, and `[a\-z]` is the three
///   members `a`, `-`, `z` rather than a range. An escaped byte may also be
///   an endpoint of a range.
///
/// `ch == None` means "end of the text": the class is still parsed, so
/// `consumed` is valid, but it never matches — not even when negated.
fn match_class(p: &[u8], ch: Option<u8>) -> Option<(bool, usize)> {
    // Decode the class member that starts at `at`, resolving a `\x` escape.
    // Yields the member byte and the index just past it, or `None` when the
    // pattern ends first.
    fn member_at(p: &[u8], at: usize) -> Option<(u8, usize)> {
        match *p.get(at)? {
            b'\\' => p.get(at + 1).map(|&literal| (literal, at + 2)),
            byte => Some((byte, at + 1)),
        }
    }

    debug_assert_eq!(p.first(), Some(&b'['));
    let mut i = 1;
    let negate = matches!(p.get(i), Some(&b'!' | &b'^'));
    if negate {
        i += 1;
    }
    let start = i;
    let mut matched = false;
    while i < p.len() {
        // A `]` after the first class char closes it; as the first char it is
        // a literal member.
        if p[i] == b']' && i > start {
            // Negation inverts only on a real character (an empty `ch`, i.e.
            // end-of-segment, never matches).
            return Some((ch.is_some() && (matched ^ negate), i + 1));
        }
        // A pattern that ends in the middle of an escape is unterminated.
        let (lo, after_lo) = member_at(p, i)?;
        // Range `lo-hi`, unless the `-` is the last char before `]`.
        let is_range = p.get(after_lo) == Some(&b'-')
            && matches!(p.get(after_lo + 1), Some(&next) if next != b']');
        if is_range {
            let (hi, after_hi) = member_at(p, after_lo + 1)?;
            if matches!(ch, Some(c) if lo <= c && c <= hi) {
                matched = true;
            }
            i = after_hi;
        } else {
            if ch == Some(lo) {
                matched = true;
            }
            i = after_lo;
        }
    }
    None
}
374
375/// Match a basename `name` against a single-level glob `pattern`. Supports
376/// `*` (any run of non-`/`), `?` (one non-`/`), `[...]` classes, and `\`
377/// escapes. Retained as a small public helper; the [`IgnoreList`] matcher
378/// uses the richer path-aware engine above.
379#[must_use]
380pub fn glob_match(pattern: &str, name: &str) -> bool {
381    // Reject any name containing `/`, mirroring the single-level contract.
382    if name.contains('/') {
383        return false;
384    }
385    segment_match(pattern.as_bytes(), name.as_bytes())
386}
387
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    // --- basics: literals, globs, comments, built-in ignores -----------

    #[test]
    fn empty_patterns_match_nothing_user_defined() {
        let il = parse("");
        assert!(!il.is_ignored("anything.txt", false));
        assert!(!il.is_ignored("somedir", true));
    }

    #[test]
    fn exact_filename_match() {
        let il = parse("secret.key");
        assert!(il.is_ignored("secret.key", false));
        assert!(!il.is_ignored("other.key", false));
    }

    #[test]
    fn glob_star_pattern() {
        let il = parse("*.log");
        assert!(il.is_ignored("debug.log", false));
        assert!(!il.is_ignored("debug.txt", false));
    }

    #[test]
    fn directory_pattern_trailing_slash() {
        let il = parse("build/");
        assert!(il.is_ignored("build", true));
        assert!(!il.is_ignored("build", false));
    }

    #[test]
    fn negation_pattern() {
        let il = parse("*.log\n!important.log");
        assert!(il.is_ignored("debug.log", false));
        assert!(!il.is_ignored("important.log", false));
    }

    #[test]
    fn comment_lines_ignored() {
        let il = parse("# this is a comment\n*.tmp");
        assert_eq!(il.patterns().len(), 1);
    }

    #[test]
    fn blank_lines_ignored() {
        let il = parse("\n\n*.tmp\n\n");
        assert_eq!(il.patterns().len(), 1);
    }

    #[test]
    fn glob_question_mark() {
        let il = parse("file?.txt");
        assert!(il.is_ignored("file1.txt", false));
        assert!(!il.is_ignored("file12.txt", false));
    }

    #[test]
    fn default_ignores() {
        let il = parse("");
        assert!(il.is_ignored(".mkit", true));
        assert!(il.is_ignored(".git", true));
        assert!(il.is_ignored(".mkit", false));
        assert!(il.is_ignored(".git", false));
        // ...at any depth.
        assert!(il.is_ignored("sub/.git", true));
    }

    // --- path-relative / anchoring -------------------------------------

    #[test]
    fn non_anchored_matches_at_any_depth() {
        let il = parse("*.log");
        assert!(il.is_ignored("a/b/c.log", false));
        assert!(il.is_ignored("c.log", false));
    }

    #[test]
    fn anchored_leading_slash_matches_root_only() {
        let il = parse("/foo.txt");
        assert!(il.is_ignored("foo.txt", false));
        assert!(!il.is_ignored("sub/foo.txt", false));
    }

    #[test]
    fn anchored_multi_segment() {
        let il = parse("src/gen");
        assert!(il.is_ignored("src/gen", true));
        assert!(!il.is_ignored("other/src/gen", true));
        assert!(!il.is_ignored("gen", true));
    }

    #[test]
    fn dir_only_matches_dir_at_any_depth() {
        let il = parse("build/");
        assert!(il.is_ignored("a/b/build", true));
        assert!(!il.is_ignored("a/b/build", false));
    }

    // --- `**` ----------------------------------------------------------

    #[test]
    fn leading_double_star() {
        let il = parse("**/foo");
        assert!(il.is_ignored("foo", false));
        assert!(il.is_ignored("a/foo", false));
        assert!(il.is_ignored("a/b/foo", false));
        assert!(!il.is_ignored("a/foobar", false));
    }

    #[test]
    fn middle_double_star() {
        let il = parse("a/**/b");
        assert!(il.is_ignored("a/b", false));
        assert!(il.is_ignored("a/x/b", false));
        assert!(il.is_ignored("a/x/y/b", false));
        assert!(!il.is_ignored("a/b/c", false));
    }

    #[test]
    fn trailing_double_star_matches_inside_not_self() {
        let il = parse("abc/**");
        assert!(il.is_ignored("abc/x", false));
        assert!(il.is_ignored("abc/x/y", false));
        // `abc` itself is not matched by a trailing `/**`.
        assert!(!il.is_ignored("abc", true));
    }

    // --- char classes / escapes ----------------------------------------

    #[test]
    fn char_class_range_and_negation() {
        let il = parse("file[0-9].txt");
        assert!(il.is_ignored("file3.txt", false));
        assert!(!il.is_ignored("filex.txt", false));
        let neg = parse("file[!0-9].txt");
        assert!(neg.is_ignored("filex.txt", false));
        assert!(!neg.is_ignored("file3.txt", false));
    }

    #[test]
    fn escaped_hash_and_bang_are_literal() {
        let il = parse("\\#notacomment\n\\!notnegated");
        assert_eq!(il.patterns().len(), 2);
        assert!(il.is_ignored("#notacomment", false));
        assert!(il.is_ignored("!notnegated", false));
    }

    #[test]
    fn trailing_spaces_trimmed() {
        let il = parse("foo.txt   ");
        assert!(il.is_ignored("foo.txt", false));
    }

    // --- negation across anchored / nested -----------------------------

    #[test]
    fn negation_reincludes_specific_file() {
        let il = parse("*.log\n!keep/important.log");
        assert!(il.is_ignored("a/debug.log", false));
        assert!(!il.is_ignored("keep/important.log", false));
    }

    // --- parsed pattern text (comments, line endings) ------------------

    // Overlaps `comment_lines_ignored`, and additionally pins the stored
    // pattern text of the surviving line.
    #[test]
    fn comment_lines_count() {
        let il = parse("# this is a comment\n*.tmp");
        assert_eq!(il.patterns().len(), 1);
        assert_eq!(il.patterns()[0].pattern, "*.tmp");
    }

    #[test]
    fn windows_line_endings_stripped() {
        let il = parse("*.log\r\n*.tmp\r\n");
        assert_eq!(il.patterns().len(), 2);
        assert_eq!(il.patterns()[0].pattern, "*.log");
        assert_eq!(il.patterns()[1].pattern, "*.tmp");
    }

    // --- dual-file load / precedence -----------------------------------

    #[test]
    fn load_missing_file_returns_empty() {
        let dir = TempDir::new().unwrap();
        let il = load(dir.path()).unwrap();
        assert!(il.patterns().is_empty());
    }

    #[test]
    fn load_with_mkitignore() {
        let dir = TempDir::new().unwrap();
        std::fs::write(dir.path().join(".mkitignore"), "*.log\nbuild/\n").unwrap();
        let il = load(dir.path()).unwrap();
        assert_eq!(il.patterns().len(), 2);
        assert!(il.is_ignored("test.log", false));
        assert!(il.is_ignored("build", true));
    }

    #[test]
    fn load_reads_gitignore_too() {
        let dir = TempDir::new().unwrap();
        std::fs::write(dir.path().join(".gitignore"), "*.log\n").unwrap();
        let il = load(dir.path()).unwrap();
        assert!(il.is_ignored("debug.log", false));
    }

    #[test]
    fn mkitignore_overrides_gitignore_last_match_wins() {
        let dir = TempDir::new().unwrap();
        std::fs::write(dir.path().join(".gitignore"), "*.log\n").unwrap();
        // .mkitignore is applied last, so its re-include wins.
        std::fs::write(dir.path().join(".mkitignore"), "!keep.log\n").unwrap();
        let il = load(dir.path()).unwrap();
        assert!(il.is_ignored("other.log", false));
        assert!(!il.is_ignored("keep.log", false));
    }

    #[test]
    fn load_rejects_oversize_file() {
        let dir = TempDir::new().unwrap();
        // One byte over the cap; the content (all `#`) is irrelevant because
        // the size check rejects the file before it is parsed.
        let oversized = vec![b'#'; usize::try_from(MAX_IGNORE_FILE_BYTES + 1).unwrap()];
        std::fs::write(dir.path().join(".mkitignore"), oversized).unwrap();
        let err = load(dir.path()).unwrap_err();
        assert!(matches!(err, IgnoreError::FileTooLarge));
    }

    // --- ancestor-aware matching ---------------------------------------

    #[test]
    fn is_ignored_with_ancestors_catches_files_under_ignored_dir() {
        let il = parse("node_modules/\n");
        // The directory itself and any descendant (file or dir).
        assert!(il.is_ignored_with_ancestors("node_modules", true));
        assert!(il.is_ignored_with_ancestors("node_modules/pkg/index.js", false));
        // Unrelated paths are unaffected.
        assert!(!il.is_ignored_with_ancestors("src/main.rs", false));
        // The plain matcher does NOT catch the descendant (no walk context).
        assert!(!il.is_ignored("node_modules/pkg/index.js", false));
    }

    // --- glob_match public helper --------------------------------------

    #[test]
    fn glob_match_exact() {
        assert!(glob_match("hello", "hello"));
        assert!(!glob_match("hello", "world"));
    }

    #[test]
    fn glob_match_star() {
        assert!(glob_match("*.rs", "main.rs"));
        assert!(!glob_match("*.rs", "main.txt"));
        assert!(glob_match("test*", "testing"));
        assert!(glob_match("*", "anything"));
        // Single-level: `*` never crosses `/`.
        assert!(!glob_match("*", "a/b"));
    }
}
640        assert!(glob_match("*", "anything"));
641        // Single-level: `*` never crosses `/`.
642        assert!(!glob_match("*", "a/b"));
643    }
644}