ignored 0.0.6

A Rust implementation of the `.gitignore` file format for quickly checking whether a path is ignored by Git, without invoking the `git` CLI.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
use std::{
    fmt::{self, Debug, Display},
    path::{MAIN_SEPARATOR, Path},
};

use crate::{
    evaluator::{self, Result},
    lexer::{self, Token, TokenStream},
};

/// An individual glob pattern read from a `.gitignore` file.
///
/// The glob pattern is represented internally as a regular expression, which is used to perform the actual
/// matching against file paths.
///
/// The compiled expression _does not_ handle negating the match itself (i.e. glob patterns with a leading
/// "!" character).
///
/// Instead, the `is_negated` field is used to indicate whether the pattern is
/// negated or not.
///
/// ## Empty Globs
///
/// Glob patterns can be empty, which is understood to mean "does not match anything".
///
/// This can occur in two cases:
/// 1. When the pattern is an empty string.
/// 2. When the pattern is a comment (i.e. starts with a "#" character).
#[derive(Debug)]
pub struct Glob {
    /// Compiled, byte-oriented regular expression used for matching; `None` for an empty glob.
    regex: Option<regex::bytes::Regex>,
    /// Text of the pattern this glob was built from; `None` for an empty glob.
    pattern: Option<String>,
    /// Whether the pattern is negated; always `false` for an empty glob.
    is_negated: bool,
}

/// Renders the glob as the pattern text it was built from, or the literal text `None` when the
/// glob is empty and therefore has no pattern.
impl Display for Glob {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Empty globs carry no pattern, so fall back to a fixed placeholder.
        let text = self.pattern.as_deref().unwrap_or("None");

        write!(f, "{text}")
    }
}

impl Glob {
    /// Create a new glob pattern from a compiled regular expression and the original pattern string.
    ///
    /// The `is_negated` flag indicates whether the pattern is negated (i.e. whether it starts with
    /// a "!" character). A negated pattern causes [`Glob::is_ignored`] to return the opposite of
    /// the match result (i.e. if the pattern is negated and it matches, then `is_ignored` returns
    /// `Some(false)`).
    pub(crate) fn new(regex: regex::bytes::Regex, pattern: &str, is_negated: bool) -> Self {
        Self {
            regex: Some(regex),
            pattern: Some(pattern.into()),
            is_negated,
        }
    }

    /// Create an empty glob pattern, which is understood to mean "does not match anything".
    pub(crate) const fn empty() -> Self {
        Self {
            regex: None,
            pattern: None,
            is_negated: false,
        }
    }

    /// Check if the glob pattern is empty, which is understood to mean "does not match anything".
    ///
    /// This can occur in two cases:
    /// 1. When the pattern is an empty string.
    /// 2. When the pattern is a comment (i.e. starts with a "#" character).
    #[must_use]
    pub const fn is_empty(&self) -> bool {
        self.regex.is_none()
    }

    /// Check if the glob pattern matches the given path.
    ///
    /// Returns `None` if the pattern is empty ([`Glob::is_empty`]), or if the pattern
    /// does not match the path given.
    ///
    /// Otherwise, returns `Some(true)` if the pattern matches and the path **is** ignored, or `Some(false)`
    /// if the pattern matches the path, but is negated (i.e. **is not** ignored).
    ///
    /// Matching is performed on the raw encoded bytes of the path, so paths that are not valid
    /// UTF-8 are still compared against the pattern rather than being treated as an empty path.
    #[must_use]
    pub fn is_ignored(&self, path: impl AsRef<Path>) -> Option<bool> {
        let regex = self.regex.as_ref()?;
        let path = path.as_ref();

        // The compiled expression is byte-oriented, so hand it the path's bytes directly. For
        // valid UTF-8 paths these are exactly the UTF-8 bytes; going through `to_str()` instead
        // would collapse every non-UTF-8 path into the empty string.
        let matched = regex.is_match(path.as_os_str().as_encoded_bytes());

        if !matched {
            log::trace!(
                "{} did not match {:?} (via regular expression: {regex})",
                path.display(),
                &self.pattern.as_ref(),
            );

            return None;
        }

        log::debug!(
            "{} matched {:?} (via regular expression: {regex}). Is ignored: {}",
            path.display(),
            self.pattern.as_ref(),
            !self.is_negated
        );

        Some(!self.is_negated)
    }
}

/// Convert a stream of tokens into a `Glob` that can be used to match file paths against
/// the original pattern.
///
/// Under the hood this relies on regular expressions: the token stream is translated into the text of a
/// regular expression, which is then compiled into a `regex::bytes::Regex` that performs the actual
/// matching against file paths.
///
/// An empty stream, or a stream consisting of a single comment token, produces an empty glob.
///
/// # Errors
///
/// - `evaluator::Error::InvalidPattern` when a comment or negation token is encountered after the
///   start of the stream.
/// - `evaluator::Error::InvalidRegex` when the translated expression cannot be compiled (which
///   includes exceeding the configured size limit).
impl TryFrom<TokenStream> for Glob {
    type Error = evaluator::Error;

    fn try_from(value: TokenStream) -> Result<Self> {
        if value.is_empty() {
            // No tokens in the stream, which means there's nothing to match
            return Ok(Self::empty());
        }

        if value.len() == 1 && matches!(value.first(), Some(Token::Comment(_))) {
            // A line starting with # serves as a comment.
            return Ok(Self::empty());
        }

        let mut regex = String::new();

        let mut tokens = value.iter().peekable();

        // A leading "!" is recorded on the glob rather than encoded in the regular expression.
        let is_negated = tokens.next_if(|token| *token == &Token::Negation).is_some();

        let mut is_relative_to_root = false;
        let mut is_directory_only = false;

        while let Some(token) = tokens.next() {
            if *token == Token::DirectorySeparator && tokens.peek().is_some() {
                // If there is a separator at the beginning or middle (or both) of the pattern, then
                // the pattern is relative to the directory level of the particular `.gitignore` file itself.
                is_relative_to_root = true;

                if regex.is_empty() {
                    // If there's a separator at the beginning we should not include it as a literal
                    // in the regex, as the input file path will always have any leading
                    // separators stripped off before matching.
                    continue;
                }
            } else if token == &Token::DirectorySeparator && tokens.peek().is_none() {
                // If there is a separator at the end of the pattern then the
                // pattern will only match directories, otherwise the pattern can
                // match both files and directories.
                is_directory_only = true;
            }

            match token {
                Token::ExplicitLiteral(_) => regex.push_str(regex::escape(token.as_str()).as_str()),
                Token::ImplicitLiteral(_) => {
                    let literal = if tokens.peek().is_none() {
                        token
                            .as_str()
                            // Trailing spaces are ignored unless they are quoted with backslash ("\"), in
                            // which case they'd be an explicit literal.
                            .trim_end()
                    } else {
                        token.as_str()
                    };

                    // Implicit literal might still contain regular expression meta characters
                    // (namely `.`). These need to be escaped so the compiled regular expression
                    // text behaves like a literal.
                    regex.push_str(regex::escape(literal).as_str());
                }
                Token::Range(_) => {
                    // The range notation, e.g. [a-zA-Z], can be used to match one of the characters in a
                    // range.
                    regex.push('[');
                    regex.push_str(token.as_str());
                    regex.push(']');
                }
                Token::Comment(_) | Token::Negation => {
                    // Negations and comments should only be present at the start of the pattern,
                    // so if we encounter them here then the pattern is invalid (likely because of
                    // invalid parsing behavior in the crate).
                    return Err(evaluator::Error::InvalidPattern {
                        pattern: value.into(),
                        source: None,
                    });
                }
                Token::Asterisk => {
                    // An asterisk "*" matches anything except a slash.
                    //
                    // NOTE(review): this requires at least one character, whereas a shell-style "*"
                    // usually also matches an empty run (e.g. "foo*" against "foo") - confirm this
                    // is intended.
                    regex.push_str(r"[^\\/]+");
                }
                Token::DoubleAsterisk => {
                    // Whether the "**" opens the pattern has to be decided before anything is
                    // appended for it, because the expression is never empty afterwards.
                    let is_leading = regex.is_empty();

                    if is_leading
                        && tokens
                            .next_if(|next| *next == &Token::DirectorySeparator)
                            .is_some()
                    {
                        if tokens.peek().is_none() {
                            // The pattern is just "**/": the separator that was consumed above is
                            // also the trailing separator, so the pattern only matches directories.
                            is_directory_only = true;
                            regex.push_str(r".*[\\/]");
                        } else {
                            is_relative_to_root = true;

                            // A leading "**" followed by a slash means match in all directories. For example, "**/foo" matches file or directory "foo"
                            // anywhere, the same as pattern "foo". "**/foo/bar" matches file or directory "bar" anywhere that is directly under directory "foo".
                            //
                            // The directory prefix is optional so that a match directly at the
                            // `.gitignore` level (no leading directories at all) is included, and
                            // either separator is accepted, as for every other separator.
                            regex.push_str(r"(?:.*[\\/])?");
                        }
                    } else {
                        regex.push_str(r".*");
                    }
                }
                Token::DirectorySeparator
                    if tokens
                        .next_if(|next| *next == &Token::DoubleAsterisk)
                        .is_some() =>
                {
                    // A trailing "/**" matches everything inside. For example, "abc/**" matches all files inside directory "abc", relative to
                    // the location of the `.gitignore` file, with infinite depth.
                    if tokens.peek().is_none() {
                        regex.push_str(r"[\\/].*");
                        break;
                    }

                    // A slash followed by two consecutive asterisks then a slash matches zero or more directories. For example, "a/**/b" matches
                    // "a/b", "a/x/b", "a/x/y/b" and so on.
                    if tokens
                        .next_if(|next| *next == &Token::DirectorySeparator)
                        .is_some()
                    {
                        regex.push_str(r"[\\/]([^\\/]+[\\/])*");
                    } else {
                        // "/**" followed by anything else (e.g. "a/**b") has no special meaning.
                        // Both tokens were already consumed, so emit exactly what translating the
                        // separator and the double asterisk one at a time would have produced,
                        // rather than silently dropping them from the expression.
                        regex.push_str(r"[\\/].*");
                    }
                }
                Token::DirectorySeparator => {
                    // Directory separators differ between operating systems, so we push a regular
                    // expression match here for either separator (matching gits behaviour), as
                    // opposed to pushing the raw string (which is the operating system specific
                    // separator).
                    regex.push_str(r"[\\/]");
                }
                Token::QuestionMark => {
                    // The character "?" matches any one character except "/"
                    regex.push_str(r"[^\\/]");
                }
            }
        }

        if is_relative_to_root {
            // If there is a separator at the beginning or middle (or both) of the pattern, then
            // the pattern is relative to the directory level of the particular `.gitignore` file itself.
            regex.insert(0, '^');
        } else {
            // When there is no separators, the pattern may match at any level (i.e.
            // directory or filename) below the `.gitignore` level.
            regex.insert_str(0, r"(?:^|[\\/])");
        }

        if !is_directory_only {
            // If there is a separator at the end of the pattern then the pattern will
            // only match directories, otherwise the pattern can match both files and
            // directories.
            //
            // For example, a pattern `doc/frotz/` matches `doc/frotz` directory, but not
            // `a/doc/frotz` directory; however `frotz/` matches `frotz` and `a/frotz` that
            // is a directory (all paths are relative from the `.gitignore` file).
            regex.push_str(r"(?:$|[\\/])");
        }

        let regex = regex::bytes::RegexBuilder::new(regex.as_str())
            // Purposely attempt to avoid `.gitignore` glob patterns from translating
            // into very resource intensive regular expressions.
            //
            // Largely this is an arbitrary number, so can be increased. But it should
            // remain small enough to prevent any ReDoS attacks.
            .size_limit(20_000)
            //  The git man page does not call this out directly, however it can be seen that
            //  git matches on _bytes_ not characters (where non-ASCII chars, like "é", are a single
            //  _character_ but multiple _bytes_).
            //
            //  Getting this behaviour correct is particularly important for the `?` glob pattern.
            //
            //  This is because `?` is supposed to "match any one character except "/"" (where
            //  "character" actually means byte), and whether it's bytes or characters fundamentally
            //  changes the matching behaviour of `?` on non-ASCII strings.
            //
            //  Example:
            //
            //  When matching on characters, "file?.txt" matches "fileé.txt" because the two byte "é" is
            //  treated as a single character.
            //
            //  However, because of its multi-byte nature, when matching on bytes "file?.txt" DOES NOT
            //  match "fileé.txt" because "é" takes two bytes to produce the one character.
            //
            //  In order to match "fileé.txt" when Unicode is turned off the pattern would need to
            //  be "file??.txt", so that it can match bytes.
            .unicode(false)
            .build()
            .map_err(|e| evaluator::Error::InvalidRegex {
                pattern: value.clone().into(),
                regex: regex.as_str().into(),
                source: e,
            })?;

        // The token stream is no longer needed, so it is turned back into the pattern text once
        // and reused for both the log line and the glob itself.
        let pattern = String::from(value);

        log::trace!(
            "Converted pattern: {:?} into regex: {:?} (with negation: {})",
            pattern,
            regex,
            is_negated
        );

        Ok(Self::new(regex, pattern.as_str(), is_negated))
    }
}

/// Render a token stream back into pattern text.
///
/// The intent is a byte-for-byte round trip: converting the tokens produced from a pattern back into
/// a string should yield exactly the pattern that was originally analysed. To achieve that, the
/// delimiters that are not part of a token's own text are re-attached around it.
impl From<TokenStream> for String {
    fn from(value: TokenStream) -> Self {
        let mut rendered = Self::new();

        for token in value.iter() {
            // Work out which delimiters surround this token's text in pattern form: a backslash
            // before explicit literals, square brackets around ranges, and a "#" before comments.
            // Every other token is emitted as its text alone.
            let (prefix, suffix) = match token {
                Token::ExplicitLiteral(_) => ("\\", ""),
                Token::Range(_) => ("[", "]"),
                Token::Comment(_) => ("#", ""),
                _ => ("", ""),
            };

            rendered.push_str(prefix);
            rendered.push_str(token.as_str());
            rendered.push_str(suffix);
        }

        rendered
    }
}

impl TryFrom<&str> for Glob {
    type Error = evaluator::Error;

    fn try_from(value: &str) -> evaluator::Result<Self> {
        if value.is_empty() {
            return Ok(Self::empty());
        }

        let tokens = lexer::analyse(value).map_err(|e| evaluator::Error::InvalidPattern {
            pattern: value.into(),
            source: Some(e),
        })?;

        Self::try_from(tokens)
    }
}

// Unit tests for building `Glob` values from pattern text.
#[cfg(test)]
mod tests {
    use insta::assert_snapshot;
    use proptest::prelude::*;
    use rstest::{Context, rstest};

    use crate::utils;

    // An empty pattern and a comment line must both build successfully into an empty glob.
    #[rstest]
    #[case(r"")]
    #[case(r"# This is a comment")]
    pub fn test_empty_globs(#[case] pattern: &str) {
        let output = super::Glob::try_from(pattern)
            .expect("Should never fail to build glob from empty or comment pattern");

        assert!(output.is_empty());
    }

    // Each case pairs a pattern with whether building a glob from it is expected to succeed:
    // a trailing backslash with nothing after it is expected to fail, while a backslash followed
    // by a space is expected to succeed.
    #[rstest]
    #[case(r"foo\", false)]
    #[case(r"foo\ ", true)]
    pub fn test_valid_vs_invalid_patterns(#[case] pattern: &str, #[case] expect_valid: bool) {
        let output = super::Glob::try_from(pattern);

        assert_eq!(output.is_ok(), expect_valid);
    }

    // Snapshot test: the text of the regular expression generated for each pattern is compared
    // against a stored snapshot. Snapshots are named from the test name and the case value
    // supplied by the `rstest` context, so renaming the test or reordering the cases changes
    // which snapshot each case is compared against.
    #[rstest]
    #[case(r"build/")]
    #[case(r"tmp/")]
    #[case(r"vendor/")]
    #[case(r"!vendor/keep.me")]
    #[case(r"*.tmp")]
    #[case(r"*.log")]
    #[case(r"**/globfoo.txt")]
    #[case(r"globdir/**")]
    #[case(r"a/**/globbar.txt")]
    #[case(r"/anchored.txt")]
    #[case(r"dironly/")]
    #[case(r"literal/file\*.txt")]
    #[case(r"literal/file\?.txt")]
    #[case(r"literal/file\[abc\].txt")]
    #[case(r"precedence.log")]
    #[case(r"!important.log")]
    #[case(r"pruned/")]
    #[case(r"!pruned/deep/keep.txt")]
    #[case(r"double_negation/important.tmp")]
    #[case(r"foo")]
    #[case(r"file?.txt")]
    #[case(r"file[abc].log")]
    #[case(r"file[0-9].txt")]
    pub fn test_glob_regexes_match_snapshot(#[context] ctx: Context, #[case] pattern: &str) {
        let output =
            super::Glob::try_from(pattern).expect("Should never fail to build glob pattern");

        assert_snapshot!(
            format!(
                "{}_{}",
                ctx.name,
                ctx.case.expect("to provide description for test case")
            ),
            // An empty glob has no regular expression, in which case an empty string is recorded.
            output.regex.map(|r| r.to_string()).unwrap_or_default()
        );
    }

    proptest! {
        // Property test: every pattern produced by the crate's fuzzing strategy must build into
        // a glob without returning an error.
        #[test]
        fn test_building_never_panics(
            pattern in utils::get_gitignore_pattern_fuzzing_strategy()
        ) {
            let output = super::Glob::try_from(pattern.as_str());

            prop_assert!(output.is_ok(), "Failed to build glob from pattern: {:?}", pattern);
        }
    }
}