Skip to main content

uv_globfilter/
portable_glob.rs

//! Cross-language glob syntax from
//! [PEP 639](https://packaging.python.org/en/latest/specifications/glob-patterns/).
3
4use globset::{Glob, GlobBuilder};
5use owo_colors::OwoColorize;
6use thiserror::Error;
7
8#[derive(Debug, Error)]
9pub enum PortableGlobError {
10    /// Shows the failing glob in the error message.
11    #[error(transparent)]
12    GlobError(#[from] globset::Error),
13    #[error(
14        "The parent directory operator (`..`) at position {pos} is not allowed in glob: `{glob}`"
15    )]
16    ParentDirectory { glob: String, pos: usize },
17    #[error("Invalid character `{invalid}` at position {pos} in glob: `{glob}`")]
18    InvalidCharacter {
19        glob: String,
20        pos: usize,
21        invalid: char,
22    },
23    #[error(
24        "Invalid character `{invalid}` at position {pos} in glob: `{glob}`. {}{} Characters can be escaped with a backslash",
25        "hint".bold().cyan(),
26        ":".bold()
27    )]
28    InvalidCharacterUv {
29        glob: String,
30        pos: usize,
31        invalid: char,
32    },
33    #[error(
34        "Only forward slashes are allowed as path separator, invalid character at position {pos} in glob: `{glob}`"
35    )]
36    InvalidBackslash { glob: String, pos: usize },
37    #[error(
38        "Path separators can't be escaped, invalid character at position {pos} in glob: `{glob}`"
39    )]
40    InvalidEscapee { glob: String, pos: usize },
41    #[error("Invalid character `{invalid}` in range at position {pos} in glob: `{glob}`")]
42    InvalidCharacterRange {
43        glob: String,
44        pos: usize,
45        invalid: char,
46    },
47    #[error("Too many at stars at position {pos} in glob: `{glob}`")]
48    TooManyStars { glob: String, pos: usize },
49    #[error("Trailing backslash at position {pos} in glob: `{glob}`")]
50    TrailingEscape { glob: String, pos: usize },
51}
52
/// Parser for the cross-language glob syntax from
/// [PEP 639](https://packaging.python.org/en/latest/specifications/glob-patterns/).
///
/// The chosen variant selects the dialect: either exactly the PEP 639 rules, or those rules plus
/// extensions such as backslash escapes.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PortableGlobParser {
    /// Strict mode: only what PEP 639 specifies is accepted.
    Pep639,
    /// PEP 639 syntax extended with backslash escapes for otherwise disallowed characters.
    ///
    /// To stay portable across platforms, path separators cannot be escaped: neither the forward
    /// slash nor the backslash may follow an escaping backslash.
    Uv,
}
68
69impl PortableGlobParser {
70    fn backslash_escape(self) -> bool {
71        match self {
72            Self::Pep639 => false,
73            Self::Uv => true,
74        }
75    }
76
77    /// Parse cross-language glob syntax based on [PEP 639](https://packaging.python.org/en/latest/specifications/glob-patterns/):
78    ///
79    /// - Alphanumeric characters, underscores (`_`), hyphens (`-`) and dots (`.`) are matched verbatim.
80    /// - The special glob characters are:
81    ///   - `*`: Matches any number of characters except path separators
82    ///   - `?`: Matches a single character except the path separator
83    ///   - `**`: Matches any number of characters including path separators
84    ///   - `[]`, containing only the verbatim matched characters: Matches a single of the characters contained. Within
85    ///     `[...]`, the hyphen indicates a locale-agnostic range (e.g. `a-z`, order based on Unicode code points). Hyphens at
86    ///     the start or end are matched literally.
87    ///   - `\`: Disallowed in PEP 639 mode. In uv mode, it escapes the following character to be matched verbatim.
88    /// - The path separator is the forward slash character (`/`). Patterns are relative to the given directory, a leading slash
89    ///   character for absolute paths is not supported.
90    /// - Parent directory indicators (`..`) are not allowed.
91    ///
92    /// These rules mean that matching the backslash (`\`) is forbidden, which avoid collisions with the windows path separator.
93    pub fn parse(&self, glob: &str) -> Result<Glob, PortableGlobError> {
94        self.check(glob)?;
95        Ok(GlobBuilder::new(glob)
96            .literal_separator(true)
97            // No need to support Windows-style paths, so the backslash can be used a escape.
98            .backslash_escape(self.backslash_escape())
99            .build()?)
100    }
101
102    /// See [`parse_portable_glob`].
103    pub fn check(&self, glob: &str) -> Result<(), PortableGlobError> {
104        let mut chars = glob.chars().enumerate().peekable();
105        // A `..` is on a parent directory indicator at the start of the string or after a directory
106        // separator.
107        let mut start_or_slash = true;
108        // The number of consecutive stars before the current character.
109        while let Some((pos, c)) = chars.next() {
110            // `***` or `**literals` can be correctly represented with less stars. They are banned by
111            // `glob`, they are allowed by `globset` and PEP 639 is ambiguous, so we're filtering them
112            // out.
113            if c == '*' {
114                let mut star_run = 1;
115                while let Some((_, c)) = chars.peek() {
116                    if *c == '*' {
117                        star_run += 1;
118                        chars.next();
119                    } else {
120                        break;
121                    }
122                }
123                if star_run >= 3 {
124                    return Err(PortableGlobError::TooManyStars {
125                        glob: glob.to_string(),
126                        // We don't update pos for the stars.
127                        pos,
128                    });
129                } else if star_run == 2 {
130                    if chars.peek().is_some_and(|(_, c)| *c != '/') {
131                        return Err(PortableGlobError::TooManyStars {
132                            glob: glob.to_string(),
133                            // We don't update pos for the stars.
134                            pos,
135                        });
136                    }
137                }
138                start_or_slash = false;
139            } else if c.is_alphanumeric() || matches!(c, '_' | '-' | '?') {
140                start_or_slash = false;
141            } else if c == '.' {
142                if start_or_slash && matches!(chars.peek(), Some((_, '.'))) {
143                    return Err(PortableGlobError::ParentDirectory {
144                        pos,
145                        glob: glob.to_string(),
146                    });
147                }
148                start_or_slash = false;
149            } else if c == '/' {
150                start_or_slash = true;
151            } else if c == '[' {
152                for (pos, c) in chars.by_ref() {
153                    if c.is_alphanumeric() || matches!(c, '_' | '-' | '.') {
154                        // Allowed.
155                    } else if c == ']' {
156                        break;
157                    } else {
158                        return Err(PortableGlobError::InvalidCharacterRange {
159                            glob: glob.to_string(),
160                            pos,
161                            invalid: c,
162                        });
163                    }
164                }
165                start_or_slash = false;
166            } else if c == '\\' {
167                match self {
168                    Self::Pep639 => {
169                        return Err(PortableGlobError::InvalidBackslash {
170                            glob: glob.to_string(),
171                            pos,
172                        });
173                    }
174                    Self::Uv => {
175                        match chars.next() {
176                            Some((pos, '/' | '\\')) => {
177                                // For cross-platform compatibility, we don't allow forward slashes or
178                                // backslashes to be escaped.
179                                return Err(PortableGlobError::InvalidEscapee {
180                                    glob: glob.to_string(),
181                                    pos,
182                                });
183                            }
184                            Some(_) => {
185                                // Escaped character
186                            }
187                            None => {
188                                return Err(PortableGlobError::TrailingEscape {
189                                    glob: glob.to_string(),
190                                    pos,
191                                });
192                            }
193                        }
194                    }
195                }
196            } else {
197                let err = match self {
198                    Self::Pep639 => PortableGlobError::InvalidCharacter {
199                        glob: glob.to_string(),
200                        pos,
201                        invalid: c,
202                    },
203                    Self::Uv => PortableGlobError::InvalidCharacterUv {
204                        glob: glob.to_string(),
205                        pos,
206                        invalid: c,
207                    },
208                };
209                return Err(err);
210            }
211        }
212        Ok(())
213    }
214}
215
#[cfg(test)]
mod tests {
    use super::*;
    use insta::assert_snapshot;

    /// Parse `glob` with `parser`, which must fail, and return the error message with ANSI
    /// styling stripped.
    fn error_message(parser: PortableGlobParser, glob: &str) -> String {
        let failure = parser.parse(glob).unwrap_err();
        anstream::adapter::strip_str(&failure.to_string()).to_string()
    }

    /// Rejected globs produce the expected messages in both parser modes.
    #[test]
    fn test_error() {
        let parse_err = |glob: &str| error_message(PortableGlobParser::Pep639, glob);
        assert_snapshot!(
            parse_err(".."),
            @"The parent directory operator (`..`) at position 0 is not allowed in glob: `..`"
        );
        assert_snapshot!(
            parse_err("licenses/.."),
            @"The parent directory operator (`..`) at position 9 is not allowed in glob: `licenses/..`"
        );
        assert_snapshot!(
            parse_err("licenses/LICEN!E.txt"),
            @"Invalid character `!` at position 14 in glob: `licenses/LICEN!E.txt`"
        );
        assert_snapshot!(
            parse_err("licenses/LICEN[!C]E.txt"),
            @"Invalid character `!` in range at position 15 in glob: `licenses/LICEN[!C]E.txt`"
        );
        assert_snapshot!(
            parse_err("licenses/LICEN[C?]E.txt"),
            @"Invalid character `?` in range at position 16 in glob: `licenses/LICEN[C?]E.txt`"
        );
        assert_snapshot!(
            parse_err("******"),
            @"Too many at stars at position 0 in glob: `******`"
        );
        assert_snapshot!(
            parse_err("licenses/**license"),
            @"Too many at stars at position 9 in glob: `licenses/**license`"
        );
        assert_snapshot!(
            parse_err("licenses/***/licenses.csv"),
            @"Too many at stars at position 9 in glob: `licenses/***/licenses.csv`"
        );
        assert_snapshot!(
            parse_err(r"licenses\eula.txt"),
            @r"Only forward slashes are allowed as path separator, invalid character at position 8 in glob: `licenses\eula.txt`"
        );
        assert_snapshot!(
            parse_err(r"**/@test"),
            @"Invalid character `@` at position 3 in glob: `**/@test`"
        );
        // Escapes are not allowed in strict PEP 639 mode
        assert_snapshot!(
            parse_err(r"public domain/Gulliver\\’s Travels.txt"),
            @r"Invalid character ` ` at position 6 in glob: `public domain/Gulliver\\’s Travels.txt`"
        );

        let parse_err_uv = |glob: &str| error_message(PortableGlobParser::Uv, glob);
        assert_snapshot!(
            parse_err_uv(r"**/@test"),
            @"Invalid character `@` at position 3 in glob: `**/@test`. hint: Characters can be escaped with a backslash"
        );
        // Escaping slashes is not allowed.
        assert_snapshot!(
            parse_err_uv(r"licenses\\MIT.txt"),
            @r"Path separators can't be escaped, invalid character at position 9 in glob: `licenses\\MIT.txt`"
        );
        assert_snapshot!(
            parse_err_uv(r"licenses\/MIT.txt"),
            @r"Path separators can't be escaped, invalid character at position 9 in glob: `licenses\/MIT.txt`"
        );
    }

    /// Well-formed globs parse: the portable ones in both modes, the escaped ones in uv mode.
    #[test]
    fn test_valid() {
        let portable = [
            r"licenses/*.txt",
            r"licenses/**/*.txt",
            r"LICEN[CS]E.txt",
            r"LICEN?E.txt",
            r"[a-z].txt",
            r"[a-z._-].txt",
            r"*/**",
            r"LICENSE..txt",
            r"LICENSE_file-1.txt",
            // (google translate)
            r"licenses/라이센스*.txt",
            r"licenses/ライセンス*.txt",
            r"licenses/执照*.txt",
            r"src/**",
        ];
        let uv_only = [
            r"public-domain/Gulliver\’s\ Travels.txt",
            // https://github.com/astral-sh/uv/issues/13280
            r"**/\@test",
        ];

        for glob in portable.iter().copied() {
            PortableGlobParser::Pep639.parse(glob).unwrap();
        }
        for glob in portable.iter().chain(&uv_only).copied() {
            PortableGlobParser::Uv.parse(glob).unwrap();
        }
    }
}