Skip to main content

shannon_brush_parser/
pattern.rs

1//! Implements parsing for shell glob and extglob patterns.
2
3use crate::error;
4
/// Represents the kind of an extended glob.
///
/// The kind is chosen by the single prefix character that immediately precedes the
/// parenthesized, `|`-separated list of inner patterns (for example the `+` in `+(a|b)`).
///
/// The type is a plain field-less enum, so it derives the common traits: values can be
/// copied, compared, and printed with `{:?}`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtendedGlobKind {
    /// The `+` extended glob; matches one or more occurrences of the inner pattern.
    Plus,
    /// The `@` extended glob; allows matching an alternation of inner patterns.
    At,
    /// The `!` extended glob; matches the negation of the inner pattern.
    Exclamation,
    /// The `?` extended glob; matches zero or one occurrence of the inner pattern.
    Question,
    /// The `*` extended glob; matches zero or more occurrences of the inner pattern.
    Star,
}
18
19/// Converts a shell pattern to a regular expression string.
20///
21/// # Arguments
22///
23/// * `pattern` - The shell pattern to convert.
24/// * `enable_extended_globbing` - Whether to enable extended globbing (extglob).
25pub fn pattern_to_regex_str(
26    pattern: &str,
27    enable_extended_globbing: bool,
28) -> Result<String, error::WordParseError> {
29    let regex_str = pattern_to_regex_translator::pattern(pattern, enable_extended_globbing)
30        .map_err(|e| error::WordParseError::Pattern(e.into()))?;
31    Ok(regex_str)
32}
33
34peg::parser! {
35    grammar pattern_to_regex_translator(enable_extended_globbing: bool) for str {
36        pub(crate) rule pattern() -> String =
37            pieces:(pattern_piece()*) {
38                pieces.join("")
39            }
40
41        rule pattern_piece() -> String =
42            escape_sequence() /
43            bracket_expression() /
44            extglob_enabled() s:extended_glob_pattern() { s } /
45            wildcard() /
46            [c if regex_char_needs_escaping(c)] {
47                let mut s = '\\'.to_string();
48                s.push(c);
49                s
50            } /
51            [c] { c.to_string() }
52
53        rule escape_sequence() -> String =
54            sequence:$(['\\'] [c if regex_char_needs_escaping(c)]) { sequence.to_owned() } /
55            ['\\'] [c] { c.to_string() }
56
57        rule bracket_expression() -> String =
58            "[" invert:(invert_char()?) members:bracket_member()+ "]" {
59                let mut members = members.into_iter().flatten().collect::<Vec<_>>();
60
61                // If we completed the parse but ended up with no valid members
62                // of the bracket expression, then return a regex that matches nothing.
63                // (Or in the inverted case, matches everything.)
64                if members.is_empty() {
65                    if invert.is_some() {
66                        String::from(".")
67                    } else {
68                        String::from("(?!)")
69                    }
70                } else {
71                    if invert.is_some() {
72                        members.insert(0, String::from("^"));
73                    }
74
75                    std::format!("[{}]", members.join(""))
76                }
77            }
78
79        rule invert_char() -> bool =
80            ['!' | '^'] { true }
81
82        rule bracket_member() -> Option<String> =
83            e:char_class_expression() { Some(e) } /
84            r:char_range() { r } /
85            m:single_char_bracket_member() {
86                let (char_str, _) = m;
87                Some(char_str)
88            }
89
90        rule char_class_expression() -> String =
91            e:$("[:" char_class() ":]") { e.to_owned() }
92
93        rule char_class() =
94            "alnum" / "alpha" / "blank" / "cntrl" / "digit" / "graph" / "lower" / "print" / "punct" / "space" / "upper"/ "xdigit"
95
96        rule char_range() -> Option<String> =
97            from:single_char_bracket_member() "-" to:single_char_bracket_member() {
98                let (from_str, from_c) = from;
99                let (to_str, to_c) = to;
100
101                // Evaluate if the range is valid.
102                if from_c <= to_c {
103                    Some(std::format!("{from_str}-{to_str}"))
104                } else {
105                    None
106                }
107            }
108
109        rule single_char_bracket_member() -> (String, char) =
110            // Preserve escaped characters as-is.
111            ['\\'] [c] { (std::format!("\\{c}"), c) } /
112            // Escape opening bracket.
113            ['['] { (String::from(r"\["), '[') } /
114            // Any other character except closing bracket gets added as-is.
115            [c if c != ']'] { (c.to_string(), c) }
116
117        rule wildcard() -> String =
118            "?" { String::from(".") } /
119            "*" { String::from(".*") }
120
121        rule extglob_enabled() -> () =
122            &[_] {? if enable_extended_globbing { Ok(()) } else { Err("extglob disabled") } }
123
124        pub(crate) rule extended_glob_pattern() -> String =
125            kind:extended_glob_prefix() "(" branches:extended_glob_body() ")" {
126                let mut s = String::new();
127
128                // fancy_regex uses ?! to indicate a negative lookahead.
129                if matches!(kind, ExtendedGlobKind::Exclamation) {
130                    if !branches.is_empty() {
131                        s.push_str("(?:(?!");
132                        s.push_str(&branches.join("|"));
133                        s.push_str(").*|(?>");
134                        s.push_str(&branches.join("|"));
135                        s.push_str(").+?|)");
136                    } else {
137                        s.push_str("(?:.+)");
138                    }
139                } else {
140                    s.push('(');
141                    s.push_str(&branches.join("|"));
142                    s.push(')');
143
144                    match kind {
145                        ExtendedGlobKind::Plus => s.push('+'),
146                        ExtendedGlobKind::Question => s.push('?'),
147                        ExtendedGlobKind::Star => s.push('*'),
148                        ExtendedGlobKind::At | ExtendedGlobKind::Exclamation => (),
149                    }
150                }
151
152                s
153            }
154
155        rule extended_glob_prefix() -> ExtendedGlobKind =
156            "+" { ExtendedGlobKind::Plus } /
157            "@" { ExtendedGlobKind::At } /
158            "!" { ExtendedGlobKind::Exclamation } /
159            "?" { ExtendedGlobKind::Question } /
160            "*" { ExtendedGlobKind::Star }
161
162        pub(crate) rule extended_glob_body() -> Vec<String> =
163            // Cover case with *no* branches.
164            &[')'] { vec![] } /
165            // Otherwise, look for branches separated by '|'.
166            extended_glob_branch() ** "|"
167
168        rule extended_glob_branch() -> String =
169            // Cover case of empty branch.
170            &['|' | ')'] { String::new() } /
171            pieces:(!['|' | ')'] piece:pattern_piece() { piece })+ {
172                pieces.join("")
173            }
174
175        // A glob metacharacter construct: wildcard, bracket expression, or extglob.
176        rule glob_piece() =
177            bracket_expression() /
178            extglob_enabled() extended_glob_pattern() /
179            wildcard()
180
181        // A non-glob piece: an escape sequence or any character not starting a glob.
182        rule non_glob_piece() =
183            escape_sequence() /
184            !glob_piece() [_]
185
186        // Succeeds (returning true) if the pattern contains at least one glob
187        // metacharacter. The same bracket_expression, wildcard, and
188        // extended_glob_pattern rules used for regex conversion are reused here
189        // via negative lookaheads, keeping a single source of truth.
190        pub(crate) rule has_glob_metacharacters() -> bool =
191            non_glob_piece()* glob_piece() [_]* { true }
192    }
193}
194
195/// Returns whether a pattern string contains any glob metacharacters.
196///
197/// Uses the same PEG grammar rules that `pattern_to_regex_str` uses, keeping
198/// a single source of truth for what constitutes a glob metacharacter.
199///
200/// # Arguments
201///
202/// * `pattern` - The shell pattern to check.
203/// * `enable_extended_globbing` - Whether to enable extended globbing (extglob).
204pub fn pattern_has_glob_metacharacters(pattern: &str, enable_extended_globbing: bool) -> bool {
205    pattern_to_regex_translator::has_glob_metacharacters(pattern, enable_extended_globbing)
206        .unwrap_or(false)
207}
208
/// Tells whether a character must be backslash-escaped to appear literally in a
/// regular expression.
///
/// # Arguments
///
/// * `c` - The character to check.
pub const fn regex_char_needs_escaping(c: char) -> bool {
    // Bracket, group, and brace delimiters.
    let is_delimiter = matches!(c, '[' | ']' | '(' | ')' | '{' | '}');
    // Repetition operators and the any-character dot.
    let is_repetition_or_dot = matches!(c, '*' | '?' | '.' | '+');
    // Anchors, alternation, the escape character itself, and the range dash.
    let is_other_operator = matches!(c, '^' | '$' | '|' | '\\' | '-');

    is_delimiter || is_repetition_or_dot || is_other_operator
}
220
#[cfg(test)]
#[expect(clippy::panic_in_result_fn)]
mod tests {
    use super::*;
    use anyhow::Result;

    #[test]
    fn test_bracket_exprs() -> Result<()> {
        // Every case in this test runs with extended globbing enabled.
        let to_regex = |pattern: &str| pattern_to_regex_str(pattern, true);

        // Ranges; a reversed range leaves no valid members, so nothing can match.
        assert_eq!(to_regex("[a-z]")?, "[a-z]");
        assert_eq!(to_regex("[z-a]")?, "(?!)");
        assert_eq!(to_regex("[+-/]")?, "[+-/]");
        assert_eq!(to_regex(r"[\*-/]")?, r"[\*-/]");

        // Plain members, escaped and unescaped.
        assert_eq!(to_regex("[abc]")?, "[abc]");
        assert_eq!(to_regex(r"[\(]")?, r"[\(]");
        assert_eq!(to_regex(r"[(]")?, "[(]");

        // Character classes.
        assert_eq!(to_regex("[[:digit:]]")?, "[[:digit:]]");

        // Punctuation members followed by a wildcard.
        assert_eq!(to_regex(r"[-(),!]*")?, r"[-(),!].*");
        assert_eq!(to_regex(r"[-\(\),\!]*")?, r"[-\(\),\!].*");

        // An escaped dash is a literal member, not a range operator.
        assert_eq!(to_regex(r"[a\-b]")?, r"[a\-b]");
        assert_eq!(to_regex(r"[a\-\*]")?, r"[a\-\*]");

        Ok(())
    }

    #[test]
    fn test_extended_glob() -> Result<()> {
        // Shorthands for invoking the two grammar rules with extglob enabled.
        let translate =
            |glob: &str| pattern_to_regex_translator::extended_glob_pattern(glob, true);
        let branches_of =
            |body: &str| pattern_to_regex_translator::extended_glob_body(body, true);

        // Alternations, including empty branches.
        assert_eq!(translate("@(a|b)")?, "(a|b)");
        assert_eq!(translate("@(|a)")?, "(|a)");
        assert_eq!(translate("@(|)")?, "(|)");

        // A body splits into its branches.
        assert_eq!(branches_of("ab|ac")?, vec!["ab", "ac"]);

        // The `*` kind appends a repetition operator to the group.
        assert_eq!(translate("*(ab|ac)")?, "(ab|ac)*");

        // An empty body has no branches.
        assert_eq!(branches_of("")?, Vec::<String>::new());

        Ok(())
    }

    #[test]
    fn test_has_glob_metacharacters() {
        let without_extglob = |pattern: &str| pattern_has_glob_metacharacters(pattern, false);
        let with_extglob = |pattern: &str| pattern_has_glob_metacharacters(pattern, true);

        // Basic metacharacters.
        assert!(without_extglob("*"));
        assert!(without_extglob("?"));
        assert!(without_extglob("a*b"));
        assert!(without_extglob("a?b"));

        // Valid bracket expressions.
        assert!(without_extglob("[abc]"));
        assert!(without_extglob("[a-z]"));
        assert!(without_extglob("[!a]"));

        // Lone `]` is NOT a glob metacharacter.
        assert!(!without_extglob("]"));
        assert!(!without_extglob("foo]"));
        assert!(!without_extglob("a]b"));

        // Lone `[` without matching `]` is NOT a glob metacharacter.
        assert!(!without_extglob("["));
        assert!(!without_extglob("[abc"));
        assert!(!without_extglob("a[b"));

        // Plain text has no glob characters.
        assert!(!without_extglob("hello"));
        assert!(!without_extglob(""));

        // Backslash-escaped metacharacters are not globs.
        assert!(!without_extglob(r"\*"));
        assert!(!without_extglob(r"\?"));
        assert!(!without_extglob(r"\[abc]"));

        // Extglob patterns are not detected without extended globbing.
        assert!(!without_extglob("@(a)"));
        assert!(!without_extglob("!(a)"));
        assert!(!without_extglob("+(a)"));

        // Extglob patterns are detected with extended globbing.
        assert!(with_extglob("@(a)"));
        assert!(with_extglob("!(a)"));
        assert!(with_extglob("+(a)"));

        // `*(` and `?(` are already caught by the plain `*` and `?` wildcards.
        assert!(without_extglob("*(a)"));
        assert!(without_extglob("?(a)"));
    }
}