fnmatch_regex2/
glob.rs

1// SPDX-FileCopyrightText: Peter Pentchev <roam@ringlet.net>
2// SPDX-License-Identifier: BSD-2-Clause
3//! Shell glob-like filename matching.
4//!
5//! The glob-style pattern features currently supported are:
6//! - any character except `?`, `*`, `[`, `\`, or `{` is matched literally
7//! - `?` matches any single character except a slash (`/`)
8//! - `*` matches any sequence of zero or more characters that does not
9//!   contain a slash (`/`)
10//! - a backslash allows the next character to be matched literally, except
11//!   for the `\a`, `\b`, `\e`, `\n`, `\r`, and `\v` sequences
12//! - a `[...]` character class supports ranges, negation if the very first
13//!   character is `!`, backslash-escaping, and also matching
14//!   a `]` character if it is the very first character possibly after
15//!   the `!` one (e.g. `[]]` would only match a single `]` character)
16//! - an `{a,bbb,cc}` alternation supports backslash-escaping, but not
17//!   nested alternations or character classes yet
18//!
19//! Note that the `*` and `?` wildcard patterns, as well as the character
20//! classes, will never match a slash.
21//!
22//! Examples:
23//! - `abc.txt` would only match `abc.txt`
24//! - `foo/test?.txt` would match e.g. `foo/test1.txt` or `foo/test".txt`,
25//!   but not `foo/test/.txt`
26//! - `/etc/c[--9].conf` would match e.g. `/etc/c-.conf`, `/etc/c..conf`,
27//!    or `/etc/7.conf`, but not `/etc/c/.conf`
28//! - `linux-[0-9]*-{generic,aws}` would match `linux-5.2.27b1-generic`
29//!   and `linux-4.0.12-aws`, but not `linux-unsigned-5.2.27b1-generic`
30//!
31//! Note that the [`glob_to_regex`] function returns a regular expression
32//! that will only verify whether a specified text string matches
33//! the pattern; it does not in any way attempt to look up any paths on
34//! the filesystem.
35//!
36//! ```rust
37//! # use std::error::Error;
38//!
39//! # fn main() -> Result<(), Box<dyn Error>> {
40//! let re_name = fnmatch_regex2::glob_to_regex("linux-[0-9]*-{generic,aws}")?;
41//! for name in &[
42//!     "linux-5.2.27b1-generic",
43//!     "linux-4.0.12-aws",
44//!     "linux-unsigned-5.2.27b1-generic"
45//! ] {
46//!     let okay = re_name.is_match(name);
47//!     println!(
48//!         "{}: {}",
49//!         name,
50//!         match okay { true => "yes", false => "no" },
51//!     );
52//!     assert!(okay == !name.contains("unsigned"));
53//! }
54//! # Ok(())
55//! # }
56//! ```
57
58use std::mem;
59use std::vec::IntoIter as VecIntoIter;
60
61use anyhow::anyhow;
62use itertools::{Either, Itertools as _};
63use regex::{Regex, RegexBuilder};
64
65use crate::error::Error as FError;
66
67/// Something that may appear in a character class.
68#[derive(Debug)]
69enum ClassItem {
70    /// A character may appear in a character class.
71    Char(char),
72    /// A range of characters may appear in a character class.
73    Range(char, char),
74}
75
76/// An accumulator for building the representation of a character class.
77#[derive(Debug)]
78struct ClassAccumulator {
79    /// Is the class negated (i.e. was `^` the first character).
80    negated: bool,
81    /// The characters or ranges in the class, in order of appearance.
82    items: Vec<ClassItem>,
83}
84
85/// The current state of the glob pattern parser.
86#[derive(Debug)]
87enum State {
88    /// The very start of the pattern.
89    Start,
90    /// The end of the pattern, nothing more to do.
91    End,
92    /// The next item can be a literal character.
93    Literal,
94    /// The next item will signify a character escape, e.g. `\t`, `\n`, etc.
95    Escape,
96    /// The next item will be the first character of a class, possibly `^`.
97    ClassStart,
98    /// The next item will either be a character or a range, both within a class.
99    Class(ClassAccumulator),
100    /// A character class range was completed; check whether the next character is
101    /// a dash.
102    ClassRange(ClassAccumulator, char),
103    /// There was a dash following a character range; let's hope this is the end of
104    /// the class definition.
105    ClassRangeDash(ClassAccumulator),
106    /// The next item will signify a character escape within a character class.
107    ClassEscape(ClassAccumulator),
108    /// We are building a collection of alternatives.
109    Alternate(String, Vec<String>),
110    /// The next item will signify a character escape within a collection of alternatives.
111    AlternateEscape(String, Vec<String>),
112}
113
114// We need this so we can use mem::take() later.
115impl Default for State {
116    fn default() -> Self {
117        Self::Start
118    }
119}
120
121/// Escape a character in a character class if necessary.
122/// This only escapes the backslash itself and the closing bracket.
123fn escape_in_class(chr: char) -> String {
124    if chr == ']' || chr == '\\' {
125        format!("\\{chr}")
126    } else {
127        chr.to_string()
128    }
129}
130
131/// Escape a character outside of a character class if necessary.
132fn escape(chr: char) -> String {
133    if "[{(|^$.*?+\\".contains(chr) {
134        format!("\\{chr}")
135    } else {
136        chr.to_string()
137    }
138}
139
140/// Interpret an escaped character: return the one that was meant.
141const fn map_letter_escape(chr: char) -> char {
142    match chr {
143        'a' => '\x07',
144        'b' => '\x08',
145        'e' => '\x1b',
146        'f' => '\x0c',
147        'n' => '\x0a',
148        'r' => '\x0d',
149        't' => '\x09',
150        'v' => '\x0b',
151        other => other,
152    }
153}
154
155/// Unescape a character and escape it if needed.
156fn escape_special(chr: char) -> String {
157    escape(map_letter_escape(chr))
158}
159
160/// Remove a slash from characters and classes.
161struct ExcIter<I>
162where
163    I: Iterator<Item = ClassItem>,
164{
165    /// The items to remove slashes from.
166    it: I,
167}
168
169impl<I> Iterator for ExcIter<I>
170where
171    I: Iterator<Item = ClassItem>,
172{
173    type Item = VecIntoIter<ClassItem>;
174
175    fn next(&mut self) -> Option<Self::Item> {
176        self.it.next().map(|cls| {
177            match cls {
178                ClassItem::Char('/') => vec![],
179                ClassItem::Char(_) => vec![cls],
180                ClassItem::Range('.', '/') => vec![ClassItem::Char('.')],
181                ClassItem::Range(start, '/') => vec![ClassItem::Range(start, '.')],
182                ClassItem::Range('/', '0') => vec![ClassItem::Char('0')],
183                ClassItem::Range('/', end) => vec![ClassItem::Range('0', end)],
184                ClassItem::Range(start, end) if start > '/' || end < '/' => vec![cls],
185                ClassItem::Range(start, end) => vec![
186                    if start == '.' {
187                        ClassItem::Char('.')
188                    } else {
189                        ClassItem::Range(start, '.')
190                    },
191                    if end == '0' {
192                        ClassItem::Char('0')
193                    } else {
194                        ClassItem::Range('0', end)
195                    },
196                ],
197            }
198            .into_iter()
199        })
200    }
201}
202
203/// Character classes should never match a slash when used in filenames.
204/// Thus, make sure that a negated character class will include the slash
205/// character and that a non-negated one will not include it.
206fn handle_slash(mut acc: ClassAccumulator) -> ClassAccumulator {
207    if acc.negated {
208        let slash_found = acc.items.iter().any(|item| match *item {
209            ClassItem::Char('/') => true,
210            ClassItem::Char(_) => false,
211            ClassItem::Range(start, end) => start <= '/' && end >= '/',
212        });
213        if !slash_found {
214            acc.items.push(ClassItem::Char('/'));
215        }
216        acc
217    } else {
218        ClassAccumulator {
219            items: ExcIter {
220                it: acc.items.into_iter(),
221            }
222            .flatten()
223            .collect(),
224            ..acc
225        }
226    }
227}
228
229/// Convert a glob character class to a regular expression one.
230/// Make sure none of the classes will allow a slash to be matched in
231/// a filename, make sure the dash is at the end of the regular expression
232/// class pattern (e.g. `[A-Za-z0-9-]`), sort the characters and the classes.
233fn close_class(glob_acc: ClassAccumulator) -> String {
234    let acc = handle_slash(glob_acc);
235    let (chars_vec, classes_vec): (Vec<_>, Vec<_>) =
236        acc.items.into_iter().partition_map(|item| match item {
237            ClassItem::Char(chr) => Either::Left(chr),
238            ClassItem::Range(start, end) => Either::Right((start, end)),
239        });
240
241    let (chars, final_dash) = {
242        let mut has_dash = false;
243        let res = chars_vec
244            .into_iter()
245            .filter(|chr| {
246                if *chr == '-' {
247                    has_dash = true;
248                    false
249                } else {
250                    true
251                }
252            })
253            .sorted_unstable()
254            .dedup()
255            .map(escape_in_class);
256        (res, if has_dash { "-" } else { "" })
257    };
258
259    let classes = classes_vec
260        .into_iter()
261        .sorted_unstable()
262        .dedup()
263        .map(|cls| {
264            format!(
265                "{first}-{last}",
266                first = escape_in_class(cls.0),
267                last = escape_in_class(cls.1)
268            )
269        });
270
271    format!(
272        "[{neg}{ranges}{final_dash}]",
273        neg = if acc.negated { "^" } else { "" },
274        ranges = chars.chain(classes).collect::<String>(),
275    )
276}
277
278/// Convert a glob alternatives list to a regular expression pattern.
279fn close_alternate(gathered: Vec<String>) -> String {
280    let items = gathered
281        .into_iter()
282        .map(|item| item.chars().map(escape).collect::<String>())
283        .sorted_unstable()
284        .dedup()
285        .join("|");
286
287    format!("({items})")
288}
289
290/// Iterate over a glob pattern's characters, build up a regular expression.
291struct GlobIterator<I: Iterator<Item = char>> {
292    /// The iterator over the glob pattern's characters.
293    pattern: I,
294    /// The current state of the glob pattern parser.
295    state: State,
296}
297
298/// Either a piece of the regular expression or an error.
299type StringResult = Result<Option<String>, FError>;
300
301impl<I> GlobIterator<I>
302where
303    I: Iterator<Item = char>,
304{
305    /// Output a "^" at the very start of the pattern.
306    fn handle_start(&mut self) -> String {
307        self.state = State::Literal;
308        "^".to_owned()
309    }
310
311    /// Handle the next character when expecting a literal one.
312    fn handle_literal(&mut self) -> Option<String> {
313        match self.pattern.next() {
314            None => {
315                self.state = State::End;
316                Some("$".to_owned())
317            }
318            Some(chr) => {
319                let (new_state, res) = match chr {
320                    '\\' => (State::Escape, None),
321                    '[' => (State::ClassStart, None),
322                    '{' => (State::Alternate(String::new(), Vec::new()), None),
323                    '?' => (State::Literal, Some("[^/]".to_owned())),
324                    '*' => (State::Literal, Some(".*".to_owned())),
325                    _ => (State::Literal, Some(regex::escape(&chr.to_string()))),
326                };
327                self.state = new_state;
328                res
329            }
330        }
331    }
332
333    /// Handle an escaped character.
334    fn handle_escape(&mut self) -> StringResult {
335        match self.pattern.next() {
336            Some(chr) => {
337                self.state = State::Literal;
338                Ok(Some(escape_special(chr)))
339            }
340            None => Err(FError::BareEscape),
341        }
342    }
343
344    /// Handle the first character in a character class specification.
345    fn handle_class_start(&mut self) -> StringResult {
346        match self.pattern.next() {
347            Some(chr) => {
348                self.state = match chr {
349                    '!' => State::Class(ClassAccumulator {
350                        negated: true,
351                        items: Vec::new(),
352                    }),
353                    '-' => State::Class(ClassAccumulator {
354                        negated: false,
355                        items: vec![ClassItem::Char('-')],
356                    }),
357                    ']' => State::Class(ClassAccumulator {
358                        negated: false,
359                        items: vec![ClassItem::Char(']')],
360                    }),
361                    '\\' => State::ClassEscape(ClassAccumulator {
362                        negated: false,
363                        items: Vec::new(),
364                    }),
365                    other => State::Class(ClassAccumulator {
366                        negated: false,
367                        items: vec![ClassItem::Char(other)],
368                    }),
369                };
370                Ok(None)
371            }
372            None => Err(FError::UnclosedClass),
373        }
374    }
375
376    /// Handle a character in a character class specification.
377    fn handle_class(&mut self, mut acc: ClassAccumulator) -> StringResult {
378        match self.pattern.next() {
379            Some(chr) => Ok(match chr {
380                ']' => {
381                    if acc.items.is_empty() {
382                        acc.items.push(ClassItem::Char(']'));
383                        self.state = State::Class(acc);
384                        None
385                    } else {
386                        self.state = State::Literal;
387                        Some(close_class(acc))
388                    }
389                }
390                '-' => match acc.items.pop() {
391                    None => {
392                        acc.items.push(ClassItem::Char('-'));
393                        self.state = State::Class(acc);
394                        None
395                    }
396                    Some(ClassItem::Range(start, end)) => {
397                        acc.items.push(ClassItem::Range(start, end));
398                        self.state = State::ClassRangeDash(acc);
399                        None
400                    }
401                    Some(ClassItem::Char(start)) => {
402                        self.state = State::ClassRange(acc, start);
403                        None
404                    }
405                },
406                '\\' => {
407                    self.state = State::ClassEscape(acc);
408                    None
409                }
410                other => {
411                    acc.items.push(ClassItem::Char(other));
412                    self.state = State::Class(acc);
413                    None
414                }
415            }),
416            None => Err(FError::UnclosedClass),
417        }
418    }
419
420    /// Escape a character in a class specification.
421    fn handle_class_escape(&mut self, mut acc: ClassAccumulator) -> StringResult {
422        match self.pattern.next() {
423            Some(chr) => {
424                acc.items.push(ClassItem::Char(map_letter_escape(chr)));
425                self.state = State::Class(acc);
426                Ok(None)
427            }
428            None => Err(FError::UnclosedClass),
429        }
430    }
431
432    /// Handle a character within a class range.
433    fn handle_class_range(&mut self, mut acc: ClassAccumulator, start: char) -> StringResult {
434        match self.pattern.next() {
435            Some(chr) => match chr {
436                '\\' => Err(FError::NotImplemented(format!(
437                    "FIXME: handle class range end escape with {acc:?} start {start:?}"
438                ))),
439                ']' => {
440                    acc.items.push(ClassItem::Char(start));
441                    acc.items.push(ClassItem::Char('-'));
442                    self.state = State::Literal;
443                    Ok(Some(close_class(acc)))
444                }
445                end if start > end => Err(FError::ReversedRange(start, end)),
446                end if start == end => {
447                    acc.items.push(ClassItem::Char(start));
448                    self.state = State::Class(acc);
449                    Ok(None)
450                }
451                end => {
452                    acc.items.push(ClassItem::Range(start, end));
453                    self.state = State::Class(acc);
454                    Ok(None)
455                }
456            },
457            None => Err(FError::UnclosedClass),
458        }
459    }
460
461    /// Handle a dash immediately following a range within a character class.
462    fn handle_class_range_dash(&mut self, mut acc: ClassAccumulator) -> StringResult {
463        match self.pattern.next() {
464            Some(chr) => {
465                if chr == ']' {
466                    acc.items.push(ClassItem::Char('-'));
467                    self.state = State::Literal;
468                    Ok(Some(close_class(acc)))
469                } else if let Some(ClassItem::Range(start, end)) = acc.items.pop() {
470                    Err(FError::RangeAfterRange(start, end))
471                } else {
472                    Err(FError::InternalError(anyhow!(
473                        "handle_class_range_dash: chr {chr:?} acc {acc:?}"
474                    )))
475                }
476            }
477            None => Err(FError::UnclosedClass),
478        }
479    }
480
481    /// Start a set of alternatives.
482    fn handle_alternate(&mut self, mut current: String, mut gathered: Vec<String>) -> StringResult {
483        match self.pattern.next() {
484            Some(chr) => match chr {
485                ',' => {
486                    gathered.push(current);
487                    self.state = State::Alternate(String::new(), gathered);
488                    Ok(None)
489                }
490                '}' => {
491                    self.state = State::Literal;
492                    if current.is_empty() && gathered.is_empty() {
493                        Ok(Some(r"\{\}".to_owned()))
494                    } else {
495                        gathered.push(current);
496                        Ok(Some(close_alternate(gathered)))
497                    }
498                }
499                '\\' => {
500                    self.state = State::AlternateEscape(current, gathered);
501                    Ok(None)
502                }
503                '[' => Err(FError::NotImplemented(
504                    "FIXME: alternate character class".to_owned(),
505                )),
506                other => {
507                    current.push(other);
508                    self.state = State::Alternate(current, gathered);
509                    Ok(None)
510                }
511            },
512            None => Err(FError::UnclosedAlternation),
513        }
514    }
515
516    /// Escape a character within a list of alternatives.
517    fn handle_alternate_escape(
518        &mut self,
519        mut current: String,
520        gathered: Vec<String>,
521    ) -> StringResult {
522        match self.pattern.next() {
523            Some(chr) => {
524                current.push(map_letter_escape(chr));
525                self.state = State::Alternate(current, gathered);
526                Ok(None)
527            }
528            None => Err(FError::UnclosedAlternation),
529        }
530    }
531}
532
533impl<I> Iterator for GlobIterator<I>
534where
535    I: Iterator<Item = char>,
536{
537    type Item = StringResult;
538
539    fn next(&mut self) -> Option<Self::Item> {
540        match mem::take(&mut self.state) {
541            State::Start => Some(Ok(Some(self.handle_start()))),
542            State::End => None,
543            State::Literal => Some(Ok(self.handle_literal())),
544            State::Escape => Some(self.handle_escape()),
545            State::ClassStart => Some(self.handle_class_start()),
546            State::Class(acc) => Some(self.handle_class(acc)),
547            State::ClassEscape(acc) => Some(self.handle_class_escape(acc)),
548            State::ClassRange(acc, start) => Some(self.handle_class_range(acc, start)),
549            State::ClassRangeDash(acc) => Some(self.handle_class_range_dash(acc)),
550            State::Alternate(current, gathered) => Some(self.handle_alternate(current, gathered)),
551            State::AlternateEscape(current, gathered) => {
552                Some(self.handle_alternate_escape(current, gathered))
553            }
554        }
555    }
556}
557
558/// Parse a shell glob-like pattern into a regular expression.
559///
560/// See the module-level documentation for a description of the pattern
561/// features supported.
562///
563/// # Errors
564/// Most of the [`crate::error::Error`] values, mostly syntax errors in
565/// the specified glob pattern.
566#[allow(clippy::missing_inline_in_public_items)]
567pub fn glob_to_regex(pattern: &str) -> Result<Regex, FError> {
568    let parser = GlobIterator {
569        pattern: pattern.chars(),
570        state: State::Start,
571    };
572    let re_pattern = parser.flatten_ok().collect::<Result<Vec<_>, _>>()?.join("");
573    Regex::new(&re_pattern).map_err(|err| FError::InvalidRegex(re_pattern, err))
574}
575
576/// Parse a shell glob-like pattern into a regular expression.
577///
578/// See the module-level documentation for a description of the pattern
579/// features supported.
580///
581/// # Errors
582/// Most of the [`crate::error::Error`] values, mostly syntax errors in
583/// the specified glob pattern.
584#[allow(clippy::missing_inline_in_public_items)]
585pub fn glob_to_regex_ext(pattern: &str, ignore_case: bool) -> Result<Regex, FError> {
586    let parser = GlobIterator {
587        pattern: pattern.chars(),
588        state: State::Start,
589    };
590    let re_pattern = parser.flatten_ok().collect::<Result<Vec<_>, _>>()?.join("");
591    RegexBuilder::new(&re_pattern)
592        .case_insensitive(ignore_case)
593        .build()
594        .map_err(|err| FError::InvalidRegex(re_pattern, err))
595}