fnmatch_regex2/
glob.rs

1//! Shell glob-like filename matching.
2//!
3//! The glob-style pattern features currently supported are:
4//! - any character except `?`, `*`, `[`, `\`, or `{` is matched literally
5//! - `?` matches any single character except a slash (`/`)
6//! - `*` matches any sequence of zero or more characters that does not
7//!   contain a slash (`/`)
8//! - a backslash allows the next character to be matched literally, except
9//!   for the `\a`, `\b`, `\e`, `\n`, `\r`, and `\v` sequences
10//! - a `[...]` character class supports ranges, negation if the very first
11//!   character is `!`, backslash-escaping, and also matching
12//!   a `]` character if it is the very first character possibly after
13//!   the `!` one (e.g. `[]]` would only match a single `]` character)
14//! - an `{a,bbb,cc}` alternation supports backslash-escaping, but not
15//!   nested alternations or character classes yet
16//!
17//! Note that the `*` and `?` wildcard patterns, as well as the character
18//! classes, will never match a slash.
19//!
20//! Examples:
21//! - `abc.txt` would only match `abc.txt`
22//! - `foo/test?.txt` would match e.g. `foo/test1.txt` or `foo/test".txt`,
23//!   but not `foo/test/.txt`
24//! - `/etc/c[--9].conf` would match e.g. `/etc/c-.conf`, `/etc/c..conf`,
25//!    or `/etc/7.conf`, but not `/etc/c/.conf`
26//! - `linux-[0-9]*-{generic,aws}` would match `linux-5.2.27b1-generic`
27//!   and `linux-4.0.12-aws`, but not `linux-unsigned-5.2.27b1-generic`
28//!
29//! Note that the [`glob_to_regex`] function returns a regular expression
30//! that will only verify whether a specified text string matches
31//! the pattern; it does not in any way attempt to look up any paths on
32//! the filesystem.
33//!
34//! ```rust
35//! # use std::error::Error;
36//!
37//! # fn main() -> Result<(), Box<dyn Error>> {
38//! let re_name = fnmatch_regex2::glob_to_regex("linux-[0-9]*-{generic,aws}")?;
39//! for name in &[
40//!     "linux-5.2.27b1-generic",
41//!     "linux-4.0.12-aws",
42//!     "linux-unsigned-5.2.27b1-generic"
43//! ] {
44//!     let okay = re_name.is_match(name);
45//!     println!(
46//!         "{}: {}",
47//!         name,
48//!         match okay { true => "yes", false => "no" },
49//!     );
50//!     assert!(okay == !name.contains("unsigned"));
51//! }
52//! # Ok(())
53//! # }
54//! ```
55
56/*
57 * Copyright (c) 2021, 2022  Peter Pentchev <roam@ringlet.net>
58 * All rights reserved.
59 *
60 * Redistribution and use in source and binary forms, with or without
61 * modification, are permitted provided that the following conditions
62 * are met:
63 * 1. Redistributions of source code must retain the above copyright
64 *    notice, this list of conditions and the following disclaimer.
65 * 2. Redistributions in binary form must reproduce the above copyright
66 *    notice, this list of conditions and the following disclaimer in the
67 *    documentation and/or other materials provided with the distribution.
68 *
69 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
70 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
71 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
72 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
73 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
74 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
75 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
76 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
77 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
78 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
79 * SUCH DAMAGE.
80 */
81
82use std::mem;
83use std::vec::IntoIter as VecIntoIter;
84
85use itertools::{Either, Itertools};
86use regex::{Regex, RegexBuilder};
87
88use crate::error::Error as FError;
89
90/// Something that may appear in a character class.
91#[derive(Debug)]
92enum ClassItem {
93    /// A character may appear in a character class.
94    Char(char),
95    /// A range of characters may appear in a character class.
96    Range(char, char),
97}
98
99/// An accumulator for building the representation of a character class.
100#[derive(Debug)]
101struct ClassAccumulator {
102    /// Is the class negated (i.e. was `^` the first character).
103    negated: bool,
104    /// The characters or ranges in the class, in order of appearance.
105    items: Vec<ClassItem>,
106}
107
108/// The current state of the glob pattern parser.
109#[derive(Debug)]
110enum State {
111    /// The very start of the pattern.
112    Start,
113    /// The end of the pattern, nothing more to do.
114    End,
115    /// The next item can be a literal character.
116    Literal,
117    /// The next item will signify a character escape, e.g. `\t`, `\n`, etc.
118    Escape,
119    /// The next item will be the first character of a class, possibly `^`.
120    ClassStart,
121    /// The next item will either be a character or a range, both within a class.
122    Class(ClassAccumulator),
123    /// A character class range was completed; check whether the next character is
124    /// a dash.
125    ClassRange(ClassAccumulator, char),
126    /// There was a dash following a character range; let's hope this is the end of
127    /// the class definition.
128    ClassRangeDash(ClassAccumulator),
129    /// The next item will signify a character escape within a character class.
130    ClassEscape(ClassAccumulator),
131    /// We are building a collection of alternatives.
132    Alternate(String, Vec<String>),
133    /// The next item will signify a character escape within a collection of alternatives.
134    AlternateEscape(String, Vec<String>),
135}
136
137// We need this so we can use mem::take() later.
138impl Default for State {
139    fn default() -> Self {
140        Self::Start
141    }
142}
143
144/// Escape a character in a character class if necessary.
145/// This only escapes the backslash itself and the closing bracket.
146fn escape_in_class(chr: char) -> String {
147    if chr == ']' || chr == '\\' {
148        format!("\\{}", chr)
149    } else {
150        chr.to_string()
151    }
152}
153
154/// Escape a character outside of a character class if necessary.
155fn escape(chr: char) -> String {
156    if "[{(|^$.*?+\\".contains(chr) {
157        format!("\\{}", chr)
158    } else {
159        chr.to_string()
160    }
161}
162
163/// Interpret an escaped character: return the one that was meant.
164const fn map_letter_escape(chr: char) -> char {
165    match chr {
166        'a' => '\x07',
167        'b' => '\x08',
168        'e' => '\x1b',
169        'f' => '\x0c',
170        'n' => '\x0a',
171        'r' => '\x0d',
172        't' => '\x09',
173        'v' => '\x0b',
174        other => other,
175    }
176}
177
178/// Unescape a character and escape it if needed.
179fn escape_special(chr: char) -> String {
180    escape(map_letter_escape(chr))
181}
182
183/// Remove a slash from characters and classes.
184struct ExcIter<I>
185where
186    I: Iterator<Item = ClassItem>,
187{
188    /// The items to remove slashes from.
189    it: I,
190}
191
192impl<I> Iterator for ExcIter<I>
193where
194    I: Iterator<Item = ClassItem>,
195{
196    type Item = VecIntoIter<ClassItem>;
197
198    fn next(&mut self) -> Option<Self::Item> {
199        self.it.next().map(|cls| {
200            match cls {
201                ClassItem::Char('/') => vec![],
202                ClassItem::Char(_) => vec![cls],
203                ClassItem::Range('.', '/') => vec![ClassItem::Char('.')],
204                ClassItem::Range(start, '/') => vec![ClassItem::Range(start, '.')],
205                ClassItem::Range('/', '0') => vec![ClassItem::Char('0')],
206                ClassItem::Range('/', end) => vec![ClassItem::Range('0', end)],
207                ClassItem::Range(start, end) if start > '/' || end < '/' => vec![cls],
208                ClassItem::Range(start, end) => vec![
209                    if start == '.' {
210                        ClassItem::Char('.')
211                    } else {
212                        ClassItem::Range(start, '.')
213                    },
214                    if end == '0' {
215                        ClassItem::Char('0')
216                    } else {
217                        ClassItem::Range('0', end)
218                    },
219                ],
220            }
221            .into_iter()
222        })
223    }
224}
225
226/// Exclude the slash character from classes that would include it.
227fn handle_slash_exclude(acc: ClassAccumulator) -> ClassAccumulator {
228    assert!(!acc.negated);
229    ClassAccumulator {
230        items: ExcIter {
231            it: acc.items.into_iter(),
232        }
233        .flatten()
234        .collect(),
235        ..acc
236    }
237}
238
239/// Make sure a character class will match a slash.
240fn handle_slash_include(mut acc: ClassAccumulator) -> ClassAccumulator {
241    assert!(acc.negated);
242    let slash_found = acc.items.iter().any(|item| match *item {
243        ClassItem::Char('/') => true,
244        ClassItem::Char(_) => false,
245        ClassItem::Range(start, end) => start <= '/' && end >= '/',
246    });
247    if !slash_found {
248        acc.items.push(ClassItem::Char('/'));
249    }
250    acc
251}
252
253/// Character classes should never match a slash when used in filenames.
254/// Thus, make sure that a negated character class will include the slash
255/// character and that a non-negated one will not include it.
256fn handle_slash(acc: ClassAccumulator) -> ClassAccumulator {
257    if acc.negated {
258        handle_slash_include(acc)
259    } else {
260        handle_slash_exclude(acc)
261    }
262}
263
264/// Convert a glob character class to a regular expression one.
265/// Make sure none of the classes will allow a slash to be matched in
266/// a filename, make sure the dash is at the end of the regular expression
267/// class pattern (e.g. `[A-Za-z0-9-]`), sort the characters and the classes.
268fn close_class(glob_acc: ClassAccumulator) -> String {
269    let acc = handle_slash(glob_acc);
270    let (chars_vec, classes_vec): (Vec<_>, Vec<_>) =
271        acc.items.into_iter().partition_map(|item| match item {
272            ClassItem::Char(chr) => Either::Left(chr),
273            ClassItem::Range(start, end) => Either::Right((start, end)),
274        });
275
276    let (chars, final_dash) = {
277        let mut has_dash = false;
278        let res = chars_vec
279            .into_iter()
280            .filter(|chr| {
281                if *chr == '-' {
282                    has_dash = true;
283                    false
284                } else {
285                    true
286                }
287            })
288            .sorted_unstable()
289            .dedup()
290            .map(escape_in_class);
291        (res, if has_dash { "-" } else { "" })
292    };
293
294    let classes = classes_vec
295        .into_iter()
296        .sorted_unstable()
297        .dedup()
298        .map(|cls| format!("{}-{}", escape_in_class(cls.0), escape_in_class(cls.1)));
299
300    format!(
301        "[{}{}{}]",
302        if acc.negated { "^" } else { "" },
303        chars.chain(classes).collect::<String>(),
304        final_dash,
305    )
306}
307
308/// Convert a glob alternatives list to a regular expression pattern.
309fn close_alternate(gathered: Vec<String>) -> String {
310    let items = gathered
311        .into_iter()
312        .map(|item| item.chars().map(escape).collect::<String>())
313        .sorted_unstable()
314        .dedup()
315        .join("|");
316
317    format!("({})", items)
318}
319
320/// Iterate over a glob pattern's characters, build up a regular expression.
321struct GlobIterator<I: Iterator<Item = char>> {
322    /// The iterator over the glob pattern's characters.
323    pattern: I,
324    /// The current state of the glob pattern parser.
325    state: State,
326}
327
328/// Either a piece of the regular expression or an error.
329type StringResult = Result<Option<String>, FError>;
330
331impl<I> GlobIterator<I>
332where
333    I: Iterator<Item = char>,
334{
335    /// Output a "^" at the very start of the pattern.
336    fn handle_start(&mut self) -> String {
337        self.state = State::Literal;
338        "^".to_owned()
339    }
340
341    /// Handle the next character when expecting a literal one.
342    fn handle_literal(&mut self) -> Option<String> {
343        match self.pattern.next() {
344            None => {
345                self.state = State::End;
346                Some("$".to_owned())
347            }
348            Some(chr) => {
349                let (new_state, res) = match chr {
350                    '\\' => (State::Escape, None),
351                    '[' => (State::ClassStart, None),
352                    '{' => (State::Alternate(String::new(), Vec::new()), None),
353                    '?' => (State::Literal, Some("[^/]".to_owned())),
354                    '*' => (State::Literal, Some(".*".to_owned())),
355                    ']' | '}' | '.' => (State::Literal, Some(format!("\\{}", chr))),
356                    _ => (State::Literal, Some(format!("{}", chr))),
357                };
358                self.state = new_state;
359                res
360            }
361        }
362    }
363
364    /// Handle an escaped character.
365    fn handle_escape(&mut self) -> StringResult {
366        match self.pattern.next() {
367            Some(chr) => {
368                self.state = State::Literal;
369                Ok(Some(escape_special(chr)))
370            }
371            None => Err(FError::BareEscape),
372        }
373    }
374
375    /// Handle the first character in a character class specification.
376    fn handle_class_start(&mut self) -> StringResult {
377        match self.pattern.next() {
378            Some(chr) => {
379                self.state = match chr {
380                    '!' => State::Class(ClassAccumulator {
381                        negated: true,
382                        items: Vec::new(),
383                    }),
384                    '-' => State::Class(ClassAccumulator {
385                        negated: false,
386                        items: vec![ClassItem::Char('-')],
387                    }),
388                    ']' => State::Class(ClassAccumulator {
389                        negated: false,
390                        items: vec![ClassItem::Char(']')],
391                    }),
392                    '\\' => State::ClassEscape(ClassAccumulator {
393                        negated: false,
394                        items: Vec::new(),
395                    }),
396                    other => State::Class(ClassAccumulator {
397                        negated: false,
398                        items: vec![ClassItem::Char(other)],
399                    }),
400                };
401                Ok(None)
402            }
403            None => Err(FError::UnclosedClass),
404        }
405    }
406
407    /// Handle a character in a character class specification.
408    fn handle_class(&mut self, mut acc: ClassAccumulator) -> StringResult {
409        match self.pattern.next() {
410            Some(chr) => Ok(match chr {
411                ']' => {
412                    if acc.items.is_empty() {
413                        acc.items.push(ClassItem::Char(']'));
414                        self.state = State::Class(acc);
415                        None
416                    } else {
417                        self.state = State::Literal;
418                        Some(close_class(acc))
419                    }
420                }
421                '-' => match acc.items.pop() {
422                    None => {
423                        acc.items.push(ClassItem::Char('-'));
424                        self.state = State::Class(acc);
425                        None
426                    }
427                    Some(ClassItem::Range(start, end)) => {
428                        acc.items.push(ClassItem::Range(start, end));
429                        self.state = State::ClassRangeDash(acc);
430                        None
431                    }
432                    Some(ClassItem::Char(start)) => {
433                        self.state = State::ClassRange(acc, start);
434                        None
435                    }
436                },
437                '\\' => {
438                    self.state = State::ClassEscape(acc);
439                    None
440                }
441                other => {
442                    acc.items.push(ClassItem::Char(other));
443                    self.state = State::Class(acc);
444                    None
445                }
446            }),
447            None => Err(FError::UnclosedClass),
448        }
449    }
450
451    /// Escape a character in a class specification.
452    fn handle_class_escape(&mut self, mut acc: ClassAccumulator) -> StringResult {
453        match self.pattern.next() {
454            Some(chr) => {
455                acc.items.push(ClassItem::Char(map_letter_escape(chr)));
456                self.state = State::Class(acc);
457                Ok(None)
458            }
459            None => Err(FError::UnclosedClass),
460        }
461    }
462
463    /// Handle a character within a class range.
464    fn handle_class_range(&mut self, mut acc: ClassAccumulator, start: char) -> StringResult {
465        match self.pattern.next() {
466            Some(chr) => match chr {
467                '\\' => Err(FError::NotImplemented(format!(
468                    "FIXME: handle class range end escape with {:?} start {:?}",
469                    acc, start
470                ))),
471                ']' => {
472                    acc.items.push(ClassItem::Char(start));
473                    acc.items.push(ClassItem::Char('-'));
474                    self.state = State::Literal;
475                    Ok(Some(close_class(acc)))
476                }
477                end if start > end => Err(FError::ReversedRange(start, end)),
478                end if start == end => {
479                    acc.items.push(ClassItem::Char(start));
480                    self.state = State::Class(acc);
481                    Ok(None)
482                }
483                end => {
484                    acc.items.push(ClassItem::Range(start, end));
485                    self.state = State::Class(acc);
486                    Ok(None)
487                }
488            },
489            None => Err(FError::UnclosedClass),
490        }
491    }
492
493    /// Handle a dash immediately following a range within a character class.
494    #[allow(clippy::panic_in_result_fn)]
495    #[allow(clippy::unreachable)]
496    fn handle_class_range_dash(&mut self, mut acc: ClassAccumulator) -> StringResult {
497        match self.pattern.next() {
498            Some(chr) => {
499                if chr == ']' {
500                    acc.items.push(ClassItem::Char('-'));
501                    self.state = State::Literal;
502                    Ok(Some(close_class(acc)))
503                } else if let Some(ClassItem::Range(start, end)) = acc.items.pop() {
504                    Err(FError::RangeAfterRange(start, end))
505                } else {
506                    // Let's hope the optimizer hears us...
507                    unreachable!()
508                }
509            }
510            None => Err(FError::UnclosedClass),
511        }
512    }
513
514    /// Start a set of alternatives.
515    fn handle_alternate(&mut self, mut current: String, mut gathered: Vec<String>) -> StringResult {
516        match self.pattern.next() {
517            Some(chr) => match chr {
518                ',' => {
519                    gathered.push(current);
520                    self.state = State::Alternate(String::new(), gathered);
521                    Ok(None)
522                }
523                '}' => {
524                    self.state = State::Literal;
525                    if current.is_empty() && gathered.is_empty() {
526                        Ok(Some(r"\{\}".to_owned()))
527                    } else {
528                        gathered.push(current);
529                        Ok(Some(close_alternate(gathered)))
530                    }
531                }
532                '\\' => {
533                    self.state = State::AlternateEscape(current, gathered);
534                    Ok(None)
535                }
536                '[' => Err(FError::NotImplemented(
537                    "FIXME: alternate character class".to_owned(),
538                )),
539                other => {
540                    current.push(other);
541                    self.state = State::Alternate(current, gathered);
542                    Ok(None)
543                }
544            },
545            None => Err(FError::UnclosedAlternation),
546        }
547    }
548
549    /// Escape a character within a list of alternatives.
550    fn handle_alternate_escape(
551        &mut self,
552        mut current: String,
553        gathered: Vec<String>,
554    ) -> StringResult {
555        match self.pattern.next() {
556            Some(chr) => {
557                current.push(map_letter_escape(chr));
558                self.state = State::Alternate(current, gathered);
559                Ok(None)
560            }
561            None => Err(FError::UnclosedAlternation),
562        }
563    }
564}
565
566impl<I> Iterator for GlobIterator<I>
567where
568    I: Iterator<Item = char>,
569{
570    type Item = StringResult;
571
572    fn next(&mut self) -> Option<Self::Item> {
573        match mem::take(&mut self.state) {
574            State::Start => Some(Ok(Some(self.handle_start()))),
575            State::End => None,
576            State::Literal => Some(Ok(self.handle_literal())),
577            State::Escape => Some(self.handle_escape()),
578            State::ClassStart => Some(self.handle_class_start()),
579            State::Class(acc) => Some(self.handle_class(acc)),
580            State::ClassEscape(acc) => Some(self.handle_class_escape(acc)),
581            State::ClassRange(acc, start) => Some(self.handle_class_range(acc, start)),
582            State::ClassRangeDash(acc) => Some(self.handle_class_range_dash(acc)),
583            State::Alternate(current, gathered) => Some(self.handle_alternate(current, gathered)),
584            State::AlternateEscape(current, gathered) => {
585                Some(self.handle_alternate_escape(current, gathered))
586            }
587        }
588    }
589}
590
591/// Parse a shell glob-like pattern into a regular expression.
592///
593/// See the module-level documentation for a description of the pattern
594/// features supported.
595///
596/// # Errors
597/// Most of the [`crate::error::Error`] values, mostly syntax errors in
598/// the specified glob pattern.
599#[allow(clippy::missing_inline_in_public_items)]
600pub fn glob_to_regex(pattern: &str) -> Result<Regex, FError> {
601    let parser = GlobIterator {
602        pattern: pattern.chars(),
603        state: State::Start,
604    };
605    let re_pattern = parser.flatten_ok().collect::<Result<Vec<_>, _>>()?.join("");
606    Regex::new(&re_pattern).map_err(|err| FError::InvalidRegex(re_pattern, err.to_string()))
607}
608
609/// Parse a shell glob-like pattern into a regular expression.
610///
611/// See the module-level documentation for a description of the pattern
612/// features supported.
613///
614/// # Errors
615/// Most of the [`crate::error::Error`] values, mostly syntax errors in
616/// the specified glob pattern.
617#[allow(clippy::missing_inline_in_public_items)]
618pub fn glob_to_regex_ext(pattern: &str, ignore_case: bool) -> Result<Regex, FError> {
619    let parser = GlobIterator {
620        pattern: pattern.chars(),
621        state: State::Start,
622    };
623    let re_pattern = parser.flatten_ok().collect::<Result<Vec<_>, _>>()?.join("");
624    RegexBuilder::new(&re_pattern)
625        .case_insensitive(ignore_case)
626        .build()
627        .map_err(|err| FError::InvalidRegex(re_pattern, err.to_string()))
628}