irregex/
lib.rs

1//! A simple regular expression engine.
2
3use nom::branch::alt;
4use nom::character::complete::{char, none_of, one_of};
5use nom::combinator::{all_consuming, flat_map, success};
6use nom::error::Error as NomError;
7use nom::multi::{fold_many0, many0_count, many1, separated_list1};
8use nom::sequence::{delimited, preceded};
9use nom::{Finish, IResult, Parser};
10
11/// A regular expression.
12#[derive(Clone, Debug, Eq, PartialEq)]
13pub enum Regex {
14    /// Matches the empty string.
15    Empty,
16    /// Matches any one character.
17    Dot,
18    /// Matches a literal character.
19    Literal(char),
20    /// Matches zero or more repetitions.
21    Star(Box<Regex>),
22    /// Matches two patterns in a row.
23    Concat(Box<Regex>, Box<Regex>),
24    /// Matches either of two patterns.
25    Or(Box<Regex>, Box<Regex>),
26    /// Matches one or more repetitions.
27    Plus(Box<Regex>),
28    /// Matches zero or one times.
29    Maybe(Box<Regex>),
30    /// Matches the opposite of a pattern.
31    Not(Box<Regex>),
32    /// Matches the intersection of two patterns.
33    And(Box<Regex>, Box<Regex>),
34}
35
36/// A regular expression matcher.
37pub trait Matcher {
38    /// Activate this matcher's *start* state.
39    ///
40    /// Returns whether this matcher is in an accepting state.
41    fn start(&mut self) -> bool;
42
43    /// Process a character from the string we're matching against.
44    ///
45    /// Returns whether this matcher is in an accepting state.
46    fn push(&mut self, c: char) -> bool;
47
48    /// Test if a string matches.
49    fn matches(&mut self, text: &str) -> bool {
50        text.chars().fold(self.start(), |_, c| self.push(c))
51    }
52}
53
54impl<T: Matcher + ?Sized> Matcher for Box<T> {
55    fn start(&mut self) -> bool {
56        self.as_mut().start()
57    }
58
59    fn push(&mut self, c: char) -> bool {
60        self.as_mut().push(c)
61    }
62}
63
64/// Extension methods for matchers.
65pub trait MatcherExt: Matcher + Sized {
66    /// Wrap this matcher with the `*` operator.
67    fn star(self) -> Star<Self> {
68        Star(self)
69    }
70
71    /// Concatenate two matchers.
72    fn concat<T: Matcher>(self, other: T) -> Concat<Self, T> {
73        Concat::new(self, other)
74    }
75
76    /// Alternate two matchers.
77    fn or<T: Matcher>(self, other: T) -> Or<Self, T> {
78        Or(self, other)
79    }
80
81    /// Invert a matcher.
82    fn not(self) -> Not<Self> {
83        Not(self)
84    }
85
86    /// Intersect two matchers.
87    fn and<T: Matcher>(self, other: T) -> And<Self, T> {
88        And(self, other)
89    }
90}
91
92impl<T: Matcher> MatcherExt for T {}
93
94/// An error parsing a regular expression.
95pub type ParseError<'a> = NomError<&'a str>;
96
97/// Helper for applying binary operators.
98fn reduce<T>(v: Vec<T>, f: impl FnMut(T, T) -> T) -> T {
99    v.into_iter().reduce(f).unwrap()
100}
101
102/// Parser implementation.
103fn regex(pattern: &str) -> IResult<&str, Regex> {
104    // Empty : ``
105    let empty = success(Regex::Empty);
106
107    // Group : `(` Regex `)`
108    let group = delimited(char('('), regex, char(')'));
109
110    // Dot : `.`
111    let dot = char('.').map(|_| Regex::Dot);
112
113    // Meta : `\` | `(` | `)` | ...
114    let meta = r"\().*|+?!&";
115
116    // Literal : [^Meta]
117    let literal = none_of(meta).map(Regex::Literal);
118
119    // Escape : `\` Meta
120    let escape = preceded(char('\\'), one_of(meta))
121        .map(Regex::Literal);
122
123    // Atom : Literal | Escape | Dot | Group
124    let atom = alt((literal, escape, dot, group));
125
126    // Repeat : Atom
127    //        | Repeat `*`
128    //        | Repeat `+`
129    //        | Repeat `?`
130    let repeat = flat_map(atom, |r| fold_many0(
131        one_of("*+?"),
132        move || r.clone(),
133        |r, c| match c {
134            '*' => r.star(),
135            '+' => r.plus(),
136            '?' => r.maybe(),
137            _ => unreachable!(),
138        },
139    ));
140
141    // Word : Repeat | Word Repeat
142    let word = many1(repeat)
143        .map(|v| reduce(v, Regex::concat));
144
145    // NotWord : Word
146    //         | `!` NotWord
147    let not_word = many0_count(char('!'))
148        .and(word)
149        .map(|(n, r)| (0..n).fold(r, |r, _| r.not()));
150
151    // Chunk : NotWord | Empty
152    let chunk = alt((not_word, empty));
153
154    // Clause : Chunk
155    //        | Alternate `&` Chunk
156    let clause = separated_list1(char('&'), chunk)
157        .map(|v| reduce(v, Regex::and));
158
159    // Regex : Clause
160    //       | Regex `|` Clause
161    separated_list1(char('|'), clause)
162        .map(|v| reduce(v, Regex::or))
163        .parse(pattern)
164}
165
166impl Regex {
167    /// Parse a regular expression.
168    pub fn parse(pattern: &str) -> Result<Self, ParseError<'_>> {
169        all_consuming(regex)
170            .parse(pattern)
171            .finish()
172            .map(|(_, r)| r)
173    }
174
175    /// Compile a regular expression.
176    pub fn matcher(&self) -> Box<dyn Matcher> {
177        match self {
178            Self::Empty => Box::new(
179                Empty::default()
180            ),
181            Self::Dot => Box::new(
182                Dot::default()
183            ),
184            Self::Literal(c) => Box::new(
185                Literal::new(*c)
186            ),
187            Self::Star(r) => Box::new(
188                r.matcher().star()
189            ),
190            Self::Concat(a, b) => Box::new(
191                a.matcher().concat(b.matcher())
192            ),
193            Self::Or(a, b) => Box::new(
194                a.matcher().or(b.matcher())
195            ),
196            Self::Plus(r) => Box::new(
197                r.matcher().concat(r.matcher().star())
198            ),
199            Self::Maybe(r) => Box::new(
200                r.matcher().or(Empty::default())
201            ),
202            Self::Not(r) => Box::new(
203                r.matcher().not()
204            ),
205            Self::And(a, b) => Box::new(
206                a.matcher().and(b.matcher())
207            ),
208        }
209    }
210
211    /// Test if a string matches.
212    pub fn matches(&self, text: &str) -> bool {
213        self.matcher().matches(text)
214    }
215
216    /// Wrap this regex with the `*` operator.
217    pub fn star(self) -> Self {
218        Self::Star(self.into())
219    }
220
221    /// Concatenate two regexes.
222    pub fn concat(self, other: Self) -> Self {
223        Self::Concat(self.into(), other.into())
224    }
225
226    /// Alternate two regexes.
227    pub fn or(self, other: Self) -> Self {
228        Self::Or(self.into(), other.into())
229    }
230
231    /// Wrap this regex with the `+` operator.
232    pub fn plus(self) -> Self {
233        Self::Plus(self.into())
234    }
235
236    /// Wrap this regex with the `?` operator.
237    pub fn maybe(self) -> Self {
238        Self::Maybe(self.into())
239    }
240
241    /// Invert a regex.
242    pub fn not(self) -> Self {
243        Self::Not(self.into())
244    }
245
246    /// Intersect two regexes.
247    pub fn and(self, other: Self) -> Self {
248        Self::And(self.into(), other.into())
249    }
250}
251
252/// Matcher for Regex::Empty.
253#[derive(Debug, Default)]
254pub struct Empty {
255    matched: bool,
256}
257
258impl Matcher for Empty {
259    fn start(&mut self) -> bool {
260        self.matched = true;
261        true
262    }
263
264    fn push(&mut self, _: char) -> bool {
265        self.matched = false;
266        false
267    }
268}
269
270/// Matcher for Regex::Dot.
271#[derive(Debug, Default)]
272pub struct Dot {
273    started: bool,
274    matched: bool,
275}
276
277impl Matcher for Dot {
278    fn start(&mut self) -> bool {
279        self.started = true;
280        self.matched
281    }
282
283    fn push(&mut self, _: char) -> bool {
284        self.matched = self.started;
285        self.started = false;
286        self.matched
287    }
288}
289
290/// Matcher for Regex::Literal.
291#[derive(Debug)]
292pub struct Literal {
293    c: char,
294    started: bool,
295    matched: bool,
296}
297
298impl Literal {
299    /// Create a literal matcher.
300    fn new(c: char) -> Self {
301        Self {
302            c,
303            started: false,
304            matched: false,
305        }
306    }
307}
308
309impl Matcher for Literal {
310    fn start(&mut self) -> bool {
311        self.started = true;
312        self.matched
313    }
314
315    fn push(&mut self, c: char) -> bool {
316        self.matched = self.started && c == self.c;
317        self.started = false;
318        self.matched
319    }
320}
321
322/// Matcher for Regex::Star.
323#[derive(Debug)]
324pub struct Star<T>(T);
325
326impl<T: Matcher> Matcher for Star<T> {
327    fn start(&mut self) -> bool {
328        self.0.start() || true
329    }
330
331    fn push(&mut self, c: char) -> bool {
332        self.0.push(c) && self.start()
333    }
334}
335
336/// Matcher for Regex::Concat.
337#[derive(Debug)]
338pub struct Concat<L, R> {
339    left: L,
340    right: R,
341    right_started: bool,
342}
343
344impl<L: Matcher, R: Matcher> Concat<L, R> {
345    fn new(left: L, right: R) -> Self {
346        Self {
347            left,
348            right,
349            right_started: false,
350        }
351    }
352
353    fn start_right(&mut self) -> bool {
354        self.right_started = true;
355        self.right.start()
356    }
357}
358
359impl<L: Matcher, R: Matcher> Matcher for Concat<L, R> {
360    fn start(&mut self) -> bool {
361        self.left.start() && self.start_right()
362    }
363
364    fn push(&mut self, c: char) -> bool {
365        (self.right_started && self.right.push(c))
366            | (self.left.push(c) && self.start_right())
367    }
368}
369
370/// Matcher for Regex::Or.
371#[derive(Debug)]
372pub struct Or<T, U>(T, U);
373
374impl<T: Matcher, U: Matcher> Matcher for Or<T, U> {
375    fn start(&mut self) -> bool {
376        self.0.start() | self.1.start()
377    }
378
379    fn push(&mut self, c: char) -> bool {
380        self.0.push(c) | self.1.push(c)
381    }
382}
383
384/// Matcher for Regex::Not.
385#[derive(Debug)]
386pub struct Not<T>(T);
387
388impl<T: Matcher> Matcher for Not<T> {
389    fn start(&mut self) -> bool {
390        !self.0.start()
391    }
392
393    fn push(&mut self, c: char) -> bool {
394        !self.0.push(c)
395    }
396}
397
398/// Intersection matcher.
399#[derive(Debug)]
400pub struct And<T, U>(T, U);
401
402impl<T: Matcher, U: Matcher> Matcher for And<T, U> {
403    fn start(&mut self) -> bool {
404        self.0.start() & self.1.start()
405    }
406
407    fn push(&mut self, c: char) -> bool {
408        self.0.push(c) & self.1.push(c)
409    }
410}
411
412#[cfg(test)]
413mod tests {
414    use super::*;
415
416    /// Parse a regex or panic.
417    fn parse(pattern: &str) -> Regex {
418        Regex::parse(pattern)
419            .expect(&format!("Pattern '{pattern}' should parse successfully"))
420    }
421
422    /// Check the result of a successful parse.
423    fn assert_parse(pattern: &str, regex: Regex) {
424        assert_eq!(parse(pattern), regex, "'{pattern}'");
425    }
426
427    /// Check that a parse failed.
428    fn assert_parse_err(pattern: &str) {
429        Regex::parse(pattern)
430            .expect_err(&format!("Pattern '{pattern}' should fail to parse"));
431    }
432
433    #[test]
434    fn parsing() {
435        assert_parse(r"", Regex::Empty);
436        assert_parse(r"()", Regex::Empty);
437        assert_parse(r"(())", Regex::Empty);
438
439        assert_parse(r".", Regex::Dot);
440        assert_parse(r"(.)", Regex::Dot);
441
442        assert_parse(r"a", Regex::Literal('a'));
443        assert_parse(r"\\", Regex::Literal('\\'));
444        assert_parse(r"\(", Regex::Literal('('));
445        assert_parse(r"\)", Regex::Literal(')'));
446        assert_parse(r"\.", Regex::Literal('.'));
447        assert_parse(r"\*", Regex::Literal('*'));
448        assert_parse(r"\+", Regex::Literal('+'));
449        assert_parse(r"\?", Regex::Literal('?'));
450        assert_parse(r"\|", Regex::Literal('|'));
451        assert_parse(r"\!", Regex::Literal('!'));
452        assert_parse(r"\&", Regex::Literal('&'));
453
454        assert_parse(r"()*", Regex::Empty.star());
455        assert_parse(r".*", Regex::Dot.star());
456        assert_parse(r"a*", Regex::Literal('a').star());
457        assert_parse(r"\**", Regex::Literal('*').star());
458        assert_parse(r".**", Regex::Dot.star().star());
459
460        let a = || Regex::Literal('a');
461        let b = || Regex::Literal('b');
462
463        assert_parse("ab", a().concat(b()));
464        assert_parse("ba", b().concat(a()));
465        assert_parse("a*b", a().star().concat(b()));
466        assert_parse("ab*", a().concat(b().star()));
467        assert_parse("(ab)*", a().concat(b()).star());
468
469        assert_parse("a|b", a().or(b()));
470        assert_parse("a|b*", a().or(b().star()));
471        assert_parse("ab|b", a().concat(b()).or(b()));
472        assert_parse("(ab|b)*", a().concat(b()).or(b()).star());
473
474        assert_parse("a+", a().plus());
475        assert_parse("a+b*", a().plus().concat(b().star()));
476        assert_parse("a+*", a().plus().star());
477        assert_parse("a*+", a().star().plus());
478
479        assert_parse("a?", a().maybe());
480
481        assert_parse("!a", a().not());
482        assert_parse("!.*", Regex::Dot.star().not());
483        assert_parse("!a|b", a().not().or(b()));
484
485        assert_parse("a&b", a().and(b()));
486        assert_parse("a&!b|b", a().and(b().not()).or(b()));
487        assert_parse("a&(!b|b)", a().and(b().not().or(b())));
488
489        assert_parse_err(r"\");
490        assert_parse_err(r"(");
491        assert_parse_err(r")");
492    }
493
494    /// Check whether a regex matches a set of strings.
495    fn assert_matches(pattern: &str, matches: &[&str], non_matches: &[&str]) {
496        let regex = parse(pattern);
497
498        for text in matches {
499            assert!(regex.matches(text), "'{pattern}' should match '{text}'");
500        }
501
502        for text in non_matches {
503            assert!(!regex.matches(text), "'{pattern}' should not match '{text}'");
504        }
505    }
506
507    #[test]
508    fn matching() {
509        assert_matches(
510            r"",
511            &[""],
512            &["a", "b", "ab"],
513        );
514
515        assert_matches(
516            r".",
517            &["a", "b"],
518            &["", "ab", "abc"],
519        );
520
521        assert_matches(
522            r"a",
523            &["a"],
524            &["", "b", "ab"],
525        );
526
527        assert_matches(
528            r"()*",
529            &[""],
530            &["a", "b", "ab"],
531        );
532
533        assert_matches(
534            r".*",
535            &["", "a", "b", "ab", "abc", "abcd"],
536            &[],
537        );
538
539        assert_matches(
540            r"a*",
541            &["", "a", "aa", "aaa"],
542            &["b", "ab", "ba", "aab", "aba", "baa"],
543        );
544
545        assert_matches(
546            r"a**",
547            &["", "a", "aa", "aaa"],
548            &["b", "ab", "ba", "aab", "aba", "baa"],
549        );
550
551        assert_matches(
552            r"a.c",
553            &["aac", "abc", "acc", "adc"],
554            &["", "a", "c", "ac", "aab", "bbc"],
555        );
556
557        assert_matches(
558            r"ab",
559            &["ab"],
560            &["", "a", "b", "ba", "abc"],
561        );
562
563        assert_matches(
564            r"a*b",
565            &["b", "ab", "aab"],
566            &["", "a", "ac", "abc"],
567        );
568
569        assert_matches(
570            r"ab*",
571            &["a", "ab", "abb"],
572            &["", "b", "ac", "abc"],
573        );
574
575        assert_matches(
576            r"a|b",
577            &["a", "b"],
578            &["", "aa", "ab", "bb"],
579        );
580
581        assert_matches(
582            r"(ab*|c)*",
583            &["", "a", "ab", "c", "abc", "abbacca"],
584            &["b", "acb"],
585        );
586
587        assert_matches(
588            r"a+",
589            &["a", "aa", "aaa"],
590            &["", "ab", "ba"],
591        );
592
593        assert_matches(
594            r"a?",
595            &["", "a"],
596            &["b", "aa"],
597        );
598
599        assert_matches(
600            r"!a",
601            &["", "b", "aa", "abc"],
602            &["a"],
603        );
604
605        assert_matches(
606            r"!.*",
607            &[],
608            &["", "a", "ab", "abc"],
609        );
610
611        assert_matches(
612            r"!(!(.*hello.*)|!(.*world.*))",
613            &["hello world", "world hello"],
614            &["", "hello", "world"],
615        );
616
617        assert_matches(
618            r"a&a",
619            &["a"],
620            &["", "b", "ab"],
621        );
622
623        assert_matches(
624            r"a&b",
625            &[],
626            &["", "a", "b", "ab"],
627        );
628
629        assert_matches(
630            r"(.*hello.*)&(.*world.*)",
631            &["hello world", "world hello"],
632            &["", "hello", "world"],
633        );
634    }
635}