ad_editor/regex/
mod.rs

1//! A simple regex engine for operating on character streams and supporting
2//! the Sam text editor's structural regular expressions.
3//!
4//! Thompson's original paper on writing a regex engine can be found here:
5//!   https://dl.acm.org/doi/pdf/10.1145/363347.363387
6use std::{iter::Peekable, str::Chars};
7
8mod ast;
9mod compile;
10mod matches;
11mod vm;
12
13pub use matches::{Match, MatchIter};
14pub use vm::Regex;
15
16/// Errors that can be returned by the regex engine
17#[derive(Debug, Clone, PartialEq, Eq)]
18pub enum Error {
19    /// Empty parens
20    EmptyParens,
21    /// Empty string used when creating a [Regex]
22    EmptyRegex,
23    /// Invalid regex class
24    InvalidClass,
25    /// Invalid escape sequence
26    InvalidEscape(char),
27    /// Invalid repetition pattern
28    InvalidRepetition,
29    /// The provided regex is too long
30    ReTooLong,
31    /// Too many parens in the provided regex
32    TooManyParens,
33    /// Alternation without a right hand side
34    UnbalancedAlt,
35    /// Unbalanced parens
36    UnbalancedParens,
37    /// Group name without a closing paren
38    UnclosedGroupName(String),
39    /// Invalid group qualifier following (?...)
40    UnknownGroupQualifier(char),
41}
42
43/// Helper for converting characters to 0 based inicies for looking things up in caches.
44const fn char_ix(ch: char) -> usize {
45    ((ch as u16) & 0xFF) as usize
46}
47
48const fn init_escapes() -> [Option<char>; 256] {
49    macro_rules! escape {
50        ($escapes:expr, $($ch:expr),+) => {
51            $($escapes[char_ix($ch)] = Some($ch);)+
52        };
53        ($escapes:expr, $($ch:expr => $esc:expr),+) => {
54            $($escapes[char_ix($ch)] = Some($esc);)+
55        };
56    }
57
58    let mut escapes = [None; 256];
59    escape!(escapes, '*', '+', '?', '.', '@', '(', ')', '[', ']', '{', '}', '|');
60    escape!(escapes, '\\', '\'', '"', '^', '$', '-');
61    escape!(escapes, 'b', 'B', 'd', 'D', 'w', 'W', 's', 'S');
62    escape!(escapes, 'n'=>'\n', 'r'=>'\r', 't'=>'\t');
63
64    escapes
65}
66
67/// Supported escape sequences
68const ESCAPES: [Option<char>; 256] = init_escapes();
69
70#[derive(Debug, Clone, PartialEq, Eq)]
71struct CharClass {
72    negated: bool,
73    chars: Vec<char>,
74    ranges: Vec<(char, char)>,
75}
76
77impl CharClass {
78    fn try_parse(it: &mut Peekable<Chars<'_>>) -> Result<Self, Error> {
79        let mut next = || next_char(it)?.ok_or(Error::InvalidClass);
80        let (mut ch, _) = next()?;
81
82        let negated = ch == '^';
83        if negated {
84            (ch, _) = next()?
85        };
86        let mut chars = vec![ch];
87        let mut ranges = vec![];
88
89        loop {
90            let (ch, escaped) = next()?;
91            match ch {
92                ']' if !escaped => break,
93
94                '-' if !escaped => {
95                    let start = chars.pop().ok_or(Error::InvalidClass)?;
96                    let (end, _) = next()?;
97                    ranges.push((start, end));
98                }
99
100                ch => chars.push(ch),
101            }
102        }
103
104        Ok(Self {
105            negated,
106            chars,
107            ranges,
108        })
109    }
110
111    // Negated classes still don't match a newline
112    #[inline]
113    fn matches(&self, ch: char) -> bool {
114        if self.negated && ch == '\n' {
115            return false;
116        }
117
118        let res = self.chars.contains(&ch)
119            || self
120                .ranges
121                .iter()
122                .any(|&(start, end)| ch >= start && ch <= end);
123
124        if self.negated {
125            !res
126        } else {
127            res
128        }
129    }
130}
131
132fn next_char(it: &mut Peekable<Chars<'_>>) -> Result<Option<(char, bool)>, Error> {
133    match it.next() {
134        Some('\\') => (),
135        Some(ch) => return Ok(Some((ch, false))),
136        None => return Ok(None),
137    }
138
139    let ch = match it.next() {
140        Some(ch) => ch,
141        None => return Err(Error::InvalidEscape('\0')),
142    };
143
144    match ESCAPES[char_ix(ch)] {
145        Some(ch) => Ok(Some((ch, true))),
146        None => Err(Error::InvalidEscape(ch)),
147    }
148}
149
150#[cfg(test)]
151mod tests {
152    use super::*;
153    use simple_test_case::test_case;
154
155    #[test_case("_", &['_'], &[]; "single underscore")]
156    #[test_case("abc_", &['a', 'b', 'c', '_'], &[]; "chars")]
157    #[test_case("a-z", &[], &[('a', 'z')]; "single range")]
158    #[test_case("a-zA-Z", &[], &[('a', 'z'), ('A', 'Z')]; "multi range")]
159    #[test_case("a-z_./]", &['_', '.', '/'], &[('a', 'z')]; "compound")]
160    #[test_case("a-zA-Z_\\-.]", &['_', '-', '.'], &[('a', 'z'), ('A', 'Z')]; "compound escaped dash")]
161    #[test]
162    fn parsing_classes_works(raw: &str, chars: &[char], ranges: &[(char, char)]) {
163        // The outer regex parser consumes the initial '[' before passing through so test cases
164        // look a little lopsided due to missing this.
165        for (s, negated) in [(format!("{raw}]"), false), (format!("^{raw}]"), true)] {
166            let cls = CharClass::try_parse(&mut s.chars().peekable()).unwrap();
167            let expected = CharClass {
168                negated,
169                chars: chars.to_vec(),
170                ranges: ranges.to_vec(),
171            };
172
173            assert_eq!(cls, expected, "negated={negated}");
174        }
175    }
176}