ad_editor/regex/
mod.rs

1//! A simple regex engine for operating on character streams and supporting
2//! the Sam text editor's structural regular expressions.
3//!
4//! Thompson's original paper on writing a regex engine can be found here:
5//!   <https://dl.acm.org/doi/pdf/10.1145/363347.363387>
6use std::{fmt, iter::Peekable, str::Chars};
7
8mod ast;
9mod compile;
10mod haystack;
11mod matches;
12mod stream;
13mod vm;
14
15pub use haystack::Haystack;
16pub use matches::Match;
17pub use stream::{CachingStream, CachingStreamIter};
18pub use vm::{Regex, RevRegex};
19
20/// Errors that can be returned by the regex engine
21#[derive(Debug, Clone, PartialEq, Eq)]
22pub enum Error {
23    /// Empty parens
24    EmptyParens,
25    /// Empty string used when creating a [Regex]
26    EmptyRegex,
27    /// Invalid regex class
28    InvalidClass,
29    /// Invalid escape sequence
30    InvalidEscape(char),
31    /// Invalid repetition pattern
32    InvalidRepetition,
33    /// The provided regex is too long
34    ReTooLong,
35    /// Alternation without a right hand side
36    UnbalancedAlt,
37    /// Unbalanced parens
38    UnbalancedParens,
39    /// Group name without a closing '<'
40    UnclosedGroupName(String),
41    /// Invalid group qualifier following (?...)
42    UnknownGroupQualifier(char),
43}
44
45impl std::error::Error for Error {}
46
47impl fmt::Display for Error {
48    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
49        match self {
50            Self::EmptyParens => write!(f, "empty parens"),
51            Self::EmptyRegex => write!(f, "empty regular expression"),
52            Self::InvalidClass => write!(f, "invalid class"),
53            Self::InvalidEscape(c) => write!(f, "invalid escaped character {c:?}"),
54            Self::InvalidRepetition => write!(f, "invalid repetition"),
55            Self::ReTooLong => write!(f, "regex too long"),
56            Self::UnbalancedAlt => write!(f, "alternation had no right hand side"),
57            Self::UnbalancedParens => write!(f, "unbalanced parens"),
58            Self::UnclosedGroupName(s) => write!(f, "unclosed group name {s:?}"),
59            Self::UnknownGroupQualifier(c) => write!(f, "unknown group qualifier {c:?}"),
60        }
61    }
62}
63
64/// Helper for converting characters to 0 based inicies for looking things up in caches.
65const fn char_ix(ch: char) -> usize {
66    ((ch as u16) & 0xFF) as usize
67}
68
69const fn init_escapes() -> [Option<char>; 256] {
70    macro_rules! escape {
71        ($escapes:expr, $($ch:expr),+) => {
72            $($escapes[char_ix($ch)] = Some($ch);)+
73        };
74        ($escapes:expr, $($ch:expr => $esc:expr),+) => {
75            $($escapes[char_ix($ch)] = Some($esc);)+
76        };
77    }
78
79    let mut escapes = [None; 256];
80    escape!(
81        escapes, '*', '+', '?', '.', '@', '(', ')', '[', ']', '{', '}', '|'
82    );
83    escape!(escapes, '\\', '\'', '"', '^', '$', '-');
84    escape!(escapes, 'b', 'B', 'd', 'D', 'w', 'W', 's', 'S');
85    escape!(escapes, 'n'=>'\n', 'r'=>'\r', 't'=>'\t');
86
87    escapes
88}
89
90/// Supported escape sequences
91const ESCAPES: [Option<char>; 256] = init_escapes();
92
93#[derive(Debug, Clone, PartialEq, Eq)]
94struct CharClass {
95    pub(crate) negated: bool,
96    pub(crate) chars: Vec<char>,
97    pub(crate) ranges: Vec<(char, char)>,
98}
99
100impl CharClass {
101    fn try_parse(it: &mut Peekable<Chars<'_>>) -> Result<Self, Error> {
102        let mut next = || next_char(it)?.ok_or(Error::InvalidClass);
103        let (mut ch, _) = next()?;
104
105        let negated = ch == '^';
106        if negated {
107            (ch, _) = next()?
108        };
109        let mut chars = vec![ch];
110        let mut ranges = vec![];
111
112        loop {
113            let (ch, escaped) = next()?;
114            match ch {
115                ']' if !escaped => break,
116
117                '-' if !escaped => {
118                    let start = chars.pop().ok_or(Error::InvalidClass)?;
119                    let (end, _) = next()?;
120                    if start as u32 >= end as u32 {
121                        return Err(Error::InvalidClass);
122                    }
123
124                    ranges.push((start, end));
125                }
126
127                ch => chars.push(ch),
128            }
129        }
130
131        Ok(Self {
132            negated,
133            chars,
134            ranges,
135        })
136    }
137
138    // Negated classes still don't match a newline
139    #[inline]
140    fn matches(&self, ch: char) -> bool {
141        if self.negated && ch == '\n' {
142            return false;
143        }
144
145        let res = self.chars.contains(&ch)
146            || self
147                .ranges
148                .iter()
149                .any(|&(start, end)| ch >= start && ch <= end);
150
151        if self.negated { !res } else { res }
152    }
153
154    fn size(&self) -> usize {
155        self.chars.len()
156            + self
157                .ranges
158                .iter()
159                .map(|(s, e)| (*e as u32 - *s as u32 + 1) as usize)
160                .sum::<usize>()
161    }
162}
163
164fn next_char(it: &mut Peekable<Chars<'_>>) -> Result<Option<(char, bool)>, Error> {
165    match it.next() {
166        Some('\\') => (),
167        Some(ch) => return Ok(Some((ch, false))),
168        None => return Ok(None),
169    }
170
171    let ch = match it.next() {
172        Some(ch) => ch,
173        None => return Err(Error::InvalidEscape('\0')),
174    };
175
176    match ESCAPES[char_ix(ch)] {
177        Some(ch) => Ok(Some((ch, true))),
178        None => Err(Error::InvalidEscape(ch)),
179    }
180}
181
182mod impl_structex {
183    use super::*;
184    use crate::buffer::{Buffer, GapBuffer, Slice};
185    use std::{io, ops::Range};
186    use structex::re::{Haystack, RawCaptures, RegexEngine, Sliceable, Writable};
187
188    impl RegexEngine for Regex {
189        type CompileError = Error;
190
191        fn compile(re: &str) -> Result<Self, Self::CompileError> {
192            Regex::compile(re)
193        }
194    }
195
196    impl Haystack<Regex> for str {
197        fn is_match_between(&self, re: &Regex, from: usize, to: usize) -> bool {
198            re.matches_between(&self, from, to)
199        }
200
201        fn captures_between(&self, re: &Regex, from: usize, to: usize) -> Option<RawCaptures> {
202            let m = re.find_between(&self, from, to)?;
203
204            Some(RawCaptures::new(m.iter_locs()))
205        }
206    }
207
208    impl Haystack<Regex> for GapBuffer {
209        fn is_match_between(&self, re: &Regex, from: usize, to: usize) -> bool {
210            re.matches_between(self, from, to)
211        }
212
213        fn captures_between(&self, re: &Regex, from: usize, to: usize) -> Option<RawCaptures> {
214            let m = re.find_between(self, from, to)?;
215
216            Some(RawCaptures::new(m.iter_locs()))
217        }
218    }
219
220    impl Sliceable for GapBuffer {
221        type Slice<'h>
222            = Slice<'h>
223        where
224            Self: 'h;
225
226        fn char_at(&self, byte_offset: usize) -> Option<char> {
227            self.get_char_at(byte_offset)
228        }
229
230        fn slice(&self, range: Range<usize>) -> Self::Slice<'_> {
231            self.slice_from_byte_offsets(range.start, range.end)
232        }
233
234        fn max_len(&self) -> usize {
235            self.len()
236        }
237    }
238
239    impl Writable for GapBuffer {
240        fn write_to<W>(&self, w: &mut W) -> io::Result<usize>
241        where
242            W: io::Write,
243        {
244            let (l, r) = self.as_byte_slices();
245            w.write_all(l)?;
246            w.write_all(r)?;
247
248            Ok(self.len())
249        }
250    }
251
252    impl<'a> Writable for Slice<'a> {
253        fn write_to<W>(&self, w: &mut W) -> io::Result<usize>
254        where
255            W: std::io::Write,
256        {
257            let (l, r) = self.as_slices();
258            w.write_all(l)?;
259            w.write_all(r)?;
260
261            Ok(l.len() + r.len())
262        }
263    }
264
265    impl Haystack<Regex> for Buffer {
266        fn is_match_between(&self, re: &Regex, from: usize, to: usize) -> bool {
267            re.matches_between(self, from, to)
268        }
269
270        fn captures_between(&self, re: &Regex, from: usize, to: usize) -> Option<RawCaptures> {
271            let m = re.find_between(self, from, to)?;
272
273            Some(RawCaptures::new(m.iter_locs()))
274        }
275    }
276
277    impl Sliceable for Buffer {
278        type Slice<'h>
279            = Slice<'h>
280        where
281            Self: 'h;
282
283        fn char_at(&self, byte_offset: usize) -> Option<char> {
284            self.txt.get_char_at(byte_offset)
285        }
286
287        fn slice(&self, range: Range<usize>) -> Self::Slice<'_> {
288            self.txt.slice_from_byte_offsets(range.start, range.end)
289        }
290
291        fn max_len(&self) -> usize {
292            self.txt.len()
293        }
294    }
295
296    impl Writable for Buffer {
297        fn write_to<W>(&self, w: &mut W) -> io::Result<usize>
298        where
299            W: io::Write,
300        {
301            let (l, r) = self.txt.as_byte_slices();
302            w.write_all(l)?;
303            w.write_all(r)?;
304
305            Ok(self.txt.len())
306        }
307    }
308}
309
310#[cfg(test)]
311mod tests {
312    use super::*;
313    use simple_test_case::test_case;
314
315    #[test_case("_", &['_'], &[]; "single underscore")]
316    #[test_case("abc_", &['a', 'b', 'c', '_'], &[]; "chars")]
317    #[test_case("a-z", &[], &[('a', 'z')]; "single range")]
318    #[test_case("a-zA-Z", &[], &[('a', 'z'), ('A', 'Z')]; "multi range")]
319    #[test_case("a-z_./]", &['_', '.', '/'], &[('a', 'z')]; "compound")]
320    #[test_case("a-zA-Z_\\-.]", &['_', '-', '.'], &[('a', 'z'), ('A', 'Z')]; "compound escaped dash")]
321    #[test]
322    fn parsing_classes_works(raw: &str, chars: &[char], ranges: &[(char, char)]) {
323        // The outer regex parser consumes the initial '[' before passing through so test cases
324        // look a little lopsided due to missing this.
325        for (s, negated) in [(format!("{raw}]"), false), (format!("^{raw}]"), true)] {
326            let cls = CharClass::try_parse(&mut s.chars().peekable()).unwrap();
327            let expected = CharClass {
328                negated,
329                chars: chars.to_vec(),
330                ranges: ranges.to_vec(),
331            };
332
333            assert_eq!(cls, expected, "negated={negated}");
334        }
335    }
336
337    // Each of the inputs here is missing the opening '[' as that is consumed prior to calling
338    // `CharClass::try_parse`
339    #[test_case("z-a]"; "backwards alpha")]
340    #[test_case("Z-A]"; "backwards upper alpha")]
341    #[test_case("9-0]"; "backwards numeric")]
342    #[test_case("a-a]"; "equal alpha endpoints")]
343    #[test_case("5-5]"; "equal numeric endpoints")]
344    #[test_case("a-z9-0]"; "valid then invalid range")]
345    #[test_case("\u{00FF}-\u{0000}]"; "backwards unicode")]
346    #[test]
347    fn invalid_character_ranges_error(s: &str) {
348        assert!(CharClass::try_parse(&mut s.chars().peekable()).is_err());
349    }
350}