ad_editor/regex/
vm.rs

1//! Virtual machine based implementation based on the instruction set described
2//! in Russ Cox's second article in the series and the source of plan9 Sam:
3//!   https://swtch.com/~rsc/regexp/regexp2.html
4//!   https://github.com/sminez/plan9port/blob/master/src/cmd/sam/regexp.c
5//!
6//! The compilation step used is custom (rather than using a YACC parser).
7//!
8//! We make use of pre-allocated buffers for the Thread lists and track the
9//! index we are up to per-iteration as this results in roughly a 100x speed
10//! up from not having to allocate and free inside of the main loop.
11use super::{
12    ast::{parse, Assertion},
13    compile::{compile_ast, optimise, CompiledOps, Inst, Op, Prog},
14    matches::{Match, MatchIter},
15    Error,
16};
17use crate::buffer::{Buffer, GapBuffer};
18use std::{mem::swap, rc::Rc};
19
20pub(super) const N_SLOTS: usize = 30;
21
22/// A regular expression engine designed for use within the ad text editor.
23///
24/// This is a relatively naive implementation though it does have some
25/// optimisations and runs reasonably quickly. It is not at all designed to
26/// be robust against mallicious input and it does not attempt to support
27/// full PCRE syntax or functionality.
28#[derive(Clone, PartialEq, Eq)]
29pub struct Regex {
30    /// The compiled instructions for running the VM
31    prog: Prog,
32    /// Names to be used for extracting named submatches
33    submatch_names: Rc<[String]>,
34    /// Pre-allocated Thread list in priority order to handle leftmost-longest semantics
35    clist: Box<[Thread]>,
36    /// Pre-allocated Thread list in priority order to handle leftmost-longest semantics
37    nlist: Box<[Thread]>,
38    /// Pre-allocated SubMatch positions referenced by threads
39    sms: Box<[SubMatches]>,
40    /// Available indicies into self.sms for storing SubMatch positions for new threads
41    free_sms: Vec<usize>,
42    track_submatches: bool,
43    /// Monotonically increasing index used to dedup Threads
44    /// Will overflow at some point if a given regex is used a VERY large number of times
45    gen: usize,
46    /// Index into the current Thread list
47    p: usize,
48    /// Previous character from the input
49    prev: Option<char>,
50    /// Next character in the input after the one currently being processed
51    next: Option<char>,
52}
53
54impl std::fmt::Debug for Regex {
55    fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
56        fmt.debug_tuple("Regex").field(&self.prog).finish()
57    }
58}
59
60impl Regex {
61    /// Attempt to compile the given regular expression into its optimised VM opcode form.
62    ///
63    /// This method handles pre-allocation of the memory required for running the VM so
64    /// that the allocation cost is paid once up front rather than on each use of the Regex.
65    pub fn compile(re: &str) -> Result<Self, Error> {
66        let mut ast = parse(re)?;
67        ast.optimise();
68        let CompiledOps {
69            ops,
70            submatch_names,
71        } = compile_ast(ast, false);
72
73        Ok(Self::new(ops, submatch_names))
74    }
75
76    /// Attempt to compile the given regular expression into its reversed optimised VM opcode form.
77    /// This is used for searching backwards through an input stream.
78    ///
79    /// This method handles pre-allocation of the memory required for running the VM so
80    /// that the allocation cost is paid once up front rather than on each use of the Regex.
81    pub fn compile_reverse(re: &str) -> Result<Self, Error> {
82        let mut ast = parse(re)?;
83        ast.optimise();
84        let CompiledOps {
85            ops,
86            submatch_names,
87        } = compile_ast(ast, true);
88
89        Ok(Self::new(ops, submatch_names))
90    }
91
92    fn new(ops: Vec<Op>, submatch_names: Vec<String>) -> Self {
93        let prog: Prog = optimise(ops)
94            .into_iter()
95            .map(|op| Inst { op, gen: 0 })
96            .collect();
97
98        let clist = vec![Thread::default(); prog.len()].into_boxed_slice();
99        let nlist = vec![Thread::default(); prog.len()].into_boxed_slice();
100        let sms = vec![SubMatches::default(); prog.len()].into_boxed_slice();
101        let free_sms = (1..prog.len()).collect();
102
103        Self {
104            prog,
105            submatch_names: Rc::from(submatch_names.into_boxed_slice()),
106            clist,
107            nlist,
108            gen: 0,
109            p: 0,
110            prev: None,
111            next: None,
112
113            sms,
114            free_sms,
115            track_submatches: true,
116        }
117    }
118
119    /// Attempt to match this Regex against a given `&str` input, returning the position
120    /// of the match and all submatches if successful.
121    pub fn match_str(&mut self, input: &str) -> Option<Match> {
122        self.track_submatches = true;
123        self.match_iter(&mut input.chars().enumerate(), 0)
124    }
125
126    /// Iterate over all non-overlapping matches of this Regex for a given `&str` input.
127    pub fn match_str_all<'a, 'b>(&'a mut self, input: &'b str) -> MatchIter<'a, &'b str> {
128        self.track_submatches = true;
129        MatchIter {
130            it: input,
131            r: self,
132            from: 0,
133        }
134    }
135
136    /// Iterate over all non-overlapping matches of this Regex for a given `Buffer` input.
137    pub fn match_buffer_all<'a, 'b>(&'a mut self, b: &'b Buffer) -> MatchIter<'a, &'b GapBuffer> {
138        self.track_submatches = true;
139        MatchIter {
140            it: &b.txt,
141            r: self,
142            from: 0,
143        }
144    }
145
146    /// Attempt to match this Regex against an arbitrary iterator input, returning the
147    /// position of the match and all submatches if successful.
148    pub fn match_iter<I>(&mut self, input: &mut I, sp: usize) -> Option<Match>
149    where
150        I: Iterator<Item = (usize, char)>,
151    {
152        self.track_submatches = true;
153        self._match_iter(input, sp)
154    }
155
156    /// Determine whether or not this Regex matches the input `&str` without searching for
157    /// the leftmost-longest match and associated submatch boundaries.
158    pub fn matches_str(&mut self, input: &str) -> bool {
159        self.track_submatches = false;
160        self.matches_iter(&mut input.chars().enumerate(), 0)
161    }
162
163    /// Determine whether or not this Regex matches the input iterator without searching
164    /// for the leftmost-longest match and associated submatch boundaries.
165    pub fn matches_iter<I>(&mut self, input: &mut I, sp: usize) -> bool
166    where
167        I: Iterator<Item = (usize, char)>,
168    {
169        self.track_submatches = false;
170        self._match_iter(input, sp).is_some()
171    }
172
173    /// This is the main VM implementation that is used by all other matching methods on Regex.
174    ///
175    /// The `return_on_first_match` flag is used to early return a dummy Match as soon as we
176    /// can tell that the given regular expression matches the input (rather than looking for
177    /// the leftmost-longest match).
178    ///  - The Match returned in this case will always point to the null string at the start
179    ///    of the string and should only be used for conversion to a bool in `matches_*`
180    ///    methods.
181    fn _match_iter<I>(&mut self, input: &mut I, mut sp: usize) -> Option<Match>
182    where
183        I: Iterator<Item = (usize, char)>,
184    {
185        let mut sub_matches = [0; N_SLOTS];
186        self.free_sms = (1..self.prog.len()).collect();
187        self.sms[0] = SubMatches {
188            refs: 1,
189            inner: [0; N_SLOTS],
190        };
191
192        // We bump the generation to ensure we don't collide with anything from
193        // a previous run while initialising the VM.
194        self.gen += 1;
195        // When setting up the initial threads we have our prelude which uses "@" so we provide a
196        // null byte for the initial character as it is not needed and it avoids us having to make
197        // the "ch" param of add_thread optional.
198        self.add_thread(Thread::default(), sp, '\0', true);
199        swap(&mut self.clist, &mut self.nlist);
200        self.gen += 1;
201
202        // Same as at the end of the outer for-loop, we need to reset self.p to 0
203        // so that we are correctly tracking the length of the new nlist.
204        let mut n = self.p;
205        self.p = 0;
206        let mut matched = false;
207
208        let mut it = input.peekable();
209        self.prev = None;
210        self.next = None;
211
212        while let Some((i, ch)) = it.next() {
213            sp = i;
214            self.next = it.peek().map(|(_, c)| *c);
215
216            for i in 0..n {
217                if let Some(sm) = self.step_thread(i, sp, ch) {
218                    if !self.track_submatches {
219                        return Some(Match::synthetic(0, 0));
220                    }
221
222                    matched = true;
223                    sub_matches = self.sms[sm].inner;
224
225                    // We're ending this thread and all others that have lower priority
226                    // so decrement the references they have to their submatches
227                    for j in i..n {
228                        self.sm_dec_ref(self.clist[j].sm);
229                    }
230
231                    break;
232                }
233            }
234
235            swap(&mut self.clist, &mut self.nlist);
236            self.prev = Some(ch);
237            self.gen += 1;
238            n = self.p;
239
240            if self.p == 0 {
241                break;
242            }
243
244            self.p = 0;
245        }
246
247        self.prev = None;
248        self.next = None;
249
250        // Check to see if the final pass had a match which would be better than any
251        // that we have so far.
252        for t in self.clist.iter_mut().take(n) {
253            if self.prog[t.pc].op == Op::Match && self.sms[t.sm].inner[1] >= sub_matches[1] {
254                matched = true;
255                sub_matches = self.sms[t.sm].inner;
256                break;
257            }
258        }
259
260        if !matched {
261            return None;
262        }
263
264        Some(Match {
265            sub_matches,
266            submatch_names: self.submatch_names.clone(),
267        })
268    }
269
270    #[inline]
271    fn step_thread(&mut self, i: usize, sp: usize, ch: char) -> Option<usize> {
272        let t = &self.clist[i];
273        match &self.prog[t.pc].op {
274            // If comparisons and their assertions hold then queue the resulting threads
275            Op::Comp(comp) if comp.matches(ch) => match t.assertion {
276                Some(a) if !a.holds_for(self.prev, ch, self.next) => {
277                    self.sm_dec_ref(t.sm);
278                    return None;
279                }
280                _ => self.add_thread(thread(t.pc + 1, t.sm), sp, ch, false),
281            },
282
283            Op::Match => return Some(t.sm),
284
285            // Save, Jump & Split are handled in add_thread.
286            // Non-matching comparison ops result in that thread dying.
287            _ => self.sm_dec_ref(t.sm),
288        }
289
290        None
291    }
292
293    #[inline]
294    fn add_thread(&mut self, t: Thread, sp: usize, ch: char, initial: bool) {
295        if self.prog[t.pc].gen == self.gen {
296            self.sm_dec_ref(t.sm);
297            return; // already on the list we are currently building
298        }
299        self.prog[t.pc].gen = self.gen;
300
301        // We do this as chained if-let as we need to recursively call add_thread with data
302        // from self.prog but add_thread required &mut self, so matching would mean we had
303        // to Clone as Op::Class does not implement Copy.
304        // > This is faster than cloning the op and matching
305        if let Op::Jump(l1) = self.prog[t.pc].op {
306            let th = match t.assertion {
307                Some(a) => assert_thread(l1, t.sm, a),
308                None => thread(l1, t.sm),
309            };
310            self.add_thread(th, sp, ch, initial);
311        } else if let Op::Split(l1, l2) = self.prog[t.pc].op {
312            self.sms[t.sm].refs += 1;
313            let (t1, t2) = match t.assertion {
314                Some(a) => (assert_thread(l1, t.sm, a), assert_thread(l2, t.sm, a)),
315                None => (thread(l1, t.sm), thread(l2, t.sm)),
316            };
317            self.add_thread(t1, sp, ch, initial);
318            self.add_thread(t2, sp, ch, initial);
319        } else if let Op::Assertion(a) = self.prog[t.pc].op {
320            self.add_thread(assert_thread(t.pc + 1, t.sm, a), sp, ch, initial);
321        } else if let Op::Save(s) = self.prog[t.pc].op {
322            self.handle_save(t, s, sp, ch, initial, false)
323        } else if let Op::RSave(s) = self.prog[t.pc].op {
324            self.handle_save(t, s, sp, ch, initial, true)
325        } else {
326            self.nlist[self.p] = t;
327            self.p += 1;
328        }
329    }
330
331    #[inline]
332    fn handle_save(&mut self, t: Thread, s: usize, sp: usize, ch: char, initial: bool, rev: bool) {
333        if (!rev && s % 2 == 0) || (rev && s % 2 == 1) {
334            let sm = self.sm_update(t.sm, s, sp, initial, rev);
335            let th = match t.assertion {
336                Some(a) => assert_thread(t.pc + 1, sm, a),
337                None => thread(t.pc + 1, sm),
338            };
339            self.add_thread(th, sp, ch, initial);
340        } else {
341            match t.assertion {
342                Some(a) if !a.holds_for(self.prev, ch, self.next) => self.sm_dec_ref(t.sm),
343                _ => {
344                    let sm = self.sm_update(t.sm, s, sp, initial, rev);
345                    self.add_thread(thread(t.pc + 1, sm), sp, ch, initial);
346                }
347            }
348        }
349    }
350
351    #[inline]
352    fn sm_dec_ref(&mut self, i: usize) {
353        if !self.track_submatches {
354            return;
355        }
356
357        self.sms[i].refs -= 1;
358        if self.sms[i].refs == 0 {
359            self.free_sms.push(i);
360        }
361    }
362
363    #[inline]
364    fn sm_update(&mut self, i: usize, s: usize, sp: usize, initial: bool, reverse: bool) -> usize {
365        // We don't hard error on compiling a regex with more than out max submatches
366        // but we don't track anything past the last one
367        if !self.track_submatches || s >= N_SLOTS {
368            return i;
369        }
370
371        let i = if self.sms[i].refs == 1 {
372            i
373        } else {
374            self.sm_dec_ref(i);
375            let j = self.free_sms.swap_remove(0);
376            self.sms[j].inner = self.sms[i].inner;
377            self.sms[j].refs = 1;
378            j
379        };
380
381        // If we are saving our initial position then we are looking at the correct
382        // character, otherwise the Save op is being processed at the character
383        // before the one we need to save.
384        let mut val = if !initial { sp + 1 } else { sp };
385        if reverse && !initial {
386            val = val.saturating_sub(1);
387        }
388
389        self.sms[i].inner[s] = val;
390
391        i
392    }
393}
394
395#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
396struct SubMatches {
397    /// How many threads are currently pointing at this SubMatches
398    refs: usize,
399    /// $0 -> $n submatches with $0 being the full match
400    /// for submatch $k the start index is 2$k and the end is 2$k+1
401    inner: [usize; N_SLOTS],
402}
403
404#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
405struct Thread {
406    /// VM program counter for this thread
407    pc: usize,
408    /// An assertion that must hold for this instruction to be runnable
409    assertion: Option<Assertion>,
410    /// Index into the Regex sms field
411    sm: usize,
412}
413
414#[inline]
415fn thread(pc: usize, sm: usize) -> Thread {
416    Thread {
417        pc,
418        sm,
419        assertion: None,
420    }
421}
422
423#[inline]
424fn assert_thread(pc: usize, sm: usize, a: Assertion) -> Thread {
425    Thread {
426        pc,
427        sm,
428        assertion: Some(a),
429    }
430}
431
432#[cfg(test)]
433mod tests {
434    use super::*;
435    use simple_test_case::test_case;
436
437    #[test_case("foo", "foo", Some("foo"); "literal full string")]
438    #[test_case("ba*", "baaaaa", Some("baaaaa"); "zero or more present")]
439    #[test_case("ba*", "b", Some("b"); "zero or more not present")]
440    #[test_case("ba+", "baaaaa", Some("baaaaa"); "one or more present")]
441    #[test_case("ba+", "b", None; "one or more not present")]
442    #[test_case("b?a", "ba", Some("ba"); "optional present")]
443    #[test_case("b?a", "a", Some("a"); "optional not present")]
444    #[test_case("a(bb)+a", "abbbba", Some("abbbba"); "article example matching")]
445    #[test_case("a(bb)+a", "abbba", None; "article example non matching")]
446    #[test_case(".*b", "123b", Some("123b"); "dot star prefix")]
447    #[test_case("1.*", "123b", Some("123b"); "dot star suffix")]
448    #[test_case("1.*b", "123b", Some("123b"); "dot star inner")]
449    #[test_case("(c|C)ase matters", "case matters", Some("case matters"); "alternation first")]
450    #[test_case("(c|C)ase matters", "Case matters", Some("Case matters"); "alternation second")]
451    #[test_case("(aa|bbb|c|dd)", "c", Some("c"); "chained alternation")]
452    #[test_case("this@*works", "this contains\nbut still works", Some("this contains\nbut still works"); "true any")]
453    #[test_case(r"literal\?", "literal?", Some("literal?"); "escape special char")]
454    #[test_case(r"literal\t", "literal\t", Some("literal\t"); "escape sequence")]
455    #[test_case("[abc] happy cow", "a happy cow", Some("a happy cow"); "character class")]
456    #[test_case("[^abc] happy cow", "a happy cow", None; "negated character class")]
457    #[test_case("[a-zA-Z]*", "camelCaseFtw", Some("camelCaseFtw"); "char class ranges matching")]
458    #[test_case("[a-zA-Z]*1", "kebab-case-not-so-much", None; "char class ranges non matching")]
459    #[test_case("[a-zA-Z ]*", "this should work", Some("this should work"); "char class mixed")]
460    #[test_case("[\\]5]*", "5]]5555]]", Some("5]]5555]]"); "char class escaped bracket")]
461    #[test_case("[0-9]+", "0123", Some("0123"); "digit range")]
462    #[test_case("[0-9]+", "0", Some("0"); "digit range range start only")]
463    #[test_case("25[0-5]", "255", Some("255"); "ipv4 element one")]
464    #[test_case("2[0-4][0-9]", "231", Some("231"); "ipv4 element two")]
465    #[test_case("1?[0-9]?[0-9]", "155", Some("155"); "ipv4 element three three digit")]
466    #[test_case("1?[0-9]?[0-9]", "72", Some("72"); "ipv4 element three two digit")]
467    #[test_case("1?[0-9]?[0-9]", "8", Some("8"); "ipv4 element three one digit")]
468    #[test_case("1?[0-9]?[0-9]", "0", Some("0"); "ipv4 element three zero")]
469    #[test_case("(25[0-5]|2[0-4][0-9])", "255", Some("255"); "ipv4 elements one and two matching one")]
470    #[test_case("(25[0-5]|2[0-4][0-9])", "219", Some("219"); "ipv4 elements one and two matching two")]
471    #[test_case("(25[0-5]|2[0-4][0-9])", "42", None; "ipv4 elements one and two not matching")]
472    #[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 elements two and three matching two")]
473    #[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 elements two and three matching three")]
474    #[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "251", Some("251"); "ipv4 all elements matching one")]
475    #[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 all elements matching two")]
476    #[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 all elements matching three")]
477    #[test_case(
478        r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
479        "127.0.0.1 ",
480        Some("127.0.0.1");
481        "ipv4 full"
482    )]
483    #[test_case("^foo", "foo at the start", Some("foo"); "SOL holding")]
484    #[test_case("^foo", "bar\nfoo at the start", Some("foo"); "SOL holding after newline")]
485    #[test_case("^foo", "we have foo but not at the start", None; "SOL not holding")]
486    #[test_case("foo$", "a line that ends with foo", Some("foo"); "BOL holding")]
487    #[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
488    #[test_case("foo$", "a line with foo in the middle", None; "BOL not holding")]
489    #[test_case("a{3}", "aaa", Some("aaa"); "counted repetition")]
490    #[test_case("a{3}", "aa", None; "counted repetition non matching")]
491    #[test_case("a{3,}", "aaaaaa", Some("aaaaaa"); "counted repetition at least")]
492    #[test_case("a{3,}", "aa", None; "counted repetition at least non matching")]
493    #[test_case("a{3,5}", "aaa", Some("aaa"); "counted repetition between lower")]
494    #[test_case("a{3,5}", "aaaaa", Some("aaaaa"); "counted repetition between upper")]
495    #[test_case("a{3,5}", "aaaa", Some("aaaa"); "counted repetition in range")]
496    #[test_case("a{3,5}", "aa", None; "counted repetition less")]
497    #[test_case("^a{3,5}$", "aaaaaa", None; "counted repetition more")]
498    #[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
499    #[test_case("\\bfor\\b", "forward", None; "word boundary for match at start of word")]
500    #[test_case("\\bfor\\b", "for ward", Some("for"); "word boundary for match not inside word")]
501    #[test_case("\\bfor\\b", "bob for", Some("for"); "word boundary match not at BOF")]
502    #[test_case("\\bfor\\b", "bob for bob", Some("for"); "word boundary match not at BOF or EOF")]
503    #[test_case("\\bin\\b", "min", None; "word boundary for match at end of word")]
504    #[test_case("\\b(in)\\b", "min", None; "word boundary for sub expression match at end of word")]
505    #[test_case("\\b(in|for)\\b", "min", None; "word boundary for alt match at end of word")]
506    #[test_case("\\b(in|for)\\b", "bob for", Some("for"); "word boundary for alt match not at BOF")]
507    #[test_case("[a-zA-Z0-9_\\-./@]+\\.jpe?g", "glenda_space_medium.jpg", Some("glenda_space_medium.jpg"); "complex group")]
508    #[test_case("[a-zA-Z¡-0-9_\\-./@]+", "foo-bar_99.pdf", Some("foo-bar_99.pdf"); "multibyte group")]
509    #[test]
510    fn match_works(re: &str, s: &str, expected: Option<&str>) {
511        let mut r = Regex::compile(re).unwrap();
512        let m = r.match_str(s).map(|m| m.str_match_text(s));
513        assert_eq!(m.as_deref(), expected);
514    }
515
516    #[test_case("foo", "foo", Some("foo"); "literal full string")]
517    #[test_case("ba*", " baaaaa foo", Some("baaaaa"); "zero or more present")]
518    #[test_case("ba*", "b foo", Some("b"); "zero or more not present")]
519    #[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
520    #[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
521    #[test_case(
522        r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
523        "127.0.0.1 ",
524        Some("127.0.0.1");
525        "ipv4 full"
526    )]
527    #[test_case(
528        "his",
529        "this is a line\nand another\n- [ ] something to do\n",
530        Some("his");
531        "multiline intput"
532    )]
533    #[test]
534    fn rev_match_works(re: &str, s: &str, expected: Option<&str>) {
535        use crate::exec::IterBoundedChars;
536
537        let mut r = Regex::compile_reverse(re).unwrap();
538        let b = Buffer::new_unnamed(0, s);
539        let mut it = b.rev_iter_between(s.len(), 0);
540        let m = r
541            .match_iter(&mut it, s.len())
542            .map(|m| m.str_match_text(&b.txt.to_string()).to_string());
543
544        assert_eq!(m.as_deref(), expected);
545    }
546
547    #[test_case("[0-9]+", " 42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers")]
548    #[test_case("[0-9]+", " 42 3 127 9991", &["42", "3", "127", "9991"]; "integers to EOF")]
549    #[test_case("[0-9]+", "42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers from BOF")]
550    #[test_case("[0-9]+", "42 3 127 9991", &["42", "3", "127", "9991"]; "integers full input")]
551    #[test_case("foo|bar|baz", "baz bar foo bar", &["baz", "bar", "foo", "bar"]; "alts spaced in s")]
552    #[test_case("foo|bar|baz", "bazbarfoobar", &["baz", "bar", "foo", "bar"]; "alts back to back in s")]
553    #[test_case("(foo|bar|baz)", "foo foobar barfoo baz", &["foo", "foo", "bar", "bar", "foo", "baz"]; "alts in parens")]
554    #[test_case("\\b(foo|bar|baz)\\b", "foo foobar barfoo baz", &["foo", "baz"]; "alts with word boundaries")]
555    #[test]
556    fn match_all_works(re: &str, s: &str, expected: &[&str]) {
557        let mut r = Regex::compile(re).unwrap();
558        let matches: Vec<String> = r.match_str_all(s).map(|m| m.str_match_text(s)).collect();
559
560        assert_eq!(&matches, expected);
561    }
562
563    #[test]
564    fn dot_star_works() {
565        let mut r = Regex::compile(".*").unwrap();
566        let s = "\nthis is\na multiline\nfile";
567        let m1 = r.match_str(s).unwrap();
568        assert_eq!(m1.str_match_text(s), "");
569
570        // Skipping the leading newline should cause us to match all of the following line
571        let m2 = r.match_str(&s[1..]).unwrap();
572        assert_eq!(m2.str_match_text(&s[1..]), "this is");
573    }
574
575    #[test]
576    fn match_extraction_works() {
577        let re = "([0-9]+)-([0-9]+)-([0-9]+)";
578        let mut r = Regex::compile(re).unwrap();
579        let s = "this should work 123-456-789 other stuff";
580        let m = r.match_str(s).unwrap();
581
582        assert_eq!(m.str_submatch_text(1, s).as_deref(), Some("123"));
583        assert_eq!(m.str_submatch_text(2, s).as_deref(), Some("456"));
584        assert_eq!(m.str_submatch_text(3, s).as_deref(), Some("789"));
585        assert_eq!(m.str_match_text(s), "123-456-789");
586    }
587
588    #[test_case("(?<xy>X|Y)", "xy", "X"; "named match on its own")]
589    #[test_case("(?<xy>X|Y)(a|b)", "xy", "X"; "named match before unnamed")]
590    #[test_case("(e| )(?<xy>X|Y)", "xy", "X"; "named match after unnamed")]
591    #[test_case("(e| )(?<xy>X|Y)(a|b)", "xy", "X"; "named match inbetween unnamed")]
592    #[test]
593    fn named_submatch_works(re: &str, name: &str, expected: &str) {
594        let mut r = Regex::compile(re).unwrap();
595        let s = "text before Xanadu";
596        let m = r.match_str(s).unwrap();
597
598        assert_eq!(m.named_matches(), vec![name]);
599        assert_eq!(m.str_sub_loc_text_ref_by_name(name, s), Some(expected));
600    }
601
602    #[test]
603    fn multiline_input_match_dot_star_works() {
604        let mut r = Regex::compile(".*").unwrap();
605        let s = "this is\na multiline\nfile";
606
607        let m = r.match_str(s).unwrap();
608        assert_eq!(m.str_match_text(s), "this is");
609    }
610
611    #[test]
612    fn multiline_input_match_dot_star_works_with_non_zero_initial_sp() {
613        let mut r = Regex::compile(".*").unwrap();
614        let s = "this is\na multiline\nfile";
615
616        // Just to convince me that the offsets here are exactly as I am expecting
617        assert_eq!(s.chars().skip(7).collect::<String>(), "\na multiline\nfile");
618
619        let m1 = r.match_iter(&mut s.chars().enumerate().skip(7), 7).unwrap();
620        assert_eq!(m1.str_match_text(s), "");
621
622        let m2 = r.match_iter(&mut s.chars().enumerate().skip(8), 8).unwrap();
623        assert_eq!(m2.str_match_text(s), "a multiline");
624    }
625
626    #[test]
627    fn multiline_input_match_all_dot_star_works() {
628        let mut r = Regex::compile(".*").unwrap();
629        let s = "this is\na multiline\nfile";
630
631        let mut it = r.match_str_all(s);
632
633        // written this way rather than using collect as if we introduce a bug in the MatchIter
634        // impl we can end up with an iterator that gets stuck and never terminates.
635        let m1 = it.next().unwrap();
636        assert_eq!(m1.str_match_text(s), "this is");
637
638        let m2 = it.next().unwrap();
639        assert_eq!(m2.str_match_text(s), "");
640
641        let m3 = it.next().unwrap();
642        assert_eq!(m3.str_match_text(s), "a multiline");
643
644        let m4 = it.next().unwrap();
645        assert_eq!(m4.str_match_text(s), "");
646
647        let m5 = it.next().unwrap();
648        assert_eq!(m5.str_match_text(s), "file");
649        assert_eq!(it.next(), None);
650    }
651
652    #[test]
653    fn match_extraction_works_when_multibyte_characters_are_present() {
654        let s: &str = "const VLINE: char = '│';
655
656impl Editor {
657";
658
659        let re = r"impl (\w+) \{";
660        let mut r = Regex::compile(re).unwrap();
661        let m = r.match_str(s).unwrap();
662
663        assert_eq!(m.str_submatch_text(1, s).as_deref(), Some("Editor"));
664        assert_eq!(m.str_match_text(s), "impl Editor {");
665    }
666
667    // This is the pathological case that Cox covers in his article which leads
668    // to exponential behaviour in backtracking based implementations.
669    #[test]
670    fn pathological_match_doesnt_explode() {
671        let s = "a".repeat(100);
672        let mut re = "a?".repeat(100);
673        re.push_str(&s);
674
675        let mut r = Regex::compile(&re).unwrap();
676        assert!(r.match_str(&s).is_some());
677    }
678
679    // Make sure that the previous cached state for a given Regex doesn't cause
680    // any strange behaviour for future matches
681    #[test]
682    fn repeated_match_works() {
683        let re = "a(bb)+a";
684
685        let mut r = Regex::compile(re).unwrap();
686        for _ in 0..10 {
687            assert!(r.match_str("abbbba").is_some());
688            assert!(r.match_str("foo").is_none());
689        }
690    }
691}
ad_editor/regex/vm.rs

ad_editor/regex/
vm.rs