ad_editor/regex/
vm.rs

1//! Virtual machine based implementation based on the instruction set described
2//! in Russ Cox's second article in the series and the source of plan9 Sam:
3//!   <https://swtch.com/~rsc/regexp/regexp2.html>
4//!   <https://github.com/sminez/plan9port/blob/master/src/cmd/sam/regexp.c>
5//!
6//! The compilation step used is custom (rather than using a YACC parser).
7//!
8//! We make use of pre-allocated buffers for the Thread lists and track the
9//! index we are up to per-iteration as this results in roughly a 100x speed
10//! up from not having to allocate and free inside of the main loop.
11use crate::regex::{
12    Error, Haystack,
13    ast::{Assertion, parse},
14    compile::{CompiledOps, Inst, Op, Prog, compile_ast, optimise},
15    matches::{Match, MatchIter},
16};
17use aho_corasick::AhoCorasick;
18use std::{
19    collections::HashSet,
20    fmt,
21    mem::swap,
22    sync::{Arc, Mutex},
23};
24
25pub(super) const N_SLOTS: usize = 30;
26
27/// A regular expression engine designed for use within the ad text editor.
28///
29/// This is a relatively naive implementation though it does have some
30/// optimisations and runs reasonably quickly. It is not at all designed to
31/// be robust against malicious input and it does not attempt to support
32/// full PCRE syntax or functionality.
33pub struct Regex {
34    re: Arc<str>,
35    inner: Mutex<RegexInner>,
36}
37
38impl Clone for Regex {
39    fn clone(&self) -> Self {
40        let inner = self.inner.lock().unwrap().clone();
41
42        Self {
43            re: self.re.clone(),
44            inner: Mutex::new(inner),
45        }
46    }
47}
48
49impl PartialEq for Regex {
50    fn eq(&self, other: &Self) -> bool {
51        self.re == other.re
52    }
53}
54
55impl Eq for Regex {}
56
57impl fmt::Debug for Regex {
58    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
59        f.debug_tuple("Regex").field(&self.re).finish()
60    }
61}
62
63impl fmt::Display for Regex {
64    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
65        write!(f, "{}", self.re)
66    }
67}
68
69impl Regex {
70    /// Attempt to compile the given regular expression into its optimised VM opcode form.
71    ///
72    /// This method handles pre-allocation of the memory required for running the VM so
73    /// that the allocation cost is paid once up front rather than on each use of the Regex.
74    pub fn compile(re: impl AsRef<str>) -> Result<Self, Error> {
75        let mut ast = parse(re.as_ref())?;
76        ast.optimise();
77        let lits = ast.leading_literals();
78        let CompiledOps {
79            ops,
80            n_submatches,
81            submatch_names,
82        } = compile_ast(ast, false);
83
84        Ok(Self::new(
85            re.as_ref(),
86            ops,
87            n_submatches,
88            submatch_names,
89            lits,
90        ))
91    }
92
93    fn new(
94        re: &str,
95        ops: Vec<Op>,
96        n_submatches: usize,
97        submatch_names: Vec<String>,
98        leading_lits: HashSet<String>,
99    ) -> Self {
100        let prog: Prog = optimise(ops)
101            .into_iter()
102            .map(|op| Inst { op, generation: 0 })
103            .collect();
104
105        let clist = vec![Thread::default(); prog.len()].into_boxed_slice();
106        let nlist = vec![Thread::default(); prog.len()].into_boxed_slice();
107        let sms = vec![SubMatches::default(); prog.len()].into_boxed_slice();
108        let free_sms = (1..prog.len()).collect();
109
110        let fast_start = if leading_lits.is_empty() {
111            None
112        } else {
113            Some(Box::new(
114                AhoCorasick::new(leading_lits).expect("using auto builder so no errors possible"),
115            ))
116        };
117
118        Self {
119            re: Arc::from(re),
120            inner: Mutex::new(RegexInner {
121                prog,
122                fast_start,
123                n_submatches,
124                submatch_names: Arc::from(submatch_names.into_boxed_slice()),
125                clist,
126                nlist,
127                generation: 0,
128                p: 0,
129                prev: None,
130                next: None,
131                sms,
132                free_sms,
133                track_submatches: true,
134            }),
135        }
136    }
137
138    /// Determine whether or not this Regex matches the [Haystack] without searching for the
139    /// leftmost-longest match and associated submatch boundaries.
140    pub fn matches<H>(&self, haystack: &H) -> bool
141    where
142        H: Haystack,
143    {
144        let mut inner = self.inner.lock().unwrap();
145        inner.track_submatches = false;
146        inner.match_from_byte_offset(haystack, 0).is_some()
147    }
148
149    /// Determine whether or not this Regex matches the [Haystack] from the given offset without
150    /// searching for the leftmost-longest match and associated submatch boundaries.
151    pub fn matches_from<H>(&self, haystack: &H, offset: usize) -> bool
152    where
153        H: Haystack,
154    {
155        let mut inner = self.inner.lock().unwrap();
156        inner.track_submatches = false;
157        inner.match_from_byte_offset(haystack, offset).is_some()
158    }
159
160    /// Determine whether or not this Regex matches the [Haystack] between the given offsets
161    /// without searching for the leftmost-longest match and associated submatch boundaries.
162    pub fn matches_between<H>(&self, haystack: &H, from: usize, to: usize) -> bool
163    where
164        H: Haystack,
165    {
166        let mut inner = self.inner.lock().unwrap();
167        inner.track_submatches = false;
168        inner
169            .match_between_byte_offsets(haystack, from, to)
170            .is_some()
171    }
172
173    /// Search the given [Haystack] for the leftmost longest match of this [Regex] and return the
174    /// match position along with all submatches.
175    ///
176    /// It is recommended that you call [Haystack::try_make_contiguous] before calling this method
177    /// in order to speed up searching whenever this is possible.
178    pub fn find<H>(&self, haystack: &H) -> Option<Match>
179    where
180        H: Haystack,
181    {
182        let mut inner = self.inner.lock().unwrap();
183        inner.track_submatches = true;
184        inner.match_from_byte_offset(haystack, 0)
185    }
186
187    /// Search the given [Haystack] for the leftmost longest match of this [Regex] starting from
188    /// the provided byte offset rather than the beginning of the haystack, returning
189    /// the match position along with all submatches.
190    ///
191    /// It is recommended that you call [Haystack::try_make_contiguous] before calling this method
192    /// in order to speed up searching whenever this is possible.
193    pub fn find_from<H>(&self, haystack: &H, offset: usize) -> Option<Match>
194    where
195        H: Haystack,
196    {
197        let mut inner = self.inner.lock().unwrap();
198        inner.track_submatches = true;
199        inner.match_from_byte_offset(haystack, offset)
200    }
201
202    /// Search the given [Haystack] for the leftmost longest match of this [Regex] starting from
203    /// the provided byte offset rather than the beginning of the haystack, and ending before
204    /// the provided `char_to`, returning the match position along with all submatches.
205    ///
206    /// It is recommended that you call [Haystack::try_make_contiguous] before calling this method
207    /// in order to speed up searching whenever this is possible.
208    pub fn find_between<H>(&self, haystack: &H, from: usize, to: usize) -> Option<Match>
209    where
210        H: Haystack,
211    {
212        let mut inner = self.inner.lock().unwrap();
213        inner.track_submatches = true;
214        inner.match_between_byte_offsets(haystack, from, to)
215    }
216
217    /// It is recommended that you call [Haystack::try_make_contiguous] before calling this method
218    /// in order to speed up searching whenever this is possible.
219    pub fn find_iter<'a, H>(&'a mut self, haystack: &'a H) -> MatchIter<'a, H>
220    where
221        H: Haystack,
222    {
223        self.inner.lock().unwrap().track_submatches = true;
224
225        MatchIter {
226            haystack,
227            r: self,
228            from: 0,
229        }
230    }
231}
232
233#[derive(Debug, Clone, PartialEq, Eq)]
234pub struct RevRegex(Regex);
235
236impl RevRegex {
237    /// Attempt to compile the given regular expression into its reversed optimised VM opcode form.
238    /// This is used for searching backwards through an input stream.
239    ///
240    /// This method handles pre-allocation of the memory required for running the VM so
241    /// that the allocation cost is paid once up front rather than on each use of the Regex.
242    pub fn compile(re: impl AsRef<str>) -> Result<Self, Error> {
243        let mut ast = parse(re.as_ref())?;
244        ast.optimise();
245        let CompiledOps {
246            ops,
247            n_submatches,
248            submatch_names,
249        } = compile_ast(ast, true);
250
251        Ok(Self(Regex::new(
252            re.as_ref(),
253            ops,
254            n_submatches,
255            submatch_names,
256            HashSet::new(),
257        )))
258    }
259
260    /// Search the given [Haystack] for the leftmost longest match of this [Regex] starting from
261    /// the provided `from` bytes offset rather than the beginning of the haystack, returning the
262    /// match position along with all submatches.
263    pub fn find_rev_from<H>(&self, haystack: &H, offset: usize) -> Option<Match>
264    where
265        H: Haystack,
266    {
267        let mut inner = self.0.inner.lock().unwrap();
268        inner.track_submatches = true;
269        inner.run_vm(&mut haystack.rev_iter_between(0, offset), offset)
270    }
271}
272
273#[derive(Clone)]
274struct RegexInner {
275    /// The compiled instructions for running the VM
276    prog: Prog,
277    /// Fast searcher for the first potential match site
278    fast_start: Option<Box<AhoCorasick>>,
279    /// The number of submatches present in the pattern
280    n_submatches: usize,
281    /// Names to be used for extracting named submatches
282    submatch_names: Arc<[String]>,
283    /// Pre-allocated Thread list in priority order to handle leftmost-longest semantics
284    clist: Box<[Thread]>,
285    /// Pre-allocated Thread list in priority order to handle leftmost-longest semantics
286    nlist: Box<[Thread]>,
287    /// Pre-allocated SubMatch positions referenced by threads
288    sms: Box<[SubMatches]>,
289    /// Available indices into self.sms for storing SubMatch positions for new threads
290    free_sms: Vec<usize>,
291    track_submatches: bool,
292    /// Monotonically increasing index used to dedup Threads
293    /// Will overflow at some point if a given regex is used a VERY large number of times
294    generation: usize,
295    /// Index into the current Thread list
296    p: usize,
297    /// Previous character from the input
298    prev: Option<char>,
299    /// Next character in the input after the one currently being processed
300    next: Option<char>,
301}
302
303impl RegexInner {
304    /// If we have leading literals then it is possible to try to find the start of a potential
305    /// match more quickly using aho-corasick.
306    ///
307    /// This is only valid to use when the haystack is contiguous.
308    fn fast_update_byte_offset<H>(&self, haystack: &H, mut byte_offset: usize) -> Option<usize>
309    where
310        H: Haystack,
311    {
312        assert!(
313            haystack.is_contiguous(),
314            "fast_update_byte_offset called for discontiguous haystack"
315        );
316
317        let ac = self.fast_start.as_ref()?;
318        // If aho-corasick cant find a match for our literal prefixes then we fall back to running
319        // the full search as we need to place an upper bound on the number of leading literal
320        // patterns we check for in order to constrain the search space. Otherwise we know that any
321        // potential match cant start before the start of the prefix that was found so we update
322        // our byte_offset to there before running the VM.
323        if let Some(m) = ac.find(haystack.substr_from(byte_offset)?.as_ref()) {
324            // m.start is based on the substr not the full haystack so we need to add the original
325            // offset to get the correct value.
326            byte_offset += m.start();
327        }
328
329        Some(byte_offset)
330    }
331
332    /// If the given [Haystack] supports accelerated searching then it is handled here before
333    /// passing off to [Regex::run_vm].
334    ///
335    /// Callers need to set `self.track_submatches` prior to calling this method.
336    fn match_from_byte_offset<H>(&mut self, haystack: &H, mut offset: usize) -> Option<Match>
337    where
338        H: Haystack,
339    {
340        if haystack.is_contiguous() {
341            offset = self
342                .fast_update_byte_offset(haystack, offset)
343                .unwrap_or(offset);
344        }
345
346        self.run_vm(&mut haystack.iter_from(offset)?, offset)
347    }
348
349    fn match_between_byte_offsets<H>(
350        &mut self,
351        haystack: &H,
352        mut from: usize,
353        to: usize,
354    ) -> Option<Match>
355    where
356        H: Haystack,
357    {
358        if haystack.is_contiguous()
359            && let Some(new_from) = self.fast_update_byte_offset(haystack, from)
360        {
361            if new_from > to {
362                return None; // no match within offsets
363            }
364            from = new_from;
365        }
366
367        self.run_vm(&mut haystack.iter_between(from, to), from)
368    }
369
370    /// This is the main VM implementation that is used by all other matching methods on Regex.
371    ///
372    /// The `track_submatches` flag is used to early return a dummy Match as soon as we
373    /// can tell that the given regular expression matches the input (rather than looking for
374    /// the leftmost-longest match).
375    ///  - The Match returned in this case will always point to the null string at the start
376    ///    of the string and should only be used for conversion to a bool in `matches_*`
377    ///    methods.
378    fn run_vm<I>(&mut self, input: &mut I, mut sp: usize) -> Option<Match>
379    where
380        I: Iterator<Item = (usize, char)>,
381    {
382        let mut sub_matches = [0; N_SLOTS];
383        self.free_sms = (1..self.prog.len()).collect();
384        self.sms[0] = SubMatches {
385            refs: 1,
386            inner: [0; N_SLOTS],
387        };
388
389        // We bump the generation to ensure we don't collide with anything from
390        // a previous run while initialising the VM.
391        self.generation += 1;
392        // When setting up the initial threads we have our prelude which uses "@" so we provide a
393        // null byte for the initial character as it is not needed and it avoids us having to make
394        // the "ch" param of add_thread optional.
395        self.add_thread(Thread::default(), sp, '\0', true);
396        swap(&mut self.clist, &mut self.nlist);
397        self.generation += 1;
398
399        // Same as at the end of the outer for-loop, we need to reset self.p to 0
400        // so that we are correctly tracking the length of the new nlist.
401        let mut n = self.p;
402        self.p = 0;
403        let mut matched = false;
404
405        let mut it = input.peekable();
406        self.prev = None;
407        self.next = None;
408
409        while let Some((i, ch)) = it.next() {
410            sp = i;
411            self.next = it.peek().map(|(_, c)| *c);
412
413            for i in 0..n {
414                if let Some(sm) = self.step_thread(i, sp, ch) {
415                    if !self.track_submatches {
416                        return Some(Match::synthetic(0, 0));
417                    }
418
419                    matched = true;
420                    sub_matches = self.sms[sm].inner;
421
422                    // We're ending this thread and all others that have lower priority
423                    // so decrement the references they have to their submatches
424                    for j in i..n {
425                        self.sm_dec_ref(self.clist[j].sm);
426                    }
427
428                    break;
429                }
430            }
431
432            swap(&mut self.clist, &mut self.nlist);
433            self.prev = Some(ch);
434            self.generation += 1;
435            n = self.p;
436
437            if self.p == 0 {
438                break;
439            }
440
441            self.p = 0;
442        }
443
444        self.prev = None;
445        self.next = None;
446
447        // Check to see if the final pass had a match which would be better than any
448        // that we have so far.
449        for t in self.clist.iter_mut().take(n) {
450            if self.prog[t.pc].op == Op::Match && self.sms[t.sm].inner[1] >= sub_matches[1] {
451                matched = true;
452                sub_matches = self.sms[t.sm].inner;
453                break;
454            }
455        }
456
457        if !matched {
458            return None;
459        }
460
461        Some(Match {
462            n_submatches: self.n_submatches,
463            sub_matches,
464            submatch_names: self.submatch_names.clone(),
465        })
466    }
467
468    #[inline]
469    fn step_thread(&mut self, i: usize, sp: usize, ch: char) -> Option<usize> {
470        let t = &self.clist[i];
471        match &self.prog[t.pc].op {
472            // If comparisons and their assertions hold then queue the resulting threads
473            Op::Comp(comp) if comp.matches(ch) => match t.assertion {
474                Some(a) if !a.holds_for(self.prev, ch, self.next) => {
475                    self.sm_dec_ref(t.sm);
476                    return None;
477                }
478                _ => self.add_thread(thread(t.pc + 1, t.sm), sp, ch, false),
479            },
480
481            Op::Match => return Some(t.sm),
482
483            // Save, Jump & Split are handled in add_thread.
484            // Non-matching comparison ops result in that thread dying.
485            _ => self.sm_dec_ref(t.sm),
486        }
487
488        None
489    }
490
491    #[inline]
492    fn add_thread(&mut self, t: Thread, sp: usize, ch: char, initial: bool) {
493        if self.prog[t.pc].generation == self.generation {
494            self.sm_dec_ref(t.sm);
495            return; // already on the list we are currently building
496        }
497        self.prog[t.pc].generation = self.generation;
498
499        // We do this as chained if-let as we need to recursively call add_thread with data
500        // from self.prog but add_thread required &mut self, so matching would mean we had
501        // to Clone as Op::Class does not implement Copy.
502        // > This is faster than cloning the op and matching
503        if let Op::Jump(l1) = self.prog[t.pc].op {
504            let th = match t.assertion {
505                Some(a) => assert_thread(l1, t.sm, a),
506                None => thread(l1, t.sm),
507            };
508            self.add_thread(th, sp, ch, initial);
509        } else if let Op::Split(l1, l2) = self.prog[t.pc].op {
510            self.sms[t.sm].refs += 1;
511            let (t1, t2) = match t.assertion {
512                Some(a) => (assert_thread(l1, t.sm, a), assert_thread(l2, t.sm, a)),
513                None => (thread(l1, t.sm), thread(l2, t.sm)),
514            };
515            self.add_thread(t1, sp, ch, initial);
516            self.add_thread(t2, sp, ch, initial);
517        } else if let Op::Assertion(a) = self.prog[t.pc].op {
518            self.add_thread(assert_thread(t.pc + 1, t.sm, a), sp, ch, initial);
519        } else if let Op::Save(s) = self.prog[t.pc].op {
520            self.handle_save(t, s, sp, ch, initial, false)
521        } else if let Op::RSave(s) = self.prog[t.pc].op {
522            self.handle_save(t, s, sp, ch, initial, true)
523        } else {
524            self.nlist[self.p] = t;
525            self.p += 1;
526        }
527    }
528
529    #[inline]
530    fn handle_save(&mut self, t: Thread, s: usize, sp: usize, ch: char, initial: bool, rev: bool) {
531        // If we are saving our initial position from a forward match then we are looking at the
532        // correct character, otherwise the Save op is being processed at the character before the
533        // one we need to save.
534        let inc_bytes = if !initial && !rev { ch.len_utf8() } else { 0 };
535
536        if (!rev && s.is_multiple_of(2)) || (rev && !s.is_multiple_of(2)) {
537            let sm = self.sm_update(t.sm, s, sp + inc_bytes);
538            let th = match t.assertion {
539                Some(a) => assert_thread(t.pc + 1, sm, a),
540                None => thread(t.pc + 1, sm),
541            };
542            self.add_thread(th, sp, ch, initial);
543        } else {
544            match t.assertion {
545                Some(a) if !a.holds_for(self.prev, ch, self.next) => self.sm_dec_ref(t.sm),
546                _ => {
547                    let sm = self.sm_update(t.sm, s, sp + inc_bytes);
548                    self.add_thread(thread(t.pc + 1, sm), sp, ch, initial);
549                }
550            }
551        }
552    }
553
554    #[inline]
555    fn sm_dec_ref(&mut self, i: usize) {
556        if !self.track_submatches {
557            return;
558        }
559
560        self.sms[i].refs -= 1;
561        if self.sms[i].refs == 0 {
562            self.free_sms.push(i);
563        }
564    }
565
566    #[inline]
567    fn sm_update(&mut self, i: usize, s: usize, sp: usize) -> usize {
568        // We don't hard error on compiling a regex with more than out max submatches
569        // but we don't track anything past the last one
570        if !self.track_submatches || s >= N_SLOTS {
571            return i;
572        }
573
574        let i = if self.sms[i].refs == 1 {
575            i
576        } else {
577            self.sm_dec_ref(i);
578            let j = self.free_sms.swap_remove(0);
579            self.sms[j].inner = self.sms[i].inner;
580            self.sms[j].refs = 1;
581            j
582        };
583
584        self.sms[i].inner[s] = sp;
585
586        i
587    }
588}
589
590#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
591struct SubMatches {
592    /// How many threads are currently pointing at this SubMatches
593    refs: usize,
594    /// $0 -> $n submatches with $0 being the full match
595    /// for submatch $k the start index is 2$k and the end is 2$k+1
596    inner: [usize; N_SLOTS],
597}
598
599#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
600struct Thread {
601    /// VM program counter for this thread
602    pc: usize,
603    /// An assertion that must hold for this instruction to be runnable
604    assertion: Option<Assertion>,
605    /// Index into the Regex sms field
606    sm: usize,
607}
608
609#[inline]
610fn thread(pc: usize, sm: usize) -> Thread {
611    Thread {
612        pc,
613        sm,
614        assertion: None,
615    }
616}
617
618#[inline]
619fn assert_thread(pc: usize, sm: usize, a: Assertion) -> Thread {
620    Thread {
621        pc,
622        sm,
623        assertion: Some(a),
624    }
625}
626
627#[cfg(test)]
628mod tests {
629    use super::*;
630    use crate::buffer::Buffer;
631    use simple_test_case::test_case;
632
633    // typos:off
634    #[test_case("foo", "foo", Some("foo"); "literal full string")]
635    #[test_case("ba*", "baaaaa", Some("baaaaa"); "zero or more present")]
636    #[test_case("ba*", "b", Some("b"); "zero or more not present")]
637    #[test_case("ba+", "baaaaa", Some("baaaaa"); "one or more present")]
638    #[test_case("ba+", "b", None; "one or more not present")]
639    #[test_case("b?a", "ba", Some("ba"); "optional present")]
640    #[test_case("b?a", "a", Some("a"); "optional not present")]
641    #[test_case("a(bb)+a", "abbbba", Some("abbbba"); "article example matching")]
642    #[test_case("a(bb)+a", "abbba", None; "article example non matching")]
643    #[test_case(".*b", "123b", Some("123b"); "dot star prefix")]
644    #[test_case("1.*", "123b", Some("123b"); "dot star suffix")]
645    #[test_case("1.*b", "123b", Some("123b"); "dot star inner")]
646    #[test_case("(c|C)ase matters", "case matters", Some("case matters"); "alternation first")]
647    #[test_case("(c|C)ase matters", "Case matters", Some("Case matters"); "alternation second")]
648    #[test_case("(aa|bbb|c|dd)", "c", Some("c"); "chained alternation")]
649    #[test_case("this@*works", "this contains\nbut still works", Some("this contains\nbut still works"); "true any")]
650    #[test_case(r"literal\?", "literal?", Some("literal?"); "escape special char")]
651    #[test_case(r"literal\t", "literal\t", Some("literal\t"); "escape sequence")]
652    #[test_case("[abc] happy cow", "a happy cow", Some("a happy cow"); "character class")]
653    #[test_case("[^abc] happy cow", "a happy cow", None; "negated character class")]
654    #[test_case("[a-zA-Z]*", "camelCaseFtw", Some("camelCaseFtw"); "char class ranges matching")]
655    #[test_case("[a-zA-Z]*1", "kebab-case-not-so-much", None; "char class ranges non matching")]
656    #[test_case("[a-zA-Z ]*", "this should work", Some("this should work"); "char class mixed")]
657    #[test_case("[\\]5]*", "5]]5555]]", Some("5]]5555]]"); "char class escaped bracket")]
658    #[test_case("[0-9]+", "0123", Some("0123"); "digit range")]
659    #[test_case("[0-9]+", "0", Some("0"); "digit range range start only")]
660    #[test_case("25[0-5]", "255", Some("255"); "ipv4 element one")]
661    #[test_case("2[0-4][0-9]", "231", Some("231"); "ipv4 element two")]
662    #[test_case("1?[0-9]?[0-9]", "155", Some("155"); "ipv4 element three three digit")]
663    #[test_case("1?[0-9]?[0-9]", "72", Some("72"); "ipv4 element three two digit")]
664    #[test_case("1?[0-9]?[0-9]", "8", Some("8"); "ipv4 element three one digit")]
665    #[test_case("1?[0-9]?[0-9]", "0", Some("0"); "ipv4 element three zero")]
666    #[test_case("(25[0-5]|2[0-4][0-9])", "255", Some("255"); "ipv4 elements one and two matching one")]
667    #[test_case("(25[0-5]|2[0-4][0-9])", "219", Some("219"); "ipv4 elements one and two matching two")]
668    #[test_case("(25[0-5]|2[0-4][0-9])", "42", None; "ipv4 elements one and two not matching")]
669    #[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 elements two and three matching two")]
670    #[test_case("(2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 elements two and three matching three")]
671    #[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "251", Some("251"); "ipv4 all elements matching one")]
672    #[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "237", Some("237"); "ipv4 all elements matching two")]
673    #[test_case("(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])", "142", Some("142"); "ipv4 all elements matching three")]
674    #[test_case(
675        r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
676        "127.0.0.1 ",
677        Some("127.0.0.1");
678        "ipv4 full"
679    )]
680    #[test_case("^foo", "foo at the start", Some("foo"); "SOL holding")]
681    #[test_case("^foo", "bar\nfoo at the start", Some("foo"); "SOL holding after newline")]
682    #[test_case("^foo", "we have foo but not at the start", None; "SOL not holding")]
683    #[test_case("foo$", "a line that ends with foo", Some("foo"); "BOL holding")]
684    #[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
685    #[test_case("foo$", "a line with foo in the middle", None; "BOL not holding")]
686    #[test_case("foo", "│foo", Some("foo"); "after a multibyte char")]
687    #[test_case("a{3}", "aaa", Some("aaa"); "counted repetition")]
688    #[test_case("a{3}", "aa", None; "counted repetition non matching")]
689    #[test_case("a{3,}", "aaaaaa", Some("aaaaaa"); "counted repetition at least")]
690    #[test_case("a{3,}", "aa", None; "counted repetition at least non matching")]
691    #[test_case("a{3,5}", "aaa", Some("aaa"); "counted repetition between lower")]
692    #[test_case("a{3,5}", "aaaaa", Some("aaaaa"); "counted repetition between upper")]
693    #[test_case("a{3,5}", "aaaa", Some("aaaa"); "counted repetition in range")]
694    #[test_case("a{3,5}", "aa", None; "counted repetition less")]
695    #[test_case("^a{3,5}$", "aaaaaa", None; "counted repetition more")]
696    #[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
697    #[test_case("\\bfor\\b", "forward", None; "word boundary for match at start of word")]
698    #[test_case("\\bfor\\b", "for ward", Some("for"); "word boundary for match not inside word")]
699    #[test_case("\\bfor\\b", "bob for", Some("for"); "word boundary match not at BOF")]
700    #[test_case("\\bfor\\b", "bob for bob", Some("for"); "word boundary match not at BOF or EOF")]
701    #[test_case("\\bin\\b", "min", None; "word boundary for match at end of word")]
702    #[test_case("\\b(in)\\b", "min", None; "word boundary for sub expression match at end of word")]
703    #[test_case("\\b(in|for)\\b", "min", None; "word boundary for alt match at end of word")]
704    #[test_case("\\b(in|for)\\b", "bob for", Some("for"); "word boundary for alt match not at BOF")]
705    #[test_case("[a-zA-Z0-9_\\-./@]+\\.jpe?g", "glenda_space_medium.jpg", Some("glenda_space_medium.jpg"); "complex group")]
706    #[test_case("[a-zA-Z¡-￿0-9_\\-./@]+", "foo-bar_99.pdf", Some("foo-bar_99.pdf"); "multibyte group")]
707    // typos:on
708    #[test]
709    fn find_works(re: &str, s: &str, expected: Option<&str>) {
710        let r = Regex::compile(re).unwrap();
711        let m = r.find(&s).map(|m| m.match_text(&s));
712        assert_eq!(m.as_deref(), expected);
713    }
714
715    #[test_case("foo", "foo", Some("foo"); "literal full string")]
716    #[test_case("ba*", " baaaaa foo", Some("baaaaa"); "zero or more present")] // typos:ignore
717    #[test_case("ba*", "b foo", Some("b"); "zero or more not present")] // typos:ignore
718    #[test_case("foo$", "a line that ends with foo\nnow bar", Some("foo"); "BOL holding before newline")]
719    #[test_case("\\b\\w+\\b", "foo", Some("foo"); "word boundary at end of input")]
720    #[test_case(
721        r"(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])\.(25[0-5]|2[0-4][0-9]|1?[0-9]?[0-9])",
722        "127.0.0.1 ",
723        Some("127.0.0.1");
724        "ipv4 full"
725    )]
726    #[test_case(
727        "his",
728        "this is a line\nand another\n- [ ] something to do\n",
729        Some("his");
730        "multiline input"
731    )]
732    #[test]
733    fn find_rev_works(re: &str, s: &str, expected: Option<&str>) {
734        let r = RevRegex::compile(re).unwrap();
735        let b = Buffer::new_unnamed(0, s, Default::default());
736        let m = r.find_rev_from(&b, s.len()).map(|m| m.match_text(&b));
737
738        assert_eq!(m.as_deref(), expected);
739    }
740
741    #[test_case("[0-9]+", " 42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers")]
742    #[test_case("[0-9]+", " 42 3 127 9991", &["42", "3", "127", "9991"]; "integers to EOF")]
743    #[test_case("[0-9]+", "42 3 127 9991 ", &["42", "3", "127", "9991"]; "integers from BOF")]
744    #[test_case("[0-9]+", "42 3 127 9991", &["42", "3", "127", "9991"]; "integers full input")]
745    #[test_case("foo|bar|baz", "baz bar foo bar", &["baz", "bar", "foo", "bar"]; "alts spaced in s")]
746    #[test_case("foo|bar|baz", "bazbarfoobar", &["baz", "bar", "foo", "bar"]; "alts back to back in s")]
747    #[test_case("(foo|bar|baz)", "foo foobar barfoo baz", &["foo", "foo", "bar", "bar", "foo", "baz"]; "alts in parens")]
748    #[test_case("\\b(foo|bar|baz)\\b", "foo foobar barfoo baz", &["foo", "baz"]; "alts with word boundaries")]
749    #[test]
750    fn find_iter_works(re: &str, s: &str, expected: &[&str]) {
751        let mut r = Regex::compile(re).unwrap();
752        let matches: Vec<String> = r
753            .find_iter(&s)
754            .map(|m| m.match_text(&s).into_owned())
755            .collect();
756
757        assert_eq!(&matches, expected);
758    }
759
760    #[test]
761    fn dot_star_works() {
762        let r = Regex::compile(".*").unwrap();
763        let s = "\nthis is\na multiline\nfile";
764        let m1 = r.find(&s).unwrap();
765        assert_eq!(m1.match_text(&s), "");
766
767        // Skipping the leading newline should cause us to match all of the following line
768        let m2 = r.find(&&s[1..]).unwrap();
769        assert_eq!(m2.match_text(&&s[1..]), "this is");
770    }
771
772    #[test]
773    fn match_extraction_works() {
774        let re = "([0-9]+)-([0-9]+)-([0-9]+)";
775        let r = Regex::compile(re).unwrap();
776        let s = "this should work 123-456-789 other stuff";
777        let m = r.find(&s).unwrap();
778
779        assert_eq!(m.match_text(&s), "123-456-789");
780        assert_eq!(m.submatch_text(1, &s).as_deref(), Some("123"));
781        assert_eq!(m.submatch_text(2, &s).as_deref(), Some("456"));
782        assert_eq!(m.submatch_text(3, &s).as_deref(), Some("789"));
783    }
784
785    #[test_case("(?<xy>X|Y)", "xy", "X"; "named match on its own")]
786    #[test_case("(?<xy>X|Y)(a|b)", "xy", "X"; "named match before unnamed")]
787    #[test_case("(e| )(?<xy>X|Y)", "xy", "X"; "named match after unnamed")]
788    #[test_case("(e| )(?<xy>X|Y)(a|b)", "xy", "X"; "named match inbetween unnamed")]
789    #[test]
790    fn named_submatch_works(re: &str, name: &str, expected: &str) {
791        let r = Regex::compile(re).unwrap();
792        let s = "text before Xanadu";
793        let m = r.find(&s).unwrap();
794
795        assert_eq!(m.named_matches(), vec![name]);
796        assert_eq!(m.submatch_text_by_name(name, &s).as_deref(), Some(expected));
797    }
798
799    #[test]
800    fn multiline_input_match_dot_star_works() {
801        let r = Regex::compile(".*").unwrap();
802        let s = "this is\na multiline\nfile";
803
804        let m = r.find(&s).unwrap();
805        assert_eq!(m.match_text(&s), "this is");
806    }
807
808    #[test]
809    fn multiline_input_find_from_dot_star_works_with_non_zero_initial_sp() {
810        let r = Regex::compile(".*").unwrap();
811        let s = "this is\na multiline\nfile";
812
813        // Just to convince me that the offsets here are exactly as I am expecting
814        assert_eq!(s.chars().skip(7).collect::<String>(), "\na multiline\nfile");
815
816        let m1 = r.find_from(&s, 7).unwrap();
817        assert_eq!(m1.match_text(&s), "");
818
819        let m2 = r.find_from(&s, 8).unwrap();
820        assert_eq!(m2.match_text(&s), "a multiline");
821    }
822
823    #[test]
824    fn multiline_input_find_iter_dot_star_works() {
825        let mut r = Regex::compile(".*").unwrap();
826        let s = "this is\na multiline\nfile";
827
828        let mut it = r.find_iter(&s);
829
830        // written this way rather than using collect as if we introduce a bug in the MatchIter
831        // impl we can end up with an iterator that gets stuck and never terminates.
832        let m1 = it.next().unwrap();
833        assert_eq!(m1.match_text(&s), "this is");
834
835        let m2 = it.next().unwrap();
836        assert_eq!(m2.match_text(&s), "");
837
838        let m3 = it.next().unwrap();
839        assert_eq!(m3.match_text(&s), "a multiline");
840
841        let m4 = it.next().unwrap();
842        assert_eq!(m4.match_text(&s), "");
843
844        let m5 = it.next().unwrap();
845        assert_eq!(m5.match_text(&s), "file");
846        assert_eq!(it.next(), None);
847    }
848
849    #[test]
850    fn match_extraction_works_when_multibyte_characters_are_present() {
851        let s: &str = "const VLINE: char = '│';
852
853impl Editor {
854";
855
856        let re = r"impl (\w+) \{";
857        let r = Regex::compile(re).unwrap();
858        let m = r.find(&s).unwrap();
859
860        assert_eq!(m.submatch_text(1, &s).as_deref(), Some("Editor"));
861        assert_eq!(m.match_text(&s), "impl Editor {");
862    }
863
864    // This is the pathological case that Cox covers in his article which leads
865    // to exponential behaviour in backtracking based implementations.
866    #[test]
867    fn pathological_match_doesnt_explode() {
868        let s = "a".repeat(100);
869        let mut re = "a?".repeat(100);
870        re.push_str(&s);
871
872        let r = Regex::compile(&re).unwrap();
873        assert!(r.find(&s.as_str()).is_some());
874    }
875
876    // Make sure that the previous cached state for a given Regex doesn't cause
877    // any strange behaviour for future matches
878    #[test]
879    fn repeated_match_works() {
880        let re = "a(bb)+a";
881
882        let r = Regex::compile(re).unwrap();
883        for _ in 0..10 {
884            assert!(r.find(&"abbbba").is_some());
885            assert!(r.find(&"foo").is_none());
886        }
887    }
888
889    // Regression case for https://github.com/sminez/ad/issues/165
890    #[test_case("Tracing_Summit_2025_Perfet_RYQoyoF.pdf"; "tracing")]
891    #[test_case("Bracing_Summit_2025_Perfet_RYQoyoF.pdf"; "bracing")]
892    #[test]
893    fn leading_literal_truncation_doesnt_affect_matching(s: &str) {
894        let re = "([a-zA-Z¡-�0-9_\\-./@]+).[Pp][Dd][Ff]";
895        let r = Regex::compile(re).unwrap();
896        let m = r.find(&s).unwrap();
897
898        assert_eq!(m.match_text(&s), s);
899    }
900}