fancy_regex/
vm.rs

1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Backtracking VM for implementing fancy regexes.
22//!
23//! Read <https://swtch.com/~rsc/regexp/regexp2.html> for a good introduction for how this works.
24//!
25//! The VM executes a sequence of instructions (a program) against an input string. It keeps track
26//! of a program counter (PC) and an index into the string (IX). Execution can have one or more
27//! threads.
28//!
29//! One of the basic instructions is `Lit`, which matches a string against the input. If it matches,
30//! the PC advances to the next instruction and the IX to the position after the matched string.
31//! If not, the current thread is stopped because it failed.
32//!
33//! If execution reaches an `End` instruction, the program is successful because a match was found.
34//! If there are no more threads to execute, the program has failed to match.
35//!
36//! A very simple program for the regex `a`:
37//!
38//! ```text
39//! 0: Lit("a")
40//! 1: End
41//! ```
42//!
43//! The `Split` instruction causes execution to split into two threads. The first thread is executed
44//! with the current string index. If it fails, we reset the string index and resume execution with
45//! the second thread. That is what "backtracking" refers to. In order to do that, we keep a stack
46//! of threads (PC and IX) to try.
47//!
48//! Example program for the regex `ab|ac`:
49//!
50//! ```text
51//! 0: Split(1, 4)
52//! 1: Lit("a")
53//! 2: Lit("b")
54//! 3: Jmp(6)
55//! 4: Lit("a")
56//! 5: Lit("c")
57//! 6: End
58//! ```
59//!
60//! The `Jmp` instruction causes execution to jump to the specified instruction. In the example it
61//! is needed to separate the two threads.
62//!
63//! Let's step through execution with that program for the input `ac`:
64//!
65//! 1. We're at PC 0 and IX 0
66//! 2. `Split(1, 4)` means we save a thread with PC 4 and IX 0 for trying later
67//! 3. Continue at `Lit("a")` which matches, so we advance IX to 1
68//! 4. `Lit("b")` doesn't match at IX 1 (`"b" != "c"`), so the thread fails
69//! 5. We continue with the previously saved thread at PC 4 and IX 0 (backtracking)
70//! 6. Both `Lit("a")` and `Lit("c")` match and we reach `End` -> successful match (index 0 to 2)
71
72use alloc::collections::BTreeSet;
73use alloc::string::String;
74#[cfg(feature = "variable-lookbehinds")]
75use alloc::sync::Arc;
76use alloc::vec;
77use alloc::vec::Vec;
78use regex_automata::meta::Regex;
79use regex_automata::util::look::LookMatcher;
80use regex_automata::util::primitives::NonMaxUsize;
81use regex_automata::Anchored;
82use regex_automata::Input;
83
84#[cfg(feature = "variable-lookbehinds")]
85use regex_automata::util::pool::Pool;
86
87#[cfg(feature = "variable-lookbehinds")]
88pub(crate) type CachePoolFn = alloc::boxed::Box<
89    dyn Fn() -> regex_automata::hybrid::dfa::Cache
90        + Send
91        + Sync
92        + core::panic::UnwindSafe
93        + core::panic::RefUnwindSafe,
94>;
95
96use crate::error::RuntimeError;
97use crate::prev_codepoint_ix;
98use crate::Assertion;
99use crate::Error;
100use crate::Formatter;
101use crate::Result;
102use crate::{codepoint_len, RegexOptions};
103
104/// Enable tracing of VM execution. Only for debugging/investigating.
105const OPTION_TRACE: u32 = 1 << 0;
106/// When iterating over all matches within a text (e.g. with `find_iter`), empty matches need to be
107/// handled specially. If we kept matching at the same position, we'd never stop. So what we do
108/// after we've had an empty match, is to advance the position where matching is attempted.
109/// If `\G` is used in the pattern, that means it no longer matches. If we didn't tell the VM about
110/// the fact that we skipped because of an empty match, it would still treat `\G` as matching. So
111/// this option is for communicating that to the VM. Phew.
112pub(crate) const OPTION_SKIPPED_EMPTY_MATCH: u32 = 1 << 1;
113
114// TODO: make configurable
115const MAX_STACK: usize = 1_000_000;
116
117/// Represents a range of capture groups by storing the first and last group numbers.
118#[derive(Clone, Copy, Debug, PartialEq, Eq)]
119pub struct CaptureGroupRange(pub usize, pub usize);
120
121impl CaptureGroupRange {
122    /// Returns the start (first) group number.
123    pub fn start(&self) -> usize {
124        self.0
125    }
126
127    /// Returns the end (last) group number.
128    pub fn end(&self) -> usize {
129        self.1
130    }
131
132    /// Converts this range to an Option, returning None if start equals end (no capture groups).
133    pub fn to_option_if_non_empty(self) -> Option<Self> {
134        if self.start() == self.end() {
135            None
136        } else {
137            Some(self)
138        }
139    }
140}
141
142#[derive(Clone)]
143/// Delegate matching to the regex crate
144pub struct Delegate {
145    /// The regex
146    pub inner: Regex,
147    /// The regex pattern as a string
148    pub pattern: String,
149    /// The range of capture groups. None if there are no capture groups.
150    pub capture_groups: Option<CaptureGroupRange>,
151}
152
153impl core::fmt::Debug for Delegate {
154    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
155        // Ensures it fails to compile if the struct changes
156        let Self {
157            inner: _,
158            pattern,
159            capture_groups,
160        } = self;
161
162        f.debug_struct("Delegate")
163            .field("pattern", pattern)
164            .field("capture_groups", capture_groups)
165            .finish()
166    }
167}
168
169#[cfg(feature = "variable-lookbehinds")]
170/// Delegate matching in reverse to regex-automata
171pub struct ReverseBackwardsDelegate {
172    /// The regex pattern as a string which will be matched in reverse, in a backwards direction
173    pub pattern: String,
174    /// The delegate regex to match backwards (wrapped in Arc for efficient cloning)
175    pub(crate) dfa: Arc<regex_automata::hybrid::dfa::DFA>,
176    /// Cache pool for DFA searches
177    pub(crate) cache_pool: Pool<regex_automata::hybrid::dfa::Cache, CachePoolFn>,
178    /// The forward regex for capture group extraction
179    pub(crate) capture_group_extraction_inner: Option<Regex>,
180    /// The range of capture groups. None if there are no capture groups.
181    pub capture_groups: Option<CaptureGroupRange>,
182}
183
184#[cfg(feature = "variable-lookbehinds")]
185impl Clone for ReverseBackwardsDelegate {
186    fn clone(&self) -> Self {
187        let dfa_for_closure = Arc::clone(&self.dfa);
188        let create: CachePoolFn = alloc::boxed::Box::new(move || dfa_for_closure.create_cache());
189        Self {
190            pattern: self.pattern.clone(),
191            cache_pool: Pool::new(create),
192            dfa: Arc::clone(&self.dfa),
193            capture_group_extraction_inner: self.capture_group_extraction_inner.clone(),
194            capture_groups: self.capture_groups,
195        }
196    }
197}
198
199#[cfg(feature = "variable-lookbehinds")]
200impl core::fmt::Debug for ReverseBackwardsDelegate {
201    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
202        // Ensures it fails to compile if the struct changes
203        let Self {
204            pattern,
205            dfa: _,
206            cache_pool: _,
207            capture_group_extraction_inner: _,
208            capture_groups,
209        } = self;
210
211        f.debug_struct("ReverseBackwardsDelegate")
212            .field("pattern", pattern)
213            .field("capture_groups", capture_groups)
214            .finish()
215    }
216}
217
218/// Instruction of the VM.
219#[derive(Debug)]
220pub enum Insn {
221    /// Successful end of program
222    End,
223    /// Match any character (including newline)
224    Any,
225    /// Match any character (not including newline)
226    AnyNoNL,
227    /// Assertions
228    Assertion(Assertion),
229    /// Match the literal string at the current index
230    Lit(String), // should be cow?
231    /// Split execution into two threads. The two fields are positions of instructions. Execution
232    /// first tries the first thread. If that fails, the second position is tried.
233    Split(usize, usize),
234    /// Jump to instruction at position
235    Jmp(usize),
236    /// Save the current string index into the specified slot
237    Save(usize),
238    /// Save `0` into the specified slot
239    Save0(usize),
240    /// Set the string index to the value that was saved in the specified slot
241    Restore(usize),
242    /// Repeat greedily (match as much as possible)
243    RepeatGr {
244        /// Minimum number of matches
245        lo: usize,
246        /// Maximum number of matches
247        hi: usize,
248        /// The instruction after the repeat
249        next: usize,
250        /// The slot for keeping track of the number of repetitions
251        repeat: usize,
252    },
253    /// Repeat non-greedily (prefer matching as little as possible)
254    RepeatNg {
255        /// Minimum number of matches
256        lo: usize,
257        /// Maximum number of matches
258        hi: usize,
259        /// The instruction after the repeat
260        next: usize,
261        /// The slot for keeping track of the number of repetitions
262        repeat: usize,
263    },
264    /// Repeat greedily and prevent infinite loops from empty matches
265    RepeatEpsilonGr {
266        /// Minimum number of matches
267        lo: usize,
268        /// The instruction after the repeat
269        next: usize,
270        /// The slot for keeping track of the number of repetitions
271        repeat: usize,
272        /// The slot for saving the previous IX to check if we had an empty match
273        check: usize,
274    },
275    /// Repeat non-greedily and prevent infinite loops from empty matches
276    RepeatEpsilonNg {
277        /// Minimum number of matches
278        lo: usize,
279        /// The instruction after the repeat
280        next: usize,
281        /// The slot for keeping track of the number of repetitions
282        repeat: usize,
283        /// The slot for saving the previous IX to check if we had an empty match
284        check: usize,
285    },
286    /// Negative look-around failed
287    FailNegativeLookAround,
288    /// Set IX back by the specified number of characters
289    GoBack(usize),
290    /// Back reference to a group number to check
291    Backref {
292        /// The save slot representing the start of the capture group
293        slot: usize,
294        /// Whether the backref should be matched case insensitively
295        casei: bool,
296    },
297    /// Begin of atomic group
298    BeginAtomic,
299    /// End of atomic group
300    EndAtomic,
301    /// Delegate matching to the regex crate
302    Delegate(Delegate),
303    /// Anchor to match at the position where the previous match ended.
304    ContinueFromPreviousMatchEnd {
305        /// Whether this is at the start of the pattern (allowing early exit on failure)
306        at_start: bool,
307    },
308    /// Continue only if the specified capture group has already been populated as part of the match
309    BackrefExistsCondition(usize),
310    #[cfg(feature = "variable-lookbehinds")]
311    /// Reverse lookbehind using regex-automata for variable-sized patterns
312    BackwardsDelegate(ReverseBackwardsDelegate),
313}
314
315/// Sequence of instructions for the VM to execute.
316#[derive(Debug)]
317pub struct Prog {
318    /// Instructions of the program
319    pub body: Vec<Insn>,
320    n_saves: usize,
321}
322
323impl Prog {
324    pub(crate) fn new(body: Vec<Insn>, n_saves: usize) -> Prog {
325        Prog { body, n_saves }
326    }
327
328    #[doc(hidden)]
329    pub(crate) fn debug_print(&self, writer: &mut Formatter<'_>) -> core::fmt::Result {
330        for (i, insn) in self.body.iter().enumerate() {
331            writeln!(writer, "{:3}: {:?}", i, insn)?;
332        }
333        Ok(())
334    }
335}
336
337#[derive(Debug)]
338struct Branch {
339    pc: usize,
340    ix: usize,
341    nsave: usize,
342}
343
344#[derive(Debug)]
345struct Save {
346    slot: usize,
347    value: usize,
348}
349
350struct State {
351    /// Saved values indexed by slot. Mostly indices to s, but can be repeat values etc.
352    /// Always contains the saves of the current state.
353    saves: Vec<usize>,
354    /// Stack of backtrack branches.
355    stack: Vec<Branch>,
356    /// Old saves (slot, value)
357    oldsave: Vec<Save>,
358    /// Number of saves at the end of `oldsave` that need to be restored to `saves` on pop
359    nsave: usize,
360    explicit_sp: usize,
361    /// Maximum size of the stack. If the size would be exceeded during execution, a `StackOverflow`
362    /// error is raised.
363    max_stack: usize,
364    #[allow(dead_code)]
365    options: u32,
366}
367
368// Each element in the stack conceptually represents the entire state
369// of the machine: the pc (index into prog), the index into the
370// string, and the entire vector of saves. However, copying the save
371// vector on every push/pop would be inefficient, so instead we use a
372// copy-on-write approach for each slot within the save vector. The
373// top `nsave` elements in `oldsave` represent the delta from the
374// current machine state to the top of stack.
375
376impl State {
377    fn new(n_saves: usize, max_stack: usize, options: u32) -> State {
378        State {
379            saves: vec![usize::MAX; n_saves],
380            stack: Vec::new(),
381            oldsave: Vec::new(),
382            nsave: 0,
383            explicit_sp: n_saves,
384            max_stack,
385            options,
386        }
387    }
388
389    // push a backtrack branch
390    fn push(&mut self, pc: usize, ix: usize) -> Result<()> {
391        if self.stack.len() < self.max_stack {
392            let nsave = self.nsave;
393            self.stack.push(Branch { pc, ix, nsave });
394            self.nsave = 0;
395            self.trace_stack("push");
396            Ok(())
397        } else {
398            Err(Error::RuntimeError(RuntimeError::StackOverflow))
399        }
400    }
401
402    // pop a backtrack branch
403    fn pop(&mut self) -> (usize, usize) {
404        for _ in 0..self.nsave {
405            let Save { slot, value } = self.oldsave.pop().unwrap();
406            self.saves[slot] = value;
407        }
408        let Branch { pc, ix, nsave } = self.stack.pop().unwrap();
409        self.nsave = nsave;
410        self.trace_stack("pop");
411        (pc, ix)
412    }
413
414    fn save(&mut self, slot: usize, val: usize) {
415        for i in 0..self.nsave {
416            // could avoid this iteration with some overhead; worth it?
417            if self.oldsave[self.oldsave.len() - i - 1].slot == slot {
418                // already saved, just update
419                self.saves[slot] = val;
420                return;
421            }
422        }
423        self.oldsave.push(Save {
424            slot,
425            value: self.saves[slot],
426        });
427        self.nsave += 1;
428        self.saves[slot] = val;
429
430        #[cfg(feature = "std")]
431        if self.options & OPTION_TRACE != 0 {
432            println!("saves: {:?}", self.saves);
433        }
434    }
435
436    fn get(&self, slot: usize) -> usize {
437        self.saves[slot]
438    }
439
440    // push a value onto the explicit stack; note: the entire contents of
441    // the explicit stack is saved and restored on backtrack.
442    fn stack_push(&mut self, val: usize) {
443        if self.saves.len() == self.explicit_sp {
444            self.saves.push(self.explicit_sp + 1);
445        }
446        let explicit_sp = self.explicit_sp;
447        let sp = self.get(explicit_sp);
448        if self.saves.len() == sp {
449            self.saves.push(val);
450        } else {
451            self.save(sp, val);
452        }
453        self.save(explicit_sp, sp + 1);
454    }
455
456    // pop a value from the explicit stack
457    fn stack_pop(&mut self) -> usize {
458        let explicit_sp = self.explicit_sp;
459        let sp = self.get(explicit_sp) - 1;
460        let result = self.get(sp);
461        self.save(explicit_sp, sp);
462        result
463    }
464
465    /// Get the current number of backtrack branches
466    fn backtrack_count(&self) -> usize {
467        self.stack.len()
468    }
469
470    /// Discard backtrack branches that were pushed since the call to `backtrack_count`.
471    ///
472    /// What we want:
473    /// * Keep the current `saves` as they are
474    /// * Only keep `count` backtrack branches on `stack`, discard the rest
475    /// * Keep the first `oldsave` for each slot, discard the rest (multiple pushes might have
476    ///   happened with saves to the same slot)
477    fn backtrack_cut(&mut self, count: usize) {
478        if self.stack.len() == count {
479            // no backtrack branches to discard, all good
480            return;
481        }
482        // start and end indexes of old saves for the branch we're cutting to
483        let (oldsave_start, oldsave_end) = {
484            let mut end = self.oldsave.len() - self.nsave;
485            for &Branch { nsave, .. } in &self.stack[count + 1..] {
486                end -= nsave;
487            }
488            let start = end - self.stack[count].nsave;
489            (start, end)
490        };
491        let mut saved = BTreeSet::new();
492        // keep all the old saves of our branch (they're all for different slots)
493        for &Save { slot, .. } in &self.oldsave[oldsave_start..oldsave_end] {
494            saved.insert(slot);
495        }
496        let mut oldsave_ix = oldsave_end;
497        // for other old saves, keep them only if they're for a slot that we haven't saved yet
498        for ix in oldsave_end..self.oldsave.len() {
499            let Save { slot, .. } = self.oldsave[ix];
500            let new_slot = saved.insert(slot);
501            if new_slot {
502                // put the save we want to keep (ix) after the ones we already have (oldsave_ix)
503                // note that it's fine if the indexes are the same (then swapping is a no-op)
504                self.oldsave.swap(oldsave_ix, ix);
505                oldsave_ix += 1;
506            }
507        }
508        self.stack.truncate(count);
509        self.oldsave.truncate(oldsave_ix);
510        self.nsave = oldsave_ix - oldsave_start;
511    }
512
513    #[inline]
514    #[allow(unused_variables)]
515    fn trace_stack(&self, operation: &str) {
516        #[cfg(feature = "std")]
517        if self.options & OPTION_TRACE != 0 {
518            println!("stack after {}: {:?}", operation, self.stack);
519        }
520    }
521}
522
523fn codepoint_len_at(s: &str, ix: usize) -> usize {
524    codepoint_len(s.as_bytes()[ix])
525}
526
527#[inline]
528fn matches_literal(s: &str, ix: usize, end: usize, literal: &str) -> bool {
529    // Compare as bytes because the literal might be a single byte char whereas ix
530    // points to a multibyte char. Comparing with str would result in an error like
531    // "byte index N is not a char boundary".
532    end <= s.len() && &s.as_bytes()[ix..end] == literal.as_bytes()
533}
534
535fn matches_literal_casei(s: &str, ix: usize, end: usize, literal: &str) -> bool {
536    if end > s.len() {
537        return false;
538    }
539    if matches_literal(s, ix, end, literal) {
540        return true;
541    }
542    if !s.is_char_boundary(ix) || !s.is_char_boundary(end) {
543        return false;
544    }
545    if s[ix..end].is_ascii() {
546        return s[ix..end].eq_ignore_ascii_case(literal);
547    }
548
549    // text captured and being backreferenced is not ascii, so we utilize regex-automata's case insensitive matching
550    use regex_syntax::ast::*;
551    let span = Span::splat(Position::new(0, 0, 0));
552    let literals = literal
553        .chars()
554        .map(|c| {
555            Ast::literal(Literal {
556                span,
557                kind: LiteralKind::Verbatim,
558                c,
559            })
560        })
561        .collect();
562    let ast = Ast::concat(Concat {
563        span,
564        asts: literals,
565    });
566
567    let mut translator = regex_syntax::hir::translate::TranslatorBuilder::new()
568        .case_insensitive(true)
569        .build();
570    let hir = translator.translate(literal, &ast).unwrap();
571
572    use regex_automata::meta::Builder as RaBuilder;
573    let re = RaBuilder::new()
574        .build_from_hir(&hir)
575        .expect("literal hir should get built successfully");
576    re.find(&s[ix..end]).is_some()
577}
578
579/// Helper function to store capture group positions from inner_slots into state.
580/// This is used by both Delegate and BackwardsDelegate instructions.
581#[inline]
582fn store_capture_groups(
583    state: &mut State,
584    inner_slots: &[Option<NonMaxUsize>],
585    range: CaptureGroupRange,
586) {
587    let start_group = range.start();
588    let end_group = range.end();
589    for i in 0..(end_group - start_group) {
590        let slot = (start_group + i) * 2;
591        if let Some(start) = inner_slots[(i + 1) * 2] {
592            let end = inner_slots[(i + 1) * 2 + 1].unwrap();
593            state.save(slot, start.get());
594            state.save(slot + 1, end.get());
595        } else {
596            state.save(slot, usize::MAX);
597            state.save(slot + 1, usize::MAX);
598        }
599    }
600}
601
602/// Run the program with trace printing for debugging.
603pub fn run_trace(prog: &Prog, s: &str, pos: usize) -> Result<Option<Vec<usize>>> {
604    run(prog, s, pos, OPTION_TRACE, &RegexOptions::default())
605}
606
607/// Run the program with default options.
608pub fn run_default(prog: &Prog, s: &str, pos: usize) -> Result<Option<Vec<usize>>> {
609    run(prog, s, pos, 0, &RegexOptions::default())
610}
611
612/// Run the program with options.
613#[allow(clippy::cognitive_complexity)]
614pub(crate) fn run(
615    prog: &Prog,
616    s: &str,
617    pos: usize,
618    option_flags: u32,
619    options: &RegexOptions,
620) -> Result<Option<Vec<usize>>> {
621    let mut state = State::new(prog.n_saves, MAX_STACK, option_flags);
622    let mut inner_slots: Vec<Option<NonMaxUsize>> = Vec::new();
623    let look_matcher = LookMatcher::new();
624    #[cfg(feature = "std")]
625    if option_flags & OPTION_TRACE != 0 {
626        println!("pos\tinstruction");
627    }
628    let mut backtrack_count = 0;
629    let mut pc = 0;
630    let mut ix = pos;
631    loop {
632        // break from this loop to fail, causes stack to pop
633        'fail: loop {
634            #[cfg(feature = "std")]
635            if option_flags & OPTION_TRACE != 0 {
636                println!("{}\t{} {:?}", ix, pc, prog.body[pc]);
637            }
638            match prog.body[pc] {
639                Insn::End => {
640                    // save of end position into slot 1 is now done
641                    // with an explicit group; we might want to
642                    // optimize that.
643                    //state.saves[1] = ix;
644                    #[cfg(feature = "std")]
645                    if option_flags & OPTION_TRACE != 0 {
646                        println!("saves: {:?}", state.saves);
647                    }
648                    if let Some(&slot1) = state.saves.get(1) {
649                        // With some features like keep out (\K), the match start can be after
650                        // the match end. Cap the start to <= end.
651                        if state.get(0) > slot1 {
652                            state.save(0, slot1);
653                        }
654                    }
655                    return Ok(Some(state.saves));
656                }
657                Insn::Any => {
658                    if ix < s.len() {
659                        ix += codepoint_len_at(s, ix);
660                    } else {
661                        break 'fail;
662                    }
663                }
664                Insn::AnyNoNL => {
665                    if ix < s.len() && s.as_bytes()[ix] != b'\n' {
666                        ix += codepoint_len_at(s, ix);
667                    } else {
668                        break 'fail;
669                    }
670                }
671                Insn::Lit(ref val) => {
672                    let ix_end = ix + val.len();
673                    if !matches_literal(s, ix, ix_end, val) {
674                        break 'fail;
675                    }
676                    ix = ix_end
677                }
678                Insn::Assertion(assertion) => {
679                    if !match assertion {
680                        Assertion::StartText => look_matcher.is_start(s.as_bytes(), ix),
681                        Assertion::EndText => look_matcher.is_end(s.as_bytes(), ix),
682                        Assertion::StartLine { crlf: false } => {
683                            look_matcher.is_start_lf(s.as_bytes(), ix)
684                        }
685                        Assertion::StartLine { crlf: true } => {
686                            look_matcher.is_start_crlf(s.as_bytes(), ix)
687                        }
688                        Assertion::EndLine { crlf: false } => {
689                            look_matcher.is_end_lf(s.as_bytes(), ix)
690                        }
691                        Assertion::EndLine { crlf: true } => {
692                            look_matcher.is_end_crlf(s.as_bytes(), ix)
693                        }
694                        Assertion::LeftWordBoundary => look_matcher
695                            .is_word_start_unicode(s.as_bytes(), ix)
696                            .unwrap(),
697                        Assertion::RightWordBoundary => {
698                            look_matcher.is_word_end_unicode(s.as_bytes(), ix).unwrap()
699                        }
700                        Assertion::LeftWordHalfBoundary => look_matcher
701                            .is_word_start_half_unicode(s.as_bytes(), ix)
702                            .unwrap(),
703                        Assertion::RightWordHalfBoundary => look_matcher
704                            .is_word_end_half_unicode(s.as_bytes(), ix)
705                            .unwrap(),
706                        Assertion::WordBoundary => {
707                            look_matcher.is_word_unicode(s.as_bytes(), ix).unwrap()
708                        }
709                        Assertion::NotWordBoundary => look_matcher
710                            .is_word_unicode_negate(s.as_bytes(), ix)
711                            .unwrap(),
712                    } {
713                        break 'fail;
714                    }
715                }
716                Insn::Split(x, y) => {
717                    state.push(y, ix)?;
718                    pc = x;
719                    continue;
720                }
721                Insn::Jmp(target) => {
722                    pc = target;
723                    continue;
724                }
725                Insn::Save(slot) => state.save(slot, ix),
726                Insn::Save0(slot) => state.save(slot, 0),
727                Insn::Restore(slot) => ix = state.get(slot),
728                Insn::RepeatGr {
729                    lo,
730                    hi,
731                    next,
732                    repeat,
733                } => {
734                    let repcount = state.get(repeat);
735                    if repcount == hi {
736                        pc = next;
737                        continue;
738                    }
739                    state.save(repeat, repcount + 1);
740                    if repcount >= lo {
741                        state.push(next, ix)?;
742                    }
743                }
744                Insn::RepeatNg {
745                    lo,
746                    hi,
747                    next,
748                    repeat,
749                } => {
750                    let repcount = state.get(repeat);
751                    if repcount == hi {
752                        pc = next;
753                        continue;
754                    }
755                    state.save(repeat, repcount + 1);
756                    if repcount >= lo {
757                        state.push(pc + 1, ix)?;
758                        pc = next;
759                        continue;
760                    }
761                }
762                Insn::RepeatEpsilonGr {
763                    lo,
764                    next,
765                    repeat,
766                    check,
767                } => {
768                    let repcount = state.get(repeat);
769                    if repcount > 0 && state.get(check) == ix {
770                        // zero-length match on repeat, then move to next instruction
771                        pc = next;
772                        continue;
773                    }
774                    state.save(repeat, repcount + 1);
775                    if repcount >= lo {
776                        state.save(check, ix);
777                        state.push(next, ix)?;
778                    }
779                }
780                Insn::RepeatEpsilonNg {
781                    lo,
782                    next,
783                    repeat,
784                    check,
785                } => {
786                    let repcount = state.get(repeat);
787                    if repcount > 0 && state.get(check) == ix {
788                        // zero-length match on repeat, then move to next instruction
789                        pc = next;
790                        continue;
791                    }
792                    state.save(repeat, repcount + 1);
793                    if repcount >= lo {
794                        state.save(check, ix);
795                        state.push(pc + 1, ix)?;
796                        pc = next;
797                        continue;
798                    }
799                }
800                Insn::GoBack(count) => {
801                    for _ in 0..count {
802                        if ix == 0 {
803                            break 'fail;
804                        }
805                        ix = prev_codepoint_ix(s, ix);
806                    }
807                }
808                Insn::FailNegativeLookAround => {
809                    // Reaching this instruction means that the body of the
810                    // look-around matched. Because it's a *negative* look-around,
811                    // that means the look-around itself should fail (not match).
812                    // But before, we need to discard all the states that have
813                    // been pushed with the look-around, because we don't want to
814                    // explore them.
815                    loop {
816                        let (popped_pc, _) = state.pop();
817                        if popped_pc == pc + 1 {
818                            // We've reached the state that would jump us to
819                            // after the look-around (in case the look-around
820                            // succeeded). That means we popped enough states.
821                            break;
822                        }
823                    }
824                    break 'fail;
825                }
826                Insn::Backref { slot, casei } => {
827                    let lo = state.get(slot);
828                    if lo == usize::MAX {
829                        // Referenced group hasn't matched, so the backref doesn't match either
830                        break 'fail;
831                    }
832                    let hi = state.get(slot + 1);
833                    if hi == usize::MAX {
834                        // Referenced group hasn't matched, so the backref doesn't match either
835                        break 'fail;
836                    }
837                    let ref_text = &s[lo..hi];
838                    let ix_end = ix + ref_text.len();
839                    if casei {
840                        if !matches_literal_casei(s, ix, ix_end, ref_text) {
841                            break 'fail;
842                        }
843                    } else if !matches_literal(s, ix, ix_end, ref_text) {
844                        break 'fail;
845                    }
846                    ix = ix_end;
847                }
848                Insn::BackrefExistsCondition(group) => {
849                    let lo = state.get(group * 2);
850                    if lo == usize::MAX {
851                        // Referenced group hasn't matched, so the backref doesn't match either
852                        break 'fail;
853                    }
854                }
855                #[cfg(feature = "variable-lookbehinds")]
856                Insn::BackwardsDelegate(ReverseBackwardsDelegate {
857                    ref dfa,
858                    ref cache_pool,
859                    pattern: _,
860                    ref capture_group_extraction_inner,
861                    capture_groups,
862                }) => {
863                    // Use regex-automata to search backwards from current position
864                    let mut cache_guard = cache_pool.get();
865                    let input = Input::new(s).anchored(Anchored::Yes).range(0..ix);
866
867                    match dfa.try_search_rev(&mut cache_guard, &input) {
868                        Ok(Some(match_result)) => {
869                            // Update ix to the start position of the match
870                            let match_start = match_result.offset();
871
872                            if let Some(inner) = capture_group_extraction_inner {
873                                if let Some(range) = capture_groups {
874                                    // There are capture groups, need to search forward to populate them
875                                    let forward_input =
876                                        Input::new(s).span(match_start..ix).anchored(Anchored::Yes);
877                                    inner_slots.resize((range.end() - range.start() + 1) * 2, None);
878
879                                    if inner
880                                        .search_slots(&forward_input, &mut inner_slots)
881                                        .is_some()
882                                    {
883                                        // Store capture group positions
884                                        store_capture_groups(&mut state, &inner_slots, range);
885                                    } else {
886                                        break 'fail;
887                                    }
888                                } else {
889                                    // No groups, just update ix to the match start
890                                    ix = match_start;
891                                }
892                            } else {
893                                // No groups, just update ix to the match start
894                                ix = match_start;
895                            }
896                        }
897                        _ => break 'fail,
898                    }
899                }
900                Insn::BeginAtomic => {
901                    let count = state.backtrack_count();
902                    state.stack_push(count);
903                }
904                Insn::EndAtomic => {
905                    let count = state.stack_pop();
906                    state.backtrack_cut(count);
907                }
908                Insn::Delegate(Delegate {
909                    ref inner,
910                    pattern: _,
911                    capture_groups,
912                }) => {
913                    let input = Input::new(s).span(ix..s.len()).anchored(Anchored::Yes);
914                    if let Some(range) = capture_groups {
915                        // Has capture groups, need to extract them
916                        inner_slots.resize((range.end() - range.start() + 1) * 2, None);
917                        if inner.search_slots(&input, &mut inner_slots).is_some() {
918                            store_capture_groups(&mut state, &inner_slots, range);
919                            ix = inner_slots[1].unwrap().get();
920                        } else {
921                            break 'fail;
922                        }
923                    } else {
924                        // No groups, so we can use faster methods
925                        match inner.search_half(&input) {
926                            Some(m) => ix = m.offset(),
927                            _ => break 'fail,
928                        }
929                    }
930                }
931                Insn::ContinueFromPreviousMatchEnd { at_start } => {
932                    if ix > pos || option_flags & OPTION_SKIPPED_EMPTY_MATCH != 0 {
933                        // If \G is at the start of the pattern, we can fail early
934                        // instead of checking at each position in the haystack
935                        // because \G will never match at any other position
936                        if at_start && state.stack.len() == 1 {
937                            // The only item on the stack is from the Split instruction for non-anchored search
938                            // We can safely return None immediately
939                            return Ok(None);
940                        }
941                        break 'fail;
942                    }
943                }
944            }
945            pc += 1;
946        }
947        #[cfg(feature = "std")]
948        if option_flags & OPTION_TRACE != 0 {
949            println!("fail");
950        }
951        // "break 'fail" goes here
952        if state.stack.is_empty() {
953            return Ok(None);
954        }
955
956        backtrack_count += 1;
957        if backtrack_count > options.backtrack_limit {
958            return Err(Error::RuntimeError(RuntimeError::BacktrackLimitExceeded));
959        }
960
961        let (newpc, newix) = state.pop();
962        pc = newpc;
963        ix = newix;
964    }
965}
966
967#[cfg(test)]
968mod tests {
969    use super::*;
970    use quickcheck::{quickcheck, Arbitrary, Gen};
971
972    #[test]
973    fn state_push_pop() {
974        let mut state = State::new(1, MAX_STACK, 0);
975
976        state.push(0, 0).unwrap();
977        state.push(1, 1).unwrap();
978        assert_eq!(state.pop(), (1, 1));
979        assert_eq!(state.pop(), (0, 0));
980        assert!(state.stack.is_empty());
981
982        state.push(2, 2).unwrap();
983        assert_eq!(state.pop(), (2, 2));
984        assert!(state.stack.is_empty());
985    }
986
987    #[test]
988    fn state_save_override() {
989        let mut state = State::new(1, MAX_STACK, 0);
990        state.save(0, 10);
991        state.push(0, 0).unwrap();
992        state.save(0, 20);
993        assert_eq!(state.pop(), (0, 0));
994        assert_eq!(state.get(0), 10);
995    }
996
997    #[test]
998    fn state_save_override_twice() {
999        let mut state = State::new(1, MAX_STACK, 0);
1000        state.save(0, 10);
1001        state.push(0, 0).unwrap();
1002        state.save(0, 20);
1003        state.push(1, 1).unwrap();
1004        state.save(0, 30);
1005
1006        assert_eq!(state.get(0), 30);
1007        assert_eq!(state.pop(), (1, 1));
1008        assert_eq!(state.get(0), 20);
1009        assert_eq!(state.pop(), (0, 0));
1010        assert_eq!(state.get(0), 10);
1011    }
1012
1013    #[test]
1014    fn state_explicit_stack() {
1015        let mut state = State::new(1, MAX_STACK, 0);
1016        state.stack_push(11);
1017        state.stack_push(12);
1018
1019        state.push(100, 101).unwrap();
1020        state.stack_push(13);
1021        assert_eq!(state.stack_pop(), 13);
1022        state.stack_push(14);
1023        assert_eq!(state.pop(), (100, 101));
1024
1025        // Note: 14 is not there because it was pushed as part of the backtrack branch
1026        assert_eq!(state.stack_pop(), 12);
1027        assert_eq!(state.stack_pop(), 11);
1028    }
1029
1030    #[test]
1031    fn state_backtrack_cut_simple() {
1032        let mut state = State::new(2, MAX_STACK, 0);
1033        state.save(0, 1);
1034        state.save(1, 2);
1035
1036        let count = state.backtrack_count();
1037
1038        state.push(0, 0).unwrap();
1039        state.save(0, 3);
1040        assert_eq!(state.backtrack_count(), 1);
1041
1042        state.backtrack_cut(count);
1043        assert_eq!(state.backtrack_count(), 0);
1044        assert_eq!(state.get(0), 3);
1045        assert_eq!(state.get(1), 2);
1046    }
1047
1048    #[test]
1049    fn state_backtrack_cut_complex() {
1050        let mut state = State::new(2, MAX_STACK, 0);
1051        state.save(0, 1);
1052        state.save(1, 2);
1053
1054        state.push(0, 0).unwrap();
1055        state.save(0, 3);
1056
1057        let count = state.backtrack_count();
1058
1059        state.push(1, 1).unwrap();
1060        state.save(0, 4);
1061        state.push(2, 2).unwrap();
1062        state.save(1, 5);
1063        assert_eq!(state.backtrack_count(), 3);
1064
1065        state.backtrack_cut(count);
1066        assert_eq!(state.backtrack_count(), 1);
1067        assert_eq!(state.get(0), 4);
1068        assert_eq!(state.get(1), 5);
1069
1070        state.pop();
1071        assert_eq!(state.backtrack_count(), 0);
1072        // Check that oldsave were set correctly
1073        assert_eq!(state.get(0), 1);
1074        assert_eq!(state.get(1), 2);
1075    }
1076
1077    #[derive(Clone, Debug)]
1078    enum Operation {
1079        Push,
1080        Pop,
1081        Save(usize, usize),
1082    }
1083
1084    impl Arbitrary for Operation {
1085        fn arbitrary(g: &mut Gen) -> Self {
1086            match g.choose(&[0, 1, 2]) {
1087                Some(0) => Operation::Push,
1088                Some(1) => Operation::Pop,
1089                _ => Operation::Save(
1090                    *g.choose(&[0usize, 1, 2, 3, 4]).unwrap(),
1091                    usize::arbitrary(g),
1092                ),
1093            }
1094        }
1095    }
1096
1097    fn check_saves_for_operations(operations: Vec<Operation>) -> bool {
1098        let slots = operations
1099            .iter()
1100            .map(|o| match o {
1101                &Operation::Save(slot, _) => slot + 1,
1102                _ => 0,
1103            })
1104            .max()
1105            .unwrap_or(0);
1106        if slots == 0 {
1107            // No point checking if there's no save instructions
1108            return true;
1109        }
1110
1111        // Stack with the complete VM state (including saves)
1112        let mut stack = Vec::new();
1113        let mut saves = vec![usize::MAX; slots];
1114
1115        let mut state = State::new(slots, MAX_STACK, 0);
1116
1117        let mut expected = Vec::new();
1118        let mut actual = Vec::new();
1119
1120        for operation in operations {
1121            match operation {
1122                Operation::Push => {
1123                    // We're not checking pc and ix later, so don't bother
1124                    // putting in random values.
1125                    stack.push((0, 0, saves.clone()));
1126                    state.push(0, 0).unwrap();
1127                }
1128                Operation::Pop => {
1129                    // Note that because we generate the operations randomly
1130                    // there might be more pops than pushes. So ignore a pop
1131                    // if the stack was empty.
1132                    if let Some((_, _, previous_saves)) = stack.pop() {
1133                        saves = previous_saves;
1134                        state.pop();
1135                    }
1136                }
1137                Operation::Save(slot, value) => {
1138                    saves[slot] = value;
1139                    state.save(slot, value);
1140                }
1141            }
1142
1143            // Remember state of saves for checking later
1144            expected.push(saves.clone());
1145            let mut actual_saves = vec![usize::MAX; slots];
1146            for (i, item) in actual_saves.iter_mut().enumerate().take(slots) {
1147                *item = state.get(i);
1148            }
1149            actual.push(actual_saves);
1150        }
1151
1152        expected == actual
1153    }
1154
1155    quickcheck! {
1156        fn state_save_quickcheck(operations: Vec<Operation>) -> bool {
1157            check_saves_for_operations(operations)
1158        }
1159    }
1160}