fancy_regex/vm.rs
1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Backtracking VM for implementing fancy regexes.
22//!
23//! Read <https://swtch.com/~rsc/regexp/regexp2.html> for a good introduction for how this works.
24//!
25//! The VM executes a sequence of instructions (a program) against an input string. It keeps track
26//! of a program counter (PC) and an index into the string (IX). Execution can have one or more
27//! threads.
28//!
29//! One of the basic instructions is `Lit`, which matches a string against the input. If it matches,
30//! the PC advances to the next instruction and the IX to the position after the matched string.
31//! If not, the current thread is stopped because it failed.
32//!
33//! If execution reaches an `End` instruction, the program is successful because a match was found.
34//! If there are no more threads to execute, the program has failed to match.
35//!
36//! A very simple program for the regex `a`:
37//!
38//! ```text
39//! 0: Lit("a")
40//! 1: End
41//! ```
42//!
43//! The `Split` instruction causes execution to split into two threads. The first thread is executed
44//! with the current string index. If it fails, we reset the string index and resume execution with
45//! the second thread. That is what "backtracking" refers to. In order to do that, we keep a stack
46//! of threads (PC and IX) to try.
47//!
48//! Example program for the regex `ab|ac`:
49//!
50//! ```text
51//! 0: Split(1, 4)
52//! 1: Lit("a")
53//! 2: Lit("b")
54//! 3: Jmp(6)
55//! 4: Lit("a")
56//! 5: Lit("c")
57//! 6: End
58//! ```
59//!
60//! The `Jmp` instruction causes execution to jump to the specified instruction. In the example it
61//! is needed to separate the two threads.
62//!
63//! Let's step through execution with that program for the input `ac`:
64//!
65//! 1. We're at PC 0 and IX 0
66//! 2. `Split(1, 4)` means we save a thread with PC 4 and IX 0 for trying later
67//! 3. Continue at `Lit("a")` which matches, so we advance IX to 1
68//! 4. `Lit("b")` doesn't match at IX 1 (`"b" != "c"`), so the thread fails
69//! 5. We continue with the previously saved thread at PC 4 and IX 0 (backtracking)
70//! 6. Both `Lit("a")` and `Lit("c")` match and we reach `End` -> successful match (index 0 to 2)
71
72use alloc::collections::BTreeSet;
73use alloc::string::String;
74#[cfg(feature = "variable-lookbehinds")]
75use alloc::sync::Arc;
76use alloc::vec;
77use alloc::vec::Vec;
78use regex_automata::meta::Regex;
79use regex_automata::util::look::LookMatcher;
80use regex_automata::util::primitives::NonMaxUsize;
81use regex_automata::Anchored;
82use regex_automata::Input;
83
84#[cfg(feature = "variable-lookbehinds")]
85use regex_automata::util::pool::Pool;
86
87#[cfg(feature = "variable-lookbehinds")]
88pub(crate) type CachePoolFn = alloc::boxed::Box<
89 dyn Fn() -> regex_automata::hybrid::dfa::Cache
90 + Send
91 + Sync
92 + core::panic::UnwindSafe
93 + core::panic::RefUnwindSafe,
94>;
95
96use crate::error::RuntimeError;
97use crate::prev_codepoint_ix;
98use crate::Assertion;
99use crate::Error;
100use crate::Formatter;
101use crate::Result;
102use crate::{codepoint_len, RegexOptions};
103
104/// Enable tracing of VM execution. Only for debugging/investigating.
105const OPTION_TRACE: u32 = 1 << 0;
106/// When iterating over all matches within a text (e.g. with `find_iter`), empty matches need to be
107/// handled specially. If we kept matching at the same position, we'd never stop. So what we do
108/// after we've had an empty match, is to advance the position where matching is attempted.
109/// If `\G` is used in the pattern, that means it no longer matches. If we didn't tell the VM about
110/// the fact that we skipped because of an empty match, it would still treat `\G` as matching. So
111/// this option is for communicating that to the VM. Phew.
112pub(crate) const OPTION_SKIPPED_EMPTY_MATCH: u32 = 1 << 1;
113
114// TODO: make configurable
115const MAX_STACK: usize = 1_000_000;
116
117/// Represents a range of capture groups by storing the first and last group numbers.
118#[derive(Clone, Copy, Debug, PartialEq, Eq)]
119pub struct CaptureGroupRange(pub usize, pub usize);
120
121impl CaptureGroupRange {
122 /// Returns the start (first) group number.
123 pub fn start(&self) -> usize {
124 self.0
125 }
126
127 /// Returns the end (last) group number.
128 pub fn end(&self) -> usize {
129 self.1
130 }
131
132 /// Converts this range to an Option, returning None if start equals end (no capture groups).
133 pub fn to_option_if_non_empty(self) -> Option<Self> {
134 if self.start() == self.end() {
135 None
136 } else {
137 Some(self)
138 }
139 }
140}
141
142#[derive(Clone)]
143/// Delegate matching to the regex crate
144pub struct Delegate {
145 /// The regex
146 pub inner: Regex,
147 /// The regex pattern as a string
148 pub pattern: String,
149 /// The range of capture groups. None if there are no capture groups.
150 pub capture_groups: Option<CaptureGroupRange>,
151}
152
153impl core::fmt::Debug for Delegate {
154 fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
155 // Ensures it fails to compile if the struct changes
156 let Self {
157 inner: _,
158 pattern,
159 capture_groups,
160 } = self;
161
162 f.debug_struct("Delegate")
163 .field("pattern", pattern)
164 .field("capture_groups", capture_groups)
165 .finish()
166 }
167}
168
169#[cfg(feature = "variable-lookbehinds")]
170/// Delegate matching in reverse to regex-automata
171pub struct ReverseBackwardsDelegate {
172 /// The regex pattern as a string which will be matched in reverse, in a backwards direction
173 pub pattern: String,
174 /// The delegate regex to match backwards (wrapped in Arc for efficient cloning)
175 pub(crate) dfa: Arc<regex_automata::hybrid::dfa::DFA>,
176 /// Cache pool for DFA searches
177 pub(crate) cache_pool: Pool<regex_automata::hybrid::dfa::Cache, CachePoolFn>,
178 /// The forward regex for capture group extraction
179 pub(crate) capture_group_extraction_inner: Option<Regex>,
180 /// The range of capture groups. None if there are no capture groups.
181 pub capture_groups: Option<CaptureGroupRange>,
182}
183
184#[cfg(feature = "variable-lookbehinds")]
185impl Clone for ReverseBackwardsDelegate {
186 fn clone(&self) -> Self {
187 let dfa_for_closure = Arc::clone(&self.dfa);
188 let create: CachePoolFn = alloc::boxed::Box::new(move || dfa_for_closure.create_cache());
189 Self {
190 pattern: self.pattern.clone(),
191 cache_pool: Pool::new(create),
192 dfa: Arc::clone(&self.dfa),
193 capture_group_extraction_inner: self.capture_group_extraction_inner.clone(),
194 capture_groups: self.capture_groups,
195 }
196 }
197}
198
199#[cfg(feature = "variable-lookbehinds")]
200impl core::fmt::Debug for ReverseBackwardsDelegate {
201 fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
202 // Ensures it fails to compile if the struct changes
203 let Self {
204 pattern,
205 dfa: _,
206 cache_pool: _,
207 capture_group_extraction_inner: _,
208 capture_groups,
209 } = self;
210
211 f.debug_struct("ReverseBackwardsDelegate")
212 .field("pattern", pattern)
213 .field("capture_groups", capture_groups)
214 .finish()
215 }
216}
217
218/// Instruction of the VM.
219#[derive(Debug)]
220pub enum Insn {
221 /// Successful end of program
222 End,
223 /// Match any character (including newline)
224 Any,
225 /// Match any character (not including newline)
226 AnyNoNL,
227 /// Assertions
228 Assertion(Assertion),
229 /// Match the literal string at the current index
230 Lit(String), // should be cow?
231 /// Split execution into two threads. The two fields are positions of instructions. Execution
232 /// first tries the first thread. If that fails, the second position is tried.
233 Split(usize, usize),
234 /// Jump to instruction at position
235 Jmp(usize),
236 /// Save the current string index into the specified slot
237 Save(usize),
238 /// Save `0` into the specified slot
239 Save0(usize),
240 /// Set the string index to the value that was saved in the specified slot
241 Restore(usize),
242 /// Repeat greedily (match as much as possible)
243 RepeatGr {
244 /// Minimum number of matches
245 lo: usize,
246 /// Maximum number of matches
247 hi: usize,
248 /// The instruction after the repeat
249 next: usize,
250 /// The slot for keeping track of the number of repetitions
251 repeat: usize,
252 },
253 /// Repeat non-greedily (prefer matching as little as possible)
254 RepeatNg {
255 /// Minimum number of matches
256 lo: usize,
257 /// Maximum number of matches
258 hi: usize,
259 /// The instruction after the repeat
260 next: usize,
261 /// The slot for keeping track of the number of repetitions
262 repeat: usize,
263 },
264 /// Repeat greedily and prevent infinite loops from empty matches
265 RepeatEpsilonGr {
266 /// Minimum number of matches
267 lo: usize,
268 /// The instruction after the repeat
269 next: usize,
270 /// The slot for keeping track of the number of repetitions
271 repeat: usize,
272 /// The slot for saving the previous IX to check if we had an empty match
273 check: usize,
274 },
275 /// Repeat non-greedily and prevent infinite loops from empty matches
276 RepeatEpsilonNg {
277 /// Minimum number of matches
278 lo: usize,
279 /// The instruction after the repeat
280 next: usize,
281 /// The slot for keeping track of the number of repetitions
282 repeat: usize,
283 /// The slot for saving the previous IX to check if we had an empty match
284 check: usize,
285 },
286 /// Negative look-around failed
287 FailNegativeLookAround,
288 /// Set IX back by the specified number of characters
289 GoBack(usize),
290 /// Back reference to a group number to check
291 Backref {
292 /// The save slot representing the start of the capture group
293 slot: usize,
294 /// Whether the backref should be matched case insensitively
295 casei: bool,
296 },
297 /// Begin of atomic group
298 BeginAtomic,
299 /// End of atomic group
300 EndAtomic,
301 /// Delegate matching to the regex crate
302 Delegate(Delegate),
303 /// Anchor to match at the position where the previous match ended.
304 ContinueFromPreviousMatchEnd {
305 /// Whether this is at the start of the pattern (allowing early exit on failure)
306 at_start: bool,
307 },
308 /// Continue only if the specified capture group has already been populated as part of the match
309 BackrefExistsCondition(usize),
310 #[cfg(feature = "variable-lookbehinds")]
311 /// Reverse lookbehind using regex-automata for variable-sized patterns
312 BackwardsDelegate(ReverseBackwardsDelegate),
313}
314
315/// Sequence of instructions for the VM to execute.
316#[derive(Debug)]
317pub struct Prog {
318 /// Instructions of the program
319 pub body: Vec<Insn>,
320 n_saves: usize,
321}
322
323impl Prog {
324 pub(crate) fn new(body: Vec<Insn>, n_saves: usize) -> Prog {
325 Prog { body, n_saves }
326 }
327
328 #[doc(hidden)]
329 pub(crate) fn debug_print(&self, writer: &mut Formatter<'_>) -> core::fmt::Result {
330 for (i, insn) in self.body.iter().enumerate() {
331 writeln!(writer, "{:3}: {:?}", i, insn)?;
332 }
333 Ok(())
334 }
335}
336
337#[derive(Debug)]
338struct Branch {
339 pc: usize,
340 ix: usize,
341 nsave: usize,
342}
343
344#[derive(Debug)]
345struct Save {
346 slot: usize,
347 value: usize,
348}
349
350struct State {
351 /// Saved values indexed by slot. Mostly indices to s, but can be repeat values etc.
352 /// Always contains the saves of the current state.
353 saves: Vec<usize>,
354 /// Stack of backtrack branches.
355 stack: Vec<Branch>,
356 /// Old saves (slot, value)
357 oldsave: Vec<Save>,
358 /// Number of saves at the end of `oldsave` that need to be restored to `saves` on pop
359 nsave: usize,
360 explicit_sp: usize,
361 /// Maximum size of the stack. If the size would be exceeded during execution, a `StackOverflow`
362 /// error is raised.
363 max_stack: usize,
364 #[allow(dead_code)]
365 options: u32,
366}
367
368// Each element in the stack conceptually represents the entire state
369// of the machine: the pc (index into prog), the index into the
370// string, and the entire vector of saves. However, copying the save
371// vector on every push/pop would be inefficient, so instead we use a
372// copy-on-write approach for each slot within the save vector. The
373// top `nsave` elements in `oldsave` represent the delta from the
374// current machine state to the top of stack.
375
376impl State {
377 fn new(n_saves: usize, max_stack: usize, options: u32) -> State {
378 State {
379 saves: vec![usize::MAX; n_saves],
380 stack: Vec::new(),
381 oldsave: Vec::new(),
382 nsave: 0,
383 explicit_sp: n_saves,
384 max_stack,
385 options,
386 }
387 }
388
389 // push a backtrack branch
390 fn push(&mut self, pc: usize, ix: usize) -> Result<()> {
391 if self.stack.len() < self.max_stack {
392 let nsave = self.nsave;
393 self.stack.push(Branch { pc, ix, nsave });
394 self.nsave = 0;
395 self.trace_stack("push");
396 Ok(())
397 } else {
398 Err(Error::RuntimeError(RuntimeError::StackOverflow))
399 }
400 }
401
402 // pop a backtrack branch
403 fn pop(&mut self) -> (usize, usize) {
404 for _ in 0..self.nsave {
405 let Save { slot, value } = self.oldsave.pop().unwrap();
406 self.saves[slot] = value;
407 }
408 let Branch { pc, ix, nsave } = self.stack.pop().unwrap();
409 self.nsave = nsave;
410 self.trace_stack("pop");
411 (pc, ix)
412 }
413
414 fn save(&mut self, slot: usize, val: usize) {
415 for i in 0..self.nsave {
416 // could avoid this iteration with some overhead; worth it?
417 if self.oldsave[self.oldsave.len() - i - 1].slot == slot {
418 // already saved, just update
419 self.saves[slot] = val;
420 return;
421 }
422 }
423 self.oldsave.push(Save {
424 slot,
425 value: self.saves[slot],
426 });
427 self.nsave += 1;
428 self.saves[slot] = val;
429
430 #[cfg(feature = "std")]
431 if self.options & OPTION_TRACE != 0 {
432 println!("saves: {:?}", self.saves);
433 }
434 }
435
436 fn get(&self, slot: usize) -> usize {
437 self.saves[slot]
438 }
439
440 // push a value onto the explicit stack; note: the entire contents of
441 // the explicit stack is saved and restored on backtrack.
442 fn stack_push(&mut self, val: usize) {
443 if self.saves.len() == self.explicit_sp {
444 self.saves.push(self.explicit_sp + 1);
445 }
446 let explicit_sp = self.explicit_sp;
447 let sp = self.get(explicit_sp);
448 if self.saves.len() == sp {
449 self.saves.push(val);
450 } else {
451 self.save(sp, val);
452 }
453 self.save(explicit_sp, sp + 1);
454 }
455
456 // pop a value from the explicit stack
457 fn stack_pop(&mut self) -> usize {
458 let explicit_sp = self.explicit_sp;
459 let sp = self.get(explicit_sp) - 1;
460 let result = self.get(sp);
461 self.save(explicit_sp, sp);
462 result
463 }
464
465 /// Get the current number of backtrack branches
466 fn backtrack_count(&self) -> usize {
467 self.stack.len()
468 }
469
470 /// Discard backtrack branches that were pushed since the call to `backtrack_count`.
471 ///
472 /// What we want:
473 /// * Keep the current `saves` as they are
474 /// * Only keep `count` backtrack branches on `stack`, discard the rest
475 /// * Keep the first `oldsave` for each slot, discard the rest (multiple pushes might have
476 /// happened with saves to the same slot)
477 fn backtrack_cut(&mut self, count: usize) {
478 if self.stack.len() == count {
479 // no backtrack branches to discard, all good
480 return;
481 }
482 // start and end indexes of old saves for the branch we're cutting to
483 let (oldsave_start, oldsave_end) = {
484 let mut end = self.oldsave.len() - self.nsave;
485 for &Branch { nsave, .. } in &self.stack[count + 1..] {
486 end -= nsave;
487 }
488 let start = end - self.stack[count].nsave;
489 (start, end)
490 };
491 let mut saved = BTreeSet::new();
492 // keep all the old saves of our branch (they're all for different slots)
493 for &Save { slot, .. } in &self.oldsave[oldsave_start..oldsave_end] {
494 saved.insert(slot);
495 }
496 let mut oldsave_ix = oldsave_end;
497 // for other old saves, keep them only if they're for a slot that we haven't saved yet
498 for ix in oldsave_end..self.oldsave.len() {
499 let Save { slot, .. } = self.oldsave[ix];
500 let new_slot = saved.insert(slot);
501 if new_slot {
502 // put the save we want to keep (ix) after the ones we already have (oldsave_ix)
503 // note that it's fine if the indexes are the same (then swapping is a no-op)
504 self.oldsave.swap(oldsave_ix, ix);
505 oldsave_ix += 1;
506 }
507 }
508 self.stack.truncate(count);
509 self.oldsave.truncate(oldsave_ix);
510 self.nsave = oldsave_ix - oldsave_start;
511 }
512
513 #[inline]
514 #[allow(unused_variables)]
515 fn trace_stack(&self, operation: &str) {
516 #[cfg(feature = "std")]
517 if self.options & OPTION_TRACE != 0 {
518 println!("stack after {}: {:?}", operation, self.stack);
519 }
520 }
521}
522
523fn codepoint_len_at(s: &str, ix: usize) -> usize {
524 codepoint_len(s.as_bytes()[ix])
525}
526
527#[inline]
528fn matches_literal(s: &str, ix: usize, end: usize, literal: &str) -> bool {
529 // Compare as bytes because the literal might be a single byte char whereas ix
530 // points to a multibyte char. Comparing with str would result in an error like
531 // "byte index N is not a char boundary".
532 end <= s.len() && &s.as_bytes()[ix..end] == literal.as_bytes()
533}
534
535fn matches_literal_casei(s: &str, ix: usize, end: usize, literal: &str) -> bool {
536 if end > s.len() {
537 return false;
538 }
539 if matches_literal(s, ix, end, literal) {
540 return true;
541 }
542 if !s.is_char_boundary(ix) || !s.is_char_boundary(end) {
543 return false;
544 }
545 if s[ix..end].is_ascii() {
546 return s[ix..end].eq_ignore_ascii_case(literal);
547 }
548
549 // text captured and being backreferenced is not ascii, so we utilize regex-automata's case insensitive matching
550 use regex_syntax::ast::*;
551 let span = Span::splat(Position::new(0, 0, 0));
552 let literals = literal
553 .chars()
554 .map(|c| {
555 Ast::literal(Literal {
556 span,
557 kind: LiteralKind::Verbatim,
558 c,
559 })
560 })
561 .collect();
562 let ast = Ast::concat(Concat {
563 span,
564 asts: literals,
565 });
566
567 let mut translator = regex_syntax::hir::translate::TranslatorBuilder::new()
568 .case_insensitive(true)
569 .build();
570 let hir = translator.translate(literal, &ast).unwrap();
571
572 use regex_automata::meta::Builder as RaBuilder;
573 let re = RaBuilder::new()
574 .build_from_hir(&hir)
575 .expect("literal hir should get built successfully");
576 re.find(&s[ix..end]).is_some()
577}
578
579/// Helper function to store capture group positions from inner_slots into state.
580/// This is used by both Delegate and BackwardsDelegate instructions.
581#[inline]
582fn store_capture_groups(
583 state: &mut State,
584 inner_slots: &[Option<NonMaxUsize>],
585 range: CaptureGroupRange,
586) {
587 let start_group = range.start();
588 let end_group = range.end();
589 for i in 0..(end_group - start_group) {
590 let slot = (start_group + i) * 2;
591 if let Some(start) = inner_slots[(i + 1) * 2] {
592 let end = inner_slots[(i + 1) * 2 + 1].unwrap();
593 state.save(slot, start.get());
594 state.save(slot + 1, end.get());
595 } else {
596 state.save(slot, usize::MAX);
597 state.save(slot + 1, usize::MAX);
598 }
599 }
600}
601
602/// Run the program with trace printing for debugging.
603pub fn run_trace(prog: &Prog, s: &str, pos: usize) -> Result<Option<Vec<usize>>> {
604 run(prog, s, pos, OPTION_TRACE, &RegexOptions::default())
605}
606
607/// Run the program with default options.
608pub fn run_default(prog: &Prog, s: &str, pos: usize) -> Result<Option<Vec<usize>>> {
609 run(prog, s, pos, 0, &RegexOptions::default())
610}
611
612/// Run the program with options.
613#[allow(clippy::cognitive_complexity)]
614pub(crate) fn run(
615 prog: &Prog,
616 s: &str,
617 pos: usize,
618 option_flags: u32,
619 options: &RegexOptions,
620) -> Result<Option<Vec<usize>>> {
621 let mut state = State::new(prog.n_saves, MAX_STACK, option_flags);
622 let mut inner_slots: Vec<Option<NonMaxUsize>> = Vec::new();
623 let look_matcher = LookMatcher::new();
624 #[cfg(feature = "std")]
625 if option_flags & OPTION_TRACE != 0 {
626 println!("pos\tinstruction");
627 }
628 let mut backtrack_count = 0;
629 let mut pc = 0;
630 let mut ix = pos;
631 loop {
632 // break from this loop to fail, causes stack to pop
633 'fail: loop {
634 #[cfg(feature = "std")]
635 if option_flags & OPTION_TRACE != 0 {
636 println!("{}\t{} {:?}", ix, pc, prog.body[pc]);
637 }
638 match prog.body[pc] {
639 Insn::End => {
640 // save of end position into slot 1 is now done
641 // with an explicit group; we might want to
642 // optimize that.
643 //state.saves[1] = ix;
644 #[cfg(feature = "std")]
645 if option_flags & OPTION_TRACE != 0 {
646 println!("saves: {:?}", state.saves);
647 }
648 if let Some(&slot1) = state.saves.get(1) {
649 // With some features like keep out (\K), the match start can be after
650 // the match end. Cap the start to <= end.
651 if state.get(0) > slot1 {
652 state.save(0, slot1);
653 }
654 }
655 return Ok(Some(state.saves));
656 }
657 Insn::Any => {
658 if ix < s.len() {
659 ix += codepoint_len_at(s, ix);
660 } else {
661 break 'fail;
662 }
663 }
664 Insn::AnyNoNL => {
665 if ix < s.len() && s.as_bytes()[ix] != b'\n' {
666 ix += codepoint_len_at(s, ix);
667 } else {
668 break 'fail;
669 }
670 }
671 Insn::Lit(ref val) => {
672 let ix_end = ix + val.len();
673 if !matches_literal(s, ix, ix_end, val) {
674 break 'fail;
675 }
676 ix = ix_end
677 }
678 Insn::Assertion(assertion) => {
679 if !match assertion {
680 Assertion::StartText => look_matcher.is_start(s.as_bytes(), ix),
681 Assertion::EndText => look_matcher.is_end(s.as_bytes(), ix),
682 Assertion::StartLine { crlf: false } => {
683 look_matcher.is_start_lf(s.as_bytes(), ix)
684 }
685 Assertion::StartLine { crlf: true } => {
686 look_matcher.is_start_crlf(s.as_bytes(), ix)
687 }
688 Assertion::EndLine { crlf: false } => {
689 look_matcher.is_end_lf(s.as_bytes(), ix)
690 }
691 Assertion::EndLine { crlf: true } => {
692 look_matcher.is_end_crlf(s.as_bytes(), ix)
693 }
694 Assertion::LeftWordBoundary => look_matcher
695 .is_word_start_unicode(s.as_bytes(), ix)
696 .unwrap(),
697 Assertion::RightWordBoundary => {
698 look_matcher.is_word_end_unicode(s.as_bytes(), ix).unwrap()
699 }
700 Assertion::LeftWordHalfBoundary => look_matcher
701 .is_word_start_half_unicode(s.as_bytes(), ix)
702 .unwrap(),
703 Assertion::RightWordHalfBoundary => look_matcher
704 .is_word_end_half_unicode(s.as_bytes(), ix)
705 .unwrap(),
706 Assertion::WordBoundary => {
707 look_matcher.is_word_unicode(s.as_bytes(), ix).unwrap()
708 }
709 Assertion::NotWordBoundary => look_matcher
710 .is_word_unicode_negate(s.as_bytes(), ix)
711 .unwrap(),
712 } {
713 break 'fail;
714 }
715 }
716 Insn::Split(x, y) => {
717 state.push(y, ix)?;
718 pc = x;
719 continue;
720 }
721 Insn::Jmp(target) => {
722 pc = target;
723 continue;
724 }
725 Insn::Save(slot) => state.save(slot, ix),
726 Insn::Save0(slot) => state.save(slot, 0),
727 Insn::Restore(slot) => ix = state.get(slot),
728 Insn::RepeatGr {
729 lo,
730 hi,
731 next,
732 repeat,
733 } => {
734 let repcount = state.get(repeat);
735 if repcount == hi {
736 pc = next;
737 continue;
738 }
739 state.save(repeat, repcount + 1);
740 if repcount >= lo {
741 state.push(next, ix)?;
742 }
743 }
744 Insn::RepeatNg {
745 lo,
746 hi,
747 next,
748 repeat,
749 } => {
750 let repcount = state.get(repeat);
751 if repcount == hi {
752 pc = next;
753 continue;
754 }
755 state.save(repeat, repcount + 1);
756 if repcount >= lo {
757 state.push(pc + 1, ix)?;
758 pc = next;
759 continue;
760 }
761 }
762 Insn::RepeatEpsilonGr {
763 lo,
764 next,
765 repeat,
766 check,
767 } => {
768 let repcount = state.get(repeat);
769 if repcount > 0 && state.get(check) == ix {
770 // zero-length match on repeat, then move to next instruction
771 pc = next;
772 continue;
773 }
774 state.save(repeat, repcount + 1);
775 if repcount >= lo {
776 state.save(check, ix);
777 state.push(next, ix)?;
778 }
779 }
780 Insn::RepeatEpsilonNg {
781 lo,
782 next,
783 repeat,
784 check,
785 } => {
786 let repcount = state.get(repeat);
787 if repcount > 0 && state.get(check) == ix {
788 // zero-length match on repeat, then move to next instruction
789 pc = next;
790 continue;
791 }
792 state.save(repeat, repcount + 1);
793 if repcount >= lo {
794 state.save(check, ix);
795 state.push(pc + 1, ix)?;
796 pc = next;
797 continue;
798 }
799 }
800 Insn::GoBack(count) => {
801 for _ in 0..count {
802 if ix == 0 {
803 break 'fail;
804 }
805 ix = prev_codepoint_ix(s, ix);
806 }
807 }
808 Insn::FailNegativeLookAround => {
809 // Reaching this instruction means that the body of the
810 // look-around matched. Because it's a *negative* look-around,
811 // that means the look-around itself should fail (not match).
812 // But before, we need to discard all the states that have
813 // been pushed with the look-around, because we don't want to
814 // explore them.
815 loop {
816 let (popped_pc, _) = state.pop();
817 if popped_pc == pc + 1 {
818 // We've reached the state that would jump us to
819 // after the look-around (in case the look-around
820 // succeeded). That means we popped enough states.
821 break;
822 }
823 }
824 break 'fail;
825 }
826 Insn::Backref { slot, casei } => {
827 let lo = state.get(slot);
828 if lo == usize::MAX {
829 // Referenced group hasn't matched, so the backref doesn't match either
830 break 'fail;
831 }
832 let hi = state.get(slot + 1);
833 if hi == usize::MAX {
834 // Referenced group hasn't matched, so the backref doesn't match either
835 break 'fail;
836 }
837 let ref_text = &s[lo..hi];
838 let ix_end = ix + ref_text.len();
839 if casei {
840 if !matches_literal_casei(s, ix, ix_end, ref_text) {
841 break 'fail;
842 }
843 } else if !matches_literal(s, ix, ix_end, ref_text) {
844 break 'fail;
845 }
846 ix = ix_end;
847 }
848 Insn::BackrefExistsCondition(group) => {
849 let lo = state.get(group * 2);
850 if lo == usize::MAX {
851 // Referenced group hasn't matched, so the backref doesn't match either
852 break 'fail;
853 }
854 }
855 #[cfg(feature = "variable-lookbehinds")]
856 Insn::BackwardsDelegate(ReverseBackwardsDelegate {
857 ref dfa,
858 ref cache_pool,
859 pattern: _,
860 ref capture_group_extraction_inner,
861 capture_groups,
862 }) => {
863 // Use regex-automata to search backwards from current position
864 let mut cache_guard = cache_pool.get();
865 let input = Input::new(s).anchored(Anchored::Yes).range(0..ix);
866
867 match dfa.try_search_rev(&mut cache_guard, &input) {
868 Ok(Some(match_result)) => {
869 // Update ix to the start position of the match
870 let match_start = match_result.offset();
871
872 if let Some(inner) = capture_group_extraction_inner {
873 if let Some(range) = capture_groups {
874 // There are capture groups, need to search forward to populate them
875 let forward_input =
876 Input::new(s).span(match_start..ix).anchored(Anchored::Yes);
877 inner_slots.resize((range.end() - range.start() + 1) * 2, None);
878
879 if inner
880 .search_slots(&forward_input, &mut inner_slots)
881 .is_some()
882 {
883 // Store capture group positions
884 store_capture_groups(&mut state, &inner_slots, range);
885 } else {
886 break 'fail;
887 }
888 } else {
889 // No groups, just update ix to the match start
890 ix = match_start;
891 }
892 } else {
893 // No groups, just update ix to the match start
894 ix = match_start;
895 }
896 }
897 _ => break 'fail,
898 }
899 }
900 Insn::BeginAtomic => {
901 let count = state.backtrack_count();
902 state.stack_push(count);
903 }
904 Insn::EndAtomic => {
905 let count = state.stack_pop();
906 state.backtrack_cut(count);
907 }
908 Insn::Delegate(Delegate {
909 ref inner,
910 pattern: _,
911 capture_groups,
912 }) => {
913 let input = Input::new(s).span(ix..s.len()).anchored(Anchored::Yes);
914 if let Some(range) = capture_groups {
915 // Has capture groups, need to extract them
916 inner_slots.resize((range.end() - range.start() + 1) * 2, None);
917 if inner.search_slots(&input, &mut inner_slots).is_some() {
918 store_capture_groups(&mut state, &inner_slots, range);
919 ix = inner_slots[1].unwrap().get();
920 } else {
921 break 'fail;
922 }
923 } else {
924 // No groups, so we can use faster methods
925 match inner.search_half(&input) {
926 Some(m) => ix = m.offset(),
927 _ => break 'fail,
928 }
929 }
930 }
931 Insn::ContinueFromPreviousMatchEnd { at_start } => {
932 if ix > pos || option_flags & OPTION_SKIPPED_EMPTY_MATCH != 0 {
933 // If \G is at the start of the pattern, we can fail early
934 // instead of checking at each position in the haystack
935 // because \G will never match at any other position
936 if at_start && state.stack.len() == 1 {
937 // The only item on the stack is from the Split instruction for non-anchored search
938 // We can safely return None immediately
939 return Ok(None);
940 }
941 break 'fail;
942 }
943 }
944 }
945 pc += 1;
946 }
947 #[cfg(feature = "std")]
948 if option_flags & OPTION_TRACE != 0 {
949 println!("fail");
950 }
951 // "break 'fail" goes here
952 if state.stack.is_empty() {
953 return Ok(None);
954 }
955
956 backtrack_count += 1;
957 if backtrack_count > options.backtrack_limit {
958 return Err(Error::RuntimeError(RuntimeError::BacktrackLimitExceeded));
959 }
960
961 let (newpc, newix) = state.pop();
962 pc = newpc;
963 ix = newix;
964 }
965}
966
967#[cfg(test)]
968mod tests {
969 use super::*;
970 use quickcheck::{quickcheck, Arbitrary, Gen};
971
972 #[test]
973 fn state_push_pop() {
974 let mut state = State::new(1, MAX_STACK, 0);
975
976 state.push(0, 0).unwrap();
977 state.push(1, 1).unwrap();
978 assert_eq!(state.pop(), (1, 1));
979 assert_eq!(state.pop(), (0, 0));
980 assert!(state.stack.is_empty());
981
982 state.push(2, 2).unwrap();
983 assert_eq!(state.pop(), (2, 2));
984 assert!(state.stack.is_empty());
985 }
986
987 #[test]
988 fn state_save_override() {
989 let mut state = State::new(1, MAX_STACK, 0);
990 state.save(0, 10);
991 state.push(0, 0).unwrap();
992 state.save(0, 20);
993 assert_eq!(state.pop(), (0, 0));
994 assert_eq!(state.get(0), 10);
995 }
996
997 #[test]
998 fn state_save_override_twice() {
999 let mut state = State::new(1, MAX_STACK, 0);
1000 state.save(0, 10);
1001 state.push(0, 0).unwrap();
1002 state.save(0, 20);
1003 state.push(1, 1).unwrap();
1004 state.save(0, 30);
1005
1006 assert_eq!(state.get(0), 30);
1007 assert_eq!(state.pop(), (1, 1));
1008 assert_eq!(state.get(0), 20);
1009 assert_eq!(state.pop(), (0, 0));
1010 assert_eq!(state.get(0), 10);
1011 }
1012
1013 #[test]
1014 fn state_explicit_stack() {
1015 let mut state = State::new(1, MAX_STACK, 0);
1016 state.stack_push(11);
1017 state.stack_push(12);
1018
1019 state.push(100, 101).unwrap();
1020 state.stack_push(13);
1021 assert_eq!(state.stack_pop(), 13);
1022 state.stack_push(14);
1023 assert_eq!(state.pop(), (100, 101));
1024
1025 // Note: 14 is not there because it was pushed as part of the backtrack branch
1026 assert_eq!(state.stack_pop(), 12);
1027 assert_eq!(state.stack_pop(), 11);
1028 }
1029
1030 #[test]
1031 fn state_backtrack_cut_simple() {
1032 let mut state = State::new(2, MAX_STACK, 0);
1033 state.save(0, 1);
1034 state.save(1, 2);
1035
1036 let count = state.backtrack_count();
1037
1038 state.push(0, 0).unwrap();
1039 state.save(0, 3);
1040 assert_eq!(state.backtrack_count(), 1);
1041
1042 state.backtrack_cut(count);
1043 assert_eq!(state.backtrack_count(), 0);
1044 assert_eq!(state.get(0), 3);
1045 assert_eq!(state.get(1), 2);
1046 }
1047
1048 #[test]
1049 fn state_backtrack_cut_complex() {
1050 let mut state = State::new(2, MAX_STACK, 0);
1051 state.save(0, 1);
1052 state.save(1, 2);
1053
1054 state.push(0, 0).unwrap();
1055 state.save(0, 3);
1056
1057 let count = state.backtrack_count();
1058
1059 state.push(1, 1).unwrap();
1060 state.save(0, 4);
1061 state.push(2, 2).unwrap();
1062 state.save(1, 5);
1063 assert_eq!(state.backtrack_count(), 3);
1064
1065 state.backtrack_cut(count);
1066 assert_eq!(state.backtrack_count(), 1);
1067 assert_eq!(state.get(0), 4);
1068 assert_eq!(state.get(1), 5);
1069
1070 state.pop();
1071 assert_eq!(state.backtrack_count(), 0);
1072 // Check that oldsave were set correctly
1073 assert_eq!(state.get(0), 1);
1074 assert_eq!(state.get(1), 2);
1075 }
1076
1077 #[derive(Clone, Debug)]
1078 enum Operation {
1079 Push,
1080 Pop,
1081 Save(usize, usize),
1082 }
1083
1084 impl Arbitrary for Operation {
1085 fn arbitrary(g: &mut Gen) -> Self {
1086 match g.choose(&[0, 1, 2]) {
1087 Some(0) => Operation::Push,
1088 Some(1) => Operation::Pop,
1089 _ => Operation::Save(
1090 *g.choose(&[0usize, 1, 2, 3, 4]).unwrap(),
1091 usize::arbitrary(g),
1092 ),
1093 }
1094 }
1095 }
1096
1097 fn check_saves_for_operations(operations: Vec<Operation>) -> bool {
1098 let slots = operations
1099 .iter()
1100 .map(|o| match o {
1101 &Operation::Save(slot, _) => slot + 1,
1102 _ => 0,
1103 })
1104 .max()
1105 .unwrap_or(0);
1106 if slots == 0 {
1107 // No point checking if there's no save instructions
1108 return true;
1109 }
1110
1111 // Stack with the complete VM state (including saves)
1112 let mut stack = Vec::new();
1113 let mut saves = vec![usize::MAX; slots];
1114
1115 let mut state = State::new(slots, MAX_STACK, 0);
1116
1117 let mut expected = Vec::new();
1118 let mut actual = Vec::new();
1119
1120 for operation in operations {
1121 match operation {
1122 Operation::Push => {
1123 // We're not checking pc and ix later, so don't bother
1124 // putting in random values.
1125 stack.push((0, 0, saves.clone()));
1126 state.push(0, 0).unwrap();
1127 }
1128 Operation::Pop => {
1129 // Note that because we generate the operations randomly
1130 // there might be more pops than pushes. So ignore a pop
1131 // if the stack was empty.
1132 if let Some((_, _, previous_saves)) = stack.pop() {
1133 saves = previous_saves;
1134 state.pop();
1135 }
1136 }
1137 Operation::Save(slot, value) => {
1138 saves[slot] = value;
1139 state.save(slot, value);
1140 }
1141 }
1142
1143 // Remember state of saves for checking later
1144 expected.push(saves.clone());
1145 let mut actual_saves = vec![usize::MAX; slots];
1146 for (i, item) in actual_saves.iter_mut().enumerate().take(slots) {
1147 *item = state.get(i);
1148 }
1149 actual.push(actual_saves);
1150 }
1151
1152 expected == actual
1153 }
1154
1155 quickcheck! {
1156 fn state_save_quickcheck(operations: Vec<Operation>) -> bool {
1157 check_saves_for_operations(operations)
1158 }
1159 }
1160}