regex_cursor/engines/
pikevm.rs

1#[cfg(feature = "internal-instrument-pikevm")]
2use core::cell::RefCell;
3
4pub use regex_automata::nfa::thompson::pikevm::{Builder, Config, PikeVM};
5use regex_automata::nfa::thompson::State;
6use regex_automata::util::captures::Captures;
7use regex_automata::util::primitives::{NonMaxUsize, SmallIndex, StateID};
8use regex_automata::{Anchored, HalfMatch, Match, MatchKind, PatternID};
9
10use crate::cursor::Cursor;
11use crate::util::sparse_set::SparseSet;
12use crate::util::{empty, iter};
13use crate::{literal, Input};
14
15#[cfg(test)]
16mod tests;
17
18/// Returns an iterator over all non-overlapping leftmost matches in the
19/// given bytes. If no match exists, then the iterator yields no elements.
20///
21/// # Example
22///
23/// ```
24/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
25///
26/// let re = PikeVM::new("foo[0-9]+")?;
27/// let mut cache = re.create_cache();
28///
29/// let text = "foo1 foo12 foo123";
30/// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
31/// assert_eq!(matches, vec![
32///     Match::must(0, 0..4),
33///     Match::must(0, 5..10),
34///     Match::must(0, 11..17),
35/// ]);
36/// # Ok::<(), Box<dyn std::error::Error>>(())
37/// ```
38#[inline]
39pub fn find_iter<'r, 'c, C: Cursor>(
40    vm: &'r PikeVM,
41    cache: &'c mut Cache,
42    input: Input<C>,
43) -> FindMatches<'r, 'c, C> {
44    let caps = Captures::matches(vm.get_nfa().group_info().clone());
45    let it = iter::Searcher::new(input);
46    FindMatches { re: vm, cache, caps, it }
47}
48
49/// Executes a leftmost forward search and writes the spans of capturing
50/// groups that participated in a match into the provided [`Captures`]
51/// value. If no match was found, then [`Captures::is_match`] is guaranteed
52/// to return `false`.
53///
54/// This is like [`PikeVM::captures`], but it accepts a concrete `&Input`
55/// instead of an `Into<Input>`.
56///
57/// # Example: specific pattern search
58///
59/// This example shows how to build a multi-PikeVM that permits searching
60/// for specific patterns.
61///
62/// ```
63/// use regex_automata::{
64///     nfa::thompson::pikevm::PikeVM,
65///     Anchored, Match, PatternID, Input,
66/// };
67///
68/// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
69/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
70/// let haystack = "foo123";
71///
72/// // Since we are using the default leftmost-first match and both
73/// // patterns match at the same starting position, only the first pattern
74/// // will be returned in this case when doing a search for any of the
75/// // patterns.
76/// let expected = Some(Match::must(0, 0..6));
77/// re.search(&mut cache, &Input::new(haystack), &mut caps);
78/// assert_eq!(expected, caps.get_match());
79///
80/// // But if we want to check whether some other pattern matches, then we
81/// // can provide its pattern ID.
82/// let expected = Some(Match::must(1, 0..6));
83/// let input = Input::new(haystack)
84///     .anchored(Anchored::Pattern(PatternID::must(1)));
85/// re.search(&mut cache, &input, &mut caps);
86/// assert_eq!(expected, caps.get_match());
87///
88/// # Ok::<(), Box<dyn std::error::Error>>(())
89/// ```
90///
91/// # Example: specifying the bounds of a search
92///
93/// This example shows how providing the bounds of a search can produce
94/// different results than simply sub-slicing the haystack.
95///
96/// ```
97/// # if cfg!(miri) { return Ok(()); } // miri takes too long
98/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
99///
100/// let re = PikeVM::new(r"\b[0-9]{3}\b")?;
101/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
102/// let haystack = "foo123bar";
103///
104/// // Since we sub-slice the haystack, the search doesn't know about
105/// // the larger context and assumes that `123` is surrounded by word
106/// // boundaries. And of course, the match position is reported relative
107/// // to the sub-slice as well, which means we get `0..3` instead of
108/// // `3..6`.
109/// let expected = Some(Match::must(0, 0..3));
110/// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps);
111/// assert_eq!(expected, caps.get_match());
112///
113/// // But if we provide the bounds of the search within the context of the
114/// // entire haystack, then the search can take the surrounding context
115/// // into account. (And if we did find a match, it would be reported
116/// // as a valid offset into `haystack` instead of its sub-slice.)
117/// let expected = None;
118/// let input = Input::new(haystack).range(3..6);
119/// re.search(&mut cache, &input, &mut caps);
120/// assert_eq!(expected, caps.get_match());
121///
122/// # Ok::<(), Box<dyn std::error::Error>>(())
123/// ```
124#[inline]
125pub fn search<C: Cursor>(
126    vm: &PikeVM,
127    cache: &mut Cache,
128    input: &mut Input<C>,
129    caps: &mut Captures,
130) {
131    caps.set_pattern(None);
132    let pid = search_slots(vm, cache, input, caps.slots_mut());
133    caps.set_pattern(pid);
134}
135
136/// Returns true if and only if this `PikeVM` matches the given haystack.
137///
138/// This routine may short circuit if it knows that scanning future
139/// input will never lead to a different result. In particular, if the
140/// underlying NFA enters a match state, then this routine will return
141/// `true` immediately without inspecting any future input. (Consider how
142/// this might make a difference given the regex `a+` on the haystack
143/// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
144/// but routines like `find` need to continue searching because `+` is
145/// greedy by default.)
146///
147/// # Example
148///
149/// This shows basic usage:
150///
151/// ```
152/// use regex_automata::nfa::thompson::pikevm::PikeVM;
153///
154/// let re = PikeVM::new("foo[0-9]+bar")?;
155/// let mut cache = re.create_cache();
156///
157/// assert!(re.is_match(&mut cache, "foo12345bar"));
158/// assert!(!re.is_match(&mut cache, "foobar"));
159/// # Ok::<(), Box<dyn std::error::Error>>(())
160/// ```
161///
162/// # Example: consistency with search APIs
163///
164/// `is_match` is guaranteed to return `true` whenever `find` returns a
165/// match. This includes searches that are executed entirely within a
166/// codepoint:
167///
168/// ```
169/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input};
170///
171/// let re = PikeVM::new("a*")?;
172/// let mut cache = re.create_cache();
173///
174/// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
175/// # Ok::<(), Box<dyn std::error::Error>>(())
176/// ```
177///
178/// Notice that when UTF-8 mode is disabled, then the above reports a
179/// match because the restriction against zero-width matches that split a
180/// codepoint has been lifted:
181///
182/// ```
183/// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input};
184///
185/// let re = PikeVM::builder()
186///     .thompson(NFA::config().utf8(false))
187///     .build("a*")?;
188/// let mut cache = re.create_cache();
189///
190/// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
191/// # Ok::<(), Box<dyn std::error::Error>>(())
192/// ```
193#[inline]
194pub fn is_match<C: Cursor>(vm: &PikeVM, cache: &mut Cache, input: &mut Input<C>) -> bool {
195    input.with(|input| {
196        let input = input.earliest(true);
197        search_slots(vm, cache, input, &mut []).is_some()
198    })
199}
200/// A simple macro for conditionally executing instrumentation logic when
201/// the 'trace' log level is enabled. This is a compile-time no-op when the
202/// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that
203/// this makes it easier to avoid doing extra work when instrumentation isn't
204/// enabled.
205///
206/// This macro accepts a closure of type `|&mut Counters|`. The closure can
207/// then increment counters (or whatever) in accordance with what one wants
208/// to track.
209macro_rules! instrument {
210    ($fun:expr) => {
211        #[cfg(feature = "internal-instrument-pikevm")]
212        {
213            let fun: &mut dyn FnMut(&mut Counters) = &mut $fun;
214            COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut()));
215        }
216    };
217}
218
219#[cfg(feature = "internal-instrument-pikevm")]
220std::thread_local! {
221    /// Effectively global state used to keep track of instrumentation
222    /// counters. The "proper" way to do this is to thread it through the
223    /// PikeVM, but it makes the code quite icky. Since this is just a
224    /// debugging feature, we're content to relegate it to thread local
225    /// state. When instrumentation is enabled, the counters are reset at the
226    /// beginning of every search and printed (with the 'trace' log level) at
227    /// the end of every search.
228    static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty());
229}
230
231pub fn search_slots<C: Cursor>(
232    vm: &PikeVM,
233    cache: &mut Cache,
234    input: &mut Input<C>,
235    slots: &mut [Option<NonMaxUsize>],
236) -> Option<PatternID> {
237    let utf8empty = vm.get_nfa().has_empty() && vm.get_nfa().is_utf8();
238    if !utf8empty {
239        let hm = search_slots_imp(vm, cache, input, slots)?;
240        return Some(hm.pattern());
241    }
242    // There is an unfortunate special case where if the regex can
243    // match the empty string and UTF-8 mode is enabled, the search
244    // implementation requires that the slots have at least as much space
245    // to report the bounds of any match. This is so zero-width matches
246    // that split a codepoint can be filtered out.
247    //
248    // Note that if utf8empty is true, we specialize the case for when
249    // the number of patterns is 1. In that case, we can just use a stack
250    // allocation. Otherwise we resort to a heap allocation, which we
251    // convince ourselves we're fine with due to the pathological nature of
252    // this case.
253    let min = vm.get_nfa().group_info().implicit_slot_len();
254    if slots.len() >= min {
255        let hm = search_slots_imp(vm, cache, input, slots)?;
256        return Some(hm.pattern());
257    }
258    if vm.get_nfa().pattern_len() == 1 {
259        let mut enough = [None, None];
260        let got = search_slots_imp(vm, cache, input, &mut enough);
261        // This is OK because we know `enough` is strictly bigger than
262        // `slots`, otherwise this special case isn't reached.
263        slots.copy_from_slice(&enough[..slots.len()]);
264        return got.map(|hm| hm.pattern());
265    }
266    let mut enough = vec![None; min];
267    let got = search_slots_imp(vm, cache, input, &mut enough);
268    // This is OK because we know `enough` is strictly bigger than `slots`,
269    // otherwise this special case isn't reached.
270    slots.copy_from_slice(&enough[..slots.len()]);
271    got.map(|hm| hm.pattern())
272}
273
274/// This is the actual implementation of `search_slots_imp` that
275/// doesn't account for the special case when 1) the NFA has UTF-8 mode
276/// enabled, 2) the NFA can match the empty string and 3) the caller has
277/// provided an insufficient number of slots to record match offsets.
278#[inline(never)]
279fn search_slots_imp<C: Cursor>(
280    vm: &PikeVM,
281    cache: &mut Cache,
282    input: &mut Input<C>,
283    slots: &mut [Option<NonMaxUsize>],
284) -> Option<HalfMatch> {
285    let utf8empty = vm.get_nfa().has_empty() && vm.get_nfa().is_utf8();
286    let hm = match search_imp(vm, cache, input, slots) {
287        None => return None,
288        Some(hm) if !utf8empty => return Some(hm),
289        Some(hm) => hm,
290    };
291    empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
292        Ok(search_imp(vm, cache, input, slots).map(|hm| (hm, hm.offset())))
293    })
294    // OK because the PikeVM never errors.
295    .unwrap()
296}
297
298/// Return the starting configuration of a PikeVM search.
299///
300/// The "start config" is basically whether the search should be anchored
301/// or not and the NFA state ID at which to begin the search. The state ID
302/// returned always corresponds to an anchored starting state even when the
303/// search is unanchored. This is because the PikeVM search loop deals with
304/// unanchored searches with an explicit epsilon closure out of the start
305/// state.
306///
307/// This routine accounts for both the caller's `Input` configuration
308/// and the pattern itself. For example, even if the caller asks for an
309/// unanchored search, if the pattern itself is anchored, then this will
310/// always return 'true' because implementing an unanchored search in that
311/// case would be incorrect.
312///
313/// Similarly, if the caller requests an anchored search for a particular
314/// pattern, then the starting state ID returned will reflect that.
315///
316/// If a pattern ID is given in the input configuration that is not in
317/// this regex, then `None` is returned.
318fn start_config<C: Cursor>(vm: &PikeVM, input: &Input<C>) -> Option<(bool, StateID)> {
319    match input.get_anchored() {
320        // Only way we're unanchored is if both the caller asked for an
321        // unanchored search *and* the pattern is itself not anchored.
322        Anchored::No => {
323            Some((vm.get_nfa().is_always_start_anchored(), vm.get_nfa().start_anchored()))
324        }
325        Anchored::Yes => Some((true, vm.get_nfa().start_anchored())),
326        Anchored::Pattern(pid) => Some((true, vm.get_nfa().start_pattern(pid)?)),
327    }
328}
329
330fn search_imp<C: Cursor>(
331    vm: &PikeVM,
332    cache: &mut Cache,
333    input: &mut Input<C>,
334    slots: &mut [Option<NonMaxUsize>],
335) -> Option<HalfMatch> {
336    cache.setup_search(slots.len());
337    if input.is_done() {
338        return None;
339    }
340    instrument!(|c| c.reset(&self.nfa));
341
342    // Whether we want to visit all match states instead of emulating the
343    // 'leftmost' semantics of typical backtracking regex engines.
344    let allmatches = vm.get_config().get_match_kind() == MatchKind::All;
345    let (anchored, start_id) = match start_config(vm, input) {
346        None => return None,
347        Some(config) => config,
348    };
349
350    let pre = if anchored { None } else { vm.get_config().get_prefilter() };
351    let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
352    let mut hm = None;
353    // Yes, our search doesn't end at input.end(), but includes it. This
354    // is necessary because matches are delayed by one byte, just like
355    // how the DFA engines work. The delay is used to handle look-behind
356    // assertions. In the case of the PikeVM, the delay is implemented
357    // by not considering a match to exist until it is visited in
358    // 'steps'. Technically, we know a match exists in the previous
359    // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA
360    // determinization. We don't mark a DFA state as a match state if it
361    // contains an NFA match state, but rather, whether the DFA state was
362    // generated by a transition from a DFA state that contains an NFA
363    // match state.)
364    input.move_to(input.start());
365    input.clear_look_behind();
366    input.ensure_look_behind();
367    while input.at() <= input.end() {
368        // If we have no states left to visit, then there are some cases
369        // where we know we can quit early or even skip ahead.
370        if curr.set.is_empty() {
371            // We have a match and we haven't been instructed to continue
372            // on even after finding a match, so we can quit.
373            if hm.is_some() && !allmatches {
374                break;
375            }
376            // If we're running an anchored search and we've advanced
377            // beyond the start position with no other states to try, then
378            // we will never observe a match and thus can stop.
379            if anchored && input.at() > input.start() {
380                break;
381            }
382            // If there no states left to explore at this position and we
383            // know we can't terminate early, then we are effectively at
384            // the starting state of the NFA. If we fell through here,
385            // we'd end up adding our '(?s-u:.)*?' prefix and it would be
386            // the only thing in 'curr'. So we might as well just skip
387            // ahead until we find something that we know might advance us
388            // forward.
389            if let Some(pre) = pre {
390                let chunk_offst = input.chunk_offset();
391                match literal::find(pre, input) {
392                    None => break,
393                    Some(ref span) => {
394                        input.move_to(span.start);
395                        if chunk_offst != input.chunk_offset() {
396                            input.clear_look_behind();
397                            input.ensure_look_behind();
398                        }
399                    }
400                }
401            }
402        }
403        // Instead of using the NFA's unanchored start state, we actually
404        // always use its anchored starting state. As a result, when doing
405        // an unanchored search, we need to simulate our own '(?s-u:.)*?'
406        // prefix, to permit a match to appear anywhere.
407        //
408        // Now, we don't *have* to do things this way. We could use the
409        // NFA's unanchored starting state and do one 'epsilon_closure'
410        // call from that starting state before the main loop here. And
411        // that is just as correct. However, it turns out to be slower
412        // than our approach here because it slightly increases the cost
413        // of processing each byte by requiring us to visit more NFA
414        // states to deal with the additional NFA states in the unanchored
415        // prefix. By simulating it explicitly here, we lower those costs
416        // substantially. The cost is itself small, but it adds up for
417        // large haystacks.
418        //
419        // In order to simulate the '(?s-u:.)*?' prefix---which is not
420        // greedy---we are careful not to perform an epsilon closure on
421        // the start state if we already have a match. Namely, if we
422        // did otherwise, we would never reach a terminating condition
423        // because there would always be additional states to process.
424        // In effect, the exclusion of running 'epsilon_closure' when
425        // we have a match corresponds to the "dead" states we have in
426        // our DFA regex engines. Namely, in a DFA, match states merely
427        // instruct the search execution to record the current offset as
428        // the most recently seen match. It is the dead state that actually
429        // indicates when to stop the search (other than EOF or quit
430        // states).
431        //
432        // However, when 'allmatches' is true, the caller has asked us to
433        // leave in every possible match state. This tends not to make a
434        // whole lot of sense in unanchored searches, because it means the
435        // search really cannot terminate until EOF. And often, in that
436        // case, you wind up skipping over a bunch of matches and are left
437        // with the "last" match. Arguably, it just doesn't make a lot of
438        // sense to run a 'leftmost' search (which is what this routine is)
439        // with 'allmatches' set to true. But the DFAs support it and this
440        // matches their behavior. (Generally, 'allmatches' is useful for
441        // overlapping searches or leftmost anchored searches to find the
442        // longest possible match by ignoring match priority.)
443        //
444        // Additionally, when we're running an anchored search, this
445        // epsilon closure should only be computed at the beginning of the
446        // search. If we re-computed it at every position, we would be
447        // simulating an unanchored search when we were tasked to perform
448        // an anchored search.
449        if (hm.is_none() || allmatches) && (!anchored || input.at() == input.start()) {
450            // Since we are adding to the 'curr' active states and since
451            // this is for the start ID, we use a slots slice that is
452            // guaranteed to have the right length but where every element
453            // is absent. This is exactly what we want, because this
454            // epsilon closure is responsible for simulating an unanchored
455            // '(?s:.)*?' prefix. It is specifically outside of any
456            // capturing groups, and thus, using slots that are always
457            // absent is correct.
458            //
459            // Note though that we can't just use '&mut []' here, since
460            // this epsilon closure may traverse through 'Captures' epsilon
461            // transitions, and thus must be able to write offsets to the
462            // slots given which are later copied to slot values in 'curr'.
463            let slots = next.slot_table.all_absent();
464            epsilon_closure(vm, stack, slots, curr, input, start_id);
465        }
466        input.chunk_pos += 1;
467        if input.chunk_pos() >= input.chunk().len() {
468            input.advance_with_look_behind();
469        }
470        if let Some(pid) = nexts(vm, stack, curr, next, input, slots) {
471            hm = Some(HalfMatch::new(pid, input.at() - 1));
472        }
473        // Unless the caller asked us to return early, we need to mush on
474        // to see if we can extend our match. (But note that 'nexts' will
475        // quit right after seeing a match when match_kind==LeftmostFirst,
476        // as is consistent with leftmost-first match priority.)
477        if input.get_earliest() && hm.is_some() {
478            break;
479        }
480        core::mem::swap(curr, next);
481        next.set.clear();
482    }
483    instrument!(|c| c.eprint(&self.nfa));
484    hm
485}
486
487/// Process the active states in 'curr' to find the states (written to
488/// 'next') we should process for the next byte in the haystack.
489///
490/// 'stack' is used to perform a depth first traversal of the NFA when
491/// computing an epsilon closure.
492///
493/// When a match is found, the slots for that match state (in 'curr') are
494/// copied to 'caps'. Moreover, once a match is seen, processing for 'curr'
495/// stops (unless the PikeVM was configured with MatchKind::All semantics).
496#[cfg_attr(feature = "perf-inline", inline(always))]
497fn nexts<C: Cursor>(
498    vm: &PikeVM,
499    stack: &mut Vec<FollowEpsilon>,
500    curr: &mut ActiveStates,
501    next_: &mut ActiveStates,
502    input: &mut Input<C>,
503    slots: &mut [Option<NonMaxUsize>],
504) -> Option<PatternID> {
505    instrument!(|c| c.record_state_set(&curr.set));
506    let mut pid = None;
507    let ActiveStates { ref set, ref mut slot_table } = *curr;
508    for sid in set.iter() {
509        pid = match next(vm, stack, slot_table, next_, input, sid) {
510            None => continue,
511            Some(pid) => Some(pid),
512        };
513        slots.copy_from_slice(slot_table.for_state(sid));
514        if vm.get_config().get_match_kind() != MatchKind::All {
515            break;
516        }
517    }
518    pid
519}
520
521/// Starting from 'sid', if the position 'at' in the 'input' haystack has a
522/// transition defined out of 'sid', then add the state transitioned to and
523/// its epsilon closure to the 'next' set of states to explore.
524///
525/// 'stack' is used by the epsilon closure computation to perform a depth
526/// first traversal of the NFA.
527///
528/// 'curr_slot_table' should be the table of slots for the current set of
529/// states being explored. If there is a transition out of 'sid', then
530/// sid's row in the slot table is used to perform the epsilon closure.
531#[cfg_attr(feature = "perf-inline", inline(always))]
532fn next<C: Cursor>(
533    vm: &PikeVM,
534    stack: &mut Vec<FollowEpsilon>,
535    curr_slot_table: &mut SlotTable,
536    next: &mut ActiveStates,
537    input: &mut Input<C>,
538    sid: StateID,
539) -> Option<PatternID> {
540    instrument!(|c| c.record_step(sid));
541    let state = vm.get_nfa().state(sid);
542    match *state {
543        State::Fail
544        | State::Look { .. }
545        | State::Union { .. }
546        | State::BinaryUnion { .. }
547        | State::Capture { .. } => None,
548        State::ByteRange { ref trans } => {
549            let (chunk, pos) = input.look_around();
550            if trans.matches(chunk, pos - 1) {
551                let slots = curr_slot_table.for_state(sid);
552                epsilon_closure(vm, stack, slots, next, input, trans.next);
553            }
554            None
555        }
556        State::Sparse(ref sparse) => {
557            let (chunk, pos) = input.look_around();
558            if let Some(next_sid) = sparse.matches(chunk, pos - 1) {
559                let slots = curr_slot_table.for_state(sid);
560                epsilon_closure(vm, stack, slots, next, input, next_sid);
561            }
562            None
563        }
564        State::Dense(ref dense) => {
565            let (chunk, pos) = input.look_around();
566            if let Some(next_sid) = dense.matches(chunk, pos - 1) {
567                let slots = curr_slot_table.for_state(sid);
568                epsilon_closure(vm, stack, slots, next, input, next_sid);
569            }
570            None
571        }
572        State::Match { pattern_id } => Some(pattern_id),
573    }
574}
575
576/// Compute the epsilon closure of 'sid', writing the closure into 'next'
577/// while copying slot values from 'curr_slots' into corresponding states
578/// in 'next'. 'curr_slots' should be the slot values corresponding to
579/// 'sid'.
580///
581/// The given 'stack' is used to perform a depth first traversal of the
582/// NFA by recursively following all epsilon transitions out of 'sid'.
583/// Conditional epsilon transitions are followed if and only if they are
584/// satisfied for the position 'at' in the 'input' haystack.
585///
586/// While this routine may write to 'curr_slots', once it returns, any
587/// writes are undone and the original values (even if absent) are
588/// restored.
589#[cfg_attr(feature = "perf-inline", inline(always))]
590fn epsilon_closure<C: Cursor>(
591    vm: &PikeVM,
592    stack: &mut Vec<FollowEpsilon>,
593    curr_slots: &mut [Option<NonMaxUsize>],
594    next: &mut ActiveStates,
595    input: &mut Input<C>,
596    sid: StateID,
597) {
598    instrument!(|c| {
599        c.record_closure(sid);
600        c.record_stack_push(sid);
601    });
602    stack.push(FollowEpsilon::Explore(sid));
603    while let Some(frame) = stack.pop() {
604        match frame {
605            FollowEpsilon::RestoreCapture { slot, offset: pos } => {
606                curr_slots[slot] = pos;
607            }
608            FollowEpsilon::Explore(sid) => {
609                epsilon_closure_explore(vm, stack, curr_slots, next, input, sid);
610            }
611        }
612    }
613}
614
615/// Explore all of the epsilon transitions out of 'sid'. This is mostly
616/// split out from 'epsilon_closure' in order to clearly delineate
617/// the actual work of computing an epsilon closure from the stack
618/// book-keeping.
619///
620/// This will push any additional explorations needed on to 'stack'.
621///
622/// 'curr_slots' should refer to the slots for the currently active NFA
623/// state. That is, the current state we are stepping through. These
624/// slots are mutated in place as new 'Captures' states are traversed
625/// during epsilon closure, but the slots are restored to their original
626/// values once the full epsilon closure is completed. The ultimate use of
627/// 'curr_slots' is to copy them to the corresponding 'next_slots', so that
628/// the capturing group spans are forwarded from the currently active state
629/// to the next.
630///
631/// 'next' refers to the next set of active states. Computing an epsilon
632/// closure may increase the next set of active states.
633///
634/// 'input' refers to the caller's input configuration and 'at' refers to
635/// the current position in the haystack. These are used to check whether
636/// conditional epsilon transitions (like look-around) are satisfied at
637/// the current position. If they aren't, then the epsilon closure won't
638/// include them.
639#[cfg_attr(feature = "perf-inline", inline(always))]
640fn epsilon_closure_explore<C: Cursor>(
641    vm: &PikeVM,
642    stack: &mut Vec<FollowEpsilon>,
643    curr_slots: &mut [Option<NonMaxUsize>],
644    next: &mut ActiveStates,
645    input: &mut Input<C>,
646    mut sid: StateID,
647) {
648    // We can avoid pushing some state IDs on to our stack in precisely
649    // the cases where a 'push(x)' would be immediately followed by a 'x
650    // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
651    // to be the next state ID we want to explore once we're done with
652    // our initial exploration. In practice, this avoids a lot of stack
653    // thrashing.
654    loop {
655        instrument!(|c| c.record_set_insert(sid));
656        // Record this state as part of our next set of active states. If
657        // we've already explored it, then no need to do it again.
658        if !next.set.insert(sid) {
659            return;
660        }
661        match *vm.get_nfa().state(sid) {
662            State::Fail
663            | State::Match { .. }
664            | State::ByteRange { .. }
665            | State::Sparse { .. }
666            | State::Dense { .. } => {
667                next.slot_table.for_state(sid).copy_from_slice(curr_slots);
668                return;
669            }
670            State::Look { look, next } => {
671                // OK because we don't permit building a searcher with a
672                // Unicode word boundary if the requisite Unicode data is
673                // unavailable.
674                let (chunk, at) = input.look_around();
675                if !vm.get_nfa().look_matcher().matches(look, chunk, at) {
676                    return;
677                }
678                sid = next;
679            }
680            State::Union { ref alternates } => {
681                sid = match alternates.get(0) {
682                    None => return,
683                    Some(&sid) => sid,
684                };
685                instrument!(|c| {
686                    for &alt in &alternates[1..] {
687                        c.record_stack_push(alt);
688                    }
689                });
690                stack.extend(alternates[1..].iter().copied().rev().map(FollowEpsilon::Explore));
691            }
692            State::BinaryUnion { alt1, alt2 } => {
693                sid = alt1;
694                instrument!(|c| c.record_stack_push(sid));
695                stack.push(FollowEpsilon::Explore(alt2));
696            }
697            State::Capture { next, slot, .. } => {
698                // There's no need to do anything with slots that
699                // ultimately won't be copied into the caller-provided
700                // 'Captures' value. So we just skip dealing with them at
701                // all.
702                if slot.as_usize() < curr_slots.len() {
703                    instrument!(|c| c.record_stack_push(sid));
704                    stack.push(FollowEpsilon::RestoreCapture { slot, offset: curr_slots[slot] });
705                    // OK because length of a slice must fit into an isize.
706                    curr_slots[slot] = Some(NonMaxUsize::new(input.at()).unwrap());
707                }
708                sid = next;
709            }
710        }
711    }
712}
713/// A cache represents mutable state that a [`PikeVM`] requires during a
714/// search.
715///
716/// For a given [`PikeVM`], its corresponding cache may be created either via
717/// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in
718/// every way, except the former does not require explicitly importing `Cache`.
719///
720/// A particular `Cache` is coupled with the [`PikeVM`] from which it
721/// was created. It may only be used with that `PikeVM`. A cache and its
722/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
723/// only be used with the new `PikeVM` (and not the old one).
724#[derive(Clone, Debug)]
725pub struct Cache {
726    /// Stack used while computing epsilon closure. This effectively lets us
727    /// move what is more naturally expressed through recursion to a stack
728    /// on the heap.
729    stack: Vec<FollowEpsilon>,
730    /// The current active states being explored for the current byte in the
731    /// haystack.
732    curr: ActiveStates,
733    /// The next set of states we're building that will be explored for the
734    /// next byte in the haystack.
735    next: ActiveStates,
736}
737
738impl Cache {
739    /// Create a new [`PikeVM`] cache.
740    ///
741    /// A potentially more convenient routine to create a cache is
742    /// [`PikeVM::create_cache`], as it does not require also importing the
743    /// `Cache` type.
744    ///
745    /// If you want to reuse the returned `Cache` with some other `PikeVM`,
746    /// then you must call [`Cache::reset`] with the desired `PikeVM`.
747    pub fn new(re: &PikeVM) -> Cache {
748        Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re) }
749    }
750
751    /// Reset this cache such that it can be used for searching with a
752    /// different [`PikeVM`].
753    ///
754    /// A cache reset permits reusing memory already allocated in this cache
755    /// with a different `PikeVM`.
756    ///
757    /// # Example
758    ///
759    /// This shows how to re-purpose a cache for use with a different `PikeVM`.
760    ///
761    /// ```
762    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
763    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
764    ///
765    /// let re1 = PikeVM::new(r"\w")?;
766    /// let re2 = PikeVM::new(r"\W")?;
767    ///
768    /// let mut cache = re1.create_cache();
769    /// assert_eq!(
770    ///     Some(Match::must(0, 0..2)),
771    ///     re1.find_iter(&mut cache, "Δ").next(),
772    /// );
773    ///
774    /// // Using 'cache' with re2 is not allowed. It may result in panics or
775    /// // incorrect results. In order to re-purpose the cache, we must reset
776    /// // it with the PikeVM we'd like to use it with.
777    /// //
778    /// // Similarly, after this reset, using the cache with 're1' is also not
779    /// // allowed.
780    /// cache.reset(&re2);
781    /// assert_eq!(
782    ///     Some(Match::must(0, 0..3)),
783    ///     re2.find_iter(&mut cache, "☃").next(),
784    /// );
785    ///
786    /// # Ok::<(), Box<dyn std::error::Error>>(())
787    /// ```
788    pub fn reset(&mut self, re: &PikeVM) {
789        self.curr.reset(re);
790        self.next.reset(re);
791    }
792
793    /// Returns the heap memory usage, in bytes, of this cache.
794    ///
795    /// This does **not** include the stack size used up by this cache. To
796    /// compute that, use `std::mem::size_of::<Cache>()`.
797    pub fn memory_usage(&self) -> usize {
798        use core::mem::size_of;
799        (self.stack.len() * size_of::<FollowEpsilon>())
800            + self.curr.memory_usage()
801            + self.next.memory_usage()
802    }
803
804    /// Clears this cache. This should be called at the start of every search
805    /// to ensure we start with a clean slate.
806    ///
807    /// This also sets the length of the capturing groups used in the current
808    /// search. This permits an optimization where by 'SlotTable::for_state'
809    /// only returns the number of slots equivalent to the number of slots
810    /// given in the 'Captures' value. This may be less than the total number
811    /// of possible slots, e.g., when one only wants to track overall match
812    /// offsets. This in turn permits less copying of capturing group spans
813    /// in the PikeVM.
814    fn setup_search(&mut self, captures_slot_len: usize) {
815        self.stack.clear();
816        self.curr.setup_search(captures_slot_len);
817        self.next.setup_search(captures_slot_len);
818    }
819}
820
821/// Represents a stack frame for use while computing an epsilon closure.
822///
823/// (An "epsilon closure" refers to the set of reachable NFA states from a
824/// single state without consuming any input. That is, the set of all epsilon
825/// transitions not only from that single state, but from every other state
826/// reachable by an epsilon transition as well. This is why it's called a
827/// "closure." Computing an epsilon closure is also done during DFA
828/// determinization! Compare and contrast the epsilon closure here in this
829/// PikeVM and the one used for determinization in crate::util::determinize.)
830///
831/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
832/// first traversal over all epsilon transitions from a particular state.
833/// (A depth first traversal is important because it emulates the same priority
834/// of matches that is typically found in backtracking regex engines.) This
835/// depth first traversal is naturally expressed using recursion, but to avoid
836/// a call stack size proportional to the size of a regex, we put our stack on
837/// the heap instead.
838///
839/// This stack thus consists of call frames. The typical call frame is
840/// `Explore`, which instructs epsilon closure to explore the epsilon
841/// transitions from that state. (Subsequent epsilon transitions are then
842/// pushed on to the stack as more `Explore` frames.) If the state ID being
843/// explored has no epsilon transitions, then the capturing group slots are
844/// copied from the original state that sparked the epsilon closure (from the
845/// 'step' routine) to the state ID being explored. This way, capturing group
846/// slots are forwarded from the previous state to the next.
847///
848/// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to
849/// set the position for a particular slot back to some particular offset. This
850/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
851/// set the offset of the slot indicated in `Capture` to the current offset,
852/// and then push the old offset on to the stack as a `RestoreCapture` frame.
853/// Thus, the new offset is only used until the epsilon closure reverts back to
854/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
855/// transition its "scope" to only states that come "after" it during depth
856/// first traversal.
857#[derive(Clone, Debug)]
858enum FollowEpsilon {
859    /// Explore the epsilon transitions from a state ID.
860    Explore(StateID),
861    /// Reset the given `slot` to the given `offset` (which might be `None`).
862    RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
863}
864
865/// A set of active states used to "simulate" the execution of an NFA via the
866/// PikeVM.
867///
868/// There are two sets of these used during NFA simulation. One set corresponds
869/// to the "current" set of states being traversed for the current position
870/// in a haystack. The other set corresponds to the "next" set of states being
871/// built, which will become the new "current" set for the next position in the
872/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
873/// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
874///
875/// In addition to representing a set of NFA states, this also maintains slot
876/// values for each state. These slot values are what turn the NFA simulation
877/// into the "Pike VM." Namely, they track capturing group values for each
878/// state. During the computation of epsilon closure, we copy slot values from
879/// states in the "current" set to the "next" set. Eventually, once a match
880/// is found, the slot values for that match state are what we write to the
881/// caller provided 'Captures' value.
882#[derive(Clone, Debug)]
883struct ActiveStates {
884    /// The set of active NFA states. This set preserves insertion order, which
885    /// is critical for simulating the match semantics of backtracking regex
886    /// engines.
887    set: SparseSet,
888    /// The slots for every NFA state, where each slot stores a (possibly
889    /// absent) offset. Every capturing group has two slots. One for a start
890    /// offset and one for an end offset.
891    slot_table: SlotTable,
892}
893
894impl ActiveStates {
895    /// Create a new set of active states for the given PikeVM. The active
896    /// states returned may only be used with the given PikeVM. (Use 'reset'
897    /// to re-purpose the allocation for a different PikeVM.)
898    fn new(re: &PikeVM) -> ActiveStates {
899        let mut active = ActiveStates { set: SparseSet::new(0), slot_table: SlotTable::new() };
900        active.reset(re);
901        active
902    }
903
904    /// Reset this set of active states such that it can be used with the given
905    /// PikeVM (and only that PikeVM).
906    fn reset(&mut self, re: &PikeVM) {
907        self.set.resize(re.get_nfa().states().len());
908        self.slot_table.reset(re);
909    }
910
911    /// Return the heap memory usage, in bytes, used by this set of active
912    /// states.
913    ///
914    /// This does not include the stack size of this value.
915    fn memory_usage(&self) -> usize {
916        self.set.memory_usage() + self.slot_table.memory_usage()
917    }
918
919    /// Setup this set of active states for a new search. The given slot
920    /// length should be the number of slots in a caller provided 'Captures'
921    /// (and may be zero).
922    fn setup_search(&mut self, captures_slot_len: usize) {
923        self.set.clear();
924        self.slot_table.setup_search(captures_slot_len);
925    }
926}
927
928/// A table of slots, where each row represent a state in an NFA. Thus, the
929/// table has room for storing slots for every single state in an NFA.
930///
931/// This table is represented with a single contiguous allocation. In general,
932/// the notion of "capturing group" doesn't really exist at this level of
933/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
934/// maps to a pair of slots, one for the start offset and one for the end
935/// offset.) Slots are indexed by the 'Captures' NFA state.
936///
937/// N.B. Not every state actually needs a row of slots. Namely, states that
938/// only have epsilon transitions currently never have anything written to
939/// their rows in this table. Thus, the table is somewhat wasteful in its heap
940/// usage. However, it is important to maintain fast random access by state
941/// ID, which means one giant table tends to work well. RE2 takes a different
942/// approach here and allocates each row as its own reference counted thing.
943/// I explored such a strategy at one point here, but couldn't get it to work
944/// well using entirely safe code. (To the ambitious reader: I encourage you to
945/// re-litigate that experiment.) I very much wanted to stick to safe code, but
946/// could be convinced otherwise if there was a solid argument and the safety
947/// was encapsulated well.
948#[derive(Clone, Debug)]
949struct SlotTable {
950    /// The actual table of offsets.
951    table: Vec<Option<NonMaxUsize>>,
952    /// The number of slots per state, i.e., the table's stride or the length
953    /// of each row.
954    slots_per_state: usize,
955    /// The number of slots in the caller-provided 'Captures' value for the
956    /// current search. Setting this to 'slots_per_state' is always correct,
957    /// but may be wasteful.
958    slots_for_captures: usize,
959}
960
961impl SlotTable {
962    /// Create a new slot table.
963    ///
964    /// One should call 'reset' with the corresponding PikeVM before use.
965    fn new() -> SlotTable {
966        SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
967    }
968
969    /// Reset this slot table such that it can be used with the given PikeVM
970    /// (and only that PikeVM).
971    fn reset(&mut self, re: &PikeVM) {
972        let nfa = re.get_nfa();
973        self.slots_per_state = nfa.group_info().slot_len();
974        // This is always correct, but may be reduced for a particular search
975        // if a 'Captures' has fewer slots, e.g., none at all or only slots
976        // for tracking the overall match instead of all slots for every
977        // group.
978        self.slots_for_captures =
979            core::cmp::max(self.slots_per_state, nfa.pattern_len().checked_mul(2).unwrap());
980        let len = nfa
981            .states()
982            .len()
983            .checked_mul(self.slots_per_state)
984            // Add space to account for scratch space used during a search.
985            .and_then(|x| x.checked_add(self.slots_for_captures))
986            // It seems like this could actually panic on legitimate inputs on
987            // 32-bit targets, and very likely to panic on 16-bit. Should we
988            // somehow convert this to an error? What about something similar
989            // for the lazy DFA cache? If you're tripping this assert, please
990            // file a bug.
991            .expect("slot table length doesn't overflow");
992        // This happens about as often as a regex is compiled, so it probably
993        // should be at debug level, but I found it quite distracting and not
994        // particularly useful.
995        self.table.resize(len, None);
996    }
997
998    /// Return the heap memory usage, in bytes, used by this slot table.
999    ///
1000    /// This does not include the stack size of this value.
1001    fn memory_usage(&self) -> usize {
1002        self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>()
1003    }
1004
1005    /// Perform any per-search setup for this slot table.
1006    ///
1007    /// In particular, this sets the length of the number of slots used in the
1008    /// 'Captures' given by the caller (if any at all). This number may be
1009    /// smaller than the total number of slots available, e.g., when the caller
1010    /// is only interested in tracking the overall match and not the spans of
1011    /// every matching capturing group. Only tracking the overall match can
1012    /// save a substantial amount of time copying capturing spans during a
1013    /// search.
1014    fn setup_search(&mut self, captures_slot_len: usize) {
1015        self.slots_for_captures = captures_slot_len;
1016    }
1017
1018    /// Return a mutable slice of the slots for the given state.
1019    ///
1020    /// Note that the length of the slice returned may be less than the total
1021    /// number of slots available for this state. In particular, the length
1022    /// always matches the number of slots indicated via 'setup_search'.
1023    fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
1024        let i = sid.as_usize() * self.slots_per_state;
1025        &mut self.table[i..i + self.slots_for_captures]
1026    }
1027
1028    /// Return a slice of slots of appropriate length where every slot offset
1029    /// is guaranteed to be absent. This is useful in cases where you need to
1030    /// compute an epsilon closure outside of the user supplied regex, and thus
1031    /// never want it to have any capturing slots set.
1032    fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
1033        let i = self.table.len() - self.slots_for_captures;
1034        &mut self.table[i..i + self.slots_for_captures]
1035    }
1036}
1037
1038/// An iterator over all non-overlapping matches for a particular search.
1039///
1040/// The iterator yields a [`Match`] value until no more matches could be found.
1041///
1042/// The lifetime parameters are as follows:
1043///
1044/// * `'r` represents the lifetime of the PikeVM.
1045/// * `'c` represents the lifetime of the PikeVM's cache.
1046/// * `'h` represents the lifetime of the haystack being searched.
1047///
1048/// This iterator can be created with the [`PikeVM::find_iter`] method.
1049#[derive(Debug)]
1050pub struct FindMatches<'r, 'c, C: Cursor> {
1051    re: &'r PikeVM,
1052    cache: &'c mut Cache,
1053    caps: Captures,
1054    it: iter::Searcher<C>,
1055}
1056
1057impl<'r, 'c, C: Cursor> Iterator for FindMatches<'r, 'c, C> {
1058    type Item = Match;
1059
1060    #[inline]
1061    fn next(&mut self) -> Option<Match> {
1062        // Splitting 'self' apart seems necessary to appease borrowck.
1063        let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = *self;
1064        // 'advance' converts errors into panics, which is OK here because
1065        // the PikeVM can never return an error.
1066        it.advance(|input| {
1067            search(re, cache, input, caps);
1068            Ok(caps.get_match())
1069        })
1070    }
1071}
regex_cursor/engines/pikevm.rs

regex_cursor/engines/
pikevm.rs