#[cfg(feature = "internal-instrument-pikevm")]
use core::cell::RefCell;
pub use regex_automata::nfa::thompson::pikevm::{Builder, Config, PikeVM};
use regex_automata::nfa::thompson::State;
use regex_automata::util::captures::Captures;
use regex_automata::util::primitives::{NonMaxUsize, SmallIndex, StateID};
use regex_automata::{Anchored, HalfMatch, Match, MatchKind, PatternID};
use crate::cursor::Cursor;
use crate::util::sparse_set::SparseSet;
use crate::util::{empty, iter};
use crate::{literal, Input};
#[cfg(test)]
mod tests;
#[inline]
pub fn find_iter<'r, 'c, C: Cursor>(
vm: &'r PikeVM,
cache: &'c mut Cache,
input: Input<C>,
) -> FindMatches<'r, 'c, C> {
let caps = Captures::matches(vm.get_nfa().group_info().clone());
let it = iter::Searcher::new(input);
FindMatches { re: vm, cache, caps, it }
}
#[inline]
pub fn search<C: Cursor>(
vm: &PikeVM,
cache: &mut Cache,
input: &mut Input<C>,
caps: &mut Captures,
) {
caps.set_pattern(None);
let pid = search_slots(vm, cache, input, caps.slots_mut());
caps.set_pattern(pid);
}
#[inline]
pub fn is_match<C: Cursor>(vm: &PikeVM, cache: &mut Cache, input: &mut Input<C>) -> bool {
input.with(|input| {
let input = input.earliest(true);
search_slots(vm, cache, input, &mut []).is_some()
})
}
macro_rules! instrument {
($fun:expr) => {
#[cfg(feature = "internal-instrument-pikevm")]
{
let fun: &mut dyn FnMut(&mut Counters) = &mut $fun;
COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut()));
}
};
}
#[cfg(feature = "internal-instrument-pikevm")]
std::thread_local! {
static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty());
}
pub fn search_slots<C: Cursor>(
vm: &PikeVM,
cache: &mut Cache,
input: &mut Input<C>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
let utf8empty = vm.get_nfa().has_empty() && vm.get_nfa().is_utf8();
if !utf8empty {
let hm = search_slots_imp(vm, cache, input, slots)?;
return Some(hm.pattern());
}
let min = vm.get_nfa().group_info().implicit_slot_len();
if slots.len() >= min {
let hm = search_slots_imp(vm, cache, input, slots)?;
return Some(hm.pattern());
}
if vm.get_nfa().pattern_len() == 1 {
let mut enough = [None, None];
let got = search_slots_imp(vm, cache, input, &mut enough);
slots.copy_from_slice(&enough[..slots.len()]);
return got.map(|hm| hm.pattern());
}
let mut enough = vec![None; min];
let got = search_slots_imp(vm, cache, input, &mut enough);
slots.copy_from_slice(&enough[..slots.len()]);
got.map(|hm| hm.pattern())
}
#[inline(never)]
fn search_slots_imp<C: Cursor>(
vm: &PikeVM,
cache: &mut Cache,
input: &mut Input<C>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<HalfMatch> {
let utf8empty = vm.get_nfa().has_empty() && vm.get_nfa().is_utf8();
let hm = match search_imp(vm, cache, input, slots) {
None => return None,
Some(hm) if !utf8empty => return Some(hm),
Some(hm) => hm,
};
empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
Ok(search_imp(vm, cache, input, slots).map(|hm| (hm, hm.offset())))
})
.unwrap()
}
fn start_config<C: Cursor>(vm: &PikeVM, input: &Input<C>) -> Option<(bool, StateID)> {
match input.get_anchored() {
Anchored::No => {
Some((vm.get_nfa().is_always_start_anchored(), vm.get_nfa().start_anchored()))
}
Anchored::Yes => Some((true, vm.get_nfa().start_anchored())),
Anchored::Pattern(pid) => Some((true, vm.get_nfa().start_pattern(pid)?)),
}
}
fn search_imp<C: Cursor>(
vm: &PikeVM,
cache: &mut Cache,
input: &mut Input<C>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<HalfMatch> {
cache.setup_search(slots.len());
if input.is_done() {
return None;
}
instrument!(|c| c.reset(&self.nfa));
let allmatches = vm.get_config().get_match_kind() == MatchKind::All;
let (anchored, start_id) = match start_config(vm, input) {
None => return None,
Some(config) => config,
};
let pre = if anchored { None } else { vm.get_config().get_prefilter() };
let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
let mut hm = None;
input.move_to(input.start());
input.clear_look_behind();
input.ensure_look_behind();
while input.at() <= input.end() {
if curr.set.is_empty() {
if hm.is_some() && !allmatches {
break;
}
if anchored && input.at() > input.start() {
break;
}
if let Some(pre) = pre {
let chunk_offst = input.chunk_offset();
match literal::find(pre, input) {
None => break,
Some(ref span) => {
input.move_to(span.start);
if chunk_offst != input.chunk_offset() {
input.clear_look_behind();
input.ensure_look_behind();
}
}
}
}
}
if (hm.is_none() || allmatches) && (!anchored || input.at() == input.start()) {
let slots = next.slot_table.all_absent();
epsilon_closure(vm, stack, slots, curr, input, start_id);
}
input.chunk_pos += 1;
if input.chunk_pos() >= input.chunk().len() {
input.advance_with_look_behind();
}
if let Some(pid) = nexts(vm, stack, curr, next, input, slots) {
hm = Some(HalfMatch::new(pid, input.at() - 1));
}
if input.get_earliest() && hm.is_some() {
break;
}
core::mem::swap(curr, next);
next.set.clear();
}
instrument!(|c| c.eprint(&self.nfa));
hm
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn nexts<C: Cursor>(
vm: &PikeVM,
stack: &mut Vec<FollowEpsilon>,
curr: &mut ActiveStates,
next_: &mut ActiveStates,
input: &mut Input<C>,
slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
instrument!(|c| c.record_state_set(&curr.set));
let mut pid = None;
let ActiveStates { ref set, ref mut slot_table } = *curr;
for sid in set.iter() {
pid = match next(vm, stack, slot_table, next_, input, sid) {
None => continue,
Some(pid) => Some(pid),
};
slots.copy_from_slice(slot_table.for_state(sid));
if vm.get_config().get_match_kind() != MatchKind::All {
break;
}
}
pid
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn next<C: Cursor>(
vm: &PikeVM,
stack: &mut Vec<FollowEpsilon>,
curr_slot_table: &mut SlotTable,
next: &mut ActiveStates,
input: &mut Input<C>,
sid: StateID,
) -> Option<PatternID> {
instrument!(|c| c.record_step(sid));
let state = vm.get_nfa().state(sid);
match *state {
State::Fail
| State::Look { .. }
| State::Union { .. }
| State::BinaryUnion { .. }
| State::Capture { .. } => None,
State::ByteRange { ref trans } => {
let (chunk, pos) = input.look_around();
if trans.matches(chunk, pos - 1) {
let slots = curr_slot_table.for_state(sid);
epsilon_closure(vm, stack, slots, next, input, trans.next);
}
None
}
State::Sparse(ref sparse) => {
let (chunk, pos) = input.look_around();
if let Some(next_sid) = sparse.matches(chunk, pos - 1) {
let slots = curr_slot_table.for_state(sid);
epsilon_closure(vm, stack, slots, next, input, next_sid);
}
None
}
State::Dense(ref dense) => {
let (chunk, pos) = input.look_around();
if let Some(next_sid) = dense.matches(chunk, pos - 1) {
let slots = curr_slot_table.for_state(sid);
epsilon_closure(vm, stack, slots, next, input, next_sid);
}
None
}
State::Match { pattern_id } => Some(pattern_id),
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn epsilon_closure<C: Cursor>(
vm: &PikeVM,
stack: &mut Vec<FollowEpsilon>,
curr_slots: &mut [Option<NonMaxUsize>],
next: &mut ActiveStates,
input: &mut Input<C>,
sid: StateID,
) {
instrument!(|c| {
c.record_closure(sid);
c.record_stack_push(sid);
});
stack.push(FollowEpsilon::Explore(sid));
while let Some(frame) = stack.pop() {
match frame {
FollowEpsilon::RestoreCapture { slot, offset: pos } => {
curr_slots[slot] = pos;
}
FollowEpsilon::Explore(sid) => {
epsilon_closure_explore(vm, stack, curr_slots, next, input, sid);
}
}
}
}
#[cfg_attr(feature = "perf-inline", inline(always))]
fn epsilon_closure_explore<C: Cursor>(
vm: &PikeVM,
stack: &mut Vec<FollowEpsilon>,
curr_slots: &mut [Option<NonMaxUsize>],
next: &mut ActiveStates,
input: &mut Input<C>,
mut sid: StateID,
) {
loop {
instrument!(|c| c.record_set_insert(sid));
if !next.set.insert(sid) {
return;
}
match *vm.get_nfa().state(sid) {
State::Fail
| State::Match { .. }
| State::ByteRange { .. }
| State::Sparse { .. }
| State::Dense { .. } => {
next.slot_table.for_state(sid).copy_from_slice(curr_slots);
return;
}
State::Look { look, next } => {
let (chunk, at) = input.look_around();
if !vm.get_nfa().look_matcher().matches(look, chunk, at) {
return;
}
sid = next;
}
State::Union { ref alternates } => {
sid = match alternates.get(0) {
None => return,
Some(&sid) => sid,
};
instrument!(|c| {
for &alt in &alternates[1..] {
c.record_stack_push(alt);
}
});
stack.extend(alternates[1..].iter().copied().rev().map(FollowEpsilon::Explore));
}
State::BinaryUnion { alt1, alt2 } => {
sid = alt1;
instrument!(|c| c.record_stack_push(sid));
stack.push(FollowEpsilon::Explore(alt2));
}
State::Capture { next, slot, .. } => {
if slot.as_usize() < curr_slots.len() {
instrument!(|c| c.record_stack_push(sid));
stack.push(FollowEpsilon::RestoreCapture { slot, offset: curr_slots[slot] });
curr_slots[slot] = Some(NonMaxUsize::new(input.at()).unwrap());
}
sid = next;
}
}
}
}
#[derive(Clone, Debug)]
pub struct Cache {
stack: Vec<FollowEpsilon>,
curr: ActiveStates,
next: ActiveStates,
}
impl Cache {
pub fn new(re: &PikeVM) -> Cache {
Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re) }
}
pub fn reset(&mut self, re: &PikeVM) {
self.curr.reset(re);
self.next.reset(re);
}
pub fn memory_usage(&self) -> usize {
use core::mem::size_of;
(self.stack.len() * size_of::<FollowEpsilon>())
+ self.curr.memory_usage()
+ self.next.memory_usage()
}
fn setup_search(&mut self, captures_slot_len: usize) {
self.stack.clear();
self.curr.setup_search(captures_slot_len);
self.next.setup_search(captures_slot_len);
}
}
#[derive(Clone, Debug)]
enum FollowEpsilon {
Explore(StateID),
RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
}
#[derive(Clone, Debug)]
struct ActiveStates {
set: SparseSet,
slot_table: SlotTable,
}
impl ActiveStates {
fn new(re: &PikeVM) -> ActiveStates {
let mut active = ActiveStates { set: SparseSet::new(0), slot_table: SlotTable::new() };
active.reset(re);
active
}
fn reset(&mut self, re: &PikeVM) {
self.set.resize(re.get_nfa().states().len());
self.slot_table.reset(re);
}
fn memory_usage(&self) -> usize {
self.set.memory_usage() + self.slot_table.memory_usage()
}
fn setup_search(&mut self, captures_slot_len: usize) {
self.set.clear();
self.slot_table.setup_search(captures_slot_len);
}
}
#[derive(Clone, Debug)]
struct SlotTable {
table: Vec<Option<NonMaxUsize>>,
slots_per_state: usize,
slots_for_captures: usize,
}
impl SlotTable {
fn new() -> SlotTable {
SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
}
fn reset(&mut self, re: &PikeVM) {
let nfa = re.get_nfa();
self.slots_per_state = nfa.group_info().slot_len();
self.slots_for_captures =
core::cmp::max(self.slots_per_state, nfa.pattern_len().checked_mul(2).unwrap());
let len = nfa
.states()
.len()
.checked_mul(self.slots_per_state)
.and_then(|x| x.checked_add(self.slots_for_captures))
.expect("slot table length doesn't overflow");
self.table.resize(len, None);
}
fn memory_usage(&self) -> usize {
self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>()
}
fn setup_search(&mut self, captures_slot_len: usize) {
self.slots_for_captures = captures_slot_len;
}
fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
let i = sid.as_usize() * self.slots_per_state;
&mut self.table[i..i + self.slots_for_captures]
}
fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
let i = self.table.len() - self.slots_for_captures;
&mut self.table[i..i + self.slots_for_captures]
}
}
#[derive(Debug)]
pub struct FindMatches<'r, 'c, C: Cursor> {
re: &'r PikeVM,
cache: &'c mut Cache,
caps: Captures,
it: iter::Searcher<C>,
}
impl<'r, 'c, C: Cursor> Iterator for FindMatches<'r, 'c, C> {
type Item = Match;
#[inline]
fn next(&mut self) -> Option<Match> {
let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = *self;
it.advance(|input| {
search(re, cache, input, caps);
Ok(caps.get_match())
})
}
}