use alloc::{vec, vec::Vec};
use crate::{
int::{NonMaxUsize, U32},
nfa::{State, StateID, NFA},
pool::CachePoolGuard,
utf8,
};
#[derive(Clone, Debug)]
pub(crate) struct PikeVM {
nfa: NFA,
}
impl PikeVM {
pub(crate) fn new(nfa: NFA) -> PikeVM {
PikeVM { nfa }
}
pub(crate) fn nfa(&self) -> &NFA {
&self.nfa
}
pub(crate) fn find_iter<'r, 'h>(
&'r self,
cache: CachePoolGuard<'r>,
haystack: &'h [u8],
) -> FindMatches<'r, 'h> {
FindMatches {
pikevm: self,
cache,
haystack,
at: 0,
slots: vec![None, None],
last_match_end: None,
}
}
pub(crate) fn captures_iter<'r, 'h>(
&'r self,
cache: CachePoolGuard<'r>,
haystack: &'h [u8],
) -> CapturesMatches<'r, 'h> {
let len = self.nfa().group_len().checked_mul(2).unwrap();
CapturesMatches {
it: FindMatches {
pikevm: self,
cache,
haystack,
at: 0,
slots: vec![None; len],
last_match_end: None,
},
}
}
pub(crate) fn search(
&self,
cache: &mut Cache,
haystack: &[u8],
start: usize,
end: usize,
earliest: bool,
slots: &mut [Option<NonMaxUsize>],
) -> bool {
cache.setup_search(slots.len());
if start > end {
return false;
}
assert!(
haystack.len() < core::usize::MAX,
"byte slice lengths must be less than usize MAX",
);
let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
let start_id = self.nfa().start();
let anchored = self.nfa().is_start_anchored();
let mut matched = false;
let mut at = start;
while at <= end {
if curr.set.is_empty() {
if matched {
break;
}
if anchored && at > start {
break;
}
}
if !matched {
let slots = next.slot_table.all_absent();
self.epsilon_closure(
stack, slots, curr, haystack, at, start_id,
);
}
let (ch, len) = utf8::decode_lossy(&haystack[at..]);
if self.nexts(stack, curr, next, haystack, at, ch, len, slots) {
matched = true;
}
if (earliest && matched) || len == 0 {
break;
}
core::mem::swap(curr, next);
next.set.clear();
at += len;
}
matched
}
fn nexts(
&self,
stack: &mut Vec<FollowEpsilon>,
curr: &mut ActiveStates,
next: &mut ActiveStates,
haystack: &[u8],
at: usize,
at_ch: char,
at_len: usize,
slots: &mut [Option<NonMaxUsize>],
) -> bool {
let ActiveStates { ref set, ref mut slot_table } = *curr;
for sid in set.iter() {
if self.next(
stack, slot_table, next, haystack, at, at_ch, at_len, sid,
) {
slots.copy_from_slice(slot_table.for_state(sid));
return true;
}
}
false
}
fn next(
&self,
stack: &mut Vec<FollowEpsilon>,
curr_slot_table: &mut SlotTable,
next: &mut ActiveStates,
haystack: &[u8],
at: usize,
at_ch: char,
at_len: usize,
sid: StateID,
) -> bool {
match *self.nfa.state(sid) {
State::Fail
| State::Goto { .. }
| State::Splits { .. }
| State::Capture { .. } => false,
State::Char { target, ch } => {
if at_ch == ch && at_len > 0 {
let slots = curr_slot_table.for_state(sid);
let at = at.wrapping_add(at_len);
self.epsilon_closure(
stack, slots, next, haystack, at, target,
);
}
false
}
State::Ranges { target, ref ranges } => {
for (start, end) in ranges.iter().copied() {
if start > at_ch {
break;
} else if start <= at_ch && at_ch <= end {
if at_len == 0 {
return false;
}
let slots = curr_slot_table.for_state(sid);
let at = at.wrapping_add(at_len);
self.epsilon_closure(
stack, slots, next, haystack, at, target,
);
}
}
false
}
State::Match => true,
}
}
fn epsilon_closure(
&self,
stack: &mut Vec<FollowEpsilon>,
curr_slots: &mut [Option<NonMaxUsize>],
next: &mut ActiveStates,
haystack: &[u8],
at: usize,
sid: StateID,
) {
stack.push(FollowEpsilon::Explore(sid));
while let Some(frame) = stack.pop() {
match frame {
FollowEpsilon::RestoreCapture { slot, offset } => {
curr_slots[slot.as_usize()] = offset;
}
FollowEpsilon::Explore(sid) => {
self.epsilon_closure_explore(
stack, curr_slots, next, haystack, at, sid,
);
}
}
}
}
fn epsilon_closure_explore(
&self,
stack: &mut Vec<FollowEpsilon>,
curr_slots: &mut [Option<NonMaxUsize>],
next: &mut ActiveStates,
haystack: &[u8],
at: usize,
mut sid: StateID,
) {
loop {
if !next.set.insert(sid) {
return;
}
match *self.nfa.state(sid) {
State::Fail
| State::Match { .. }
| State::Char { .. }
| State::Ranges { .. } => {
next.slot_table.for_state(sid).copy_from_slice(curr_slots);
return;
}
State::Goto { target, look: None } => {
sid = target;
}
State::Goto { target, look: Some(look) } => {
if !look.is_match(haystack, at) {
return;
}
sid = target;
}
State::Splits { ref targets, reverse: false } => {
sid = match targets.get(0) {
None => return,
Some(&sid) => sid,
};
stack.extend(
targets[1..]
.iter()
.copied()
.rev()
.map(FollowEpsilon::Explore),
);
}
State::Splits { ref targets, reverse: true } => {
sid = match targets.last() {
None => return,
Some(&sid) => sid,
};
stack.extend(
targets[..targets.len() - 1]
.iter()
.copied()
.map(FollowEpsilon::Explore),
);
}
State::Capture { target, slot } => {
if slot.as_usize() < curr_slots.len() {
stack.push(FollowEpsilon::RestoreCapture {
slot,
offset: curr_slots[slot.as_usize()],
});
curr_slots[slot.as_usize()] =
Some(NonMaxUsize::new(at).unwrap());
}
sid = target;
}
}
}
}
}
#[derive(Debug)]
pub(crate) struct FindMatches<'r, 'h> {
pikevm: &'r PikeVM,
cache: CachePoolGuard<'r>,
haystack: &'h [u8],
at: usize,
slots: Vec<Option<NonMaxUsize>>,
last_match_end: Option<usize>,
}
impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
type Item = (usize, usize);
fn next(&mut self) -> Option<(usize, usize)> {
if !self.pikevm.search(
&mut self.cache,
self.haystack,
self.at,
self.haystack.len(),
false,
&mut self.slots,
) {
return None;
}
let mut m =
(self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
if m.0 >= m.1 {
m = self.handle_overlapping_empty_match(m)?;
}
self.at = m.1;
self.last_match_end = Some(m.1);
Some(m)
}
}
impl<'r, 'h> FindMatches<'r, 'h> {
#[cold]
#[inline(never)]
fn handle_overlapping_empty_match(
&mut self,
mut m: (usize, usize),
) -> Option<(usize, usize)> {
assert!(m.0 >= m.1);
if Some(m.1) == self.last_match_end {
let len =
core::cmp::max(1, utf8::decode(&self.haystack[self.at..]).1);
self.at = self.at.checked_add(len).unwrap();
if !self.pikevm.search(
&mut self.cache,
self.haystack,
self.at,
self.haystack.len(),
false,
&mut self.slots,
) {
return None;
}
m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
}
Some(m)
}
}
#[derive(Debug)]
pub(crate) struct CapturesMatches<'r, 'h> {
it: FindMatches<'r, 'h>,
}
impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
type Item = Vec<Option<NonMaxUsize>>;
fn next(&mut self) -> Option<Vec<Option<NonMaxUsize>>> {
self.it.next()?;
Some(self.it.slots.clone())
}
}
#[derive(Clone, Debug)]
pub(crate) struct Cache {
stack: Vec<FollowEpsilon>,
curr: ActiveStates,
next: ActiveStates,
}
impl Cache {
pub(crate) fn new(re: &PikeVM) -> Cache {
Cache {
stack: vec![],
curr: ActiveStates::new(re),
next: ActiveStates::new(re),
}
}
fn setup_search(&mut self, captures_slot_len: usize) {
self.stack.clear();
self.curr.setup_search(captures_slot_len);
self.next.setup_search(captures_slot_len);
}
}
#[derive(Clone, Debug)]
struct ActiveStates {
set: SparseSet,
slot_table: SlotTable,
}
impl ActiveStates {
fn new(re: &PikeVM) -> ActiveStates {
let mut active = ActiveStates {
set: SparseSet::new(0),
slot_table: SlotTable::new(),
};
active.reset(re);
active
}
fn reset(&mut self, re: &PikeVM) {
self.set.resize(re.nfa().len());
self.slot_table.reset(re);
}
fn setup_search(&mut self, captures_slot_len: usize) {
self.set.clear();
self.slot_table.setup_search(captures_slot_len);
}
}
#[derive(Clone, Debug)]
struct SlotTable {
table: Vec<Option<NonMaxUsize>>,
slots_per_state: usize,
slots_for_captures: usize,
}
impl SlotTable {
fn new() -> SlotTable {
SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
}
fn reset(&mut self, re: &PikeVM) {
let nfa = re.nfa();
self.slots_per_state = nfa.group_len().checked_mul(2).unwrap();
self.slots_for_captures = self.slots_per_state;
let len = nfa
.len()
.checked_add(1)
.and_then(|x| x.checked_mul(self.slots_per_state))
.expect("slot table length doesn't overflow");
self.table.resize(len, None);
}
fn setup_search(&mut self, captures_slot_len: usize) {
self.slots_for_captures = captures_slot_len;
}
fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
let i = sid.as_usize() * self.slots_per_state;
&mut self.table[i..i + self.slots_for_captures]
}
fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
let i = self.table.len() - self.slots_per_state;
&mut self.table[i..i + self.slots_for_captures]
}
}
#[derive(Clone, Debug)]
enum FollowEpsilon {
Explore(StateID),
RestoreCapture { slot: u32, offset: Option<NonMaxUsize> },
}
#[derive(Clone)]
struct SparseSet {
len: usize,
dense: Vec<StateID>,
sparse: Vec<StateID>,
}
impl SparseSet {
fn new(capacity: usize) -> SparseSet {
let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
set.resize(capacity);
set
}
fn resize(&mut self, new_capacity: usize) {
assert!(
new_capacity <= u32::MAX.as_usize(),
"sparse set capacity cannot exceed {:?}",
u32::MAX,
);
self.clear();
self.dense.resize(new_capacity, 0);
self.sparse.resize(new_capacity, 0);
}
fn capacity(&self) -> usize {
self.dense.len()
}
fn len(&self) -> usize {
self.len
}
fn is_empty(&self) -> bool {
self.len() == 0
}
fn insert(&mut self, id: StateID) -> bool {
if self.contains(id) {
return false;
}
let index = self.len();
assert!(
index < self.capacity(),
"{:?} exceeds capacity of {:?} when inserting {:?}",
index,
self.capacity(),
id,
);
self.dense[index] = id;
self.sparse[id.as_usize()] = u32::try_from(index).unwrap();
self.len += 1;
true
}
fn contains(&self, id: StateID) -> bool {
let index = self.sparse[id.as_usize()];
index.as_usize() < self.len() && self.dense[index.as_usize()] == id
}
fn clear(&mut self) {
self.len = 0;
}
fn iter(&self) -> SparseSetIter<'_> {
SparseSetIter(self.dense[..self.len()].iter())
}
}
impl core::fmt::Debug for SparseSet {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
let elements: Vec<StateID> = self.iter().collect();
f.debug_tuple("SparseSet").field(&elements).finish()
}
}
#[derive(Debug)]
struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
impl<'a> Iterator for SparseSetIter<'a> {
type Item = StateID;
fn next(&mut self) -> Option<StateID> {
self.0.next().map(|&id| id)
}
}