use std::num::NonZero;
use std::sync::Arc;
use rustc_hash::{FxHashMap, FxHashSet};
use smallvec::{SmallVec, smallvec};
use super::small_table::{AccelInfo, BYTE_CEILING, BYTE_CEILING_U8, FieldMatcher};
use super::sparse_set::SparseSet;
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct StateId(u32);
impl StateId {
pub const NONE: Self = Self(u32::MAX);
pub const DEAD: Self = Self(u32::MAX - 1);
#[inline]
#[must_use]
#[allow(
clippy::cast_possible_truncation,
reason = "StateId encodes a u32-bounded index; arenas can't reach 2^32 states"
)]
pub const fn from_index(index: usize) -> Self {
Self(index as u32)
}
#[inline]
#[must_use]
pub const fn is_none(self) -> bool {
self.0 == u32::MAX
}
#[inline]
#[must_use]
pub const fn is_dead(self) -> bool {
self.0 == u32::MAX - 1
}
#[inline]
#[must_use]
pub const fn index(self) -> usize {
self.0 as usize
}
}
#[inline]
fn state_count_u32(count: usize, what: &'static str) -> u32 {
u32::try_from(count)
.unwrap_or_else(|_| panic!("arena {what} exceeds u32::MAX (arena overflow)"))
}
#[inline]
fn buffer_offset_u32(offset: usize, what: &'static str) -> u32 {
u32::try_from(offset)
.unwrap_or_else(|_| panic!("arena {what} exceeds u32::MAX (buffer overflow)"))
}
#[derive(Clone, Default)]
pub struct FaState {
pub table: SmallTable,
pub field_transitions: SmallVec<[Arc<FieldMatcher>; 1]>,
pub closure_start: u32,
pub closure_len: u16,
pub ft_start: u32,
pub ft_len: u8,
}
impl std::fmt::Debug for FaState {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("FaState")
.field("table", &self.table)
.field("field_transitions_count", &self.field_transitions.len())
.finish()
}
}
impl FaState {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn with_table(table: SmallTable) -> Self {
Self {
table,
field_transitions: SmallVec::new(),
closure_start: 0,
closure_len: 0,
ft_start: 0,
ft_len: 0,
}
}
}
#[derive(Clone, Debug)]
pub struct SmallTable {
pub ceilings: SmallVec<[u8; 8]>,
pub steps: SmallVec<[StateId; 8]>,
pub epsilons: SmallVec<[StateId; 2]>,
pub accel: Option<AccelInfo>,
}
impl Default for SmallTable {
fn default() -> Self {
Self::new()
}
}
impl SmallTable {
#[must_use]
pub fn new() -> Self {
Self {
ceilings: smallvec![BYTE_CEILING_U8],
steps: smallvec![StateId::NONE],
epsilons: SmallVec::new(),
accel: None,
}
}
#[must_use]
pub fn with_mappings(default: StateId, bytes: &[u8], targets: &[StateId]) -> Self {
let mut unpacked = [StateId::NONE; BYTE_CEILING];
if !default.is_none() {
unpacked.fill(default);
}
for (b, t) in bytes.iter().zip(targets.iter()) {
unpacked[*b as usize] = *t;
}
let mut table = Self::new();
table.pack(&unpacked);
table
}
pub fn pack(&mut self, unpacked: &[StateId; BYTE_CEILING]) {
self.ceilings.clear();
self.steps.clear();
let mut current = unpacked[0];
for (i, &state_id) in unpacked.iter().enumerate() {
if state_id != current {
self.ceilings
.push(u8::try_from(i).expect("unpacked index bounded by BYTE_CEILING"));
self.steps.push(current);
current = state_id;
}
}
self.ceilings.push(BYTE_CEILING_U8);
self.steps.push(current);
}
pub fn set_transition(&mut self, byte: u8, target: StateId) {
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpack_arena_table(self, &mut unpacked);
unpacked[byte as usize] = target;
self.pack(&unpacked);
}
#[inline(always)]
#[allow(unsafe_code)]
#[must_use]
pub fn dstep(&self, byte: u8) -> StateId {
let ceilings = self.ceilings.as_slice();
for (i, &ceiling) in ceilings.iter().enumerate() {
if byte < ceiling {
return unsafe { *self.steps.as_slice().get_unchecked(i) };
}
}
StateId::NONE
}
}
#[derive(Clone, Debug, Default)]
pub struct Stats {
pub state_count: u32,
pub tables_with_transitions: u32,
pub total_ceiling_entries: u32,
pub max_ceilings: u16,
pub total_epsilons: u32,
pub max_epsilons: u16,
pub states_with_field_transitions: u32,
pub closure_data_len: u32,
pub states_with_closures: u32,
pub total_closure_entries: u32,
pub max_closure_len: u16,
pub ft_ptrs_len: u32,
pub dfa_lookup_states: u32,
pub estimated_bytes: usize,
}
impl std::fmt::Display for Stats {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"states={}, tables={} (avg_ceil={:.1}, max_ceil={}), \
epsilons={} (max={}), field_trans={}, \
closures={}/{} (avg={:.1}, max={}), \
ft_ptrs={}, dfa_lookup={}, ~{}KB",
self.state_count,
self.tables_with_transitions,
if self.tables_with_transitions > 0 {
f64::from(self.total_ceiling_entries) / f64::from(self.tables_with_transitions)
} else {
0.0
},
self.max_ceilings,
self.total_epsilons,
self.max_epsilons,
self.states_with_field_transitions,
self.states_with_closures,
self.state_count,
if self.states_with_closures > 0 {
f64::from(self.total_closure_entries) / f64::from(self.states_with_closures)
} else {
0.0
},
self.max_closure_len,
self.ft_ptrs_len,
self.dfa_lookup_states,
self.estimated_bytes / 1024,
)
}
}
impl Stats {
pub fn add(&mut self, other: &Self) {
self.state_count += other.state_count;
self.tables_with_transitions += other.tables_with_transitions;
self.total_ceiling_entries += other.total_ceiling_entries;
self.max_ceilings = self.max_ceilings.max(other.max_ceilings);
self.total_epsilons += other.total_epsilons;
self.max_epsilons = self.max_epsilons.max(other.max_epsilons);
self.states_with_field_transitions += other.states_with_field_transitions;
self.closure_data_len += other.closure_data_len;
self.states_with_closures += other.states_with_closures;
self.total_closure_entries += other.total_closure_entries;
self.max_closure_len = self.max_closure_len.max(other.max_closure_len);
self.ft_ptrs_len += other.ft_ptrs_len;
self.dfa_lookup_states += other.dfa_lookup_states;
self.estimated_bytes += other.estimated_bytes;
}
}
#[allow(clippy::module_name_repetitions)]
#[derive(Clone, Default)]
pub struct StateArena {
states: Vec<FaState>,
closure_data: Vec<StateId>,
ft_ptrs: Vec<usize>,
dfa_lookup: Vec<StateId>,
}
impl std::fmt::Debug for StateArena {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("StateArena")
.field("states_count", &self.states.len())
.finish()
}
}
impl StateArena {
#[must_use]
pub const fn new() -> Self {
Self {
states: Vec::new(),
closure_data: Vec::new(),
ft_ptrs: Vec::new(),
dfa_lookup: Vec::new(),
}
}
#[must_use]
pub fn with_capacity(capacity: usize) -> Self {
Self {
states: Vec::with_capacity(capacity),
closure_data: Vec::with_capacity(capacity),
ft_ptrs: Vec::new(),
dfa_lookup: Vec::new(),
}
}
#[must_use]
pub const fn estimated_byte_size(&self) -> usize {
self.states.capacity() * std::mem::size_of::<FaState>()
+ self.closure_data.capacity() * std::mem::size_of::<StateId>()
+ self.ft_ptrs.capacity() * std::mem::size_of::<usize>()
+ self.dfa_lookup.capacity() * std::mem::size_of::<StateId>()
}
#[inline(always)]
#[allow(unsafe_code)]
unsafe fn state_unchecked(&self, id: StateId) -> &FaState {
unsafe { self.states.get_unchecked(id.index()) }
}
#[inline(always)]
#[allow(unsafe_code)]
#[must_use]
pub fn closure_of(&self, id: StateId) -> &[StateId] {
unsafe {
let state = self.state_unchecked(id);
let start = state.closure_start as usize;
let len = state.closure_len as usize;
self.closure_data.get_unchecked(start..start + len)
}
}
#[inline(always)]
#[allow(unsafe_code)]
#[must_use]
pub fn ft_ptrs_of(&self, id: StateId) -> &[usize] {
unsafe {
let state = self.state_unchecked(id);
let len = state.ft_len as usize;
if len == 0 {
return &[];
}
let start = state.ft_start as usize;
self.ft_ptrs.get_unchecked(start..start + len)
}
}
#[inline(always)]
#[allow(unsafe_code)]
#[must_use]
pub fn dstep(&self, id: StateId, byte: u8) -> StateId {
if self.dfa_lookup.is_empty() {
self.states[id.index()].table.dstep(byte)
} else {
unsafe {
*self
.dfa_lookup
.get_unchecked(id.index() * 256 + byte as usize)
}
}
}
pub fn alloc(&mut self) -> StateId {
let id = StateId::from_index(self.states.len());
let state = FaState {
closure_start: buffer_offset_u32(self.closure_data.len(), "closure_data length"),
closure_len: 1,
..Default::default()
};
self.closure_data.push(id);
self.states.push(state);
id
}
pub fn alloc_with_table(&mut self, table: SmallTable) -> StateId {
let id = StateId::from_index(self.states.len());
let mut state = FaState::with_table(table);
state.closure_start = buffer_offset_u32(self.closure_data.len(), "closure_data length");
state.closure_len = 1;
self.closure_data.push(id);
self.states.push(state);
id
}
#[inline]
#[must_use]
pub fn get(&self, id: StateId) -> Option<&FaState> {
if id.is_none() {
None
} else {
self.states.get(id.index())
}
}
#[inline]
pub fn get_mut(&mut self, id: StateId) -> Option<&mut FaState> {
if id.is_none() {
None
} else {
self.states.get_mut(id.index())
}
}
#[must_use]
pub const fn len(&self) -> usize {
self.states.len()
}
#[must_use]
pub const fn is_empty(&self) -> bool {
self.states.is_empty()
}
#[must_use]
pub fn stats(&self) -> Stats {
let state_count = self.states.len();
if state_count == 0 {
return Stats::default();
}
let mut tables_with_transitions = 0u32;
let mut total_ceiling_entries = 0u32;
let mut max_ceilings = 0u16;
let mut total_epsilons = 0u32;
let mut max_epsilons = 0u16;
let mut states_with_field_transitions = 0u32;
let mut total_closure_entries = 0u32;
let mut max_closure_len = 0u16;
let mut states_with_closures = 0u32;
for state in &self.states {
let nc = state.table.ceilings.len();
if nc > 1 {
tables_with_transitions += 1;
total_ceiling_entries +=
u32::try_from(nc).expect("ceiling count bounded by BYTE_CEILING");
let nc_u16 = u16::try_from(nc).expect("ceiling count bounded by BYTE_CEILING");
max_ceilings = max_ceilings.max(nc_u16);
}
let ne = state.table.epsilons.len();
if ne > 0 {
total_epsilons += u32::try_from(ne).expect("epsilon count bounded by BYTE_CEILING");
let ne_u16 = u16::try_from(ne).expect("epsilon count bounded by BYTE_CEILING");
max_epsilons = max_epsilons.max(ne_u16);
}
if !state.field_transitions.is_empty() {
states_with_field_transitions += 1;
}
if state.closure_len > 0 {
states_with_closures += 1;
total_closure_entries += u32::from(state.closure_len);
max_closure_len = max_closure_len.max(state.closure_len);
}
}
let dfa_lookup_states = if self.dfa_lookup.is_empty() {
0
} else {
self.dfa_lookup.len() / 256
};
Stats {
state_count: state_count_u32(state_count, "state count"),
tables_with_transitions,
total_ceiling_entries,
max_ceilings,
total_epsilons,
max_epsilons,
states_with_field_transitions,
closure_data_len: buffer_offset_u32(self.closure_data.len(), "closure_data length"),
states_with_closures,
total_closure_entries,
max_closure_len,
ft_ptrs_len: buffer_offset_u32(self.ft_ptrs.len(), "ft_ptrs length"),
dfa_lookup_states: state_count_u32(dfa_lookup_states, "dfa_lookup state count"),
estimated_bytes: self.estimated_byte_size(),
}
}
#[must_use]
pub fn is_nondeterministic(&self) -> bool {
self.states
.iter()
.any(|state| !state.table.epsilons.is_empty())
}
pub fn precompute_epsilon_closures(&mut self) {
let arena_len = self.states.len();
if arena_len == 0 {
return;
}
let mut seen = SparseSet::new(arena_len);
let mut stack: Vec<StateId> = Vec::new();
let mut closure_data: Vec<StateId> = Vec::with_capacity(arena_len);
let mut closure_buf: Vec<StateId> = Vec::new();
for state_idx in 0..arena_len {
let state_id = StateId::from_index(state_idx);
let start = buffer_offset_u32(closure_data.len(), "closure_data length");
if self.states[state_idx].table.epsilons.is_empty() {
closure_data.push(state_id);
self.states[state_idx].closure_start = start;
self.states[state_idx].closure_len = 1;
} else {
seen.clear();
stack.clear();
closure_buf.clear();
closure_buf.push(state_id);
stack.push(state_id);
seen.insert(state_idx);
while let Some(current_id) = stack.pop() {
if current_id.is_none() {
continue;
}
for &eps_id in &self.states[current_id.index()].table.epsilons {
if !eps_id.is_none() {
let idx = eps_id.index();
if idx < seen.capacity() && seen.insert(idx) {
closure_buf.push(eps_id);
stack.push(eps_id);
}
}
}
}
let len = u16::try_from(closure_buf.len())
.expect("epsilon closure exceeds u16::MAX states");
closure_data.extend_from_slice(&closure_buf);
self.states[state_idx].closure_start = start;
self.states[state_idx].closure_len = len;
}
}
self.closure_data = closure_data;
}
pub fn flatten_tables(&mut self) {
self.flatten_ft_ptrs();
self.build_dfa_lookup();
}
fn flatten_ft_ptrs(&mut self) {
let arena_len = self.states.len();
if arena_len == 0 {
return;
}
let mut ft_ptrs = Vec::new();
for state_idx in 0..arena_len {
let state = &self.states[state_idx];
let ft_start = ft_ptrs.len();
let ft_len = state.field_transitions.len();
for ft in &state.field_transitions {
ft_ptrs.push(Arc::as_ptr(ft) as usize);
}
self.states[state_idx].ft_start = buffer_offset_u32(ft_start, "ft_start offset");
self.states[state_idx].ft_len =
u8::try_from(ft_len).expect("per-state field_transitions.len() fits in u8");
}
self.ft_ptrs = ft_ptrs;
}
#[cfg(not(miri))]
fn build_dfa_lookup(&mut self) {
let arena_len = self.states.len();
if arena_len == 0 {
return;
}
let mut dfa_lookup = vec![StateId::NONE; arena_len * 256];
for state_idx in 0..arena_len {
let state = &self.states[state_idx];
let ceilings = state.table.ceilings.as_slice();
let steps = state.table.steps.as_slice();
let base = state_idx * 256;
let mut prev_ceiling: u8 = 0;
for (ci, &ceiling) in ceilings.iter().enumerate() {
let step = steps[ci];
for byte in prev_ceiling..ceiling {
dfa_lookup[base + byte as usize] = step;
}
prev_ceiling = ceiling;
}
}
self.dfa_lookup = dfa_lookup;
}
#[cfg(miri)]
fn build_dfa_lookup(&mut self) {}
#[must_use]
pub fn nfa_to_dfa(&self, start: StateId, state_budget: usize) -> Option<(Self, StateId)> {
if start.is_none() || self.states.is_empty() {
return Some((Self::new(), StateId::NONE));
}
debug_assert!(
!self.closure_data.is_empty(),
"epsilon closures must be precomputed before nfa_to_dfa"
);
let mut dfa_arena = Self::with_capacity(self.states.len());
let mut state_map: FxHashMap<Vec<u32>, StateId> = FxHashMap::default();
let mut worklist: Vec<StateId> = Vec::new();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
let mut dfa_unpacked = [StateId::NONE; BYTE_CEILING];
let mut closure_set: Vec<StateId> = Vec::new();
let mut seen: FxHashSet<StateId> = FxHashSet::default();
let mut byte_to_next: Vec<Vec<StateId>> = (0..BYTE_CEILING).map(|_| Vec::new()).collect();
let start_closure = self.closure_of(start);
let start_key = Self::make_state_set_key(start_closure);
let dfa_start = dfa_arena.alloc();
state_map.insert(start_key, dfa_start);
worklist.push(dfa_start);
let mut dfa_nfa_sets: Vec<Vec<StateId>> = Vec::new();
dfa_nfa_sets.push(start_closure.to_vec());
Self::collect_field_transitions(self, start_closure, &mut dfa_arena[dfa_start]);
while let Some(dfa_state) = worklist.pop() {
let nfa_states = dfa_nfa_sets[dfa_state.index()].clone();
for v in &mut byte_to_next {
v.clear();
}
for &nfa_state in &nfa_states {
if nfa_state.is_none() {
continue;
}
unpack_arena_table(&self[nfa_state].table, &mut unpacked);
for byte in 0..BYTE_CEILING {
let target = unpacked[byte];
if !target.is_none() {
let target_closure = self.closure_of(target);
byte_to_next[byte].extend_from_slice(target_closure);
}
}
}
dfa_unpacked.fill(StateId::NONE);
for byte in 0..BYTE_CEILING {
if byte_to_next[byte].is_empty() {
continue;
}
seen.clear();
closure_set.clear();
for &s in &byte_to_next[byte] {
if seen.insert(s) {
closure_set.push(s);
}
}
closure_set.sort_unstable_by_key(|s| s.0);
let key = Self::make_state_set_key(&closure_set);
let dfa_next = if let Some(&existing) = state_map.get(&key) {
existing
} else {
if dfa_arena.len() >= state_budget {
return None; }
let new_dfa = dfa_arena.alloc();
state_map.insert(key, new_dfa);
dfa_nfa_sets.push(closure_set.clone());
Self::collect_field_transitions(self, &closure_set, &mut dfa_arena[new_dfa]);
worklist.push(new_dfa);
new_dfa
};
dfa_unpacked[byte] = dfa_next;
}
dfa_arena[dfa_state].table.pack(&dfa_unpacked);
}
dfa_arena.precompute_epsilon_closures();
dfa_arena.compute_dfa_accel();
Some((dfa_arena, dfa_start))
}
fn compute_dfa_accel(&mut self) {
let mut unpacked = [StateId::NONE; BYTE_CEILING];
for state_idx in 0..self.states.len() {
let state_id = StateId::from_index(state_idx);
unpack_arena_table(&self.states[state_idx].table, &mut unpacked);
let mut has_self_loop = false;
let mut exit_count = 0u8;
let mut exit_bytes = [0u8; 3];
for (byte, &target) in unpacked[..0x80].iter().enumerate() {
if target == state_id {
has_self_loop = true;
} else {
if exit_count < 3 {
exit_bytes[exit_count as usize] =
u8::try_from(byte).expect("ASCII range bounded above by 0x80");
}
exit_count += 1;
}
}
if has_self_loop && (1..=3).contains(&exit_count) {
self.states[state_idx].table.accel = Some(AccelInfo {
exit_bytes,
len: exit_count,
});
}
}
}
fn make_state_set_key(states: &[StateId]) -> Vec<u32> {
states.iter().map(|s| s.0).collect()
}
fn collect_field_transitions(
nfa_arena: &Self,
nfa_states: &[StateId],
dfa_state: &mut FaState,
) {
let mut seen_ptrs: FxHashSet<usize> = FxHashSet::default();
for &nfa_state in nfa_states {
if nfa_state.is_none() {
continue;
}
for ft in &nfa_arena[nfa_state].field_transitions {
let ptr = Arc::as_ptr(ft) as usize;
if seen_ptrs.insert(ptr) {
dfa_state.field_transitions.push(ft.clone());
}
}
}
}
}
impl std::ops::Index<StateId> for StateArena {
type Output = FaState;
#[inline]
fn index(&self, id: StateId) -> &Self::Output {
&self.states[id.index()]
}
}
impl std::ops::IndexMut<StateId> for StateArena {
#[inline]
fn index_mut(&mut self, id: StateId) -> &mut Self::Output {
&mut self.states[id.index()]
}
}
struct LazyDfaState {
transitions: Vec<StateId>,
field_transition_ptrs: Vec<usize>,
cached: bool,
accel: Option<AccelInfo>,
}
pub(crate) struct LazyDfa {
nfa_arena: StateArena,
states: Vec<LazyDfaState>,
state_map: FxHashMap<Vec<u32>, StateId>,
nfa_sets: Vec<Vec<StateId>>,
state_budget: usize,
cached_count: usize,
}
impl LazyDfa {
pub fn new(nfa_arena: StateArena, nfa_start: StateId, state_budget: usize) -> Self {
let mut lazy = Self {
nfa_arena,
states: Vec::new(),
state_map: FxHashMap::default(),
nfa_sets: Vec::new(),
state_budget,
cached_count: 0,
};
if !nfa_start.is_none() {
let closure = lazy.nfa_arena.closure_of(nfa_start).to_vec();
lazy.intern_state(&closure, true);
}
lazy
}
fn intern_state(&mut self, nfa_states: &[StateId], allow_cache: bool) -> StateId {
let key = StateArena::make_state_set_key(nfa_states);
if let Some(&idx) = self.state_map.get(&key) {
return idx;
}
let can_cache = allow_cache && self.cached_count < self.state_budget;
let idx = StateId::from_index(self.states.len());
let mut field_transition_ptrs = Vec::new();
let mut seen_ptrs: FxHashSet<usize> = FxHashSet::default();
for &nfa_state in nfa_states {
if nfa_state.is_none() {
continue;
}
for ft in &self.nfa_arena[nfa_state].field_transitions {
let ptr = Arc::as_ptr(ft) as usize;
if seen_ptrs.insert(ptr) {
field_transition_ptrs.push(ptr);
}
}
}
let state = LazyDfaState {
transitions: vec![StateId::NONE; BYTE_CEILING],
field_transition_ptrs,
cached: can_cache,
accel: None,
};
self.states.push(state);
self.nfa_sets.push(nfa_states.to_vec());
self.state_map.insert(key, idx);
if can_cache {
self.cached_count += 1;
}
idx
}
fn step(&mut self, state_idx: StateId, byte: u8, scratch: &mut LazyDfaScratch) -> StateId {
let cached = self.states[state_idx.index()].transitions[byte as usize];
if !cached.is_none() {
return cached;
}
let nfa_states = &self.nfa_sets[state_idx.index()];
scratch.next_nfa_states.clear();
scratch.seen.clear();
for &nfa_state in nfa_states {
if nfa_state.is_none() {
continue;
}
let next = self.nfa_arena.dstep(nfa_state, byte);
if !next.is_none() {
let closure = self.nfa_arena.closure_of(next);
for &cs in closure {
if scratch.seen.insert(cs) {
scratch.next_nfa_states.push(cs);
}
}
}
}
if scratch.next_nfa_states.is_empty() {
self.states[state_idx.index()].transitions[byte as usize] = StateId::DEAD;
return StateId::DEAD;
}
scratch.next_nfa_states.sort_unstable_by_key(|s| s.0);
let next_idx = self.intern_state(&scratch.next_nfa_states, true);
if self.states[state_idx.index()].cached {
self.states[state_idx.index()].transitions[byte as usize] = next_idx;
}
next_idx
}
fn try_compute_accel(&mut self, state_idx: StateId, scratch: &mut LazyDfaScratch) {
let nfa_states = &self.nfa_sets[state_idx.index()];
let mut common_accel: Option<AccelInfo> = None;
for &nfa_state in nfa_states {
if nfa_state.is_none() {
continue;
}
if let Some(ref accel) = self.nfa_arena[nfa_state].table.accel {
match &common_accel {
None => common_accel = Some(accel.clone()),
Some(existing)
if existing.len == accel.len
&& existing.exit_bytes[..existing.len as usize]
== accel.exit_bytes[..accel.len as usize] => {}
Some(_) => return,
}
}
}
let accel = match common_accel {
Some(a) => a,
None => return,
};
for i in 0..accel.len as usize {
let next = self.step(state_idx, accel.exit_bytes[i], scratch);
if next == state_idx {
return;
}
}
self.states[state_idx.index()].accel = Some(accel);
}
}
#[derive(Default)]
struct LazyDfaScratch {
next_nfa_states: Vec<StateId>,
seen: FxHashSet<StateId>,
}
#[inline]
pub(crate) fn traverse_lazy_dfa(lazy_dfa: &mut LazyDfa, val: &[u8], transitions: &mut Vec<usize>) {
if lazy_dfa.states.is_empty() {
return;
}
let mut scratch = LazyDfaScratch::default();
let mut current = StateId::from_index(0); let mut remaining = val;
transitions.extend_from_slice(&lazy_dfa.states[0].field_transition_ptrs);
loop {
if let Some(ref accel) = lazy_dfa.states[current.index()].accel
&& let Some(skip) = accel.try_accelerate(remaining)
{
remaining = &remaining[skip.get()..];
continue;
}
let (byte, at_end) = match remaining.split_first() {
Some((&b, rest)) => {
remaining = rest;
(b, false)
}
None => (ARENA_VALUE_TERMINATOR, true),
};
let next = lazy_dfa.step(current, byte, &mut scratch);
if next.is_dead() {
return;
}
if next == current && lazy_dfa.states[current.index()].accel.is_none() {
lazy_dfa.try_compute_accel(current, &mut scratch);
}
current = next;
transitions.extend_from_slice(&lazy_dfa.states[current.index()].field_transition_ptrs);
if at_end {
break;
}
}
}
#[derive(Default)]
pub struct NfaBuffers {
pub current_states: Vec<StateId>,
pub next_states: Vec<StateId>,
pub transitions: Vec<usize>,
seen_transitions: FxHashSet<usize>,
step_gen: u64,
seen_states: FxHashMap<StateId, u64>,
}
impl NfaBuffers {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn with_capacity() -> Self {
Self {
current_states: Vec::with_capacity(16),
next_states: Vec::with_capacity(16),
transitions: Vec::new(),
seen_transitions: FxHashSet::default(),
step_gen: 0,
seen_states: FxHashMap::default(),
}
}
pub fn clear(&mut self) {
self.current_states.clear();
self.next_states.clear();
self.transitions.clear();
self.seen_transitions.clear();
}
}
pub const ARENA_VALUE_TERMINATOR: u8 = 0xF5;
#[inline]
fn try_accelerate_arena(table: &SmallTable, remaining: &[u8]) -> Option<NonZero<usize>> {
table.accel.as_ref()?.try_accelerate(remaining)
}
#[inline]
#[allow(clippy::too_many_lines)] pub fn traverse_arena_nfa(arena: &StateArena, start: StateId, val: &[u8], bufs: &mut NfaBuffers) {
bufs.clear();
if start.is_none() {
return;
}
bufs.current_states.push(start);
let mut remaining = val;
loop {
if bufs.current_states.is_empty() {
break;
}
if bufs.current_states.len() == 1 {
let state_id = bufs.current_states[0];
let state = &arena[state_id];
if let Some(skip) = try_accelerate_arena(&state.table, remaining) {
remaining = &remaining[skip.get()..];
continue;
}
}
let (byte, at_end) = match remaining.split_first() {
Some((&b, rest)) => {
remaining = rest;
(b, false)
}
None => (ARENA_VALUE_TERMINATOR, true),
};
let NfaBuffers {
ref mut current_states,
ref mut next_states,
ref mut transitions,
ref mut seen_transitions,
ref mut step_gen,
ref mut seen_states,
} = *bufs;
if arena.ft_ptrs.is_empty() {
for &state_id in current_states.iter() {
let closure = arena.closure_of(state_id);
if closure.len() == 1 {
for ft in &arena[state_id].field_transitions {
let ptr = Arc::as_ptr(ft) as usize;
if seen_transitions.insert(ptr) {
transitions.push(ptr);
}
}
let next = arena.dstep(state_id, byte);
if !next.is_none() {
next_states.push(next);
}
} else {
for &ec_state_id in closure {
for ft in &arena[ec_state_id].field_transitions {
let ptr = Arc::as_ptr(ft) as usize;
if seen_transitions.insert(ptr) {
transitions.push(ptr);
}
}
let next = arena.dstep(ec_state_id, byte);
if !next.is_none() {
next_states.push(next);
}
}
}
}
} else {
for &state_id in current_states.iter() {
let closure = arena.closure_of(state_id);
if closure.len() == 1 {
for &ptr in arena.ft_ptrs_of(state_id) {
if seen_transitions.insert(ptr) {
transitions.push(ptr);
}
}
let next = arena.dstep(state_id, byte);
if !next.is_none() {
next_states.push(next);
}
} else {
for &ec_state_id in closure {
for &ptr in arena.ft_ptrs_of(ec_state_id) {
if seen_transitions.insert(ptr) {
transitions.push(ptr);
}
}
let next = arena.dstep(ec_state_id, byte);
if !next.is_none() {
next_states.push(next);
}
}
}
}
}
if next_states.len() > 64 {
*step_gen += 1;
let generation = *step_gen;
let mut j = 0;
for i_ns in 0..next_states.len() {
let state = next_states[i_ns];
if seen_states.get(&state).copied() != Some(generation) {
seen_states.insert(state, generation);
next_states[j] = state;
j += 1;
}
}
next_states.truncate(j);
}
current_states.clear();
std::mem::swap(current_states, next_states);
if at_end {
break;
}
}
let NfaBuffers {
ref current_states,
ref mut transitions,
ref mut seen_transitions,
..
} = *bufs;
if arena.ft_ptrs.is_empty() {
for &state_id in current_states {
let closure = arena.closure_of(state_id);
if closure.len() == 1 {
for ft in &arena[state_id].field_transitions {
let ptr = Arc::as_ptr(ft) as usize;
if seen_transitions.insert(ptr) {
transitions.push(ptr);
}
}
} else {
for &ec_state_id in closure {
for ft in &arena[ec_state_id].field_transitions {
let ptr = Arc::as_ptr(ft) as usize;
if seen_transitions.insert(ptr) {
transitions.push(ptr);
}
}
}
}
}
} else {
for &state_id in current_states {
let closure = arena.closure_of(state_id);
if closure.len() == 1 {
for &ptr in arena.ft_ptrs_of(state_id) {
if seen_transitions.insert(ptr) {
transitions.push(ptr);
}
}
} else {
for &ec_state_id in closure {
for &ptr in arena.ft_ptrs_of(ec_state_id) {
if seen_transitions.insert(ptr) {
transitions.push(ptr);
}
}
}
}
}
}
}
#[inline]
pub fn traverse_arena_dfa(
arena: &StateArena,
start: StateId,
val: &[u8],
transitions: &mut Vec<usize>,
) {
if start.is_none() {
return;
}
let has_flat = !arena.ft_ptrs.is_empty();
let mut current = start;
let mut remaining = val;
loop {
if has_flat {
for &ptr in arena.ft_ptrs_of(current) {
transitions.push(ptr);
}
} else {
for ft in &arena[current].field_transitions {
transitions.push(Arc::as_ptr(ft) as usize);
}
}
if let Some(skip) = try_accelerate_arena(&arena[current].table, remaining) {
debug_assert!(
if has_flat {
arena.ft_ptrs_of(current).is_empty()
} else {
arena[current].field_transitions.is_empty()
},
"accelerated state {current:?} has field transitions; they would be collected redundantly on each skip"
);
remaining = &remaining[skip.get()..];
continue;
}
let (byte, at_end) = match remaining.split_first() {
Some((&b, rest)) => {
remaining = rest;
(b, false)
}
None => (ARENA_VALUE_TERMINATOR, true),
};
let next = arena.dstep(current, byte);
if next.is_none() {
return;
}
current = next;
if at_end {
break;
}
}
if has_flat {
for &ptr in arena.ft_ptrs_of(current) {
transitions.push(ptr);
}
} else {
for ft in &arena[current].field_transitions {
transitions.push(Arc::as_ptr(ft) as usize);
}
}
}
#[inline]
pub fn traverse_arena_dfa_backward(
arena: &StateArena,
start: StateId,
val: &[u8],
transitions: &mut Vec<usize>,
) {
if start.is_none() || val.is_empty() {
return;
}
let mut current = start;
for i in (0..val.len()).rev() {
let next = arena.dstep(current, val[i]);
if next.is_none() {
return;
}
current = next;
if arena.ft_ptrs.is_empty() {
for ft in &arena[current].field_transitions {
transitions.push(Arc::as_ptr(ft) as usize);
}
} else {
for &ptr in arena.ft_ptrs_of(current) {
transitions.push(ptr);
}
}
}
}
#[must_use]
pub fn merge_arena_dfas(
arena1: &StateArena,
start1: StateId,
arena2: &StateArena,
start2: StateId,
) -> (StateArena, StateId) {
if start1.is_none() && start2.is_none() {
return (StateArena::new(), StateId::NONE);
}
if start1.is_none() {
return clone_arena_subset(arena2, start2);
}
if start2.is_none() {
return clone_arena_subset(arena1, start1);
}
let mut memo: FxHashMap<(StateId, StateId), StateId> = FxHashMap::default();
let mut new_arena = StateArena::new();
let start =
merge_arena_states_recursive(arena1, start1, arena2, start2, &mut new_arena, &mut memo);
new_arena.precompute_epsilon_closures();
(new_arena, start)
}
fn clone_arena_subset(arena: &StateArena, start: StateId) -> (StateArena, StateId) {
if start.is_none() {
return (StateArena::new(), StateId::NONE);
}
let mut new_arena = StateArena::new();
let mut id_map: FxHashMap<u32, StateId> = FxHashMap::default();
clone_state_recursive(arena, start, &mut new_arena, &mut id_map);
let new_start = id_map.get(&start.0).copied().unwrap_or(StateId::NONE);
new_arena.precompute_epsilon_closures();
(new_arena, new_start)
}
fn clone_state_recursive(
arena: &StateArena,
state_id: StateId,
new_arena: &mut StateArena,
id_map: &mut FxHashMap<u32, StateId>,
) -> StateId {
if state_id.is_none() {
return StateId::NONE;
}
if let Some(&new_id) = id_map.get(&state_id.0) {
return new_id;
}
let new_id = new_arena.alloc();
id_map.insert(state_id.0, new_id);
new_arena[new_id].field_transitions = arena[state_id].field_transitions.clone();
let old_state = &arena[state_id];
let old_table = &old_state.table;
let mut new_table = SmallTable {
ceilings: old_table.ceilings.clone(),
steps: SmallVec::with_capacity(old_table.steps.len()),
epsilons: SmallVec::with_capacity(old_table.epsilons.len()),
accel: old_table.accel.clone(),
};
for &step_id in &old_table.steps {
let new_step = clone_state_recursive(arena, step_id, new_arena, id_map);
new_table.steps.push(new_step);
}
for &eps_id in &old_table.epsilons {
let new_eps = clone_state_recursive(arena, eps_id, new_arena, id_map);
new_table.epsilons.push(new_eps);
}
new_arena[new_id].table = new_table;
new_id
}
fn merge_arena_states_recursive(
arena1: &StateArena,
state1: StateId,
arena2: &StateArena,
state2: StateId,
new_arena: &mut StateArena,
memo: &mut FxHashMap<(StateId, StateId), StateId>,
) -> StateId {
let key = (state1, state2);
if let Some(&cached) = memo.get(&key) {
return cached;
}
if state1.is_none() && state2.is_none() {
return StateId::NONE;
}
let new_id = new_arena.alloc();
memo.insert(key, new_id);
if state1.is_none() {
let s2 = &arena2[state2];
new_arena[new_id].field_transitions = s2.field_transitions.clone();
new_arena[new_id].table =
remap_table_recursive(arena2, &s2.table, arena1, new_arena, memo, false);
return new_id;
}
if state2.is_none() {
let s1 = &arena1[state1];
new_arena[new_id].field_transitions = s1.field_transitions.clone();
new_arena[new_id].table =
remap_table_recursive(arena1, &s1.table, arena2, new_arena, memo, true);
return new_id;
}
let s1 = &arena1[state1];
let s2 = &arena2[state2];
let mut field_transitions = s1.field_transitions.clone();
field_transitions.extend(s2.field_transitions.iter().cloned());
new_arena[new_id].field_transitions = field_transitions;
new_arena[new_id].table =
merge_arena_tables(arena1, &s1.table, arena2, &s2.table, new_arena, memo);
new_id
}
fn remap_table_recursive(
source_arena: &StateArena,
table: &SmallTable,
_other_arena: &StateArena,
new_arena: &mut StateArena,
memo: &mut FxHashMap<(StateId, StateId), StateId>,
is_arena1: bool,
) -> SmallTable {
let mut new_table = SmallTable {
ceilings: table.ceilings.clone(),
steps: SmallVec::with_capacity(table.steps.len()),
epsilons: SmallVec::with_capacity(table.epsilons.len()),
accel: table.accel.clone(),
};
for &step_id in &table.steps {
if step_id.is_none() {
new_table.steps.push(StateId::NONE);
} else {
let merged = if is_arena1 {
merge_arena_states_recursive(
source_arena,
step_id,
_other_arena,
StateId::NONE,
new_arena,
memo,
)
} else {
merge_arena_states_recursive(
_other_arena,
StateId::NONE,
source_arena,
step_id,
new_arena,
memo,
)
};
new_table.steps.push(merged);
}
}
for &eps_id in &table.epsilons {
if eps_id.is_none() {
new_table.epsilons.push(StateId::NONE);
} else {
let merged = if is_arena1 {
merge_arena_states_recursive(
source_arena,
eps_id,
_other_arena,
StateId::NONE,
new_arena,
memo,
)
} else {
merge_arena_states_recursive(
_other_arena,
StateId::NONE,
source_arena,
eps_id,
new_arena,
memo,
)
};
new_table.epsilons.push(merged);
}
}
new_table
}
fn merge_arena_tables(
arena1: &StateArena,
table1: &SmallTable,
arena2: &StateArena,
table2: &SmallTable,
new_arena: &mut StateArena,
memo: &mut FxHashMap<(StateId, StateId), StateId>,
) -> SmallTable {
let mut unpacked1 = [StateId::NONE; BYTE_CEILING];
let mut unpacked2 = [StateId::NONE; BYTE_CEILING];
unpack_arena_table(table1, &mut unpacked1);
unpack_arena_table(table2, &mut unpacked2);
let mut merged_unpacked = [StateId::NONE; BYTE_CEILING];
for i in 0..BYTE_CEILING {
let s1 = unpacked1[i];
let s2 = unpacked2[i];
merged_unpacked[i] = merge_arena_states_recursive(arena1, s1, arena2, s2, new_arena, memo);
}
let mut result = SmallTable::new();
result.pack(&merged_unpacked);
for &eps1 in &table1.epsilons {
let merged =
merge_arena_states_recursive(arena1, eps1, arena2, StateId::NONE, new_arena, memo);
if !merged.is_none() {
result.epsilons.push(merged);
}
}
for &eps2 in &table2.epsilons {
let merged =
merge_arena_states_recursive(arena1, StateId::NONE, arena2, eps2, new_arena, memo);
if !merged.is_none() {
result.epsilons.push(merged);
}
}
result
}
fn unpack_arena_table(table: &SmallTable, unpacked: &mut [StateId; BYTE_CEILING]) {
let mut byte_idx = 0usize;
for (i, &ceiling) in table.ceilings.iter().enumerate() {
let end = ceiling as usize;
unpacked[byte_idx..end].fill(table.steps[i]);
byte_idx = end;
}
}
#[must_use]
pub fn merge_arena_nfas(
arena1: &StateArena,
start1: StateId,
arena2: &StateArena,
start2: StateId,
) -> (StateArena, StateId) {
if start1.is_none() && start2.is_none() {
return (StateArena::new(), StateId::NONE);
}
if start1.is_none() {
return clone_arena_subset(arena2, start2);
}
if start2.is_none() {
return clone_arena_subset(arena1, start1);
}
let mut memo: FxHashMap<(StateId, StateId), StateId> = FxHashMap::default();
let mut new_arena = StateArena::new();
let start =
merge_arena_nfa_states_recursive(arena1, start1, arena2, start2, &mut new_arena, &mut memo);
new_arena.precompute_epsilon_closures();
(new_arena, start)
}
fn is_epsilon_only_state(arena: &StateArena, state_id: StateId) -> bool {
if state_id.is_none() {
return false;
}
let state = &arena[state_id];
!state.table.epsilons.is_empty()
&& state.table.ceilings.len() == 1
&& state.table.steps[0].is_none()
&& state.field_transitions.is_empty()
}
fn flatten_epsilon_targets(arena: &StateArena, states: &[StateId]) -> SmallVec<[StateId; 2]> {
let mut targets = SmallVec::new();
for &state_id in states {
if !state_id.is_none() && is_epsilon_only_state(arena, state_id) {
for &eps_id in &arena[state_id].table.epsilons {
targets.push(eps_id);
}
} else {
targets.push(state_id);
}
}
targets
}
fn is_spinout_state(arena: &StateArena, state_id: StateId) -> bool {
if state_id.is_none() {
return false;
}
let state = &arena[state_id];
state.table.steps.contains(&state_id) && state.table.epsilons.len() <= 1
}
fn merge_arena_nfa_states_recursive(
arena1: &StateArena,
state1: StateId,
arena2: &StateArena,
state2: StateId,
new_arena: &mut StateArena,
memo: &mut FxHashMap<(StateId, StateId), StateId>,
) -> StateId {
let key = (state1, state2);
if let Some(&cached) = memo.get(&key) {
return cached;
}
if state1.is_none() && state2.is_none() {
return StateId::NONE;
}
let new_id = new_arena.alloc();
memo.insert(key, new_id);
if state1.is_none() {
let s2 = &arena2[state2];
new_arena[new_id].field_transitions = s2.field_transitions.clone();
new_arena[new_id].table =
remap_nfa_table_recursive(arena2, &s2.table, arena1, new_arena, memo, false);
return new_id;
}
if state2.is_none() {
let s1 = &arena1[state1];
new_arena[new_id].field_transitions = s1.field_transitions.clone();
new_arena[new_id].table =
remap_nfa_table_recursive(arena1, &s1.table, arena2, new_arena, memo, true);
return new_id;
}
let s1 = &arena1[state1];
let s2 = &arena2[state2];
let s1_has_spinout = is_spinout_state(arena1, state1);
let s2_has_spinout = is_spinout_state(arena2, state2);
let s1_has_epsilons = !s1.table.epsilons.is_empty();
let s2_has_epsilons = !s2.table.epsilons.is_empty();
if s1_has_spinout && s2_has_spinout {
merge_dual_spinout_states(arena1, s1, arena2, s2, new_arena, memo, new_id);
return new_id;
}
if (s1_has_spinout && !s2_has_epsilons) || (s2_has_spinout && !s1_has_epsilons) {
return asymmetric_spinner_merge(
arena1,
state1,
arena2,
state2,
s1_has_spinout,
new_arena,
memo,
new_id,
);
}
if s1_has_epsilons || s2_has_epsilons {
let mut clone_map1: FxHashMap<u32, StateId> = FxHashMap::default();
let mut clone_map2: FxHashMap<u32, StateId> = FxHashMap::default();
let cloned1 = clone_state_into_arena(arena1, state1, new_arena, &mut clone_map1);
let cloned2 = clone_state_into_arena(arena2, state2, new_arena, &mut clone_map2);
let epsilons = flatten_epsilon_targets(new_arena, &[cloned1, cloned2]);
new_arena[new_id].table = SmallTable {
ceilings: smallvec![BYTE_CEILING_U8],
steps: smallvec![StateId::NONE],
epsilons,
accel: None,
};
return new_id;
}
let combined_table =
merge_nfa_tables_bytewise(arena1, &s1.table, arena2, &s2.table, new_arena, memo);
let mut field_transitions = s1.field_transitions.clone();
field_transitions.extend(s2.field_transitions.iter().cloned());
new_arena[new_id].table = combined_table;
new_arena[new_id].field_transitions = field_transitions;
new_id
}
fn merge_dual_spinout_states(
arena1: &StateArena,
s1: &FaState,
arena2: &StateArena,
s2: &FaState,
new_arena: &mut StateArena,
memo: &mut FxHashMap<(StateId, StateId), StateId>,
new_id: StateId,
) {
let mut combined_table =
merge_nfa_tables_bytewise(arena1, &s1.table, arena2, &s2.table, new_arena, memo);
let mut merged_epsilons: SmallVec<[StateId; 2]> = SmallVec::new();
for &eps1 in &s1.table.epsilons {
let merged =
merge_arena_nfa_states_recursive(arena1, eps1, arena2, StateId::NONE, new_arena, memo);
if !merged.is_none() {
merged_epsilons.push(merged);
}
}
for &eps2 in &s2.table.epsilons {
let merged =
merge_arena_nfa_states_recursive(arena1, StateId::NONE, arena2, eps2, new_arena, memo);
if !merged.is_none() {
merged_epsilons.push(merged);
}
}
combined_table.epsilons = merged_epsilons;
let mut field_transitions = s1.field_transitions.clone();
field_transitions.extend(s2.field_transitions.iter().cloned());
new_arena[new_id].table = combined_table;
new_arena[new_id].field_transitions = field_transitions;
}
#[allow(clippy::too_many_arguments)]
fn asymmetric_spinner_merge(
arena1: &StateArena,
state1: StateId,
arena2: &StateArena,
state2: StateId,
s1_has_spinout: bool,
new_arena: &mut StateArena,
memo: &mut FxHashMap<(StateId, StateId), StateId>,
new_id: StateId,
) -> StateId {
let s1 = &arena1[state1];
let s2 = &arena2[state2];
let (spinner_arena, spinner_id, spinner_table, other_arena, other_table) = if s1_has_spinout {
(arena1, state1, &s1.table, arena2, &s2.table)
} else {
(arena2, state2, &s2.table, arena1, &s1.table)
};
let mut spinner_unpacked = [StateId::NONE; BYTE_CEILING];
let mut other_unpacked = [StateId::NONE; BYTE_CEILING];
unpack_arena_table(spinner_table, &mut spinner_unpacked);
unpack_arena_table(other_table, &mut other_unpacked);
let mut merged_unpacked = [StateId::NONE; BYTE_CEILING];
for i in 0..BYTE_CEILING {
merged_unpacked[i] = merge_asymmetric_spinner_byte(
spinner_arena,
spinner_id,
other_arena,
spinner_unpacked[i],
other_unpacked[i],
s1_has_spinout,
new_arena,
memo,
new_id,
arena1,
state1,
arena2,
state2,
);
}
let mut combined_table = SmallTable::new();
combined_table.pack(&merged_unpacked);
let mut merged_epsilons: SmallVec<[StateId; 2]> = SmallVec::new();
for &spinner_eps in &spinner_table.epsilons {
let merged = if s1_has_spinout {
merge_arena_nfa_states_recursive(
spinner_arena,
spinner_eps,
other_arena,
StateId::NONE,
new_arena,
memo,
)
} else {
merge_arena_nfa_states_recursive(
other_arena,
StateId::NONE,
spinner_arena,
spinner_eps,
new_arena,
memo,
)
};
if !merged.is_none() {
merged_epsilons.push(merged);
}
}
combined_table.epsilons = merged_epsilons;
let mut field_transitions = arena1[state1].field_transitions.clone();
field_transitions.extend(arena2[state2].field_transitions.iter().cloned());
new_arena[new_id].table = combined_table;
new_arena[new_id].field_transitions = field_transitions;
new_id
}
#[allow(clippy::too_many_arguments)]
fn merge_asymmetric_spinner_byte(
spinner_arena: &StateArena,
spinner_id: StateId,
other_arena: &StateArena,
spinner_next: StateId,
other_next: StateId,
s1_has_spinout: bool,
new_arena: &mut StateArena,
memo: &mut FxHashMap<(StateId, StateId), StateId>,
new_id: StateId,
arena1: &StateArena,
state1: StateId,
arena2: &StateArena,
state2: StateId,
) -> StateId {
if spinner_next.is_none() {
return StateId::NONE;
}
if other_next.is_none() {
if spinner_next == spinner_id {
return new_id; }
return if s1_has_spinout {
merge_arena_nfa_states_recursive(
spinner_arena,
spinner_next,
other_arena,
StateId::NONE,
new_arena,
memo,
)
} else {
merge_arena_nfa_states_recursive(
other_arena,
StateId::NONE,
spinner_arena,
spinner_next,
new_arena,
memo,
)
};
}
if spinner_next == spinner_id {
let remapped_other = if s1_has_spinout {
merge_arena_nfa_states_recursive(
spinner_arena,
StateId::NONE,
other_arena,
other_next,
new_arena,
memo,
)
} else {
merge_arena_nfa_states_recursive(
other_arena,
other_next,
spinner_arena,
StateId::NONE,
new_arena,
memo,
)
};
if !remapped_other.is_none() {
new_arena[remapped_other].table.epsilons.push(new_id);
let spinner_fts = if s1_has_spinout {
&arena1[state1].field_transitions
} else {
&arena2[state2].field_transitions
};
for ft in spinner_fts {
new_arena[remapped_other].field_transitions.push(ft.clone());
}
}
return remapped_other;
}
let merged_branch = if s1_has_spinout {
merge_arena_nfa_states_recursive(
spinner_arena,
spinner_next,
other_arena,
other_next,
new_arena,
memo,
)
} else {
merge_arena_nfa_states_recursive(
other_arena,
other_next,
spinner_arena,
spinner_next,
new_arena,
memo,
)
};
if !merged_branch.is_none() {
new_arena[merged_branch].table.epsilons.push(new_id);
}
merged_branch
}
fn clone_state_into_arena(
source_arena: &StateArena,
state_id: StateId,
target_arena: &mut StateArena,
id_map: &mut FxHashMap<u32, StateId>,
) -> StateId {
if state_id.is_none() {
return StateId::NONE;
}
if let Some(&new_id) = id_map.get(&state_id.0) {
return new_id;
}
let new_id = target_arena.alloc();
id_map.insert(state_id.0, new_id);
target_arena[new_id].field_transitions = source_arena[state_id].field_transitions.clone();
let old_state = &source_arena[state_id];
let old_table = &old_state.table;
let mut new_table = SmallTable {
ceilings: old_table.ceilings.clone(),
steps: SmallVec::with_capacity(old_table.steps.len()),
epsilons: SmallVec::with_capacity(old_table.epsilons.len()),
accel: old_table.accel.clone(),
};
for &step_id in &old_table.steps {
let new_step = clone_state_into_arena(source_arena, step_id, target_arena, id_map);
new_table.steps.push(new_step);
}
for &eps_id in &old_table.epsilons {
let new_eps = clone_state_into_arena(source_arena, eps_id, target_arena, id_map);
new_table.epsilons.push(new_eps);
}
target_arena[new_id].table = new_table;
new_id
}
fn remap_nfa_table_recursive(
source_arena: &StateArena,
table: &SmallTable,
_other_arena: &StateArena,
new_arena: &mut StateArena,
memo: &mut FxHashMap<(StateId, StateId), StateId>,
is_arena1: bool,
) -> SmallTable {
let mut new_table = SmallTable {
ceilings: table.ceilings.clone(),
steps: SmallVec::with_capacity(table.steps.len()),
epsilons: SmallVec::with_capacity(table.epsilons.len()),
accel: table.accel.clone(),
};
for &step_id in &table.steps {
if step_id.is_none() {
new_table.steps.push(StateId::NONE);
} else {
let merged = if is_arena1 {
merge_arena_nfa_states_recursive(
source_arena,
step_id,
_other_arena,
StateId::NONE,
new_arena,
memo,
)
} else {
merge_arena_nfa_states_recursive(
_other_arena,
StateId::NONE,
source_arena,
step_id,
new_arena,
memo,
)
};
new_table.steps.push(merged);
}
}
for &eps_id in &table.epsilons {
if eps_id.is_none() {
new_table.epsilons.push(StateId::NONE);
} else {
let merged = if is_arena1 {
merge_arena_nfa_states_recursive(
source_arena,
eps_id,
_other_arena,
StateId::NONE,
new_arena,
memo,
)
} else {
merge_arena_nfa_states_recursive(
_other_arena,
StateId::NONE,
source_arena,
eps_id,
new_arena,
memo,
)
};
new_table.epsilons.push(merged);
}
}
new_table
}
fn merge_nfa_tables_bytewise(
arena1: &StateArena,
table1: &SmallTable,
arena2: &StateArena,
table2: &SmallTable,
new_arena: &mut StateArena,
memo: &mut FxHashMap<(StateId, StateId), StateId>,
) -> SmallTable {
let mut unpacked1 = [StateId::NONE; BYTE_CEILING];
let mut unpacked2 = [StateId::NONE; BYTE_CEILING];
unpack_arena_table(table1, &mut unpacked1);
unpack_arena_table(table2, &mut unpacked2);
let mut merged_unpacked = [StateId::NONE; BYTE_CEILING];
for i in 0..BYTE_CEILING {
let s1 = unpacked1[i];
let s2 = unpacked2[i];
merged_unpacked[i] =
merge_arena_nfa_states_recursive(arena1, s1, arena2, s2, new_arena, memo);
}
let mut result = SmallTable::new();
result.pack(&merged_unpacked);
for &eps1 in &table1.epsilons {
let merged =
merge_arena_nfa_states_recursive(arena1, eps1, arena2, StateId::NONE, new_arena, memo);
if !merged.is_none() {
result.epsilons.push(merged);
}
}
for &eps2 in &table2.epsilons {
let merged =
merge_arena_nfa_states_recursive(arena1, StateId::NONE, arena2, eps2, new_arena, memo);
if !merged.is_none() {
result.epsilons.push(merged);
}
}
result
}
#[must_use]
pub fn make_numeric_less_arena_fa(
bound: f64,
inclusive: bool,
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let bound_q = crate::numbits::q_num_from_f64(bound);
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let start = make_less_arena_fa_step(&bound_q, 0, inclusive, match_state, &mut arena);
arena.precompute_epsilon_closures();
(arena, start)
}
#[must_use]
pub fn make_numeric_greater_arena_fa(
bound: f64,
inclusive: bool,
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let bound_q = crate::numbits::q_num_from_f64(bound);
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let start = make_greater_arena_fa_step(&bound_q, 0, inclusive, match_state, &mut arena);
arena.precompute_epsilon_closures();
(arena, start)
}
#[must_use]
pub fn make_numeric_range_arena_fa(
lower: f64,
lower_incl: bool,
upper: f64,
upper_incl: bool,
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let lower_q = crate::numbits::q_num_from_f64(lower);
let upper_q = crate::numbits::q_num_from_f64(upper);
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let start = make_range_arena_fa_step(
&lower_q,
&upper_q,
0,
lower_incl,
upper_incl,
match_state,
&mut arena,
);
arena.precompute_epsilon_closures();
(arena, start)
}
fn make_less_arena_fa_step(
bound_q: &[u8],
index: usize,
inclusive: bool,
match_state: StateId,
arena: &mut StateArena,
) -> StateId {
if index >= bound_q.len() {
if inclusive {
let start = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
return start;
}
return arena.alloc();
}
let bound_byte = bound_q[index];
let continuation = make_less_arena_fa_step(bound_q, index + 1, inclusive, match_state, arena);
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[ARENA_VALUE_TERMINATOR as usize] = match_state;
for b in 0..bound_byte {
if b != ARENA_VALUE_TERMINATOR {
unpacked[b as usize] = match_state;
}
}
unpacked[bound_byte as usize] = continuation;
let mut table = SmallTable::new();
table.pack(&unpacked);
arena.alloc_with_table(table)
}
fn make_greater_arena_fa_step(
bound_q: &[u8],
index: usize,
inclusive: bool,
match_state: StateId,
arena: &mut StateArena,
) -> StateId {
if index >= bound_q.len() {
let mut unpacked = [match_state; BYTE_CEILING];
if !inclusive {
unpacked[ARENA_VALUE_TERMINATOR as usize] = StateId::NONE;
}
let mut table = SmallTable::new();
table.pack(&unpacked);
return arena.alloc_with_table(table);
}
let bound_byte = bound_q[index];
let continuation =
make_greater_arena_fa_step(bound_q, index + 1, inclusive, match_state, arena);
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[bound_byte as usize] = continuation;
for b in (bound_byte + 1)..(BYTE_CEILING_U8) {
if b != ARENA_VALUE_TERMINATOR {
unpacked[b as usize] = match_state;
}
}
let mut table = SmallTable::new();
table.pack(&unpacked);
arena.alloc_with_table(table)
}
fn make_range_arena_fa_step(
lower_q: &[u8],
upper_q: &[u8],
index: usize,
lower_incl: bool,
upper_incl: bool,
match_state: StateId,
arena: &mut StateArena,
) -> StateId {
let lower_done = index >= lower_q.len();
let upper_done = index >= upper_q.len();
if lower_done && upper_done {
if lower_incl && upper_incl {
return arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
}
return arena.alloc();
}
if lower_done {
return make_less_arena_fa_step(upper_q, index, upper_incl, match_state, arena);
}
if upper_done {
return arena.alloc();
}
let lower_byte = lower_q[index];
let upper_byte = upper_q[index];
if lower_byte == upper_byte {
let continuation = make_range_arena_fa_step(
lower_q,
upper_q,
index + 1,
lower_incl,
upper_incl,
match_state,
arena,
);
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[lower_byte as usize] = continuation;
let mut table = SmallTable::new();
table.pack(&unpacked);
return arena.alloc_with_table(table);
}
let mut unpacked = [StateId::NONE; BYTE_CEILING];
let lower_continuation =
make_greater_arena_fa_step(lower_q, index + 1, lower_incl, match_state, arena);
unpacked[lower_byte as usize] = lower_continuation;
for b in (lower_byte + 1)..upper_byte {
unpacked[b as usize] = match_state;
}
let upper_continuation =
make_less_arena_fa_step(upper_q, index + 1, upper_incl, match_state, arena);
unpacked[upper_byte as usize] = upper_continuation;
let mut table = SmallTable::new();
table.pack(&unpacked);
arena.alloc_with_table(table)
}
#[must_use]
pub fn make_string_arena_fa(val: &[u8], next_field: Arc<FieldMatcher>) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let start = make_string_arena_fa_step(val, 0, match_state, &mut arena);
arena.precompute_epsilon_closures();
(arena, start)
}
fn make_string_arena_fa_step(
val: &[u8],
index: usize,
match_state: StateId,
arena: &mut StateArena,
) -> StateId {
if index >= val.len() {
return arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
}
let continuation = make_string_arena_fa_step(val, index + 1, match_state, arena);
arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[val[index]],
&[continuation],
))
}
pub fn insert_string(
arena: &mut StateArena,
start: StateId,
val: &[u8],
field_matcher: Arc<FieldMatcher>,
) {
let mut current = start;
for i in 0..=val.len() {
let byte = if i < val.len() {
val[i]
} else {
ARENA_VALUE_TERMINATOR
};
let next = arena[current].table.dstep(byte);
if next.is_none() {
let match_state = arena.alloc();
arena[match_state].field_transitions.push(field_matcher);
let mut target = match_state;
for j in (i + 1..=val.len()).rev() {
let b = if j < val.len() {
val[j]
} else {
ARENA_VALUE_TERMINATOR
};
target = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[b],
&[target],
));
}
arena[current].table.set_transition(byte, target);
return;
}
current = next;
}
arena[current].field_transitions.push(field_matcher);
}
#[must_use]
pub fn make_suffix_dfa(
reversed_bytes: &[u8],
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let mut target = match_state;
for &byte in reversed_bytes.iter().rev() {
target =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, &[byte], &[target]));
}
arena.precompute_epsilon_closures();
(arena, target) }
pub fn insert_suffix(
arena: &mut StateArena,
start: StateId,
reversed_bytes: &[u8],
field_matcher: Arc<FieldMatcher>,
) {
let mut current = start;
for (i, &byte) in reversed_bytes.iter().enumerate() {
let next = arena[current].table.dstep(byte);
if next.is_none() {
let match_state = arena.alloc();
arena[match_state].field_transitions.push(field_matcher);
let mut target = match_state;
for &b in reversed_bytes[i + 1..].iter().rev() {
target = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[b],
&[target],
));
}
arena[current].table.set_transition(byte, target);
return;
}
current = next;
}
arena[current].field_transitions.push(field_matcher);
}
#[must_use]
pub fn make_prefix_arena_fa(prefix: &[u8], next_field: Arc<FieldMatcher>) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let start = make_prefix_arena_fa_step(prefix, 0, match_state, &mut arena);
arena.precompute_epsilon_closures();
(arena, start)
}
fn make_prefix_arena_fa_step(
prefix: &[u8],
index: usize,
match_state: StateId,
arena: &mut StateArena,
) -> StateId {
if index >= prefix.len() {
return arena.alloc_with_table(SmallTable::with_mappings(
match_state, &[],
&[],
));
}
let continuation = make_prefix_arena_fa_step(prefix, index + 1, match_state, arena);
arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[prefix[index]],
&[continuation],
))
}
#[must_use]
pub fn make_shellstyle_arena_fa(
pattern: &[u8],
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let start = arena.alloc();
let mut state = start;
let mut iter = pattern.iter();
while let Some(&ch) = iter.next() {
if ch == b'*' {
let spinner = arena.alloc();
arena[spinner].table = make_byte_dot_table(spinner);
arena[state].table.epsilons.push(spinner);
if let Some(&next_byte) = iter.next() {
let spin_escape = arena.alloc();
arena[spin_escape].table.epsilons.push(spinner);
arena[spinner].table.set_transition(next_byte, spin_escape);
state = spin_escape;
} else {
state = spinner;
}
} else {
let next_step = arena.alloc();
arena[state].table.set_transition(ch, next_step);
state = next_step;
}
}
let last_step = arena.alloc();
arena[last_step].field_transitions = arena[match_state].field_transitions.clone();
arena[state]
.table
.set_transition(ARENA_VALUE_TERMINATOR, last_step);
arena.precompute_epsilon_closures();
(arena, start)
}
#[derive(Debug)]
enum ShellstyleSegment {
Literal(Vec<u8>),
Wildcard,
}
fn build_fa_from_segments(
segments: &[ShellstyleSegment],
match_state: StateId,
arena: &mut StateArena,
) -> StateId {
let start = arena.alloc();
let mut state = start;
let mut skip_first_literal_byte = false;
for (seg_idx, seg) in segments.iter().enumerate() {
match seg {
ShellstyleSegment::Literal(bytes) => {
let byte_start = usize::from(skip_first_literal_byte);
skip_first_literal_byte = false;
for &ch in &bytes[byte_start..] {
let next_step = arena.alloc();
arena[state].table.set_transition(ch, next_step);
state = next_step;
}
}
ShellstyleSegment::Wildcard => {
let spinner = arena.alloc();
arena[spinner].table = make_byte_dot_table(spinner);
arena[state].table.epsilons.push(spinner);
if let Some(ShellstyleSegment::Literal(next_bytes)) = segments.get(seg_idx + 1)
&& !next_bytes.is_empty()
{
let spin_escape = arena.alloc();
arena[spin_escape].table.epsilons.push(spinner);
arena[spinner]
.table
.set_transition(next_bytes[0], spin_escape);
state = spin_escape;
skip_first_literal_byte = true;
}
if !skip_first_literal_byte {
state = spinner;
}
}
}
}
let last_step = arena.alloc();
arena[last_step].field_transitions = arena[match_state].field_transitions.clone();
arena[state]
.table
.set_transition(ARENA_VALUE_TERMINATOR, last_step);
start
}
fn make_byte_dot_table(dest: StateId) -> SmallTable {
let mut table = SmallTable::new();
table.ceilings = smallvec![0xC0, 0xC2, ARENA_VALUE_TERMINATOR, BYTE_CEILING_U8];
table.steps = smallvec![dest, StateId::NONE, dest, StateId::NONE];
table
}
#[must_use]
pub fn make_wildcard_arena_fa(
pattern: &[u8],
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let segments = parse_wildcard_segments(pattern);
let start = build_fa_from_segments(&segments, match_state, &mut arena);
arena.precompute_epsilon_closures();
(arena, start)
}
fn parse_wildcard_segments(pattern: &[u8]) -> Vec<ShellstyleSegment> {
let mut segments = Vec::new();
let mut iter = pattern.iter();
while let Some(&ch) = iter.next() {
if ch == b'*' {
segments.push(ShellstyleSegment::Wildcard);
continue;
}
let literal = if ch == b'\\'
&& let Some(&escaped) = iter.next()
{
escaped
} else {
ch
};
if let Some(ShellstyleSegment::Literal(bytes)) = segments.last_mut() {
bytes.push(literal);
} else {
segments.push(ShellstyleSegment::Literal(vec![literal]));
}
}
segments
}
#[must_use]
pub fn make_anything_but_arena_fa(
excluded: &[Vec<u8>],
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let success = arena.alloc();
arena[success].field_transitions.push(next_field);
let start = build_anything_but_step(excluded, 0, success, &mut arena);
arena.precompute_epsilon_closures();
(arena, start)
}
fn build_anything_but_step(
vals: &[Vec<u8>],
index: usize,
success: StateId,
arena: &mut StateArena,
) -> StateId {
let mut vals_with_bytes_remaining: FxHashMap<u8, Vec<&Vec<u8>>> = FxHashMap::default();
let mut vals_ending_here: FxHashSet<u8> = FxHashSet::default();
for val in vals {
let Some(&utf8_byte) = val.get(index) else {
continue;
};
if index == val.len() - 1 {
vals_ending_here.insert(utf8_byte);
} else {
vals_with_bytes_remaining
.entry(utf8_byte)
.or_default()
.push(val);
}
}
let all_bytes: FxHashSet<u8> = vals_with_bytes_remaining
.keys()
.chain(vals_ending_here.iter())
.copied()
.collect();
let mut special_mappings: Vec<(u8, StateId)> = Vec::new();
for utf8_byte in all_bytes {
let has_continuation = vals_with_bytes_remaining.contains_key(&utf8_byte);
let ends_here = vals_ending_here.contains(&utf8_byte);
if has_continuation && ends_here {
let continuing_vals = vals_with_bytes_remaining.get(&utf8_byte).unwrap();
let owned_vals: Vec<Vec<u8>> = continuing_vals.iter().copied().cloned().collect();
let continuation = build_anything_but_step(&owned_vals, index + 1, success, arena);
let fail_state = arena.alloc();
let mut combined_unpacked: [StateId; BYTE_CEILING] = [success; BYTE_CEILING];
unpack_arena_table(&arena[continuation].table, &mut combined_unpacked);
combined_unpacked[ARENA_VALUE_TERMINATOR as usize] = fail_state;
let combined_state = arena.alloc();
arena[combined_state].table.pack(&combined_unpacked);
special_mappings.push((utf8_byte, combined_state));
} else if has_continuation {
let continuing_vals = vals_with_bytes_remaining.get(&utf8_byte).unwrap();
let owned_vals: Vec<Vec<u8>> = continuing_vals.iter().copied().cloned().collect();
let next_state = build_anything_but_step(&owned_vals, index + 1, success, arena);
special_mappings.push((utf8_byte, next_state));
} else if ends_here {
let fail_state = arena.alloc(); let last_state = arena.alloc_with_table(SmallTable::with_mappings(
success, &[ARENA_VALUE_TERMINATOR],
&[fail_state], ));
special_mappings.push((utf8_byte, last_state));
}
}
if special_mappings.is_empty() {
return arena.alloc_with_table(SmallTable::with_mappings(success, &[], &[]));
}
let bytes: Vec<u8> = special_mappings.iter().map(|(b, _)| *b).collect();
let states: Vec<StateId> = special_mappings.iter().map(|(_, s)| *s).collect();
arena.alloc_with_table(SmallTable::with_mappings(success, &bytes, &states))
}
#[must_use]
pub fn make_monocase_arena_fa(val: &[u8], next_field: Arc<FieldMatcher>) -> (StateArena, StateId) {
use crate::case_folding::case_fold_char;
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let start = if val.is_empty() {
arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
))
} else if let Ok(s) = std::str::from_utf8(val) {
let chars: Vec<(Vec<u8>, Option<Vec<u8>>)> = s
.char_indices()
.map(|(offset, ch)| {
let next_offset = s[offset..]
.chars()
.next()
.map_or(val.len(), |c| offset + c.len_utf8());
let orig = val[offset..next_offset].to_vec();
let alt = case_fold_char(ch).map(|alt_char| {
let mut buf = [0u8; 4];
alt_char.encode_utf8(&mut buf);
buf[..alt_char.len_utf8()].to_vec()
});
(orig, alt)
})
.collect();
build_monocase_arena_recursive(&chars, 0, match_state, &mut arena)
} else {
build_monocase_ascii_chain(val, match_state, &mut arena)
};
arena.precompute_epsilon_closures();
(arena, start)
}
fn build_monocase_ascii_chain(val: &[u8], match_state: StateId, arena: &mut StateArena) -> StateId {
let term_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let mut current_next = term_state;
for i in (0..val.len()).rev() {
let byte = val[i];
let alt_byte = if byte.is_ascii_lowercase() {
Some(byte.to_ascii_uppercase())
} else if byte.is_ascii_uppercase() {
Some(byte.to_ascii_lowercase())
} else {
None
};
let state = if let Some(alt) = alt_byte {
arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[byte, alt],
&[current_next, current_next],
))
} else {
arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[byte],
&[current_next],
))
};
current_next = state;
}
current_next
}
fn build_monocase_arena_recursive(
chars: &[(Vec<u8>, Option<Vec<u8>>)],
idx: usize,
match_state: StateId,
arena: &mut StateArena,
) -> StateId {
if idx >= chars.len() {
return arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
}
let (orig, alt) = &chars[idx];
let next_state = build_monocase_arena_recursive(chars, idx + 1, match_state, arena);
if let Some(alt_bytes) = alt {
let common_prefix = orig
.iter()
.zip(alt_bytes.iter())
.take_while(|(a, b)| a == b)
.count();
if common_prefix == 0 {
let orig_state = build_arena_fragment(orig, next_state, arena);
let alt_state = build_arena_fragment(alt_bytes, next_state, arena);
arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[orig[0], alt_bytes[0]],
&[orig_state, alt_state],
))
} else {
let orig_suffix = &orig[common_prefix..];
let alt_suffix = &alt_bytes[common_prefix..];
debug_assert!(
!orig_suffix.is_empty() && !alt_suffix.is_empty(),
"monocase suffixes are prefix-free after the shared prefix"
);
let orig_state = build_arena_fragment(orig_suffix, next_state, arena);
let alt_state = build_arena_fragment(alt_suffix, next_state, arena);
let mut current = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[orig_suffix[0], alt_suffix[0]],
&[orig_state, alt_state],
));
for &byte in orig[..common_prefix].iter().rev() {
current = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[byte],
&[current],
));
}
current
}
} else {
let mut current = next_state;
for &byte in orig.iter().rev() {
current = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[byte],
&[current],
));
}
current
}
}
fn build_arena_fragment(val: &[u8], end_at: StateId, arena: &mut StateArena) -> StateId {
let Some((_first, rest)) = val.split_first() else {
unreachable!("monocase fragment is a non-empty char encoding")
};
let mut current = end_at;
for &byte in rest.iter().rev() {
current = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[byte],
&[current],
));
}
current
}
#[must_use]
pub fn make_cidr_arena_fa(
cidr: &crate::json::CidrPattern,
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
use crate::json::CidrPattern;
let (mut arena, start) = match cidr {
CidrPattern::V4 {
network,
prefix_len,
} => make_ipv4_cidr_arena_fa(network, *prefix_len, next_field),
CidrPattern::V6 {
network,
prefix_len,
} => make_ipv6_cidr_arena_fa(network, *prefix_len, next_field),
};
arena.precompute_epsilon_closures();
(arena, start)
}
fn make_ipv4_cidr_arena_fa(
network: &[u8; 4],
prefix_len: u8,
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let term_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let close_quote_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b"\"",
&[term_state],
));
let mut current_state = close_quote_state;
for octet_idx in (0..4).rev() {
let octet_start_bit = octet_idx * 8;
let octet_end_bit = octet_start_bit + 8;
let (min_val, max_val) = if prefix_len as usize >= octet_end_bit {
(network[octet_idx], network[octet_idx])
} else if (prefix_len as usize) <= octet_start_bit {
(0u8, 255u8)
} else {
let constrained_bits = prefix_len as usize - octet_start_bit;
let mask = !0u8 << (8 - constrained_bits);
let base = network[octet_idx] & mask;
let range_size = 1u16 << (8 - constrained_bits);
(base, (u16::from(base) + range_size - 1).min(255) as u8)
};
let octet_start = build_octet_range_arena_fa(min_val, max_val, current_state, &mut arena);
if octet_idx > 0 {
current_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b".",
&[octet_start],
));
} else {
current_state = octet_start;
}
}
let open_quote_start = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b"\"",
&[current_state],
));
(arena, open_quote_start)
}
fn make_ipv6_cidr_arena_fa(
network: &[u8; 16],
prefix_len: u8,
next_field: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(next_field);
let term_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let close_quote_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b"\"",
&[term_state],
));
let mut current_state = close_quote_state;
for group_idx in (0..8).rev() {
let byte_idx = group_idx * 2;
let group_start_bit = group_idx * 16;
let group_end_bit = group_start_bit + 16;
let group_value = u16::from_be_bytes([network[byte_idx], network[byte_idx + 1]]);
let (min_val, max_val) = if prefix_len as usize >= group_end_bit {
(group_value, group_value)
} else if (prefix_len as usize) <= group_start_bit {
(0u16, 0xffffu16)
} else {
let constrained_bits = prefix_len as usize - group_start_bit;
let mask = !0u16 << (16 - constrained_bits);
let base = group_value & mask;
let range_size = 1u32 << (16 - constrained_bits);
(base, (u32::from(base) + range_size - 1).min(0xffff) as u16)
};
let group_start =
build_ipv6_group_range_arena_fa(min_val, max_val, current_state, &mut arena);
if group_idx > 0 {
current_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b":",
&[group_start],
));
} else {
current_state = group_start;
}
}
let open_quote_start = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b"\"",
&[current_state],
));
(arena, open_quote_start)
}
fn build_octet_range_arena_fa(
min_val: u8,
max_val: u8,
continuation: StateId,
arena: &mut StateArena,
) -> StateId {
if min_val == max_val {
let val_str = min_val.to_string();
return build_literal_chain_arena(val_str.as_bytes(), continuation, arena);
}
let start = arena.alloc();
let mut value_starts = Vec::new();
for val in min_val..=max_val {
let val_str = val.to_string();
let val_start = build_literal_chain_arena(val_str.as_bytes(), continuation, arena);
value_starts.push(val_start);
}
arena[start].table.epsilons = SmallVec::from_vec(value_starts);
start
}
fn build_ipv6_group_range_arena_fa(
min_val: u16,
max_val: u16,
continuation: StateId,
arena: &mut StateArena,
) -> StateId {
if min_val == 0 && max_val == 0xffff {
return build_any_hex_group_arena(continuation, arena);
}
if min_val == max_val {
let val_str = format!("{min_val:x}");
return build_literal_chain_arena(val_str.as_bytes(), continuation, arena);
}
let start = arena.alloc();
let mut value_starts = Vec::new();
for val in min_val..=max_val {
let val_str = format!("{val:x}");
let val_start = build_literal_chain_arena(val_str.as_bytes(), continuation, arena);
value_starts.push(val_start);
}
arena[start].table.epsilons = SmallVec::from_vec(value_starts);
start
}
fn build_literal_chain_arena(val: &[u8], continuation: StateId, arena: &mut StateArena) -> StateId {
let mut current = continuation;
for &byte in val.iter().rev() {
current = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[byte],
&[current],
));
}
current
}
fn build_any_hex_group_arena(continuation: StateId, arena: &mut StateArena) -> StateId {
let hex_chars: Vec<u8> = (b'0'..=b'9')
.chain(b'a'..=b'f')
.chain(b'A'..=b'F')
.collect();
let mut current = continuation;
for digit_pos in (0..4).rev() {
let next_state = arena.alloc();
let mut bytes = Vec::new();
let mut targets = Vec::new();
for &b in &hex_chars {
bytes.push(b);
targets.push(current);
}
arena[next_state].table = SmallTable::with_mappings(StateId::NONE, &bytes, &targets);
if digit_pos > 0 {
arena[next_state].table.epsilons.push(continuation);
}
current = next_state;
}
current
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_state_id_none() {
assert!(StateId::NONE.is_none());
assert!(!StateId(0).is_none());
assert!(!StateId(100).is_none());
}
#[test]
fn test_arena_alloc() {
let mut arena = StateArena::new();
let id1 = arena.alloc();
let id2 = arena.alloc();
assert_eq!(id1.index(), 0);
assert_eq!(id2.index(), 1);
assert_eq!(arena.len(), 2);
}
#[test]
fn test_arena_cyclic_reference() {
let mut arena = StateArena::new();
let state_a = arena.alloc();
let state_b = arena.alloc();
arena[state_a].table.epsilons.push(state_b);
arena[state_b].table.epsilons.push(state_a);
assert_eq!(arena[state_a].table.epsilons[0], state_b);
assert_eq!(arena[state_b].table.epsilons[0], state_a);
}
#[test]
fn test_arena_small_table_pack() {
let mut table = SmallTable::new();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[b'a' as usize] = StateId(0);
unpacked[b'b' as usize] = StateId(1);
table.pack(&unpacked);
assert_eq!(table.dstep(b'a'), StateId(0));
assert_eq!(table.dstep(b'b'), StateId(1));
assert!(table.dstep(b'c').is_none());
}
#[test]
fn test_dstep_rejects_forbidden_utf8() {
let s = StateId(0);
let table = SmallTable::with_mappings(StateId::NONE, b"a", &[s]);
assert_eq!(table.dstep(b'a'), s, "valid byte should still transition");
for byte in [0xC0_u8, 0xC1] {
assert!(
table.dstep(byte).is_none(),
"forbidden byte {byte:#04x} must not transition"
);
}
for byte in 0xF5_u8..=0xFF {
assert!(
table.dstep(byte).is_none(),
"forbidden byte {byte:#04x} must not transition"
);
}
}
#[test]
fn test_make_byte_dot_table_forbidden_bytes() {
let dest = StateId(7);
let table = make_byte_dot_table(dest);
for byte in 0..=255_u32 {
#[allow(clippy::cast_possible_truncation)]
let b = byte as u8;
let forbidden = matches!(b, 0xC0 | 0xC1) || b >= 0xF5;
let got = table.dstep(b);
if forbidden {
assert!(
got.is_none(),
"byte {b:#04x} is forbidden but transitioned to {got:?}"
);
} else {
assert_eq!(got, dest, "byte {b:#04x} should self-loop to dest");
}
}
}
#[test]
fn test_traverse_arena_nfa_simple() {
let mut arena = StateArena::new();
let field_matcher = Arc::new(FieldMatcher::new());
let final_state = arena.alloc();
arena[final_state]
.field_transitions
.push(field_matcher.clone());
let match_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[final_state],
));
let start = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b"a",
&[match_state],
));
arena.precompute_epsilon_closures();
let mut bufs = NfaBuffers::with_capacity();
let value = b"a";
traverse_arena_nfa(&arena, start, value, &mut bufs);
assert_eq!(bufs.transitions.len(), 1);
assert_eq!(bufs.transitions[0], Arc::as_ptr(&field_matcher) as usize);
bufs.clear();
traverse_arena_nfa(&arena, start, b"b", &mut bufs);
assert!(bufs.transitions.is_empty());
}
#[test]
fn test_traverse_arena_nfa_star_cyclic() {
let mut arena = StateArena::new();
let field_matcher = Arc::new(FieldMatcher::new());
let final_state = arena.alloc();
arena[final_state].field_transitions.push(field_matcher);
let exit_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[final_state],
));
let loopback = arena.alloc();
let start = arena.alloc_with_table({
let mut table = SmallTable::with_mappings(StateId::NONE, b"ab", &[loopback, loopback]);
table.epsilons.push(exit_state);
table
});
arena[loopback].table.epsilons = smallvec![exit_state, start];
arena.precompute_epsilon_closures();
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&arena, start, b"", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "[ab]* should match empty string");
bufs.clear();
traverse_arena_nfa(&arena, start, b"a", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "[ab]* should match 'a'");
bufs.clear();
traverse_arena_nfa(&arena, start, b"ab", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "[ab]* should match 'ab'");
bufs.clear();
traverse_arena_nfa(&arena, start, b"aaa", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "[ab]* should match 'aaa'");
bufs.clear();
traverse_arena_nfa(&arena, start, b"abba", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "[ab]* should match 'abba'");
bufs.clear();
let long_value = "ab".repeat(100);
traverse_arena_nfa(&arena, start, long_value.as_bytes(), &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "[ab]* should match long string");
bufs.clear();
traverse_arena_nfa(&arena, start, b"c", &mut bufs);
assert!(bufs.transitions.is_empty(), "[ab]* should NOT match 'c'");
}
#[test]
fn test_traverse_arena_nfa_plus_cyclic() {
let mut arena = StateArena::new();
let field_matcher = Arc::new(FieldMatcher::new());
let final_state = arena.alloc();
arena[final_state].field_transitions.push(field_matcher);
let exit_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[final_state],
));
let loopback = arena.alloc();
let start = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b"ab",
&[loopback, loopback],
));
arena[loopback].table.epsilons = smallvec![exit_state, start];
arena.precompute_epsilon_closures();
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&arena, start, b"", &mut bufs);
assert!(
bufs.transitions.is_empty(),
"[ab]+ should NOT match empty string"
);
bufs.clear();
traverse_arena_nfa(&arena, start, b"a", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "[ab]+ should match 'a'");
bufs.clear();
traverse_arena_nfa(&arena, start, b"ab", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "[ab]+ should match 'ab'");
bufs.clear();
let long_value = "ab".repeat(100);
traverse_arena_nfa(&arena, start, long_value.as_bytes(), &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "[ab]+ should match long string");
}
#[test]
fn test_arena_state_count_vs_chain() {
let mut arena = StateArena::new();
let field_matcher = Arc::new(FieldMatcher::new());
let final_state = arena.alloc();
arena[final_state].field_transitions.push(field_matcher);
let exit_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[final_state],
));
let loopback = arena.alloc();
let start = arena.alloc_with_table({
let mut table = SmallTable::with_mappings(StateId::NONE, b"a", &[loopback]);
table.epsilons.push(exit_state);
table
});
arena[loopback].table.epsilons = smallvec![exit_state, start];
assert_eq!(arena.len(), 4, "Arena [a]* should only need 4 states");
arena.precompute_epsilon_closures();
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&arena, start, b"aaaaaaaaaa", &mut bufs);
assert_eq!(bufs.transitions.len(), 1);
}
#[test]
fn test_nested_quantifier_dedup() {
let mut arena = StateArena::new();
let field_matcher = Arc::new(FieldMatcher::new());
let final_state = arena.alloc();
arena[final_state].field_transitions.push(field_matcher);
let exit_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[final_state],
));
let loopback = arena.alloc();
let start = arena.alloc_with_table({
let mut table = SmallTable::with_mappings(StateId::NONE, b"abc", &[loopback; 3]);
table.epsilons.push(exit_state);
table
});
arena[loopback].table.epsilons = smallvec![exit_state, start];
arena.precompute_epsilon_closures();
let mut bufs = NfaBuffers::with_capacity();
let long_input: Vec<u8> = std::iter::repeat_n(b'a', 200).collect();
traverse_arena_nfa(&arena, start, &long_input, &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "Should match the long input");
bufs.clear();
traverse_arena_nfa(&arena, start, b"abc", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "Should match 'abc'");
bufs.clear();
traverse_arena_nfa(&arena, start, b"", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "Should match empty (via ?)");
bufs.clear();
traverse_arena_nfa(&arena, start, b"d", &mut bufs);
assert!(
bufs.transitions.is_empty(),
"Should not match 'd' (only a/b/c)"
);
}
#[test]
fn test_ascii_fast_path_acceleration() {
let mut arena = StateArena::new();
let field_matcher = Arc::new(FieldMatcher::new());
let final_state = arena.alloc();
arena[final_state].field_transitions.push(field_matcher);
let exit_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[final_state],
));
let loopback = arena.alloc();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
for (byte, slot) in unpacked.iter_mut().enumerate() {
if byte != b'x' as usize {
*slot = loopback;
}
}
let mut start_table = SmallTable::new();
start_table.pack(&unpacked);
start_table.accel = Some(super::super::AccelInfo {
exit_bytes: [b'x', 0, 0],
len: 1,
});
let start = arena.alloc_with_table(start_table);
arena[loopback].table.epsilons = smallvec![exit_state, start];
arena[loopback].table.accel = Some(super::super::AccelInfo {
exit_bytes: [b'x', 0, 0],
len: 1,
});
arena.precompute_epsilon_closures();
let mut bufs = NfaBuffers::with_capacity();
let test_value = b"aaaaaaaaaaaaaaaaaaaaaaaaax";
traverse_arena_nfa(&arena, start, test_value, &mut bufs);
assert!(
bufs.transitions.is_empty(),
"[^x]+ should NOT match string ending with 'x'"
);
bufs.clear();
let test_value2 = b"aaaaaaaaaaaaaaaaaaaaaaaa";
traverse_arena_nfa(&arena, start, test_value2, &mut bufs);
assert_eq!(
bufs.transitions.len(),
1,
"[^x]+ should match string without 'x'"
);
bufs.clear();
let test_value3 = b"aaaaaaxaaaaaa";
traverse_arena_nfa(&arena, start, test_value3, &mut bufs);
assert!(
bufs.transitions.is_empty(),
"[^x]+ should NOT match string with 'x' in middle"
);
}
#[test]
fn test_arena_nfa_skips_acceleration_with_multiple_active_states() {
let mut arena = StateArena::new();
let field_matcher = Arc::new(FieldMatcher::new());
let match_state = arena.alloc();
arena[match_state].field_transitions.push(field_matcher);
let lit_final = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let lit_ready =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"z", &[lit_final]));
let lit_mid =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"b", &[lit_ready]));
let accel_state = arena.alloc();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
for (byte, slot) in unpacked.iter_mut().enumerate() {
if byte != b'z' as usize && byte != ARENA_VALUE_TERMINATOR as usize {
*slot = accel_state;
}
}
arena[accel_state].table.pack(&unpacked);
arena[accel_state].table.accel = Some(AccelInfo {
exit_bytes: [b'z', 0, 0],
len: 1,
});
let lit_entry =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[lit_mid]));
let start = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b"a",
&[accel_state],
));
arena[start].table.epsilons = smallvec![lit_entry];
arena.precompute_epsilon_closures();
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&arena, start, b"abz", &mut bufs);
assert_eq!(
bufs.transitions.len(),
1,
"literal 'abz' must match: acceleration must not fire while a second state is active"
);
}
#[test]
fn test_arena_nfa_dedup_generation_advances_between_steps() {
const HUBS: usize = 70;
let mut arena = StateArena::new();
let field_matcher = Arc::new(FieldMatcher::new());
let match_state = arena.alloc();
arena[match_state].field_transitions.push(field_matcher);
let hubs: Vec<StateId> = (0..HUBS).map(|_| arena.alloc()).collect();
for (k, &hub) in hubs.iter().enumerate() {
if k == 0 {
arena[hub].table = SmallTable::with_mappings(
StateId::NONE,
&[b'a', ARENA_VALUE_TERMINATOR],
&[hub, match_state],
);
} else {
arena[hub].table = SmallTable::with_mappings(StateId::NONE, b"a", &[hub]);
}
}
let start = arena.alloc();
arena[start].table.epsilons = SmallVec::from_vec(hubs);
arena.precompute_epsilon_closures();
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&arena, start, b"aa", &mut bufs);
assert_eq!(
bufs.transitions.len(),
1,
"match must survive two consecutive dedup steps over a repeating active set"
);
}
#[test]
fn test_try_accelerate_arena() {
let mut table = SmallTable::new();
let nz = |n: usize| NonZero::new(n).unwrap();
assert!(try_accelerate_arena(&table, b"hello").is_none());
table.accel = Some(super::super::AccelInfo {
exit_bytes: [b'x', 0, 0],
len: 1,
});
assert_eq!(try_accelerate_arena(&table, b"helloxworld"), Some(nz(5))); assert!(try_accelerate_arena(&table, b"hello").is_none()); assert!(try_accelerate_arena(&table, b"xhello").is_none());
assert!(try_accelerate_arena(&table, b"").is_none());
table.accel = Some(super::super::AccelInfo {
exit_bytes: [b'x', b'y', 0],
len: 2,
});
assert!(try_accelerate_arena(&table, b"helloworld").is_none()); assert_eq!(try_accelerate_arena(&table, b"hellxyworld"), Some(nz(4))); assert_eq!(try_accelerate_arena(&table, b"hellyxworld"), Some(nz(4))); assert_eq!(try_accelerate_arena(&table, b"helloxyw"), Some(nz(5)));
table.accel = Some(super::super::AccelInfo {
exit_bytes: [b'x', b'y', b'z'],
len: 3,
});
assert_eq!(try_accelerate_arena(&table, b"abcdefghijz"), Some(nz(10))); assert_eq!(try_accelerate_arena(&table, b"abcxyz"), Some(nz(3))); assert!(try_accelerate_arena(&table, b"abcdefghij").is_none()); }
#[test]
fn test_traverse_dfa_backward_none_start_collects_nothing() {
let arena = StateArena::new();
let mut transitions = Vec::new();
traverse_arena_dfa_backward(&arena, StateId::NONE, b"suffix", &mut transitions);
assert!(transitions.is_empty());
}
#[test]
fn test_dfa_lookup_matches_smalltable_dstep() {
let mut arena = StateArena::new();
let s0 = arena.alloc();
let s1 = arena.alloc();
let s2 = arena.alloc();
arena[s0].table.ceilings = smallvec![b'c', b'z'];
arena[s0].table.steps = smallvec![s1, s2];
arena[s1].table.ceilings = smallvec![0x80];
arena[s1].table.steps = smallvec![s0];
let mut expected: Vec<Vec<StateId>> = Vec::new();
for sid in [s0, s1, s2] {
let mut row = Vec::new();
for byte in 0..=255u8 {
row.push(arena[sid].table.dstep(byte));
}
expected.push(row);
}
arena.flatten_tables();
for (i, sid) in [s0, s1, s2].iter().enumerate() {
for byte in 0..=255u8 {
assert_eq!(
arena.dstep(*sid, byte),
expected[i][byte as usize],
"mismatch at state {i}, byte {byte:#04x}"
);
}
}
}
#[test]
fn test_arena_stats_empty() {
let arena = StateArena::new();
let stats = arena.stats();
assert_eq!(stats.state_count, 0);
assert_eq!(stats.tables_with_transitions, 0);
assert_eq!(stats.total_epsilons, 0);
let _s = format!("{stats}");
}
#[test]
fn test_arena_stats_basic() {
let mut arena = StateArena::new();
let s0 = arena.alloc();
let s1 = arena.alloc();
let s2 = arena.alloc();
arena[s0].table.set_transition(b'a', s1);
arena[s1].table.epsilons.push(s2);
let fm = Arc::new(FieldMatcher::new());
arena[s2].field_transitions.push(fm);
let stats = arena.stats();
assert_eq!(stats.state_count, 3);
assert!(stats.tables_with_transitions >= 1); assert_eq!(stats.total_epsilons, 1);
assert_eq!(stats.max_epsilons, 1);
assert_eq!(stats.states_with_field_transitions, 1);
assert_eq!(stats.states_with_closures, 3);
assert_eq!(stats.max_closure_len, 1);
assert_eq!(stats.dfa_lookup_states, 0);
arena.precompute_epsilon_closures();
let stats = arena.stats();
assert_eq!(stats.states_with_closures, 3);
assert!(stats.max_closure_len >= 2); assert!(stats.closure_data_len > 0);
arena.flatten_tables();
let stats = arena.stats();
#[cfg(not(miri))]
assert_eq!(stats.dfa_lookup_states, 3);
assert!(stats.ft_ptrs_len > 0);
let display = format!("{stats}");
assert!(display.contains("states=3"));
}
}
#[cfg(test)]
mod arena_stats_utility_tests {
use super::*;
#[test]
fn test_arena_stats_add_sums_and_maxes() {
let mut a = Stats {
state_count: 10,
tables_with_transitions: 5,
total_ceiling_entries: 20,
max_ceilings: 4,
total_epsilons: 3,
max_epsilons: 2,
states_with_field_transitions: 2,
closure_data_len: 15,
states_with_closures: 8,
total_closure_entries: 12,
max_closure_len: 3,
ft_ptrs_len: 6,
dfa_lookup_states: 10,
estimated_bytes: 1000,
};
let b = Stats {
state_count: 7,
tables_with_transitions: 3,
total_ceiling_entries: 10,
max_ceilings: 6,
total_epsilons: 5,
max_epsilons: 1,
states_with_field_transitions: 1,
closure_data_len: 9,
states_with_closures: 4,
total_closure_entries: 7,
max_closure_len: 5,
ft_ptrs_len: 2,
dfa_lookup_states: 7,
estimated_bytes: 500,
};
a.add(&b);
assert_eq!(a.state_count, 17);
assert_eq!(a.tables_with_transitions, 8);
assert_eq!(a.total_ceiling_entries, 30);
assert_eq!(a.total_epsilons, 8);
assert_eq!(a.states_with_field_transitions, 3);
assert_eq!(a.closure_data_len, 24);
assert_eq!(a.states_with_closures, 12);
assert_eq!(a.total_closure_entries, 19);
assert_eq!(a.ft_ptrs_len, 8);
assert_eq!(a.dfa_lookup_states, 17);
assert_eq!(a.estimated_bytes, 1500);
assert_eq!(a.max_ceilings, 6); assert_eq!(a.max_epsilons, 2); assert_eq!(a.max_closure_len, 5); }
#[test]
fn test_arena_stats_add_equal_max_values() {
let mut a = Stats {
max_ceilings: 4,
max_epsilons: 3,
max_closure_len: 5,
..Default::default()
};
let b = Stats {
max_ceilings: 4,
max_epsilons: 3,
max_closure_len: 5,
..Default::default()
};
a.add(&b);
assert_eq!(a.max_ceilings, 4);
assert_eq!(a.max_epsilons, 3);
assert_eq!(a.max_closure_len, 5);
}
#[test]
fn test_estimated_byte_size() {
let mut arena = StateArena::new();
let empty_size = arena.estimated_byte_size();
arena.alloc();
arena.alloc();
arena.alloc();
let size_with_states = arena.estimated_byte_size();
let expected = arena.states.capacity() * std::mem::size_of::<FaState>()
+ arena.closure_data.capacity() * std::mem::size_of::<StateId>()
+ arena.ft_ptrs.capacity() * std::mem::size_of::<usize>()
+ arena.dfa_lookup.capacity() * std::mem::size_of::<StateId>();
assert_eq!(size_with_states, expected);
assert!(size_with_states >= empty_size);
}
#[test]
fn test_debug_fmt_arena() {
let mut arena = StateArena::new();
arena.alloc();
arena.alloc();
let dbg = format!("{arena:?}");
assert!(dbg.contains("states_count"));
assert!(dbg.contains('2')); }
#[test]
fn test_debug_fmt_state() {
let state = FaState::new();
let dbg = format!("{state:?}");
assert!(dbg.contains("FaState"));
assert!(dbg.contains("field_transitions_count"));
}
#[test]
fn test_with_capacity() {
let mut arena = StateArena::with_capacity(10);
assert!(arena.is_empty());
assert_eq!(arena.len(), 0);
assert!(arena.states.capacity() >= 10);
assert!(arena.closure_data.capacity() >= 10);
let id = arena.alloc();
assert_eq!(id.index(), 0);
assert!(!arena.is_empty());
assert_eq!(arena.len(), 1);
}
#[test]
fn test_is_empty_transitions() {
let mut arena = StateArena::new();
assert!(arena.is_empty());
let id = arena.alloc();
assert!(!arena.is_empty());
assert!(arena.get(id).is_some());
}
#[test]
fn test_get_mut_valid_and_invalid() {
let mut arena = StateArena::new();
let id = arena.alloc();
assert!(arena.get_mut(id).is_some());
assert!(arena.get_mut(StateId::NONE).is_none());
assert!(arena.get_mut(StateId::from_index(999)).is_none());
}
#[test]
fn test_stats_max_tracking_across_states() {
let mut arena = StateArena::new();
let s0 = arena.alloc();
let s1 = arena.alloc();
let s2 = arena.alloc();
let s3 = arena.alloc();
arena[s0].table.ceilings = smallvec![b'a', BYTE_CEILING_U8];
arena[s0].table.steps = smallvec![s1, StateId::NONE];
arena[s1].table.ceilings = smallvec![b'a', b'b', BYTE_CEILING_U8];
arena[s1].table.steps = smallvec![s0, s2, StateId::NONE];
arena[s1].table.epsilons.push(s3);
arena[s2].table.ceilings = smallvec![b'a', b'b', b'c', BYTE_CEILING_U8];
arena[s2].table.steps = smallvec![s0, s1, s3, StateId::NONE];
arena[s2].table.epsilons.push(s0);
arena[s2].table.epsilons.push(s1);
let stats = arena.stats();
assert_eq!(stats.state_count, 4);
assert_eq!(stats.tables_with_transitions, 3); assert_eq!(stats.max_ceilings, 4); assert_eq!(stats.total_epsilons, 3); assert_eq!(stats.max_epsilons, 2); }
#[test]
fn test_is_nondeterministic() {
let mut arena = StateArena::new();
let s0 = arena.alloc();
let s1 = arena.alloc();
assert!(!arena.is_nondeterministic());
arena[s0].table.epsilons.push(s1);
assert!(arena.is_nondeterministic());
}
#[test]
fn test_stats_add_max_takes_larger_both_directions() {
let mut self_larger = Stats {
max_ceilings: 9,
max_epsilons: 8,
max_closure_len: 7,
..Default::default()
};
let smaller = Stats {
max_ceilings: 2,
max_epsilons: 1,
max_closure_len: 3,
..Default::default()
};
self_larger.add(&smaller);
assert_eq!(self_larger.max_ceilings, 9);
assert_eq!(self_larger.max_epsilons, 8);
assert_eq!(self_larger.max_closure_len, 7);
let mut self_smaller = Stats {
max_ceilings: 2,
max_epsilons: 1,
max_closure_len: 3,
..Default::default()
};
let larger = Stats {
max_ceilings: 9,
max_epsilons: 8,
max_closure_len: 7,
..Default::default()
};
self_smaller.add(&larger);
assert_eq!(self_smaller.max_ceilings, 9);
assert_eq!(self_smaller.max_epsilons, 8);
assert_eq!(self_smaller.max_closure_len, 7);
}
#[test]
fn test_stats_total_ceiling_entries_accumulates() {
let mut arena = StateArena::new();
let s0 = arena.alloc();
let s1 = arena.alloc();
let s2 = arena.alloc();
arena[s0].table.ceilings = smallvec![b'a', BYTE_CEILING_U8];
arena[s0].table.steps = smallvec![s1, StateId::NONE];
arena[s1].table.ceilings = smallvec![b'a', b'b', BYTE_CEILING_U8];
arena[s1].table.steps = smallvec![s0, s2, StateId::NONE];
let stats = arena.stats();
assert_eq!(stats.tables_with_transitions, 2);
assert_eq!(stats.total_ceiling_entries, 5); }
#[test]
fn test_estimated_byte_size_with_flattened_buffers() {
let mut arena = StateArena::new();
arena.alloc();
arena.ft_ptrs = vec![1usize, 2, 3, 4, 5, 6, 7];
arena.dfa_lookup = vec![StateId::NONE; 11];
assert!(arena.ft_ptrs.capacity() > 0);
assert!(arena.dfa_lookup.capacity() > 0);
let expected = arena.states.capacity() * std::mem::size_of::<FaState>()
+ arena.closure_data.capacity() * std::mem::size_of::<StateId>()
+ arena.ft_ptrs.capacity() * std::mem::size_of::<usize>()
+ arena.dfa_lookup.capacity() * std::mem::size_of::<StateId>();
assert_eq!(arena.estimated_byte_size(), expected);
}
#[test]
fn test_nfa_buffers_with_capacity_preallocates() {
let bufs = NfaBuffers::with_capacity();
assert!(
bufs.current_states.capacity() >= 16,
"current_states must be pre-allocated, got cap={}",
bufs.current_states.capacity()
);
assert!(
bufs.next_states.capacity() >= 16,
"next_states must be pre-allocated, got cap={}",
bufs.next_states.capacity()
);
let empty = NfaBuffers::new();
assert_eq!(empty.current_states.capacity(), 0);
assert_eq!(empty.next_states.capacity(), 0);
}
#[test]
fn test_alloc_sets_per_state_closure() {
let mut arena = StateArena::new();
let s0 = arena.alloc();
let s1 = arena.alloc();
let s2 = arena.alloc();
assert_eq!(arena.closure_of(s0), &[s0]);
assert_eq!(arena.closure_of(s1), &[s1]);
assert_eq!(arena.closure_of(s2), &[s2]);
}
#[test]
fn test_precompute_epsilon_closures_skips_out_of_range_target() {
let mut arena = StateArena::new();
let s0 = arena.alloc();
let s1 = arena.alloc();
let dangling = StateId::from_index(arena.len());
arena[s0].table.epsilons.push(s1);
arena[s0].table.epsilons.push(dangling);
arena.precompute_epsilon_closures();
let c = arena.closure_of(s0);
assert!(c.contains(&s0));
assert!(c.contains(&s1));
assert_eq!(c.len(), 2);
}
#[test]
fn test_unpack_arena_table_writes_full_byte_ceiling_range() {
let mut arena = StateArena::new();
let target = arena.alloc();
let table = SmallTable::with_mappings(target, &[], &[]);
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpack_arena_table(&table, &mut unpacked);
assert_eq!(unpacked[0], target);
assert_eq!(unpacked[BYTE_CEILING - 1], target);
}
#[test]
fn test_traverse_arena_nfa_multi_state_closure_final() {
let mut arena = StateArena::new();
let fm_a = Arc::new(FieldMatcher::with_match_id(11));
let fm_b = Arc::new(FieldMatcher::with_match_id(13));
let match_state_a = arena.alloc();
arena[match_state_a].field_transitions.push(fm_a);
let match_state_b = arena.alloc();
arena[match_state_b].field_transitions.push(fm_b);
let final_state = arena.alloc();
arena[final_state].table.epsilons.push(match_state_a);
arena[final_state].table.epsilons.push(match_state_b);
let vt_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[final_state],
));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[vt_state]));
arena.precompute_epsilon_closures();
assert!(
arena.closure_of(final_state).len() >= 2,
"test setup expects final_state to have a multi-state closure"
);
assert!(arena.ft_ptrs.is_empty());
let mut bufs = NfaBuffers::new();
traverse_arena_nfa(&arena, start, b"a", &mut bufs);
assert!(
!bufs.transitions.is_empty(),
"unfrozen path must collect transitions from final_state's closure"
);
arena.flatten_tables();
assert!(!arena.ft_ptrs.is_empty());
let mut bufs = NfaBuffers::new();
traverse_arena_nfa(&arena, start, b"a", &mut bufs);
assert!(
!bufs.transitions.is_empty(),
"frozen path must collect transitions from final_state's closure"
);
}
#[test]
fn test_traverse_arena_nfa_multi_state_closure_mid_value() {
let mut arena = StateArena::new();
let fm = Arc::new(FieldMatcher::with_match_id(7));
let accept = arena.alloc();
arena[accept].field_transitions.push(fm);
let m1 = arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"b", &[accept]));
let m2 = arena.alloc();
let fork = arena.alloc();
arena[fork].table.epsilons.push(m1);
arena[fork].table.epsilons.push(m2);
let start = arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[fork]));
arena.precompute_epsilon_closures();
assert!(arena.closure_of(fork).len() >= 2);
assert!(arena.ft_ptrs.is_empty());
let mut bufs = NfaBuffers::new();
traverse_arena_nfa(&arena, start, b"ab", &mut bufs);
assert!(
!bufs.transitions.is_empty(),
"unfrozen path must follow the onward transition through a closure mate"
);
arena.flatten_tables();
let mut bufs = NfaBuffers::new();
traverse_arena_nfa(&arena, start, b"ab", &mut bufs);
assert!(
!bufs.transitions.is_empty(),
"frozen path must follow the onward transition through a closure mate"
);
}
#[test]
fn test_lazy_dfa_conflicting_nfa_accel_not_accelerated() {
let mut arena = StateArena::new();
let s0 = arena.alloc();
let s1 = arena.alloc();
let s2 = arena.alloc();
arena[s0].table.epsilons.push(s1);
arena[s0].table.epsilons.push(s2);
arena[s1].table.set_transition(b'a', s1);
arena[s1].table.accel = Some(AccelInfo {
exit_bytes: [b'x', 0, 0],
len: 1,
});
arena[s2].table.set_transition(b'a', s2);
arena[s2].table.accel = Some(AccelInfo {
exit_bytes: [b'y', 0, 0],
len: 1,
});
arena.precompute_epsilon_closures();
let mut lazy = LazyDfa::new(arena, s0, 100);
let mut transitions = Vec::new();
traverse_lazy_dfa(&mut lazy, b"aaaa", &mut transitions);
let accel_count = lazy.states.iter().filter(|s| s.accel.is_some()).count();
assert_eq!(
accel_count, 0,
"state with conflicting NFA accel must not be accelerated"
);
}
}
#[cfg(test)]
#[allow(unsafe_code)]
mod merge_tests {
use super::*;
fn make_single_byte_arena(byte: u8, fm: Arc<FieldMatcher>) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, &[byte], &[term]));
(arena, start)
}
#[test]
fn test_merge_dual_spinout_states_filters_none_epsilons() {
let mut arena1 = StateArena::new();
let mut arena2 = StateArena::new();
let s1_id = arena1.alloc();
let s2_id = arena2.alloc();
arena1[s1_id].table.epsilons.push(StateId::NONE);
arena2[s2_id].table.epsilons.push(StateId::NONE);
let s1 = arena1[s1_id].clone();
let s2 = arena2[s2_id].clone();
let mut new_arena = StateArena::new();
let new_id = new_arena.alloc();
let mut memo: FxHashMap<(StateId, StateId), StateId> = FxHashMap::default();
merge_dual_spinout_states(
&arena1,
&s1,
&arena2,
&s2,
&mut new_arena,
&mut memo,
new_id,
);
assert!(
new_arena[new_id]
.table
.epsilons
.iter()
.all(|e| !e.is_none()),
"merged spinout table retained a NONE epsilon"
);
}
#[test]
fn test_asymmetric_spinner_merge_filters_none_epsilons() {
let mut arena1 = StateArena::new();
let mut arena2 = StateArena::new();
let s1_id = arena1.alloc();
let s2_id = arena2.alloc();
arena1[s1_id].table.epsilons.push(StateId::NONE);
let mut new_arena = StateArena::new();
let new_id = new_arena.alloc();
let mut memo: FxHashMap<(StateId, StateId), StateId> = FxHashMap::default();
asymmetric_spinner_merge(
&arena1,
s1_id,
&arena2,
s2_id,
true,
&mut new_arena,
&mut memo,
new_id,
);
assert!(
new_arena[new_id]
.table
.epsilons
.iter()
.all(|e| !e.is_none()),
"merged spinner table retained a NONE epsilon"
);
}
#[test]
fn test_merge_nfa_tables_bytewise_filters_none_epsilons() {
let arena1 = StateArena::new();
let arena2 = StateArena::new();
let mut table1 = SmallTable::new();
let mut table2 = SmallTable::new();
table1.epsilons.push(StateId::NONE);
table2.epsilons.push(StateId::NONE);
let mut new_arena = StateArena::new();
let mut memo: FxHashMap<(StateId, StateId), StateId> = FxHashMap::default();
let result = merge_nfa_tables_bytewise(
&arena1,
&table1,
&arena2,
&table2,
&mut new_arena,
&mut memo,
);
assert!(
result.epsilons.iter().all(|e| !e.is_none()),
"merged NFA table retained a NONE epsilon"
);
}
#[test]
fn test_merge_arena_tables_filters_none_epsilons() {
let arena1 = StateArena::new();
let arena2 = StateArena::new();
let mut table1 = SmallTable::new();
let mut table2 = SmallTable::new();
table1.epsilons.push(StateId::NONE);
table2.epsilons.push(StateId::NONE);
let mut new_arena = StateArena::new();
let mut memo: FxHashMap<(StateId, StateId), StateId> = FxHashMap::default();
let result = merge_arena_tables(
&arena1,
&table1,
&arena2,
&table2,
&mut new_arena,
&mut memo,
);
assert!(
result.epsilons.iter().all(|e| !e.is_none()),
"merged DFA table retained a NONE epsilon"
);
}
#[test]
fn test_merge_empty_arenas() {
let arena1 = StateArena::new();
let arena2 = StateArena::new();
let (merged, start) = merge_arena_dfas(&arena1, StateId::NONE, &arena2, StateId::NONE);
assert!(
start.is_none(),
"Merging empty arenas should return NONE start"
);
assert!(merged.is_empty(), "Merged arena should be empty");
}
#[test]
fn test_merge_one_empty_arena() {
let fm = Arc::new(FieldMatcher::new());
let (arena1, start1) = make_single_byte_arena(b'a', fm);
let (merged, start) = merge_arena_dfas(&arena1, start1, &StateArena::new(), StateId::NONE);
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&merged, start, b"a", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "Should match 'a'");
}
#[test]
fn test_merge_single_transition() {
let fm1 = Arc::new(FieldMatcher::new());
let fm2 = Arc::new(FieldMatcher::new());
let (arena1, start1) = make_single_byte_arena(b'a', fm1.clone());
let (arena2, start2) = make_single_byte_arena(b'b', fm2.clone());
let (merged, start) = merge_arena_dfas(&arena1, start1, &arena2, start2);
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&merged, start, b"a", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "Merged should match 'a'");
assert_eq!(bufs.transitions[0], Arc::as_ptr(&fm1) as usize);
bufs.clear();
traverse_arena_nfa(&merged, start, b"b", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "Merged should match 'b'");
assert_eq!(bufs.transitions[0], Arc::as_ptr(&fm2) as usize);
bufs.clear();
traverse_arena_nfa(&merged, start, b"c", &mut bufs);
assert!(bufs.transitions.is_empty(), "Merged should NOT match 'c'");
}
#[test]
fn test_merge_overlapping_transitions() {
let fm1 = Arc::new(FieldMatcher::new());
let fm2 = Arc::new(FieldMatcher::new());
let (arena1, start1) = make_single_byte_arena(b'a', fm1);
let (arena2, start2) = make_single_byte_arena(b'a', fm2);
let (merged, start) = merge_arena_dfas(&arena1, start1, &arena2, start2);
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&merged, start, b"a", &mut bufs);
assert_eq!(
bufs.transitions.len(),
2,
"Overlapping merge should have 2 transitions"
);
}
#[test]
fn test_merge_preserves_field_transitions() {
let fm1 = Arc::new(FieldMatcher::with_match_id(100));
let fm2 = Arc::new(FieldMatcher::with_match_id(200));
let (arena1, start1) = make_single_byte_arena(b'x', fm1);
let (arena2, start2) = make_single_byte_arena(b'y', fm2);
let (merged, start) = merge_arena_dfas(&arena1, start1, &arena2, start2);
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&merged, start, b"x", &mut bufs);
assert_eq!(bufs.transitions.len(), 1);
assert_eq!(
unsafe { &*(bufs.transitions[0] as *const FieldMatcher) }.match_id,
Some(100)
);
bufs.clear();
traverse_arena_nfa(&merged, start, b"y", &mut bufs);
assert_eq!(bufs.transitions.len(), 1);
assert_eq!(
unsafe { &*(bufs.transitions[0] as *const FieldMatcher) }.match_id,
Some(200)
);
}
#[test]
fn test_merge_multiple_arenas_associative() {
let fm_a = Arc::new(FieldMatcher::with_match_id(1));
let fm_b = Arc::new(FieldMatcher::with_match_id(2));
let fm_c = Arc::new(FieldMatcher::with_match_id(3));
let (arena_a, start_a) = make_single_byte_arena(b'a', fm_a);
let (arena_b, start_b) = make_single_byte_arena(b'b', fm_b);
let (arena_c, start_c) = make_single_byte_arena(b'c', fm_c);
let (ab, ab_start) = merge_arena_dfas(&arena_a, start_a, &arena_b, start_b);
let (abc_left, abc_left_start) = merge_arena_dfas(&ab, ab_start, &arena_c, start_c);
let (bc, bc_start) = merge_arena_dfas(&arena_b, start_b, &arena_c, start_c);
let (abc_right, abc_right_start) = merge_arena_dfas(&arena_a, start_a, &bc, bc_start);
let mut bufs1 = NfaBuffers::with_capacity();
let mut bufs2 = NfaBuffers::with_capacity();
for byte in [b'a', b'b', b'c'] {
bufs1.clear();
bufs2.clear();
traverse_arena_nfa(&abc_left, abc_left_start, &[byte], &mut bufs1);
traverse_arena_nfa(&abc_right, abc_right_start, &[byte], &mut bufs2);
assert_eq!(
bufs1.transitions.len(),
bufs2.transitions.len(),
"Associativity: both should have same number of transitions for '{}'",
byte as char
);
assert_eq!(
bufs1.transitions.len(),
1,
"Should match '{}'",
byte as char
);
}
}
#[test]
fn test_merge_multi_byte_sequences() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm1);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
let state_b =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"b", &[term]));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[state_b]));
(arena, start)
};
let (arena2, start2) = {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm2);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
let state_c =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"c", &[term]));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[state_c]));
(arena, start)
};
let (merged, start) = merge_arena_dfas(&arena1, start1, &arena2, start2);
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&merged, start, b"ab", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "Should match 'ab'");
assert_eq!(
unsafe { &*(bufs.transitions[0] as *const FieldMatcher) }.match_id,
Some(1)
);
bufs.clear();
traverse_arena_nfa(&merged, start, b"ac", &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "Should match 'ac'");
assert_eq!(
unsafe { &*(bufs.transitions[0] as *const FieldMatcher) }.match_id,
Some(2)
);
for val in [b"a".as_slice(), b"ad", b"bc"] {
bufs.clear();
traverse_arena_nfa(&merged, start, val, &mut bufs);
assert!(
bufs.transitions.is_empty(),
"Should NOT match {:?}",
std::str::from_utf8(val)
);
}
}
}
#[cfg(test)]
#[allow(unsafe_code)]
mod numeric_arena_tests {
use super::*;
use crate::numbits::q_num_from_f64;
fn matches_arena(arena: &StateArena, start: StateId, q_num: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, q_num, &mut bufs);
!bufs.transitions.is_empty()
}
#[allow(clippy::similar_names)]
#[test]
fn test_numeric_less_arena_fa_basic() {
let next_field = Arc::new(FieldMatcher::new());
let (arena, start) = make_numeric_less_arena_fa(100.0, true, next_field);
let q50 = q_num_from_f64(50.0);
let q100 = q_num_from_f64(100.0);
let q150 = q_num_from_f64(150.0);
let q0 = q_num_from_f64(0.0);
let q_neg = q_num_from_f64(-50.0);
assert!(matches_arena(&arena, start, &q50), "50 should match <= 100");
assert!(
matches_arena(&arena, start, &q100),
"100 should match <= 100 (inclusive)"
);
assert!(
!matches_arena(&arena, start, &q150),
"150 should NOT match <= 100"
);
assert!(matches_arena(&arena, start, &q0), "0 should match <= 100");
assert!(
matches_arena(&arena, start, &q_neg),
"-50 should match <= 100"
);
}
#[test]
fn test_numeric_less_arena_fa_exclusive() {
let next_field = Arc::new(FieldMatcher::new());
let (arena, start) = make_numeric_less_arena_fa(100.0, false, next_field);
let q99 = q_num_from_f64(99.0);
let q100 = q_num_from_f64(100.0);
assert!(matches_arena(&arena, start, &q99), "99 should match < 100");
assert!(
!matches_arena(&arena, start, &q100),
"100 should NOT match < 100 (exclusive)"
);
}
#[allow(clippy::similar_names)]
#[test]
fn test_numeric_greater_arena_fa_basic() {
let next_field = Arc::new(FieldMatcher::new());
let (arena, start) = make_numeric_greater_arena_fa(100.0, true, next_field);
let q50 = q_num_from_f64(50.0);
let q100 = q_num_from_f64(100.0);
let q150 = q_num_from_f64(150.0);
assert!(
!matches_arena(&arena, start, &q50),
"50 should NOT match >= 100"
);
assert!(
matches_arena(&arena, start, &q100),
"100 should match >= 100 (inclusive)"
);
assert!(
matches_arena(&arena, start, &q150),
"150 should match >= 100"
);
}
#[test]
fn test_numeric_greater_arena_fa_exclusive() {
let next_field = Arc::new(FieldMatcher::new());
let (arena, start) = make_numeric_greater_arena_fa(100.0, false, next_field);
let q100 = q_num_from_f64(100.0);
let q101 = q_num_from_f64(101.0);
assert!(
!matches_arena(&arena, start, &q100),
"100 should NOT match > 100 (exclusive)"
);
assert!(
matches_arena(&arena, start, &q101),
"101 should match > 100"
);
}
#[allow(clippy::similar_names)]
#[test]
fn test_numeric_range_arena_fa_two_sided() {
let next_field = Arc::new(FieldMatcher::new());
let (arena, start) = make_numeric_range_arena_fa(50.0, true, 150.0, true, next_field);
let q25 = q_num_from_f64(25.0);
let q50 = q_num_from_f64(50.0);
let q100 = q_num_from_f64(100.0);
let q150 = q_num_from_f64(150.0);
let q200 = q_num_from_f64(200.0);
assert!(
!matches_arena(&arena, start, &q25),
"25 should NOT match [50, 150]"
);
assert!(
matches_arena(&arena, start, &q50),
"50 should match [50, 150]"
);
assert!(
matches_arena(&arena, start, &q100),
"100 should match [50, 150]"
);
assert!(
matches_arena(&arena, start, &q150),
"150 should match [50, 150]"
);
assert!(
!matches_arena(&arena, start, &q200),
"200 should NOT match [50, 150]"
);
}
#[allow(clippy::similar_names)]
#[test]
fn test_numeric_range_arena_fa_exclusive_bounds() {
let next_field = Arc::new(FieldMatcher::new());
let (arena, start) = make_numeric_range_arena_fa(50.0, false, 150.0, false, next_field);
let q50 = q_num_from_f64(50.0);
let q51 = q_num_from_f64(51.0);
let q149 = q_num_from_f64(149.0);
let q150 = q_num_from_f64(150.0);
assert!(
!matches_arena(&arena, start, &q50),
"50 should NOT match (50, 150)"
);
assert!(
matches_arena(&arena, start, &q51),
"51 should match (50, 150)"
);
assert!(
matches_arena(&arena, start, &q149),
"149 should match (50, 150)"
);
assert!(
!matches_arena(&arena, start, &q150),
"150 should NOT match (50, 150)"
);
}
#[test]
fn test_numeric_range_arena_fa_single_point() {
let next_field = Arc::new(FieldMatcher::new());
let q5 = q_num_from_f64(5.0);
let (incl, incl_start) =
make_numeric_range_arena_fa(5.0, true, 5.0, true, next_field.clone());
assert!(
matches_arena(&incl, incl_start, &q5),
"5 should match [5, 5]"
);
let (half_open, half_open_start) =
make_numeric_range_arena_fa(5.0, true, 5.0, false, next_field);
assert!(
!matches_arena(&half_open, half_open_start, &q5),
"5 should NOT match [5, 5)"
);
}
#[allow(clippy::similar_names)]
#[test]
fn test_numeric_arena_fa_edge_cases() {
let next_field = Arc::new(FieldMatcher::new());
let (arena, start) = make_numeric_less_arena_fa(0.0, true, next_field.clone());
let q_neg = q_num_from_f64(-1.0);
let q0 = q_num_from_f64(0.0);
let q1 = q_num_from_f64(1.0);
assert!(matches_arena(&arena, start, &q_neg), "-1 should match <= 0");
assert!(matches_arena(&arena, start, &q0), "0 should match <= 0");
assert!(
!matches_arena(&arena, start, &q1),
"1 should NOT match <= 0"
);
let (arena2, start2) = make_numeric_greater_arena_fa(-100.0, true, next_field);
let q_neg50 = q_num_from_f64(-50.0);
let q_neg100 = q_num_from_f64(-100.0);
let q_neg150 = q_num_from_f64(-150.0);
assert!(
matches_arena(&arena2, start2, &q_neg50),
"-50 should match >= -100"
);
assert!(
matches_arena(&arena2, start2, &q_neg100),
"-100 should match >= -100"
);
assert!(
!matches_arena(&arena2, start2, &q_neg150),
"-150 should NOT match >= -100"
);
}
#[test]
fn test_numeric_arena_fa_float_values() {
let next_field = Arc::new(FieldMatcher::new());
let (arena, start) = make_numeric_range_arena_fa(1.5, true, 2.5, true, next_field);
let q1 = q_num_from_f64(1.0);
let q1_5 = q_num_from_f64(1.5);
let q2 = q_num_from_f64(2.0);
let q2_5 = q_num_from_f64(2.5);
let q3 = q_num_from_f64(3.0);
assert!(
!matches_arena(&arena, start, &q1),
"1.0 should NOT match [1.5, 2.5]"
);
assert!(
matches_arena(&arena, start, &q1_5),
"1.5 should match [1.5, 2.5]"
);
assert!(
matches_arena(&arena, start, &q2),
"2.0 should match [1.5, 2.5]"
);
assert!(
matches_arena(&arena, start, &q2_5),
"2.5 should match [1.5, 2.5]"
);
assert!(
!matches_arena(&arena, start, &q3),
"3.0 should NOT match [1.5, 2.5]"
);
}
#[test]
fn test_numeric_arena_fa_merge() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = make_numeric_less_arena_fa(50.0, false, fm1);
let (arena2, start2) = make_numeric_greater_arena_fa(100.0, false, fm2);
let (merged, merged_start) = merge_arena_dfas(&arena1, start1, &arena2, start2);
let q25 = q_num_from_f64(25.0);
let q75 = q_num_from_f64(75.0);
let q150 = q_num_from_f64(150.0);
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(&merged, merged_start, &q25, &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "25 should match merged FA");
assert_eq!(
unsafe { &*(bufs.transitions[0] as *const FieldMatcher) }.match_id,
Some(1)
);
bufs.clear();
traverse_arena_nfa(&merged, merged_start, &q75, &mut bufs);
assert!(bufs.transitions.is_empty(), "75 should NOT match merged FA");
bufs.clear();
traverse_arena_nfa(&merged, merged_start, &q150, &mut bufs);
assert_eq!(bufs.transitions.len(), 1, "150 should match merged FA");
assert_eq!(
unsafe { &*(bufs.transitions[0] as *const FieldMatcher) }.match_id,
Some(2)
);
}
#[test]
fn test_numeric_arena_fa_ordering_preserved() {
let next_field = Arc::new(FieldMatcher::new());
let test_values = vec![
-1000.0, -100.0, -10.0, -1.0, -0.5, 0.0, 0.5, 1.0, 10.0, 100.0, 1000.0,
];
for &bound in &test_values {
let (arena_less, start_less) =
make_numeric_less_arena_fa(bound, false, next_field.clone());
let (arena_greater, start_greater) =
make_numeric_greater_arena_fa(bound, false, next_field.clone());
for &val in &test_values {
let q_val = q_num_from_f64(val);
let matches_less = matches_arena(&arena_less, start_less, &q_val);
let matches_greater = matches_arena(&arena_greater, start_greater, &q_val);
if val < bound {
assert!(
matches_less,
"{val} should match < {bound} (Q-number ordering)"
);
assert!(
!matches_greater,
"{val} should NOT match > {bound} (Q-number ordering)"
);
} else if val > bound {
assert!(
!matches_less,
"{val} should NOT match < {bound} (Q-number ordering)"
);
assert!(
matches_greater,
"{val} should match > {bound} (Q-number ordering)"
);
} else {
assert!(
!matches_less,
"{val} should NOT match < {bound} (exclusive)"
);
assert!(
!matches_greater,
"{val} should NOT match > {bound} (exclusive)"
);
}
}
}
}
}
#[cfg(test)]
#[allow(unsafe_code)]
mod nfa_merge_tests {
use super::*;
fn matches_value(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
!bufs.transitions.is_empty()
}
fn get_match_ids(arena: &StateArena, start: StateId, value: &[u8]) -> Vec<u64> {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
bufs.transitions
.iter()
.filter_map(|&ptr| unsafe { &*(ptr as *const FieldMatcher) }.match_id)
.collect()
}
fn make_epsilon_alternation_arena(
patterns: &[&[u8]],
fm: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(fm);
let term_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let mut branches = Vec::new();
for pattern in patterns {
if pattern.is_empty() {
branches.push(term_state);
} else {
let mut current = term_state;
for &byte in pattern.iter().rev() {
let state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[byte],
&[current],
));
current = state;
}
branches.push(current);
}
}
let start = arena.alloc();
arena[start].table.epsilons = SmallVec::from_vec(branches);
arena.precompute_epsilon_closures();
(arena, start)
}
fn make_spinout_arena(
prefix: &[u8],
suffix: &[u8],
fm: Arc<FieldMatcher>,
) -> (StateArena, StateId) {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(fm);
let term_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let mut after_spinout = term_state;
for &byte in suffix.iter().rev() {
let state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[byte],
&[after_spinout],
));
after_spinout = state;
}
let spinout_state = arena.alloc();
arena[spinout_state].table = make_byte_dot_table(spinout_state);
arena[spinout_state].table.epsilons.push(after_spinout);
if !suffix.is_empty() {
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpack_arena_table(&arena[spinout_state].table, &mut unpacked);
unpacked[suffix[0] as usize] = after_spinout;
arena[spinout_state].table.pack(&unpacked);
}
let mut current = spinout_state;
let start = if prefix.is_empty() {
let start = arena.alloc();
arena[start].table.epsilons.push(spinout_state);
start
} else {
for &byte in prefix.iter().rev() {
let state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[byte],
&[current],
));
current = state;
}
current
};
arena.precompute_epsilon_closures();
(arena, start)
}
fn step_byte(arena: &StateArena, state: StateId, byte: u8) -> StateId {
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpack_arena_table(&arena[state].table, &mut unpacked);
unpacked[byte as usize]
}
#[test]
fn test_merge_alternation_with_literal_builds_splice_not_expansion() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let (arena1, start1) = {
let mut arena = StateArena::new();
let m = arena.alloc();
arena[m].field_transitions.push(fm1);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[m],
));
let b_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"b", &[term]));
let c_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"c", &[term]));
let alt = arena.alloc();
arena[alt].table.epsilons = smallvec![b_state, c_state];
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[alt]));
arena.precompute_epsilon_closures();
(arena, start)
};
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena2, start2) = {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm2);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
let d_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"d", &[term]));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[d_state]));
(arena, start)
};
let (merged, start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
let after_a = step_byte(&merged, start, b'a');
assert!(!after_a.is_none(), "'a' must reach a merged state");
assert!(
is_epsilon_only_state(&merged, after_a),
"expected an epsilon-only splice, got a state that carries its own \
byte transitions"
);
assert_eq!(get_match_ids(&merged, start, b"ab"), vec![1]);
assert_eq!(get_match_ids(&merged, start, b"ac"), vec![1]);
assert_eq!(get_match_ids(&merged, start, b"ad"), vec![2]);
}
#[test]
fn test_merge_spinout_with_literal_stays_a_spinner() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let (arena1, start1) = make_spinout_arena(b"a", b"", fm1);
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena2, start2) = {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm2);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
let d_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"d", &[term]));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[d_state]));
(arena, start)
};
let (merged, start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
let spinner = step_byte(&merged, start, b'a');
assert!(!spinner.is_none(), "'a' must reach the combined spinner");
assert!(
is_spinout_state(&merged, spinner),
"the combined post-'a' state must remain a self-looping spinner"
);
assert_eq!(
step_byte(&merged, spinner, b'z'),
spinner,
"a byte the literal does not cover must loop back to the same spinner"
);
assert!(
get_match_ids(&merged, start, b"a").contains(&1),
"'a' is a*"
);
assert!(
get_match_ids(&merged, start, b"azz").contains(&1),
"'azz' is a*"
);
let ids_ad = get_match_ids(&merged, start, b"ad");
assert!(
ids_ad.contains(&1) && ids_ad.contains(&2),
"'ad' matches both a* and the literal ad, got {ids_ad:?}"
);
}
#[test]
fn test_merge_arena_with_epsilons() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let (arena1, start1) = make_epsilon_alternation_arena(&[b"a", b"b"], fm1);
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena2, start2) = {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm2);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"c", &[term]));
(arena, start)
};
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
let ids_a = get_match_ids(&merged, merged_start, b"a");
assert!(ids_a.contains(&1), "Merged should match 'a' (id=1)");
let ids_b = get_match_ids(&merged, merged_start, b"b");
assert!(ids_b.contains(&1), "Merged should match 'b' (id=1)");
let ids_c = get_match_ids(&merged, merged_start, b"c");
assert!(ids_c.contains(&2), "Merged should match 'c' (id=2)");
assert!(
!matches_value(&merged, merged_start, b"d"),
"Merged should NOT match 'd'"
);
}
#[test]
fn test_merge_shared_prefix_spinout_and_literal() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let (arena1, start1) = make_spinout_arena(b"a", b"", fm1);
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena2, start2) = {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm2);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
let b_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"b", &[term]));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[b_state]));
(arena, start)
};
let (merged, start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
assert_eq!(
get_match_ids(&merged, start, b"a"),
vec![1],
"'a' is a* only"
);
let ids_ab = get_match_ids(&merged, start, b"ab");
assert!(
ids_ab.contains(&1) && ids_ab.contains(&2),
"'ab' matches both a* and the literal ab, got {ids_ab:?}"
);
assert_eq!(
get_match_ids(&merged, start, b"aXYZ"),
vec![1],
"'aXYZ' is a* only"
);
assert!(
!matches_value(&merged, start, b"b"),
"'b' needs a leading 'a'"
);
}
#[test]
fn test_merge_shared_prefix_epsilon_alternation_and_literal() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let (arena1, start1) = {
let mut arena = StateArena::new();
let m = arena.alloc();
arena[m].field_transitions.push(fm1);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[m],
));
let b_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"b", &[term]));
let c_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"c", &[term]));
let alt = arena.alloc();
arena[alt].table.epsilons = smallvec![b_state, c_state];
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[alt]));
arena.precompute_epsilon_closures();
(arena, start)
};
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena2, start2) = {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm2);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
let d_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"d", &[term]));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"a", &[d_state]));
(arena, start)
};
let (merged, start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
assert_eq!(
get_match_ids(&merged, start, b"ab"),
vec![1],
"'ab' is alternation only"
);
assert_eq!(
get_match_ids(&merged, start, b"ac"),
vec![1],
"'ac' is alternation only"
);
assert_eq!(
get_match_ids(&merged, start, b"ad"),
vec![2],
"'ad' is literal only"
);
assert!(
!matches_value(&merged, start, b"ae"),
"'ae' matches neither"
);
assert!(
!matches_value(&merged, start, b"b"),
"'b' has no leading 'a' — alternation must not fire"
);
assert!(
!matches_value(&merged, start, b"d"),
"'d' has no leading 'a' — literal must not fire"
);
assert!(
!matches_value(&merged, start, b"f"),
"'f' matches neither pattern"
);
}
#[test]
fn test_merge_suffixed_spinout_asymmetric_branches() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let make_chain = |bytes: &[u8], fm: Arc<FieldMatcher>| {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm);
let mut next = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
for &b in bytes.iter().rev() {
next =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, &[b], &[next]));
}
(arena, next)
};
let (wild_arena, wild_start) = make_spinout_arena(b"a", b"c", fm1);
let (disjoint_arena, disjoint_start) =
make_chain(b"ad", Arc::new(FieldMatcher::with_match_id(2)));
for (a1, s1, a2, s2) in [
(&wild_arena, wild_start, &disjoint_arena, disjoint_start),
(&disjoint_arena, disjoint_start, &wild_arena, wild_start),
] {
let (m, st) = merge_arena_nfas(a1, s1, a2, s2);
assert_eq!(
get_match_ids(&m, st, b"aXc"),
vec![1],
"a*c keeps its 'c' exit"
);
assert_eq!(
get_match_ids(&m, st, b"ad"),
vec![2],
"literal 'ad' still matches"
);
assert!(
!matches_value(&m, st, b"Xc"),
"a*c needs a leading 'a' — 'Xc' must not match"
);
assert!(
!matches_value(&m, st, b"d"),
"literal 'ad' needs a leading 'a' — 'd' must not match"
);
}
let (shared_arena, shared_start) =
make_chain(b"ac", Arc::new(FieldMatcher::with_match_id(2)));
for (a1, s1, a2, s2) in [
(&wild_arena, wild_start, &shared_arena, shared_start),
(&shared_arena, shared_start, &wild_arena, wild_start),
] {
let (m, st) = merge_arena_nfas(a1, s1, a2, s2);
let ids = get_match_ids(&m, st, b"ac");
assert!(
ids.contains(&1) && ids.contains(&2),
"'ac' matches a*c and literal ac, got {ids:?}"
);
assert!(
get_match_ids(&m, st, b"acYc").contains(&1),
"a*c still loops after its 'c' branch: 'acYc' matches"
);
assert!(
!matches_value(&m, st, b"Xc"),
"a*c needs a leading 'a' — 'Xc' must not match"
);
assert!(
!matches_value(&m, st, b"c"),
"literal 'ac' needs a leading 'a' — 'c' must not match"
);
}
}
#[test]
fn test_merge_arena_with_spinout() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let (arena1, start1) = make_spinout_arena(b"a", b"b", fm1);
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena2, start2) = make_spinout_arena(b"x", b"y", fm2);
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
let ids_ab = get_match_ids(&merged, merged_start, b"ab");
assert!(ids_ab.contains(&1), "Merged should match 'ab'");
let ids_axxxb = get_match_ids(&merged, merged_start, b"aXXXb");
assert!(ids_axxxb.contains(&1), "Merged should match 'aXXXb'");
let ids_xy = get_match_ids(&merged, merged_start, b"xy");
assert!(ids_xy.contains(&2), "Merged should match 'xy'");
let ids_xzzzy = get_match_ids(&merged, merged_start, b"xZZZy");
assert!(ids_xzzzy.contains(&2), "Merged should match 'xZZZy'");
assert!(
!matches_value(&merged, merged_start, b"ac"),
"Merged should NOT match 'ac'"
);
}
#[test]
fn test_merge_arena_shellstyle_patterns() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = make_spinout_arena(b"foo", b"", fm1);
let (arena2, start2) = make_spinout_arena(b"", b"bar", fm2);
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
let ids_foo = get_match_ids(&merged, merged_start, b"foo");
assert!(ids_foo.contains(&1), "Merged should match 'foo'");
let ids_fooxyz = get_match_ids(&merged, merged_start, b"fooXYZ");
assert!(ids_fooxyz.contains(&1), "Merged should match 'fooXYZ'");
let ids_bar = get_match_ids(&merged, merged_start, b"bar");
assert!(ids_bar.contains(&2), "Merged should match 'bar'");
let ids_xyzbar = get_match_ids(&merged, merged_start, b"XYZbar");
assert!(ids_xyzbar.contains(&2), "Merged should match 'XYZbar'");
let ids_foobar = get_match_ids(&merged, merged_start, b"foobar");
assert!(
ids_foobar.contains(&1) && ids_foobar.contains(&2),
"Merged should match 'foobar' with both patterns"
);
}
#[test]
fn test_merge_arena_preserves_cycles() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let (arena1, start1) = {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(fm1);
let term_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let loopback = arena.alloc();
let start = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
b"ab",
&[loopback, loopback],
));
arena[loopback].table.epsilons = smallvec![term_state, start];
(arena, start)
};
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena2, start2) = {
let mut arena = StateArena::new();
let end = arena.alloc();
arena[end].field_transitions.push(fm2);
let term = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[end],
));
let start =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"c", &[term]));
(arena, start)
};
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
assert!(
matches_value(&merged, merged_start, b"a"),
"Merged cycle should match 'a'"
);
assert!(
matches_value(&merged, merged_start, b"b"),
"Merged cycle should match 'b'"
);
assert!(
matches_value(&merged, merged_start, b"ab"),
"Merged cycle should match 'ab'"
);
assert!(
matches_value(&merged, merged_start, b"aba"),
"Merged cycle should match 'aba'"
);
assert!(
matches_value(&merged, merged_start, b"abba"),
"Merged cycle should match 'abba'"
);
assert!(
matches_value(&merged, merged_start, b"aaabbb"),
"Merged cycle should match 'aaabbb'"
);
let long_ab = "ab".repeat(50);
assert!(
matches_value(&merged, merged_start, long_ab.as_bytes()),
"Merged cycle should match long 'abab...' pattern"
);
assert!(
matches_value(&merged, merged_start, b"c"),
"Merged should match 'c'"
);
assert!(
!matches_value(&merged, merged_start, b"d"),
"Merged should NOT match 'd'"
);
assert!(
!matches_value(&merged, merged_start, b""),
"[ab]+ should NOT match empty string"
);
}
#[test]
fn test_merge_arena_both_have_spinouts() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let (arena1, start1) = {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(fm1);
let term_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let spinout2 = arena.alloc();
arena[spinout2].table = make_byte_dot_table(spinout2);
arena[spinout2].table.epsilons.push(term_state);
let x_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"X", &[spinout2]));
let spinout1 = arena.alloc();
arena[spinout1].table = make_byte_dot_table(spinout1);
arena[spinout1].table.epsilons.push(x_state);
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpack_arena_table(&arena[spinout1].table, &mut unpacked);
unpacked[b'X' as usize] = spinout2;
arena[spinout1].table.pack(&unpacked);
let start = arena.alloc();
arena[start].table.epsilons.push(spinout1);
(arena, start)
};
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena2, start2) = {
let mut arena = StateArena::new();
let match_state = arena.alloc();
arena[match_state].field_transitions.push(fm2);
let term_state = arena.alloc_with_table(SmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let spinout2 = arena.alloc();
arena[spinout2].table = make_byte_dot_table(spinout2);
arena[spinout2].table.epsilons.push(term_state);
let y_state =
arena.alloc_with_table(SmallTable::with_mappings(StateId::NONE, b"Y", &[spinout2]));
let spinout1 = arena.alloc();
arena[spinout1].table = make_byte_dot_table(spinout1);
arena[spinout1].table.epsilons.push(y_state);
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpack_arena_table(&arena[spinout1].table, &mut unpacked);
unpacked[b'Y' as usize] = spinout2;
arena[spinout1].table.pack(&unpacked);
let start = arena.alloc();
arena[start].table.epsilons.push(spinout1);
(arena, start)
};
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
assert!(
get_match_ids(&merged, merged_start, b"X").contains(&1),
"Should match 'X'"
);
assert!(
get_match_ids(&merged, merged_start, b"aXb").contains(&1),
"Should match 'aXb'"
);
assert!(
get_match_ids(&merged, merged_start, b"aaaXbbb").contains(&1),
"Should match 'aaaXbbb'"
);
assert!(
get_match_ids(&merged, merged_start, b"Y").contains(&2),
"Should match 'Y'"
);
assert!(
get_match_ids(&merged, merged_start, b"aYb").contains(&2),
"Should match 'aYb'"
);
let ids_xy = get_match_ids(&merged, merged_start, b"XY");
assert!(
ids_xy.contains(&1) && ids_xy.contains(&2),
"Should match 'XY' with both patterns"
);
let ids_yax = get_match_ids(&merged, merged_start, b"YaX");
assert!(
ids_yax.contains(&1) && ids_yax.contains(&2),
"Should match 'YaX' with both patterns"
);
assert!(
!matches_value(&merged, merged_start, b"abc"),
"Should NOT match 'abc'"
);
}
#[test]
fn test_merge_arena_nfas_empty_cases() {
let fm = Arc::new(FieldMatcher::new());
let (arena1, start1) = make_epsilon_alternation_arena(&[b"a"], fm);
let (merged, merged_start) =
merge_arena_nfas(&arena1, start1, &StateArena::new(), StateId::NONE);
assert!(
matches_value(&merged, merged_start, b"a"),
"Merging with empty should preserve original"
);
let (merged2, merged_start2) =
merge_arena_nfas(&StateArena::new(), StateId::NONE, &arena1, start1);
assert!(
matches_value(&merged2, merged_start2, b"a"),
"Merging empty with non-empty should preserve original"
);
let (_merged3, merged_start3) = merge_arena_nfas(
&StateArena::new(),
StateId::NONE,
&StateArena::new(),
StateId::NONE,
);
assert!(
merged_start3.is_none(),
"Merging two empty arenas should return NONE"
);
}
#[test]
fn test_flatten_epsilon_targets_on_repeated_merge() {
let fm = Arc::new(FieldMatcher::new());
let (a1, s1) = make_epsilon_alternation_arena(&[b"a"], fm.clone());
let (a2, s2) = make_epsilon_alternation_arena(&[b"b"], fm.clone());
let (a3, s3) = make_epsilon_alternation_arena(&[b"c"], fm.clone());
let (a4, s4) = make_epsilon_alternation_arena(&[b"d"], fm);
let (m12, s12) = merge_arena_nfas(&a1, s1, &a2, s2);
let (m123, s123) = merge_arena_nfas(&m12, s12, &a3, s3);
let (m1234, s1234) = merge_arena_nfas(&m123, s123, &a4, s4);
assert!(matches_value(&m1234, s1234, b"a"), "should match 'a'");
assert!(matches_value(&m1234, s1234, b"b"), "should match 'b'");
assert!(matches_value(&m1234, s1234, b"c"), "should match 'c'");
assert!(matches_value(&m1234, s1234, b"d"), "should match 'd'");
assert!(!matches_value(&m1234, s1234, b"e"), "should not match 'e'");
let start_state = &m1234[s1234];
let eps_count = start_state.table.epsilons.len();
assert!(
eps_count > 2,
"Flattening should produce > 2 direct epsilon targets, got {eps_count}"
);
}
}
#[cfg(test)]
mod string_arena_tests {
use super::*;
fn matches_value(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
!bufs.transitions.is_empty()
}
#[test]
fn test_string_arena_fa_basic() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_string_arena_fa(b"hello", fm);
assert!(
matches_value(&arena, start, b"hello"),
"Should match 'hello'"
);
assert!(
!matches_value(&arena, start, b"hell"),
"Should NOT match 'hell' (prefix)"
);
assert!(
!matches_value(&arena, start, b"helloworld"),
"Should NOT match 'helloworld' (longer)"
);
assert!(
!matches_value(&arena, start, b"world"),
"Should NOT match 'world'"
);
assert!(
!matches_value(&arena, start, b""),
"Should NOT match empty string"
);
}
#[test]
fn test_string_arena_fa_empty_string() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_string_arena_fa(b"", fm);
assert!(
matches_value(&arena, start, b""),
"Should match empty string"
);
assert!(!matches_value(&arena, start, b"a"), "Should NOT match 'a'");
}
#[test]
fn test_string_arena_fa_single_char() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_string_arena_fa(b"x", fm);
assert!(matches_value(&arena, start, b"x"), "Should match 'x'");
assert!(!matches_value(&arena, start, b"y"), "Should NOT match 'y'");
assert!(
!matches_value(&arena, start, b"xy"),
"Should NOT match 'xy'"
);
}
#[test]
fn test_string_arena_fa_utf8() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_string_arena_fa("café".as_bytes(), fm);
assert!(
matches_value(&arena, start, "café".as_bytes()),
"Should match 'café'"
);
assert!(
!matches_value(&arena, start, b"cafe"),
"Should NOT match 'cafe'"
);
}
#[test]
fn test_string_arena_fa_merge() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = make_string_arena_fa(b"foo", fm1);
let (arena2, start2) = make_string_arena_fa(b"bar", fm2);
let (merged, merged_start) = merge_arena_dfas(&arena1, start1, &arena2, start2);
assert!(
matches_value(&merged, merged_start, b"foo"),
"Merged should match 'foo'"
);
assert!(
matches_value(&merged, merged_start, b"bar"),
"Merged should match 'bar'"
);
assert!(
!matches_value(&merged, merged_start, b"baz"),
"Merged should NOT match 'baz'"
);
}
#[test]
fn test_string_arena_fa_merge_common_prefix() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = make_string_arena_fa(b"prefix_one", fm1);
let (arena2, start2) = make_string_arena_fa(b"prefix_two", fm2);
let (merged, merged_start) = merge_arena_dfas(&arena1, start1, &arena2, start2);
assert!(
matches_value(&merged, merged_start, b"prefix_one"),
"Merged should match 'prefix_one'"
);
assert!(
matches_value(&merged, merged_start, b"prefix_two"),
"Merged should match 'prefix_two'"
);
assert!(
!matches_value(&merged, merged_start, b"prefix"),
"Merged should NOT match 'prefix'"
);
assert!(
!matches_value(&merged, merged_start, b"prefix_"),
"Merged should NOT match 'prefix_'"
);
}
}
#[cfg(test)]
mod prefix_arena_tests {
use super::*;
fn matches_value(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
!bufs.transitions.is_empty()
}
#[test]
fn test_prefix_arena_fa_basic() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_prefix_arena_fa(b"hello", fm);
assert!(
matches_value(&arena, start, b"hello"),
"Should match 'hello'"
);
assert!(
matches_value(&arena, start, b"helloworld"),
"Should match 'helloworld'"
);
assert!(
matches_value(&arena, start, b"hello_test"),
"Should match 'hello_test'"
);
assert!(
matches_value(&arena, start, b"hello123"),
"Should match 'hello123'"
);
assert!(
!matches_value(&arena, start, b"hell"),
"Should NOT match 'hell' (shorter than prefix)"
);
assert!(
!matches_value(&arena, start, b"world"),
"Should NOT match 'world'"
);
assert!(
!matches_value(&arena, start, b""),
"Should NOT match empty string"
);
}
#[test]
fn test_prefix_arena_fa_empty_prefix() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_prefix_arena_fa(b"", fm);
assert!(
matches_value(&arena, start, b""),
"Should match empty string"
);
assert!(
matches_value(&arena, start, b"anything"),
"Should match 'anything'"
);
assert!(
matches_value(&arena, start, b"hello world"),
"Should match 'hello world'"
);
}
#[test]
fn test_prefix_arena_fa_single_char() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_prefix_arena_fa(b"a", fm);
assert!(matches_value(&arena, start, b"a"), "Should match 'a'");
assert!(matches_value(&arena, start, b"abc"), "Should match 'abc'");
assert!(!matches_value(&arena, start, b"b"), "Should NOT match 'b'");
assert!(
!matches_value(&arena, start, b""),
"Should NOT match empty string"
);
}
#[test]
fn test_prefix_arena_fa_utf8() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_prefix_arena_fa(b"caf", fm);
assert!(
matches_value(&arena, start, "café".as_bytes()),
"Should match 'café'"
);
assert!(
matches_value(&arena, start, b"cafeteria"),
"Should match 'cafeteria'"
);
assert!(
!matches_value(&arena, start, b"ca"),
"Should NOT match 'ca'"
);
}
#[test]
fn test_prefix_arena_fa_merge() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = make_prefix_arena_fa(b"foo", fm1);
let (arena2, start2) = make_prefix_arena_fa(b"bar", fm2);
let (merged, merged_start) = merge_arena_dfas(&arena1, start1, &arena2, start2);
assert!(
matches_value(&merged, merged_start, b"foo"),
"Merged should match 'foo'"
);
assert!(
matches_value(&merged, merged_start, b"foobar"),
"Merged should match 'foobar'"
);
assert!(
matches_value(&merged, merged_start, b"bar"),
"Merged should match 'bar'"
);
assert!(
matches_value(&merged, merged_start, b"barfoo"),
"Merged should match 'barfoo'"
);
assert!(
!matches_value(&merged, merged_start, b"baz"),
"Merged should NOT match 'baz'"
);
}
}
#[cfg(test)]
mod shellstyle_arena_tests {
use super::*;
fn matches_value(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
!bufs.transitions.is_empty()
}
#[test]
fn test_shellstyle_arena_fa_prefix_wildcard() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_shellstyle_arena_fa(b"foo*", fm);
assert!(matches_value(&arena, start, b"foo"), "Should match 'foo'");
assert!(
matches_value(&arena, start, b"foobar"),
"Should match 'foobar'"
);
assert!(
matches_value(&arena, start, b"foo123"),
"Should match 'foo123'"
);
assert!(
!matches_value(&arena, start, b"fo"),
"Should NOT match 'fo'"
);
assert!(
!matches_value(&arena, start, b"bar"),
"Should NOT match 'bar'"
);
}
#[test]
fn test_shellstyle_arena_fa_suffix_wildcard() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_shellstyle_arena_fa(b"*bar", fm);
assert!(matches_value(&arena, start, b"bar"), "Should match 'bar'");
assert!(
matches_value(&arena, start, b"foobar"),
"Should match 'foobar'"
);
assert!(
matches_value(&arena, start, b"123bar"),
"Should match '123bar'"
);
assert!(
!matches_value(&arena, start, b"ba"),
"Should NOT match 'ba'"
);
assert!(
!matches_value(&arena, start, b"baz"),
"Should NOT match 'baz'"
);
}
#[test]
fn test_shellstyle_arena_fa_infix_wildcard() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_shellstyle_arena_fa(b"foo*bar", fm);
assert!(
matches_value(&arena, start, b"foobar"),
"Should match 'foobar'"
);
assert!(
matches_value(&arena, start, b"foo_bar"),
"Should match 'foo_bar'"
);
assert!(
matches_value(&arena, start, b"foo123bar"),
"Should match 'foo123bar'"
);
assert!(
!matches_value(&arena, start, b"foo"),
"Should NOT match 'foo'"
);
assert!(
!matches_value(&arena, start, b"bar"),
"Should NOT match 'bar'"
);
assert!(
!matches_value(&arena, start, b"foobaz"),
"Should NOT match 'foobaz'"
);
}
#[test]
fn test_shellstyle_arena_fa_no_wildcard() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_shellstyle_arena_fa(b"hello", fm);
assert!(
matches_value(&arena, start, b"hello"),
"Should match 'hello'"
);
assert!(
!matches_value(&arena, start, b"helloworld"),
"Should NOT match 'helloworld'"
);
assert!(
!matches_value(&arena, start, b"hell"),
"Should NOT match 'hell'"
);
}
#[test]
fn test_shellstyle_arena_fa_only_wildcard() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_shellstyle_arena_fa(b"*", fm);
assert!(matches_value(&arena, start, b""), "Should match empty");
assert!(
matches_value(&arena, start, b"anything"),
"Should match 'anything'"
);
assert!(
matches_value(&arena, start, b"foo bar baz"),
"Should match 'foo bar baz'"
);
}
#[test]
fn test_shellstyle_arena_fa_double_wildcard() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_shellstyle_arena_fa(b"*foo*", fm);
assert!(matches_value(&arena, start, b"foo"), "Should match 'foo'");
assert!(
matches_value(&arena, start, b"foobar"),
"Should match 'foobar'"
);
assert!(
matches_value(&arena, start, b"barfoo"),
"Should match 'barfoo'"
);
assert!(
matches_value(&arena, start, b"barfoobaz"),
"Should match 'barfoobaz'"
);
assert!(
matches_value(&arena, start, b"foofoofoo"),
"Should match 'foofoofoo'"
);
assert!(
!matches_value(&arena, start, b"bar"),
"Should NOT match 'bar'"
);
assert!(
!matches_value(&arena, start, b"fo"),
"Should NOT match 'fo'"
);
assert!(
!matches_value(&arena, start, b"ffo"),
"Should NOT match 'ffo'"
);
}
#[test]
fn test_shellstyle_arena_fa_foo_bar_multi_star() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_shellstyle_arena_fa(b"*foo*bar*", fm);
assert!(matches_value(&arena, start, b"foobar"));
assert!(matches_value(&arena, start, b"xfooybar"));
assert!(matches_value(&arena, start, b"foobarbaz"));
assert!(matches_value(&arena, start, b"xxfooxxbarxx"));
assert!(!matches_value(&arena, start, b"barfoo"));
assert!(!matches_value(&arena, start, b"foo"));
assert!(!matches_value(&arena, start, b"bar"));
assert!(!matches_value(&arena, start, b"fobar"));
}
#[test]
fn test_shellstyle_arena_fa_five_star() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_shellstyle_arena_fa(b"*a*b*c*d*e*", fm);
assert!(matches_value(&arena, start, b"abcde"));
assert!(matches_value(&arena, start, b"xaxbxcxdxex"));
assert!(matches_value(&arena, start, b"aabbccddee"));
assert!(!matches_value(&arena, start, b"abcd"));
assert!(!matches_value(&arena, start, b"edcba"));
assert!(!matches_value(&arena, start, b"abce"));
}
#[test]
fn test_shellstyle_arena_fa_eight_star() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_shellstyle_arena_fa(b"*a*b*c*d*e*f*g*h*", fm);
assert!(matches_value(&arena, start, b"abcdefgh"));
assert!(matches_value(&arena, start, b"xaxbxcxdxexfxgxhx"));
assert!(!matches_value(&arena, start, b"abcdefg"));
assert!(!matches_value(&arena, start, b"hgfedcba"));
}
#[test]
fn test_shellstyle_arena_fa_merge() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = make_shellstyle_arena_fa(b"foo*", fm1);
let (arena2, start2) = make_shellstyle_arena_fa(b"*bar", fm2);
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
assert!(
matches_value(&merged, merged_start, b"foo"),
"Merged should match 'foo'"
);
assert!(
matches_value(&merged, merged_start, b"bar"),
"Merged should match 'bar'"
);
assert!(
matches_value(&merged, merged_start, b"foobar"),
"Merged should match 'foobar'"
);
assert!(
!matches_value(&merged, merged_start, b"baz"),
"Merged should NOT match 'baz'"
);
}
}
#[cfg(test)]
mod wildcard_arena_tests {
use super::*;
fn matches_value(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
!bufs.transitions.is_empty()
}
#[test]
fn test_wildcard_arena_fa_basic() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_wildcard_arena_fa(b"foo*bar", fm);
assert!(
matches_value(&arena, start, b"foobar"),
"Should match 'foobar'"
);
assert!(
matches_value(&arena, start, b"foo123bar"),
"Should match 'foo123bar'"
);
assert!(
!matches_value(&arena, start, b"foo"),
"Should NOT match 'foo'"
);
}
#[test]
fn test_wildcard_arena_fa_escape_star() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_wildcard_arena_fa(b"foo\\*bar", fm);
assert!(
matches_value(&arena, start, b"foo*bar"),
"Should match 'foo*bar'"
);
assert!(
!matches_value(&arena, start, b"foobar"),
"Should NOT match 'foobar'"
);
assert!(
!matches_value(&arena, start, b"foo123bar"),
"Should NOT match 'foo123bar'"
);
}
#[test]
fn test_wildcard_arena_fa_escape_backslash() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_wildcard_arena_fa(b"foo\\\\bar", fm);
assert!(
matches_value(&arena, start, b"foo\\bar"),
"Should match 'foo\\bar'"
);
assert!(
!matches_value(&arena, start, b"foobar"),
"Should NOT match 'foobar'"
);
}
#[test]
fn test_wildcard_arena_fa_escape_with_wildcard() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_wildcard_arena_fa(b"foo\\\\*bar", fm);
assert!(
matches_value(&arena, start, b"foo\\bar"),
"Should match 'foo\\bar'"
);
assert!(
matches_value(&arena, start, b"foo\\123bar"),
"Should match 'foo\\123bar'"
);
assert!(
!matches_value(&arena, start, b"foobar"),
"Should NOT match 'foobar'"
);
}
#[test]
fn test_wildcard_arena_fa_star_at_end_with_escape() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_wildcard_arena_fa(b"foo\\**", fm);
assert!(matches_value(&arena, start, b"foo*"), "Should match 'foo*'");
assert!(
matches_value(&arena, start, b"foo*bar"),
"Should match 'foo*bar'"
);
assert!(
!matches_value(&arena, start, b"foo"),
"Should NOT match 'foo'"
);
assert!(
!matches_value(&arena, start, b"foobar"),
"Should NOT match 'foobar'"
);
}
#[test]
fn test_wildcard_arena_fa_no_escape() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_wildcard_arena_fa(b"hello", fm);
assert!(
matches_value(&arena, start, b"hello"),
"Should match 'hello'"
);
assert!(
!matches_value(&arena, start, b"helloworld"),
"Should NOT match 'helloworld'"
);
}
#[test]
fn test_wildcard_arena_fa_trailing_backslash() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_wildcard_arena_fa(b"foo\\", fm);
assert!(
matches_value(&arena, start, b"foo\\"),
"Should match 'foo\\'"
);
assert!(
!matches_value(&arena, start, b"foo"),
"Should NOT match 'foo'"
);
}
#[test]
fn test_wildcard_arena_fa_merge() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = make_wildcard_arena_fa(b"foo\\*", fm1);
let (arena2, start2) = make_wildcard_arena_fa(b"bar*", fm2);
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
assert!(
matches_value(&merged, merged_start, b"foo*"),
"Merged should match 'foo*'"
);
assert!(
matches_value(&merged, merged_start, b"bar"),
"Merged should match 'bar'"
);
assert!(
!matches_value(&merged, merged_start, b"baz"),
"Merged should NOT match 'baz'"
);
}
}
#[cfg(test)]
mod anything_but_arena_tests {
use super::*;
fn matches_value(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
!bufs.transitions.is_empty()
}
#[test]
fn test_anything_but_arena_fa_single_value() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let excluded = vec![b"foo".to_vec()];
let (arena, start) = make_anything_but_arena_fa(&excluded, fm);
assert!(
!matches_value(&arena, start, b"foo"),
"Should NOT match excluded 'foo'"
);
assert!(matches_value(&arena, start, b"bar"), "Should match 'bar'");
assert!(
matches_value(&arena, start, b"foobar"),
"Should match 'foobar' (longer than excluded)"
);
assert!(matches_value(&arena, start, b"fo"), "Should match 'fo'");
assert!(matches_value(&arena, start, b""), "Should match empty");
}
#[test]
fn test_anything_but_arena_fa_multiple_values() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let excluded = vec![b"foo".to_vec(), b"bar".to_vec()];
let (arena, start) = make_anything_but_arena_fa(&excluded, fm);
assert!(
!matches_value(&arena, start, b"foo"),
"Should NOT match excluded 'foo'"
);
assert!(
!matches_value(&arena, start, b"bar"),
"Should NOT match excluded 'bar'"
);
assert!(matches_value(&arena, start, b"baz"), "Should match 'baz'");
assert!(
matches_value(&arena, start, b"foobar"),
"Should match 'foobar'"
);
}
#[test]
fn test_anything_but_arena_fa_common_prefix() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let excluded = vec![b"foo".to_vec(), b"foobar".to_vec()];
let (arena, start) = make_anything_but_arena_fa(&excluded, fm);
assert!(
!matches_value(&arena, start, b"foo"),
"Should NOT match excluded 'foo'"
);
assert!(
!matches_value(&arena, start, b"foobar"),
"Should NOT match excluded 'foobar'"
);
assert!(matches_value(&arena, start, b"fo"), "Should match 'fo'");
assert!(matches_value(&arena, start, b"foob"), "Should match 'foob'");
assert!(
matches_value(&arena, start, b"foobarbaz"),
"Should match 'foobarbaz'"
);
}
#[test]
fn test_anything_but_arena_fa_empty_excluded() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let excluded: Vec<Vec<u8>> = vec![];
let (arena, start) = make_anything_but_arena_fa(&excluded, fm);
assert!(
matches_value(&arena, start, b"anything"),
"Should match 'anything'"
);
assert!(matches_value(&arena, start, b""), "Should match empty");
}
#[test]
fn test_anything_but_arena_fa_empty_value() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let excluded = vec![b"".to_vec()];
let (arena, start) = make_anything_but_arena_fa(&excluded, fm);
assert!(matches_value(&arena, start, b"foo"), "Should match 'foo'");
assert!(matches_value(&arena, start, b""), "Should match empty");
}
#[test]
fn test_anything_but_arena_fa_merge() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = make_anything_but_arena_fa(&[b"foo".to_vec()], fm1);
let (arena2, start2) = make_string_arena_fa(b"bar", fm2);
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
assert!(
!matches_value(&merged, merged_start, b"foo"),
"Merged should NOT match 'foo'"
);
assert!(
matches_value(&merged, merged_start, b"bar"),
"Merged should match 'bar'"
);
assert!(
matches_value(&merged, merged_start, b"baz"),
"Merged should match 'baz'"
);
}
}
#[cfg(test)]
mod monocase_arena_tests {
use super::*;
fn matches_value(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
!bufs.transitions.is_empty()
}
#[test]
fn test_monocase_arena_fa_single_char() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_monocase_arena_fa(b"A", fm);
assert!(matches_value(&arena, start, b"A"), "Should match 'A'");
assert!(matches_value(&arena, start, b"a"), "Should match 'a'");
assert!(!matches_value(&arena, start, b"B"), "Should NOT match 'B'");
}
#[test]
fn test_monocase_arena_fa_two_chars() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_monocase_arena_fa(b"Ab", fm);
assert!(matches_value(&arena, start, b"Ab"), "Should match 'Ab'");
assert!(matches_value(&arena, start, b"ab"), "Should match 'ab'");
assert!(matches_value(&arena, start, b"AB"), "Should match 'AB'");
assert!(matches_value(&arena, start, b"aB"), "Should match 'aB'");
assert!(
!matches_value(&arena, start, b"Ac"),
"Should NOT match 'Ac'"
);
}
#[test]
fn test_monocase_arena_fa_three_chars() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_monocase_arena_fa(b"cat", fm);
assert!(matches_value(&arena, start, b"cat"), "Should match 'cat'");
assert!(matches_value(&arena, start, b"CAT"), "Should match 'CAT'");
assert!(matches_value(&arena, start, b"Cat"), "Should match 'Cat'");
}
#[test]
fn test_monocase_arena_fa_basic_ascii() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_monocase_arena_fa(b"Hello", fm);
assert!(
matches_value(&arena, start, b"Hello"),
"Should match 'Hello'"
);
assert!(
matches_value(&arena, start, b"hello"),
"Should match 'hello'"
);
assert!(
matches_value(&arena, start, b"HELLO"),
"Should match 'HELLO'"
);
assert!(
matches_value(&arena, start, b"hElLo"),
"Should match 'hElLo'"
);
assert!(
!matches_value(&arena, start, b"world"),
"Should NOT match 'world'"
);
assert!(
!matches_value(&arena, start, b"hell"),
"Should NOT match 'hell'"
);
}
#[test]
fn test_monocase_arena_fa_empty() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_monocase_arena_fa(b"", fm);
assert!(matches_value(&arena, start, b""), "Should match empty");
assert!(!matches_value(&arena, start, b"a"), "Should NOT match 'a'");
}
#[test]
fn test_monocase_arena_fa_no_case_chars() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_monocase_arena_fa(b"123", fm);
assert!(matches_value(&arena, start, b"123"), "Should match '123'");
assert!(
!matches_value(&arena, start, b"456"),
"Should NOT match '456'"
);
}
#[test]
fn test_monocase_arena_fa_mixed_ascii() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_monocase_arena_fa(b"Abc123", fm);
assert!(
matches_value(&arena, start, b"Abc123"),
"Should match 'Abc123'"
);
assert!(
matches_value(&arena, start, b"abc123"),
"Should match 'abc123'"
);
assert!(
matches_value(&arena, start, b"ABC123"),
"Should match 'ABC123'"
);
assert!(
!matches_value(&arena, start, b"Abc124"),
"Should NOT match 'Abc124'"
);
}
#[test]
fn test_monocase_arena_fa_merge() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let (arena1, start1) = make_monocase_arena_fa(b"Foo", fm1);
let (arena2, start2) = make_monocase_arena_fa(b"Bar", fm2);
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
assert!(
matches_value(&merged, merged_start, b"foo"),
"Merged should match 'foo'"
);
assert!(
matches_value(&merged, merged_start, b"FOO"),
"Merged should match 'FOO'"
);
assert!(
matches_value(&merged, merged_start, b"bar"),
"Merged should match 'bar'"
);
assert!(
matches_value(&merged, merged_start, b"BAR"),
"Merged should match 'BAR'"
);
assert!(
!matches_value(&merged, merged_start, b"baz"),
"Merged should NOT match 'baz'"
);
}
#[test]
fn test_monocase_arena_fa_greek_sigma() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let pattern = "Σοφα".as_bytes();
let (arena, start) = make_monocase_arena_fa(pattern, fm);
assert!(
matches_value(&arena, start, "Σοφα".as_bytes()),
"Original Greek should match"
);
assert!(
matches_value(&arena, start, "σοφα".as_bytes()),
"Lowercase sigma at start should match"
);
assert!(
matches_value(&arena, start, "ΣΟΦΑ".as_bytes()),
"All uppercase should match"
);
assert!(
matches_value(&arena, start, "σΟΦΑ".as_bytes()),
"Mixed case should match"
);
}
#[test]
fn test_monocase_arena_fa_shared_lead_byte() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let (arena, start) = make_monocase_arena_fa("Àb".as_bytes(), fm);
assert!(
matches_value(&arena, start, "Àb".as_bytes()),
"Should match original 'Àb'"
);
assert!(
matches_value(&arena, start, "àB".as_bytes()),
"Should match folded 'àB'"
);
assert!(
matches_value(&arena, start, "àb".as_bytes()),
"Should match folded 'àb'"
);
assert!(
!matches_value(&arena, start, "Áb".as_bytes()),
"Should NOT match 'Áb' (U+00C1 shares the C3 lead byte only)"
);
assert!(
!matches_value(&arena, start, "Àc".as_bytes()),
"Should NOT match 'Àc'"
);
}
}
#[cfg(test)]
mod cidr_arena_tests {
use super::*;
use crate::json::CidrPattern;
fn matches_value(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
!bufs.transitions.is_empty()
}
#[test]
fn test_cidr_arena_fa_ipv4_exact() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let cidr = CidrPattern::V4 {
network: [192, 168, 1, 1],
prefix_len: 32,
};
let (arena, start) = make_cidr_arena_fa(&cidr, fm);
assert!(
matches_value(&arena, start, b"\"192.168.1.1\""),
"Should match exact IP"
);
assert!(
!matches_value(&arena, start, b"\"192.168.1.2\""),
"Should NOT match different IP"
);
}
#[test]
#[cfg_attr(miri, ignore)]
fn test_cidr_arena_fa_ipv4_24() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let cidr = CidrPattern::V4 {
network: [10, 0, 0, 0],
prefix_len: 24,
};
let (arena, start) = make_cidr_arena_fa(&cidr, fm);
assert!(
matches_value(&arena, start, b"\"10.0.0.0\""),
"Should match 10.0.0.0"
);
assert!(
matches_value(&arena, start, b"\"10.0.0.1\""),
"Should match 10.0.0.1"
);
assert!(
matches_value(&arena, start, b"\"10.0.0.255\""),
"Should match 10.0.0.255"
);
assert!(
!matches_value(&arena, start, b"\"10.0.1.0\""),
"Should NOT match 10.0.1.0"
);
assert!(
!matches_value(&arena, start, b"\"192.168.1.1\""),
"Should NOT match 192.168.1.1"
);
}
#[test]
fn test_cidr_arena_fa_ipv4_range() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let cidr = CidrPattern::V4 {
network: [172, 16, 0, 0],
prefix_len: 30,
};
let (arena, start) = make_cidr_arena_fa(&cidr, fm);
assert!(
matches_value(&arena, start, b"\"172.16.0.0\""),
"Should match 172.16.0.0"
);
assert!(
matches_value(&arena, start, b"\"172.16.0.1\""),
"Should match 172.16.0.1"
);
assert!(
matches_value(&arena, start, b"\"172.16.0.2\""),
"Should match 172.16.0.2"
);
assert!(
matches_value(&arena, start, b"\"172.16.0.3\""),
"Should match 172.16.0.3"
);
assert!(
!matches_value(&arena, start, b"\"172.16.0.4\""),
"Should NOT match 172.16.0.4"
);
}
#[test]
fn test_cidr_arena_fa_merge() {
let fm1 = Arc::new(FieldMatcher::with_match_id(1));
let fm2 = Arc::new(FieldMatcher::with_match_id(2));
let cidr1 = CidrPattern::V4 {
network: [10, 0, 0, 0],
prefix_len: 32,
};
let cidr2 = CidrPattern::V4 {
network: [192, 168, 0, 0],
prefix_len: 32,
};
let (arena1, start1) = make_cidr_arena_fa(&cidr1, fm1);
let (arena2, start2) = make_cidr_arena_fa(&cidr2, fm2);
let (merged, merged_start) = merge_arena_nfas(&arena1, start1, &arena2, start2);
assert!(
matches_value(&merged, merged_start, b"\"10.0.0.0\""),
"Merged should match 10.0.0.0"
);
assert!(
matches_value(&merged, merged_start, b"\"192.168.0.0\""),
"Merged should match 192.168.0.0"
);
assert!(
!matches_value(&merged, merged_start, b"\"172.16.0.0\""),
"Merged should NOT match 172.16.0.0"
);
}
#[test]
fn test_cidr_arena_fa_ipv6_basic() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let cidr = CidrPattern::V6 {
network: [0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
prefix_len: 32,
};
let (arena, start) = make_cidr_arena_fa(&cidr, fm);
assert!(
matches_value(&arena, start, b"\"2001:db8:0:0:0:0:0:1\""),
"Should match 2001:db8:0:0:0:0:0:1"
);
assert!(
matches_value(&arena, start, b"\"2001:db8:ffff:ffff:ffff:ffff:ffff:ffff\""),
"Should match end of range"
);
assert!(
!matches_value(&arena, start, b"\"2001:db9:0:0:0:0:0:1\""),
"Should NOT match 2001:db9:..."
);
}
#[test]
fn test_cidr_arena_fa_ipv6_partial_group_masks_host_bits() {
let fm = Arc::new(FieldMatcher::with_match_id(1));
let cidr = CidrPattern::V6 {
network: [
0x20, 0x01, 0x0d, 0xb8, 0xab, 0xcd, 0xef, 0x08, 0, 0, 0, 0, 0, 0, 0, 0,
],
prefix_len: 60,
};
let (arena, start) = make_cidr_arena_fa(&cidr, fm);
assert!(
matches_value(&arena, start, b"\"2001:db8:abcd:ef00:0:0:0:0\""),
"0xef00 is the masked network base and must match"
);
assert!(
!matches_value(&arena, start, b"\"2001:db8:abcd:ef10:0:0:0:0\""),
"0xef10 is the next /60 block and must not match"
);
}
#[test]
fn test_build_any_hex_group_requires_at_least_one_digit() {
let mut arena = StateArena::new();
let fm = Arc::new(FieldMatcher::with_match_id(1));
let match_state = arena.alloc();
arena[match_state].field_transitions.push(fm);
let start = build_any_hex_group_arena(match_state, &mut arena);
arena.precompute_epsilon_closures();
assert!(
matches_value(&arena, start, b"1"),
"one hex digit must match"
);
assert!(
!matches_value(&arena, start, b""),
"empty group must not match"
);
}
}
#[cfg(kani)]
mod kani_arena_proofs {
use super::*;
#[kani::proof]
#[kani::unwind(4)]
fn arena_dstep_symbolic_lookup() {
let c0: u8 = kani::any();
let c1: u8 = kani::any();
let c2: u8 = kani::any();
kani::assume(c0 > 0);
kani::assume(c1 > c0);
kani::assume(c2 > c1);
kani::assume((c2 as usize) <= BYTE_CEILING);
let s0 = StateId::from_index(kani::any::<u8>() as usize);
let s1 = StateId::from_index(kani::any::<u8>() as usize);
let s2 = StateId::from_index(kani::any::<u8>() as usize);
let mut table = SmallTable::new();
table.ceilings = smallvec![c0, c1, c2];
table.steps = smallvec![s0, s1, s2];
let byte: u8 = kani::any();
kani::assume((byte as usize) < BYTE_CEILING);
let result = table.dstep(byte);
if byte < c0 {
kani::assert(result == s0, "byte in first range must return s0");
} else if byte < c1 {
kani::assert(result == s1, "byte in second range must return s1");
} else if byte < c2 {
kani::assert(result == s2, "byte in third range must return s2");
} else {
kani::assert(
result == StateId::NONE,
"byte past last ceiling must return NONE",
);
}
}
}
#[cfg(test)]
mod dfa_test_helpers {
use super::*;
use crate::regexp::{make_regexp_nfa_arena, parse_regexp};
pub(super) fn build_regexp_nfa(pattern: &str) -> (StateArena, StateId) {
let root = parse_regexp(pattern).expect("valid regexp");
let (mut arena, start, _fm) = make_regexp_nfa_arena(root);
arena.precompute_epsilon_closures();
(arena, start)
}
pub(super) fn nfa_matches(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut bufs = NfaBuffers::with_capacity();
traverse_arena_nfa(arena, start, value, &mut bufs);
!bufs.transitions.is_empty()
}
pub(super) fn dfa_matches(arena: &StateArena, start: StateId, value: &[u8]) -> bool {
let mut transitions = Vec::new();
traverse_arena_dfa(arena, start, value, &mut transitions);
!transitions.is_empty()
}
pub(super) fn lazy_dfa_matches(lazy_dfa: &mut LazyDfa, value: &[u8]) -> bool {
let mut transitions = Vec::new();
traverse_lazy_dfa(lazy_dfa, value, &mut transitions);
!transitions.is_empty()
}
}
#[cfg(test)]
mod nfa_to_dfa_tests {
use super::dfa_test_helpers::*;
use super::*;
fn assert_nfa_dfa_equivalence(
nfa: &StateArena,
nfa_start: StateId,
dfa: &StateArena,
dfa_start: StateId,
test_values: &[(&[u8], bool)],
) {
for &(val, expected) in test_values {
let label = String::from_utf8_lossy(val);
assert_eq!(
nfa_matches(nfa, nfa_start, val),
expected,
"NFA mismatch on {label:?}",
);
assert_eq!(
dfa_matches(dfa, dfa_start, val),
expected,
"DFA mismatch on {label:?}",
);
}
}
#[test]
fn test_nfa_to_dfa_simple_plus() {
let (nfa, nfa_start) = build_regexp_nfa("[abc]+");
assert!(nfa.is_nondeterministic(), "should be NFA");
let (dfa, dfa_start) = nfa.nfa_to_dfa(nfa_start, 1000).expect("should convert");
assert!(!dfa.is_nondeterministic(), "should be DFA");
assert_nfa_dfa_equivalence(
&nfa,
nfa_start,
&dfa,
dfa_start,
&[
(b"\"a\"", true),
(b"\"abc\"", true),
(b"\"aaa\"", true),
(b"\"d\"", false),
(b"\"\"", false),
],
);
}
#[test]
fn test_nfa_to_dfa_star() {
let (nfa, nfa_start) = build_regexp_nfa("[xyz]*end");
assert!(nfa.is_nondeterministic());
let (dfa, dfa_start) = nfa.nfa_to_dfa(nfa_start, 1000).expect("should convert");
assert!(!dfa.is_nondeterministic());
assert_nfa_dfa_equivalence(
&nfa,
nfa_start,
&dfa,
dfa_start,
&[
(b"\"end\"", true),
(b"\"xend\"", true),
(b"\"xyzend\"", true),
(b"\"xyxyend\"", true),
(b"\"en\"", false),
(b"\"aend\"", false),
],
);
}
#[test]
fn test_nfa_to_dfa_nested_quantifiers() {
let (nfa, nfa_start) = build_regexp_nfa("(([abc]?)*)+");
assert!(nfa.is_nondeterministic());
let (dfa, dfa_start) = nfa.nfa_to_dfa(nfa_start, 1000).expect("should convert");
assert!(!dfa.is_nondeterministic());
assert_nfa_dfa_equivalence(
&nfa,
nfa_start,
&dfa,
dfa_start,
&[
(b"\"\"", true),
(b"\"a\"", true),
(b"\"abc\"", true),
(b"\"aabbcc\"", true),
(b"\"d\"", false),
],
);
}
#[test]
fn test_nfa_to_dfa_budget_exceeded() {
let (nfa, nfa_start) = build_regexp_nfa("(([abc]?)*)+");
assert!(nfa.is_nondeterministic());
let result = nfa.nfa_to_dfa(nfa_start, 2);
assert!(result.is_none(), "should exceed budget");
}
#[test]
fn test_nfa_to_dfa_empty_arena() {
let arena = StateArena::new();
let result = arena.nfa_to_dfa(StateId::NONE, 1000);
assert!(result.is_some());
let (dfa, start) = result.unwrap();
assert!(start.is_none());
assert!(dfa.is_empty());
}
#[test]
fn test_nfa_to_dfa_none_start_nonempty_arena() {
let (nfa, _nfa_start) = build_regexp_nfa("[abc]+");
assert!(!nfa.is_empty());
let result = nfa.nfa_to_dfa(StateId::NONE, 1000);
assert!(result.is_some());
let (dfa, start) = result.unwrap();
assert!(start.is_none());
assert!(dfa.is_empty());
}
#[test]
fn test_nfa_to_dfa_alternation() {
let (nfa, nfa_start) = build_regexp_nfa("[a]+d|[b]+d");
if !nfa.is_nondeterministic() {
return; }
let (dfa, dfa_start) = nfa.nfa_to_dfa(nfa_start, 1000).expect("should convert");
assert!(!dfa.is_nondeterministic());
assert_nfa_dfa_equivalence(
&nfa,
nfa_start,
&dfa,
dfa_start,
&[
(b"\"ad\"", true),
(b"\"aad\"", true),
(b"\"bd\"", true),
(b"\"bbd\"", true),
(b"\"cd\"", false),
(b"\"ab\"", false),
],
);
}
#[test]
fn test_nfa_to_dfa_preserves_field_transitions() {
let (nfa, nfa_start) = build_regexp_nfa("[abc]+");
let (dfa, dfa_start) = nfa.nfa_to_dfa(nfa_start, 1000).expect("should convert");
let has_ft =
(0..dfa.len()).any(|i| !dfa[StateId::from_index(i)].field_transitions.is_empty());
assert!(has_ft, "DFA should have field transitions");
let mut transitions = Vec::new();
traverse_arena_dfa(&dfa, dfa_start, b"\"a\"", &mut transitions);
assert!(!transitions.is_empty(), "should find field transitions");
}
}
#[cfg(all(test, miri))]
mod miri_dfa_traversal_tests {
use super::dfa_test_helpers::*;
use super::*;
#[test]
fn test_traverse_dfa_flat_match_and_no_match() {
let (nfa, nfa_start) = build_regexp_nfa("[ab]+");
let (mut dfa, dfa_start) = nfa.nfa_to_dfa(nfa_start, 1000).expect("should convert");
dfa.flatten_tables();
assert!(dfa_matches(&dfa, dfa_start, b"\"a\""));
assert!(dfa_matches(&dfa, dfa_start, b"\"ab\""));
assert!(!dfa_matches(&dfa, dfa_start, b"\"c\""));
assert!(!dfa_matches(&dfa, dfa_start, b"\"\""));
}
#[test]
fn test_traverse_dfa_flat_alternation() {
let (nfa, nfa_start) = build_regexp_nfa("cat|dog");
let (mut dfa, dfa_start) = nfa.nfa_to_dfa(nfa_start, 1000).expect("should convert");
dfa.flatten_tables();
assert!(dfa_matches(&dfa, dfa_start, b"\"cat\""));
assert!(dfa_matches(&dfa, dfa_start, b"\"dog\""));
assert!(!dfa_matches(&dfa, dfa_start, b"\"car\""));
}
}
#[cfg(test)]
mod lazy_dfa_tests {
use super::dfa_test_helpers::*;
use super::*;
#[test]
fn test_lazy_dfa_basic() {
let (nfa, nfa_start) = build_regexp_nfa("[abc]+");
let mut lazy = LazyDfa::new(nfa, nfa_start, 100);
assert!(lazy_dfa_matches(&mut lazy, b"\"a\""));
assert!(lazy_dfa_matches(&mut lazy, b"\"abc\""));
assert!(!lazy_dfa_matches(&mut lazy, b"\"d\""));
assert!(!lazy_dfa_matches(&mut lazy, b"\"\""));
}
#[test]
fn test_lazy_dfa_matches_nfa() {
let (nfa, nfa_start) = build_regexp_nfa("[xyz]*end");
let test_values: &[&[u8]] = &[
b"\"end\"",
b"\"xend\"",
b"\"xyzend\"",
b"\"xyxyend\"",
b"\"en\"",
b"\"aend\"",
b"\"\"",
];
let mut lazy = LazyDfa::new(nfa.clone(), nfa_start, 100);
for &val in test_values {
assert_eq!(
nfa_matches(&nfa, nfa_start, val),
lazy_dfa_matches(&mut lazy, val),
"NFA/lazy-DFA disagree on {:?}",
String::from_utf8_lossy(val),
);
}
}
#[test]
fn test_lazy_dfa_cache_reuse() {
let (nfa, nfa_start) = build_regexp_nfa("[abc]+");
let mut lazy = LazyDfa::new(nfa, nfa_start, 100);
assert!(lazy_dfa_matches(&mut lazy, b"\"abc\""));
let states_after_first = lazy.states.len();
assert!(lazy_dfa_matches(&mut lazy, b"\"abc\""));
assert_eq!(
lazy.states.len(),
states_after_first,
"should reuse cached states"
);
}
#[test]
fn test_lazy_dfa_budget_limits_cached_states() {
let (nfa, nfa_start) = build_regexp_nfa("[abc]+");
let mut lazy = LazyDfa::new(nfa, nfa_start, 3);
assert!(lazy_dfa_matches(&mut lazy, b"\"abc\""));
assert!(lazy.cached_count <= 3, "should respect budget");
assert!(
lazy.cached_count > 0,
"should have cached at least one state"
);
}
#[test]
#[cfg_attr(miri, ignore)]
fn test_lazy_dfa_nested_quantifiers() {
let (nfa, nfa_start) = build_regexp_nfa("(([abc]?)*)+");
let test_values: &[&[u8]] = &[b"\"\"", b"\"a\"", b"\"abc\"", b"\"aabbcc\"", b"\"d\""];
let mut lazy = LazyDfa::new(nfa.clone(), nfa_start, 1000);
for &val in test_values {
assert_eq!(
nfa_matches(&nfa, nfa_start, val),
lazy_dfa_matches(&mut lazy, val),
"NFA/lazy-DFA disagree on {:?}",
String::from_utf8_lossy(val),
);
}
}
}
#[cfg(test)]
mod dfa_accel_tests {
use super::dfa_test_helpers::build_regexp_nfa;
use super::*;
#[test]
#[cfg_attr(miri, ignore)]
fn test_lazy_dfa_accel_negated_char_class() {
let (nfa, nfa_start) = build_regexp_nfa("a[^x]+x");
assert!(
nfa.nfa_to_dfa(nfa_start, 1000).is_none(),
"expected budget exceeded for [^x]+ Unicode NFA"
);
let mut lazy = LazyDfa::new(nfa, nfa_start, 10_000);
let long_val: Vec<u8> = std::iter::once(b'"')
.chain(std::iter::once(b'a'))
.chain(std::iter::repeat_n(b'a', 1000))
.chain(std::iter::once(b'x'))
.chain(std::iter::once(b'"'))
.collect();
let mut transitions = Vec::new();
traverse_lazy_dfa(&mut lazy, &long_val, &mut transitions);
assert!(
!transitions.is_empty(),
"should match: a followed by 1000 a's then x"
);
let accel_count = lazy.states.iter().filter(|s| s.accel.is_some()).count();
assert!(
accel_count > 0,
"lazy DFA should detect AccelInfo on self-loop state"
);
}
#[test]
#[cfg_attr(miri, ignore)]
fn test_lazy_dfa_accel_skip_zero() {
let (nfa, nfa_start) = build_regexp_nfa("a[^x]+x");
let mut lazy = LazyDfa::new(nfa, nfa_start, 10_000);
let mut transitions = Vec::new();
traverse_lazy_dfa(&mut lazy, b"\"aax\"", &mut transitions);
assert!(!transitions.is_empty(), "should match 'aax'");
transitions.clear();
traverse_lazy_dfa(&mut lazy, b"\"ax\"", &mut transitions);
assert!(
transitions.is_empty(),
"'ax' should NOT match a[^x]+x — no non-x chars between a and x"
);
}
#[test]
#[cfg_attr(miri, ignore)]
fn test_lazy_dfa_accel_multi_exit_bytes() {
let (nfa, nfa_start) = build_regexp_nfa("a[^xy]+y");
let mut lazy = LazyDfa::new(nfa, nfa_start, 10_000);
let long_val: Vec<u8> = std::iter::once(b'"')
.chain(std::iter::once(b'a'))
.chain(std::iter::repeat_n(b'a', 500))
.chain(std::iter::once(b'y'))
.chain(std::iter::once(b'"'))
.collect();
let mut transitions = Vec::new();
traverse_lazy_dfa(&mut lazy, &long_val, &mut transitions);
assert!(!transitions.is_empty(), "should match 500 a's then y");
let accel_states: Vec<_> = lazy
.states
.iter()
.filter_map(|s| s.accel.as_ref())
.collect();
assert!(!accel_states.is_empty(), "should have accel on self-loop");
for accel in &accel_states {
assert_eq!(accel.len, 2, "expected 2 exit bytes for [^xy]+");
}
}
#[test]
#[cfg_attr(miri, ignore)]
fn test_eager_dfa_accel_simple_pattern() {
let (nfa, nfa_start) = build_regexp_nfa("[^x]+");
let (dfa, _dfa_start) = nfa.nfa_to_dfa(nfa_start, 100_000).expect("should convert");
let accel_count = dfa
.states
.iter()
.filter(|s| s.table.accel.is_some())
.count();
assert!(
accel_count > 0,
"loop state of [^x]+ DFA should be accelerated"
);
for state in &dfa.states {
if let Some(ref accel) = state.table.accel {
assert_eq!(accel.len, 2, "expected exactly 2 exit bytes");
let mut actual = [accel.exit_bytes[0], accel.exit_bytes[1]];
actual.sort_unstable();
assert_eq!(actual, [b'"', b'x']);
}
}
}
#[test]
#[cfg_attr(miri, ignore)]
fn test_eager_dfa_traversal_with_accel() {
let (nfa, nfa_start) = build_regexp_nfa("[^x]+");
if let Some((mut dfa, dfa_start)) = nfa.nfa_to_dfa(nfa_start, 100_000) {
dfa.flatten_tables();
let long_val: Vec<u8> = std::iter::once(b'"')
.chain(std::iter::repeat_n(b'a', 1000))
.chain(std::iter::once(b'"'))
.collect();
let mut transitions = Vec::new();
traverse_arena_dfa(&dfa, dfa_start, &long_val, &mut transitions);
assert!(
!transitions.is_empty(),
"DFA should match 1000 a's (no x = full match)"
);
}
}
#[test]
fn test_accel_traversal_miri_friendly() {
use super::super::small_table::{AccelInfo, FieldMatcher, VALUE_TERMINATOR};
use std::sync::Arc;
let mut arena = StateArena::new();
let loop_state = arena.alloc();
let exit_state = arena.alloc();
let mut unpacked = [loop_state; BYTE_CEILING];
unpacked[b'z' as usize] = exit_state;
unpacked[VALUE_TERMINATOR as usize] = StateId::NONE;
arena[loop_state].table.pack(&unpacked);
let fm = Arc::new(FieldMatcher::with_match_id(42));
arena[exit_state].field_transitions.push(fm);
arena[loop_state].table.accel = Some(AccelInfo {
exit_bytes: [b'z', 0, 0],
len: 1,
});
arena.flatten_tables();
let val = b"\"aaaz\"";
let mut transitions = Vec::new();
traverse_arena_dfa(&arena, loop_state, val, &mut transitions);
assert!(
!transitions.is_empty(),
"accel traversal should find field transition on exit byte"
);
let val_no_exit = b"\"aaaa\"";
transitions.clear();
traverse_arena_dfa(&arena, loop_state, val_no_exit, &mut transitions);
assert!(
transitions.is_empty(),
"no exit byte means no field transition"
);
}
#[test]
#[cfg_attr(miri, ignore)]
fn test_lazy_dfa_accel_from_nfa_spinout_states() {
let (nfa, nfa_start) = build_regexp_nfa("a[^x]+x");
let nfa_accel_count = nfa
.states
.iter()
.filter(|s| s.table.accel.is_some())
.count();
assert!(
nfa_accel_count > 0,
"NFA builder should set AccelInfo on [^x]+ spinout states; got 0"
);
let mut lazy = LazyDfa::new(nfa, nfa_start, 10_000);
let long_val: Vec<u8> = std::iter::once(b'"')
.chain(std::iter::once(b'a'))
.chain(std::iter::repeat_n(b'a', 100))
.chain(std::iter::once(b'x'))
.chain(std::iter::once(b'"'))
.collect();
let mut transitions = Vec::new();
traverse_lazy_dfa(&mut lazy, &long_val, &mut transitions);
assert!(!transitions.is_empty(), "should match correctly");
let accel_count = lazy.states.iter().filter(|s| s.accel.is_some()).count();
assert!(
accel_count > 0,
"lazy DFA should acquire AccelInfo from NFA spinout states"
);
let (nfa2, nfa_start2) = build_regexp_nfa("[abc]+");
let nfa2_accel = nfa2
.states
.iter()
.filter(|s| s.table.accel.is_some())
.count();
assert_eq!(
nfa2_accel, 0,
"non-negated pattern NFA states should have no AccelInfo"
);
let mut lazy2 = LazyDfa::new(nfa2, nfa_start2, 100);
let mut t2 = Vec::new();
traverse_lazy_dfa(&mut lazy2, b"\"abc\"", &mut t2);
assert!(!t2.is_empty(), "should match");
let accel_count2 = lazy2.states.iter().filter(|s| s.accel.is_some()).count();
assert_eq!(
accel_count2, 0,
"non-negated [abc]+: no NFA AccelInfo → no lazy DFA accel"
);
}
}