use std::collections::HashMap;
use std::error::Error;
use std::fmt;
use crate::hash::fnv1a::{fnv1a64_initial_state, fnv1a64_update_byte};
use crate::matching::dfa_compile::CompiledDfa;
pub const OP_ID: &str = "vyre-primitives::matching::nfa_to_dfa";
const LANES: usize = 32;
const STATE_BITSET_WORDS: usize = LANES;
type StateSet = [u32; STATE_BITSET_WORDS];
const EMPTY_SET: StateSet = [0u32; STATE_BITSET_WORDS];
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum NfaToDfaError {
StateExplosion {
produced: usize,
cap: usize,
},
ShapeMismatch {
reason: &'static str,
},
}
impl fmt::Display for NfaToDfaError {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::StateExplosion { produced, cap } => write!(
formatter,
"NFA→DFA subset construction exceeded the {cap}-state cap after producing {produced} DFA states. Fix: raise the cap, shard the pattern set, or dispatch via the NFA scan kernel."
),
Self::ShapeMismatch { reason } => {
write!(formatter, "NFA bit-table shape mismatch: {reason}.")
}
}
}
}
impl Error for NfaToDfaError {}
#[derive(Debug, Clone)]
pub struct NfaTables<'tables> {
pub num_states: u32,
pub transition_table: &'tables [u32],
pub epsilon_table: &'tables [u32],
pub accept_state_ids: &'tables [u32],
pub accept_pattern_ids: &'tables [u32],
pub max_pattern_len: u32,
}
pub fn nfa_to_dfa(
tables: &NfaTables<'_>,
max_dfa_states: usize,
) -> Result<CompiledDfa, NfaToDfaError> {
let n = tables.num_states as usize;
if n > LANES * 32 {
return Err(NfaToDfaError::ShapeMismatch {
reason: "num_states exceeds LANES * 32 bit-set capacity",
});
}
if tables.transition_table.len() != n * 256 * LANES {
return Err(NfaToDfaError::ShapeMismatch {
reason: "transition_table length != num_states * 256 * LANES",
});
}
if tables.epsilon_table.len() != n * LANES {
return Err(NfaToDfaError::ShapeMismatch {
reason: "epsilon_table length != num_states * LANES",
});
}
if tables.accept_state_ids.len() != tables.accept_pattern_ids.len() {
return Err(NfaToDfaError::ShapeMismatch {
reason: "accept_state_ids and accept_pattern_ids length disagree",
});
}
let epsilon_closures = build_epsilon_closures(n, tables.epsilon_table);
let mut entry_set = EMPTY_SET;
set_bit(&mut entry_set, 0);
let start_set = closure_of_set(&entry_set, &epsilon_closures);
let mut dfa_state_index: HashMap<StateSet, u32> = HashMap::new();
let mut dfa_state_sets: Vec<StateSet> = Vec::new();
let mut transitions: Vec<u32> = Vec::new();
dfa_state_index.insert(start_set, 0);
dfa_state_sets.push(start_set);
transitions.extend(std::iter::repeat_n(0u32, 256));
let mut next_to_process: usize = 0;
while next_to_process < dfa_state_sets.len() {
let dfa_state_id = next_to_process;
let current_set = dfa_state_sets[dfa_state_id];
next_to_process += 1;
for byte in 0u32..256 {
let mut target_set = EMPTY_SET;
for_each_set_bit(¤t_set, |src_state| {
let row_start = (src_state as usize) * 256 * LANES + (byte as usize) * LANES;
for lane in 0..LANES {
target_set[lane] |= tables.transition_table[row_start + lane];
}
});
let closed = closure_of_set(&target_set, &epsilon_closures);
let next_dfa_state = if closed == EMPTY_SET {
ensure_dead_state(
&mut dfa_state_index,
&mut dfa_state_sets,
&mut transitions,
max_dfa_states,
)?
} else if let Some(&existing) = dfa_state_index.get(&closed) {
existing
} else {
if dfa_state_sets.len() >= max_dfa_states {
return Err(NfaToDfaError::StateExplosion {
produced: dfa_state_sets.len(),
cap: max_dfa_states,
});
}
let new_id = dfa_state_sets.len() as u32;
dfa_state_index.insert(closed, new_id);
dfa_state_sets.push(closed);
transitions.extend(std::iter::repeat_n(0u32, 256));
new_id
};
transitions[(dfa_state_id) * 256 + byte as usize] = next_dfa_state;
}
}
let state_count = dfa_state_sets.len() as u32;
let mut accept: Vec<u32> = vec![0; state_count as usize];
let mut output_offsets: Vec<u32> = Vec::with_capacity(state_count as usize + 1);
let mut output_records: Vec<u32> = Vec::new();
output_offsets.push(0);
for dfa_state_id in 0..state_count {
let set = &dfa_state_sets[dfa_state_id as usize];
let mut first_accept_pid: Option<u32> = None;
for (i, &nfa_state) in tables.accept_state_ids.iter().enumerate() {
if test_bit(set, nfa_state) {
let pid = tables.accept_pattern_ids[i];
if first_accept_pid.is_none() {
first_accept_pid = Some(pid);
}
output_records.push(pid);
}
}
accept[dfa_state_id as usize] = first_accept_pid.map(|pid| pid + 1).unwrap_or(0);
output_offsets.push(output_records.len() as u32);
}
Ok(CompiledDfa {
transitions,
accept,
state_count,
max_pattern_len: tables.max_pattern_len,
output_offsets,
output_records,
})
}
#[must_use]
pub fn dfa_fingerprint(dfa: &CompiledDfa) -> u64 {
let mut hash = fnv1a64_initial_state();
hash_u32(&mut hash, dfa.state_count);
hash_u32(&mut hash, dfa.max_pattern_len);
hash_u32_slice(&mut hash, &dfa.transitions);
hash_u32_slice(&mut hash, &dfa.accept);
hash_u32_slice(&mut hash, &dfa.output_offsets);
hash_u32_slice(&mut hash, &dfa.output_records);
hash
}
#[must_use]
pub fn dfa_wire_bytes(dfa: &CompiledDfa) -> usize {
std::mem::size_of::<u32>()
* (2 + dfa.transitions.len()
+ dfa.accept.len()
+ dfa.output_offsets.len()
+ dfa.output_records.len())
}
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct DfaDedupResult {
pub fingerprint: u64,
pub canonical_index: usize,
pub inserted: bool,
}
#[derive(Debug, Clone, Copy, Default, Eq, PartialEq)]
pub struct DfaDedupStats {
pub input_count: usize,
pub inserted_count: usize,
pub duplicate_count: usize,
pub table_len_after: usize,
pub input_wire_bytes: usize,
pub inserted_wire_bytes: usize,
pub saved_wire_bytes: usize,
}
#[derive(Debug, Clone, Default, Eq, PartialEq)]
pub struct DfaDedupBatch {
pub results: Vec<DfaDedupResult>,
pub stats: DfaDedupStats,
}
impl DfaDedupBatch {
#[must_use]
pub fn saved_wire_ppm(&self) -> u32 {
saved_wire_ppm(self.stats.saved_wire_bytes, self.stats.input_wire_bytes)
}
}
#[derive(Debug, Default, Clone)]
pub struct DfaDedupTable {
buckets: HashMap<u64, Vec<usize>>,
entries: Vec<CompiledDfa>,
}
impl DfaDedupTable {
#[must_use]
pub fn len(&self) -> usize {
self.entries.len()
}
#[must_use]
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
#[must_use]
pub fn get(&self, index: usize) -> Option<&CompiledDfa> {
self.entries.get(index)
}
#[must_use]
pub fn canonical_wire_bytes(&self) -> usize {
self.entries
.iter()
.map(dfa_wire_bytes)
.fold(0usize, usize::saturating_add)
}
pub fn insert(&mut self, dfa: CompiledDfa) -> DfaDedupResult {
let fingerprint = dfa_fingerprint(&dfa);
if let Some(bucket) = self.buckets.get(&fingerprint) {
for &candidate in bucket {
if self
.entries
.get(candidate)
.map(|existing| dfa_content_eq(existing, &dfa))
.unwrap_or(false)
{
return DfaDedupResult {
fingerprint,
canonical_index: candidate,
inserted: false,
};
}
}
}
let canonical_index = self.entries.len();
self.entries.push(dfa);
self.buckets
.entry(fingerprint)
.or_default()
.push(canonical_index);
DfaDedupResult {
fingerprint,
canonical_index,
inserted: true,
}
}
pub fn insert_many<I>(&mut self, dfas: I) -> DfaDedupBatch
where
I: IntoIterator<Item = CompiledDfa>,
{
let mut results = Vec::new();
let mut inserted_count = 0usize;
let mut duplicate_count = 0usize;
let mut input_wire_bytes = 0usize;
let mut inserted_wire_bytes = 0usize;
let mut saved_wire_bytes = 0usize;
for dfa in dfas {
let wire_bytes = dfa_wire_bytes(&dfa);
input_wire_bytes = input_wire_bytes.saturating_add(wire_bytes);
let result = self.insert(dfa);
if result.inserted {
inserted_count += 1;
inserted_wire_bytes = inserted_wire_bytes.saturating_add(wire_bytes);
} else {
duplicate_count += 1;
saved_wire_bytes = saved_wire_bytes.saturating_add(wire_bytes);
}
results.push(result);
}
DfaDedupBatch {
stats: DfaDedupStats {
input_count: results.len(),
inserted_count,
duplicate_count,
table_len_after: self.len(),
input_wire_bytes,
inserted_wire_bytes,
saved_wire_bytes,
},
results,
}
}
pub fn merge_from(&mut self, other: &DfaDedupTable) -> DfaDedupBatch {
self.insert_many(other.entries.iter().cloned())
}
}
fn saved_wire_ppm(saved_wire_bytes: usize, input_wire_bytes: usize) -> u32 {
if input_wire_bytes == 0 {
return 0;
}
let ppm = (saved_wire_bytes as u128).saturating_mul(1_000_000) / (input_wire_bytes as u128);
u32::try_from(ppm).unwrap_or(u32::MAX)
}
fn dfa_content_eq(left: &CompiledDfa, right: &CompiledDfa) -> bool {
left.state_count == right.state_count
&& left.max_pattern_len == right.max_pattern_len
&& left.transitions == right.transitions
&& left.accept == right.accept
&& left.output_offsets == right.output_offsets
&& left.output_records == right.output_records
}
fn hash_u32_slice(hash: &mut u64, values: &[u32]) {
hash_u64(hash, values.len() as u64);
for &value in values {
hash_u32(hash, value);
}
}
fn hash_u32(hash: &mut u64, value: u32) {
for byte in value.to_le_bytes() {
*hash = fnv1a64_update_byte(*hash, byte);
}
}
fn hash_u64(hash: &mut u64, value: u64) {
for byte in value.to_le_bytes() {
*hash = fnv1a64_update_byte(*hash, byte);
}
}
#[inline]
fn set_bit(set: &mut StateSet, state: u32) {
let lane = (state / 32) as usize;
let bit = state % 32;
set[lane] |= 1u32 << bit;
}
#[inline]
fn test_bit(set: &StateSet, state: u32) -> bool {
let lane = (state / 32) as usize;
let bit = state % 32;
(set[lane] & (1u32 << bit)) != 0
}
fn for_each_set_bit(set: &StateSet, mut f: impl FnMut(u32)) {
for (lane, &word) in set.iter().enumerate() {
let mut w = word;
while w != 0 {
let bit = w.trailing_zeros();
f((lane as u32) * 32 + bit);
w &= w - 1;
}
}
}
fn build_epsilon_closures(num_states: usize, epsilon_table: &[u32]) -> Vec<StateSet> {
let mut closures = vec![EMPTY_SET; num_states];
for state in 0..num_states {
let mut closure = EMPTY_SET;
set_bit(&mut closure, state as u32);
let mut frontier_word = EMPTY_SET;
for lane in 0..LANES {
frontier_word[lane] = epsilon_table[state * LANES + lane];
}
for lane in 0..LANES {
closure[lane] |= frontier_word[lane];
}
loop {
let mut next_frontier = EMPTY_SET;
for_each_set_bit(&frontier_word, |s| {
for lane in 0..LANES {
let bits = epsilon_table[(s as usize) * LANES + lane];
let new_bits = bits & !closure[lane];
next_frontier[lane] |= new_bits;
}
});
if next_frontier == EMPTY_SET {
break;
}
for lane in 0..LANES {
closure[lane] |= next_frontier[lane];
}
frontier_word = next_frontier;
}
closures[state] = closure;
}
closures
}
fn closure_of_set(set: &StateSet, per_state_closures: &[StateSet]) -> StateSet {
let mut out = EMPTY_SET;
for_each_set_bit(set, |state| {
let closure = &per_state_closures[state as usize];
for lane in 0..LANES {
out[lane] |= closure[lane];
}
});
out
}
fn ensure_dead_state(
index: &mut HashMap<StateSet, u32>,
sets: &mut Vec<StateSet>,
transitions: &mut Vec<u32>,
max_dfa_states: usize,
) -> Result<u32, NfaToDfaError> {
if let Some(&existing) = index.get(&EMPTY_SET) {
return Ok(existing);
}
if sets.len() >= max_dfa_states {
return Err(NfaToDfaError::StateExplosion {
produced: sets.len(),
cap: max_dfa_states,
});
}
let dead_id = sets.len() as u32;
index.insert(EMPTY_SET, dead_id);
sets.push(EMPTY_SET);
transitions.extend(std::iter::repeat_n(dead_id, 256));
Ok(dead_id)
}
#[cfg(test)]
mod tests {
use super::*;
#[cfg(feature = "nfa")]
#[test]
fn layout_matches_nfa_module() {
assert_eq!(
LANES,
crate::nfa::subgroup_nfa::LANES_PER_SUBGROUP,
"nfa_to_dfa's local LANES must mirror subgroup_nfa::LANES_PER_SUBGROUP - a drift means the bit-table layout in this primitive no longer matches what `compile_regex_set` / `nfa_scan_with_plan` emit. Fix: update LANES here and re-run the matching test suite."
);
}
fn literal_abc_tables() -> (Vec<u32>, Vec<u32>, Vec<u32>, Vec<u32>) {
let num_states = 4_usize;
let mut transition = vec![0u32; num_states * 256 * LANES];
for (src, b, dst) in [(0usize, b'a', 1u32), (1, b'b', 2), (2, b'c', 3)] {
let dst_lane = (dst / 32) as usize;
let dst_bit = 1u32 << (dst % 32);
let idx = src * 256 * LANES + (b as usize) * LANES + dst_lane;
transition[idx] |= dst_bit;
}
let epsilon = vec![0u32; num_states * LANES];
let accept_state_ids = vec![3u32];
let accept_pattern_ids = vec![0u32];
(transition, epsilon, accept_state_ids, accept_pattern_ids)
}
struct GeneratedNfa {
num_states: u32,
transition: Vec<u32>,
epsilon: Vec<u32>,
accept_states: Vec<u32>,
accept_pids: Vec<u32>,
max_pattern_len: u32,
primary_word: Vec<u8>,
alternate_word: Vec<u8>,
}
impl GeneratedNfa {
fn tables(&self) -> NfaTables<'_> {
NfaTables {
num_states: self.num_states,
transition_table: &self.transition,
epsilon_table: &self.epsilon,
accept_state_ids: &self.accept_states,
accept_pattern_ids: &self.accept_pids,
max_pattern_len: self.max_pattern_len,
}
}
}
fn generated_nfa(seed: u32) -> GeneratedNfa {
let num_states = 3 + (seed as usize % 4);
let mut transition = vec![0u32; num_states * 256 * LANES];
let mut epsilon = vec![0u32; num_states * LANES];
let mut primary_word = Vec::with_capacity(num_states.saturating_sub(1));
for src in 0..num_states.saturating_sub(1) {
let byte = generated_byte(seed, src as u32);
primary_word.push(byte);
add_transition(&mut transition, src, byte, (src + 1) as u32);
if src > 0 {
add_transition(
&mut transition,
src,
generated_byte(seed ^ 0x5a5a_1337, src as u32),
src as u32,
);
}
}
let alternate_target = num_states.saturating_sub(2).max(1);
let alternate_byte = generated_byte(seed ^ 0x9e37_79b9, 0);
add_transition(&mut transition, 0, alternate_byte, alternate_target as u32);
let alternate_word = vec![alternate_byte];
if seed % 3 == 0 && num_states > 3 {
add_epsilon(&mut epsilon, 1, 2);
}
if seed % 5 == 0 {
add_epsilon(&mut epsilon, 0, 1);
}
if seed % 11 == 0 && num_states > 4 {
add_epsilon(&mut epsilon, 2, (num_states - 1) as u32);
}
let mut accept_states = vec![(num_states - 1) as u32];
let mut accept_pids = vec![seed % 31];
if seed % 7 == 0 {
accept_states.push(alternate_target as u32);
accept_pids.push(100 + (seed % 97));
}
GeneratedNfa {
num_states: num_states as u32,
transition,
epsilon,
accept_states,
accept_pids,
max_pattern_len: primary_word.len() as u32,
primary_word,
alternate_word,
}
}
fn generated_byte(seed: u32, lane: u32) -> u8 {
let mixed = seed
.wrapping_mul(1_664_525)
.wrapping_add(lane.wrapping_mul(1_013_904_223))
.wrapping_add(0x45d9_f3b);
b'a' + (mixed % 23) as u8
}
fn add_transition(table: &mut [u32], src: usize, byte: u8, dst: u32) {
let lane = (dst / 32) as usize;
let bit = 1u32 << (dst % 32);
let idx = src * 256 * LANES + (byte as usize) * LANES + lane;
table[idx] |= bit;
}
fn add_epsilon(table: &mut [u32], src: usize, dst: u32) {
let lane = (dst / 32) as usize;
let bit = 1u32 << (dst % 32);
table[src * LANES + lane] |= bit;
}
fn nfa_outputs_for(nfa: &GeneratedNfa, input: &[u8]) -> Vec<u32> {
let closures = build_epsilon_closures(nfa.num_states as usize, &nfa.epsilon);
let mut current = EMPTY_SET;
set_bit(&mut current, 0);
current = closure_of_set(¤t, &closures);
for &byte in input {
let mut target = EMPTY_SET;
for_each_set_bit(¤t, |src_state| {
let row_start = (src_state as usize) * 256 * LANES + (byte as usize) * LANES;
for lane in 0..LANES {
target[lane] |= nfa.transition[row_start + lane];
}
});
current = closure_of_set(&target, &closures);
}
let mut out = Vec::new();
for (idx, &state) in nfa.accept_states.iter().enumerate() {
if test_bit(¤t, state) {
out.push(nfa.accept_pids[idx]);
}
}
out
}
fn dfa_outputs_for(dfa: &CompiledDfa, input: &[u8]) -> Vec<u32> {
let mut state = 0usize;
for &byte in input {
state = dfa.transitions[state * 256 + byte as usize] as usize;
}
let start = dfa.output_offsets[state] as usize;
let end = dfa.output_offsets[state + 1] as usize;
dfa.output_records[start..end].to_vec()
}
#[test]
fn generated_nfa_to_dfa_matches_reference_nfa_for_thousands_of_inputs() {
let mut checked = 0usize;
for seed in 0..1024u32 {
let nfa = generated_nfa(seed);
let dfa = nfa_to_dfa(&nfa.tables(), 4096)
.expect("Fix: generated sparse NFA must stay inside the DFA cap");
let mut mutated_primary = nfa.primary_word.clone();
if let Some(last) = mutated_primary.last_mut() {
*last = last.wrapping_add(1);
}
let primary_prefix =
nfa.primary_word[..nfa.primary_word.len().saturating_sub(1)].to_vec();
let mut reversed_primary = nfa.primary_word.clone();
reversed_primary.reverse();
let generated_noise = vec![
generated_byte(seed, 9),
generated_byte(seed, 10),
generated_byte(seed, 11),
generated_byte(seed, 12),
];
let alternate_twice =
[nfa.alternate_word.as_slice(), nfa.alternate_word.as_slice()].concat();
let corpus = [
Vec::new(),
nfa.primary_word.clone(),
primary_prefix,
reversed_primary,
mutated_primary,
nfa.alternate_word.clone(),
alternate_twice,
generated_noise.clone(),
vec![generated_byte(seed ^ 0xa5a5_a5a5, 1)],
[generated_byte(seed, 13)].into(),
[generated_byte(seed, 14), generated_byte(seed, 15)].into(),
[nfa.alternate_word.as_slice(), nfa.primary_word.as_slice()].concat(),
[nfa.primary_word.as_slice(), nfa.alternate_word.as_slice()].concat(),
[generated_byte(seed, 16)]
.into_iter()
.chain(nfa.primary_word.iter().copied())
.collect(),
nfa.primary_word
.iter()
.copied()
.chain([generated_byte(seed, 17)])
.collect(),
[
nfa.alternate_word.as_slice(),
&generated_noise,
nfa.primary_word.as_slice(),
]
.concat(),
];
for input in corpus {
assert_eq!(
dfa_outputs_for(&dfa, &input),
nfa_outputs_for(&nfa, &input),
"seed {seed} input {input:?} must produce identical accept records"
);
checked += 1;
}
}
assert_eq!(checked, 16_384);
}
#[test]
fn generated_malformed_nfa_shapes_report_structured_errors() {
let mut checked = 0usize;
for seed in 0..1024u32 {
let nfa = generated_nfa(seed);
let mut short_transition = nfa.transition.clone();
short_transition.pop();
assert!(matches!(
nfa_to_dfa(
&NfaTables {
transition_table: &short_transition,
..nfa.tables()
},
4096,
),
Err(NfaToDfaError::ShapeMismatch { .. })
));
checked += 1;
let mut short_epsilon = nfa.epsilon.clone();
short_epsilon.pop();
assert!(matches!(
nfa_to_dfa(
&NfaTables {
epsilon_table: &short_epsilon,
..nfa.tables()
},
4096,
),
Err(NfaToDfaError::ShapeMismatch { .. })
));
checked += 1;
let mut extra_pid = nfa.accept_pids.clone();
extra_pid.push(seed);
assert!(matches!(
nfa_to_dfa(
&NfaTables {
accept_pattern_ids: &extra_pid,
..nfa.tables()
},
4096,
),
Err(NfaToDfaError::ShapeMismatch { .. })
));
checked += 1;
assert!(matches!(
nfa_to_dfa(
&NfaTables {
num_states: 1025,
..nfa.tables()
},
4096,
),
Err(NfaToDfaError::ShapeMismatch { .. })
));
checked += 1;
}
assert_eq!(checked, 4096);
}
#[test]
fn generated_dfa_fingerprints_are_stable_and_content_addressed() {
let mut checked = 0usize;
for seed in 0..1024u32 {
let nfa = generated_nfa(seed);
let first = match nfa_to_dfa(&nfa.tables(), 4096) {
Ok(dfa) => dfa,
Err(err) => panic!("Fix: generated NFA must lower for seed {seed}: {err}"),
};
let second = match nfa_to_dfa(&nfa.tables(), 4096) {
Ok(dfa) => dfa,
Err(err) => {
panic!("Fix: generated NFA must lower on replay for seed {seed}: {err}")
}
};
assert_eq!(
dfa_fingerprint(&first),
dfa_fingerprint(&second),
"seed {seed} must produce a stable content fingerprint"
);
let mut changed = first.clone();
changed.max_pattern_len = changed.max_pattern_len.wrapping_add(1);
assert_ne!(
dfa_fingerprint(&first),
dfa_fingerprint(&changed),
"seed {seed} max-pattern metadata must perturb the fingerprint"
);
if let Some(first_transition) = changed.transitions.first_mut() {
*first_transition = first_transition.wrapping_add(1);
assert_ne!(
dfa_fingerprint(&first),
dfa_fingerprint(&changed),
"seed {seed} transition metadata must perturb the fingerprint"
);
}
checked += 3;
}
assert_eq!(checked, 3072);
}
#[test]
fn generated_dfa_dedup_table_canonicalizes_repeated_automata() {
let mut table = DfaDedupTable::default();
let mut checked = 0usize;
for seed in 0..1024u32 {
let nfa = generated_nfa(seed);
let first = nfa_to_dfa(&nfa.tables(), 4096).unwrap_or_else(|err| {
panic!("Fix: generated NFA must lower for seed {seed}: {err}")
});
let replay = nfa_to_dfa(&nfa.tables(), 4096).unwrap_or_else(|err| {
panic!("Fix: generated NFA must lower on replay for seed {seed}: {err}")
});
let first_result = table.insert(first.clone());
let replay_result = table.insert(replay);
assert!(
first_result.inserted,
"seed {seed} first insert must create a canonical DFA"
);
assert!(
!replay_result.inserted,
"seed {seed} replay insert must deduplicate"
);
assert_eq!(
first_result.canonical_index, replay_result.canonical_index,
"seed {seed} replay must resolve to the first canonical slot"
);
assert_eq!(
first_result.fingerprint, replay_result.fingerprint,
"seed {seed} replay must keep the same content fingerprint"
);
let mut changed = first;
changed.max_pattern_len = changed.max_pattern_len.wrapping_add(1);
let changed_result = table.insert(changed);
assert!(
changed_result.inserted,
"seed {seed} changed DFA metadata must not deduplicate"
);
assert_ne!(
changed_result.canonical_index, first_result.canonical_index,
"seed {seed} changed DFA must get a distinct canonical slot"
);
checked += 3;
}
assert_eq!(checked, 3072);
assert_eq!(table.len(), 2048);
}
#[test]
fn generated_dfa_batch_dedup_preserves_input_order_and_stats() {
let mut table = DfaDedupTable::default();
let mut input = Vec::new();
for seed in 0..512u32 {
let nfa = generated_nfa(seed);
let dfa = nfa_to_dfa(&nfa.tables(), 4096).unwrap_or_else(|err| {
panic!("Fix: generated NFA must lower for seed {seed}: {err}")
});
input.push(dfa.clone());
input.push(dfa.clone());
let mut changed = dfa;
changed.max_pattern_len = changed.max_pattern_len.wrapping_add(1);
input.push(changed);
}
let batch = table.insert_many(input);
assert_eq!(batch.stats.input_count, 1536);
assert_eq!(batch.stats.inserted_count, 1024);
assert_eq!(batch.stats.duplicate_count, 512);
assert_eq!(batch.stats.table_len_after, 1024);
assert_eq!(
batch.stats.input_wire_bytes,
batch.stats.inserted_wire_bytes + batch.stats.saved_wire_bytes
);
assert_eq!(
batch.stats.inserted_wire_bytes,
table.canonical_wire_bytes()
);
assert!(
batch.stats.saved_wire_bytes > 0,
"batch dedup must report saved wire bytes for replayed automata"
);
assert!(
batch.saved_wire_ppm() > 0,
"batch dedup must report a nonzero deterministic saved-byte ratio"
);
assert_eq!(batch.results.len(), 1536);
for chunk in batch.results.chunks_exact(3) {
assert!(chunk[0].inserted);
assert!(!chunk[1].inserted);
assert!(chunk[2].inserted);
assert_eq!(chunk[0].canonical_index, chunk[1].canonical_index);
assert_ne!(chunk[0].canonical_index, chunk[2].canonical_index);
assert_eq!(chunk[0].fingerprint, chunk[1].fingerprint);
assert_ne!(chunk[0].fingerprint, chunk[2].fingerprint);
}
}
#[test]
fn generated_dfa_table_merge_deduplicates_cross_shard_plans() {
let mut left = DfaDedupTable::default();
let mut right = DfaDedupTable::default();
for seed in 0..256u32 {
let nfa = generated_nfa(seed);
let dfa = nfa_to_dfa(&nfa.tables(), 4096).unwrap_or_else(|err| {
panic!("Fix: generated NFA must lower for seed {seed}: {err}")
});
left.insert(dfa.clone());
right.insert(dfa);
}
for seed in 256..512u32 {
let nfa = generated_nfa(seed);
let dfa = nfa_to_dfa(&nfa.tables(), 4096).unwrap_or_else(|err| {
panic!("Fix: generated NFA must lower for seed {seed}: {err}")
});
right.insert(dfa);
}
let before_len = left.len();
let before_bytes = left.canonical_wire_bytes();
let batch = left.merge_from(&right);
assert_eq!(before_len, 256);
assert_eq!(batch.stats.input_count, 512);
assert_eq!(batch.stats.inserted_count, 256);
assert_eq!(batch.stats.duplicate_count, 256);
assert_eq!(batch.stats.table_len_after, 512);
assert!(batch.stats.saved_wire_bytes > 0);
assert!(batch.saved_wire_ppm() > 0);
assert!(left.canonical_wire_bytes() > before_bytes);
assert_eq!(left.len(), 512);
for result in batch.results.iter().take(256) {
assert!(!result.inserted);
assert!(result.canonical_index < before_len);
}
for result in batch.results.iter().skip(256) {
assert!(result.inserted);
assert!(result.canonical_index >= before_len);
}
}
#[test]
fn literal_pattern_lowers_to_acceptor_dfa() {
let (transition, epsilon, accepts, pids) = literal_abc_tables();
let tables = NfaTables {
num_states: 4,
transition_table: &transition,
epsilon_table: &epsilon,
accept_state_ids: &accepts,
accept_pattern_ids: &pids,
max_pattern_len: 3,
};
let dfa = nfa_to_dfa(&tables, 1024).expect("Fix: literal NFA must lower cleanly");
assert!(
dfa.state_count >= 4,
"literal 'abc' needs at least entry + 3 progress states; got {}",
dfa.state_count
);
let s_a = dfa.transitions[0 * 256 + b'a' as usize];
let s_ab = dfa.transitions[(s_a as usize) * 256 + b'b' as usize];
let s_abc = dfa.transitions[(s_ab as usize) * 256 + b'c' as usize];
assert!(
dfa.accept[s_abc as usize] != 0,
"DFA state after 'abc' must accept pattern 0"
);
let s_x = dfa.transitions[(s_ab as usize) * 256 + b'x' as usize];
assert_eq!(
dfa.accept[s_x as usize], 0,
"'abx' is not 'abc' - must not accept"
);
}
#[test]
fn empty_input_returns_one_state_dfa_with_dead_self_loop() {
let transition = vec![0u32; 1 * 256 * LANES];
let epsilon = vec![0u32; 1 * LANES];
let tables = NfaTables {
num_states: 1,
transition_table: &transition,
epsilon_table: &epsilon,
accept_state_ids: &[],
accept_pattern_ids: &[],
max_pattern_len: 0,
};
let dfa = nfa_to_dfa(&tables, 16).expect("Fix: trivial NFA must lower");
let dead = dfa.transitions[0 * 256 + b'a' as usize];
assert_eq!(
dfa.transitions[dead as usize * 256 + b'a' as usize],
dead,
"dead state must self-loop on every byte"
);
assert_eq!(dfa.accept[dead as usize], 0, "dead state must not accept");
}
#[test]
fn state_explosion_reports_structured_error() {
let (transition, epsilon, accepts, pids) = literal_abc_tables();
let tables = NfaTables {
num_states: 4,
transition_table: &transition,
epsilon_table: &epsilon,
accept_state_ids: &accepts,
accept_pattern_ids: &pids,
max_pattern_len: 3,
};
let err = nfa_to_dfa(&tables, 1).expect_err("cap=1 must trip state explosion");
match err {
NfaToDfaError::StateExplosion { cap, produced } => {
assert_eq!(cap, 1);
assert!(produced >= 1);
}
other => panic!("expected StateExplosion, got {other:?}"),
}
}
#[test]
fn shape_mismatch_caught_before_construction() {
let transition = vec![0u32; 1 * 256 * LANES];
let epsilon = vec![0u32; 1 * LANES];
let tables = NfaTables {
num_states: 4,
transition_table: &transition,
epsilon_table: &epsilon,
accept_state_ids: &[],
accept_pattern_ids: &[],
max_pattern_len: 0,
};
let err = nfa_to_dfa(&tables, 16)
.expect_err("declared num_states != table length must error, not panic");
match err {
NfaToDfaError::ShapeMismatch { reason } => {
assert!(reason.contains("transition_table"));
}
other => panic!("expected ShapeMismatch, got {other:?}"),
}
}
}