use std::collections::HashMap;
use super::types::{Dfa, Nfa, NfaStateId, PatternError, INVALID_STATE};
pub const MAX_DFA_STATES: u32 = 65_535;
#[inline]
pub fn nfa_to_dfa(nfa: &Nfa) -> Result<Dfa, PatternError> {
let n_states = nfa.state_count as usize;
let mut eps_slice: Vec<(u32, u32)> = vec![(0, 0); n_states];
let mut byte_slice: Vec<[(u32, u32); 256]> = Vec::with_capacity(n_states);
byte_slice.resize(n_states, [(0, 0); 256]);
let mut eps_targets: Vec<NfaStateId> = Vec::with_capacity(nfa.edges.len());
let mut byte_targets: Vec<NfaStateId> = Vec::with_capacity(nfa.edges.len());
{
let mut eps_count = vec![0u32; n_states];
let mut byte_count = vec![[0u32; 256]; n_states];
for edge in &nfa.edges {
let from = edge.from as usize;
match edge.byte {
None => eps_count[from] += 1,
Some(b) => byte_count[from][b as usize] += 1,
}
}
let mut eps_cursor = 0u32;
let mut byte_cursor = 0u32;
for s in 0..n_states {
eps_slice[s] = (eps_cursor, eps_count[s]);
eps_cursor += eps_count[s];
for b in 0..256usize {
byte_slice[s][b] = (byte_cursor, byte_count[s][b]);
byte_cursor += byte_count[s][b];
}
}
eps_targets.resize(eps_cursor as usize, 0);
byte_targets.resize(byte_cursor as usize, 0);
let mut eps_fill = vec![0u32; n_states];
let mut byte_fill = vec![[0u32; 256]; n_states];
for edge in &nfa.edges {
let from = edge.from as usize;
match edge.byte {
None => {
let off = eps_slice[from].0 + eps_fill[from];
eps_targets[off as usize] = edge.to;
eps_fill[from] += 1;
}
Some(b) => {
let off = byte_slice[from][b as usize].0 + byte_fill[from][b as usize];
byte_targets[off as usize] = edge.to;
byte_fill[from][b as usize] += 1;
}
}
}
}
let mut closure_scratch = ClosureScratch::with_capacity(n_states);
let start_closure = epsilon_closure(
&eps_slice,
&eps_targets,
&[nfa.start],
n_states,
&mut closure_scratch,
);
let mut subsets: Vec<Vec<NfaStateId>> = vec![start_closure.clone()];
let mut subset_index: HashMap<Vec<NfaStateId>, u32> = HashMap::new();
subset_index.insert(start_closure.clone(), 0);
let mut transitions: Vec<u32> = Vec::with_capacity(256);
let mut accept: Vec<bool> = Vec::with_capacity(1);
extend_row(&mut transitions, &mut accept, &start_closure, &nfa.accept);
let mut worklist: Vec<usize> = vec![0];
let mut reachable_scratch: Vec<NfaStateId> = Vec::with_capacity(n_states);
while let Some(idx) = worklist.pop() {
for byte in 0u16..256 {
reachable_scratch.clear();
for &state in subsets[idx].iter() {
let (off, len) = byte_slice[state as usize][byte as usize];
for t in &byte_targets[off as usize..(off + len) as usize] {
reachable_scratch.push(*t);
}
}
if reachable_scratch.is_empty() {
transitions[idx * 256 + byte as usize] = INVALID_STATE;
continue;
}
let closure = epsilon_closure(
&eps_slice,
&eps_targets,
&reachable_scratch,
n_states,
&mut closure_scratch,
);
let next_idx = match subset_index.get(&closure) {
Some(existing) => *existing,
None => {
if subsets.len() as u64 >= u64::from(MAX_DFA_STATES) {
return Err(PatternError::StateOverflow {
limit: MAX_DFA_STATES,
});
}
let new_id = subsets.len() as u32;
subsets.push(closure.clone());
subset_index.insert(closure.clone(), new_id);
extend_row(&mut transitions, &mut accept, &closure, &nfa.accept);
worklist.push(new_id as usize);
new_id
}
};
transitions[idx * 256 + byte as usize] = next_idx;
}
}
Ok(Dfa {
state_count: subsets.len() as u32,
transitions,
start: 0,
accept,
})
}
fn extend_row(
transitions: &mut Vec<u32>,
accept: &mut Vec<bool>,
closure: &[NfaStateId],
nfa_accept: &[bool],
) {
transitions.extend(std::iter::repeat_n(INVALID_STATE, 256));
accept.push(closure.iter().any(|s| nfa_accept[*s as usize]));
}
pub(crate) struct ClosureScratch {
in_closure: Vec<bool>,
stack: Vec<NfaStateId>,
collected: Vec<NfaStateId>,
}
impl ClosureScratch {
pub(crate) fn with_capacity(n_states: usize) -> Self {
Self {
in_closure: vec![false; n_states],
stack: Vec::with_capacity(n_states.min(64)),
collected: Vec::with_capacity(n_states.min(64)),
}
}
}
fn epsilon_closure(
eps_slice: &[(u32, u32)],
eps_targets: &[NfaStateId],
seeds: &[NfaStateId],
n_states: usize,
scratch: &mut ClosureScratch,
) -> Vec<NfaStateId> {
if scratch.in_closure.len() < n_states {
scratch.in_closure.resize(n_states, false);
}
for state in scratch.collected.drain(..) {
scratch.in_closure[state as usize] = false;
}
scratch.stack.clear();
scratch.stack.extend_from_slice(seeds);
while let Some(state) = scratch.stack.pop() {
let idx = state as usize;
if scratch.in_closure[idx] {
continue;
}
scratch.in_closure[idx] = true;
scratch.collected.push(state);
let (off, len) = eps_slice[idx];
for t in &eps_targets[off as usize..(off + len) as usize] {
scratch.stack.push(*t);
}
}
let mut out = scratch.collected.clone();
out.sort_unstable();
out
}
#[cfg(test)]
mod tests {
use super::*;
use crate::pattern::regex_to_nfa::regex_to_nfa;
fn run(dfa: &Dfa, input: &[u8]) -> bool {
let mut state = dfa.start;
for &b in input {
let next = dfa.go(state, b);
if next == INVALID_STATE {
return false;
}
state = next;
}
dfa.accept[state as usize]
}
#[test]
fn literal_pattern_accepts_literal_input() {
let nfa = regex_to_nfa("abc").unwrap();
let dfa = nfa_to_dfa(&nfa).unwrap();
assert!(run(&dfa, b"abc"));
assert!(!run(&dfa, b"abd"));
assert!(!run(&dfa, b"ab"));
}
#[test]
fn alternation_accepts_either() {
let nfa = regex_to_nfa("foo|bar").unwrap();
let dfa = nfa_to_dfa(&nfa).unwrap();
assert!(run(&dfa, b"foo"));
assert!(run(&dfa, b"bar"));
assert!(!run(&dfa, b"baz"));
}
#[test]
fn kleene_star_accepts_repeats() {
let nfa = regex_to_nfa("a*").unwrap();
let dfa = nfa_to_dfa(&nfa).unwrap();
assert!(run(&dfa, b""));
assert!(run(&dfa, b"a"));
assert!(run(&dfa, b"aaaaa"));
assert!(!run(&dfa, b"b"));
}
#[test]
fn plus_requires_one() {
let nfa = regex_to_nfa("a+").unwrap();
let dfa = nfa_to_dfa(&nfa).unwrap();
assert!(!run(&dfa, b""));
assert!(run(&dfa, b"a"));
assert!(run(&dfa, b"aaa"));
}
#[test]
fn optional_allows_absence() {
let nfa = regex_to_nfa("ab?c").unwrap();
let dfa = nfa_to_dfa(&nfa).unwrap();
assert!(run(&dfa, b"ac"));
assert!(run(&dfa, b"abc"));
assert!(!run(&dfa, b"abbc"));
}
#[test]
fn character_class_accepts_members() {
let nfa = regex_to_nfa("[a-c]").unwrap();
let dfa = nfa_to_dfa(&nfa).unwrap();
assert!(run(&dfa, b"a"));
assert!(run(&dfa, b"b"));
assert!(run(&dfa, b"c"));
assert!(!run(&dfa, b"d"));
}
#[test]
fn group_with_star() {
let nfa = regex_to_nfa("(ab)*c").unwrap();
let dfa = nfa_to_dfa(&nfa).unwrap();
assert!(run(&dfa, b"c"));
assert!(run(&dfa, b"abc"));
assert!(run(&dfa, b"ababababc"));
assert!(!run(&dfa, b"ac"));
}
#[test]
fn empty_regex_only_accepts_empty() {
let nfa = regex_to_nfa("").unwrap();
let dfa = nfa_to_dfa(&nfa).unwrap();
assert!(run(&dfa, b""));
assert!(!run(&dfa, b"a"));
}
}