use std::collections::{BTreeSet, HashMap};
use crate::hir::unicode::is_word_byte;
use crate::nfa::{Nfa, NfaInstruction, StateId as NfaStateId};
pub type DfaStateId = u32;
pub const STRIDE: u32 = 256;
pub const TAG_MATCH: u32 = 1 << 30;
pub const TAG_DEAD: u32 = 1 << 31;
pub const TAG_MASK: u32 = TAG_MATCH | TAG_DEAD;
pub const STATE_MASK: u32 = !TAG_MASK;
pub const DEAD_STATE: u32 = TAG_DEAD | STATE_MASK;
pub const UNKNOWN_STATE: u32 = TAG_DEAD;
pub const DEFAULT_CACHE_LIMIT: usize = 10_000;
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub struct PositionContext {
pub at_start_of_input: bool,
pub at_start_of_line: bool,
pub at_end_of_input: bool,
pub at_end_of_line: bool,
}
impl PositionContext {
pub fn start_of_input() -> Self {
Self {
at_start_of_input: true,
at_start_of_line: true,
at_end_of_input: false,
at_end_of_line: false,
}
}
pub fn middle() -> Self {
Self {
at_start_of_input: false,
at_start_of_line: false,
at_end_of_input: false,
at_end_of_line: false,
}
}
pub fn after_newline() -> Self {
Self {
at_start_of_input: false,
at_start_of_line: true,
at_end_of_input: false,
at_end_of_line: false,
}
}
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug, Default)]
pub enum CharClass {
#[default]
NonWord = 0,
Word = 1,
}
impl CharClass {
#[inline]
pub fn from_byte(b: u8) -> Self {
if is_word_byte(b) {
CharClass::Word
} else {
CharClass::NonWord
}
}
}
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub enum StateKey {
Simple(BTreeSet<NfaStateId>),
WithClass(BTreeSet<NfaStateId>, CharClass),
}
#[derive(Debug, Clone)]
pub struct DfaState {
pub is_match: bool,
pub nfa_states: BTreeSet<NfaStateId>,
pub prev_class: CharClass,
}
impl DfaState {
pub fn new(nfa_states: BTreeSet<NfaStateId>, is_match: bool, prev_class: CharClass) -> Self {
Self {
is_match,
nfa_states,
prev_class,
}
}
}
#[derive(Debug, Clone)]
pub struct LazyDfaContext {
pub(crate) nfa: Nfa,
pub(crate) states: Vec<DfaState>,
pub(crate) transitions: Vec<u32>,
pub(crate) state_map: HashMap<StateKey, DfaStateId>,
pub(crate) start: DfaStateId,
pub(crate) cache_limit: usize,
pub(crate) flush_count: usize,
pub(crate) has_word_boundary: bool,
pub(crate) has_anchors: bool,
pub(crate) has_start_anchor: bool,
pub(crate) has_end_anchor: bool,
pub(crate) has_multiline_anchors: bool,
}
impl LazyDfaContext {
pub fn new(mut nfa: Nfa) -> Self {
nfa.precompute_epsilon_closures();
let has_word_boundary = nfa_has_word_boundary(&nfa);
let (has_anchors, has_start_anchor, has_end_anchor, has_multiline_anchors) =
nfa_anchor_info(&nfa);
let mut ctx = Self {
nfa,
states: Vec::new(),
transitions: Vec::new(),
state_map: HashMap::new(),
start: 0,
cache_limit: DEFAULT_CACHE_LIMIT,
flush_count: 0,
has_word_boundary,
has_anchors,
has_start_anchor,
has_end_anchor,
has_multiline_anchors,
};
let mut start_set = BTreeSet::new();
start_set.insert(ctx.nfa.start);
let is_at_boundary = if has_word_boundary { Some(true) } else { None };
let start_closure = if has_word_boundary || has_anchors {
epsilon_closure_with_context(
&ctx.nfa,
&start_set,
is_at_boundary,
Some(PositionContext::start_of_input()),
)
} else {
ctx.nfa.epsilon_closure(&start_set)
};
ctx.start = get_or_create_state_with_class(&mut ctx, start_closure, CharClass::NonWord);
ctx
}
pub fn has_word_boundary(&self) -> bool {
self.has_word_boundary
}
pub fn has_anchors(&self) -> bool {
self.has_anchors
}
pub fn has_start_anchor(&self) -> bool {
self.has_start_anchor
}
pub fn has_end_anchor(&self) -> bool {
self.has_end_anchor
}
pub fn has_multiline_anchors(&self) -> bool {
self.has_multiline_anchors
}
pub fn start(&self) -> DfaStateId {
self.start
}
pub fn state_count(&self) -> usize {
self.states.len()
}
pub fn flush_count(&self) -> usize {
self.flush_count
}
pub fn set_cache_limit(&mut self, limit: usize) {
self.cache_limit = limit;
}
}
pub fn nfa_has_word_boundary(nfa: &Nfa) -> bool {
nfa.states.iter().any(|state| {
matches!(
&state.instruction,
Some(NfaInstruction::WordBoundary) | Some(NfaInstruction::NotWordBoundary)
)
})
}
pub fn nfa_anchor_info(nfa: &Nfa) -> (bool, bool, bool, bool) {
let mut has_start_anchor = false;
let mut has_end_anchor = false;
let mut has_multiline_anchors = false;
for state in &nfa.states {
match &state.instruction {
Some(NfaInstruction::StartOfText) => has_start_anchor = true,
Some(NfaInstruction::EndOfText) => has_end_anchor = true,
Some(NfaInstruction::StartOfLine) => {
has_start_anchor = true;
has_multiline_anchors = true;
}
Some(NfaInstruction::EndOfLine) => {
has_end_anchor = true;
has_multiline_anchors = true;
}
_ => {}
}
}
let has_anchors = has_start_anchor || has_end_anchor;
(
has_anchors,
has_start_anchor,
has_end_anchor,
has_multiline_anchors,
)
}
pub fn epsilon_closure_with_context(
nfa: &Nfa,
seeds: &BTreeSet<NfaStateId>,
is_at_boundary: Option<bool>,
pos_ctx: Option<PositionContext>,
) -> BTreeSet<NfaStateId> {
let mut closure = BTreeSet::new();
let mut stack: Vec<NfaStateId> = seeds.iter().copied().collect();
while let Some(state_id) = stack.pop() {
if !closure.insert(state_id) {
continue;
}
let state = match nfa.get(state_id) {
Some(s) => s,
None => continue,
};
let should_follow_epsilons = match &state.instruction {
Some(NfaInstruction::WordBoundary) => match is_at_boundary {
Some(true) => true, Some(false) => false, None => false, },
Some(NfaInstruction::NotWordBoundary) => match is_at_boundary {
Some(false) => true, Some(true) => false, None => false, },
Some(NfaInstruction::StartOfText) => match pos_ctx {
Some(ctx) if ctx.at_start_of_input => true,
Some(_) => false,
None => false,
},
Some(NfaInstruction::StartOfLine) => match pos_ctx {
Some(ctx) if ctx.at_start_of_line => true,
Some(_) => false,
None => false,
},
Some(NfaInstruction::EndOfText) => true,
Some(NfaInstruction::EndOfLine) => true,
_ => true,
};
if should_follow_epsilons {
for &eps_target in &state.epsilon {
if !closure.contains(&eps_target) {
stack.push(eps_target);
}
}
}
}
closure
}
pub fn get_or_create_state_with_class(
ctx: &mut LazyDfaContext,
nfa_states: BTreeSet<NfaStateId>,
prev_class: CharClass,
) -> DfaStateId {
let key = if ctx.has_word_boundary {
StateKey::WithClass(nfa_states.clone(), prev_class)
} else {
StateKey::Simple(nfa_states.clone())
};
if let Some(&id) = ctx.state_map.get(&key) {
return id;
}
if ctx.states.len() >= ctx.cache_limit {
flush_cache(ctx);
if let Some(&id) = ctx.state_map.get(&key) {
return id;
}
}
let is_match = nfa_states
.iter()
.any(|&s| ctx.nfa.get(s).map(|state| state.is_match).unwrap_or(false));
let state_index = ctx.states.len();
let premul_id = (state_index as u32) * STRIDE;
ctx.states
.push(DfaState::new(nfa_states, is_match, prev_class));
ctx.transitions
.resize(ctx.transitions.len() + STRIDE as usize, UNKNOWN_STATE);
ctx.state_map.insert(key, premul_id);
premul_id
}
pub fn flush_cache(ctx: &mut LazyDfaContext) {
ctx.flush_count += 1;
let start_index = state_index(ctx.start);
let start_nfa_states = ctx.states[start_index].nfa_states.clone();
let start_is_match = ctx.states[start_index].is_match;
let start_prev_class = ctx.states[start_index].prev_class;
ctx.states.clear();
ctx.transitions.clear();
ctx.state_map.clear();
let key = if ctx.has_word_boundary {
StateKey::WithClass(start_nfa_states.clone(), start_prev_class)
} else {
StateKey::Simple(start_nfa_states.clone())
};
ctx.states.push(DfaState::new(
start_nfa_states,
start_is_match,
start_prev_class,
));
ctx.transitions.resize(STRIDE as usize, UNKNOWN_STATE);
ctx.state_map.insert(key, 0);
ctx.start = 0;
}
#[inline(always)]
pub fn state_index(premul_id: DfaStateId) -> usize {
((premul_id & STATE_MASK) / STRIDE) as usize
}
#[inline(always)]
pub fn tag_state(premul_id: DfaStateId, is_match: bool) -> u32 {
if is_match {
premul_id | TAG_MATCH
} else {
premul_id
}
}
#[inline(always)]
pub fn is_dead_state(tagged: u32) -> bool {
tagged == DEAD_STATE
}
#[inline(always)]
pub fn is_unknown_state(tagged: u32) -> bool {
tagged == UNKNOWN_STATE
}
#[inline(always)]
pub fn is_tagged_match(tagged: u32) -> bool {
(tagged & TAG_MATCH) != 0
}
#[inline(always)]
pub fn untag_state(tagged: u32) -> DfaStateId {
tagged & STATE_MASK
}