use std::collections::{HashMap, VecDeque};
use super::super::super::lazy::{CharClass, LazyDfa};
use super::super::shared::{
is_word_byte, StateMetadata, DEAD_STATE, STATE_MASK, TAG_DEAD, TAG_MATCH,
};
pub struct EagerDfa {
transitions: Vec<u32>,
state_count: usize,
start: u32,
start_word: Option<u32>,
has_word_boundary: bool,
has_anchors: bool,
has_start_anchor: bool,
has_end_anchor: bool,
has_multiline_anchors: bool,
state_metadata: Vec<StateMetadata>,
}
impl EagerDfa {
pub fn from_lazy(lazy: &mut LazyDfa) -> Self {
lazy.set_cache_limit(usize::MAX);
let has_word_boundary = lazy.has_word_boundary();
let has_anchors = lazy.has_anchors();
let has_start_anchor = lazy.has_start_anchor();
let has_end_anchor = lazy.has_end_anchor();
let has_multiline_anchors = lazy.has_multiline_anchors();
let needs_metadata = has_word_boundary || has_anchors;
let lazy_start = lazy.get_start_state_for_class(CharClass::NonWord);
let lazy_start_word = if has_word_boundary {
Some(lazy.get_start_state_for_class(CharClass::Word))
} else {
None
};
let mut state_map: HashMap<u32, u32> = HashMap::new();
let mut queue: VecDeque<u32> = VecDeque::new();
let mut all_transitions: Vec<[Option<u32>; 256]> = Vec::new();
let mut match_flags: Vec<bool> = Vec::new();
let mut lazy_state_order: Vec<u32> = Vec::new();
let start_idx = 0u32;
state_map.insert(lazy_start, start_idx);
queue.push_back(lazy_start);
let start_word_idx = if let Some(sw) = lazy_start_word {
if sw != lazy_start {
let idx = 1u32;
state_map.insert(sw, idx);
queue.push_back(sw);
Some(idx)
} else {
Some(start_idx)
}
} else {
None
};
while let Some(lazy_state) = queue.pop_front() {
let eager_idx = *state_map.get(&lazy_state).unwrap();
while all_transitions.len() <= eager_idx as usize {
all_transitions.push([None; 256]);
match_flags.push(false);
lazy_state_order.push(0); }
lazy_state_order[eager_idx as usize] = lazy_state;
match_flags[eager_idx as usize] = lazy.is_match(lazy_state);
let lazy_transitions = lazy.compute_all_transitions(lazy_state);
for byte in 0..=255u8 {
if let Some(next_lazy) = lazy_transitions[byte as usize] {
let next_idx = if let Some(&idx) = state_map.get(&next_lazy) {
idx
} else {
let idx = state_map.len() as u32;
state_map.insert(next_lazy, idx);
queue.push_back(next_lazy);
idx
};
all_transitions[eager_idx as usize][byte as usize] = Some(next_idx);
}
}
}
let state_count = all_transitions.len();
let mut transitions = vec![DEAD_STATE; state_count * 256];
for (state_idx, trans) in all_transitions.iter().enumerate() {
let base = state_idx * 256;
for byte in 0..256 {
if let Some(next_idx) = trans[byte] {
let next_is_match =
match_flags.get(next_idx as usize).copied().unwrap_or(false);
let mut tagged = next_idx;
if next_is_match {
tagged |= TAG_MATCH;
}
transitions[base + byte] = tagged;
}
}
}
let state_metadata = if needs_metadata {
lazy_state_order
.iter()
.map(|&lazy_state| {
let prev_class = lazy.get_state_prev_class(lazy_state);
let (needs_word_boundary, needs_not_word_boundary) =
lazy.get_state_boundary_requirements(lazy_state);
let (needs_end_of_text, needs_end_of_line) =
lazy.get_state_anchor_requirements(lazy_state);
StateMetadata {
prev_class,
needs_word_boundary,
needs_not_word_boundary,
needs_end_of_text,
needs_end_of_line,
}
})
.collect()
} else {
Vec::new()
};
let start = if match_flags
.get(start_idx as usize)
.copied()
.unwrap_or(false)
{
start_idx | TAG_MATCH
} else {
start_idx
};
let start_word = start_word_idx.map(|idx| {
if match_flags.get(idx as usize).copied().unwrap_or(false) {
idx | TAG_MATCH
} else {
idx
}
});
Self {
transitions,
state_count,
start,
start_word,
has_word_boundary,
has_anchors,
has_start_anchor,
has_end_anchor,
has_multiline_anchors,
state_metadata,
}
}
#[inline]
pub fn state_count(&self) -> usize {
self.state_count
}
pub fn has_word_boundary(&self) -> bool {
self.has_word_boundary
}
pub fn has_anchors(&self) -> bool {
self.has_anchors
}
pub fn has_start_anchor(&self) -> bool {
self.has_start_anchor
}
pub fn has_end_anchor(&self) -> bool {
self.has_end_anchor
}
pub fn has_multiline_anchors(&self) -> bool {
self.has_multiline_anchors
}
pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
if self.has_start_anchor {
if self.has_multiline_anchors {
if let Some(end) = self.find_at(input, 0) {
return Some((0, end));
}
for (i, &byte) in input.iter().enumerate() {
if byte == b'\n' && i < input.len() {
if let Some(end) = self.find_at(input, i + 1) {
return Some((i + 1, end));
}
}
}
None
} else {
self.find_at(input, 0).map(|end| (0, end))
}
} else {
for start_pos in 0..=input.len() {
if let Some(end) = self.find_at(input, start_pos) {
return Some((start_pos, end));
}
}
None
}
}
#[inline]
pub fn find_at(&self, input: &[u8], start: usize) -> Option<usize> {
if start > input.len() {
return None;
}
if self.has_start_anchor && !self.has_multiline_anchors && start != 0 {
return None;
}
let state = if self.has_word_boundary && start > 0 {
let prev_byte = input[start - 1];
if is_word_byte(prev_byte) {
self.start_word.unwrap_or(self.start)
} else {
self.start
}
} else {
self.start
};
if !self.has_word_boundary && !self.has_multiline_anchors {
return self.find_at_fast(input, start, state);
}
self.find_at_slow(input, start, state)
}
#[inline(never)]
fn find_at_fast(&self, input: &[u8], start: usize, mut state: u32) -> Option<usize> {
let mut last_match = if state & TAG_MATCH != 0 {
Some(start)
} else {
None
};
let bytes = &input[start..];
for (i, &byte) in bytes.iter().enumerate() {
let state_idx = (state & STATE_MASK) as usize;
let next = self.transitions[state_idx * 256 + byte as usize];
if next & TAG_DEAD != 0 {
break;
}
state = next;
if state & TAG_MATCH != 0 {
last_match = Some(start + i + 1);
}
}
if let Some(end_pos) = last_match {
if self.has_end_anchor && end_pos != input.len() {
return None;
}
}
last_match
}
fn find_at_slow(&self, input: &[u8], start: usize, mut state: u32) -> Option<usize> {
let mut last_match = if state & TAG_MATCH != 0 {
if self.check_end_assertions(input, start, (state & STATE_MASK) as usize) {
Some(start)
} else {
None
}
} else {
None
};
for (i, &byte) in input[start..].iter().enumerate() {
let state_idx = (state & STATE_MASK) as usize;
let next = self.transitions[state_idx * 256 + byte as usize];
if next & TAG_DEAD != 0 {
break;
}
state = next;
if state & TAG_MATCH != 0 {
let match_end = start + i + 1;
let next_state_idx = (state & STATE_MASK) as usize;
if self.check_end_assertions(input, match_end, next_state_idx) {
last_match = Some(match_end);
}
}
}
last_match
}
#[inline]
fn check_end_assertions(&self, input: &[u8], pos: usize, state_idx: usize) -> bool {
if self.state_metadata.is_empty() {
return true;
}
let metadata = match self.state_metadata.get(state_idx) {
Some(m) => m,
None => return true,
};
if self.has_word_boundary {
let prev_class = metadata.prev_class;
let next_class = if pos < input.len() {
CharClass::from_byte(input[pos])
} else {
CharClass::NonWord
};
let is_at_boundary = prev_class != next_class;
if metadata.needs_word_boundary && !is_at_boundary {
return false;
}
if metadata.needs_not_word_boundary && is_at_boundary {
return false;
}
}
if self.has_anchors {
if metadata.needs_end_of_text && pos != input.len() {
return false;
}
if metadata.needs_end_of_line {
let at_end_of_line = pos == input.len() || input.get(pos) == Some(&b'\n');
if !at_end_of_line {
return false;
}
}
}
true
}
}
impl std::fmt::Debug for EagerDfa {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("EagerDfa")
.field("state_count", &self.state_count)
.field("has_word_boundary", &self.has_word_boundary)
.field("has_anchors", &self.has_anchors)
.finish()
}
}