use std::cell::RefCell;
use std::collections::HashMap;
use std::sync::Arc;
use smallvec::{smallvec, SmallVec};
use crate::automaton::{
arena::{ArenaSmallTable, StateArena, StateId, ARENA_VALUE_TERMINATOR},
FieldMatcher, BYTE_CEILING,
};
use super::parser::{QuantifiedAtom, RegexpBranch, RegexpRoot, RuneRange};
const UTF8_1BYTE_MAX: u32 = 0x7F;
const UTF8_2BYTE_MAX: u32 = 0x7FF;
const UTF8_3BYTE_MAX: u32 = 0xFFFF;
const SURROGATE_START: u32 = 0xD800;
const SURROGATE_END: u32 = 0xDFFF;
fn rune_to_utf8(r: char) -> Vec<u8> {
let mut buf = [0u8; 4];
let s = r.encode_utf8(&mut buf);
s.as_bytes().to_vec()
}
pub fn regexp_has_plus_star(root: &RegexpRoot) -> bool {
for branch in root {
for qa in branch {
if qa.is_plus() || qa.is_star() {
return true;
}
if let Some(ref subtree) = qa.subtree {
if regexp_has_plus_star(subtree) {
return true;
}
}
}
}
false
}
pub fn make_regexp_nfa_arena(root: RegexpRoot) -> (StateArena, StateId, Arc<FieldMatcher>) {
let next_field = Arc::new(FieldMatcher::new());
let (mut arena, start) = if root.is_empty() {
let mut arena = StateArena::with_capacity(4);
let match_state = arena.alloc();
arena[match_state]
.field_transitions
.push(next_field.clone());
let vt_state = arena.alloc_with_table(ArenaSmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let closing_quote = arena.alloc_with_table(ArenaSmallTable::with_mappings(
StateId::NONE,
b"\"",
&[vt_state],
));
let start = arena.alloc_with_table(ArenaSmallTable::with_mappings(
StateId::NONE,
b"\"",
&[closing_quote],
));
(arena, start)
} else {
let mut arena = StateArena::with_capacity(16);
let match_state = arena.alloc();
arena[match_state]
.field_transitions
.push(next_field.clone());
let vt_state = arena.alloc_with_table(ArenaSmallTable::with_mappings(
StateId::NONE,
&[ARENA_VALUE_TERMINATOR],
&[match_state],
));
let next_step = arena.alloc_with_table(ArenaSmallTable::with_mappings(
StateId::NONE,
b"\"",
&[vt_state],
));
let branch_start = make_arena_nfa_from_branches(&root, &mut arena, next_step);
let start = arena.alloc_with_table(ArenaSmallTable::with_mappings(
StateId::NONE,
b"\"",
&[branch_start],
));
(arena, start)
};
arena.precompute_epsilon_closures();
(arena, start, next_field)
}
fn make_arena_nfa_from_branches(
root: &RegexpRoot,
arena: &mut StateArena,
next_step: StateId,
) -> StateId {
if root.is_empty() {
return next_step;
}
if root.len() == 1 {
return make_one_arena_branch_fa(&root[0], arena, next_step);
}
let mut branch_starts = Vec::with_capacity(root.len());
for branch in root {
if branch.is_empty() {
branch_starts.push(next_step);
} else {
let branch_start = make_one_arena_branch_fa(branch, arena, next_step);
branch_starts.push(branch_start);
}
}
let start = arena.alloc();
arena[start].table.epsilons = SmallVec::from_vec(branch_starts);
start
}
fn make_one_arena_branch_fa(
branch: &RegexpBranch,
arena: &mut StateArena,
next_step: StateId,
) -> StateId {
let mut current_next = next_step;
for qa in branch.iter().rev() {
let original_next = current_next;
if qa.is_plus() || qa.is_star() {
current_next = create_arena_plus_star_loop(qa, arena, original_next, qa.is_star());
} else if qa.is_qm() {
let atom_state = make_arena_atom_fa(qa, arena, current_next);
arena[atom_state].table.epsilons.push(original_next);
current_next = atom_state;
} else if qa.is_singleton() {
current_next = make_arena_atom_fa(qa, arena, current_next);
} else {
let n = qa.quant_min as usize;
let m = qa.quant_max as usize;
if n == 0 && m == 0 {
let epsilon_state = arena.alloc();
arena[epsilon_state].table.epsilons.push(current_next);
current_next = epsilon_state;
continue;
}
for _ in n..m {
let atom_state = make_arena_atom_fa(qa, arena, current_next);
arena[atom_state].table.epsilons.push(current_next);
current_next = atom_state;
}
for _ in 0..n {
current_next = make_arena_atom_fa(qa, arena, current_next);
}
}
}
current_next
}
fn create_arena_plus_star_loop(
qa: &QuantifiedAtom,
arena: &mut StateArena,
exit_state: StateId,
is_star: bool,
) -> StateId {
let loopback = arena.alloc();
let start = make_arena_atom_fa(qa, arena, loopback);
arena[loopback].table.epsilons = smallvec![exit_state, start];
if is_star {
arena[start].table.epsilons.push(exit_state);
}
let accel = qa.ascii_negated_bytes.as_ref().map(|bytes| {
let mut accel = crate::automaton::AccelInfo {
exit_bytes: [0; 3],
len: bytes.len() as u8,
};
for (i, &b) in bytes.iter().enumerate() {
accel.exit_bytes[i] = b;
}
accel
});
if let Some(accel) = accel {
arena[start].table.accel = Some(accel.clone());
arena[loopback].table.accel = Some(accel);
}
start
}
fn make_arena_atom_fa(qa: &QuantifiedAtom, arena: &mut StateArena, next: StateId) -> StateId {
if qa.is_dot {
make_arena_dot_fa(arena, next)
} else if let Some(ref subtree) = qa.subtree {
make_arena_nfa_from_branches(subtree, arena, next)
} else if let Some(ref cache_key) = qa.cache_key {
if cache_key == "wb_W" {
return make_nonword_char_fa(arena, next);
}
make_cached_rune_range_fa(cache_key, &qa.runes, arena, next)
} else {
make_arena_rune_range_fa(&qa.runes, arena, next)
}
}
#[allow(clippy::type_complexity)]
fn make_utf8_char_fa(
arena: &mut StateArena,
dest: StateId,
ascii_filter: Option<&dyn Fn(&mut [StateId; BYTE_CEILING])>,
) -> StateId {
let s_last = arena.alloc_with_table({
let mut table = ArenaSmallTable::new();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[0x80..0xC0].fill(dest);
table.pack(&unpacked);
table
});
let s_last_inter = arena.alloc_with_table({
let mut table = ArenaSmallTable::new();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[0x80..0xC0].fill(s_last);
table.pack(&unpacked);
table
});
let s_first_inter = arena.alloc_with_table({
let mut table = ArenaSmallTable::new();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[0x80..0xC0].fill(s_last_inter);
table.pack(&unpacked);
table
});
let target_e0 = arena.alloc_with_table({
let mut table = ArenaSmallTable::new();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[0xA0..0xC0].fill(s_last);
table.pack(&unpacked);
table
});
let target_ed = arena.alloc_with_table({
let mut table = ArenaSmallTable::new();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[0x80..0xA0].fill(s_last);
table.pack(&unpacked);
table
});
let target_f0 = arena.alloc_with_table({
let mut table = ArenaSmallTable::new();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[0x90..0xC0].fill(s_last_inter);
table.pack(&unpacked);
table
});
let target_f4 = arena.alloc_with_table({
let mut table = ArenaSmallTable::new();
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[0x80..0x90].fill(s_last_inter);
table.pack(&unpacked);
table
});
arena.alloc_with_table({
let mut unpacked = [StateId::NONE; BYTE_CEILING];
unpacked[..0x80].fill(dest);
if let Some(filter) = ascii_filter {
filter(&mut unpacked);
}
unpacked[0xC2..0xE0].fill(s_last);
unpacked[0xE0] = target_e0;
unpacked[0xE1..0xED].fill(s_last_inter);
unpacked[0xED] = target_ed;
unpacked[0xEE..0xF0].fill(s_last_inter);
unpacked[0xF0] = target_f0;
unpacked[0xF1..0xF4].fill(s_first_inter);
unpacked[0xF4] = target_f4;
let mut table = ArenaSmallTable::new();
table.pack(&unpacked);
table
})
}
fn make_arena_dot_fa(arena: &mut StateArena, dest: StateId) -> StateId {
make_utf8_char_fa(arena, dest, None)
}
pub(crate) fn make_nonword_char_fa(arena: &mut StateArena, dest: StateId) -> StateId {
make_utf8_char_fa(
arena,
dest,
Some(&|unpacked| {
for b in b'a'..=b'z' {
unpacked[b as usize] = StateId::NONE;
}
for b in b'A'..=b'Z' {
unpacked[b as usize] = StateId::NONE;
}
for b in b'0'..=b'9' {
unpacked[b as usize] = StateId::NONE;
}
unpacked[b'_' as usize] = StateId::NONE;
}),
)
}
struct CachedShell {
tables: Vec<ArenaSmallTable>,
root: u32,
}
thread_local! {
static FA_SHELL_CACHE: RefCell<HashMap<String, CachedShell>> = RefCell::new(HashMap::new());
}
fn build_shell(rr: &RuneRange) -> CachedShell {
let mut temp_arena = StateArena::with_capacity(16);
let placeholder = temp_arena.alloc();
let root_id = make_arena_rune_range_fa(rr, &mut temp_arena, placeholder);
let mut tables = Vec::with_capacity(temp_arena.len());
for i in 0..temp_arena.len() {
let id = StateId::from_index(i);
tables.push(temp_arena[id].table.clone());
}
CachedShell {
tables,
root: root_id.index() as u32,
}
}
fn instantiate_shell(shell: &CachedShell, arena: &mut StateArena, next: StateId) -> StateId {
let mut id_map: Vec<StateId> = Vec::with_capacity(shell.tables.len());
id_map.push(next);
for _ in 1..shell.tables.len() {
id_map.push(arena.alloc());
}
for (local_idx, src_table) in shell.tables.iter().enumerate() {
if local_idx == 0 {
continue;
}
let real_id = id_map[local_idx];
let mut table = src_table.clone();
for step in table.steps.iter_mut() {
if !step.is_none() {
*step = id_map[step.index()];
}
}
for eps in table.epsilons.iter_mut() {
if !eps.is_none() {
*eps = id_map[eps.index()];
}
}
if !table.default.is_none() {
table.default = id_map[table.default.index()];
}
arena[real_id].table = table;
}
id_map[shell.root as usize]
}
fn make_cached_rune_range_fa(
cache_key: &str,
rr: &RuneRange,
arena: &mut StateArena,
next: StateId,
) -> StateId {
FA_SHELL_CACHE.with(|cache| {
let mut cache = cache.borrow_mut();
if let Some(shell) = cache.get(cache_key) {
return instantiate_shell(shell, arena, next);
}
let shell = build_shell(rr);
let result = instantiate_shell(&shell, arena, next);
cache.insert(cache_key.to_string(), shell);
result
})
}
pub fn clear_fa_shell_cache() {
FA_SHELL_CACHE.with(|cache| {
cache.borrow_mut().clear();
});
}
struct ArenaRuneTreeEntry {
next: Option<StateId>,
child: Option<ArenaRuneTreeNode>,
}
type ArenaRuneTreeNode = Vec<Option<ArenaRuneTreeEntry>>;
fn new_arena_rune_tree_node() -> ArenaRuneTreeNode {
(0..BYTE_CEILING).map(|_| None).collect()
}
fn arena_nfa_from_rune_tree(arena: &mut StateArena, root: &ArenaRuneTreeNode) -> StateId {
arena_table_from_rune_tree_node(arena, root)
}
fn arena_table_from_rune_tree_node(arena: &mut StateArena, node: &ArenaRuneTreeNode) -> StateId {
let mut unpacked: [StateId; BYTE_CEILING] = [StateId::NONE; BYTE_CEILING];
for (b, entry_opt) in node.iter().enumerate() {
if let Some(entry) = entry_opt {
if let Some(next) = entry.next {
unpacked[b] = next;
} else if let Some(ref child) = entry.child {
let child_state = arena_table_from_rune_tree_node(arena, child);
unpacked[b] = child_state;
}
}
}
let mut table = ArenaSmallTable::new();
table.pack(&unpacked);
arena.alloc_with_table(table)
}
fn make_arena_rune_range_fa(rr: &RuneRange, arena: &mut StateArena, next: StateId) -> StateId {
let mut root = new_arena_rune_tree_node();
for pair in rr {
add_arena_rune_pair_tree_entry(&mut root, pair.lo, pair.hi, next);
}
arena_nfa_from_rune_tree(arena, &root)
}
fn add_arena_rune_pair_tree_entry(root: &mut ArenaRuneTreeNode, lo: char, hi: char, dest: StateId) {
let lo_u32 = lo as u32;
let hi_u32 = hi as u32;
let boundaries = [UTF8_1BYTE_MAX, UTF8_2BYTE_MAX, UTF8_3BYTE_MAX, u32::MAX];
let mut current = lo_u32;
for &boundary in &boundaries {
if current > hi_u32 {
break;
}
if boundary < current {
continue;
}
let segment_end = hi_u32.min(boundary);
if current <= SURROGATE_END && segment_end >= SURROGATE_START {
if current < SURROGATE_START {
let pre_end = (SURROGATE_START - 1).min(segment_end);
if let (Some(start), Some(end)) = (char::from_u32(current), char::from_u32(pre_end))
{
add_arena_utf8_range_to_tree(root, start, end, dest);
}
}
if segment_end > SURROGATE_END {
let post_start = (SURROGATE_END + 1).max(current);
if let (Some(start), Some(end)) =
(char::from_u32(post_start), char::from_u32(segment_end))
{
add_arena_utf8_range_to_tree(root, start, end, dest);
}
}
} else if let (Some(start), Some(end)) =
(char::from_u32(current), char::from_u32(segment_end))
{
add_arena_utf8_range_to_tree(root, start, end, dest);
}
current = segment_end + 1;
}
}
fn add_arena_utf8_range_to_tree(root: &mut ArenaRuneTreeNode, lo: char, hi: char, dest: StateId) {
let lo_bytes = rune_to_utf8(lo);
let hi_bytes = rune_to_utf8(hi);
debug_assert_eq!(lo_bytes.len(), hi_bytes.len());
add_arena_byte_range_recursive(root, &lo_bytes, &hi_bytes, 0, dest);
}
fn add_arena_byte_range_recursive(
node: &mut ArenaRuneTreeNode,
lo_bytes: &[u8],
hi_bytes: &[u8],
idx: usize,
dest: StateId,
) {
if idx >= lo_bytes.len() {
return;
}
let lo_byte = lo_bytes[idx];
let hi_byte = hi_bytes[idx];
let is_last = idx == lo_bytes.len() - 1;
if lo_byte == hi_byte {
ensure_arena_tree_entry(node, lo_byte);
let entry = node[lo_byte as usize].as_mut().unwrap();
if is_last {
entry.next = Some(dest);
} else {
if entry.child.is_none() {
entry.child = Some(new_arena_rune_tree_node());
}
add_arena_byte_range_recursive(
entry.child.as_mut().unwrap(),
lo_bytes,
hi_bytes,
idx + 1,
dest,
);
}
} else {
add_arena_lo_range_to_tree(node, lo_bytes, idx, dest);
if hi_byte > lo_byte + 1 {
add_arena_middle_range_to_tree(
node,
lo_byte + 1,
hi_byte - 1,
lo_bytes.len() - idx - 1,
dest,
);
}
add_arena_hi_range_to_tree(node, hi_bytes, idx, dest);
}
}
fn add_arena_lo_range_to_tree(
node: &mut ArenaRuneTreeNode,
lo_bytes: &[u8],
idx: usize,
dest: StateId,
) {
let lo_byte = lo_bytes[idx];
let is_last = idx == lo_bytes.len() - 1;
ensure_arena_tree_entry(node, lo_byte);
let entry = node[lo_byte as usize].as_mut().unwrap();
if is_last {
entry.next = Some(dest);
} else {
if entry.child.is_none() {
entry.child = Some(new_arena_rune_tree_node());
}
let child = entry.child.as_mut().unwrap();
let next_byte = lo_bytes[idx + 1];
add_arena_lo_range_to_tree(child, lo_bytes, idx + 1, dest);
if next_byte < 0xBF {
add_arena_middle_range_to_tree(
child,
next_byte + 1,
0xBF,
lo_bytes.len() - idx - 2,
dest,
);
}
}
}
fn add_arena_hi_range_to_tree(
node: &mut ArenaRuneTreeNode,
hi_bytes: &[u8],
idx: usize,
dest: StateId,
) {
let hi_byte = hi_bytes[idx];
let is_last = idx == hi_bytes.len() - 1;
ensure_arena_tree_entry(node, hi_byte);
let entry = node[hi_byte as usize].as_mut().unwrap();
if is_last {
entry.next = Some(dest);
} else {
if entry.child.is_none() {
entry.child = Some(new_arena_rune_tree_node());
}
let child = entry.child.as_mut().unwrap();
let next_byte = hi_bytes[idx + 1];
if next_byte > 0x80 {
add_arena_middle_range_to_tree(
child,
0x80,
next_byte - 1,
hi_bytes.len() - idx - 2,
dest,
);
}
add_arena_hi_range_to_tree(child, hi_bytes, idx + 1, dest);
}
}
fn add_arena_middle_range_to_tree(
node: &mut ArenaRuneTreeNode,
lo: u8,
hi: u8,
depth: usize,
dest: StateId,
) {
if depth == 0 {
for byte in lo..=hi {
ensure_arena_tree_entry(node, byte);
node[byte as usize].as_mut().unwrap().next = Some(dest);
}
} else {
for byte in lo..=hi {
ensure_arena_tree_entry(node, byte);
let entry = node[byte as usize].as_mut().unwrap();
if entry.child.is_none() {
entry.child = Some(new_arena_rune_tree_node());
}
add_arena_middle_range_to_tree(
entry.child.as_mut().unwrap(),
0x80,
0xBF,
depth - 1,
dest,
);
}
}
}
fn ensure_arena_tree_entry(node: &mut ArenaRuneTreeNode, byte: u8) {
let idx = byte as usize;
if node[idx].is_none() {
node[idx] = Some(ArenaRuneTreeEntry {
next: None,
child: None,
});
}
}