use std::{borrow::Cow, vec};
use super::{
SimpleMatcher, SimpleResult,
build::{BOUNDARY_LEFT, BOUNDARY_RIGHT},
pattern::{DIRECT_RULE_BIT, PatternDispatch, decode_direct},
scan::{CHARWISE_DENSITY_THRESHOLD, text_non_ascii_density},
state::{SIMPLE_MATCH_STATE, ScanContext, ScanState},
tree::ProcessTypeBitNode,
};
/// Byte-indexed lookup table used by the word-boundary check.
///
/// An entry is `1` when the byte is an ASCII letter or digit, an underscore,
/// or any byte `>= 0x80`; every other entry is `0`.
static WORD_BYTE_LUT: [u8; 256] = {
    /// Compile-time predicate deciding whether `byte` counts as a word byte.
    const fn is_word_byte(byte: u8) -> bool {
        matches!(
            byte,
            b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_' | 0x80..=0xFF
        )
    }

    let mut table = [0u8; 256];
    let mut idx = 0usize;
    // `for` loops are not usable in a static initializer, hence the manual loop.
    while idx < 256 {
        if is_word_byte(idx as u8) {
            table[idx] = 1;
        }
        idx += 1;
    }
    table
};
/// Checks the word-boundary constraints selected by `flags` for the match
/// span `start..end` inside `text`.
///
/// * With `BOUNDARY_LEFT` set and `start > 0`, the check fails when both the
///   byte before the match and the first byte of the match are word bytes
///   according to `WORD_BYTE_LUT`.
/// * With `BOUNDARY_RIGHT` set and `end < text.len()`, the check fails when
///   both the last byte of the match and the byte after it are word bytes.
///
/// A side that touches the corresponding end of `text` is never rejected.
/// Returns `true` when no requested side fails.
#[inline(always)]
fn check_word_boundary(text: &[u8], start: usize, end: usize, flags: u8) -> bool {
    let is_word = |byte: u8| WORD_BYTE_LUT[byte as usize] != 0;

    let left_requested = flags & BOUNDARY_LEFT != 0;
    if left_requested && start > 0 {
        // SAFETY: relies on the caller passing a span whose `start` lies inside
        // `text`. NOTE(review): this function cannot verify that — confirm the
        // scan engine never reports a match starting at `text.len()`.
        unsafe { core::hint::assert_unchecked(start < text.len()) };
        let (before, first) = (text[start - 1], text[start]);
        if is_word(before) && is_word(first) {
            return false;
        }
    }

    let right_requested = flags & BOUNDARY_RIGHT != 0;
    if right_requested && end < text.len() {
        // SAFETY: relies on the caller passing a non-empty span so that
        // `end >= 1`. NOTE(review): confirm empty matches are never reported.
        unsafe { core::hint::assert_unchecked(end >= 1) };
        let (last, after) = (text[end - 1], text[end]);
        if is_word(last) && is_word(after) {
            return false;
        }
    }

    true
}
/// Computes the process-type mask that a single scan of the text belonging to
/// `tree[node_idx]` has to cover.
///
/// The result always contains the node's own `pt_index_mask`. When
/// `parent_ascii` is `true`, the masks of all descendants reachable through
/// an unbroken chain of steps reporting `is_noop_on_ascii_input()` are OR-ed
/// in as well; the tree walk skips scanning such nodes on ASCII text, so
/// their masks have to be covered here.
///
/// A no-op child is traversed even when its own mask is empty. Otherwise a
/// no-op intermediate node with no process types of its own would hide its
/// no-op descendants' masks, and those descendants would never be scanned
/// because the walk skips them as no-ops too.
///
/// # Parameters
/// * `tree` - the process-type tree, indexed by node id.
/// * `node_idx` - index of the node whose scan mask is being computed.
/// * `parent_ascii` - whether the text scanned for this node is pure ASCII.
fn fold_noop_children_masks(
    tree: &[ProcessTypeBitNode],
    node_idx: usize,
    parent_ascii: bool,
) -> u64 {
    let mut mask = tree[node_idx].pt_index_mask;
    // On non-ASCII text every child is scanned on its own; nothing to fold.
    if !parent_ascii {
        return mask;
    }
    for &ci in &tree[node_idx].children {
        let child = &tree[ci];
        // Follow every no-op child regardless of its own mask: it contributes
        // its own bits (possibly none) plus those of its no-op descendants.
        if child.step.is_some_and(|s| s.is_noop_on_ascii_input()) {
            mask |= fold_noop_children_masks(tree, ci, true);
        }
    }
    mask
}
impl SimpleMatcher {
/// Scans one text variant and routes every raw match value reported by the
/// scan engine through [`Self::process_match`].
///
/// `ctx` carries the per-variant settings and `ss` the mutable scan state.
/// The return value is whatever `for_each_match_value` returns; callers in
/// this file treat `true` as a signal to stop scanning.
#[inline(always)]
fn scan_variant(&self, processed_text: &str, ctx: ScanContext, ss: &mut ScanState<'_>) -> bool {
    let haystack = processed_text.as_bytes();
    let density = ctx.non_ascii_density;
    self.scan
        .for_each_match_value(processed_text, density, |value, from, to| {
            self.process_match(value, haystack, from, to, ctx, ss)
        })
}
/// Handles a single raw match value reported for the span `start..end` of
/// `text`.
///
/// Values with `DIRECT_RULE_BIT` set are decoded in place via
/// `decode_direct`; all other values are resolved through the pattern
/// table's `dispatch_indirect`, which yields either one entry or a list of
/// entries. An entry is evaluated only when its process type is enabled in
/// `ctx.process_type_mask` and its word-boundary flags (if any) are
/// satisfied.
///
/// Returns the result of `eval_hit` for a direct or single entry. For a list
/// of entries, returns `true` as soon as one evaluation returns `true`, and
/// `false` otherwise.
#[inline(always)]
fn process_match(
    &self,
    raw_value: u32,
    text: &[u8],
    start: usize,
    end: usize,
    ctx: ScanContext,
    ss: &mut ScanState<'_>,
) -> bool {
    // A zero flag byte means "no boundary constraint".
    let boundary_ok = |flags: u8| flags == 0 || check_word_boundary(text, start, end, flags);

    if raw_value & DIRECT_RULE_BIT != 0 {
        let (pt_index, boundary, kind, offset, rule_idx) = decode_direct(raw_value);
        let type_enabled = ctx.process_type_mask & (1u64 << pt_index) != 0;
        return type_enabled
            && boundary_ok(boundary)
            && self.rules.eval_hit(rule_idx, kind, offset, ctx, ss);
    }

    match self.scan.patterns().dispatch_indirect(raw_value) {
        PatternDispatch::SingleEntry(entry) => {
            boundary_ok(entry.boundary)
                && (ctx.process_type_mask & (1u64 << entry.pt_index) != 0)
                && self.rules.eval_hit(
                    entry.rule_idx as usize,
                    entry.kind,
                    entry.offset as usize,
                    ctx,
                    ss,
                )
        }
        PatternDispatch::Entries(entries) => {
            for entry in entries {
                let hit = boundary_ok(entry.boundary)
                    && (ctx.process_type_mask & (1u64 << entry.pt_index) != 0)
                    && self.rules.eval_hit(
                        entry.rule_idx as usize,
                        entry.kind,
                        entry.offset as usize,
                        ctx,
                        ss,
                    );
                if hit {
                    return true;
                }
            }
            false
        }
    }
}
/// Runs the full tree walk over `text` and, when `results` is provided,
/// appends the collected matches to it once the walk completes.
///
/// This is a thin wrapper around [`Self::walk_and_scan_with`] that returns
/// only the boolean half of its result. The collection step is not run when
/// the walk is cut short.
#[inline]
pub(super) fn walk_and_scan<'a>(
    &'a self,
    text: &'a str,
    exit_early: bool,
    results: Option<&mut Vec<SimpleResult<'a>>>,
) -> bool {
    let (outcome, _) = self.walk_and_scan_with(text, exit_early, |rule_set, scan_state| {
        if let Some(sink) = results {
            rule_set.collect_matches(scan_state, sink);
        }
    });
    outcome
}
/// Walks the process-type tree, derives the text variant for each node, and
/// scans each variant with the process types that apply to it.
///
/// # Parameters
/// * `text` - the input text; it is the variant of the root node.
/// * `exit_early` - forwarded unchanged into every [`ScanContext`].
/// * `collect` - invoked exactly once with the rule set and the final scan
///   state when the walk runs to completion.
///
/// # Returns
/// * `(true, None)` as soon as any scan returns `true`; `collect` is not
///   called in that case.
/// * `(ss.has_match(), Some(r))` otherwise, where `r` is the value returned
///   by `collect`.
///
/// # ASCII no-op folding
/// When a node's text is pure ASCII (density `0.0`), children whose step
/// reports `is_noop_on_ascii_input()` are not scanned separately. Their masks
/// are merged into the scan of the nearest scanned ancestor by
/// `fold_noop_children_masks`. For both the root and non-leaf children the
/// decision to scan is taken on the *folded* mask, so a node with no process
/// types of its own is still scanned on behalf of its folded children.
#[inline]
pub(super) fn walk_and_scan_with<'a, F, R>(
    &'a self,
    text: &'a str,
    exit_early: bool,
    collect: F,
) -> (bool, Option<R>)
where
    F: FnOnce(&'a super::rule::RuleSet, &ScanState<'_>) -> R,
{
    let tree = &self.tree;
    let num_variants = tree.len();
    // SAFETY: NOTE(review) — soundness depends on `SIMPLE_MATCH_STATE` never
    // being aliased while this reference is live (e.g. thread-local storage
    // and no re-entrancy). That is not visible from here; confirm at the
    // definition site.
    let state = unsafe { &mut *SIMPLE_MATCH_STATE.get() };
    state.prepare(self.rules.len());
    let mut ss = state.as_scan_state();
    let mut collect = Some(collect);

    // Root scan: the original text, plus any no-op children when it is ASCII.
    let root_density = text_non_ascii_density(text);
    let root_scan_mask = fold_noop_children_masks(tree, 0, root_density == 0.0);
    if root_scan_mask != 0 {
        let ctx = ScanContext {
            text_index: 0,
            process_type_mask: root_scan_mask,
            num_variants,
            exit_early,
            non_ascii_density: root_density,
        };
        if self.scan_variant(text, ctx, &mut ss) {
            return (true, None);
        }
    }

    // No transformed variants at all: finish right after the root scan.
    if tree[0].children.is_empty() {
        let r = collect.take().map(|f| f(&self.rules, &ss));
        return (ss.has_match(), r);
    }

    // `texts` / `density_flags` form an arena of materialized variants;
    // `node_arena[n]` is the arena slot holding node `n`'s text and
    // `node_variant[n]` is the variant index reported for that text.
    let mut texts: Vec<Cow<'_, str>> = Vec::with_capacity(num_variants);
    texts.push(Cow::Borrowed(text));
    let mut density_flags: Vec<f32> = Vec::with_capacity(num_variants);
    density_flags.push(root_density);
    let mut node_arena: Vec<usize> = vec![0; num_variants];
    let mut node_variant: Vec<usize> = vec![0; num_variants];
    let mut variant_counter = 1usize;
    let mut stopped = false;

    // Index order is relied upon: a parent's arena slot must already be
    // recorded when its children are visited.
    // NOTE(review): assumes the tree stores parents before children — confirm
    // against the tree builder.
    'walk: for node_idx in 0..tree.len() {
        let num_children = tree[node_idx].children.len();
        if num_children == 0 {
            continue;
        }
        let parent_aidx = node_arena[node_idx];
        let parent_vi = node_variant[node_idx];
        for ci in 0..num_children {
            let child_idx = tree[node_idx].children[ci];
            let child = &tree[child_idx];
            let Some(step) = child.step else {
                unreachable!()
            };
            let is_leaf = child.children.is_empty();
            let parent_density = density_flags[parent_aidx];
            let parent_ascii = parent_density == 0.0;
            // A no-op child sees exactly the parent's text; its mask was
            // already folded into an ancestor's scan.
            let is_noop = parent_ascii && step.is_noop_on_ascii_input();
            if is_leaf {
                if child.pt_index_mask != 0 {
                    if is_noop {
                        continue;
                    }
                    // Leaves have no children that need a stored text, so the
                    // step may be applied on the fly while scanning ("fused")
                    // instead of materializing a new string.
                    let use_fused = !(cfg!(feature = "dfa")
                        && parent_density <= CHARWISE_DENSITY_THRESHOLD);
                    let fused_result = if use_fused {
                        let parent_text = texts[parent_aidx].as_ref();
                        step.filter_bytes(parent_text).map(|iter| {
                            let vi = variant_counter;
                            let ctx = ScanContext {
                                text_index: vi,
                                process_type_mask: child.pt_index_mask,
                                num_variants,
                                exit_early,
                                non_ascii_density: parent_density,
                            };
                            let fused_text_bytes = parent_text.as_bytes();
                            variant_counter += 1;
                            self.scan.for_each_match_value_from_iter(
                                iter,
                                ctx.non_ascii_density,
                                |v, start, end| {
                                    self.process_match(
                                        v,
                                        fused_text_bytes,
                                        start,
                                        end,
                                        ctx,
                                        &mut ss,
                                    )
                                },
                            )
                        })
                    } else {
                        None
                    };
                    stopped = if let Some(result) = fused_result {
                        result
                    } else {
                        let changed = step.apply(texts[parent_aidx].as_ref(), parent_density);
                        if let Some((s, child_density)) = changed {
                            // The step produced a new text: scan it as a new variant.
                            let vi = variant_counter;
                            variant_counter += 1;
                            let ctx = ScanContext {
                                text_index: vi,
                                process_type_mask: child.pt_index_mask,
                                num_variants,
                                exit_early,
                                non_ascii_density: child_density,
                            };
                            self.scan_variant(&s, ctx, &mut ss)
                        } else {
                            // No new text: scan the parent's text under the
                            // parent's variant index with the child's mask.
                            let ctx = ScanContext {
                                text_index: parent_vi,
                                process_type_mask: child.pt_index_mask,
                                num_variants,
                                exit_early,
                                non_ascii_density: parent_density,
                            };
                            self.scan_variant(texts[parent_aidx].as_ref(), ctx, &mut ss)
                        }
                    };
                    if stopped {
                        break 'walk;
                    }
                }
            } else {
                // Non-leaf: the text must be kept so that children can build on it.
                let changed = step.apply(texts[parent_aidx].as_ref(), parent_density);
                let (child_aidx, child_vi) = match changed {
                    Some((s, child_density)) => {
                        let idx = texts.len();
                        density_flags.push(child_density);
                        texts.push(Cow::Owned(s));
                        let vi = variant_counter;
                        variant_counter += 1;
                        (idx, vi)
                    }
                    // No new text: share the parent's arena slot and variant index.
                    None => (parent_aidx, parent_vi),
                };
                node_arena[child_idx] = child_aidx;
                node_variant[child_idx] = child_vi;
                if !is_noop {
                    let child_ascii = density_flags[child_aidx] == 0.0;
                    let scan_mask = fold_noop_children_masks(tree, child_idx, child_ascii);
                    // Gate on the folded mask, as the root scan does: even if
                    // this node has no process types of its own, its no-op
                    // children are skipped later and rely on this scan.
                    if scan_mask != 0 {
                        let ctx = ScanContext {
                            text_index: child_vi,
                            process_type_mask: scan_mask,
                            num_variants,
                            exit_early,
                            non_ascii_density: density_flags[child_aidx],
                        };
                        stopped = self.scan_variant(texts[child_aidx].as_ref(), ctx, &mut ss);
                        if stopped {
                            break 'walk;
                        }
                    }
                }
            }
        }
    }
    if stopped {
        return (true, None);
    }
    let r = collect.take().map(|f| f(&self.rules, &ss));
    (ss.has_match(), r)
}
}