use std::borrow::Cow;
use tinyvec::TinyVec;
use crate::process::step::TransformStep;
use crate::process::string_pool::return_string_to_pool;
use super::build::{BOUNDARY_LEFT, BOUNDARY_RIGHT};
use super::encoding::{
DIRECT_BOUNDARY_MASK, DIRECT_BOUNDARY_SHIFT, DIRECT_PT_MASK, DIRECT_PT_SHIFT, DIRECT_RULE_BIT,
DIRECT_RULE_MASK,
};
use super::engine::{CHARWISE_DENSITY_THRESHOLD, text_non_ascii_density};
use super::pattern::PatternDispatch;
use super::state::{SIMPLE_MATCH_STATE, ScanContext, ScanState};
use super::{SimpleMatcher, SimpleResult};
/// Byte classification table used by the word-boundary check.
///
/// An entry is `1` when the byte counts as a "word" byte: ASCII letters and digits, the
/// underscore, and every byte with the high bit set (`>= 0x80`). All other entries are `0`.
/// The table is built at compile time.
static WORD_BYTE_LUT: [u8; 256] = {
    let mut table = [0u8; 256];
    let mut idx = 0usize;
    while idx < 256 {
        let byte = idx as u8;
        // Entries start out as 0, so only word bytes need to be written.
        if byte >= 0x80 || byte == b'_' || byte.is_ascii_alphanumeric() {
            table[idx] = 1;
        }
        idx += 1;
    }
    table
};
/// Checks whether the span `start..end` of `text` satisfies the requested word-boundary flags.
///
/// * `BOUNDARY_LEFT` — fails when both the byte before `start` and the byte at `start` are
///   word bytes (per `WORD_BYTE_LUT`), i.e. the span begins in the middle of a word.
/// * `BOUNDARY_RIGHT` — fails when both the byte before `end` and the byte at `end` are word
///   bytes, i.e. the span ends in the middle of a word.
///
/// The start and end of `text` always count as boundaries. Returns `true` when every
/// requested check passes (and trivially when `flags` requests nothing).
///
/// All neighbour reads are bounds-checked: a position that has no byte (for example
/// `start == text.len()`, `end == 0`, or offsets past the end of `text`) is treated as a
/// non-word byte, so degenerate spans pass the check instead of reading out of bounds.
#[inline(always)]
fn check_word_boundary(text: &[u8], start: usize, end: usize, flags: u8) -> bool {
    // A missing byte is "not a word byte"; the LUT index is a `u8`, so it is always in range.
    let is_word_at =
        |idx: usize| text.get(idx).is_some_and(|&byte| WORD_BYTE_LUT[byte as usize] != 0);

    // `start > 0` / `end > 0` keep the `- 1` from wrapping; short-circuiting guarantees the
    // subtraction is only evaluated after that guard.
    if flags & BOUNDARY_LEFT != 0 && start > 0 && is_word_at(start - 1) && is_word_at(start) {
        return false;
    }
    if flags & BOUNDARY_RIGHT != 0 && end > 0 && is_word_at(end - 1) && is_word_at(end) {
        return false;
    }
    true
}
impl SimpleMatcher {
/// Reports whether `text` contains at least one pattern hit that passes its boundary flags.
///
/// When the pattern set declares no boundary-constrained patterns the question is handed
/// straight to `self.scan.is_match`. Otherwise every hit reported by
/// `for_each_rule_idx_simple` is inspected: a hit counts when its boundary flags are zero
/// or when `check_word_boundary` accepts its span.
pub(super) fn is_match_simple(&self, text: &str) -> bool {
    if self.scan.patterns().has_boundary() {
        let bytes = text.as_bytes();
        let non_ascii = text_non_ascii_density(text);
        let mut found = false;
        self.scan
            .for_each_rule_idx_simple(text, non_ascii, |_rule, flags, from, to| {
                // The callback cannot stop the scan; once a hit has been accepted the
                // short-circuit simply skips further boundary checks.
                found = found || flags == 0 || check_word_boundary(bytes, from, to, flags);
            });
        found
    } else {
        self.scan.is_match(text)
    }
}
/// Scans `text` once and appends a result to `results` for each rule that is hit.
///
/// Hits whose boundary flags are non-zero are kept only when `check_word_boundary` accepts
/// their span. Accepted hits are forwarded to `self.rules.push_result_if_new` together with
/// the shared scan state.
pub(super) fn process_simple<'a>(&'a self, text: &'a str, results: &mut Vec<SimpleResult<'a>>) {
    // NOTE(review): dereferences the pointer returned by `SIMPLE_MATCH_STATE.get()`; this
    // presumably relies on the state being per-thread and not otherwise borrowed — confirm.
    let state = unsafe { &mut *SIMPLE_MATCH_STATE.get() };
    state.prepare(self.rules.len());
    let mut scan_state = state.as_scan_state();

    let bytes = text.as_bytes();
    let non_ascii = text_non_ascii_density(text);
    self.scan
        .for_each_rule_idx_simple(text, non_ascii, |rule_idx, flags, from, to| {
            if flags == 0 || check_word_boundary(bytes, from, to, flags) {
                self.rules.push_result_if_new(rule_idx, &mut scan_state, results);
            }
        });
}
/// Scans one text variant and routes every reported match value through `process_match`.
///
/// The density stored in `ctx` is passed on to the scanner, and the scanner's boolean
/// result is returned unchanged.
#[inline(always)]
fn scan_variant(&self, processed_text: &str, ctx: ScanContext, ss: &mut ScanState<'_>) -> bool {
    let bytes = processed_text.as_bytes();
    let density = ctx.non_ascii_density;
    self.scan
        .for_each_match_value(processed_text, density, |value, from, to| {
            self.process_match(value, bytes, from, to, ctx, ss)
        })
}
/// Handles one raw match value reported by the scanner for the span `start..end` of `text`.
///
/// * Values with `DIRECT_RULE_BIT` set carry their process-type index, boundary flags and
///   rule index packed inside the value. The hit is dropped (returning `false`) when the
///   process type is not enabled in `ctx.process_type_mask` or the boundary check fails;
///   otherwise the rule is marked positive and `ctx.exit_early` is returned.
/// * All other values are resolved through `dispatch_indirect` into one or several entries,
///   each of which is boundary-checked and then passed to `self.rules.process_entry`.
///
/// The returned `bool` is handed back to the scanner callback; presumably `true` asks the
/// scan to stop.
#[inline(always)]
fn process_match(
    &self,
    raw_value: u32,
    text: &[u8],
    start: usize,
    end: usize,
    ctx: ScanContext,
    ss: &mut ScanState<'_>,
) -> bool {
    // Zero flags mean "no boundary constraint"; otherwise the span must pass the check.
    let boundary_ok = |flags: u8| flags == 0 || check_word_boundary(text, start, end, flags);

    if raw_value & DIRECT_RULE_BIT == 0 {
        // Indirect value: look up the entry (or entries) it refers to.
        return match self.scan.patterns().dispatch_indirect(raw_value) {
            PatternDispatch::SingleEntry(entry) => {
                boundary_ok(entry.boundary) && self.rules.process_entry(entry, ctx, ss)
            }
            PatternDispatch::Entries(entries) => {
                for entry in entries {
                    // An entry failing its boundary check is skipped; the rest still run.
                    if boundary_ok(entry.boundary) && self.rules.process_entry(entry, ctx, ss) {
                        return true;
                    }
                }
                false
            }
        };
    }

    // Direct value: everything needed is encoded in `raw_value` itself.
    let pt_index = ((raw_value & DIRECT_PT_MASK) >> DIRECT_PT_SHIFT) as u8;
    let pt_enabled = ctx.process_type_mask & (1u64 << pt_index) != 0;
    if !pt_enabled {
        return false;
    }
    let flags = ((raw_value & DIRECT_BOUNDARY_MASK) >> DIRECT_BOUNDARY_SHIFT) as u8;
    if !boundary_ok(flags) {
        return false;
    }
    let rule_idx = (raw_value & DIRECT_RULE_MASK) as usize;
    // Count the rule only when `mark_positive` reports `true` for it.
    if ss.mark_positive(rule_idx) {
        ss.resolved_count += 1;
    }
    ctx.exit_early
}
/// Scans `text` and the transformed variants produced by walking `self.tree`, feeding every
/// match into the shared scan state.
///
/// The root text is scanned first (when the root's `pt_index_mask` is non-zero). The tree
/// is then visited in index order: each child's cached transform step is applied to its
/// parent's text and the result is scanned under the child's `pt_index_mask`.
///
/// # Arguments
/// * `text` - the untransformed input; scanned with `text_index` 0.
/// * `exit_early` - copied into every `ScanContext`; it also disables the "all rules
///   resolved" shortcut below.
/// * `results` - when `Some`, `self.rules.collect_matches` appends to it on every path that
///   finishes without a scan reporting `true`.
///
/// # Returns
/// `true` as soon as any scan reports `true`; otherwise `self.rules.has_match(&ss)`.
/// Returns `false` without scanning when the pattern set is empty.
///
/// # Panics
/// Panics if a non-root node has no cached transform step. `tree[0]` is indexed
/// unconditionally, so the tree is presumably never empty.
#[inline]
pub(super) fn walk_and_scan<'a>(
&'a self,
text: &'a str,
exit_early: bool,
results: Option<&mut Vec<SimpleResult<'a>>>,
) -> bool {
let tree = &self.tree;
let num_variants = tree.len();
// NOTE(review): dereferences the pointer returned by `SIMPLE_MATCH_STATE.get()`; this
// presumably relies on the state being per-thread and not otherwise borrowed — confirm.
let state = unsafe { &mut *SIMPLE_MATCH_STATE.get() };
state.prepare(self.rules.len());
let mut ss = state.as_scan_state();
// No patterns: nothing is scanned and the answer is `false`.
if self.scan.patterns().is_empty() {
return false;
}
let root_density = text_non_ascii_density(text);
// Scan the untransformed text only when the root's process-type mask is non-zero.
if tree[0].pt_index_mask != 0 {
let ctx = ScanContext {
text_index: 0,
process_type_mask: tree[0].pt_index_mask,
num_variants,
exit_early,
non_ascii_density: root_density,
};
if self.scan_variant(text, ctx, &mut ss) {
return true;
}
// Shortcut: not in early-exit mode, no NOT rules, and the resolved counter has reached
// the rule count — report now without visiting any transformed variant.
if !exit_early && !self.rules.has_not_rules() && ss.resolved_count >= self.rules.len() {
if let Some(results) = results {
self.rules.collect_matches(&ss, results);
}
return self.rules.has_match(&ss);
}
}
// The root has no children, so there are no transforms to apply.
if tree[0].children.is_empty() {
if let Some(results) = results {
self.rules.collect_matches(&ss, results);
}
return self.rules.has_match(&ss);
}
// `texts` holds the root text plus the owned output of every non-leaf transform that
// changed its input; `density_flags` is kept parallel to it (one density per text).
let mut texts: Vec<Cow<'_, str>> = Vec::with_capacity(num_variants);
texts.push(Cow::Borrowed(text));
let mut density_flags: TinyVec<[f32; 16]> = TinyVec::new();
density_flags.push(root_density);
// Per tree node: the index of its text in `texts` / `density_flags`.
let mut node_arena: TinyVec<[usize; 16]> = TinyVec::new();
node_arena.resize(num_variants, 0);
// Per tree node: the variant index used as `text_index` when scanning that node's text.
let mut node_variant: TinyVec<[usize; 16]> = TinyVec::new();
node_variant.resize(num_variants, 0);
// Next unused variant index; 0 belongs to the root text.
let mut variant_counter = 1usize;
let mut stopped = false;
// A node's `node_arena` / `node_variant` slots are written while its parent is processed,
// so this loop presumably relies on parents preceding their children in `tree` — confirm.
'walk: for node_idx in 0..tree.len() {
let num_children = tree[node_idx].children.len();
if num_children == 0 {
continue;
}
let parent_aidx = node_arena[node_idx];
let parent_vi = node_variant[node_idx];
for ci in 0..num_children {
let child_idx = tree[node_idx].children[ci];
let child = &tree[child_idx];
let step = child
.step
.expect("non-root process tree nodes always cache a transform step");
let is_leaf = child.children.is_empty();
let parent_density = density_flags[parent_aidx];
let parent_ascii = parent_density == 0.0;
if is_leaf {
// Leaf: no descendant needs the transformed text, so it is never stored in `texts`.
// A leaf with a zero mask is skipped entirely (no transform, no scan).
if child.pt_index_mask != 0 {
// The step is skipped when the parent has zero non-ASCII density and the step
// reports itself as a no-op on ASCII input.
let is_noop = parent_ascii && step.is_noop_on_ascii_input();
// The fused path is not used for no-op steps, nor when the scanner has a DFA and the
// parent density is at or below `CHARWISE_DENSITY_THRESHOLD`.
let use_fused = !(is_noop
|| self.scan.has_dfa() && parent_density <= CHARWISE_DENSITY_THRESHOLD);
let fused_result = if use_fused {
let parent_text = texts[parent_aidx].as_ref();
let vi = variant_counter;
let ctx = ScanContext {
text_index: vi,
process_type_mask: child.pt_index_mask,
num_variants,
exit_early,
non_ascii_density: parent_density,
};
// NOTE(review): boundary checks in the fused path index into the parent text's bytes;
// confirm that the offsets reported by `for_each_match_value_from_iter` are
// parent-text offsets.
let fused_text_bytes = parent_text.as_bytes();
// Feeds the step's `filter_bytes` output straight to the scanner instead of calling
// `step.apply`, and routes each reported value through `process_match`.
macro_rules! fused {
($m:expr) => {
Some(self.scan.for_each_match_value_from_iter(
$m.filter_bytes(parent_text),
ctx.non_ascii_density,
|v, start, end| {
self.process_match(
v,
fused_text_bytes,
start,
end,
ctx,
&mut ss,
)
},
))
};
}
// Step kinds not listed here yield `None` and fall back to the non-fused path below.
match step {
TransformStep::Delete(m) => fused!(m),
TransformStep::Normalize(m) => fused!(m),
TransformStep::VariantNorm(m) => fused!(m),
TransformStep::Romanize(m) | TransformStep::RomanizeChar(m) => {
fused!(m)
}
_ => None,
}
// The variant index `vi` is consumed only when a fused scan actually ran.
.inspect(|_| {
variant_counter += 1;
})
} else {
None
};
stopped = if let Some(result) = fused_result {
result
} else {
// Non-fused path: apply the step (unless it is a known no-op) and scan the result.
let changed = if !is_noop {
step.apply(texts[parent_aidx].as_ref(), parent_density)
} else {
None
};
if let Some((s, child_density)) = changed {
// The step produced a new string: scan it under a fresh variant index, then hand the
// string back to the pool right away.
let vi = variant_counter;
variant_counter += 1;
let ctx = ScanContext {
text_index: vi,
process_type_mask: child.pt_index_mask,
num_variants,
exit_early,
non_ascii_density: child_density,
};
let result = self.scan_variant(&s, ctx, &mut ss);
return_string_to_pool(s);
result
} else {
// No new string: scan the parent's text again under the parent's variant index,
// this time with the child's process-type mask.
let ctx = ScanContext {
text_index: parent_vi,
process_type_mask: child.pt_index_mask,
num_variants,
exit_early,
non_ascii_density: parent_density,
};
self.scan_variant(texts[parent_aidx].as_ref(), ctx, &mut ss)
}
};
if stopped {
break 'walk;
}
// Same "all rules resolved" shortcut as after the root scan.
if !exit_early
&& !self.rules.has_not_rules()
&& ss.resolved_count >= self.rules.len()
{
break 'walk;
}
}
} else {
// Non-leaf: the transformed text is stored in `texts` because this node's own children
// are transformed from it later in the walk.
let changed = step.apply(texts[parent_aidx].as_ref(), parent_density);
let (child_aidx, child_vi) = match changed {
Some((s, child_density)) => {
let idx = texts.len();
density_flags.push(child_density);
texts.push(Cow::Owned(s));
let vi = variant_counter;
variant_counter += 1;
(idx, vi)
}
// No new string: the child shares the parent's text slot and variant index.
None => (parent_aidx, parent_vi),
};
node_arena[child_idx] = child_aidx;
node_variant[child_idx] = child_vi;
if child.pt_index_mask != 0 {
let ctx = ScanContext {
text_index: child_vi,
process_type_mask: child.pt_index_mask,
num_variants,
exit_early,
non_ascii_density: density_flags[child_aidx],
};
stopped = self.scan_variant(texts[child_aidx].as_ref(), ctx, &mut ss);
if stopped {
break 'walk;
}
// Same "all rules resolved" shortcut as after the root scan.
if !exit_early
&& !self.rules.has_not_rules()
&& ss.resolved_count >= self.rules.len()
{
break 'walk;
}
}
}
}
}
// Hand every owned intermediate string back to the pool; the borrowed root is left alone.
for cow in texts {
if let Cow::Owned(s) = cow {
return_string_to_pool(s);
}
}
// A scan reported `true`: return immediately without collecting results.
if stopped {
return true;
}
if let Some(results) = results {
self.rules.collect_matches(&ss, results);
}
self.rules.has_match(&ss)
}
}