use std::borrow::Cow;
use crate::process::step::TransformStep;
use crate::process::string_pool::return_string_to_pool;
use crate::process::transform::simd::multibyte_density;
use super::rule::PatternDispatch;
use super::state::{SIMPLE_MATCH_STATE, ScanContext, SimpleMatchState};
use super::{SimpleMatcher, SimpleResult};
impl SimpleMatcher {
/// Reports whether `text` produces a match, by delegating to the scanner's `is_match`.
pub(super) fn is_match_simple(&self, text: &str) -> bool {
    let scanner = &self.scan;
    scanner.is_match(text)
}
/// Scans `text` once and records a result for every rule referenced by a matched pattern.
///
/// Each raw match value reported by the scanner is dispatched through the pattern table, and
/// every rule index it resolves to is handed to `push_result_if_new` together with the scratch
/// state and `results`. The callback always returns `false`, and the scanner's own return
/// value is discarded.
pub(super) fn process_simple<'a>(&'a self, text: &'a str, results: &mut Vec<SimpleResult<'a>>) {
    // NOTE(review): no SAFETY justification is visible in this file; presumably
    // SIMPLE_MATCH_STATE is scratch state with no other live reference at this point — confirm.
    let state = unsafe { &mut *SIMPLE_MATCH_STATE.get() };
    state.prepare(self.rules.len());

    // Pick the scan mode from the input's multibyte density.
    let density = multibyte_density(text.as_bytes());
    let bytewise = density < self.scan.charwise_density_threshold();

    let rules = &self.rules;
    let _ = self.scan.for_each_match_value(text, bytewise, |value| {
        match self.scan.patterns().dispatch(value) {
            PatternDispatch::Entries(group) => {
                for item in group {
                    rules.push_result_if_new(item.rule_idx as usize, state, results);
                }
            }
            PatternDispatch::SingleEntry(item) => {
                rules.push_result_if_new(item.rule_idx as usize, state, results);
            }
            PatternDispatch::DirectRule { rule_idx, .. } => {
                rules.push_result_if_new(rule_idx, state, results);
            }
        }
        // Never request a stop here.
        false
    });
}
/// Scans one already-materialized text variant.
///
/// Every raw match value reported by the scanner is forwarded to `process_match` with the
/// given `ctx` and `state`; the scanner's boolean result is returned unchanged.
#[inline(always)]
fn scan_variant(
    &self,
    processed_text: &str,
    ctx: ScanContext,
    state: &mut SimpleMatchState,
) -> bool {
    let bytewise = ctx.use_bytewise;
    let on_match = |value: u32| self.process_match(value, ctx, state);
    self.scan
        .for_each_match_value(processed_text, bytewise, on_match)
}
/// Handles a single raw match value for the variant described by `ctx`.
///
/// The value is dispatched through the pattern table:
/// - a direct rule is marked positive only when bit `pt_index` is set in
///   `ctx.process_type_mask`, in which case `ctx.exit_early` is returned;
/// - a single entry is delegated to `process_entry`;
/// - a group of entries is delegated one by one, stopping at the first `true`.
///
/// Returns `false` in every other case.
#[inline(always)]
fn process_match(
    &self,
    raw_value: u32,
    ctx: ScanContext,
    state: &mut SimpleMatchState,
) -> bool {
    match self.scan.patterns().dispatch(raw_value) {
        PatternDispatch::Entries(group) => group
            .into_iter()
            .any(|item| self.rules.process_entry(item, ctx, state)),
        PatternDispatch::SingleEntry(item) => self.rules.process_entry(item, ctx, state),
        PatternDispatch::DirectRule { rule_idx, pt_index } => {
            let enabled = (ctx.process_type_mask & (1u64 << pt_index)) != 0;
            if enabled {
                state.mark_positive(rule_idx);
                ctx.exit_early
            } else {
                false
            }
        }
    }
}
/// Walks the transform tree from the root and scans each text variant against the pattern set.
///
/// * `text` - the original input; it is scanned directly when the root node has a non-zero
///   `pt_index_mask`, and it seeds slot 0 of the text arena.
/// * `exit_early` - copied into every `ScanContext` built here.
/// * `results` - when `Some`, matches are gathered into it via `collect_matches` once the walk
///   finishes without any scan having returned `true`.
///
/// Returns `false` immediately when the pattern set is empty, `true` as soon as any scan
/// returns `true` (in that case `results` is left untouched), and otherwise
/// `self.rules.has_match(state)`.
///
/// # Panics
///
/// Panics if a non-root node has no cached transform step, and on the `tree[0]` index if the
/// tree has no root node.
#[inline(always)]
pub(super) fn walk_and_scan<'a>(
&'a self,
text: &'a str,
exit_early: bool,
results: Option<&mut Vec<SimpleResult<'a>>>,
) -> bool {
let tree = self.process.tree();
let num_variants = tree.len();
// NOTE(review): no SAFETY justification is visible in this file; presumably
// SIMPLE_MATCH_STATE is scratch state with no other live reference at this point — confirm.
let state = unsafe { &mut *SIMPLE_MATCH_STATE.get() };
state.prepare(self.rules.len());
// With no patterns nothing can match, so skip the walk entirely.
if self.scan.patterns().is_empty() {
return false;
}
let root_density = multibyte_density(text.as_bytes());
// Scan the untransformed input first, if the root node carries any process types.
if tree[0].pt_index_mask != 0 {
let ctx = ScanContext {
text_index: 0,
process_type_mask: tree[0].pt_index_mask,
num_variants,
exit_early,
use_bytewise: root_density < self.scan.charwise_density_threshold(),
};
if self.scan_variant(text, ctx, state) {
return true;
}
}
// Root-only tree: there are no transformed variants, so finish before allocating anything.
if tree[0].children.is_empty() {
if let Some(results) = results {
self.rules.collect_matches(state, results);
}
return self.rules.has_match(state);
}
// Arena of materialized texts: slot 0 borrows the input, later slots own transformed strings.
let mut texts: Vec<Cow<'_, str>> = Vec::with_capacity(num_variants);
texts.push(Cow::Borrowed(text));
// Multibyte density of each arena slot, kept parallel to `texts`.
let mut density_flags: Vec<f32> = Vec::with_capacity(num_variants);
density_flags.push(root_density);
// Per tree node: the arena slot holding its text, and the variant index used as `text_index`.
let mut node_arena: Vec<usize> = vec![0; num_variants];
let mut node_variant: Vec<usize> = vec![0; num_variants];
// Next unused variant index; index 0 is the root text.
let mut variant_counter = 1usize;
let mut stopped = false;
// A node's arena/variant entries are written while it is handled as a child and read when it
// is visited as a parent.
// NOTE(review): this relies on parents preceding their children in `tree` — confirm.
'walk: for node_idx in 0..tree.len() {
let num_children = tree[node_idx].children.len();
if num_children == 0 {
continue;
}
let parent_aidx = node_arena[node_idx];
let parent_vi = node_variant[node_idx];
for ci in 0..num_children {
let child_idx = tree[node_idx].children[ci];
let child = &tree[child_idx];
let step = child
.step
.expect("non-root process tree nodes always cache a transform step");
let is_leaf = child.children.is_empty();
let parent_density = density_flags[parent_aidx];
// Zero multibyte density is treated as "parent text is ASCII".
let parent_ascii = parent_density == 0.0;
let parent_use_bytewise = parent_density < self.scan.charwise_density_threshold();
if is_leaf {
// Leaf output is never stored in the arena; a leaf without process types is skipped.
if child.pt_index_mask != 0 {
let is_noop = parent_ascii && step.is_noop_on_ascii_input();
stopped = if is_noop {
// The step would leave the parent text unchanged: rescan the parent's text under the
// child's mask, reusing the parent's variant index.
let ctx = ScanContext {
text_index: parent_vi,
process_type_mask: child.pt_index_mask,
num_variants,
exit_early,
use_bytewise: parent_use_bytewise,
};
self.scan_variant(texts[parent_aidx].as_ref(), ctx, state)
} else {
// Otherwise allocate a fresh variant index and let `scan_variant_streaming` apply the
// step while scanning, without materializing the output.
let vi = variant_counter;
variant_counter += 1;
let child_use_bytewise = if parent_ascii {
true
} else {
step.output_use_bytewise(parent_use_bytewise)
};
let ctx = ScanContext {
text_index: vi,
process_type_mask: child.pt_index_mask,
num_variants,
exit_early,
use_bytewise: child_use_bytewise,
};
self.scan_variant_streaming(
step,
texts[parent_aidx].as_ref(),
ctx,
state,
)
};
if stopped {
break 'walk;
}
}
} else {
// Internal node: apply the step now so the node's own children can read its text.
let output = step.apply(texts[parent_aidx].as_ref(), parent_density);
let (child_aidx, child_vi) = match output.changed {
Some(s) => {
// The step produced a new string: give it its own arena slot and variant index.
let idx = texts.len();
density_flags.push(output.output_density);
texts.push(Cow::Owned(s));
let vi = variant_counter;
variant_counter += 1;
(idx, vi)
}
// Unchanged text: share the parent's arena slot and variant index.
None => (parent_aidx, parent_vi),
};
node_arena[child_idx] = child_aidx;
node_variant[child_idx] = child_vi;
if child.pt_index_mask != 0 {
let child_density = density_flags[child_aidx];
let ctx = ScanContext {
text_index: child_vi,
process_type_mask: child.pt_index_mask,
num_variants,
exit_early,
use_bytewise: child_density < self.scan.charwise_density_threshold(),
};
stopped = self.scan_variant(texts[child_aidx].as_ref(), ctx, state);
if stopped {
break 'walk;
}
}
}
}
}
// Hand every owned buffer back to the string pool; this runs on the early-stop path as well.
for cow in texts {
if let Cow::Owned(s) = cow {
return_string_to_pool(s);
}
}
if stopped {
return true;
}
if let Some(results) = results {
self.rules.collect_matches(state, results);
}
self.rules.has_match(state)
}
/// Scans the output of `step` applied to `parent_text` without materializing that output.
///
/// For `TransformStep::None` the parent text is scanned as-is via `scan_variant`. For every
/// other step the matcher's `byte_iter(parent_text)` is fed to the scanner's
/// `for_each_match_value_from_iter`, and each raw match value is forwarded to `process_match`.
/// The scanner's boolean result is returned unchanged.
#[inline(always)]
fn scan_variant_streaming(
    &self,
    step: &TransformStep,
    parent_text: &str,
    ctx: ScanContext,
    state: &mut SimpleMatchState,
) -> bool {
    let bytewise = ctx.use_bytewise;
    match step {
        TransformStep::None => self.scan_variant(parent_text, ctx, state),
        TransformStep::Fanjian(matcher) => {
            let stream = matcher.byte_iter(parent_text);
            self.scan
                .for_each_match_value_from_iter(stream, bytewise, |value| {
                    self.process_match(value, ctx, state)
                })
        }
        TransformStep::Delete(matcher) => {
            let stream = matcher.byte_iter(parent_text);
            self.scan
                .for_each_match_value_from_iter(stream, bytewise, |value| {
                    self.process_match(value, ctx, state)
                })
        }
        TransformStep::Normalize(matcher) => {
            let stream = matcher.byte_iter(parent_text);
            self.scan
                .for_each_match_value_from_iter(stream, bytewise, |value| {
                    self.process_match(value, ctx, state)
                })
        }
        TransformStep::PinYin(matcher) | TransformStep::PinYinChar(matcher) => {
            let stream = matcher.byte_iter(parent_text);
            self.scan
                .for_each_match_value_from_iter(stream, bytewise, |value| {
                    self.process_match(value, ctx, state)
                })
        }
    }
}
}