use std::borrow::Cow;
use serde::Serialize;
use crate::process::{ProcessTypeBitNode, return_processed_string_to_pool, walk_process_tree};
mod construction;
mod scan;
mod types;
use types::{
AsciiMatcher, NonAsciiMatcher, PatternEntry, RuleCold, RuleHot, SIMPLE_MATCH_STATE, ScanContext,
};
pub use types::{SimpleTable, SimpleTableSerde};
/// A single entry in the result list returned by [`SimpleMatcher::process`]
/// (and appended by [`SimpleMatcher::process_into`]).
///
/// The lifetime `'a` lets `word` borrow instead of allocating; it is the same
/// lifetime that `process` ties to both the matcher and the input text.
#[derive(Serialize, Debug)]
pub struct SimpleResult<'a> {
/// Numeric identifier attached to this result.
/// NOTE(review): presumably the id of the matched word/rule in the table the
/// matcher was built from — confirm against the construction code.
pub word_id: u32,
/// Text attached to this result; either borrowed (`Cow::Borrowed`) or owned
/// (`Cow::Owned`), so callers should not assume an allocation took place.
pub word: Cow<'a, str>,
}
/// Compiled matcher state queried through `is_match`, `process` and
/// `process_into`.
///
/// The field notes below describe only how each field is used by the methods
/// in this file; how the fields are populated lives in the `construction`
/// module.
#[derive(Clone)]
pub struct SimpleMatcher {
// Handed to `walk_process_tree` to produce the text variants that get
// scanned; its length is passed to the scan as `num_variants`.
process_type_tree: Vec<ProcessTypeBitNode>,
// Scanner used by `is_match_simple` for pure-ASCII text, and as the fallback
// for non-ASCII text when `non_ascii_matcher` is `None`.
ascii_matcher: Option<AsciiMatcher>,
// Scanner used by `is_match_simple` for text containing non-ASCII bytes.
non_ascii_matcher: Option<NonAsciiMatcher>,
// When `Some`, `is_match` selects the `SINGLE_PT = true` specialization of
// `is_match_inner`. The stored value itself is not read in this file.
single_pt_index: Option<u8>,
// Not read in this file.
// NOTE(review): presumably deduplicated automaton pattern entries — confirm in `scan`.
ac_dedup_entries: Vec<PatternEntry>,
// Not read in this file.
// NOTE(review): presumably (start, len) or (start, end) slices into
// `ac_dedup_entries` — confirm in `scan`.
ac_dedup_ranges: Vec<(usize, usize)>,
// Per-rule data; its length sizes the scratch state in `is_match_inner`
// (`state.prepare(self.rule_hot.len())`).
rule_hot: Vec<RuleHot>,
// Not read in this file.
// NOTE(review): presumably rarely-touched per-rule data parallel to `rule_hot` — confirm.
rule_cold: Vec<RuleCold>,
// When true, `is_match` bypasses the process-tree walk and answers via
// `is_match_simple`.
all_simple: bool,
}
impl SimpleMatcher {
/// Reports whether `text` is matched by this matcher.
///
/// Dispatch, in priority order:
/// 1. empty `text` never matches;
/// 2. if `all_simple` is set, the answer comes from `is_match_simple`;
/// 3. otherwise `is_match_inner` runs, specialized with `SINGLE_PT = true`
///    when `single_pt_index` is populated and `false` when it is not.
pub fn is_match(&self, text: &str) -> bool {
    match (text.is_empty(), self.all_simple, self.single_pt_index) {
        (true, _, _) => false,
        (false, true, _) => self.is_match_simple(text),
        (false, false, Some(_)) => self.is_match_inner::<true>(text),
        (false, false, None) => self.is_match_inner::<false>(text),
    }
}
fn is_match_simple(&self, text: &str) -> bool {
if text.is_ascii() {
if let Some(ref m) = self.ascii_matcher {
return match m {
#[cfg(feature = "dfa")]
AsciiMatcher::AcDfa { matcher, .. } => matcher.is_match(text),
AsciiMatcher::DaacBytewise(d) => d.find_iter(text).next().is_some(),
};
}
} else if let Some(ref m) = self.non_ascii_matcher {
return match m {
NonAsciiMatcher::DaacCharwise(d) => d.find_iter(text).next().is_some(),
};
} else if let Some(ref m) = self.ascii_matcher {
return match m {
#[cfg(feature = "dfa")]
AsciiMatcher::AcDfa { matcher, .. } => matcher.is_match(text),
AsciiMatcher::DaacBytewise(d) => d.find_iter(text).next().is_some(),
};
}
false
}
/// General `is_match` path: walks the process-type tree, scanning each text
/// variant it yields, and then decides from the accumulated per-rule state.
///
/// `SINGLE_PT` is forwarded unchanged to `scan_variant`; `is_match` sets it
/// to `true` exactly when `single_pt_index` is `Some`.
///
/// Returns `true` if the walk reports that it stopped early, or if any rule
/// touched during the scan has its positive generation equal to the current
/// generation while its "not" generation differs from it. The buffers
/// returned by the walk are handed back to the pool before returning.
#[inline(always)]
fn is_match_inner<const SINGLE_PT: bool>(&self, text: &str) -> bool {
    let nodes = &self.process_type_tree;
    let variant_count = nodes.len();

    // Shared scratch state, sized for the number of rules before scanning.
    let mut state = SIMPLE_MATCH_STATE.borrow_mut();
    state.prepare(self.rule_hot.len());

    let (text_masks, stopped) = walk_process_tree::<true, _>(
        nodes,
        text,
        &mut |variant, variant_idx, pt_mask, is_ascii| {
            self.scan_variant::<SINGLE_PT>(
                variant,
                ScanContext {
                    text_index: variant_idx,
                    process_type_mask: pt_mask,
                    num_variants: variant_count,
                    exit_early: true,
                    is_ascii,
                },
                &mut state,
            )
        },
    );

    // An early stop is already a positive answer; the rule states are only
    // inspected when the walk ran to completion.
    let matched = stopped || {
        let current = state.generation;
        state.touched_indices.iter().any(|&rule_idx| {
            let rule_state = &state.word_states[rule_idx];
            rule_state.positive_generation == current && rule_state.not_generation != current
        })
    };

    return_processed_string_to_pool(text_masks);
    matched
}
/// Collects the results for `text` into a freshly allocated vector.
///
/// Thin convenience wrapper over [`Self::process_into`]; use that method
/// directly to reuse an existing buffer. The returned results may borrow
/// from both the matcher and `text` (lifetime `'a`).
pub fn process<'a>(&'a self, text: &'a str) -> Vec<SimpleResult<'a>> {
    let mut collected: Vec<SimpleResult<'a>> = Vec::new();
    self.process_into(text, &mut collected);
    collected
}
/// Appends the results for `text` to `results`.
///
/// Empty `text` leaves `results` untouched. Otherwise the process-type tree
/// is walked over `text` with a callback that always returns `false`, the
/// resulting variants are passed to `process_preprocessed_into`, and the
/// variant buffers are then handed back to the pool.
pub fn process_into<'a>(&'a self, text: &'a str, results: &mut Vec<SimpleResult<'a>>) {
    if !text.is_empty() {
        // The second tuple element (the walk's "stopped" flag) is ignored here.
        let (variants, _) = walk_process_tree::<false, _>(
            &self.process_type_tree,
            text,
            &mut |_, _, _, _| false,
        );
        self.process_preprocessed_into(&variants, results);
        return_processed_string_to_pool(variants);
    }
}
}