use std::{
borrow::Cow,
collections::{HashMap, HashSet},
};
use foldhash::HashMapExt;
type FoldHashMap<K, V> = HashMap<K, V, foldhash::fast::FixedState>;
use super::{
SimpleMatcher,
pattern::{BITMASK_CAPACITY, PROCESS_TYPE_TABLE_SIZE, PatternEntry, PatternKind},
rule::{Rule, RuleInfo, RuleSet, SatisfactionMethod},
scan::ScanPlan,
tree::build_process_type_tree,
};
use crate::{
MatcherError,
process::{ProcessType, reduce_text_process_emit},
};
pub(super) struct ParsedRules<'a> {
pub(super) dedup_patterns: Vec<Cow<'a, str>>,
pub(super) dedup_entries: Vec<Vec<PatternEntry>>,
pub(super) rules: RuleSet,
}
impl SimpleMatcher {
pub fn new<'a, I, S1, S2>(
process_type_word_map: &'a HashMap<ProcessType, HashMap<u32, I, S1>, S2>,
) -> Result<SimpleMatcher, MatcherError>
where
I: AsRef<str> + 'a,
{
for &pt in process_type_word_map.keys() {
if (pt.bits() as usize) >= PROCESS_TYPE_TABLE_SIZE {
return Err(MatcherError::invalid_process_type(pt.bits()));
}
}
let pt_index_table = Self::build_pt_index_table(process_type_word_map.keys().copied());
let process_type_set: HashSet<ProcessType> =
process_type_word_map.keys().copied().collect();
let parsed = Self::parse_rules(process_type_word_map, &pt_index_table);
if parsed.dedup_patterns.is_empty() {
return Err(MatcherError::EmptyPatterns);
}
let process_type_tree = build_process_type_tree(&process_type_set, &pt_index_table);
let scan = ScanPlan::compile(
&parsed.dedup_patterns,
parsed.dedup_entries,
parsed.rules.rule_info(),
)?;
let is_match_fast = process_type_tree[0].children.is_empty()
&& scan.patterns().all_single_and(parsed.rules.rule_info())
&& !scan.patterns().has_boundary();
Ok(SimpleMatcher {
tree: process_type_tree,
scan,
rules: parsed.rules,
is_match_fast,
})
}
fn build_pt_index_table(
process_type_keys: impl Iterator<Item = ProcessType>,
) -> [u8; PROCESS_TYPE_TABLE_SIZE] {
let mut pt_index_table = [u8::MAX; PROCESS_TYPE_TABLE_SIZE];
let mut next_pt_idx: u8 = 0;
pt_index_table[ProcessType::None.bits() as usize] = next_pt_idx;
next_pt_idx += 1;
for pt in process_type_keys {
let bits = pt.bits() as usize;
if bits < PROCESS_TYPE_TABLE_SIZE && pt_index_table[bits] == u8::MAX {
pt_index_table[bits] = next_pt_idx;
next_pt_idx += 1;
}
}
pt_index_table
}
#[optimize(speed)]
fn parse_rules<'a, I, S1, S2>(
process_type_word_map: &'a HashMap<ProcessType, HashMap<u32, I, S1>, S2>,
pt_index_table: &[u8; PROCESS_TYPE_TABLE_SIZE],
) -> ParsedRules<'a>
where
I: AsRef<str> + 'a,
{
let word_size: usize = process_type_word_map.values().map(|map| map.len()).sum();
let mut dedup_entries: Vec<Vec<PatternEntry>> = Vec::with_capacity(word_size);
let mut rules: Vec<Rule> = Vec::with_capacity(word_size);
let mut rule_infos: Vec<RuleInfo> = Vec::with_capacity(word_size);
let mut word_id_to_idx: FoldHashMap<(ProcessType, u32), usize> =
FoldHashMap::with_capacity(word_size);
let mut next_pattern_id: usize = 0;
let mut dedup_patterns = Vec::with_capacity(word_size);
let mut pattern_id_map: FoldHashMap<Cow<'_, str>, usize> =
FoldHashMap::with_capacity(word_size);
let mut and_splits: FoldHashMap<&str, i32> = FoldHashMap::new();
let mut not_splits: FoldHashMap<&str, i32> = FoldHashMap::new();
for (&process_type, simple_word_map) in process_type_word_map {
let word_process_type = process_type - ProcessType::Delete;
for (&simple_word_id, simple_word) in simple_word_map {
if simple_word.as_ref().is_empty() {
continue;
}
and_splits.clear();
not_splits.clear();
let mut start = 0;
let mut current_is_not = false;
let mut add_sub_word = |word: &'a str, is_not: bool| {
if word.is_empty() {
return;
}
if is_not {
let entry = not_splits.entry(word).or_insert(1);
*entry -= 1;
} else {
let entry = and_splits.entry(word).or_insert(0);
*entry += 1;
}
};
for (index, marker) in simple_word.as_ref().match_indices(['&', '~']) {
add_sub_word(&simple_word.as_ref()[start..index], current_is_not);
current_is_not = marker == "~";
start = index + 1;
}
add_sub_word(&simple_word.as_ref()[start..], current_is_not);
if and_splits.is_empty() && not_splits.is_empty() {
continue;
}
if and_splits.is_empty() && !not_splits.is_empty() {
eprintln!(
"matcher_rs: skipping pure-NOT rule (word_id={simple_word_id}, \
word={:?}) — rules with only NOT (~) segments and no AND segments \
can never match",
simple_word.as_ref(),
);
continue;
}
let and_count = and_splits.len();
let segment_counts = and_splits
.values()
.copied()
.chain(not_splits.values().copied())
.collect::<Vec<i32>>();
let use_matrix = and_count > BITMASK_CAPACITY
|| segment_counts.len() > BITMASK_CAPACITY
|| segment_counts[..and_count].iter().any(|&value| value != 1)
|| segment_counts[and_count..].iter().any(|&value| value != 0);
let has_not = and_count != segment_counts.len();
let method = match (use_matrix, and_count == 1) {
(true, _) => SatisfactionMethod::Matrix,
(false, true) => SatisfactionMethod::Immediate,
(false, false) => SatisfactionMethod::Bitmask,
};
let info = RuleInfo {
and_count: and_count as u8,
method,
has_not,
};
let rule_idx = if let Some(&existing_idx) =
word_id_to_idx.get(&(process_type, simple_word_id))
{
rules[existing_idx] = Rule {
segment_counts,
word_id: simple_word_id,
word: simple_word.as_ref().to_owned(),
};
rule_infos[existing_idx] = info;
existing_idx
} else {
let idx = rules.len();
word_id_to_idx.insert((process_type, simple_word_id), idx);
rules.push(Rule {
segment_counts,
word_id: simple_word_id,
word: simple_word.as_ref().to_owned(),
});
rule_infos.push(info);
idx
};
for (offset, &split_word) in and_splits.keys().chain(not_splits.keys()).enumerate()
{
assert!(
offset < 256,
"rule has {offset} segments; PatternEntry::offset is u8 (max 255)"
);
assert!(
and_count < 256,
"rule has {and_count} AND segments; RuleInfo::and_count is u8 (max 255)"
);
let kind = if offset < and_count {
PatternKind::And
} else {
PatternKind::Not
};
for alternative in split_word.split('|') {
if alternative.is_empty() {
continue;
}
let (boundary_flags, inner) = parse_boundary_markers(alternative);
if inner.is_empty() {
continue;
}
for ac_word in reduce_text_process_emit(word_process_type, inner) {
let pt_index = pt_index_table[process_type.bits() as usize];
let entry = PatternEntry {
rule_idx: rule_idx as u32,
offset: offset as u8,
pt_index,
kind,
boundary: boundary_flags,
};
let Some(&dedup_id) = pattern_id_map.get(ac_word.as_ref()) else {
pattern_id_map.insert(ac_word.clone(), next_pattern_id);
dedup_entries.push(vec![entry]);
dedup_patterns.push(ac_word);
next_pattern_id += 1;
continue;
};
dedup_entries[dedup_id].push(entry);
}
}
}
}
}
ParsedRules {
dedup_patterns,
dedup_entries,
rules: RuleSet::new(rules, rule_infos),
}
}
}
pub(super) const BOUNDARY_LEFT: u8 = 1;
pub(super) const BOUNDARY_RIGHT: u8 = 2;
fn parse_boundary_markers(s: &str) -> (u8, &str) {
let mut flags = 0u8;
let mut inner = s;
if inner.starts_with("\\b") {
flags |= BOUNDARY_LEFT;
inner = &inner[2..];
}
if inner.ends_with("\\b") {
flags |= BOUNDARY_RIGHT;
inner = &inner[..inner.len() - 2];
}
(flags, inner)
}
#[cfg(test)]
mod tests {
use super::{super::pattern::PatternKind, *};
use crate::process::ProcessType;
fn single_rule_table(
pt: ProcessType,
word_id: u32,
pattern: &str,
) -> HashMap<ProcessType, HashMap<u32, String>> {
let mut table = HashMap::new();
table
.entry(pt)
.or_insert_with(HashMap::new)
.insert(word_id, pattern.to_owned());
table
}
#[test]
fn test_pt_index_table() {
let keys = vec![ProcessType::VariantNorm, ProcessType::Delete];
let table = SimpleMatcher::build_pt_index_table(keys.into_iter());
assert_eq!(table[ProcessType::None.bits() as usize], 0);
let keys = vec![
ProcessType::None,
ProcessType::VariantNorm,
ProcessType::Delete,
];
let table = SimpleMatcher::build_pt_index_table(keys.into_iter());
assert_eq!(table[ProcessType::None.bits() as usize], 0);
let fj = table[ProcessType::VariantNorm.bits() as usize];
let del = table[ProcessType::Delete.bits() as usize];
assert!(fj == 1 || fj == 2);
assert!(del == 1 || del == 2);
assert_ne!(fj, del);
for (i, &val) in table.iter().enumerate() {
let pt_bits = i as u8;
if pt_bits != ProcessType::None.bits()
&& pt_bits != ProcessType::VariantNorm.bits()
&& pt_bits != ProcessType::Delete.bits()
{
assert_eq!(val, u8::MAX);
}
}
}
#[test]
fn test_parse_rules_simple() {
let table = single_rule_table(ProcessType::None, 1, "hello");
let pt_index_table = SimpleMatcher::build_pt_index_table(table.keys().copied());
let parsed = SimpleMatcher::parse_rules(&table, &pt_index_table);
assert_eq!(parsed.dedup_patterns.len(), 1);
assert_eq!(parsed.dedup_patterns[0].as_ref(), "hello");
assert_eq!(parsed.dedup_entries.len(), 1);
assert_eq!(parsed.dedup_entries[0].len(), 1);
assert_eq!(parsed.dedup_entries[0][0].kind, PatternKind::And);
}
#[test]
fn test_parse_rules_operators() {
let table = single_rule_table(ProcessType::None, 1, "a&b");
let pt_index_table = SimpleMatcher::build_pt_index_table(table.keys().copied());
let parsed = SimpleMatcher::parse_rules(&table, &pt_index_table);
assert_eq!(parsed.dedup_patterns.len(), 2);
let kinds: Vec<_> = parsed
.dedup_entries
.iter()
.flat_map(|bucket| bucket.iter().map(|e| e.kind))
.collect();
assert!(kinds.iter().all(|k| *k == PatternKind::And));
assert_eq!(parsed.rules.len(), 1);
let table = single_rule_table(ProcessType::None, 1, "a~b");
let pt_index_table = SimpleMatcher::build_pt_index_table(table.keys().copied());
let parsed = SimpleMatcher::parse_rules(&table, &pt_index_table);
assert_eq!(parsed.dedup_patterns.len(), 2);
let all_entries: Vec<_> = parsed
.dedup_entries
.iter()
.flat_map(|bucket| bucket.iter())
.collect();
let and_count = all_entries
.iter()
.filter(|e| e.kind == PatternKind::And)
.count();
let not_count = all_entries
.iter()
.filter(|e| e.kind == PatternKind::Not)
.count();
assert_eq!(and_count, 1);
assert_eq!(not_count, 1);
}
}