use std::borrow::Cow;
use std::collections::HashMap;
use crate::process::ProcessType;
use super::SimpleResult;
use super::pattern::{PatternEntry, PatternKind};
use super::state::{ScanContext, ScanState, init_matrix};
/// Two-level lookup table: `ProcessType` -> (`u32` key -> borrowed string slice).
///
/// NOTE(review): the `u32` keys are presumably word ids (compare
/// `RuleCold::word_id`) — confirm against the code that builds the table.
pub type SimpleTable<'a> = HashMap<ProcessType, HashMap<u32, &'a str>>;
/// Same shape as [`SimpleTable`], but each string is a `Cow<'a, str>`, so an
/// entry may be either borrowed or owned.
///
/// NOTE(review): the name suggests this is the (de)serialization-facing
/// variant — confirm.
pub type SimpleTableSerde<'a> = HashMap<ProcessType, HashMap<u32, Cow<'a, str>>>;
/// Evaluation strategy of a rule, as dispatched on by `RuleSet::process_entry`.
///
/// The discriminants are laid out so that the lowest bit is set exactly for
/// the `*Not` variants; `RuleShape::has_not` relies on that layout, so keep it
/// intact when adding or renumbering variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub(super) enum RuleShape {
/// AND segments are tracked with one bit each in a `u64` mask.
Bitmask = 0,
/// Like `Bitmask`, and `has_not()` is true for this shape.
BitmaskNot = 1,
/// A single AND hit marks the rule positive.
SingleAnd = 2,
/// Like `SingleAnd`, and `has_not()` is true for this shape.
SingleAndNot = 3,
/// Segments are tracked with per-segment counters in the scan-state matrix.
Matrix = 4,
/// Like `Matrix`, and `has_not()` is true for this shape.
MatrixNot = 5,
}
impl RuleShape {
    /// Returns `true` for the `*Not` variants.
    ///
    /// Those are exactly the variants with an odd discriminant, so the check
    /// is a parity test on the `u8` representation.
    pub(super) fn has_not(self) -> bool {
        let discriminant = self as u8;
        discriminant % 2 == 1
    }

    /// Returns `true` for the shapes evaluated through the counter matrix
    /// (`Matrix` and `MatrixNot`).
    pub(super) fn use_matrix(self) -> bool {
        match self {
            Self::Matrix | Self::MatrixNot => true,
            Self::Bitmask | Self::BitmaskNot | Self::SingleAnd | Self::SingleAndNot => false,
        }
    }
}
/// Per-rule data that is read while scanning (see `RuleSet::process_entry`).
#[derive(Debug, Clone)]
pub(super) struct RuleHot {
/// Handed to `init_matrix` the first time a matrix-shaped rule is touched in
/// a generation.
///
/// NOTE(review): presumably one required hit count per segment (the tests use
/// `[2, 1]` for `a&a&b` and `[1, 0]` for `a~b`) — confirm against the builder.
pub(super) segment_counts: Vec<i32>,
}
/// Per-rule data that is only read when a match is reported
/// (see `RuleSet::push_result`).
#[derive(Debug, Clone)]
pub(super) struct RuleCold {
/// Identifier copied into `SimpleResult::word_id` for a reported match.
pub(super) word_id: u32,
/// Text borrowed into `SimpleResult::word` for a reported match.
pub(super) word: String,
}
/// The compiled rules, split into scan-time (`hot`) and report-time (`cold`)
/// data. Both vectors are indexed by the same `rule_idx`.
#[derive(Clone)]
pub(super) struct RuleSet {
// Indexed by `rule_idx` in `process_entry`.
hot: Vec<RuleHot>,
// Indexed by `rule_idx` in `push_result`.
cold: Vec<RuleCold>,
// Stored as passed to `new`; exposed through `has_not_rules()`.
has_not_rules: bool,
}
impl RuleSet {
/// Builds a rule set from parallel `hot` / `cold` vectors.
///
/// Element `i` of both vectors describes rule `i`. `has_not_rules` is stored
/// unchanged and returned by [`RuleSet::has_not_rules`].
///
/// # Panics
///
/// In debug builds, panics if `hot` and `cold` differ in length.
pub(super) fn new(hot: Vec<RuleHot>, cold: Vec<RuleCold>, has_not_rules: bool) -> Self {
    // `process_entry` and `push_result` index `hot` and `cold` with the same
    // `rule_idx` through `get_unchecked`, so a length mismatch would be
    // undefined behaviour rather than a panic. Catch it where the set is built.
    debug_assert_eq!(
        hot.len(),
        cold.len(),
        "RuleSet::new: `hot` and `cold` must describe the same rules"
    );
    Self {
        hot,
        cold,
        has_not_rules,
    }
}
#[inline(always)]
pub(super) fn has_not_rules(&self) -> bool {
self.has_not_rules
}
pub(super) fn heap_bytes(&self) -> usize {
let hot_inner: usize = self
.hot
.iter()
.map(|r| r.segment_counts.capacity() * size_of::<i32>())
.sum();
let cold_inner: usize = self.cold.iter().map(|r| r.word.capacity()).sum();
self.hot.capacity() * size_of::<RuleHot>()
+ hot_inner
+ self.cold.capacity() * size_of::<RuleCold>()
+ cold_inner
}
#[inline(always)]
pub(super) fn len(&self) -> usize {
self.hot.len()
}
/// Returns `true` as soon as one of the rules touched during the scan is
/// reported as satisfied by `ss`.
///
/// Only `ss` is consulted; the rule data in `self` is not read.
#[inline(always)]
pub(super) fn has_match(&self, ss: &ScanState<'_>) -> bool {
    for &rule_idx in ss.touched_indices() {
        if ss.rule_is_satisfied(rule_idx) {
            return true;
        }
    }
    false
}
/// Appends the result for `rule_idx` to `results`, but only when
/// `ss.mark_positive_simple(rule_idx)` returns `true`.
///
/// NOTE(review): judging by the names, `mark_positive_simple` reports whether
/// the rule was newly marked — confirm in `ScanState`.
#[inline(always)]
pub(super) fn push_result_if_new<'a>(
    &'a self,
    rule_idx: usize,
    ss: &mut ScanState<'_>,
    results: &mut Vec<SimpleResult<'a>>,
) {
    let newly_marked = ss.mark_positive_simple(rule_idx);
    if !newly_marked {
        return;
    }
    self.push_result(rule_idx, results);
}
/// Appends one result to `results` for every touched rule that `ss` reports
/// as satisfied, in the order of `ss.touched_indices()`.
pub(super) fn collect_matches<'a>(
    &'a self,
    ss: &ScanState<'_>,
    results: &mut Vec<SimpleResult<'a>>,
) {
    ss.touched_indices()
        .iter()
        .copied()
        .filter(|&rule_idx| ss.rule_is_satisfied(rule_idx))
        .for_each(|rule_idx| self.push_result(rule_idx, results));
}
/// Applies one pattern hit (`entry`) to the per-scan state `ss`.
///
/// The hit is ignored (returns `false`) when bit `entry.pt_index` is not set
/// in `ctx.process_type_mask`. Otherwise the rule's word state is updated
/// according to `entry.kind`:
///
/// * `Simple` — on the first hit in this generation the rule is marked
///   positive, recorded in `ss.touched_indices` and counted in
///   `ss.resolved_count`.
/// * `And` — the rule's state is lazily reset for this generation, then the
///   segment at `entry.offset` is recorded through the counter matrix, the
///   single-AND shortcut or the bitmask, depending on `entry.shape`.
/// * `Not` — the rule's state is lazily reset, then `not_generation` is set
///   (for matrix shapes only once the segment's counter becomes positive).
///
/// # Returns
///
/// * `Simple`: `ctx.exit_early`, both on the first hit and when the rule was
///   already positive.
/// * `And`: `true` only if `ctx.exit_early` is set, the rule is positive and
///   the shape has no NOT part.
/// * `Not`, and every other case: `false`.
///
/// # Safety invariants
///
/// `rule_idx` is used for unchecked indexing into `ss.word_states`,
/// `self.hot`, `ss.matrix` and `ss.matrix_status`. Only the first two bounds
/// are `debug_assert!`ed here; release builds rely on the caller.
#[inline(always)]
pub(super) fn process_entry(
&self,
entry: &PatternEntry,
ctx: ScanContext,
ss: &mut ScanState<'_>,
) -> bool {
let generation = ss.generation;
// `and_count` is ignored in the pattern and read through `entry.and_count`
// only on the lazy-initialisation paths below.
let &PatternEntry {
rule_idx,
offset,
pt_index,
kind,
shape,
boundary: _,
and_count: _,
} = entry;
let rule_idx = rule_idx as usize;
// Skip hits whose process-type bit is not enabled for this scan.
if ctx.process_type_mask & (1u64 << pt_index) == 0 {
return false;
}
debug_assert!(rule_idx < ss.word_states.len());
debug_assert!(rule_idx < self.hot.len());
match kind {
PatternKind::Simple => {
// SAFETY: `rule_idx < ss.word_states.len()` is debug-asserted above;
// release builds rely on the caller upholding it.
let word_state = unsafe { ss.word_states.get_unchecked_mut(rule_idx) };
// Already positive in this generation: nothing to update.
if word_state.positive_generation == generation {
return ctx.exit_early;
}
// First touch in this generation: a Simple rule is positive immediately.
if word_state.matrix_generation != generation {
word_state.matrix_generation = generation;
word_state.positive_generation = generation;
ss.touched_indices.push(rule_idx);
ss.resolved_count += 1;
return ctx.exit_early;
}
// Reached only if the rule was already touched in this generation without
// being positive; falls through to the final `false`.
}
PatternKind::And => {
let offset = offset as usize;
// SAFETY: same bound as above (`rule_idx < ss.word_states.len()`).
let word_state = unsafe { ss.word_states.get_unchecked_mut(rule_idx) };
// A rule with a NOT part whose NOT already fired in this generation
// needs no further AND bookkeeping.
if shape.has_not() && word_state.not_generation == generation {
return false;
}
// Already positive: only report an early exit when no NOT part exists.
if word_state.positive_generation == generation {
if !shape.has_not() && ctx.exit_early {
return true;
}
return false;
}
// Lazy per-generation reset on the first touch of this rule.
if word_state.matrix_generation != generation {
let and_count = entry.and_count;
word_state.matrix_generation = generation;
// A rule without AND segments is positive right away.
// NOTE(review): `0` appears to act as a "not positive" sentinel, which
// assumes a live generation is never 0 — confirm.
word_state.positive_generation = if and_count == 0 { generation } else { 0 };
word_state.remaining_and = and_count as u16;
word_state.satisfied_mask = 0;
ss.touched_indices.push(rule_idx);
if shape.use_matrix() {
// SAFETY: `rule_idx < self.hot.len()` is debug-asserted above.
let rule = unsafe { self.hot.get_unchecked(rule_idx) };
// SAFETY: assumes `ss.matrix` and `ss.matrix_status` hold at least
// `rule_idx + 1` entries — not asserted in this function; TODO confirm.
init_matrix(
unsafe { ss.matrix.get_unchecked_mut(rule_idx) },
unsafe { ss.matrix_status.get_unchecked_mut(rule_idx) },
&rule.segment_counts,
ctx.num_variants,
);
}
}
let is_satisfied = if shape.use_matrix() {
// SAFETY: same assumption on `ss.matrix` / `ss.matrix_status` as above.
let flat_matrix = unsafe { ss.matrix.get_unchecked_mut(rule_idx) };
let flat_status = unsafe { ss.matrix_status.get_unchecked_mut(rule_idx) };
// Flat layout: one row of `num_variants` counters per segment; this is
// the (bounds-checked) counter for (`offset`, `text_index`).
let counter = &mut flat_matrix[offset * ctx.num_variants + ctx.text_index];
*counter -= 1;
// The segment is satisfied the first time its counter drops to <= 0.
if flat_status[offset] == 0 && *counter <= 0 {
flat_status[offset] = 1;
word_state.remaining_and -= 1;
if word_state.remaining_and == 0 {
word_state.positive_generation = generation;
ss.resolved_count += 1;
}
}
word_state.positive_generation == generation
} else if matches!(shape, RuleShape::SingleAnd | RuleShape::SingleAndNot) {
// Single-AND shapes: one hit is enough. This runs at most once per
// generation because the "already positive" check above returns early.
word_state.positive_generation = generation;
ss.resolved_count += 1;
true
} else {
// Bitmask shapes: one bit per segment.
// NOTE(review): assumes `offset < 64` for these shapes — confirm.
let bit = 1u64 << offset;
if word_state.satisfied_mask & bit == 0 {
word_state.satisfied_mask |= bit;
word_state.remaining_and -= 1;
if word_state.remaining_and == 0 {
word_state.positive_generation = generation;
ss.resolved_count += 1;
}
}
word_state.positive_generation == generation
};
// Early exit only for rules that are positive and have no NOT part.
if ctx.exit_early
&& is_satisfied
&& !shape.has_not()
&& word_state.not_generation != generation
{
return true;
}
}
PatternKind::Not => {
let offset = offset as usize;
// SAFETY: same bound as above (`rule_idx < ss.word_states.len()`).
let word_state = unsafe { ss.word_states.get_unchecked_mut(rule_idx) };
// NOT already recorded for this generation.
if word_state.not_generation == generation {
return false;
}
// Lazy per-generation reset, identical to the one in the `And` arm.
if word_state.matrix_generation != generation {
let and_count = entry.and_count;
word_state.matrix_generation = generation;
word_state.positive_generation = if and_count == 0 { generation } else { 0 };
word_state.remaining_and = and_count as u16;
word_state.satisfied_mask = 0;
ss.touched_indices.push(rule_idx);
if shape.use_matrix() {
// SAFETY: `rule_idx < self.hot.len()` is debug-asserted above.
let rule = unsafe { self.hot.get_unchecked(rule_idx) };
// SAFETY: assumes `ss.matrix` and `ss.matrix_status` hold at least
// `rule_idx + 1` entries — not asserted in this function; TODO confirm.
init_matrix(
unsafe { ss.matrix.get_unchecked_mut(rule_idx) },
unsafe { ss.matrix_status.get_unchecked_mut(rule_idx) },
&rule.segment_counts,
ctx.num_variants,
);
}
}
if shape.use_matrix() {
// SAFETY: same assumption on `ss.matrix` / `ss.matrix_status` as above.
let flat_matrix = unsafe { ss.matrix.get_unchecked_mut(rule_idx) };
let flat_status = unsafe { ss.matrix_status.get_unchecked_mut(rule_idx) };
let counter = &mut flat_matrix[offset * ctx.num_variants + ctx.text_index];
*counter += 1;
// The NOT segment fires the first time its counter becomes positive.
if flat_status[offset] == 0 && *counter > 0 {
flat_status[offset] = 1;
word_state.not_generation = generation;
}
} else {
// Non-matrix shapes: any NOT hit is recorded immediately.
word_state.not_generation = generation;
}
// A `Not` hit never triggers an early exit; falls through to `false`.
}
}
false
}
/// Appends the `SimpleResult` for `rule_idx` to `results`, borrowing the
/// rule's word from `self`.
///
/// `rule_idx` must be a valid index into `self.cold`; this is only checked in
/// debug builds.
#[inline(always)]
fn push_result<'a>(&'a self, rule_idx: usize, results: &mut Vec<SimpleResult<'a>>) {
    debug_assert!(rule_idx < self.cold.len());
    // SAFETY: the caller guarantees `rule_idx < self.cold.len()` (asserted
    // above in debug builds).
    let rule = unsafe { self.cold.get_unchecked(rule_idx) };
    let hit = SimpleResult {
        word_id: rule.word_id,
        word: Cow::Borrowed(&rule.word),
    };
    results.push(hit);
}
}
#[cfg(test)]
mod tests {
use super::super::state::SimpleMatchState;
use super::*;
/// Scan context for a single text variant with every process-type bit
/// enabled; only `exit_early` varies between tests.
fn make_ctx(exit_early: bool) -> ScanContext {
    let every_process_type = u64::MAX;
    ScanContext {
        exit_early,
        process_type_mask: every_process_type,
        num_variants: 1,
        text_index: 0,
        non_ascii_density: 0.0,
    }
}
/// One-rule set: a single segment with count 1, no NOT rules.
fn make_simple_ruleset(word_id: u32, word: &str) -> RuleSet {
    let hot = vec![RuleHot {
        segment_counts: vec![1],
    }];
    let cold = vec![RuleCold {
        word_id,
        word: String::from(word),
    }];
    RuleSet::new(hot, cold, false)
}
// Every shape paired with the expected `has_not()` / `use_matrix()` answers.
#[test]
fn test_rule_shape_predicates() {
    let expectations = [
        (RuleShape::Bitmask, false, false),
        (RuleShape::BitmaskNot, true, false),
        (RuleShape::SingleAnd, false, false),
        (RuleShape::SingleAndNot, true, false),
        (RuleShape::Matrix, false, true),
        (RuleShape::MatrixNot, true, true),
    ];
    for (shape, _, _) in expectations {
        let (_, expected_not, _) = expectations[shape as usize];
        assert_eq!(shape.has_not(), expected_not);
    }
    for (shape, _, expected_matrix) in expectations {
        assert_eq!(shape.use_matrix(), expected_matrix);
    }
}
// A `Simple` hit satisfies its rule at once; with `exit_early` set, both the
// first hit and a repeated hit report `true`.
#[test]
fn test_process_entry_simple_kind() {
    let ruleset = make_simple_ruleset(1, "hello");
    let mut match_state = SimpleMatchState::new();
    match_state.prepare(1);
    let mut ss = match_state.as_scan_state();

    let simple_hit = PatternEntry {
        kind: PatternKind::Simple,
        shape: RuleShape::SingleAnd,
        rule_idx: 0,
        offset: 0,
        pt_index: 0,
        boundary: 0,
        and_count: 1,
    };

    let first = ruleset.process_entry(&simple_hit, make_ctx(true), &mut ss);
    assert!(first, "Simple entry with exit_early should return true");
    assert!(ss.rule_is_satisfied(0));

    let repeated = ruleset.process_entry(&simple_hit, make_ctx(true), &mut ss);
    assert!(
        repeated,
        "already-satisfied Simple should still return exit_early"
    );
}
// A three-segment bitmask rule becomes satisfied (and triggers the early
// exit) only once the third distinct segment has been seen.
#[test]
fn test_process_entry_and_bitmask() {
    let hot = vec![RuleHot {
        segment_counts: vec![1, 1, 1],
    }];
    let cold = vec![RuleCold {
        word_id: 1,
        word: "a&b&c".to_owned(),
    }];
    let rules = RuleSet::new(hot, cold, false);

    let mut state = SimpleMatchState::new();
    state.prepare(1);
    let mut ss = state.as_scan_state();
    let ctx = make_ctx(true);

    let first_segment = PatternEntry {
        rule_idx: 0,
        offset: 0,
        pt_index: 0,
        kind: PatternKind::And,
        shape: RuleShape::Bitmask,
        boundary: 0,
        and_count: 3,
    };

    // (segment offset, whether this hit completes the rule)
    for (offset, completes_rule) in [(0, false), (1, false), (2, true)] {
        let hit = PatternEntry {
            offset,
            ..first_segment
        };
        assert_eq!(rules.process_entry(&hit, ctx, &mut ss), completes_rule);
        assert_eq!(ss.rule_is_satisfied(0), completes_rule);
    }
}
// A NOT hit that arrives after the rule's AND part is already satisfied must
// still veto the rule.
#[test]
fn test_process_entry_not_veto() {
    let rules = RuleSet::new(
        vec![RuleHot {
            segment_counts: vec![1, 0],
        }],
        vec![RuleCold {
            word_id: 1,
            word: "a~b".to_owned(),
        }],
        true,
    );
    let mut state = SimpleMatchState::new();
    state.prepare(1);
    let mut ss = state.as_scan_state();
    let ctx = make_ctx(false);

    let and_entry = PatternEntry {
        rule_idx: 0,
        offset: 0,
        pt_index: 0,
        kind: PatternKind::And,
        shape: RuleShape::SingleAndNot,
        boundary: 0,
        and_count: 1,
    };
    rules.process_entry(&and_entry, ctx, &mut ss);
    assert!(ss.rule_is_satisfied(0));

    let not_entry = PatternEntry {
        rule_idx: 0,
        offset: 1,
        pt_index: 0,
        kind: PatternKind::Not,
        shape: RuleShape::SingleAndNot,
        boundary: 0,
        and_count: 1,
    };
    // The argument is a plain shared borrow of `not_entry`; the previous text
    // had the leading `&not` collapsed into a single `¬` character, which does
    // not compile.
    rules.process_entry(&not_entry, ctx, &mut ss);
    assert!(!ss.rule_is_satisfied(0), "NOT should veto the rule");
}
// Matrix rule `a&a&b`: segment 0 needs two hits, segment 1 needs one. The
// rule is satisfied only after the second hit on segment 0.
#[test]
fn test_process_entry_matrix_counters() {
    let hot = vec![RuleHot {
        segment_counts: vec![2, 1],
    }];
    let cold = vec![RuleCold {
        word_id: 1,
        word: "a&a&b".to_owned(),
    }];
    let rules = RuleSet::new(hot, cold, false);

    let mut state = SimpleMatchState::new();
    state.prepare(1);
    let mut ss = state.as_scan_state();
    let ctx = make_ctx(true);

    let seg0 = PatternEntry {
        rule_idx: 0,
        offset: 0,
        pt_index: 0,
        kind: PatternKind::And,
        shape: RuleShape::Matrix,
        boundary: 0,
        and_count: 2,
    };
    let seg1 = PatternEntry { offset: 1, ..seg0 };

    // (hit, whether the rule is complete after it)
    let sequence = [(&seg0, false), (&seg1, false), (&seg0, true)];
    for (hit, complete) in sequence {
        assert_eq!(rules.process_entry(hit, ctx, &mut ss), complete);
        assert_eq!(ss.rule_is_satisfied(0), complete);
    }
}
// An entry whose `pt_index` bit (3) is absent from the mask (0b0101) must be
// ignored and leave the rule untouched.
#[test]
fn test_process_entry_pt_mask_filters() {
    let rules = make_simple_ruleset(1, "hello");
    let mut state = SimpleMatchState::new();
    state.prepare(1);
    let mut ss = state.as_scan_state();

    let masked_out = PatternEntry {
        kind: PatternKind::Simple,
        shape: RuleShape::SingleAnd,
        pt_index: 3,
        rule_idx: 0,
        offset: 0,
        boundary: 0,
        and_count: 1,
    };
    // Same context as `make_ctx(true)`, with only bits 0 and 2 enabled.
    let ctx = ScanContext {
        process_type_mask: 0b0101,
        ..make_ctx(true)
    };

    let exited = rules.process_entry(&masked_out, ctx, &mut ss);
    assert!(!exited);
    assert!(!ss.rule_is_satisfied(0), "entry should be filtered by mask");
}
}