matcher_rs 0.12.0

A high-performance matcher designed to solve LOGICAL and TEXT VARIATIONS problems in word matching, implemented in Rust.
Documentation
//! Construction of [`super::SimpleMatcher`] — rule parsing, emitted-pattern deduplication,
//! and matcher compilation.
//!
//! [`SimpleMatcher::new`](super::SimpleMatcher::new) is the entry point; the helpers in this
//! module parse rules, emit transformed sub-patterns, compile the scan engines, and flatten
//! the rule metadata used during search.

use std::borrow::Cow;
use std::collections::{HashMap, HashSet};

#[cfg(feature = "dfa")]
use aho_corasick::{AhoCorasickBuilder, AhoCorasickKind, MatchKind as AhoCorasickMatchKind};
use daachorse::{
    DoubleArrayAhoCorasickBuilder, MatchKind as DoubleArrayAhoCorasickMatchKind,
    charwise::CharwiseDoubleArrayAhoCorasickBuilder,
};

use crate::process::process_matcher::reduce_text_process_emit;
use crate::process::{ProcessType, build_process_type_tree};

use super::SimpleMatcher;
#[cfg(feature = "dfa")]
use super::types::AC_DFA_PATTERN_THRESHOLD;
use super::types::{
    AsciiMatcher, BITMASK_CAPACITY, NonAsciiMatcher, PROCESS_TYPE_TABLE_SIZE, PatternEntry,
    PatternKind, RuleCold, RuleHot,
};

/// Intermediate outputs of [`SimpleMatcher::parse_rules`], bundling all data
/// that [`SimpleMatcher::new`] needs to proceed to automaton compilation.
pub(super) struct ParsedRules<'a> {
    pub(super) dedup_patterns: Vec<Cow<'a, str>>,
    pub(super) dedup_entries: Vec<Vec<PatternEntry>>,
    pub(super) rule_hot: Vec<RuleHot>,
    pub(super) rule_cold: Vec<RuleCold>,
}

impl SimpleMatcher {
    /// Compiles a new [`SimpleMatcher`] from a `{ProcessType → {word_id → pattern}}` map.
    ///
    /// Prefer [`crate::SimpleMatcherBuilder`] for a more ergonomic API.
    ///
    /// Construction cost scales with the number of rules and the number of emitted
    /// sub-pattern variants, so it is intended to happen once and then be reused.
    /// The steps are:
    /// 1. Parse `&`/`~` operators in each pattern into AND and NOT sub-patterns.
    /// 2. For each sub-pattern, generate all normalized text variants via
    ///    [`reduce_text_process_emit`].
    /// 3. Deduplicate all variants across all rules and process types into a single
    ///    pattern set.
    /// 4. Partition that pattern set into ASCII and charwise buckets, then compile the
    ///    corresponding matcher engines.
    /// 5. Build the transformation trie (`ProcessTypeBitNode` tree) for fast text
    ///    pre-processing at match time.
    ///
    /// One subtle detail: sub-patterns are indexed under `process_type - ProcessType::Delete`,
    /// not the full `process_type`. Each sub-pattern is transformed with all steps except
    /// Delete, so it lives in the same coordinate space as the delete-transformed input text.
    /// Applying Delete a second time to the sub-pattern would corrupt the match.
    ///
    /// # Arguments
    /// * `process_type_word_map` — input rule table; the value type `I` must implement
    ///   `AsRef<str>` so both `&str` and `Cow<str>` are accepted.
    ///
    /// # Panics
    /// Panics if one of the internal matcher builders rejects the deduplicated pattern set.
    /// With normal UTF-8 input this should not happen.
    pub fn new<'a, I, S1, S2>(
        process_type_word_map: &'a HashMap<ProcessType, HashMap<u32, I, S1>, S2>,
    ) -> SimpleMatcher
    where
        I: AsRef<str> + 'a,
    {
        let pt_index_table = Self::build_pt_index_table(process_type_word_map.keys().copied());

        let process_type_set: HashSet<ProcessType> =
            process_type_word_map.keys().copied().collect();
        let single_pt_index = if process_type_set.len() == 1 {
            process_type_set
                .iter()
                .next()
                .map(|pt| pt_index_table[pt.bits() as usize])
        } else {
            None
        };

        let parsed = Self::parse_rules(process_type_word_map, &pt_index_table);

        let (ascii_matcher, non_ascii_matcher) = Self::compile_automata(&parsed.dedup_patterns);

        let (ac_dedup_entries, ac_dedup_ranges) = Self::flatten_dedup_entries(parsed.dedup_entries);

        let mut process_type_tree = build_process_type_tree(&process_type_set);
        for node in &mut process_type_tree {
            node.recompute_mask_with_index(&pt_index_table);
        }

        let all_simple = process_type_tree[0].children.is_empty()
            && ac_dedup_entries
                .iter()
                .all(|e| e.kind == PatternKind::Simple);

        SimpleMatcher {
            process_type_tree,
            ascii_matcher,
            non_ascii_matcher,
            single_pt_index,
            ac_dedup_entries,
            ac_dedup_ranges,
            rule_hot: parsed.rule_hot,
            rule_cold: parsed.rule_cold,
            all_simple,
        }
    }

    /// Builds the sequential [`ProcessType`] index table.
    ///
    /// Maps `pt.bits()` → a compact sequential index (0, 1, 2, …) for every composite
    /// `ProcessType` used in `process_type_word_map`. [`ProcessType::None`] always gets
    /// index 0. Unused slots contain `u8::MAX`.
    fn build_pt_index_table(
        process_type_keys: impl Iterator<Item = ProcessType>,
    ) -> [u8; PROCESS_TYPE_TABLE_SIZE] {
        let mut pt_index_table = [u8::MAX; PROCESS_TYPE_TABLE_SIZE];
        let mut next_pt_idx: u8 = 0;
        // None first — it always occupies a slot (root node always emits it).
        pt_index_table[ProcessType::None.bits() as usize] = next_pt_idx;
        next_pt_idx += 1;
        for pt in process_type_keys {
            let bits = pt.bits() as usize;
            if bits < PROCESS_TYPE_TABLE_SIZE && pt_index_table[bits] == u8::MAX {
                pt_index_table[bits] = next_pt_idx;
                next_pt_idx += 1;
            }
        }
        pt_index_table
    }

    /// Parses all rules, deduplicates sub-patterns, and builds `PatternEntry` records.
    ///
    /// For each word in `process_type_word_map`:
    /// - splits the pattern string on `&` and `~` operators into AND and NOT sub-patterns
    /// - generates all normalized text variants of each sub-pattern via [`reduce_text_process_emit`]
    /// - deduplicates variants across all rules into a flat pattern list
    /// - records a [`PatternEntry`] linking each variant back to its rule and sub-pattern offset
    fn parse_rules<'a, I, S1, S2>(
        process_type_word_map: &'a HashMap<ProcessType, HashMap<u32, I, S1>, S2>,
        pt_index_table: &[u8; PROCESS_TYPE_TABLE_SIZE],
    ) -> ParsedRules<'a>
    where
        I: AsRef<str> + 'a,
    {
        let word_size: usize = process_type_word_map.values().map(|m| m.len()).sum();

        let mut dedup_entries: Vec<Vec<PatternEntry>> = Vec::with_capacity(word_size);
        let mut rule_hot: Vec<RuleHot> = Vec::with_capacity(word_size);
        let mut rule_cold: Vec<RuleCold> = Vec::with_capacity(word_size);
        let mut word_id_to_idx: HashMap<(ProcessType, u32), usize> =
            HashMap::with_capacity(word_size);

        let mut next_pattern_id: usize = 0;
        let mut dedup_patterns = Vec::with_capacity(word_size);
        let mut pattern_id_map: HashMap<Cow<str>, usize> = HashMap::with_capacity(word_size);

        for (&process_type, simple_word_map) in process_type_word_map {
            let word_process_type = process_type - ProcessType::Delete;

            for (&simple_word_id, simple_word) in simple_word_map {
                if simple_word.as_ref().is_empty() {
                    continue;
                }
                let mut and_splits: HashMap<&str, i32> = HashMap::new();
                let mut not_splits: HashMap<&str, i32> = HashMap::new();

                let mut start = 0;
                let mut current_is_not = false;

                let mut add_sub_word = |word: &'a str, is_not: bool| {
                    if word.is_empty() {
                        return;
                    }
                    if is_not {
                        let entry = not_splits.entry(word).or_insert(1);
                        *entry -= 1;
                    } else {
                        let entry = and_splits.entry(word).or_insert(0);
                        *entry += 1;
                    }
                };

                for (index, char) in simple_word.as_ref().match_indices(['&', '~']) {
                    add_sub_word(&simple_word.as_ref()[start..index], current_is_not);
                    current_is_not = char == "~";
                    start = index + 1;
                }
                add_sub_word(&simple_word.as_ref()[start..], current_is_not);

                if and_splits.is_empty() && not_splits.is_empty() {
                    continue;
                }

                let and_count = and_splits.len();
                let segment_counts = and_splits
                    .values()
                    .copied()
                    .chain(not_splits.values().copied())
                    .collect::<Vec<i32>>();

                let use_matrix = and_count > BITMASK_CAPACITY
                    || segment_counts.len() > BITMASK_CAPACITY
                    || segment_counts[..and_count].iter().any(|&v| v != 1)
                    || segment_counts[and_count..].iter().any(|&v| v != 0);
                let has_not = and_count != segment_counts.len();

                let rule_idx = if let Some(&existing_idx) =
                    word_id_to_idx.get(&(process_type, simple_word_id))
                {
                    rule_hot[existing_idx] = RuleHot {
                        segment_counts,
                        and_count,
                        use_matrix,
                        has_not,
                    };
                    rule_cold[existing_idx] = RuleCold {
                        word_id: simple_word_id,
                        word: simple_word.as_ref().to_owned(),
                    };
                    existing_idx
                } else {
                    let idx = rule_hot.len();
                    word_id_to_idx.insert((process_type, simple_word_id), idx);
                    rule_hot.push(RuleHot {
                        segment_counts,
                        and_count,
                        use_matrix,
                        has_not,
                    });
                    rule_cold.push(RuleCold {
                        word_id: simple_word_id,
                        word: simple_word.as_ref().to_owned(),
                    });
                    idx
                };

                let is_simple = and_count == 1 && !has_not && !use_matrix;

                for (offset, &split_word) in and_splits.keys().chain(not_splits.keys()).enumerate()
                {
                    let kind = if is_simple {
                        PatternKind::Simple
                    } else if offset < and_count {
                        PatternKind::And
                    } else {
                        PatternKind::Not
                    };

                    for ac_word in reduce_text_process_emit(word_process_type, split_word) {
                        let pt_index = pt_index_table[process_type.bits() as usize];
                        let Some(&existing_dedup_id) = pattern_id_map.get(ac_word.as_ref()) else {
                            pattern_id_map.insert(ac_word.clone(), next_pattern_id);
                            dedup_entries.push(vec![PatternEntry {
                                rule_idx: rule_idx as u32,
                                offset: offset as u16,
                                pt_index,
                                kind,
                            }]);
                            dedup_patterns.push(ac_word);
                            next_pattern_id += 1;
                            continue;
                        };
                        dedup_entries[existing_dedup_id].push(PatternEntry {
                            rule_idx: rule_idx as u32,
                            offset: offset as u16,
                            pt_index,
                            kind,
                        });
                    }
                }
            }
        }

        ParsedRules {
            dedup_patterns,
            dedup_entries,
            rule_hot,
            rule_cold,
        }
    }

    /// Partitions deduplicated patterns by character content and compiles the scan engines.
    ///
    /// ASCII-only patterns go to the ASCII matcher. The non-ASCII matcher is compiled
    /// over:
    /// - the non-ASCII subset when there are no ASCII patterns, or
    /// - the full pattern set when both ASCII and non-ASCII patterns exist, so non-ASCII
    ///   text needs only one charwise scan.
    ///
    /// With the `dfa` feature, small ASCII sets use AC DFA and larger ones use the DAAC
    /// ASCII matcher.
    fn compile_automata(
        dedup_patterns: &[Cow<'_, str>],
    ) -> (Option<AsciiMatcher>, Option<NonAsciiMatcher>) {
        let mut ascii_patvals: Vec<(&str, u32)> = Vec::new();
        let mut non_ascii_patvals: Vec<(&str, u32)> = Vec::new();
        #[cfg(feature = "dfa")]
        let mut ascii_ac_to_dedup: Vec<u32> = Vec::new();

        for (dedup_id, pattern) in dedup_patterns.iter().enumerate() {
            if pattern.as_ref().is_ascii() {
                #[cfg(feature = "dfa")]
                ascii_ac_to_dedup.push(dedup_id as u32);
                ascii_patvals.push((pattern.as_ref(), dedup_id as u32));
            } else {
                non_ascii_patvals.push((pattern.as_ref(), dedup_id as u32));
            }
        }

        // When both ASCII and non-ASCII patterns exist, the charwise matcher
        // must contain all patterns so one scan covers everything for non-ASCII text.
        let full_charwise_patvals = if ascii_patvals.is_empty() || non_ascii_patvals.is_empty() {
            None
        } else {
            Some(
                dedup_patterns
                    .iter()
                    .enumerate()
                    .map(|(dedup_id, pattern)| (pattern.as_ref(), dedup_id as u32))
                    .collect::<Vec<_>>(),
            )
        };

        let ascii_matcher = if !ascii_patvals.is_empty() {
            #[cfg(feature = "dfa")]
            let engine = if ascii_patvals.len() <= AC_DFA_PATTERN_THRESHOLD {
                AsciiMatcher::AcDfa {
                    matcher: AhoCorasickBuilder::new()
                        .kind(Some(AhoCorasickKind::DFA))
                        .match_kind(AhoCorasickMatchKind::Standard)
                        .build(ascii_patvals.iter().map(|(p, _)| p))
                        .unwrap(),
                    to_dedup: ascii_ac_to_dedup,
                }
            } else {
                AsciiMatcher::DaacBytewise(
                    DoubleArrayAhoCorasickBuilder::new()
                        .match_kind(DoubleArrayAhoCorasickMatchKind::Standard)
                        .build_with_values(ascii_patvals)
                        .unwrap(),
                )
            };
            #[cfg(not(feature = "dfa"))]
            let engine = AsciiMatcher::DaacBytewise(
                DoubleArrayAhoCorasickBuilder::new()
                    .match_kind(DoubleArrayAhoCorasickMatchKind::Standard)
                    .build_with_values(ascii_patvals)
                    .unwrap(),
            );
            Some(engine)
        } else {
            None
        };

        let non_ascii_patvals = full_charwise_patvals
            .as_deref()
            .unwrap_or(non_ascii_patvals.as_slice());
        let non_ascii_matcher = if !non_ascii_patvals.is_empty() {
            Some(NonAsciiMatcher::DaacCharwise(
                CharwiseDoubleArrayAhoCorasickBuilder::new()
                    .match_kind(DoubleArrayAhoCorasickMatchKind::Standard)
                    .build_with_values(non_ascii_patvals.iter().copied())
                    .unwrap(),
            ))
        } else {
            None
        };

        (ascii_matcher, non_ascii_matcher)
    }

    /// Flattens `Vec<Vec<PatternEntry>>` into one contiguous entry array plus per-pattern ranges.
    ///
    /// Returns `(ac_dedup_entries, ac_dedup_ranges)` where `ac_dedup_ranges[i] = (start, len)`
    /// maps automaton pattern index `i` to its slice of [`PatternEntry`] records.
    fn flatten_dedup_entries(
        dedup_entries: Vec<Vec<PatternEntry>>,
    ) -> (Vec<PatternEntry>, Vec<(usize, usize)>) {
        let mut ac_dedup_entries = Vec::with_capacity(dedup_entries.iter().map(|v| v.len()).sum());
        let mut ac_dedup_ranges = Vec::with_capacity(dedup_entries.len());
        for entries in dedup_entries {
            let start = ac_dedup_entries.len();
            let len = entries.len();
            ac_dedup_entries.extend(entries);
            ac_dedup_ranges.push((start, len));
        }
        (ac_dedup_entries, ac_dedup_ranges)
    }
}