// matcher_rs 0.12.3
//
// A high-performance matcher designed to solve LOGICAL and TEXT VARIATIONS problems in word matching, implemented in Rust.
// Documentation
use matcher_rs::{ProcessType, SimpleMatcherBuilder};

// ---------------------------------------------------------------------------
// SearchMode paths
// ---------------------------------------------------------------------------

#[test]
fn test_search_mode_all_simple() {
    // Three single-literal rules, all registered under ProcessType::None; the
    // original author labels this configuration the "AllSimple" search mode.
    let literal_matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "alpha")
        .add_word(ProcessType::None, 2, "beta")
        .add_word(ProcessType::None, 3, "gamma")
        .build()
        .unwrap();

    let all_words = "alpha beta gamma";

    assert!(!literal_matcher.is_match(""), "empty text always false");
    assert!(literal_matcher.is_match(all_words));
    assert!(!literal_matcher.is_match("delta"));

    // Each of the three rules must be reported once for the combined text.
    let hit_count = literal_matcher.process(all_words).len();
    assert_eq!(hit_count, 3);
}

#[test]
fn test_search_mode_general() {
    // Rules registered under two different process types (None and Fanjian);
    // the original author labels this configuration the "General" search mode.
    let general = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello")
        .add_word(ProcessType::Fanjian, 2, "测试")
        .build()
        .unwrap();

    // The Fanjian rule is written in simplified form and probed with the
    // traditional form of the same word.
    for text in ["hello", "測試"] {
        assert!(general.is_match(text));
    }

    let hit_count = general.process("hello 測試").len();
    assert_eq!(hit_count, 2);
}

// ---------------------------------------------------------------------------
// DIRECT_RULE_BIT and PatternDispatch
// ---------------------------------------------------------------------------

#[test]
fn test_direct_rule_bit_fast_path() {
    // Part 1: every rule is a plain literal under ProcessType::None
    // (the "AllSimple" configuration).
    let literals_only = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello")
        .add_word(ProcessType::None, 2, "world")
        .build()
        .unwrap();

    assert!(literals_only.is_match("hello world"));
    assert_eq!(literals_only.process("hello world").len(), 2);

    // Part 2: the sub-pattern "hello" belongs to a simple rule and to a compound
    // rule at once, which the original author notes forces Entries dispatch
    // rather than DirectRule for that shared pattern.
    let shared = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello")
        .add_word(ProcessType::None, 2, "hello&world")
        .build()
        .unwrap();

    // Only the simple rule can be satisfied by "hello" alone.
    let solo_hits = shared.process("hello");
    assert_eq!(solo_hits.len(), 1);
    assert_eq!(solo_hits[0].word_id, 1);

    // With both words present, both rules are reported (order not asserted).
    let both_hits = shared.process("hello world");
    assert_eq!(both_hits.len(), 2);
    let mut matched_ids = both_hits
        .iter()
        .map(|hit| hit.word_id)
        .collect::<Vec<u32>>();
    matched_ids.sort_unstable();
    assert_eq!(matched_ids, [1, 2]);
}

#[test]
fn test_shared_subpattern_across_rules() {
    // Both compound rules contain the sub-pattern "hello"; the original author
    // notes this selects PatternDispatch::Entries.
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello&world")
        .add_word(ProcessType::None, 2, "hello&earth")
        .build()
        .unwrap();

    // (input text, rule ids expected to fire, in ascending order)
    let cases = [
        ("hello world", vec![1u32]),
        ("hello earth", vec![2u32]),
        ("hello world earth", vec![1u32, 2u32]),
    ];

    for (text, expected_ids) in cases {
        // Sort the reported ids so the comparison ignores result order.
        let mut reported_ids: Vec<u32> = matcher
            .process(text)
            .iter()
            .map(|hit| hit.word_id)
            .collect();
        reported_ids.sort_unstable();
        assert_eq!(reported_ids, expected_ids);
    }
}

// ---------------------------------------------------------------------------
// Bitmask vs matrix dispatch
// ---------------------------------------------------------------------------

#[test]
fn test_matrix_repeated_and_segments() {
    // "a&a&b&b&b" repeats its AND segments (a twice, b three times); per the
    // original author's note the non-unit counts select the matrix path.
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "a&a&b&b&b")
        .build()
        .unwrap();

    let satisfying_text = "a a b b b";
    assert!(matcher.is_match(satisfying_text), "2a + 3b should match");

    // Texts that fall short of the required occurrence counts.
    let insufficient = [
        ("a b b b", "1a + 3b should not match"),
        ("a a b b", "2a + 2b should not match"),
        ("a a", "2a + 0b should not match"),
    ];
    for (text, why) in insufficient {
        assert!(!matcher.is_match(text), "{why}");
    }

    let hits = matcher.process(satisfying_text);
    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].word_id, 1);
}

#[test]
fn test_matrix_triggered_by_not_count() {
    // "a~b~b" lists the NOT segment "b" twice. Per the original author's note
    // this selects the matrix path, and the veto only fires once "b" has been
    // seen twice in the text.
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "a~b~b")
        .build()
        .unwrap();

    // Fewer than two occurrences of "b": the rule still matches.
    let allowed = [
        ("a", "a without b should match"),
        ("a b", "a with 1 b should match (below threshold)"),
    ];
    for (text, why) in allowed {
        assert!(matcher.is_match(text), "{why}");
    }

    // Two or more occurrences of "b": the rule is vetoed.
    let vetoed = [
        ("a b b", "a with 2 b should not match (veto)"),
        ("a b b b", "a with 3 b should not match (veto)"),
    ];
    for (text, why) in vetoed {
        assert!(!matcher.is_match(text), "{why}");
    }
}

#[test]
fn test_matrix_combined_with_not_veto() {
    // 63 distinct AND segments plus 2 NOT segments gives 65 segments in total,
    // one more than the 64-segment bitmask capacity cited by the original author.
    let and_segments: Vec<String> = (0..63).map(|idx| format!("a{idx}")).collect();
    let and_clause = and_segments.join("&");
    let pattern = format!("{and_clause}~notX~notY");

    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, &pattern)
        .build()
        .unwrap();

    // Text containing every AND segment and neither NOT segment.
    let satisfied_text = and_segments.join(" ");
    assert!(
        matcher.is_match(&satisfied_text),
        "all ANDs without NOTs should match"
    );

    // The same text with one NOT segment appended.
    let vetoed_text = format!("{satisfied_text} notX");
    assert!(!matcher.is_match(&vetoed_text), "NOT should veto the match");
}

#[test]
fn test_bitmask_boundary_64_vs_65() {
    // Generate w0..w64 once; the 64-segment case is the first 64 of them.
    let segments_65: Vec<String> = (0..65).map(|idx| format!("w{idx}")).collect();
    let segments_64 = &segments_65[..64];

    // 64 unique AND segments: exactly at bitmask capacity per the original note.
    let pattern_64 = segments_64.join("&");
    let matcher_64 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, &pattern_64)
        .build()
        .unwrap();

    // 65 unique AND segments: one past capacity, so the matrix fallback applies
    // per the original note.
    let pattern_65 = segments_65.join("&");
    let matcher_65 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, &pattern_65)
        .build()
        .unwrap();

    // Complete texts, and texts with only the final segment left out.
    let full_64 = segments_64.join(" ");
    let full_65 = segments_65.join(" ");
    let short_64 = segments_64[..63].join(" ");
    let short_65 = segments_65[..64].join(" ");

    assert!(
        matcher_64.is_match(&full_64),
        "64 segments: all present -> match"
    );
    assert!(
        !matcher_64.is_match(&short_64),
        "64 segments: one missing -> no match"
    );
    assert!(
        matcher_65.is_match(&full_65),
        "65 segments: all present -> match"
    );
    assert!(
        !matcher_65.is_match(&short_65),
        "65 segments: one missing -> no match"
    );

    // Each matcher holds a single rule, so exactly one result is expected.
    let hits_64 = matcher_64.process(&full_64);
    assert_eq!(hits_64.len(), 1);
    let hits_65 = matcher_65.process(&full_65);
    assert_eq!(hits_65.len(), 1);
}

#[test]
fn test_and_count_one_not_matrix() {
    // Ten distinct AND segments, each required once; per the original note this
    // stays on the bitmask path rather than the matrix path.
    let segments = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"];
    let pattern = segments.join("&");

    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, &pattern)
        .build()
        .unwrap();

    let full_text = segments.join(" ");
    assert!(matcher.is_match(&full_text), "all present -> match");

    // Leave out one segment at a time; every such text must fail to match.
    // Segments are unique, so omitting by position equals omitting by value.
    for (omitted_idx, skip) in segments.iter().enumerate() {
        let text_missing = segments
            .iter()
            .enumerate()
            .filter(|&(idx, _)| idx != omitted_idx)
            .map(|(_, segment)| *segment)
            .collect::<Vec<_>>()
            .join(" ");
        assert!(
            !matcher.is_match(&text_missing),
            "missing '{skip}' -> no match"
        );
    }

    // The single result must carry the original pattern text as its word.
    let hits = matcher.process(&full_text);
    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].word, pattern);
}

// ---------------------------------------------------------------------------
// Pattern indexing under Delete
// ---------------------------------------------------------------------------

#[test]
fn test_delete_adjusted_pattern_indexing() {
    // Rule "ab" under ProcessType::Delete. Per the original note the pattern is
    // stored verbatim and the input text is delete-stripped before scanning.
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Delete, 1, "ab")
        .build()
        .unwrap();

    // Inputs where "a" and "b" are adjacent or separated only by noise.
    let should_match = [
        ("ab", "direct match"),
        ("a*b", "noise char stripped"),
        ("a b", "space stripped"),
        ("a!!b", "multiple noise chars stripped"),
    ];
    for (text, why) in should_match {
        assert!(matcher.is_match(text), "{why}");
    }

    assert!(!matcher.is_match("ac"), "no match");
}

#[test]
fn test_fanjian_delete_pattern_indexing() {
    // Rule "测试" under the combined Fanjian | Delete process type; the inputs
    // below cover each transformation alone and both together.
    let combined = ProcessType::Fanjian | ProcessType::Delete;
    let matcher = SimpleMatcherBuilder::new()
        .add_word(combined, 1, "测试")
        .build()
        .unwrap();

    let should_match = [
        ("测试", "simplified direct"),
        ("測試", "traditional -> Fanjian path"),
        ("测!试", "simplified + noise -> Delete path"),
        ("測!試", "traditional + noise -> Fanjian + Delete"),
    ];
    for (text, why) in should_match {
        assert!(matcher.is_match(text), "{why}");
    }
}

// ---------------------------------------------------------------------------
// AC automaton behavior: overlapping patterns, mixed engines
// ---------------------------------------------------------------------------

#[test]
fn test_overlapping_words() {
    // Rule 2 spans the whole text while rules 1 and 3 are substrings of it;
    // all three are expected to be reported.
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello")
        .add_word(ProcessType::None, 2, "hello world")
        .add_word(ProcessType::None, 3, "world")
        .build()
        .unwrap();

    let hits = matcher.process("hello world");

    // Compare ids in sorted order so result ordering does not matter.
    let mut reported_ids = hits
        .into_iter()
        .map(|hit| hit.word_id)
        .collect::<Vec<u32>>();
    reported_ids.sort_unstable();

    assert_eq!(reported_ids, [1, 2, 3]);
}

#[test]
fn test_mixed_ascii_and_cjk_rules_on_non_ascii_text() {
    // One ASCII rule and one CJK rule, probed with text containing both.
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "abc")
        .add_word(ProcessType::None, 2, "你好")
        .build()
        .unwrap();

    let hits = matcher.process("你好 abc");

    // Order of results is not asserted, only the set of rule ids.
    let mut reported_ids: Vec<u32> = hits.into_iter().map(|hit| hit.word_id).collect();
    reported_ids.sort();

    assert_eq!(reported_ids, [1, 2]);
}

// ---------------------------------------------------------------------------
// ASCII engine routing
// ---------------------------------------------------------------------------

#[test]
fn test_ascii_only_text_routing() {
    // One ASCII rule ("hello") and one CJK rule ("你好") in the same matcher.
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello")
        .add_word(ProcessType::None, 2, "你好")
        .build()
        .unwrap();

    // Pure ASCII text: only the ASCII rule is present in the input.
    let ascii_hits = matcher.process("hello world");
    assert_eq!(ascii_hits.len(), 1);
    assert_eq!(ascii_hits[0].word_id, 1);

    // Mixed text: both rules are present, so both must be reported.
    let mut mixed_ids = matcher
        .process("hello 你好")
        .into_iter()
        .map(|hit| hit.word_id)
        .collect::<Vec<u32>>();
    mixed_ids.sort_unstable();
    assert_eq!(mixed_ids, [1, 2]);

    // CJK-only text: only the CJK rule is present in the input.
    let cjk_hits = matcher.process("你好世界");
    assert_eq!(cjk_hits.len(), 1);
    assert_eq!(cjk_hits[0].word_id, 2);
}

#[test]
fn test_ascii_engine_only_when_no_non_ascii_patterns() {
    // Every rule is ASCII, so per the original note the non-ASCII matcher may
    // not exist at all. ASCII substrings inside non-ASCII text must still be found.
    let ascii_rules = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello")
        .add_word(ProcessType::None, 2, "world")
        .build()
        .unwrap();

    let found_in_mixed = ascii_rules.is_match("hello 世界");
    assert!(found_in_mixed, "ASCII substring in non-ASCII text");

    let found_in_cjk = ascii_rules.is_match("你好世界");
    assert!(!found_in_cjk, "no ASCII pattern in CJK text");

    // Both ASCII rules surround a CJK run; both must be reported.
    let hits = ascii_rules.process("hello 世界 world");
    let mut reported_ids = hits
        .into_iter()
        .map(|hit| hit.word_id)
        .collect::<Vec<u32>>();
    reported_ids.sort_unstable();
    assert_eq!(reported_ids, [1, 2]);
}