matcher_rs 0.15.2

A high-performance matcher designed to solve LOGICAL and TEXT VARIATIONS problems in word matching, implemented in Rust.
Documentation
use matcher_rs::{
    ProcessType, SimpleMatcherBuilder, reduce_text_process, reduce_text_process_emit, text_process,
};

// ===========================================================================
// Individual ProcessType spot checks (table-driven)
// ===========================================================================

#[test]
fn test_each_process_type() {
    let cases: &[(ProcessType, &str, &str)] = &[
        // VariantNorm: Traditional → Simplified Chinese
        (ProcessType::VariantNorm, "測試", "测试"),
        (ProcessType::VariantNorm, "國語", "国语"),
        // VariantNorm: halfwidth katakana → fullwidth
        (ProcessType::VariantNorm, "アイウ", "アイウ"),
        (
            ProcessType::VariantNorm,
            "hello カタカナ world",
            "hello カタカナ world",
        ),
        // VariantNorm: fullwidth katakana unchanged
        (ProcessType::VariantNorm, "アイウ", "アイウ"),
        // Delete: strips configured codepoints
        (ProcessType::Delete, "a*b", "ab"),
        (ProcessType::Delete, "a b", "ab"),
        (ProcessType::Delete, "a!!b", "ab"),
        // Normalize: compatibility chars + casefold + diacritic strip
        (ProcessType::Normalize, "ABⅣ①℉", "ab41°f"),
        (ProcessType::Normalize, "Café", "cafe"),
        (ProcessType::Normalize, "CAFÉ", "cafe"),
        (ProcessType::Normalize, "I", "i"),
        // Romanize: CJK → pinyin with inter-char spacing
        (ProcessType::Romanize, "西安", " xi an"),
        (ProcessType::Romanize, "한글", " han geul"),
        (ProcessType::Romanize, "あいう", " a i u"),
        (ProcessType::Romanize, "カタカナ", " ka ta ka na"),
        (ProcessType::Romanize, "中あ한", " zhong a han"),
        // RomanizeChar: no inter-char spaces
        (ProcessType::RomanizeChar, "西安", "xian"),
        (ProcessType::RomanizeChar, "한글", "hangeul"),
        // EmojiNorm: emoji → CLDR short names
        (ProcessType::EmojiNorm, "👍", " thumbs_up"),
        (ProcessType::EmojiNorm, "🔥", " fire"),
        (ProcessType::EmojiNorm, "hello world", "hello world"),
    ];

    for &(pt, input, expected) in cases {
        let result = text_process(pt, input);
        assert_eq!(
            result.as_ref(),
            expected,
            "{pt:?} on {input:?}: expected {expected:?}, got {:?}",
            result.as_ref()
        );
    }

    // Romanize/RomanizeChar should skip ASCII digits
    for pt in [ProcessType::Romanize, ProcessType::RomanizeChar] {
        let m = SimpleMatcherBuilder::new()
            .add_word(pt, 1, "yi")
            .build()
            .unwrap();
        assert!(!m.is_match("1"), "{pt:?} should skip ASCII digits");
    }
}

// ===========================================================================
// Normalize: combining characters (subtle behavior)
// ===========================================================================

#[test]
fn test_normalize_combining_characters() {
    // Per-codepoint lookup table, NOT full NFC composition.
    // Precomposed chars have table entries; standalone combining marks pass
    // through.
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Normalize, 1, "cafe")
        .build()
        .unwrap();

    // Precomposed é (U+00E9) → "e" via table
    assert!(matcher.is_match("caf\u{00E9}"));
    let precomposed = text_process(ProcessType::Normalize, "caf\u{00E9}");
    assert_eq!(precomposed.as_ref(), "cafe");

    // Decomposed: "cafe" + combining acute (U+0301) — "cafe" prefix still matches
    assert!(matcher.is_match("cafe\u{0301}"));
    // Standalone combining mark has no table entry, passes through
    let decomposed = text_process(ProcessType::Normalize, "\u{0301}");
    assert_eq!(decomposed.as_ref(), "\u{0301}");
}

// ===========================================================================
// EmojiNorm modifiers (table-driven)
// ===========================================================================

#[test]
fn test_emoji_norm_modifiers() {
    let cases: &[(&str, &str)] = &[
        // Skin tone stripped, base emoji normalized
        ("👍🏽", " thumbs_up"),
        // VS16 stripped
        ("\u{FE0F}", " red_heart"),
        // ZWJ sequence: each component normalized independently
        ("👨\u{200D}👩\u{200D}👧", " man woman girl"),
        // Multiple emoji
        ("🔥❤🎉", " fire red_heart party_popper"),
        // Mixed text
        ("Hello 🔥 World", "Hello  fire World"),
    ];

    for &(input, expected) in cases {
        let result = text_process(ProcessType::EmojiNorm, input);
        assert_eq!(result.as_ref(), expected, "EmojiNorm on {input:?}");
    }
}

// ===========================================================================
// Standalone transform API: reduce_text_process pipeline
// ===========================================================================

#[test]
fn test_reduce_text_process_pipeline() {
    let variants = reduce_text_process(ProcessType::VariantNormDeleteNormalize, "!A!測試!1!");
    // 0. Original: "!A!測試!1!"
    // 1. VariantNorm: "!A!测试!1!"
    // 2. Delete:      "A测试1"
    // 3. Normalize:   "a测试1"
    assert_eq!(variants.len(), 4);
    assert_eq!(variants[0], "!A!測試!1!");
    assert_eq!(variants[1], "!A!测试!1!");
    assert_eq!(variants[2], "A测试1");
    assert_eq!(variants[3], "a测试1");

    // Emit behavior: replace-type steps overwrite; Delete appends
    let emit =
        reduce_text_process_emit(ProcessType::VariantNormDeleteNormalize, "!A!測試!1!");
    assert_eq!(emit.len(), 2);
    assert_eq!(emit[0], "!A!测试!1!");
    assert_eq!(emit[1], "a测试1");

    // All transforms combined
    let all = reduce_text_process(
        ProcessType::VariantNorm
            | ProcessType::Delete
            | ProcessType::Normalize
            | ProcessType::Romanize
            | ProcessType::RomanizeChar,
        "A!漢語西安1",
    );
    assert_eq!(all.last().unwrap(), "a han yu xi an1");
}

// ===========================================================================
// Regression: SIMD fast-skip in DeleteFindIter
// ===========================================================================

#[test]
fn test_delete_simd_regression() {
    let variants = reduce_text_process(ProcessType::VariantNormDeleteNormalize, "A B 測試 A 1");
    assert_eq!(variants[0], "A B 測試 A 1");
    assert_eq!(variants[1], "A B 测试 A 1");
    assert_eq!(variants[2], "AB测试A1");
    assert_eq!(variants[3], "ab测试a1");
}

// ===========================================================================
// Composite ProcessType through SimpleMatcher
// ===========================================================================

#[test]
fn test_composition_variant_norm_delete_normalize() {
    // Kitchen-sink: VariantNorm + Delete + Normalize
    let m1 = SimpleMatcherBuilder::new()
        .add_word(
            ProcessType::VariantNorm | ProcessType::Delete | ProcessType::Normalize,
            1,
            "测试",
        )
        .build()
        .unwrap();
    assert!(m1.is_match("測!試"));

    // Cross-variant: None + Romanize
    let m2 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None | ProcessType::Romanize, 1, "apple&西安")
        .build()
        .unwrap();
    assert!(m2.is_match("apple 洗按"));

    // VariantNorm + Romanize
    let m3 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::VariantNorm | ProcessType::Romanize, 1, "测试")
        .build()
        .unwrap();
    assert!(m3.is_match("测试"));
    assert!(m3.is_match("測試"));
    assert!(
        m3.is_match("策士"),
        "different chars with same romanize syllables"
    );
}

#[test]
fn test_delete_emoji_norm_gotcha() {
    // Delete strips emoji codepoints BEFORE EmojiNorm can convert them.
    let pt = ProcessType::Delete | ProcessType::EmojiNorm;
    let matcher = SimpleMatcherBuilder::new()
        .add_word(pt, 1, "fire")
        .build()
        .unwrap();

    assert!(
        !matcher.is_match("🔥"),
        "Delete strips emoji before EmojiNorm"
    );
    assert!(matcher.is_match("fire"), "literal 'fire' still matches");
}

#[test]
fn test_none_in_composite_preserves_raw() {
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None | ProcessType::Delete, 1, "helloworld")
        .build()
        .unwrap();

    assert!(matcher.is_match("helloworld"), "raw match via None");
    assert!(matcher.is_match("hello world"), "Delete strips space");
    assert!(matcher.is_match("hello-world"), "Delete strips hyphen");
    assert!(!matcher.is_match("hallo"));
}

#[test]
fn test_romanize_vs_romanize_char_spacing() {
    // Romanize adds boundary spaces; RomanizeChar omits them
    let mr = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Romanize, 1, "xian")
        .build()
        .unwrap();

    let mrc = SimpleMatcherBuilder::new()
        .add_word(ProcessType::RomanizeChar, 1, "xian")
        .build()
        .unwrap();

    // 先 → Romanize: " xian " (single syllable, contains "xian")
    assert!(mr.is_match(""));
    // 西安 → Romanize: " xi  an " (two syllables, NOT "xian")
    assert!(!mr.is_match("西安"));
    // RomanizeChar: both → "xian" (no spaces)
    assert!(mrc.is_match(""));
    assert!(mrc.is_match("西安"));
}

#[test]
fn test_or_with_process_type() {
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::VariantNorm, 1, "测试|世界")
        .build()
        .unwrap();

    assert!(matcher.is_match("测试"));
    assert!(matcher.is_match("世界"));
    assert!(
        matcher.is_match("測試"),
        "traditional variant via VariantNorm"
    );
}

// ===========================================================================
// Streaming scan paths
// ===========================================================================

#[test]
fn test_streaming_scan_paths() {
    // Normalize + VariantNorm forces General mode with streaming iterators
    let m1 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Normalize, 1, "ab")
        .add_word(ProcessType::VariantNorm, 2, "测试")
        .build()
        .unwrap();

    assert!(m1.is_match("AB"), "fullwidth → normalized to ab");
    assert!(m1.is_match("測試"), "traditional → simplified");
    assert_eq!(m1.process("AB 測試").len(), 2);

    // Romanize streaming
    let m2 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Romanize, 1, "xi an")
        .add_word(ProcessType::VariantNorm, 2, "测试")
        .build()
        .unwrap();

    assert!(m2.is_match("西安"));
    assert!(m2.is_match("測試"));
}

#[test]
fn test_ascii_noop_optimization() {
    // VariantNorm is a no-op on pure ASCII — scan reuses parent text.
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::VariantNorm, 1, "hello")
        .add_word(ProcessType::None, 2, "world")
        .build()
        .unwrap();

    assert!(matcher.is_match("hello world"));
    assert_eq!(matcher.process("hello world").len(), 2);
}

// ===========================================================================
// Unicode robustness (table-driven)
// ===========================================================================

#[test]
fn test_unicode_robustness() {
    // Emoji + ZWJ passthrough
    let m1 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::VariantNormDeleteNormalize, 1, "test")
        .build()
        .unwrap();
    let text = "test 👨\u{200D}👩\u{200D}👧\u{200D}👦 🎉";
    assert!(m1.is_match(text));
    assert_eq!(m1.process(text).len(), 1);

    // Combining marks: "cafe" matches "cafe" + combining acute as byte prefix
    let m2 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "cafe")
        .build()
        .unwrap();
    assert!(m2.is_match("cafe\u{0301}"));

    // Private Use Area chars pass through unchanged
    let pua = "\u{E000}\u{E001}\u{F8FF}";
    let m3 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, pua)
        .build()
        .unwrap();
    assert!(m3.is_match(pua));
    assert_eq!(text_process(ProcessType::VariantNorm, pua).as_ref(), pua);

    // 4-byte emoji in pattern
    let emoji_pat = "test\u{1F389}";
    let m4 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, emoji_pat)
        .build()
        .unwrap();
    assert!(m4.is_match("test\u{1F389}"));
    assert!(!m4.is_match("test"));

    // EmojiNorm integration through matcher
    let m5 = SimpleMatcherBuilder::new()
        .add_word(ProcessType::EmojiNorm, 1, "fire")
        .add_word(ProcessType::EmojiNorm, 2, "thumbs_up")
        .build()
        .unwrap();
    assert!(m5.is_match("🔥"));
    assert!(m5.is_match("👍🏽"));
    assert_eq!(m5.process("I love 🔥 and 👍").len(), 2);
}