use matcher_rs::{
ProcessType, SimpleMatcherBuilder, reduce_text_process, reduce_text_process_emit, text_process,
};
/// Pins the output of `text_process` for each single-step `ProcessType`, then
/// checks that the romanizing steps do not turn the ASCII digit "1" into "yi".
#[test]
fn test_each_process_type() {
    // One group per step; every row is (input, expected output).
    let groups: [(ProcessType, &[(&str, &str)]); 6] = [
        (
            // Traditional characters become simplified; katakana and ASCII are unchanged.
            ProcessType::VariantNorm,
            &[
                ("測試", "测试"),
                ("國語", "国语"),
                ("アイウ", "アイウ"),
                ("hello カタカナ world", "hello カタカナ world"),
                // NOTE(review): this row repeats the third row verbatim — confirm
                // whether a different input form was intended here.
                ("アイウ", "アイウ"),
            ],
        ),
        (
            // Punctuation and whitespace between letters are removed.
            ProcessType::Delete,
            &[("a*b", "ab"), ("a b", "ab"), ("a!!b", "ab")],
        ),
        (
            // Letters are lowercased, accents dropped, numeric/unit symbols expanded.
            ProcessType::Normalize,
            &[
                ("ABⅣ①℉", "ab41°f"),
                ("Café", "cafe"),
                ("CAFÉ", "cafe"),
                ("I", "i"),
            ],
        ),
        (
            // Each syllable is emitted with a leading space.
            ProcessType::Romanize,
            &[
                ("西安", " xi an"),
                ("한글", " han geul"),
                ("あいう", " a i u"),
                ("カタカナ", " ka ta ka na"),
                ("中あ한", " zhong a han"),
            ],
        ),
        (
            // Same syllables as Romanize, but with no separators.
            ProcessType::RomanizeChar,
            &[("西安", "xian"), ("한글", "hangeul")],
        ),
        (
            // Emoji become a space-prefixed name; plain text is unchanged.
            ProcessType::EmojiNorm,
            &[
                ("👍", " thumbs_up"),
                ("🔥", " fire"),
                ("hello world", "hello world"),
            ],
        ),
    ];

    for (pt, rows) in groups {
        for &(input, expected) in rows {
            let actual = text_process(pt, input);
            assert_eq!(
                actual.as_ref(),
                expected,
                "{pt:?} on {input:?}: expected {expected:?}, got {:?}",
                actual.as_ref()
            );
        }
    }

    // A "yi" pattern must not fire on the digit "1" under either romanizing step.
    for pt in [ProcessType::Romanize, ProcessType::RomanizeChar] {
        let digit_probe = SimpleMatcherBuilder::new()
            .add_word(pt, 1, "yi")
            .build()
            .unwrap();
        let digit_matched = digit_probe.is_match("1");
        assert!(!digit_matched, "{pt:?} should skip ASCII digits");
    }
}
/// Checks that a `Normalize` pattern "cafe" matches accented input, both with a
/// precomposed "é" (U+00E9) and with "e" followed by a combining acute (U+0301).
#[test]
fn test_normalize_combining_characters() {
    let cafe_matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Normalize, 1, "cafe")
        .build()
        .unwrap();

    // Precomposed form: the matcher hits and the transform alone yields "cafe".
    let precomposed_input = "caf\u{00E9}";
    assert!(cafe_matcher.is_match(precomposed_input));
    let precomposed = text_process(ProcessType::Normalize, precomposed_input);
    assert_eq!(precomposed.as_ref(), "cafe");

    // Decomposed form: "cafe" is followed by a combining mark.
    assert!(cafe_matcher.is_match("cafe\u{0301}"));

    // A lone combining mark is asserted to come back unchanged.
    // NOTE(review): the result is named `decomposed`, which hints that the full
    // "cafe\u{0301}" string may have been the intended input — confirm.
    let lone_mark = "\u{0301}";
    let decomposed = text_process(ProcessType::Normalize, lone_mark);
    assert_eq!(decomposed.as_ref(), "\u{0301}");
}
/// Pins `EmojiNorm` output for emoji carrying modifiers (skin tone, variation
/// selector, ZWJ sequences), for runs of emoji, and for emoji inside text.
#[test]
fn test_emoji_norm_modifiers() {
    // (input, expected output)
    let expectations = [
        // Skin-tone modifier is dropped.
        ("👍🏽", " thumbs_up"),
        // Variation selector U+FE0F is dropped.
        ("❤\u{FE0F}", " red_heart"),
        // ZWJ family sequence expands to its component names.
        ("👨\u{200D}👩\u{200D}👧", " man woman girl"),
        // Consecutive emoji each contribute a space-prefixed name.
        ("🔥❤🎉", " fire red_heart party_popper"),
        // NOTE(review): the rows above show a leading space per emoji name, which
        // would suggest two spaces before "fire" here — confirm the expected text.
        ("Hello 🔥 World", "Hello fire World"),
    ];

    for (input, expected) in expectations {
        let normalized = text_process(ProcessType::EmojiNorm, input);
        assert_eq!(normalized.as_ref(), expected, "EmojiNorm on {input:?}");
    }
}
/// Checks the variant lists produced by `reduce_text_process` and
/// `reduce_text_process_emit` for a composite pipeline.
#[test]
fn test_reduce_text_process_pipeline() {
    let pipeline = ProcessType::VariantNormDeleteNormalize;
    let raw = "!A!測試!1!";

    // Full reduction: the raw text followed by the result after each step.
    let variants = reduce_text_process(pipeline, raw);
    let expected_variants = ["!A!測試!1!", "!A!测试!1!", "A测试1", "a测试1"];
    assert_eq!(variants.len(), 4);
    for (idx, want) in expected_variants.iter().enumerate() {
        assert_eq!(variants[idx], *want);
    }

    // The emit flavour returns only two of those texts for the same input.
    let emit = reduce_text_process_emit(pipeline, raw);
    assert_eq!(emit.len(), 2);
    assert_eq!(emit[0], "!A!测试!1!");
    assert_eq!(emit[1], "a测试1");

    // With every step combined, only the final variant is checked.
    let every_step = ProcessType::VariantNorm
        | ProcessType::Delete
        | ProcessType::Normalize
        | ProcessType::Romanize
        | ProcessType::RomanizeChar;
    let all = reduce_text_process(every_step, "A!漢語西安1");
    assert_eq!(all.last().unwrap(), "a han yu xi an1");
}
/// Regression test (per its name, aimed at the Delete step) that checks every
/// intermediate variant of the VariantNorm → Delete → Normalize pipeline on a
/// space-separated mixed ASCII/CJK input.
#[test]
fn test_delete_simd_regression() {
    let variants = reduce_text_process(ProcessType::VariantNormDeleteNormalize, "A B 測試 A 1");

    // Check the length first, as the sibling pipeline test does: a short result
    // then fails with a readable assertion instead of an out-of-bounds index panic.
    assert_eq!(
        variants.len(),
        4,
        "expected the raw text plus three transformed variants"
    );

    // Raw input, untouched.
    assert_eq!(variants[0], "A B 測試 A 1");
    // Traditional characters simplified; spaces still present.
    assert_eq!(variants[1], "A B 测试 A 1");
    // Spaces removed.
    assert_eq!(variants[2], "AB测试A1");
    // Lowercased.
    assert_eq!(variants[3], "ab测试a1");
}
/// Checks that composite `ProcessType` flags cooperate inside a matcher:
/// variant-norm + delete + normalize, none + romanize, and variant-norm + romanize.
#[test]
fn test_composition_variant_norm_delete_normalize() {
    // Traditional input with punctuation in the middle still hits the simplified word.
    let strip_flags = ProcessType::VariantNorm | ProcessType::Delete | ProcessType::Normalize;
    let strip_matcher = SimpleMatcherBuilder::new()
        .add_word(strip_flags, 1, "测试")
        .build()
        .unwrap();
    assert!(strip_matcher.is_match("測!試"));

    // An "&" pattern mixing a raw ASCII part with a part matched via romanization.
    let raw_or_roman = ProcessType::None | ProcessType::Romanize;
    let and_matcher = SimpleMatcherBuilder::new()
        .add_word(raw_or_roman, 1, "apple&西安")
        .build()
        .unwrap();
    assert!(and_matcher.is_match("apple 洗按"));

    // Simplified, traditional, and different characters with the same romanization all hit.
    let variant_roman = ProcessType::VariantNorm | ProcessType::Romanize;
    let sound_matcher = SimpleMatcherBuilder::new()
        .add_word(variant_roman, 1, "测试")
        .build()
        .unwrap();
    assert!(sound_matcher.is_match("测试"));
    assert!(sound_matcher.is_match("測試"));
    let homophone_hit = sound_matcher.is_match("策士");
    assert!(
        homophone_hit,
        "different chars with same romanize syllables"
    );
}
/// Documents a pitfall of combining `Delete` with `EmojiNorm`: the pattern
/// "fire" is asserted NOT to match the fire emoji, while literal "fire" does.
#[test]
fn test_delete_emoji_norm_gotcha() {
    let fire_matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Delete | ProcessType::EmojiNorm, 1, "fire")
        .build()
        .unwrap();

    let emoji_hit = fire_matcher.is_match("🔥");
    assert!(!emoji_hit, "Delete strips emoji before EmojiNorm");

    let literal_hit = fire_matcher.is_match("fire");
    assert!(literal_hit, "literal 'fire' still matches");
}
/// Checks that `None | Delete` matches the raw text as well as text whose
/// separators are removed by `Delete`, and still rejects unrelated input.
#[test]
fn test_none_in_composite_preserves_raw() {
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None | ProcessType::Delete, 1, "helloworld")
        .build()
        .unwrap();

    // (text that must match, reason reported on failure)
    let positives = [
        ("helloworld", "raw match via None"),
        ("hello world", "Delete strips space"),
        ("hello-world", "Delete strips hyphen"),
    ];
    for (text, reason) in positives {
        assert!(matcher.is_match(text), "{reason}");
    }

    assert!(!matcher.is_match("hallo"));
}
/// Contrasts `Romanize` and `RomanizeChar` on the pattern "xian": the single
/// character "先" matches under both, the two-character "西安" only under
/// `RomanizeChar`.
#[test]
fn test_romanize_vs_romanize_char_spacing() {
    let spaced = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Romanize, 1, "xian")
        .build()
        .unwrap();
    let compact = SimpleMatcherBuilder::new()
        .add_word(ProcessType::RomanizeChar, 1, "xian")
        .build()
        .unwrap();

    // (matcher, text, whether a match is expected)
    let expectations = [
        (&spaced, "先", true),
        (&spaced, "西安", false),
        (&compact, "先", true),
        (&compact, "西安", true),
    ];
    for (matcher, text, should_match) in expectations {
        assert!(matcher.is_match(text) == should_match);
    }
}
/// Checks that a "|" alternation pattern works together with `VariantNorm`:
/// either alternative matches, and so does a traditional form of the first.
#[test]
fn test_or_with_process_type() {
    let either = SimpleMatcherBuilder::new()
        .add_word(ProcessType::VariantNorm, 1, "测试|世界")
        .build()
        .unwrap();

    // Each alternative matches on its own.
    for alternative in ["测试", "世界"] {
        assert!(either.is_match(alternative));
    }

    let traditional_hit = either.is_match("測試");
    assert!(traditional_hit, "traditional variant via VariantNorm");
}
/// Builds matchers that mix two different process types and checks that words
/// registered under each type are still found.
#[test]
fn test_streaming_scan_paths() {
    // Normalize + VariantNorm in one matcher.
    let norm_and_variant = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Normalize, 1, "ab")
        .add_word(ProcessType::VariantNorm, 2, "测试")
        .build()
        .unwrap();
    // NOTE(review): the message mentions fullwidth input, but the literal appears
    // as ASCII "AB" in this view — confirm the intended characters.
    assert!(norm_and_variant.is_match("AB"), "fullwidth → normalized to ab");
    assert!(norm_and_variant.is_match("測試"), "traditional → simplified");
    // Both words are reported when both occur in one text.
    let hits = norm_and_variant.process("AB 測試");
    assert_eq!(hits.len(), 2);

    // Romanize + VariantNorm in one matcher.
    let roman_and_variant = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Romanize, 1, "xi an")
        .add_word(ProcessType::VariantNorm, 2, "测试")
        .build()
        .unwrap();
    for text in ["西安", "測試"] {
        assert!(roman_and_variant.is_match(text));
    }
}
/// Checks that pure-ASCII text is matched by both a `VariantNorm` word and a
/// `None` word, and that both hits are reported.
#[test]
fn test_ascii_noop_optimization() {
    let ascii_text = "hello world";
    let ascii_matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::VariantNorm, 1, "hello")
        .add_word(ProcessType::None, 2, "world")
        .build()
        .unwrap();

    assert!(ascii_matcher.is_match(ascii_text));

    let hit_count = ascii_matcher.process(ascii_text).len();
    assert_eq!(hit_count, 2);
}
/// Exercises matching on unusual Unicode input: ZWJ emoji sequences, combining
/// marks, private-use code points, emoji inside patterns, and emoji names.
#[test]
fn test_unicode_robustness() {
    // ASCII word followed by a ZWJ family emoji and another emoji: exactly one hit.
    {
        let ascii_word = SimpleMatcherBuilder::new()
            .add_word(ProcessType::VariantNormDeleteNormalize, 1, "test")
            .build()
            .unwrap();
        let noisy = "test 👨\u{200D}👩\u{200D}👧\u{200D}👦 🎉";
        assert!(ascii_word.is_match(noisy));
        assert_eq!(ascii_word.process(noisy).len(), 1);
    }

    // Raw matching: "cafe" is still found when a combining acute follows it.
    {
        let raw_cafe = SimpleMatcherBuilder::new()
            .add_word(ProcessType::None, 1, "cafe")
            .build()
            .unwrap();
        assert!(raw_cafe.is_match("cafe\u{0301}"));
    }

    // Private-use code points work as a pattern and pass through VariantNorm unchanged.
    {
        let pua = "\u{E000}\u{E001}\u{F8FF}";
        let pua_matcher = SimpleMatcherBuilder::new()
            .add_word(ProcessType::None, 1, pua)
            .build()
            .unwrap();
        assert!(pua_matcher.is_match(pua));
        let passthrough = text_process(ProcessType::VariantNorm, pua);
        assert_eq!(passthrough.as_ref(), pua);
    }

    // A pattern ending in an emoji needs the emoji present in the text.
    {
        let emoji_pat = "test\u{1F389}";
        let emoji_suffix = SimpleMatcherBuilder::new()
            .add_word(ProcessType::None, 1, emoji_pat)
            .build()
            .unwrap();
        assert!(emoji_suffix.is_match("test\u{1F389}"));
        assert!(!emoji_suffix.is_match("test"));
    }

    // Emoji names registered under EmojiNorm hit on the emoji themselves,
    // including one that carries a skin-tone modifier.
    {
        let emoji_names = SimpleMatcherBuilder::new()
            .add_word(ProcessType::EmojiNorm, 1, "fire")
            .add_word(ProcessType::EmojiNorm, 2, "thumbs_up")
            .build()
            .unwrap();
        assert!(emoji_names.is_match("🔥"));
        assert!(emoji_names.is_match("👍🏽"));
        let hits = emoji_names.process("I love 🔥 and 👍");
        assert_eq!(hits.len(), 2);
    }
}