use std::path::Path;
use matcher_rs::{
ProcessType, SimpleMatcherBuilder, reduce_text_process, reduce_text_process_emit, text_process,
};
#[test]
fn test_each_process_type() {
let cases: &[(ProcessType, &str, &str)] = &[
(ProcessType::VariantNorm, "測試", "测试"),
(ProcessType::VariantNorm, "國語", "国语"),
(ProcessType::VariantNorm, "アイウ", "アイウ"),
(
ProcessType::VariantNorm,
"hello カタカナ world",
"hello カタカナ world",
),
(ProcessType::VariantNorm, "アイウ", "アイウ"),
(ProcessType::Delete, "a*b", "ab"),
(ProcessType::Delete, "a b", "ab"),
(ProcessType::Delete, "a!!b", "ab"),
(ProcessType::Normalize, "ABⅣ①℉", "ab41°f"),
(ProcessType::Normalize, "Café", "cafe"),
(ProcessType::Normalize, "CAFÉ", "cafe"),
(ProcessType::Normalize, "I", "i"),
(ProcessType::Romanize, "西安", " xi an"),
(ProcessType::Romanize, "한글", " han geul"),
(ProcessType::Romanize, "あいう", " a i u"),
(ProcessType::Romanize, "カタカナ", " ka ta ka na"),
(ProcessType::Romanize, "中あ한", " zhong a han"),
(ProcessType::RomanizeChar, "西安", "xian"),
(ProcessType::RomanizeChar, "한글", "hangeul"),
(ProcessType::EmojiNorm, "👍", " thumbs_up"),
(ProcessType::EmojiNorm, "🔥", " fire"),
(ProcessType::EmojiNorm, "hello world", "hello world"),
];
for &(pt, input, expected) in cases {
let result = text_process(pt, input);
assert_eq!(
result.as_ref(),
expected,
"{pt:?} on {input:?}: expected {expected:?}, got {:?}",
result.as_ref()
);
}
for pt in [ProcessType::Romanize, ProcessType::RomanizeChar] {
let m = SimpleMatcherBuilder::new()
.add_word(pt, 1, "yi")
.build()
.unwrap();
assert!(!m.is_match("1"), "{pt:?} should skip ASCII digits");
}
}
#[test]
fn test_normalize_combining_characters() {
let matcher = SimpleMatcherBuilder::new()
.add_word(ProcessType::Normalize, 1, "cafe")
.build()
.unwrap();
assert!(matcher.is_match("caf\u{00E9}"));
let precomposed = text_process(ProcessType::Normalize, "caf\u{00E9}");
assert_eq!(precomposed.as_ref(), "cafe");
assert!(matcher.is_match("cafe\u{0301}"));
let decomposed = text_process(ProcessType::Normalize, "\u{0301}");
assert_eq!(decomposed.as_ref(), "\u{0301}");
}
#[test]
fn test_emoji_norm_modifiers() {
let cases: &[(&str, &str)] = &[
("👍🏽", " thumbs_up"),
("❤\u{FE0F}", " red_heart"),
("👨\u{200D}👩\u{200D}👧", " man woman girl"),
("🔥❤🎉", " fire red_heart party_popper"),
("Hello 🔥 World", "Hello fire World"),
];
for &(input, expected) in cases {
let result = text_process(ProcessType::EmojiNorm, input);
assert_eq!(result.as_ref(), expected, "EmojiNorm on {input:?}");
}
}
#[test]
fn test_reduce_text_process_pipeline() {
let variants = reduce_text_process(ProcessType::VariantNormDeleteNormalize, "!A!測試!1!");
assert_eq!(variants.len(), 4);
assert_eq!(variants[0], "!A!測試!1!");
assert_eq!(variants[1], "!A!测试!1!");
assert_eq!(variants[2], "A测试1");
assert_eq!(variants[3], "a测试1");
let emit =
reduce_text_process_emit(ProcessType::VariantNormDeleteNormalize, "!A!測試!1!");
assert_eq!(emit.len(), 2);
assert_eq!(emit[0], "!A!测试!1!");
assert_eq!(emit[1], "a测试1");
let all = reduce_text_process(
ProcessType::VariantNorm
| ProcessType::Delete
| ProcessType::Normalize
| ProcessType::Romanize
| ProcessType::RomanizeChar,
"A!漢語西安1",
);
assert_eq!(all.last().unwrap(), "a han yu xi an1");
}
#[test]
fn test_delete_simd_regression() {
let variants = reduce_text_process(ProcessType::VariantNormDeleteNormalize, "A B 測試 A 1");
assert_eq!(variants[0], "A B 測試 A 1");
assert_eq!(variants[1], "A B 测试 A 1");
assert_eq!(variants[2], "AB测试A1");
assert_eq!(variants[3], "ab测试a1");
}
#[test]
fn test_composition_variant_norm_delete_normalize() {
let m1 = SimpleMatcherBuilder::new()
.add_word(
ProcessType::VariantNorm | ProcessType::Delete | ProcessType::Normalize,
1,
"测试",
)
.build()
.unwrap();
assert!(m1.is_match("測!試"));
let m2 = SimpleMatcherBuilder::new()
.add_word(ProcessType::None | ProcessType::Romanize, 1, "apple&西安")
.build()
.unwrap();
assert!(m2.is_match("apple 洗按"));
let m3 = SimpleMatcherBuilder::new()
.add_word(ProcessType::VariantNorm | ProcessType::Romanize, 1, "测试")
.build()
.unwrap();
assert!(m3.is_match("测试"));
assert!(m3.is_match("測試"));
assert!(
m3.is_match("策士"),
"different chars with same romanize syllables"
);
}
#[test]
fn test_delete_emoji_norm_gotcha() {
let pt = ProcessType::Delete | ProcessType::EmojiNorm;
let matcher = SimpleMatcherBuilder::new()
.add_word(pt, 1, "fire")
.build()
.unwrap();
assert!(
!matcher.is_match("🔥"),
"Delete strips emoji before EmojiNorm"
);
assert!(matcher.is_match("fire"), "literal 'fire' still matches");
}
#[test]
fn test_none_in_composite_preserves_raw() {
let matcher = SimpleMatcherBuilder::new()
.add_word(ProcessType::None | ProcessType::Delete, 1, "helloworld")
.build()
.unwrap();
assert!(matcher.is_match("helloworld"), "raw match via None");
assert!(matcher.is_match("hello world"), "Delete strips space");
assert!(matcher.is_match("hello-world"), "Delete strips hyphen");
assert!(!matcher.is_match("hallo"));
}
#[test]
fn test_romanize_vs_romanize_char_spacing() {
let mr = SimpleMatcherBuilder::new()
.add_word(ProcessType::Romanize, 1, "xian")
.build()
.unwrap();
let mrc = SimpleMatcherBuilder::new()
.add_word(ProcessType::RomanizeChar, 1, "xian")
.build()
.unwrap();
assert!(mr.is_match("先"));
assert!(!mr.is_match("西安"));
assert!(mrc.is_match("先"));
assert!(mrc.is_match("西安"));
}
#[test]
fn test_or_with_process_type() {
let matcher = SimpleMatcherBuilder::new()
.add_word(ProcessType::VariantNorm, 1, "测试|世界")
.build()
.unwrap();
assert!(matcher.is_match("测试"));
assert!(matcher.is_match("世界"));
assert!(
matcher.is_match("測試"),
"traditional variant via VariantNorm"
);
}
#[test]
fn test_streaming_scan_paths() {
let m1 = SimpleMatcherBuilder::new()
.add_word(ProcessType::Normalize, 1, "ab")
.add_word(ProcessType::VariantNorm, 2, "测试")
.build()
.unwrap();
assert!(m1.is_match("AB"), "fullwidth → normalized to ab");
assert!(m1.is_match("測試"), "traditional → simplified");
assert_eq!(m1.process("AB 測試").len(), 2);
let m2 = SimpleMatcherBuilder::new()
.add_word(ProcessType::Romanize, 1, "xi an")
.add_word(ProcessType::VariantNorm, 2, "测试")
.build()
.unwrap();
assert!(m2.is_match("西安"));
assert!(m2.is_match("測試"));
}
#[test]
fn test_ascii_noop_optimization() {
let matcher = SimpleMatcherBuilder::new()
.add_word(ProcessType::VariantNorm, 1, "hello")
.add_word(ProcessType::None, 2, "world")
.build()
.unwrap();
assert!(matcher.is_match("hello world"));
assert_eq!(matcher.process("hello world").len(), 2);
}
#[test]
fn test_unicode_robustness() {
let m1 = SimpleMatcherBuilder::new()
.add_word(ProcessType::VariantNormDeleteNormalize, 1, "test")
.build()
.unwrap();
let text = "test 👨\u{200D}👩\u{200D}👧\u{200D}👦 🎉";
assert!(m1.is_match(text));
assert_eq!(m1.process(text).len(), 1);
let m2 = SimpleMatcherBuilder::new()
.add_word(ProcessType::None, 1, "cafe")
.build()
.unwrap();
assert!(m2.is_match("cafe\u{0301}"));
let pua = "\u{E000}\u{E001}\u{F8FF}";
let m3 = SimpleMatcherBuilder::new()
.add_word(ProcessType::None, 1, pua)
.build()
.unwrap();
assert!(m3.is_match(pua));
assert_eq!(text_process(ProcessType::VariantNorm, pua).as_ref(), pua);
let emoji_pat = "test\u{1F389}";
let m4 = SimpleMatcherBuilder::new()
.add_word(ProcessType::None, 1, emoji_pat)
.build()
.unwrap();
assert!(m4.is_match("test\u{1F389}"));
assert!(!m4.is_match("test"));
let m5 = SimpleMatcherBuilder::new()
.add_word(ProcessType::EmojiNorm, 1, "fire")
.add_word(ProcessType::EmojiNorm, 2, "thumbs_up")
.build()
.unwrap();
assert!(m5.is_match("🔥"));
assert!(m5.is_match("👍🏽"));
assert_eq!(m5.process("I love 🔥 and 👍").len(), 2);
}
fn process_map_dir() -> &'static Path {
Path::new(env!("CARGO_MANIFEST_DIR"))
.join("process_map")
.leak()
}
fn parse_tab_mappings(path: &Path) -> Vec<(String, String)> {
std::fs::read_to_string(path)
.unwrap()
.lines()
.filter(|l| !l.is_empty() && !l.starts_with('#'))
.map(|line| {
let mut parts = line.splitn(2, '\t');
let src = parts.next().unwrap().to_owned();
let dst = parts.next().unwrap_or("").to_owned();
(src, dst)
})
.collect()
}
fn parse_hex_codepoints(path: &Path) -> Vec<char> {
std::fs::read_to_string(path)
.unwrap()
.lines()
.filter(|l| !l.is_empty() && !l.starts_with('#'))
.filter_map(|line| {
let hex = line.strip_prefix("U+")?;
u32::from_str_radix(hex, 16).ok().and_then(char::from_u32)
})
.collect()
}
#[test]
fn test_process_map_variant_norm_exhaustive() {
let mappings = parse_tab_mappings(&process_map_dir().join("VARIANT_NORM.txt"));
assert!(
mappings.len() > 1000,
"expected 1000+ mappings, got {}",
mappings.len()
);
let mut failures = Vec::new();
for (src, expected) in &mappings {
let result = text_process(ProcessType::VariantNorm, src);
if result.as_ref() != expected.as_str() {
failures.push(format!(
"{src:?} → expected {expected:?}, got {:?}",
result.as_ref()
));
}
}
assert!(
failures.is_empty(),
"{} VariantNorm failures:\n{}",
failures.len(),
failures.join("\n")
);
}
#[test]
fn test_process_map_romanize_exhaustive() {
let mappings = parse_tab_mappings(&process_map_dir().join("ROMANIZE.txt"));
assert!(
mappings.len() > 10000,
"expected 10000+ mappings, got {}",
mappings.len()
);
let mut failures = Vec::new();
for (src, raw_dst) in &mappings {
let expected_romanize = format!(" {raw_dst}");
let result = text_process(ProcessType::Romanize, src);
if result.as_ref() != expected_romanize.as_str() {
failures.push(format!(
"Romanize {src:?} → expected {expected_romanize:?}, got {:?}",
result.as_ref()
));
}
let result_char = text_process(ProcessType::RomanizeChar, src);
if result_char.as_ref() != raw_dst.as_str() {
failures.push(format!(
"RomanizeChar {src:?} → expected {raw_dst:?}, got {:?}",
result_char.as_ref()
));
}
}
assert!(
failures.is_empty(),
"{} Romanize failures:\n{}",
failures.len(),
failures.join("\n")
);
}
#[test]
fn test_process_map_normalize_exhaustive() {
let norm = parse_tab_mappings(&process_map_dir().join("NORM.txt"));
let num_norm = parse_tab_mappings(&process_map_dir().join("NUM-NORM.txt"));
let all_mappings: Vec<_> = norm.iter().chain(num_norm.iter()).collect();
assert!(
all_mappings.len() > 5000,
"expected 5000+ mappings, got {}",
all_mappings.len()
);
let mut failures = Vec::new();
for (src, expected) in &all_mappings {
let result = text_process(ProcessType::Normalize, src);
if result.as_ref() != expected.as_str() {
failures.push(format!(
"{src:?} → expected {expected:?}, got {:?}",
result.as_ref()
));
}
}
assert!(
failures.is_empty(),
"{} Normalize failures:\n{}",
failures.len(),
failures.join("\n")
);
}
#[test]
fn test_process_map_delete_exhaustive() {
let codepoints = parse_hex_codepoints(&process_map_dir().join("TEXT-DELETE.txt"));
assert!(
codepoints.len() > 1000,
"expected 1000+ codepoints, got {}",
codepoints.len()
);
let mut failures = Vec::new();
for ch in &codepoints {
let input = ch.to_string();
let result = text_process(ProcessType::Delete, &input);
if !result.is_empty() {
failures.push(format!(
"U+{:04X} ({ch:?}) should be deleted, got {:?}",
*ch as u32,
result.as_ref()
));
}
}
assert!(
failures.is_empty(),
"{} Delete failures:\n{}",
failures.len(),
failures.join("\n")
);
}
#[test]
fn test_process_map_emoji_norm_exhaustive() {
let mappings = parse_tab_mappings(&process_map_dir().join("EMOJI_NORM.txt"));
assert!(
mappings.len() > 500,
"expected 500+ mappings, got {}",
mappings.len()
);
let mut failures = Vec::new();
for (src, raw_dst) in &mappings {
let expected = if raw_dst.is_empty() {
String::new()
} else {
format!(" {raw_dst}")
};
let result = text_process(ProcessType::EmojiNorm, src);
if result.as_ref() != expected.as_str() {
failures.push(format!(
"{src:?} → expected {expected:?}, got {:?}",
result.as_ref()
));
}
}
assert!(
failures.is_empty(),
"{} EmojiNorm failures:\n{}",
failures.len(),
failures.join("\n")
);
}