use matcher_rs::{ProcessType, SimpleMatcherBuilder};
/// Checks which rule ids are reported when a plain literal rule (`hello`)
/// coexists with two `&`-combined rules that share the same first segment.
///
/// Each scenario compares the full, sorted set of reported ids so that a
/// wrong or duplicated id cannot hide behind a matching result count.
// NOTE(review): the test name refers to internal "direct" vs "indirect"
// dispatch paths; which path each rule takes is not visible from this file.
#[test]
fn test_direct_vs_indirect_dispatch() {
    let matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello")
        .add_word(ProcessType::None, 2, "hello&world")
        .add_word(ProcessType::None, 3, "hello&earth")
        .build()
        .unwrap();

    // Only the bare literal is present, so only rule 1 may fire.
    let r1 = matcher.process("hello");
    assert_eq!(r1.len(), 1);
    assert_eq!(r1[0].word_id, 1);

    // `world` completes rule 2 but rule 3 is still missing `earth`.
    let r2 = matcher.process("hello world");
    let mut ids: Vec<u32> = r2.iter().map(|r| r.word_id).collect();
    ids.sort_unstable();
    assert_eq!(ids, vec![1, 2]);

    // All three rules are satisfied; verify the ids, not just the count.
    let r3 = matcher.process("hello world earth");
    let mut ids: Vec<u32> = r3.iter().map(|r| r.word_id).collect();
    ids.sort_unstable();
    assert_eq!(ids, vec![1, 2, 3]);
}
/// Exercises `&`/`~` rules with 63, 64 and 65 segments.
///
/// For 64 and 65 AND-segments the rule must match when every segment is
/// present and must not match when the last one is missing. For 63
/// AND-segments combined with one or two NOT-segments, the rule must match
/// without the vetoing word and must not match once a vetoing word appears.
// NOTE(review): the sizes presumably straddle the width of an internal
// 64-bit mask; that implementation detail is not visible here.
#[test]
fn test_bitmask_to_matrix_threshold() {
    // Produces the segments "w0", "w1", ..., "w{count-1}".
    let numbered =
        |count: usize| -> Vec<String> { (0..count).map(|i| format!("w{i}")).collect() };

    for count in [64usize, 65] {
        let segments = numbered(count);
        let matcher = SimpleMatcherBuilder::new()
            .add_word(ProcessType::None, 1, segments.join("&"))
            .build()
            .unwrap();

        let complete = segments.join(" ");
        assert!(matcher.is_match(&complete));

        // Dropping the final segment must break the AND rule.
        let one_short = segments[..count - 1].join(" ");
        assert!(!matcher.is_match(&one_short));
    }

    let required = numbered(63);
    let required_and = required.join("&");
    let required_text = required.join(" ");

    // 63 AND-segments plus a single NOT-segment.
    let single_veto = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, format!("{required_and}~veto"))
        .build()
        .unwrap();
    assert!(single_veto.is_match(&required_text));
    assert!(!single_veto.is_match(&format!("{required_text} veto")));

    // 63 AND-segments plus two NOT-segments.
    let double_veto = SimpleMatcherBuilder::new()
        .add_word(
            ProcessType::None,
            1,
            format!("{required_and}~notX~notY"),
        )
        .build()
        .unwrap();
    assert!(double_veto.is_match(&required_text));
    assert!(!double_veto.is_match(&format!("{required_text} notX")));
}
/// Exercises rules in which the same segment is repeated inside one
/// `&` (AND) or `~` (NOT) expression.
///
/// The assertions treat repetition as an occurrence count: `a&a&b&b&b` is
/// expected to need two `a` and three `b`, and `a~b~b` is expected to be
/// vetoed only once `b` occurs at least twice.
#[test]
fn test_matrix_repeated_segments() {
    // Repeated AND segments.
    {
        let builder = SimpleMatcherBuilder::new().add_word(ProcessType::None, 1, "a&a&b&b&b");
        let repeated_and = builder.build().unwrap();

        assert!(repeated_and.is_match("a a b b b"));
        assert!(!repeated_and.is_match("a b b b"), "only 1a");
        assert!(!repeated_and.is_match("a a b b"), "only 2b");
    }

    // Repeated NOT segments.
    {
        let builder = SimpleMatcherBuilder::new().add_word(ProcessType::None, 1, "a~b~b");
        let repeated_not = builder.build().unwrap();

        assert!(repeated_not.is_match("a"));
        assert!(repeated_not.is_match("a b"), "1b below threshold");
        assert!(!repeated_not.is_match("a b b"), "2b triggers veto");
        assert!(!repeated_not.is_match("a b b b"), "3b triggers veto");
    }
}
/// Verifies matching under `ProcessType::Delete`, alone and combined with
/// `ProcessType::VariantNorm`.
///
/// The ASCII rule `ab` is expected to match with `*` or a space between the
/// letters, and not to match `ac`. The CJK rule is expected to match the
/// simplified and traditional spellings, each with and without a `!` in
/// the middle.
#[test]
fn test_delete_adjusted_pattern_indexing() {
    let ascii_rule = SimpleMatcherBuilder::new()
        .add_word(ProcessType::Delete, 1, "ab")
        .build()
        .unwrap();
    for text in ["ab", "a*b", "a b"] {
        assert!(ascii_rule.is_match(text));
    }
    assert!(!ascii_rule.is_match("ac"));

    let pipeline = ProcessType::VariantNorm | ProcessType::Delete;
    let cjk_rule = SimpleMatcherBuilder::new()
        .add_word(pipeline, 1, "测试")
        .build()
        .unwrap();
    // (input text, assertion message)
    let expectations = [
        ("测试", "simplified direct"),
        ("測試", "traditional → VariantNorm"),
        ("测!试", "simplified + noise → Delete"),
        ("測!試", "traditional + noise → both"),
    ];
    for (text, why) in expectations {
        assert!(cjk_rule.is_match(text), "{}", why);
    }
}
/// Checks matching on texts with different proportions of ASCII and CJK
/// characters: mostly-ASCII text containing one CJK rule word, mostly-CJK
/// text containing one ASCII rule word, and an ASCII-only rule set run
/// against mixed and pure-CJK text.
// NOTE(review): "density dispatch" presumably refers to the matcher picking
// an engine from the text's character mix; that choice is not observable
// from this file.
#[test]
fn test_density_dispatch() {
    let mixed_rules = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "needle")
        .add_word(ProcessType::None, 2, "针")
        .build()
        .unwrap();
    // Collects the ids reported for `text`, in the order the matcher returns them.
    let reported_ids = |text: &str| -> Vec<u32> {
        mixed_rules
            .process(text)
            .iter()
            .map(|hit| hit.word_id)
            .collect()
    };

    // Mostly ASCII, with a single CJK rule word.
    let ascii_padding = "a".repeat(50);
    let low = format!("needle 针 {ascii_padding}");
    let ids_low = reported_ids(&low);
    assert!(ids_low.contains(&1), "low density: needle found");
    assert!(ids_low.contains(&2), "low density: 针 found");

    // Mostly CJK, with the ASCII rule word embedded in the middle.
    let cjk_run = "你好世界测试国语中文";
    let high = format!("{cjk_run}needle{cjk_run}");
    assert!(mixed_rules.is_match(&high));
    let ids_high = reported_ids(&high);
    assert!(ids_high.contains(&1), "high density: needle found");

    // Rule set made of ASCII words only.
    let ascii_only = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello")
        .add_word(ProcessType::None, 2, "world")
        .build()
        .unwrap();
    assert!(ascii_only.is_match("hello 世界"));
    assert!(
        !ascii_only.is_match("你好世界"),
        "no ASCII pattern in CJK text"
    );

    let hits = ascii_only.process("hello 世界 world");
    let mut ids: Vec<u32> = hits.into_iter().map(|hit| hit.word_id).collect();
    ids.sort();
    assert_eq!(ids, vec![1, 2]);
}
/// Verifies that rules whose literals overlap in the input are all
/// reported: `hello`, `world`, and the longer `hello world` that spans both.
#[test]
fn test_overlapping_patterns() {
    let builder = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "hello")
        .add_word(ProcessType::None, 2, "hello world")
        .add_word(ProcessType::None, 3, "world");
    let matcher = builder.build().unwrap();

    let hits = matcher.process("hello world");
    let mut ids: Vec<u32> = hits.iter().map(|hit| hit.word_id).collect();
    // Result order is not asserted, so normalise before comparing.
    ids.sort_unstable();
    assert_eq!(ids, vec![1, 2, 3]);
}
/// Builds a matcher from 300 rules (150 ASCII words `ascii000..ascii149`
/// with ids `0..150`, and 150 CJK-prefixed words `测试000..测试149` with ids
/// `1000..1150`) and checks that lookups still behave.
///
/// Ids come from `u32` ranges zipped with the word lists, so no lossy
/// `usize as u32` cast is needed.
#[test]
fn test_large_pattern_set_compilation() {
    let ascii: Vec<String> = (0..150u32).map(|i| format!("ascii{i:03}")).collect();
    let cjk: Vec<String> = (0..150u32).map(|i| format!("测试{i:03}")).collect();

    let mut builder = SimpleMatcherBuilder::new();
    for (id, w) in (0u32..).zip(&ascii) {
        builder = builder.add_word(ProcessType::None, id, w);
    }
    // CJK ids start at 1000 so they cannot collide with the ASCII ids.
    for (id, w) in (1000u32..).zip(&cjk) {
        builder = builder.add_word(ProcessType::None, id, w);
    }
    let matcher = builder.build().unwrap();

    assert!(matcher.is_match("ascii042"));
    assert!(matcher.is_match("测试099"));
    assert!(!matcher.is_match("missing"));

    let results = matcher.process("ascii000 测试000 some text");
    // Only membership is checked below, so the ids are left unsorted.
    let ids: Vec<u32> = results.iter().map(|r| r.word_id).collect();
    assert!(ids.contains(&0));
    assert!(ids.contains(&1000));
}
/// Checks two matcher configurations: 100 ASCII rules registered under
/// `ProcessType::VariantNorm`, and two CJK rules registered under
/// `VariantNorm | Delete` that are probed with traditional characters
/// separated by `!`.
// NOTE(review): the name suggests these configurations select different
// internal scan engines (DFA vs. char-wise); not verifiable from this file.
#[test]
fn test_dfa_and_charwise_streaming() {
    let words: Vec<String> = (0..100u32).map(|i| format!("word{i:03}")).collect();
    // Rule ids follow the word's position in `words`: 0, 1, 2, ...
    let ascii_matcher = (0u32..)
        .zip(&words)
        .fold(SimpleMatcherBuilder::new(), |builder, (id, word)| {
            builder.add_word(ProcessType::VariantNorm, id, word)
        })
        .build()
        .unwrap();
    assert!(ascii_matcher.is_match("word042"));
    let ascii_hits = ascii_matcher.process("word000 word099");
    assert_eq!(ascii_hits.len(), 2);

    let cjk_matcher = SimpleMatcherBuilder::new()
        .add_word(ProcessType::VariantNorm | ProcessType::Delete, 1, "测试")
        .add_word(ProcessType::VariantNorm | ProcessType::Delete, 2, "你好")
        .build()
        .unwrap();
    for text in ["測!試", "你!好"] {
        assert!(cjk_matcher.is_match(text));
    }
    let cjk_hits = cjk_matcher.process("測!試 你!好");
    assert_eq!(cjk_hits.len(), 2);
}
/// Builds two independent matchers with disjoint rule sets and queries
/// them alternately, checking that each one only reports its own rules
/// regardless of which matcher was used just before.
#[test]
fn test_sequential_matcher_reuse() {
    let first = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "alpha")
        .add_word(ProcessType::None, 2, "beta")
        .build()
        .unwrap();
    let second = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 10, "gamma")
        .add_word(ProcessType::None, 20, "delta")
        .build()
        .unwrap();

    // Use the first matcher.
    assert!(first.is_match("alpha beta"));
    let first_hits = first.process("alpha beta");
    assert_eq!(first_hits.len(), 2);

    // Switch to the second matcher: it must not see the first one's rules.
    assert!(!second.is_match("alpha"));
    assert!(second.is_match("gamma delta"));
    let second_hits = second.process("gamma delta");
    assert_eq!(second_hits.len(), 2);

    // Back to the first matcher: its behaviour must be unchanged.
    assert!(first.is_match("alpha"));
    assert!(!first.is_match("gamma"));
}
/// Verifies that one matcher holding an ASCII rule and a CJK rule reports
/// both when the input contains both words.
#[test]
fn test_mixed_ascii_cjk_rules() {
    let builder = SimpleMatcherBuilder::new()
        .add_word(ProcessType::None, 1, "abc")
        .add_word(ProcessType::None, 2, "你好");
    let matcher = builder.build().unwrap();

    let hits = matcher.process("你好 abc");
    let mut ids: Vec<u32> = hits.iter().map(|hit| hit.word_id).collect();
    // Result order is not asserted, so normalise before comparing.
    ids.sort_unstable();
    assert_eq!(ids, vec![1, 2]);
}