use simd_normalizer::CaseFoldMode;
use simd_normalizer::matching::{
MatchingOptions, matches_normalized, normalize_for_matching, normalize_for_matching_utf16,
};
fn main() {
println!("=== simd-normalizer: Text Matching Examples ===\n");
section_fused_comparison();
section_confusable_detection();
section_compatibility_matching();
section_normalize_for_indexing();
section_turkish_mode();
section_utf16_interop();
section_search_scenario();
section_username_comparison();
println!("=== All text matching examples completed. ===");
}
/// Section 1: calls `matches_normalized()` with default options on a few
/// ASCII pairs, prints each outcome, and asserts the expected result
/// (three case-only variants must match; "hello"/"world" must not).
fn section_fused_comparison() {
    println!("--- 1. Fused Comparison with matches_normalized() ---\n");
    println!(" matches_normalized() combines NFKC + case folding + confusable");
    println!(" skeleton mapping in a single comparison step.\n");

    let opts = MatchingOptions::default();
    // Every comparison in this section shares the same default options.
    let fused_eq = |lhs: &str, rhs: &str| matches_normalized(lhs, rhs, &opts);

    // Leading capital vs. all lowercase.
    let capitalized = fused_eq("File", "file");
    println!(" matches_normalized(\"File\", \"file\") = {}", capitalized);
    assert!(capitalized);

    // All uppercase vs. all lowercase.
    let shouting = fused_eq("HELLO", "hello");
    println!(" matches_normalized(\"HELLO\", \"hello\") = {}", shouting);
    assert!(shouting);

    // Mixed case vs. all lowercase.
    let mixed_case = fused_eq("CaFe", "cafe");
    println!(" matches_normalized(\"CaFe\", \"cafe\") = {}", mixed_case);
    assert!(mixed_case);

    // Negative control: different words are expected NOT to match.
    let unrelated = fused_eq("hello", "world");
    println!(" matches_normalized(\"hello\", \"world\") = {}", unrelated);
    assert!(!unrelated);

    println!();
}
/// Section 2: compares Latin text against Cyrillic look-alikes (and a
/// trivial identical pair) with default options, printing and asserting
/// that every pair is reported as a match.
fn section_confusable_detection() {
    println!("--- 2. Confusable Character Detection ---\n");
    println!(" The matching pipeline detects visually confusable characters");
    println!(" (UTS #39 skeleton) so that spoofing attempts are caught.\n");

    let opts = MatchingOptions::default();

    // Single character: Latin small a vs. Cyrillic small a (U+0430).
    let single_char_match = matches_normalized("a", "\u{0430}", &opts);
    println!(
        " Latin 'a' (U+0061) vs Cyrillic '\u{0430}' (U+0430): match = {}",
        single_char_match
    );
    assert!(single_char_match);

    // Whole word: U+0430, U+0440, U+0440 and U+0435 are Cyrillic letters that
    // stand in for 'a', 'p', 'p' and 'e'; only the 'l' is a Latin letter.
    let (genuine, lookalike) = ("apple", "\u{0430}\u{0440}\u{0440}l\u{0435}");
    let word_match = matches_normalized(genuine, lookalike, &opts);
    println!(
        " Latin \"apple\" vs Cyrillic-mixed \"{}\": match = {}",
        lookalike, word_match
    );
    assert!(word_match);

    // Sanity check: a string compared with itself.
    let identity_match = matches_normalized("test", "test", &opts);
    println!(
        " Identical strings \"test\" vs \"test\": match = {}",
        identity_match
    );
    assert!(identity_match);

    println!();
}
/// Section 3: compares compatibility characters (fullwidth letters and a
/// superscript digit) against their plain ASCII counterparts with default
/// options, printing and asserting that each pair matches.
fn section_compatibility_matching() {
    println!("--- 3. NFKC Compatibility Matching ---\n");
    println!(" NFKC normalization unifies compatibility equivalents such as");
    println!(" fullwidth characters and superscript digits.\n");

    let opts = MatchingOptions::default();

    // Fullwidth capital A (U+FF21) vs. ASCII lowercase 'a'.
    let wide_letter_match = matches_normalized("\u{FF21}", "a", &opts);
    println!(" Fullwidth 'A' (U+FF21) vs 'a': match = {}", wide_letter_match);
    assert!(wide_letter_match);

    // "Hello" spelled entirely with fullwidth code points.
    let fullwidth = "\u{FF28}\u{FF45}\u{FF4C}\u{FF4C}\u{FF4F}";
    let wide_word_match = matches_normalized(fullwidth, "hello", &opts);
    println!(
        " Fullwidth \"{}\" vs \"hello\": match = {}",
        fullwidth, wide_word_match
    );
    assert!(wide_word_match);

    // Superscript two (U+00B2) vs. the ASCII digit.
    let superscript_match = matches_normalized("\u{00B2}", "2", &opts);
    println!(" Superscript '2' (U+00B2) vs '2': match = {}", superscript_match);
    assert!(superscript_match);

    println!();
}
/// Section 4: prints the `normalize_for_matching()` output for several
/// spellings of "cafe", then normalizes "File" twice and asserts that the
/// second pass leaves the first result unchanged.
fn section_normalize_for_indexing() {
    println!("--- 4. Pre-Processing with normalize_for_matching() ---\n");
    println!(" normalize_for_matching() returns a canonical matching form suitable");
    println!(" for storage in search indexes or database columns.\n");

    let opts = MatchingOptions::default();

    let inputs = [
        "Cafe",
        "cafe",
        "CAFE",
        // Precomposed e-acute (U+00E9).
        "caf\u{00E9}",
        // Plain 'e' followed by a combining acute accent (U+0301).
        "cafe\u{0301}",
    ];
    println!(" Case variants normalize to the same form (accents are preserved):");
    for raw in inputs.iter() {
        println!(" {:?} -> {:?}", raw, normalize_for_matching(raw, &opts));
    }
    println!();

    // Feed the first result back in: the output should be a fixed point.
    let first_pass = normalize_for_matching("File", &opts);
    let second_pass = normalize_for_matching(&first_pass, &opts);
    println!(" Idempotence check:");
    println!(" normalize_for_matching(\"File\") = {:?}", first_pass);
    println!(" normalize_for_matching(result) = {:?}", second_pass);
    println!(" Idempotent: {}", first_pass == second_pass);
    assert_eq!(first_pass, second_pass);

    println!();
}
/// Section 5: contrasts default options with `CaseFoldMode::Turkish` by
/// printing the normalized form of "Istanbul" under both, then asserts two
/// matches in Turkish mode involving dotted capital I (U+0130) and dotless
/// small i (U+0131).
fn section_turkish_mode() {
    println!("--- 5. Turkish Mode (MatchingOptions) ---\n");
    println!(" Turkish/Azerbaijani locales have special case folding rules:");
    println!(" Standard: I -> i Turkish: I -> \\u{{0131}} (dotless i)");
    println!(" Standard: i -> i Turkish: \\u{{0130}} (dotted I) -> i\n");

    let default_opts = MatchingOptions::default();
    let turkish_opts = MatchingOptions {
        case_fold: CaseFoldMode::Turkish,
    };

    // Same input under the default folding rules...
    let default_form = normalize_for_matching("Istanbul", &default_opts);
    println!(" Standard mode:");
    println!(" normalize_for_matching(\"Istanbul\") = {:?}", default_form);

    // ...and under the Turkish folding rules.
    let turkish_form = normalize_for_matching("Istanbul", &turkish_opts);
    println!(" Turkish mode:");
    println!(" normalize_for_matching(\"Istanbul\") = {:?}", turkish_form);

    // Spelling that starts with capital I with dot above (U+0130).
    let dotted_i = "\u{0130}stanbul";
    let turkish_dotted_form = normalize_for_matching(dotted_i, &turkish_opts);
    println!(
        " normalize_for_matching(\"\\u{{0130}}stanbul\") = {:?}",
        turkish_dotted_form
    );

    // Dotted capital I vs. ordinary lowercase 'i', Turkish mode.
    let dotted_matches = matches_normalized(dotted_i, "istanbul", &turkish_opts);
    println!(
        " matches_normalized(\"\\u{{0130}}stanbul\", \"istanbul\") = {}",
        dotted_matches
    );
    assert!(dotted_matches);

    // ASCII capital I vs. dotless small i (U+0131), Turkish mode.
    let dotless_matches = matches_normalized("Istanbul", "\u{0131}stanbul", &turkish_opts);
    println!(
        " matches_normalized(\"Istanbul\", \"\\u{{0131}}stanbul\") = {} (Turkish)",
        dotless_matches
    );
    assert!(dotless_matches);

    println!();
}
/// Section 6: prints the UTF-16 output of `normalize_for_matching_utf16()`
/// next to the UTF-8 output of `normalize_for_matching()` for three inputs,
/// and asserts that decoding the UTF-16 form of "Hello" reproduces the
/// UTF-8 form.
fn section_utf16_interop() {
    println!("--- 6. UTF-16 Output for Interop ---\n");
    println!(" normalize_for_matching_utf16() produces UTF-16 code units,");
    println!(" useful for interop with Windows APIs, Java, .NET, or databases");
    println!(" that store text as UTF-16.\n");

    let opts = MatchingOptions::default();

    // Plain ASCII: both encodings of the matching form must agree.
    let hello_units = normalize_for_matching_utf16("Hello", &opts);
    let hello_text = normalize_for_matching("Hello", &opts);
    println!(" Input: \"Hello\"");
    println!(" UTF-8 matching form: {:?}", hello_text);
    println!(" UTF-16 code units: {:?}", hello_units);
    let round_tripped = String::from_utf16(&hello_units).expect("valid UTF-16");
    println!(" Round-trip to String: {:?}", round_tripped);
    assert_eq!(round_tripped, hello_text);
    println!();

    // U+1F600 lies outside the Basic Multilingual Plane.
    let emoji = "\u{1F600}";
    let emoji_units = normalize_for_matching_utf16(emoji, &opts);
    println!(" Input: \"{}\" (U+1F600, grinning face)", emoji);
    println!(" UTF-16 code units: {:?}", emoji_units);
    println!(
        " Code unit count: {} (surrogate pair for supplementary char)",
        emoji_units.len()
    );
    println!();

    // Accented input ending in precomposed e-acute (U+00E9).
    let accented = "Caf\u{00E9}";
    let accented_units = normalize_for_matching_utf16(accented, &opts);
    let accented_text = normalize_for_matching(accented, &opts);
    println!(" Input: {:?}", accented);
    println!(" UTF-8 matching form: {:?}", accented_text);
    println!(" UTF-16 code units: {:?}", accented_units);
    println!();
}
/// Section 7: builds a small in-memory index keyed by the
/// `normalize_for_matching()` form of each document title, normalizes a
/// query the same way, and prints the titles whose key equals the query key.
fn section_search_scenario() {
    println!("--- 7. Practical Scenario: Search Indexing ---\n");
    println!(" Normalize documents at index time, then normalize queries at search");
    println!(" time. Matching happens via simple string equality on the index.\n");

    let opts = MatchingOptions::default();

    let documents = [
        "Resume",
        // Lowercase with e-acute (U+00E9) in both positions.
        "r\u{00E9}sum\u{00E9}",
        "RESUME",
        // Uppercase with E-acute (U+00C9) in both positions.
        "R\u{00C9}SUM\u{00C9}",
    ];

    println!(" Indexing documents:");
    // Each entry pairs the normalized key with the original title.
    let mut index: Vec<(String, &str)> = Vec::with_capacity(documents.len());
    for doc in documents.iter() {
        index.push((normalize_for_matching(doc, &opts), *doc));
    }
    for (key, title) in &index {
        println!(" {:?} -> index key {:?}", title, key);
    }

    let query = "resume";
    let query_key = normalize_for_matching(query, &opts);
    println!("\n Search query: {:?} -> key {:?}", query, query_key);

    // Lookup is plain equality between stored keys and the query key.
    let mut hits: Vec<&&str> = Vec::new();
    for (key, title) in &index {
        if *key == query_key {
            hits.push(title);
        }
    }
    println!(" Matching documents: {:?}", hits);

    println!();
}
/// Section 8: compares the `normalize_for_matching()` key of an existing
/// username with the keys of several proposed usernames and prints, for
/// each proposal, whether the keys are equal ("BLOCKED") or not ("allowed").
fn section_username_comparison() {
    println!("--- 8. Practical Scenario: Username Anti-Spoofing ---\n");
    println!(" Detect attempts to register confusable usernames by comparing");
    println!(" the matching-normalized forms of existing and proposed names.\n");

    let opts = MatchingOptions::default();

    let existing_user = "admin";
    let existing_key = normalize_for_matching(existing_user, &opts);

    let attempts = [
        // Case variants of the existing name.
        "Admin",
        "ADMIN",
        // Leading Cyrillic small a (U+0430).
        "\u{0430}dmin",
        // Leading fullwidth capital A (U+FF21).
        "\u{FF21}dmin",
        // Dotless small i (U+0131) in place of 'i'.
        "adm\u{0131}n",
        // A longer, different name.
        "administrator",
    ];

    println!(
        " Existing user: {:?} (key: {:?})\n",
        existing_user, existing_key
    );

    for candidate in attempts.iter() {
        let candidate_key = normalize_for_matching(candidate, &opts);
        // A proposal is rejected exactly when its key equals the existing key.
        let verdict = if candidate_key == existing_key {
            "BLOCKED (confusable)"
        } else {
            "allowed"
        };
        println!(
            " Attempt {:?} -> key {:?} => {}",
            candidate, candidate_key, verdict
        );
    }

    println!();
}