use simd_normalizer::CaseFoldMode;
use simd_normalizer::matching::{
MatchingOptions, matches_normalized, normalize_for_matching, normalize_for_matching_utf16,
};
fn default_opts() -> MatchingOptions {
MatchingOptions::default()
}
/// Options with case folding switched to `CaseFoldMode::Turkish`.
fn turkish_opts() -> MatchingOptions {
    let case_fold = CaseFoldMode::Turkish;
    MatchingOptions { case_fold }
}
/// Runs `matches_normalized` in both argument orders.
///
/// Panics if the two orders disagree, or if the `(a, b)` result differs
/// from `expected`.
fn assert_symmetric(a: &str, b: &str, opts: &MatchingOptions, expected: bool) {
    let forward = matches_normalized(a, b, opts);
    let backward = matches_normalized(b, a, opts);

    // Symmetry is checked before the expected value.
    assert_eq!(
        forward, backward,
        "matches_normalized is not symmetric for ({:?}, {:?}): (a,b)={}, (b,a)={}",
        a, b, forward, backward,
    );
    assert_eq!(
        forward, expected,
        "matches_normalized({:?}, {:?}) should be {}, got {}",
        a, b, expected, forward,
    );
}
/// Latin letters and their Cyrillic look-alikes match in both directions.
#[test]
fn symmetry_confusable_pairs() {
    // (Latin letter, Cyrillic look-alike)
    let pairs = [
        ("a", "\u{0430}"),
        ("o", "\u{043E}"),
        ("e", "\u{0435}"),
        ("p", "\u{0440}"),
    ];
    let opts = default_opts();
    for &(latin, lookalike) in pairs.iter() {
        assert_symmetric(latin, lookalike, &opts, true);
    }
}
/// Strings differing only in letter case match in both directions.
#[test]
fn symmetry_case_pairs() {
    let cases = [("Hello", "hello"), ("WORLD", "world"), ("FiLe", "file")];
    let opts = default_opts();
    for &(cased, lowered) in cases.iter() {
        assert_symmetric(cased, lowered, &opts, true);
    }
}
/// Case difference and confusable substitution combined in one word.
#[test]
fn symmetry_mixed_case_and_confusable() {
    // The second argument spells "Apple" with non-Latin code points around a Latin 'l'.
    assert_symmetric(
        "Apple",
        "\u{0410}\u{0440}\u{0440}l\u{0435}",
        &default_opts(),
        true,
    );
}
/// Unrelated words do not match, in either argument order.
#[test]
fn symmetry_non_matching_pairs() {
    let unrelated = [("hello", "world"), ("cat", "dog"), ("abc", "xyz")];
    let opts = default_opts();
    for &(left, right) in unrelated.iter() {
        assert_symmetric(left, right, &opts, false);
    }
}
/// Empty matches empty; empty does not match a non-empty string.
#[test]
fn symmetry_empty_strings() {
    let cases = [("", "", true), ("", "a", false)];
    let opts = default_opts();
    for &(left, right, expected) in cases.iter() {
        assert_symmetric(left, right, &opts, expected);
    }
}
/// A string matches itself, for both ASCII and a supplementary-plane code point.
#[test]
fn symmetry_identical_strings() {
    let opts = default_opts();
    for &same in ["test", "\u{1F600}"].iter() {
        assert_symmetric(same, same, &opts, true);
    }
}
/// U+2160 normalizes to the same matching form as both 'I' and 'i'.
#[test]
fn nfkc_roman_numeral_one() {
    let opts = default_opts();
    let roman_one = normalize_for_matching("\u{2160}", &opts);
    let upper_i = normalize_for_matching("I", &opts);
    let lower_i = normalize_for_matching("i", &opts);

    assert_eq!(
        roman_one, upper_i,
        "Roman numeral Ⅰ should match 'I' after matching pipeline"
    );
    assert_eq!(
        roman_one, lower_i,
        "Roman numeral Ⅰ should match 'i' after matching pipeline"
    );
}
/// Multi-letter Roman numerals normalize like their ASCII spellings.
#[test]
fn nfkc_roman_numeral_range() {
    // (numeral code point, ASCII spelling, failure message)
    let cases = [
        ("\u{2161}", "II", "Ⅱ should match 'II'"),
        ("\u{2162}", "III", "Ⅲ should match 'III'"),
    ];
    let opts = default_opts();
    for &(numeral, spelled, msg) in cases.iter() {
        assert_eq!(
            normalize_for_matching(numeral, &opts),
            normalize_for_matching(spelled, &opts),
            "{}",
            msg
        );
    }
}
/// U+24D0 normalizes to the same matching form as plain 'a'.
#[test]
fn nfkc_circled_latin_small_a() {
    let opts = default_opts();
    assert_eq!(
        normalize_for_matching("\u{24D0}", &opts),
        normalize_for_matching("a", &opts),
        "Circled latin small letter a should match 'a'"
    );
}
/// U+24B6 normalizes to the same matching form as lowercase 'a'.
#[test]
fn nfkc_circled_latin_capital_a() {
    let opts = default_opts();
    let circled_capital = normalize_for_matching("\u{24B6}", &opts);
    let plain_lower = normalize_for_matching("a", &opts);
    assert_eq!(
        circled_capital, plain_lower,
        "Circled latin capital letter A should match 'a'"
    );
}
/// U+2474 normalizes to the same matching form as the literal "(1)".
#[test]
fn nfkc_parenthesized_digit_one() {
    let opts = default_opts();
    let compat_form = normalize_for_matching("\u{2474}", &opts);
    let spelled_out = normalize_for_matching("(1)", &opts);
    assert_eq!(
        compat_form, spelled_out,
        "Parenthesized digit one should match '(1)'"
    );
}
/// U+00BD normalizes like "1", U+2044, "2".
#[test]
fn nfkc_fraction_one_half() {
    let opts = default_opts();
    assert_eq!(
        normalize_for_matching("\u{00BD}", &opts),
        normalize_for_matching("1\u{2044}2", &opts),
        "½ should match '1⁄2' (with fraction slash)"
    );
}
/// U+00BC normalizes like "1", U+2044, "4".
#[test]
fn nfkc_fraction_one_quarter() {
    let opts = default_opts();
    let precomposed = normalize_for_matching("\u{00BC}", &opts);
    let spelled_out = normalize_for_matching("1\u{2044}4", &opts);
    assert_eq!(
        precomposed, spelled_out,
        "¼ should match '1⁄4' (with fraction slash)"
    );
}
/// Fullwidth digits normalize to the same matching form as ASCII digits.
#[test]
fn nfkc_fullwidth_digits() {
    // (fullwidth digit, ASCII digit, failure message)
    let cases = [
        ("\u{FF11}", "1", "Fullwidth digit 1 should match '1'"),
        ("\u{FF10}", "0", "Fullwidth digit 0 should match '0'"),
    ];
    let opts = default_opts();
    for &(fullwidth, ascii, msg) in cases.iter() {
        assert_eq!(
            normalize_for_matching(fullwidth, &opts),
            normalize_for_matching(ascii, &opts),
            "{}",
            msg
        );
    }
}
/// Subscript digits normalize to the same matching form as ASCII digits.
#[test]
fn nfkc_subscript_digits() {
    let opts = default_opts();

    let subscript_zero = normalize_for_matching("\u{2080}", &opts);
    let ascii_zero = normalize_for_matching("0", &opts);
    assert_eq!(subscript_zero, ascii_zero, "Subscript 0 should match '0'");

    let subscript_two = normalize_for_matching("\u{2082}", &opts);
    let ascii_two = normalize_for_matching("2", &opts);
    assert_eq!(subscript_two, ascii_two, "Subscript 2 should match '2'");
}
/// U+FB01 matches the two-letter sequence "fi".
#[test]
fn ligature_fi_matches_fi() {
    let matched = matches_normalized("\u{FB01}", "fi", &default_opts());
    assert!(matched, "fi ligature should match 'fi'");
}
/// U+FB02 matches the two-letter sequence "fl".
#[test]
fn ligature_fl_matches_fl() {
    let matched = matches_normalized("\u{FB02}", "fl", &default_opts());
    assert!(matched, "fl ligature should match 'fl'");
}
/// U+FB03 matches the three-letter sequence "ffi".
#[test]
fn ligature_ffi_matches_ffi() {
    let matched = matches_normalized("\u{FB03}", "ffi", &default_opts());
    assert!(matched, "ffi ligature should match 'ffi'");
}
/// U+FB04 matches the three-letter sequence "ffl".
#[test]
fn ligature_ffl_matches_ffl() {
    let matched = matches_normalized("\u{FB04}", "ffl", &default_opts());
    assert!(matched, "ffl ligature should match 'ffl'");
}
/// A ligature embedded inside a word still matches the plain spelling.
#[test]
fn ligature_in_word_context() {
    let with_ligature = "o\u{FB03}ce";
    let matched = matches_normalized(with_ligature, "office", &default_opts());
    assert!(
        matched,
        "'office' (with ffi ligature) should match 'office'"
    );
}
/// U+FB06 matches the two-letter sequence "st".
#[test]
fn ligature_st() {
    let matched = matches_normalized("\u{FB06}", "st", &default_opts());
    assert!(matched, "st ligature should match 'st'");
}
/// Normalizing an already-normalized string must not change it.
#[test]
fn convergence_idempotent_after_normalization() {
    let opts = default_opts();
    let tricky_inputs = [
        "\u{2160}",
        "\u{FB01}",
        "\u{00BD}",
        "\u{FF21}",
        "\u{0430}",
        "\u{00B2}",
        "\u{24D0}",
        "F\u{0131}LE",
        "\u{0410}\u{0440}\u{0440}l\u{0435}",
    ];
    for input in tricky_inputs.iter() {
        let first_pass = normalize_for_matching(input, &opts);
        let second_pass = normalize_for_matching(&first_pass, &opts);
        assert_eq!(
            first_pass, second_pass,
            "Pipeline did not converge (not idempotent) for input {:?}: once={:?}, twice={:?}",
            input, first_pass, second_pass,
        );
    }
}
/// Two different starting code points end up with the same matching form.
#[test]
fn convergence_multi_step_chain() {
    let opts = default_opts();
    assert_eq!(
        normalize_for_matching("\u{FF21}", &opts),
        normalize_for_matching("\u{0410}", &opts),
        "Fullwidth A and Cyrillic A should converge to the same matching form"
    );
}
/// A string mixing several compatibility forms normalizes to a non-empty,
/// stable result.
#[test]
fn convergence_passes_are_bounded() {
    let opts = default_opts();
    let mixed = "\u{FF21}\u{FF22}\u{FF23}\u{2160}\u{2161}\u{2162}\u{00BD}\u{FB01}\u{FB02}";

    let settled = normalize_for_matching(mixed, &opts);
    assert!(
        !settled.is_empty(),
        "Convergence should produce non-empty output"
    );

    // A second pass over the result must be a no-op.
    let rerun = normalize_for_matching(&settled, &opts);
    assert_eq!(settled, rerun, "Result should be stable after convergence");
}
/// Checks the I/ı and İ/i pairings in Turkish mode and I/i in standard mode.
#[test]
fn turkish_i_distinctions() {
    let tr = turkish_opts();
    let std_mode = default_opts();

    let upper_vs_dotless = matches_normalized("I", "\u{0131}", &tr);
    assert!(upper_vs_dotless, "Turkish: 'I' should match 'ı'");

    let dotted_vs_lower = matches_normalized("\u{0130}", "i", &tr);
    assert!(dotted_vs_lower, "Turkish: 'İ' should match 'i'");

    let upper_vs_lower = matches_normalized("I", "i", &std_mode);
    assert!(upper_vs_lower, "Standard: 'I' should match 'i'");
}
/// Within each mode, 'I' normalizes like that mode's lowercase counterpart.
///
/// Note: despite the name, this test does not assert that the Turkish and
/// standard results for 'I' differ from each other.
#[test]
fn turkish_vs_standard_different_results_for_uppercase_i() {
    let standard = default_opts();
    let turkish = turkish_opts();

    let tr_upper = normalize_for_matching("I", &turkish);
    let tr_dotless = normalize_for_matching("\u{0131}", &turkish);
    let std_upper = normalize_for_matching("I", &standard);
    let std_lower = normalize_for_matching("i", &standard);

    assert_eq!(
        tr_upper, tr_dotless,
        "Turkish: 'I' should produce same result as 'ı'"
    );
    assert_eq!(
        std_upper, std_lower,
        "Standard: 'I' should produce same result as 'i'"
    );
}
/// Confusable matching still applies when Turkish case folding is selected.
#[test]
fn turkish_mode_with_confusable_cyrillic() {
    let matched = matches_normalized("a", "\u{0430}", &turkish_opts());
    assert!(
        matched,
        "Turkish mode: Latin 'a' and Cyrillic 'а' should still match"
    );
}
/// Whole-word matching under Turkish case folding.
///
/// Each assertion carries a message so a failure names the pair that broke,
/// in line with the other tests in this file.
#[test]
fn turkish_mode_full_word() {
    let opts = turkish_opts();
    assert!(
        matches_normalized("\u{0130}stanbul", "istanbul", &opts),
        "Turkish: 'İstanbul' should match 'istanbul'"
    );
    assert!(
        matches_normalized("ISTANBUL", "\u{0131}stanbul", &opts),
        "Turkish: 'ISTANBUL' should match 'ıstanbul'"
    );
}
/// A word containing U+0131 matches itself in Turkish mode.
#[test]
fn turkish_mode_dotless_i_in_word() {
    let word = "f\u{0131}le";
    let matched = matches_normalized(word, "f\u{0131}le", &turkish_opts());
    assert!(matched, "Turkish: identical fıle should match");
}
/// Empty input yields empty UTF-16 output.
#[test]
fn utf16_empty_string() {
    let opts = default_opts();
    let code_units = normalize_for_matching_utf16("", &opts);
    assert!(
        code_units.is_empty(),
        "Empty string should produce empty UTF-16 vec"
    );
}
/// ASCII input must not yield any surrogate code unit in the UTF-16 output.
#[test]
fn utf16_ascii_only_no_surrogates() {
    let opts = default_opts();
    let utf16 = normalize_for_matching_utf16("hello", &opts);

    // Without this guard the loop below would pass vacuously on empty output.
    assert!(
        !utf16.is_empty(),
        "ASCII-only input should produce non-empty UTF-16 output"
    );

    for &cu in &utf16 {
        // 0xD800..=0xDFFF is the UTF-16 surrogate range.
        assert!(
            !(0xD800..=0xDFFF).contains(&cu),
            "ASCII-only input should not produce surrogate pairs, found {:04X}",
            cu,
        );
    }
}
/// A supplementary-plane code point comes out as valid UTF-16 containing a
/// high surrogate.
#[test]
fn utf16_supplementary_chars_produce_surrogates() {
    let opts = default_opts();
    let code_units = normalize_for_matching_utf16("\u{1F600}", &opts);

    assert!(
        code_units.len() >= 2,
        "Supplementary character should produce at least 2 code units (surrogate pair)"
    );

    // 0xD800..=0xDBFF is the high-surrogate range.
    let high_surrogates = 0xD800..=0xDBFF;
    let found_high = code_units.iter().any(|cu| high_surrogates.contains(cu));
    assert!(
        found_high,
        "Supplementary character should have a high surrogate"
    );

    // Decoding must succeed; the decoded text itself is not inspected.
    String::from_utf16(&code_units).expect("Should be valid UTF-16 for supplementary char");
}
/// U+1D11E produces at least two code units that decode as valid UTF-16.
#[test]
fn utf16_musical_symbol_supplementary() {
    let opts = default_opts();
    let code_units = normalize_for_matching_utf16("\u{1D11E}", &opts);
    let unit_count = code_units.len();
    assert!(
        unit_count >= 2,
        "Musical symbol should produce surrogate pair"
    );
    String::from_utf16(&code_units).expect("Should be valid UTF-16 for musical symbol");
}
/// UTF-16 output for mixed BMP/supplementary input decodes to exactly what
/// `normalize_for_matching` returns for the same input.
#[test]
fn utf16_mixed_bmp_and_supplementary() {
    let opts = default_opts();
    let input = "Hello \u{1F600} World \u{1F4A9}";
    let utf16 = normalize_for_matching_utf16(input, &opts);
    assert!(
        !utf16.is_empty(),
        "Mixed BMP/supplementary input should produce non-empty UTF-16 output"
    );
    let decoded = String::from_utf16(&utf16).expect("Should be valid UTF-16");
    assert_eq!(
        decoded,
        normalize_for_matching(input, &opts),
        "UTF-16 round-trip should match normalize_for_matching"
    );
}
/// For a spread of inputs, decoding the UTF-16 output must reproduce the
/// output of `normalize_for_matching`.
#[test]
fn utf16_roundtrip_comprehensive() {
    let opts = default_opts();
    let inputs = [
        "",
        "a",
        "hello",
        "CAFÉ",
        "\u{1F600}",
        "\u{1D11E}",
        "abc\u{1F600}def\u{1F4A9}ghi",
        "\u{2160}\u{2161}",
        "\u{FB01}\u{FB02}",
        "\u{00BD}",
        "\u{0430}\u{0440}\u{0440}l\u{0435}",
        "\u{FF21}\u{FF22}\u{FF23}",
    ];
    for input in inputs.iter() {
        let code_units = normalize_for_matching_utf16(input, &opts);
        let expected = normalize_for_matching(input, &opts);

        // Empty expectation: only emptiness of the UTF-16 side is checked.
        if expected.is_empty() {
            assert!(
                code_units.is_empty(),
                "Empty matching result should give empty UTF-16"
            );
            continue;
        }

        let decoded = match String::from_utf16(&code_units) {
            Ok(text) => text,
            Err(_) => panic!("Invalid UTF-16 for input {:?}", input),
        };
        assert_eq!(
            decoded, expected,
            "UTF-16 round-trip mismatch for input {:?}",
            input,
        );
    }
}
/// `MatchingOptions::default()` selects `CaseFoldMode::Standard`.
#[test]
fn matching_options_default_is_standard() {
    let default_fold = MatchingOptions::default().case_fold;
    assert_eq!(
        default_fold,
        CaseFoldMode::Standard,
        "Default MatchingOptions should use Standard case folding"
    );
}
/// `MatchingOptions` can be both cloned and copied, and both results compare
/// equal to the original.
#[test]
// The explicit `clone()` on a `Copy` type is what this test exercises.
#[allow(clippy::clone_on_copy)]
fn matching_options_clone_and_copy() {
    let opts = MatchingOptions {
        case_fold: CaseFoldMode::Turkish,
    };

    let via_clone = opts.clone();
    // Plain assignment; `opts` is still used below, so this is a copy.
    let via_copy = opts;

    assert_eq!(opts, via_clone, "Clone should produce equal MatchingOptions");
    assert_eq!(opts, via_copy, "Copy should produce equal MatchingOptions");
}
/// The `Debug` rendering of default options names the type and the fold mode.
#[test]
fn matching_options_debug() {
    let rendered = format!("{:?}", MatchingOptions::default());

    let names_type = rendered.contains("MatchingOptions");
    assert!(
        names_type,
        "Debug output should contain 'MatchingOptions', got: {}",
        rendered,
    );

    let names_mode = rendered.contains("Standard");
    assert!(
        names_mode,
        "Debug output should contain 'Standard', got: {}",
        rendered,
    );
}
/// Equality: two defaults are equal; default and Turkish options are not.
#[test]
fn matching_options_eq() {
    let first_default = MatchingOptions::default();
    let second_default = MatchingOptions::default();
    let turkish = MatchingOptions {
        case_fold: CaseFoldMode::Turkish,
    };

    assert_eq!(
        first_default, second_default,
        "Two default MatchingOptions should be equal"
    );
    assert_ne!(
        first_default, turkish,
        "Standard and Turkish options should not be equal"
    );
}
/// Exercises `Clone`, `Copy`, `Debug` and equality on `CaseFoldMode`.
///
/// Every assertion carries a message so a failure identifies which trait
/// behaviour broke.
#[test]
// The explicit `clone()` on a `Copy` type is what this test exercises.
#[allow(clippy::clone_on_copy)]
fn casefold_mode_clone_copy_debug_eq() {
    let standard = CaseFoldMode::Standard;
    let turkish = CaseFoldMode::Turkish;

    let cloned = standard.clone();
    assert_eq!(
        standard, cloned,
        "Clone should produce an equal CaseFoldMode"
    );

    // `standard` is still used afterwards, so this assignment is a copy.
    let copied = standard;
    assert_eq!(standard, copied, "Copy should produce an equal CaseFoldMode");

    let debug_s = format!("{:?}", standard);
    assert!(
        debug_s.contains("Standard"),
        "Debug output should contain 'Standard', got: {}",
        debug_s,
    );
    let debug_t = format!("{:?}", turkish);
    assert!(
        debug_t.contains("Turkish"),
        "Debug output should contain 'Turkish', got: {}",
        debug_t,
    );

    assert_eq!(
        standard,
        CaseFoldMode::Standard,
        "Standard should equal CaseFoldMode::Standard"
    );
    assert_ne!(standard, turkish, "Standard and Turkish should not be equal");
}
/// Both normalization entry points handle an input larger than 10,000 bytes
/// without panicking and return non-empty output.
#[test]
fn very_long_string_no_panic() {
    let opts = default_opts();
    let chunk = "Hello World \u{0430}\u{0440}\u{0440}l\u{0435} \u{FF21}\u{00BD} ";
    let input: String = chunk.repeat(500);
    // Sanity-check the fixture itself before using it.
    assert!(input.len() > 10_000, "Input should be >10KB");

    let result = normalize_for_matching(&input, &opts);
    assert!(
        !result.is_empty(),
        "Long input should produce non-empty normalized output"
    );

    let utf16 = normalize_for_matching_utf16(&input, &opts);
    assert!(
        !utf16.is_empty(),
        "Long input should produce non-empty UTF-16 output"
    );
}
/// Input made only of U+0300..U+0308-range marks normalizes to a stable result.
#[test]
fn only_combining_marks() {
    let opts = default_opts();
    let first_pass = normalize_for_matching("\u{0300}\u{0301}\u{0302}\u{0303}\u{0308}", &opts);
    let second_pass = normalize_for_matching(&first_pass, &opts);
    assert_eq!(
        first_pass, second_pass,
        "Combining-marks-only result should be idempotent"
    );
}
/// A base letter followed by 100 copies of U+0300 normalizes without
/// panicking and yields non-empty output.
#[test]
fn many_combining_marks_stacked() {
    let opts = default_opts();
    // 'a' followed by U+0300 repeated 100 times.
    let input = format!("a{}", "\u{0300}".repeat(100));
    let result = normalize_for_matching(&input, &opts);
    assert!(
        !result.is_empty(),
        "Stacked combining marks should produce non-empty output"
    );
}
/// A string mixing several scripts normalizes to a non-empty, stable result.
#[test]
fn mixed_scripts_latin_cyrillic_cjk() {
    let opts = default_opts();
    let mixed = "Hello\u{041F}\u{0440}\u{0438}\u{0432}\u{0435}\u{0442}\u{4F60}\u{597D}";

    let first_pass = normalize_for_matching(mixed, &opts);
    assert!(
        !first_pass.is_empty(),
        "Mixed script input should produce output"
    );

    let second_pass = normalize_for_matching(&first_pass, &opts);
    assert_eq!(
        first_pass, second_pass,
        "Mixed script result should be idempotent"
    );
}
/// A lone U+0000 normalizes to a stable result.
#[test]
fn null_character() {
    let opts = default_opts();
    let first_pass = normalize_for_matching("\u{0000}", &opts);
    let second_pass = normalize_for_matching(&first_pass, &opts);
    assert_eq!(
        first_pass, second_pass,
        "Null character result should be idempotent"
    );
}
/// A string with an embedded U+0000 normalizes to a non-empty, stable result.
#[test]
fn null_character_in_middle() {
    let opts = default_opts();
    let input = "ab\u{0000}cd";
    let result = normalize_for_matching(input, &opts);
    assert!(
        !result.is_empty(),
        "Input with embedded null should produce non-empty output"
    );
    let again = normalize_for_matching(&result, &opts);
    assert_eq!(result, again, "Null-in-middle result should be idempotent");
}
/// Each single-code-point input normalizes to a non-empty, stable result.
#[test]
fn single_character_various_scripts() {
    let opts = default_opts();
    let samples = [
        "a",
        "A",
        "\u{0430}",
        "\u{0410}",
        "\u{03B1}",
        "\u{0391}",
        "\u{4E00}",
        "\u{0627}",
        "\u{05D0}",
        "\u{0E01}",
        "\u{3042}",
        "\u{30A2}",
        "\u{AC00}",
        "\u{1F600}",
    ];
    for &sample in samples.iter() {
        let first_pass = normalize_for_matching(sample, &opts);
        assert!(
            !first_pass.is_empty(),
            "Single character {:?} should produce non-empty result",
            sample,
        );

        let second_pass = normalize_for_matching(&first_pass, &opts);
        assert_eq!(
            first_pass, second_pass,
            "Single character {:?} should be idempotent after matching normalization",
            sample,
        );
    }
}
/// U+FFFD normalizes to a stable result.
#[test]
fn replacement_character() {
    let opts = default_opts();
    let first_pass = normalize_for_matching("\u{FFFD}", &opts);
    let second_pass = normalize_for_matching(&first_pass, &opts);
    assert_eq!(
        first_pass, second_pass,
        "Replacement character result should be idempotent"
    );
}
/// U+FEFF normalizes to a stable result.
#[test]
fn bom_character() {
    let opts = default_opts();
    let first_pass = normalize_for_matching("\u{FEFF}", &opts);
    let second_pass = normalize_for_matching(&first_pass, &opts);
    assert_eq!(first_pass, second_pass, "BOM result should be idempotent");
}
/// U+00AD normalizes to a stable result.
#[test]
fn soft_hyphen() {
    let opts = default_opts();
    let first_pass = normalize_for_matching("\u{00AD}", &opts);
    let second_pass = normalize_for_matching(&first_pass, &opts);
    assert_eq!(
        first_pass, second_pass,
        "Soft hyphen result should be idempotent"
    );
}
/// A run of U+200B, U+200C, U+200D and U+FEFF normalizes to a stable result.
#[test]
fn zero_width_chars() {
    let opts = default_opts();
    let first_pass = normalize_for_matching("\u{200B}\u{200C}\u{200D}\u{FEFF}", &opts);
    let second_pass = normalize_for_matching(&first_pass, &opts);
    assert_eq!(
        first_pass, second_pass,
        "Zero-width chars result should be idempotent"
    );
}
/// U+E000 and U+F8FF each normalize to non-empty output.
#[test]
fn private_use_area_characters() {
    let opts = default_opts();
    let result = normalize_for_matching("\u{E000}", &opts);
    assert!(
        !result.is_empty(),
        "Private use character U+E000 should produce non-empty output"
    );
    let result2 = normalize_for_matching("\u{F8FF}", &opts);
    assert!(
        !result2.is_empty(),
        "Private use character U+F8FF should produce non-empty output"
    );
}
/// U+10FFFF normalizes to a stable result.
#[test]
fn max_unicode_scalar() {
    let opts = default_opts();
    let first_pass = normalize_for_matching("\u{10FFFF}", &opts);
    let second_pass = normalize_for_matching(&first_pass, &opts);
    assert_eq!(
        first_pass, second_pass,
        "Max Unicode scalar result should be idempotent"
    );
}
/// U+FF21 and U+0430 match each other under default options.
#[test]
fn confusable_and_nfkc_combined() {
    let matched = matches_normalized("\u{FF21}", "\u{0430}", &default_opts());
    assert!(
        matched,
        "Fullwidth A should match Cyrillic а through NFKC + confusable pipeline"
    );
}
/// Re-normalizing the result nine more times never changes it.
#[test]
fn repeated_normalization_stability() {
    let opts = default_opts();
    let mut current = normalize_for_matching("\u{FF21}\u{0410}\u{FB01}\u{00BD}", &opts);
    for pass in 1..10 {
        let renormalized = normalize_for_matching(&current, &opts);
        assert_eq!(
            current, renormalized,
            "Normalization should be stable at iteration {}",
            pass
        );
        current = renormalized;
    }
}
/// Every sample string matches itself.
#[test]
fn matches_normalized_reflexive() {
    let opts = default_opts();
    let samples = [
        "",
        "a",
        "Hello",
        "\u{0430}",
        "\u{1F600}",
        "\u{FF21}",
        "\u{FB01}",
        "\u{0300}\u{0301}",
        "a\u{0300}b\u{0301}c",
    ];
    for &sample in samples.iter() {
        let matches_itself = matches_normalized(sample, sample, &opts);
        assert!(
            matches_itself,
            "matches_normalized should be reflexive for {:?}",
            sample,
        );
    }
}
/// If A matches B and B matches C, then A matches C, for one concrete triple.
#[test]
fn matches_normalized_transitive() {
    let opts = default_opts();
    let fullwidth = "\u{FF21}";
    let latin = "A";
    let cyrillic = "\u{0410}";

    // All three comparisons run before any assertion.
    let first_link = matches_normalized(fullwidth, latin, &opts);
    let second_link = matches_normalized(latin, cyrillic, &opts);
    let end_to_end = matches_normalized(fullwidth, cyrillic, &opts);

    assert!(first_link, "Fullwidth A should match Latin A");
    assert!(second_link, "Latin A should match Cyrillic A");
    assert!(
        end_to_end,
        "Fullwidth A should match Cyrillic A (transitivity)"
    );
}
/// U+AC00 matches the two-code-point sequence U+1100 U+1161.
#[test]
fn hangul_syllable_composition() {
    let precomposed = "\u{AC00}";
    let jamo_sequence = "\u{1100}\u{1161}";
    let matched = matches_normalized(precomposed, jamo_sequence, &default_opts());
    assert!(
        matched,
        "Hangul syllable should match its Jamo decomposition through matching"
    );
}
/// U+212B matches both U+00C5 and U+00E5.
#[test]
fn nfkc_angstrom_sign() {
    // (counterpart, failure message)
    let cases = [
        (
            "\u{00C5}",
            "Angstrom sign should match Latin A with ring above",
        ),
        (
            "\u{00E5}",
            "Angstrom sign should match lowercase a with ring above (after casefold)",
        ),
    ];
    let opts = default_opts();
    for &(counterpart, msg) in cases.iter() {
        assert!(matches_normalized("\u{212B}", counterpart, &opts), "{}", msg);
    }
}
/// U+2126 matches both U+03A9 and U+03C9.
#[test]
fn nfkc_ohm_sign() {
    // (counterpart, failure message)
    let cases = [
        ("\u{03A9}", "Ohm sign should match Greek capital omega"),
        (
            "\u{03C9}",
            "Ohm sign should match Greek small omega (after casefold)",
        ),
    ];
    let opts = default_opts();
    for &(counterpart, msg) in cases.iter() {
        assert!(matches_normalized("\u{2126}", counterpart, &opts), "{}", msg);
    }
}
/// U+212A matches both 'K' and 'k'.
#[test]
fn nfkc_kelvin_sign() {
    // (counterpart, failure message)
    let cases = [
        ("K", "Kelvin sign should match Latin K"),
        ("k", "Kelvin sign should match Latin k (after casefold)"),
    ];
    let opts = default_opts();
    for &(counterpart, msg) in cases.iter() {
        assert!(matches_normalized("\u{212A}", counterpart, &opts), "{}", msg);
    }
}