use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
/// Normalizes `s` to NFC with `icu_normalizer` and returns an owned copy.
///
/// Serves as the reference result that `simd_normalizer` is compared against.
fn icu_nfc(s: &str) -> String {
    let normalizer = ComposingNormalizerBorrowed::new_nfc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` to NFD with `icu_normalizer` and returns an owned copy.
///
/// Serves as the reference result that `simd_normalizer` is compared against.
fn icu_nfd(s: &str) -> String {
    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` to NFKC with `icu_normalizer` and returns an owned copy.
///
/// Serves as the reference result that `simd_normalizer` is compared against.
fn icu_nfkc(s: &str) -> String {
    let normalizer = ComposingNormalizerBorrowed::new_nfkc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` to NFKD with `icu_normalizer` and returns an owned copy.
///
/// Serves as the reference result that `simd_normalizer` is compared against.
fn icu_nfkd(s: &str) -> String {
    let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` with `simd_normalizer::nfc()` and returns an owned copy.
fn our_nfc(s: &str) -> String {
    let normalizer = simd_normalizer::nfc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` with `simd_normalizer::nfd()` and returns an owned copy.
fn our_nfd(s: &str) -> String {
    let normalizer = simd_normalizer::nfd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` with `simd_normalizer::nfkc()` and returns an owned copy.
fn our_nfkc(s: &str) -> String {
    let normalizer = simd_normalizer::nfkc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` with `simd_normalizer::nfkd()` and returns an owned copy.
fn our_nfkd(s: &str) -> String {
    let normalizer = simd_normalizer::nfkd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Renders every `char` of `s` as `U+XXXX` (uppercase hex, zero-padded to at least
/// four digits), separated by single spaces.
///
/// An empty string yields an empty string. Used to make assertion failures readable.
fn codepoints(s: &str) -> String {
    let mut rendered = String::new();
    for (position, ch) in s.chars().enumerate() {
        // Separator goes between entries only, never before the first one.
        if position > 0 {
            rendered.push(' ');
        }
        rendered.push_str(&format!("U+{:04X}", ch as u32));
    }
    rendered
}
/// Asserts that `our_nfd(input)` equals `icu_nfd(input)`.
///
/// On mismatch the panic message carries `label` plus the code points of the input,
/// our output, and the ICU output.
fn assert_nfd_matches_icu(label: &str, input: &str) {
    let (actual, expected) = (our_nfd(input), icu_nfd(input));
    assert_eq!(
        actual,
        expected,
        "NFD mismatch [{label}]\n input: {}\n ours: {}\n icu: {}",
        codepoints(input),
        codepoints(&actual),
        codepoints(&expected),
    );
}
/// Asserts that `our_nfc(input)` equals `icu_nfc(input)`.
///
/// On mismatch the panic message carries `label` plus the code points of the input,
/// our output, and the ICU output.
fn assert_nfc_matches_icu(label: &str, input: &str) {
    let (actual, expected) = (our_nfc(input), icu_nfc(input));
    assert_eq!(
        actual,
        expected,
        "NFC mismatch [{label}]\n input: {}\n ours: {}\n icu: {}",
        codepoints(input),
        codepoints(&actual),
        codepoints(&expected),
    );
}
/// Asserts that `our_nfkd(input)` equals `icu_nfkd(input)`.
///
/// On mismatch the panic message carries `label` plus the code points of the input,
/// our output, and the ICU output.
fn assert_nfkd_matches_icu(label: &str, input: &str) {
    let (actual, expected) = (our_nfkd(input), icu_nfkd(input));
    assert_eq!(
        actual,
        expected,
        "NFKD mismatch [{label}]\n input: {}\n ours: {}\n icu: {}",
        codepoints(input),
        codepoints(&actual),
        codepoints(&expected),
    );
}
/// Asserts that `our_nfkc(input)` equals `icu_nfkc(input)`.
///
/// On mismatch the panic message carries `label` plus the code points of the input,
/// our output, and the ICU output.
fn assert_nfkc_matches_icu(label: &str, input: &str) {
    let (actual, expected) = (our_nfkc(input), icu_nfkc(input));
    assert_eq!(
        actual,
        expected,
        "NFKC mismatch [{label}]\n input: {}\n ours: {}\n icu: {}",
        codepoints(input),
        codepoints(&actual),
        codepoints(&expected),
    );
}
fn assert_all_forms_match_icu(label: &str, input: &str) {
assert_nfd_matches_icu(label, input);
assert_nfc_matches_icu(label, input);
assert_nfkd_matches_icu(label, input);
assert_nfkc_matches_icu(label, input);
}
/// Six marks on `a`, first in one order and then in the exact reverse order, compared
/// against ICU in all four forms.
#[test]
fn stable_sort_same_ccc_above_marks() {
    let cases = [
        (
            "stable-sort-6-above-marks",
            "a\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0308}",
        ),
        (
            "stable-sort-6-above-marks-reversed",
            "a\u{0308}\u{0304}\u{0303}\u{0302}\u{0301}\u{0300}",
        ),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// Five marks on `a` (U+0316, U+0317, U+0323, U+0324, U+0325), compared against ICU
/// in all four forms.
#[test]
fn stable_sort_same_ccc_below_marks() {
    assert_all_forms_match_icu(
        "stable-sort-5-below-marks",
        "a\u{0316}\u{0317}\u{0323}\u{0324}\u{0325}",
    );
}
/// An `e` carrying two marks, then two different marks, then two more of the first
/// kind (U+0300 U+0301 | U+0323 U+0325 | U+0302 U+0303), compared against ICU in all
/// four forms.
#[test]
fn stable_sort_mixed_same_ccc_groups() {
    assert_all_forms_match_icu(
        "stable-sort-mixed-groups",
        "e\u{0300}\u{0301}\u{0323}\u{0325}\u{0302}\u{0303}",
    );
}
/// An `e` followed by U+0323, U+0302 and U+0301, compared against ICU in all four
/// forms (the composing forms are covered by the shared helper).
#[test]
fn stable_sort_three_marks_same_ccc_with_composition() {
    assert_all_forms_match_icu(
        "stable-sort-compose-with-trailing",
        "e\u{0323}\u{0302}\u{0301}",
    );
}
/// One base letter (U+05D1) carrying eleven marks, once in ascending code point order
/// and once in the exact reverse order, compared against ICU in all four forms.
#[test]
fn hebrew_vowel_marks_single_base() {
    let cases = [
        (
            "hebrew-all-vowels-sorted",
            "\u{05D1}\u{05B0}\u{05B1}\u{05B4}\u{05B7}\u{05B8}\u{05BB}\u{05BC}\u{05BD}\u{05BF}\u{05C1}\u{05C2}",
        ),
        (
            "hebrew-all-vowels-reversed",
            "\u{05D1}\u{05C2}\u{05C1}\u{05BF}\u{05BD}\u{05BC}\u{05BB}\u{05B8}\u{05B7}\u{05B4}\u{05B1}\u{05B0}",
        ),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// Two short mark sequences on U+05D1 (U+05BC then U+05B0, and U+05B4 U+05BC U+05BD),
/// each compared against ICU in all four forms.
#[test]
fn hebrew_dagesh_and_vowel_ordering() {
    let cases = [
        ("hebrew-dagesh-before-sheva", "\u{05D1}\u{05BC}\u{05B0}"),
        ("hebrew-hiriq-dagesh-meteg", "\u{05D1}\u{05B4}\u{05BC}\u{05BD}"),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// U+05E9 with U+05C1 alone, with U+05C2 alone, and with both dots followed by
/// U+05B4; each input is compared against ICU in all four forms.
#[test]
fn hebrew_shin_sin_dot_distinction() {
    let cases = [
        ("hebrew-shin-dot", "\u{05E9}\u{05C1}"),
        ("hebrew-sin-dot", "\u{05E9}\u{05C2}"),
        ("hebrew-two-dots-plus-vowel", "\u{05E9}\u{05C2}\u{05C1}\u{05B4}"),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// A multi-letter pointed Hebrew string (several bases, some with marks), compared
/// against ICU in all four forms.
#[test]
fn hebrew_realistic_word() {
    assert_all_forms_match_icu(
        "hebrew-realistic-word",
        "\u{05D1}\u{05BC}\u{05B0}\u{05E8}\u{05D0}\u{05E9}\u{05C1}\u{05B4}\u{05EA}",
    );
}
/// One base letter (U+0628) carrying nine marks, once in ascending code point order
/// and once in the exact reverse order, compared against ICU in all four forms.
#[test]
fn arabic_vowel_marks_single_base() {
    let cases = [
        (
            "arabic-all-marks-sorted",
            "\u{0628}\u{064B}\u{064C}\u{064D}\u{064E}\u{064F}\u{0650}\u{0651}\u{0652}\u{0670}",
        ),
        (
            "arabic-all-marks-reversed",
            "\u{0628}\u{0670}\u{0652}\u{0651}\u{0650}\u{064F}\u{064E}\u{064D}\u{064C}\u{064B}",
        ),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// U+0651 combined with a second mark on U+0628 in three arrangements, each compared
/// against ICU in all four forms.
#[test]
fn arabic_shadda_with_vowel() {
    let cases = [
        ("arabic-shadda-fatha", "\u{0628}\u{0651}\u{064E}"),
        ("arabic-kasra-shadda", "\u{0628}\u{0650}\u{0651}"),
        ("arabic-shadda-kasra-reorder", "\u{0628}\u{0651}\u{0650}"),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// U+0628 followed by U+061A, U+0619, U+0618 (descending code point order), compared
/// against ICU in all four forms.
#[test]
fn arabic_small_marks() {
    assert_all_forms_match_icu(
        "arabic-small-marks-reversed",
        "\u{0628}\u{061A}\u{0619}\u{0618}",
    );
}
/// A multi-letter vocalized Arabic string (several bases, some with one or two marks),
/// compared against ICU in all four forms.
#[test]
fn arabic_realistic_word() {
    assert_all_forms_match_icu(
        "arabic-bismillah-fragment",
        "\u{0628}\u{0650}\u{0633}\u{0652}\u{0645}\u{0650}\u{0627}\u{0644}\u{0644}\u{0651}\u{064E}\u{0647}\u{0650}",
    );
}
/// Tibetan vowel signs on U+0F40: one sequence given in descending code point order,
/// and one four-sign sequence labelled as sharing a single combining class. Both are
/// compared against ICU in all four forms.
#[test]
fn tibetan_vowel_signs() {
    let cases = [
        ("tibetan-vowels-unsorted", "\u{0F40}\u{0F74}\u{0F72}\u{0F71}"),
        (
            "tibetan-same-ccc-130",
            "\u{0F40}\u{0F72}\u{0F7A}\u{0F7C}\u{0F80}",
        ),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// U+0F40 followed by U+0FB6 and then U+0F72, compared against ICU in all four forms.
#[test]
fn tibetan_subjoined_consonants() {
    assert_all_forms_match_icu(
        "tibetan-subjoined-with-vowel",
        "\u{0F40}\u{0FB6}\u{0F72}",
    );
}
/// A Latin `a` carrying a mix of Tibetan-block marks (U+0F39, U+0F71) and
/// U+03xx combining marks (U+0302, U+0323), compared against ICU in all four forms.
#[test]
fn tibetan_marks_with_latin_marks() {
    assert_all_forms_match_icu(
        "tibetan-latin-mixed-ccc",
        "a\u{0F39}\u{0302}\u{0323}\u{0F71}",
    );
}
/// Fully decomposed `o` with U+031B plus one or two further marks, in three different
/// input orders; each is compared against ICU in all four forms.
#[test]
fn vietnamese_tone_marks_basic() {
    let cases = [
        ("vietnamese-o-horn-acute", "o\u{031B}\u{0301}"),
        ("vietnamese-o-horn-dot-grave", "o\u{031B}\u{0323}\u{0300}"),
        ("vietnamese-reversed-marks", "o\u{0300}\u{0323}\u{031B}"),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// The single code point U+01A1 followed by extra combining marks, compared against
/// ICU in all four forms.
#[test]
fn vietnamese_precomposed_base_with_extra_marks() {
    let cases = [
        ("vietnamese-precomposed-o-horn-acute", "\u{01A1}\u{0301}"),
        (
            "vietnamese-precomposed-o-horn-dot-hook",
            "\u{01A1}\u{0323}\u{0309}",
        ),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// The phrase "Việt Nam" written once with the single code point U+1EC7 and once with
/// `e` + U+0323 + U+0302, each compared against ICU in all four forms.
#[test]
fn vietnamese_realistic_word() {
    let cases = [
        ("vietnamese-viet-nam-precomposed", "Vi\u{1EC7}t Nam"),
        (
            "vietnamese-viet-nam-decomposed",
            "Vie\u{0323}\u{0302}t Nam",
        ),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// Builds `a` followed by 50 marks by cycling through a fixed palette of ten marks,
/// then compares all four forms against ICU.
#[test]
fn long_sequence_50_above_marks() {
    let palette = [
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0305}', '\u{0306}',
        '\u{0307}', '\u{0308}', '\u{030B}',
    ];
    let mut input = String::from("a");
    // Five full passes over the palette, in palette order.
    input.extend(palette.iter().cycle().take(50));
    assert_all_forms_match_icu("long-50-above-marks", &input);
}
/// Builds `e` followed by 60 marks by walking a ten-entry table backwards, six times
/// over, then compares all four forms against ICU.
///
/// The table is listed in ascending combining-class order, so walking it backwards
/// feeds the marks in descending class order within each pass. The `u8` column only
/// documents each mark's class; the test itself never reads it.
#[test]
fn long_sequence_60_mixed_ccc() {
    let marks_and_ccc: [(char, u8); 10] = [
        ('\u{0327}', 202),
        ('\u{0328}', 202),
        ('\u{0316}', 220),
        ('\u{0323}', 220),
        ('\u{0317}', 220),
        ('\u{0300}', 230),
        ('\u{0301}', 230),
        ('\u{0302}', 230),
        ('\u{0308}', 230),
        ('\u{0303}', 230),
    ];
    let mut input = String::from("e");
    input.extend(
        (0..60)
            .rev()
            .map(|slot| marks_and_ccc[slot % marks_and_ccc.len()].0),
    );
    assert_all_forms_match_icu("long-60-mixed-reverse-ccc", &input);
}
/// Builds `x` followed by 100 marks by cycling through a fixed palette of ten marks,
/// then compares all four forms against ICU.
#[test]
fn long_sequence_100_marks_all_same_ccc() {
    let palette = [
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0305}', '\u{0306}',
        '\u{0307}', '\u{0308}', '\u{030B}',
    ];
    let mut input = String::from("x");
    // Ten full passes over the palette, in palette order.
    input.extend(palette.iter().cycle().take(100));
    assert_all_forms_match_icu("long-100-same-ccc-230", &input);
}
/// Checks a run of exactly 18 marks on `a`, then the same run with a 19th mark
/// appended, against ICU in all four forms.
///
/// NOTE(review): the test name and labels suggest 18 is an inline-buffer capacity in
/// `simd_normalizer` and 19 forces the overflow path — confirm against the crate.
#[test]
fn long_sequence_boundary_at_inline_cap() {
    // Alternate U+0323 (even slots) and U+0301 (odd slots).
    let mut input_18 = String::from("a");
    input_18.extend((0..18).map(|slot| {
        if slot % 2 == 0 {
            '\u{0323}'
        } else {
            '\u{0301}'
        }
    }));
    assert_all_forms_match_icu("long-exactly-18-marks", &input_18);

    let mut input_19 = input_18.clone();
    input_19.push('\u{0300}');
    assert_all_forms_match_icu("long-exactly-19-marks-overflow", &input_19);
}
/// Builds `a` followed by 54 marks by walking an 18-entry table backwards, three
/// times over, then compares all four forms against ICU.
///
/// The `u8` column only documents each mark's combining class; the test itself never
/// reads it. Note that the last two table entries are listed as 240 then 230.
#[test]
fn long_sequence_multiple_ccc_bands() {
    let marks_by_ccc: [(char, u8); 18] = [
        ('\u{0334}', 1),
        ('\u{093C}', 7),
        ('\u{094D}', 9),
        ('\u{05B0}', 10),
        ('\u{05B4}', 14),
        ('\u{05BC}', 21),
        ('\u{064B}', 27),
        ('\u{064E}', 30),
        ('\u{0651}', 33),
        ('\u{0670}', 35),
        ('\u{0327}', 202),
        ('\u{031B}', 216),
        ('\u{0323}', 220),
        ('\u{0300}', 230),
        ('\u{0301}', 230),
        ('\u{0302}', 230),
        ('\u{0345}', 240),
        ('\u{0303}', 230),
    ];
    let mut input = String::from("a");
    // Slots run 53, 52, ..., 0, each reduced modulo the table length.
    input.extend(
        (0..54)
            .rev()
            .map(|slot| marks_by_ccc[slot % marks_by_ccc.len()].0),
    );
    assert_all_forms_match_icu("long-54-multi-ccc-bands", &input);
}
/// Six marks on `a` supplied from highest to lowest combining class
/// (U+0345, U+0300, U+0323, U+0327, U+031B, U+0334), compared against ICU in all four
/// forms.
#[test]
fn worst_case_reverse_ccc_order() {
    assert_all_forms_match_icu(
        "worst-case-reverse-6-marks",
        "a\u{0345}\u{0300}\u{0323}\u{0327}\u{031B}\u{0334}",
    );
}
/// Seven marks on `a` whose combining classes alternate between high and low values,
/// compared against ICU in all four forms.
#[test]
fn worst_case_interleaved_ascending_descending() {
    assert_all_forms_match_icu(
        "worst-case-interleaved",
        "a\u{0300}\u{0334}\u{0323}\u{093C}\u{0327}\u{094D}\u{0345}",
    );
}
/// Nine marks on `a` drawn from many different combining classes, compared against
/// ICU in all four forms.
///
/// NOTE(review): despite the test name, U+0350 and U+0300 appear to share a combining
/// class, so the values may not all be distinct — confirm against UnicodeData.
#[test]
fn worst_case_all_distinct_ccc_values() {
    assert_all_forms_match_icu(
        "worst-case-many-distinct-ccc",
        "a\u{0345}\u{035D}\u{0362}\u{0350}\u{0300}\u{0323}\u{0327}\u{094D}\u{0334}",
    );
}
/// Six marks on `e` where marks of repeated combining classes are interleaved with
/// each other, compared against ICU in all four forms.
#[test]
fn worst_case_duplicate_ccc_values_reversed() {
    assert_all_forms_match_icu(
        "worst-case-duplicates-interleaved",
        "e\u{0301}\u{0323}\u{0300}\u{0327}\u{0302}\u{0328}",
    );
}
/// `e` with U+0327 and U+0301 in both input orders. Only the NFD and NFC comparisons
/// run here, each under its own label.
#[test]
fn cross_form_nfd_vs_nfc_cedilla_acute() {
    let cases = [
        (
            "cross-form-nfd-cedilla-acute",
            "cross-form-nfc-cedilla-acute",
            "e\u{0327}\u{0301}",
        ),
        (
            "cross-form-nfd-acute-cedilla",
            "cross-form-nfc-acute-cedilla",
            "e\u{0301}\u{0327}",
        ),
    ];
    for (nfd_label, nfc_label, input) in cases {
        assert_nfd_matches_icu(nfd_label, input);
        assert_nfc_matches_icu(nfc_label, input);
    }
}
/// `e` with U+0327, U+0323 and U+0301, with the first two marks in either order,
/// compared against ICU in all four forms.
#[test]
fn cross_form_blocking_by_intervening_mark() {
    let cases = [
        (
            "cross-form-blocking-cedilla-dot-acute",
            "e\u{0327}\u{0323}\u{0301}",
        ),
        ("cross-form-blocking-reordered", "e\u{0323}\u{0327}\u{0301}"),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// `a` with U+0301 followed by three overlay marks (U+0334, U+0335, U+0338), compared
/// against ICU in all four forms.
#[test]
fn overlay_marks_ccc_1() {
    assert_all_forms_match_icu(
        "overlay-ccc-1-after-230",
        "a\u{0301}\u{0334}\u{0335}\u{0338}",
    );
}
/// U+0915 with U+093C and U+094D in both input orders, compared against ICU in all
/// four forms.
#[test]
fn nukta_ccc_7() {
    let cases = [
        ("nukta-virama-sorted", "\u{0915}\u{093C}\u{094D}"),
        ("nukta-virama-reversed", "\u{0915}\u{094D}\u{093C}"),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// A kana base followed by a combining voicing mark (U+304B + U+3099, and
/// U+306F + U+309A), compared against ICU in all four forms.
#[test]
fn kana_voicing_ccc_8() {
    let cases = [
        ("kana-voicing-dakuten", "\u{304B}\u{3099}"),
        ("kana-voicing-handakuten", "\u{306F}\u{309A}"),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// Greek alpha (upper- and lowercase) with U+0345 placed first in one input and last
/// in the other, alongside U+0300 and U+0313; compared against ICU in all four forms.
#[test]
fn iota_subscript_ccc_240() {
    let cases = [
        (
            "iota-subscript-sorts-last",
            "\u{0391}\u{0345}\u{0300}\u{0313}",
        ),
        (
            "greek-alpha-breathing-grave-iota",
            "\u{03B1}\u{0313}\u{0300}\u{0345}",
        ),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// A Latin `a` carrying U+0300 plus marks from the Arabic (U+0651, U+064E) and Hebrew
/// (U+05B0) blocks, compared against ICU in all four forms.
#[test]
fn hebrew_arabic_marks_on_same_base() {
    assert_all_forms_match_icu(
        "hebrew-arabic-mixed-on-latin",
        "a\u{0300}\u{0651}\u{064E}\u{05B0}",
    );
}
/// Three consecutive clusters (`a`, `b`, `c`), each with its own run of two or three
/// marks, compared against ICU in all four forms.
#[test]
fn multiple_clusters_with_marks() {
    assert_all_forms_match_icu(
        "multi-cluster-with-marks",
        "a\u{0302}\u{0327}b\u{0323}\u{0301}c\u{0345}\u{0300}\u{0334}",
    );
}
/// Five letters (`a` through `e`), each followed by exactly one mark, compared against
/// ICU in all four forms.
#[test]
fn alternating_starters_and_marks() {
    assert_all_forms_match_icu(
        "alternating-starter-mark",
        "a\u{0301}b\u{0302}c\u{0303}d\u{0327}e\u{0323}",
    );
}
/// The empty string, compared against ICU in all four forms.
#[test]
fn empty_input() {
    let nothing = "";
    assert_all_forms_match_icu("empty", nothing);
}
/// Inputs that begin with a combining mark and contain no base character: a lone
/// U+0301, and a run of three marks. Each is compared against ICU in all four forms.
#[test]
fn single_combining_mark_no_base() {
    let cases = [
        ("lone-acute", "\u{0301}"),
        ("multiple-marks-no-base", "\u{0323}\u{0301}\u{0327}"),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// The four-code-point cluster `e` U+0301 U+0323 U+0327 repeated 100 times, compared
/// against ICU in all four forms.
#[test]
fn repeated_cluster_stress() {
    let input = "e\u{0301}\u{0323}\u{0327}".repeat(100);
    assert_all_forms_match_icu("repeated-cluster-100x", &input);
}
/// Four code points above U+FFFF (U+1D157, U+1D16D, U+1D165, U+1D167), compared
/// against ICU in all four forms.
#[test]
fn supplementary_plane_combining_marks() {
    assert_all_forms_match_icu(
        "musical-supplementary-marks",
        "\u{1D157}\u{1D16D}\u{1D165}\u{1D167}",
    );
}
/// `a` with U+0327 and U+0328 in both input orders, compared against ICU in all four
/// forms.
#[test]
fn attached_below_marks() {
    let cases = [
        ("attached-below-cedilla-ogonek", "a\u{0327}\u{0328}"),
        ("attached-below-ogonek-cedilla", "a\u{0328}\u{0327}"),
    ];
    for (label, input) in cases {
        assert_all_forms_match_icu(label, input);
    }
}
/// U+05D0 followed by six marks from the U+0591..U+059A range in mixed order, compared
/// against ICU in all four forms.
#[test]
fn hebrew_cantillation_marks() {
    assert_all_forms_match_icu(
        "hebrew-cantillation-mixed",
        "\u{05D0}\u{0594}\u{0593}\u{0592}\u{059A}\u{0596}\u{0591}",
    );
}
/// `a` followed by 18 different marks, each appearing once, compared against ICU in
/// all four forms. The mark count is asserted so the list cannot silently drift.
///
/// NOTE(review): the test name suggests 18 is an inline-buffer capacity in
/// `simd_normalizer` — confirm against the crate.
#[test]
fn exactly_inline_cap_same_ccc() {
    let above_marks: &[char] = &[
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0305}', '\u{0306}',
        '\u{0307}', '\u{0308}', '\u{030B}', '\u{030C}', '\u{030D}', '\u{030E}', '\u{030F}',
        '\u{0310}', '\u{0311}', '\u{0312}', '\u{0313}',
    ];
    assert_eq!(above_marks.len(), 18);

    let mut input = String::from("a");
    input.extend(above_marks.iter());
    assert_all_forms_match_icu("exactly-inline-cap-18-same-ccc", &input);
}