use std::borrow::Cow;
use std::fmt::Write;
/// NFC-normalize `s` with the crate under test.
fn our_nfc(s: &str) -> Cow<'_, str> {
    let normalizer = simd_normalizer::nfc();
    normalizer.normalize(s)
}
/// NFD-normalize `s` with the crate under test.
fn our_nfd(s: &str) -> Cow<'_, str> {
    let normalizer = simd_normalizer::nfd();
    normalizer.normalize(s)
}
/// Ask the crate under test whether `s` is already NFC.
fn our_is_nfc(s: &str) -> bool {
    let normalizer = simd_normalizer::nfc();
    normalizer.is_normalized(s)
}
/// Ask the crate under test whether `s` is already NFD.
fn our_is_nfd(s: &str) -> bool {
    let normalizer = simd_normalizer::nfd();
    normalizer.is_normalized(s)
}
/// Reference NFC via ICU4X's normalizer (the differential oracle).
fn icu_nfc(s: &str) -> String {
    use icu_normalizer::ComposingNormalizerBorrowed;
    let normalizer = ComposingNormalizerBorrowed::new_nfc();
    normalizer.normalize(s).into_owned()
}
/// Reference NFD via ICU4X's normalizer (the differential oracle).
fn icu_nfd(s: &str) -> String {
    use icu_normalizer::DecomposingNormalizerBorrowed;
    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
    normalizer.normalize(s).into_owned()
}
/// Render `s` as space-separated `U+XXXX` code points (e.g. "U+0041 U+0300")
/// for readable failure messages.
fn codepoint_dump(s: &str) -> String {
    let mut out = String::new();
    let mut sep = "";
    for c in s.chars() {
        // write! into a String cannot fail; discard the Ok.
        let _ = write!(out, "{sep}U+{:04X}", c as u32);
        sep = " ";
    }
    out
}
/// (script, label, sample text) triples shared by all tests below.
/// The samples cover non-Latin scripts, including sequences with combining
/// marks and canonical-ordering edge cases.
const SCRIPT_SAMPLES: &[(&str, &str, &str)] = &[
    // Georgian
    ("Georgian", "sakartvelo", "\u{10E1}\u{10D0}\u{10E5}\u{10D0}\u{10E0}\u{10D7}\u{10D5}\u{10D4}\u{10DA}\u{10DD}"),
    ("Georgian", "gamarjoba", "\u{10D2}\u{10D0}\u{10DB}\u{10D0}\u{10E0}\u{10EF}\u{10DD}\u{10D1}\u{10D0}"),
    ("Georgian", "tbilisi", "\u{10D7}\u{10D1}\u{10D8}\u{10DA}\u{10D8}\u{10E1}\u{10D8}"),
    ("Georgian", "mixed-sentence", "\u{10DB}\u{10D4} \u{10DB}\u{10D8}\u{10E7}\u{10D5}\u{10D0}\u{10E0}\u{10E1} \u{10E1}\u{10D0}\u{10E5}\u{10D0}\u{10E0}\u{10D7}\u{10D5}\u{10D4}\u{10DA}\u{10DD}"),
    // Armenian
    ("Armenian", "hayastan", "\u{0540}\u{0561}\u{0575}\u{0561}\u{057D}\u{057F}\u{0561}\u{0576}"),
    ("Armenian", "barev", "\u{0532}\u{0561}\u{0580}\u{0587}"),
    ("Armenian", "yerevan", "\u{0535}\u{0580}\u{0587}\u{0561}\u{0576}"),
    ("Armenian", "ech-yiwn-ligature", "\u{0587}"),
    // Ethiopic (includes the U+135D..U+135F combining marks)
    ("Ethiopic", "ethiopia", "\u{12A2}\u{1275}\u{12EE}\u{1335}\u{12EB}"),
    ("Ethiopic", "amharic-greeting", "\u{1230}\u{120B}\u{121D}"),
    ("Ethiopic", "addis-ababa", "\u{12A0}\u{12F2}\u{1235} \u{12A0}\u{1260}\u{1263}"),
    ("Ethiopic", "combining-gemination", "\u{1200}\u{135F}"),
    ("Ethiopic", "combining-vowel-length", "\u{1200}\u{135E}"),
    ("Ethiopic", "combining-both", "\u{1200}\u{135D}\u{135F}"),
    ("Ethiopic", "multi-combining", "\u{1230}\u{135F}\u{120B}\u{135E}\u{121D}"),
    // Tibetan (vowel signs with nonzero canonical combining classes)
    ("Tibetan", "tibet", "\u{0F56}\u{0F7C}\u{0F51}"),
    ("Tibetan", "tashi-delek", "\u{0F56}\u{0F40}\u{0FB2}\u{0F0B}\u{0F64}\u{0F72}\u{0F66}\u{0F0B}\u{0F56}\u{0F51}\u{0F7A}\u{0F0B}\u{0F63}\u{0F7A}\u{0F42}\u{0F66}"),
    ("Tibetan", "ccc-reorder-132-129", "\u{0F40}\u{0F74}\u{0F71}"),
    ("Tibetan", "ccc-reorder-130-129", "\u{0F40}\u{0F72}\u{0F71}"),
    ("Tibetan", "starter-interruption", "\u{0F40}\u{0F72}\u{0F7E}"),
    ("Tibetan", "ccc-already-ordered", "\u{0F40}\u{0F71}\u{0F74}"),
    ("Tibetan", "ccc-triple-reorder", "\u{0F40}\u{0F74}\u{0F72}\u{0F71}"),
    ("Tibetan", "subjoined-mix", "\u{0F40}\u{0FB5}\u{0F71}\u{0F74}"),
    // Tamil
    ("Tamil", "tamil", "\u{0BA4}\u{0BAE}\u{0BBF}\u{0BB4}\u{0BCD}"),
    ("Tamil", "vanakkam", "\u{0BB5}\u{0BA3}\u{0B95}\u{0BCD}\u{0B95}\u{0BAE}\u{0BCD}"),
    ("Tamil", "chennai", "\u{0B9A}\u{0BC6}\u{0BA9}\u{0BCD}\u{0BA9}\u{0BC8}"),
    ("Tamil", "vowel-sign-ii", "\u{0B95}\u{0BC0}"),
    ("Tamil", "virama-sequence", "\u{0B95}\u{0BCD}\u{0BB7}"),
    ("Tamil", "au-length-mark", "\u{0B95}\u{0BCA}\u{0BD7}"),
    // Telugu
    ("Telugu", "telugu", "\u{0C24}\u{0C46}\u{0C32}\u{0C41}\u{0C17}\u{0C41}"),
    ("Telugu", "hyderabad", "\u{0C39}\u{0C48}\u{0C26}\u{0C30}\u{0C3E}\u{0C2C}\u{0C3E}\u{0C26}\u{0C4D}"),
    ("Telugu", "namaskaram", "\u{0C28}\u{0C2E}\u{0C38}\u{0C4D}\u{0C15}\u{0C3E}\u{0C30}\u{0C02}"),
    ("Telugu", "vowel-signs", "\u{0C15}\u{0C46}\u{0C56}"),
    // Kannada
    ("Kannada", "kannada", "\u{0C95}\u{0CA8}\u{0CCD}\u{0CA8}\u{0CA1}"),
    ("Kannada", "bengaluru", "\u{0CAC}\u{0CC6}\u{0C82}\u{0C97}\u{0CB3}\u{0CC2}\u{0CB0}\u{0CC1}"),
    ("Kannada", "namaskara", "\u{0CA8}\u{0CAE}\u{0CB8}\u{0CCD}\u{0C95}\u{0CBE}\u{0CB0}"),
    ("Kannada", "vowel-signs", "\u{0C95}\u{0CC8}"),
    // Malayalam
    ("Malayalam", "malayalam", "\u{0D2E}\u{0D32}\u{0D2F}\u{0D3E}\u{0D33}\u{0D02}"),
    ("Malayalam", "namaskaram", "\u{0D28}\u{0D2E}\u{0D38}\u{0D4D}\u{0D15}\u{0D3E}\u{0D30}\u{0D02}"),
    ("Malayalam", "thiruvananthapuram", "\u{0D24}\u{0D3F}\u{0D30}\u{0D41}\u{0D35}\u{0D28}\u{0D28}\u{0D4D}\u{0D24}\u{0D2A}\u{0D41}\u{0D30}\u{0D02}"),
    ("Malayalam", "chillu-n", "\u{0D7B}"),
    // Myanmar
    ("Myanmar", "myanmar", "\u{1019}\u{103C}\u{1014}\u{103A}\u{1019}\u{102C}"),
    ("Myanmar", "mingalaba", "\u{1019}\u{1004}\u{103A}\u{1039}\u{1002}\u{101C}\u{102C}\u{1015}\u{102B}"),
    ("Myanmar", "yangon", "\u{101B}\u{1014}\u{103A}\u{1000}\u{102F}\u{1014}\u{103A}"),
    ("Myanmar", "medial-cluster", "\u{1000}\u{103C}\u{103D}\u{1031}\u{102C}"),
];
#[test]
fn script_round_trip_nfd_nfc() {
    // NFC(NFD(text)) must agree with NFC(text) for every sample. All
    // mismatches are collected before panicking so one bad script does not
    // hide the rest.
    let mut failures: Vec<String> = Vec::new();
    for &(script, label, text) in SCRIPT_SAMPLES {
        let composed = our_nfc(text);
        let decomposed = our_nfd(text);
        let recomposed = our_nfc(&decomposed);
        if *recomposed == *composed {
            continue;
        }
        failures.push(format!(
            " [{script}/{label}] NFC(NFD(text)) != NFC(text)\n\
            \x20 input: {input_cps}\n\
            \x20 nfc: {nfc_cps}\n\
            \x20 nfd: {nfd_cps}\n\
            \x20 recovered: {recovered_cps}",
            input_cps = codepoint_dump(text),
            nfc_cps = codepoint_dump(&composed),
            nfd_cps = codepoint_dump(&decomposed),
            recovered_cps = codepoint_dump(&recomposed),
        ));
    }
    assert!(
        failures.is_empty(),
        "Round-trip NFC(NFD(text)) failures ({} total):\n{}",
        failures.len(),
        failures.join("\n")
    );
}
#[test]
fn script_idempotence() {
    // Normalizing an already-normalized string must be a no-op, for both
    // forms. Mismatches are accumulated and reported together.
    let mut failures: Vec<String> = Vec::new();
    for &(script, label, text) in SCRIPT_SAMPLES {
        let composed_once = our_nfc(text);
        let composed_twice = our_nfc(&composed_once);
        if *composed_once != *composed_twice {
            failures.push(format!(
                " [{script}/{label}] NFC idempotence failed\n\
                \x20 once: {once_cps}\n\
                \x20 twice: {twice_cps}",
                once_cps = codepoint_dump(&composed_once),
                twice_cps = codepoint_dump(&composed_twice),
            ));
        }
        let decomposed_once = our_nfd(text);
        let decomposed_twice = our_nfd(&decomposed_once);
        if *decomposed_once != *decomposed_twice {
            failures.push(format!(
                " [{script}/{label}] NFD idempotence failed\n\
                \x20 once: {once_cps}\n\
                \x20 twice: {twice_cps}",
                once_cps = codepoint_dump(&decomposed_once),
                twice_cps = codepoint_dump(&decomposed_twice),
            ));
        }
    }
    assert!(
        failures.is_empty(),
        "Idempotence failures ({} total):\n{}",
        failures.len(),
        failures.join("\n")
    );
}
#[test]
fn script_is_normalized_consistency() {
    // The is_normalized fast path must accept the output of the
    // corresponding full normalization.
    let mut failures: Vec<String> = Vec::new();
    for &(script, label, text) in SCRIPT_SAMPLES {
        let composed = our_nfc(text);
        if !our_is_nfc(&composed) {
            failures.push(format!(
                " [{script}/{label}] is_nfc(nfc(text)) returned false\n\
                \x20 nfc: {nfc_cps}",
                nfc_cps = codepoint_dump(&composed),
            ));
        }
        let decomposed = our_nfd(text);
        if !our_is_nfd(&decomposed) {
            failures.push(format!(
                " [{script}/{label}] is_nfd(nfd(text)) returned false\n\
                \x20 nfd: {nfd_cps}",
                nfd_cps = codepoint_dump(&decomposed),
            ));
        }
    }
    assert!(
        failures.is_empty(),
        "is_normalized consistency failures ({} total):\n{}",
        failures.len(),
        failures.join("\n")
    );
}
#[test]
fn script_differential_vs_icu_normalizer() {
    // Differential oracle: our NFC/NFD output must match ICU4X exactly on
    // every sample.
    let mut failures: Vec<String> = Vec::new();
    for &(script, label, text) in SCRIPT_SAMPLES {
        let ours_composed = our_nfc(text);
        let icu_composed = icu_nfc(text);
        if ours_composed != icu_composed.as_str() {
            failures.push(format!(
                " [{script}/{label}] NFC diverges from icu_normalizer\n\
                \x20 input: {input_cps}\n\
                \x20 ours: {ours_cps}\n\
                \x20 icu: {icu_cps}",
                input_cps = codepoint_dump(text),
                ours_cps = codepoint_dump(&ours_composed),
                icu_cps = codepoint_dump(&icu_composed),
            ));
        }
        let ours_decomposed = our_nfd(text);
        let icu_decomposed = icu_nfd(text);
        if ours_decomposed != icu_decomposed.as_str() {
            failures.push(format!(
                " [{script}/{label}] NFD diverges from icu_normalizer\n\
                \x20 input: {input_cps}\n\
                \x20 ours: {ours_cps}\n\
                \x20 icu: {icu_cps}",
                input_cps = codepoint_dump(text),
                ours_cps = codepoint_dump(&ours_decomposed),
                icu_cps = codepoint_dump(&icu_decomposed),
            ));
        }
    }
    assert!(
        failures.is_empty(),
        "Differential vs icu_normalizer failures ({} total):\n{}",
        failures.len(),
        failures.join("\n")
    );
}
#[test]
fn tibetan_ccc_reordering_detailed() {
    // Focused canonical-ordering cases using Tibetan vowel signs
    // (U+0F71/U+0F72/U+0F74) in ordered, reversed, and starter-interrupted
    // sequences; each case is checked against ICU4X for both forms.
    let cases: &[(&str, &str)] = &[
        ("U-before-AA", "\u{0F40}\u{0F74}\u{0F71}"),
        ("I-before-AA", "\u{0F40}\u{0F72}\u{0F71}"),
        ("U-before-I", "\u{0F40}\u{0F74}\u{0F72}"),
        ("AA-I-U-ordered", "\u{0F40}\u{0F71}\u{0F72}\u{0F74}"),
        ("U-I-AA-reversed", "\u{0F40}\u{0F74}\u{0F72}\u{0F71}"),
        ("starter-between", "\u{0F40}\u{0F72}\u{0F0B}\u{0F40}\u{0F71}"),
        ("single-AA", "\u{0F40}\u{0F71}"),
        ("single-I", "\u{0F40}\u{0F72}"),
        ("single-U", "\u{0F40}\u{0F74}"),
        ("ccc0-termination", "\u{0F40}\u{0F74}\u{0F71}\u{0F7E}"),
    ];
    for &(label, input) in cases {
        // Compare both normalization forms with one shared assertion body.
        let checks = [
            ("NFD", our_nfd(input).into_owned(), icu_nfd(input)),
            ("NFC", our_nfc(input).into_owned(), icu_nfc(input)),
        ];
        for (form, ours, icu) in checks {
            assert_eq!(
                ours,
                icu,
                "Tibetan CCC reorder [{label}] {form} mismatch\n\
                \x20 input: {input_cps}\n\
                \x20 ours: {ours_cps}\n\
                \x20 icu: {icu_cps}",
                input_cps = codepoint_dump(input),
                ours_cps = codepoint_dump(&ours),
                icu_cps = codepoint_dump(&icu),
            );
        }
    }
}
#[test]
fn ethiopic_combining_marks_detailed() {
    // Ethiopic combining marks (U+135D..U+135F), alone, stacked, across
    // multiple bases, and mixed with a non-Ethiopic combining mark; each case
    // is checked against ICU4X for both forms.
    let cases: &[(&str, &str)] = &[
        ("gemination-and-vowel", "\u{1200}\u{135D}"),
        ("vowel-length", "\u{1200}\u{135E}"),
        ("gemination", "\u{1200}\u{135F}"),
        ("double-combining", "\u{1200}\u{135D}\u{135F}"),
        ("multi-base-combining", "\u{1200}\u{135F}\u{1201}\u{135E}"),
        ("cross-script-ccc", "A\u{0300}\u{135F}"),
    ];
    for &(label, input) in cases {
        // Compare both normalization forms with one shared assertion body.
        let checks = [
            ("NFD", our_nfd(input).into_owned(), icu_nfd(input)),
            ("NFC", our_nfc(input).into_owned(), icu_nfc(input)),
        ];
        for (form, ours, icu) in checks {
            assert_eq!(
                ours,
                icu,
                "Ethiopic combining [{label}] {form} mismatch\n\
                \x20 input: {input_cps}\n\
                \x20 ours: {ours_cps}\n\
                \x20 icu: {icu_cps}",
                input_cps = codepoint_dump(input),
                ours_cps = codepoint_dump(&ours),
                icu_cps = codepoint_dump(&icu),
            );
        }
    }
}
#[test]
fn tamil_combining_marks_detailed() {
    // Tamil vowel signs, virama sequences, and conjuncts; each case is
    // checked against ICU4X for both forms.
    let cases: &[(&str, &str)] = &[
        ("ka-virama", "\u{0B95}\u{0BCD}"),
        ("ka-vowel-i", "\u{0B95}\u{0BBF}"),
        ("ka-vowel-ii", "\u{0B95}\u{0BC0}"),
        ("ka-vowel-u", "\u{0B95}\u{0BC1}"),
        ("conjunct-ksha", "\u{0B95}\u{0BCD}\u{0BB7}"),
        ("geminated-kka", "\u{0B95}\u{0BCD}\u{0B95}"),
        ("long-kka-aa", "\u{0B95}\u{0BCD}\u{0B95}\u{0BBE}"),
        ("om-sign", "\u{0BD0}"),
    ];
    for &(label, input) in cases {
        // Compare both normalization forms with one shared assertion body.
        let checks = [
            ("NFD", our_nfd(input).into_owned(), icu_nfd(input)),
            ("NFC", our_nfc(input).into_owned(), icu_nfc(input)),
        ];
        for (form, ours, icu) in checks {
            assert_eq!(
                ours,
                icu,
                "Tamil combining [{label}] {form} mismatch\n\
                \x20 input: {input_cps}\n\
                \x20 ours: {ours_cps}\n\
                \x20 icu: {icu_cps}",
                input_cps = codepoint_dump(input),
                ours_cps = codepoint_dump(&ours),
                icu_cps = codepoint_dump(&icu),
            );
        }
    }
}