use std::borrow::Cow;
use std::fmt::Write;
/// Test corpus: `(label, contents)` pairs embedded at compile time from
/// `../test-data/`. Covers many scripts (Arabic, CJK, Cyrillic, Greek,
/// Hebrew, Thai, emoji, ...) so every normalization form is exercised on
/// both composing and decomposing scripts. Every test below iterates this
/// table.
const TEXTS: &[(&str, &str)] = &[
    ("arabic-carroll", include_str!("../test-data/Carroll-11-ar.txt")),
    ("german-carroll", include_str!("../test-data/Carroll-11-de.txt")),
    ("greek-carroll", include_str!("../test-data/Carroll-11-el.txt")),
    ("spanish-carroll", include_str!("../test-data/Carroll-11-es.txt")),
    ("french-carroll", include_str!("../test-data/Carroll-11-fr.txt")),
    ("hebrew-carroll", include_str!("../test-data/Carroll-11-he.txt")),
    ("polish-carroll", include_str!("../test-data/Carroll-11-pl.txt")),
    ("russian-carroll", include_str!("../test-data/Carroll-11-ru.txt")),
    ("thai-carroll", include_str!("../test-data/Carroll-11-th.txt")),
    ("turkish-carroll", include_str!("../test-data/Carroll-11-tr.txt")),
    ("japanese-hiragana", include_str!("../test-data/TestNames_Japanese_h.txt")),
    ("japanese-katakana", include_str!("../test-data/TestNames_Japanese_k.txt")),
    ("korean-names", include_str!("../test-data/TestNames_Korean.txt")),
    ("latin-names", include_str!("../test-data/TestNames_Latin.txt")),
    ("thai-names", include_str!("../test-data/TestNames_Thai.txt")),
    ("english-wotw", include_str!("../test-data/wotw.txt")),
    ("arabic-lipsum", include_str!("../test-data/Arabic-Lipsum.txt")),
    ("chinese-lipsum", include_str!("../test-data/Chinese-Lipsum.txt")),
    ("emoji-lipsum", include_str!("../test-data/Emoji-Lipsum.txt")),
    ("hebrew-lipsum", include_str!("../test-data/Hebrew-Lipsum.txt")),
    ("hindi-lipsum", include_str!("../test-data/Hindi-Lipsum.txt")),
    ("japanese-lipsum", include_str!("../test-data/Japanese-Lipsum.txt")),
    ("korean-lipsum", include_str!("../test-data/Korean-Lipsum.txt")),
    ("latin-lipsum", include_str!("../test-data/Latin-Lipsum.txt")),
    ("russian-lipsum", include_str!("../test-data/Russian-Lipsum.txt")),
];
/// NFC-normalize `s` with the crate under test.
fn our_nfc(s: &str) -> Cow<'_, str> {
    let normalizer = simd_normalizer::nfc();
    normalizer.normalize(s)
}
/// NFD-normalize `s` with the crate under test.
fn our_nfd(s: &str) -> Cow<'_, str> {
    let normalizer = simd_normalizer::nfd();
    normalizer.normalize(s)
}
/// NFKC-normalize `s` with the crate under test.
fn our_nfkc(s: &str) -> Cow<'_, str> {
    let normalizer = simd_normalizer::nfkc();
    normalizer.normalize(s)
}
/// NFKD-normalize `s` with the crate under test.
fn our_nfkd(s: &str) -> Cow<'_, str> {
    let normalizer = simd_normalizer::nfkd();
    normalizer.normalize(s)
}
/// True if `s` is already NFC according to the crate under test.
fn our_is_nfc(s: &str) -> bool {
    let normalizer = simd_normalizer::nfc();
    normalizer.is_normalized(s)
}
/// True if `s` is already NFD according to the crate under test.
fn our_is_nfd(s: &str) -> bool {
    let normalizer = simd_normalizer::nfd();
    normalizer.is_normalized(s)
}
/// True if `s` is already NFKC according to the crate under test.
fn our_is_nfkc(s: &str) -> bool {
    let normalizer = simd_normalizer::nfkc();
    normalizer.is_normalized(s)
}
/// True if `s` is already NFKD according to the crate under test.
fn our_is_nfkd(s: &str) -> bool {
    let normalizer = simd_normalizer::nfkd();
    normalizer.is_normalized(s)
}
/// Reference NFC via ICU4X's composing normalizer.
fn icu_nfc(s: &str) -> String {
    use icu_normalizer::ComposingNormalizerBorrowed;
    let normalizer = ComposingNormalizerBorrowed::new_nfc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Reference NFD via ICU4X's decomposing normalizer.
fn icu_nfd(s: &str) -> String {
    use icu_normalizer::DecomposingNormalizerBorrowed;
    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Reference NFKC via ICU4X's composing normalizer.
fn icu_nfkc(s: &str) -> String {
    use icu_normalizer::ComposingNormalizerBorrowed;
    let normalizer = ComposingNormalizerBorrowed::new_nfkc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Reference NFKD via ICU4X's decomposing normalizer.
fn icu_nfkd(s: &str) -> String {
    use icu_normalizer::DecomposingNormalizerBorrowed;
    let normalizer = DecomposingNormalizerBorrowed::new_nfkd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Reference NFC via the unicode-normalization crate.
fn un_nfc(s: &str) -> String {
    use unicode_normalization::UnicodeNormalization;
    let mut out = String::with_capacity(s.len());
    out.extend(s.nfc());
    out
}
/// Reference NFD via the unicode-normalization crate.
fn un_nfd(s: &str) -> String {
    use unicode_normalization::UnicodeNormalization;
    let mut out = String::with_capacity(s.len());
    out.extend(s.nfd());
    out
}
/// Reference NFKC via the unicode-normalization crate.
fn un_nfkc(s: &str) -> String {
    use unicode_normalization::UnicodeNormalization;
    let mut out = String::with_capacity(s.len());
    out.extend(s.nfkc());
    out
}
/// Reference NFKD via the unicode-normalization crate.
fn un_nfkd(s: &str) -> String {
    use unicode_normalization::UnicodeNormalization;
    let mut out = String::with_capacity(s.len());
    out.extend(s.nfkd());
    out
}
/// Renders at most `max` leading code points of `s` as space-separated
/// `U+XXXX` tokens, appending ` ... (k more)` when the string is longer.
/// Used by the differential tests to show context around a divergence.
fn codepoint_preview(s: &str, max: usize) -> String {
    let total = s.chars().count();
    let tokens: Vec<String> = s
        .chars()
        .take(max)
        .map(|c| format!("U+{:04X}", c as u32))
        .collect();
    let mut out = tokens.join(" ");
    if total > max {
        let _ = write!(out, " ... ({} more)", total - max);
    }
    out
}
/// Byte offset of the first position where `a` and `b` differ.
///
/// Returns `Some(i)` for the first unequal byte, `Some(min_len)` when one
/// string is a strict prefix of the other, and `None` when they are equal.
/// Note the offset is a raw byte index, not necessarily a char boundary.
fn first_divergence(a: &str, b: &str) -> Option<usize> {
    let (left, right) = (a.as_bytes(), b.as_bytes());
    let limit = left.len().min(right.len());
    for i in 0..limit {
        if left[i] != right[i] {
            return Some(i);
        }
    }
    (left.len() != right.len()).then_some(limit)
}
#[test]
#[allow(clippy::type_complexity)]
fn idempotence() {
    // Normalizing an already-normalized string must be a no-op: for every
    // form F and every corpus text, F(F(t)) == F(t). All mismatches are
    // collected first so a single run reports every failure.
    let forms: &[(&str, fn(&str) -> Cow<'_, str>)] = &[
        ("NFC", our_nfc),
        ("NFD", our_nfd),
        ("NFKC", our_nfkc),
        ("NFKD", our_nfkd),
    ];
    let mut failures = Vec::new();
    for &(name, text) in TEXTS {
        for &(form_name, form_fn) in forms {
            let first_pass = form_fn(text);
            let second_pass = form_fn(&first_pass);
            if *first_pass == *second_pass {
                continue;
            }
            let diverge = first_divergence(&first_pass, &second_pass);
            failures.push(format!(
                " [{name}] {form_name}: idempotence failed \
                 (once_len={}, twice_len={}, first_divergence_byte={:?})",
                first_pass.len(),
                second_pass.len(),
                diverge,
            ));
        }
    }
    assert!(
        failures.is_empty(),
        "Idempotence failures ({} total):\n{}",
        failures.len(),
        failures.join("\n")
    );
}
#[test]
fn round_trip_nfd_nfc() {
    // Decomposing and then recomposing must land on the canonical composed
    // form: NFC(NFD(t)) == t when t is already NFC, and NFC(NFD(t)) ==
    // NFC(t) otherwise. Failures are accumulated so one run reports all.
    let mut failures = Vec::new();
    for &(name, text) in TEXTS {
        let decomposed = our_nfd(text);
        let recovered = our_nfc(&decomposed);
        if our_is_nfc(text) {
            // Already-composed input must survive the round trip unchanged.
            if *recovered != *text {
                let diverge = first_divergence(&recovered, text);
                failures.push(format!(
                    " [{name}] NFC(NFD(text)) != text (text is NFC) \
                     (text_len={}, recovered_len={}, first_divergence_byte={:?})",
                    text.len(),
                    recovered.len(),
                    diverge,
                ));
            }
        } else {
            // Otherwise the round trip must agree with direct NFC.
            let nfc_text = our_nfc(text);
            if *recovered != *nfc_text {
                let diverge = first_divergence(&recovered, &nfc_text);
                failures.push(format!(
                    " [{name}] NFC(NFD(text)) != NFC(text) (text is not NFC) \
                     (nfc_len={}, recovered_len={}, first_divergence_byte={:?})",
                    nfc_text.len(),
                    recovered.len(),
                    diverge,
                ));
            }
        }
    }
    assert!(
        failures.is_empty(),
        "Round-trip NFD->NFC failures ({} total):\n{}",
        failures.len(),
        failures.join("\n")
    );
}
#[test]
#[allow(clippy::type_complexity)]
fn is_normalized_consistency() {
    // Checks that each form's `is_normalized` predicate agrees with its
    // `normalize` function in both directions:
    //   1. if normalize(text) leaves the text unchanged, is_normalized(text)
    //      must return true;
    //   2. is_normalized(normalize(text)) must always return true.
    let mut failures = Vec::new();
    // (form label, normalizing fn, is-normalized predicate) triples.
    let forms: &[(&str, fn(&str) -> Cow<'_, str>, fn(&str) -> bool)] = &[
        ("NFC", our_nfc, our_is_nfc),
        ("NFD", our_nfd, our_is_nfd),
        ("NFKC", our_nfkc, our_is_nfkc),
        ("NFKD", our_nfkd, our_is_nfkd),
    ];
    for &(name, text) in TEXTS {
        for &(form_name, form_fn, is_fn) in forms {
            let normalized = form_fn(text);
            // A `Cow::Borrowed` result means the normalizer made no change;
            // the byte-equality fallback also catches an Owned copy whose
            // contents happen to equal the input.
            let text_is_unchanged = matches!(&normalized, Cow::Borrowed(_)) || *normalized == *text;
            if text_is_unchanged && !is_fn(text) {
                failures.push(format!(
                    " [{name}] {form_name}: text is already normalized \
                     but is_{} returned false (text_len={})",
                    form_name.to_lowercase(),
                    text.len(),
                ));
            }
            // Direction 2: the output of normalize must itself be accepted.
            if !is_fn(&normalized) {
                failures.push(format!(
                    " [{name}] {form_name}: {form_name}(text) is not recognized \
                     as normalized by is_{} (normalized_len={})",
                    form_name.to_lowercase(),
                    normalized.len(),
                ));
            }
        }
    }
    if !failures.is_empty() {
        panic!(
            "is_normalized consistency failures ({} total):\n{}",
            failures.len(),
            failures.join("\n")
        );
    }
}
#[test]
#[allow(clippy::type_complexity)]
fn differential_vs_icu_normalizer() {
    // Differential test: every form applied to every corpus text must match
    // icu_normalizer (ICU4X) byte-for-byte. All mismatches are collected so
    // a single run reports every divergence with code-point context.
    let mut failures = Vec::new();
    let forms: &[(&str, fn(&str) -> Cow<'_, str>, fn(&str) -> String)] = &[
        ("NFC", our_nfc, icu_nfc),
        ("NFD", our_nfd, icu_nfd),
        ("NFKC", our_nfkc, icu_nfkc),
        ("NFKD", our_nfkd, icu_nfkd),
    ];
    for &(name, text) in TEXTS {
        for &(form_name, our_fn, icu_fn) in forms {
            let ours = our_fn(text);
            let reference = icu_fn(text);
            if *ours != *reference {
                let diverge = first_divergence(&ours, &reference);
                let ctx = diverge.map(|d| {
                    // BUGFIX: `d` is a raw byte offset and can fall inside a
                    // multi-byte UTF-8 sequence, so slicing `ours[..d]` would
                    // panic in this reporting path. Count the chars that start
                    // before `d` instead of slicing.
                    let chars_before = ours.char_indices().take_while(|&(i, _)| i < d).count();
                    // Back up two code points for a little leading context.
                    let start = chars_before.saturating_sub(2);
                    let ours_ctx = codepoint_preview(
                        &ours.chars().skip(start).take(10).collect::<String>(),
                        10,
                    );
                    let ref_ctx = codepoint_preview(
                        &reference.chars().skip(start).take(10).collect::<String>(),
                        10,
                    );
                    format!("ours_ctx=[{ours_ctx}] ref_ctx=[{ref_ctx}]")
                });
                failures.push(format!(
                    " [{name}] {form_name}: diverges from icu_normalizer \
                     (ours_len={}, ref_len={}, first_divergence_byte={:?}, {})",
                    ours.len(),
                    reference.len(),
                    diverge,
                    // Unequal strings always have a divergence, so this
                    // fallback is effectively unreachable; kept for safety.
                    ctx.as_deref().unwrap_or("identical length, internal diff"),
                ));
            }
        }
    }
    if !failures.is_empty() {
        panic!(
            "Differential vs icu_normalizer failures ({} total):\n{}",
            failures.len(),
            failures.join("\n")
        );
    }
}
#[test]
#[allow(clippy::type_complexity)]
fn differential_vs_unicode_normalization() {
    // Differential test: every form applied to every corpus text must match
    // the unicode-normalization crate byte-for-byte. All mismatches are
    // collected so a single run reports every divergence with context.
    let mut failures = Vec::new();
    let forms: &[(&str, fn(&str) -> Cow<'_, str>, fn(&str) -> String)] = &[
        ("NFC", our_nfc, un_nfc),
        ("NFD", our_nfd, un_nfd),
        ("NFKC", our_nfkc, un_nfkc),
        ("NFKD", our_nfkd, un_nfkd),
    ];
    for &(name, text) in TEXTS {
        for &(form_name, our_fn, ref_fn) in forms {
            let ours = our_fn(text);
            let reference = ref_fn(text);
            if *ours != *reference {
                let diverge = first_divergence(&ours, &reference);
                let ctx = diverge.map(|d| {
                    // BUGFIX: `d` is a raw byte offset and can fall inside a
                    // multi-byte UTF-8 sequence, so slicing `ours[..d]` would
                    // panic in this reporting path. Count the chars that start
                    // before `d` instead of slicing.
                    let chars_before = ours.char_indices().take_while(|&(i, _)| i < d).count();
                    // Back up two code points for a little leading context.
                    let start = chars_before.saturating_sub(2);
                    let ours_ctx = codepoint_preview(
                        &ours.chars().skip(start).take(10).collect::<String>(),
                        10,
                    );
                    let ref_ctx = codepoint_preview(
                        &reference.chars().skip(start).take(10).collect::<String>(),
                        10,
                    );
                    format!("ours_ctx=[{ours_ctx}] ref_ctx=[{ref_ctx}]")
                });
                failures.push(format!(
                    " [{name}] {form_name}: diverges from unicode-normalization \
                     (ours_len={}, ref_len={}, first_divergence_byte={:?}, {})",
                    ours.len(),
                    reference.len(),
                    diverge,
                    // Unequal strings always have a divergence, so this
                    // fallback is effectively unreachable; kept for safety.
                    ctx.as_deref().unwrap_or("identical length, internal diff"),
                ));
            }
        }
    }
    if !failures.is_empty() {
        panic!(
            "Differential vs unicode-normalization failures ({} total):\n{}",
            failures.len(),
            failures.join("\n")
        );
    }
}