extern crate alloc;
use alloc::borrow::Cow;
use alloc::string::String;
use simd_normalizer::normalizer::{NfcNormalizer, NfdNormalizer, NfkcNormalizer, NfkdNormalizer};
/// Reports whether `cow` is the `Cow::Borrowed` variant.
// Taking `&Cow` is deliberate: the helper inspects the variant itself, so the
// `clippy::ptr_arg` suggestion does not apply here.
#[allow(clippy::ptr_arg)]
fn is_borrowed(cow: &Cow<'_, str>) -> bool {
    match cow {
        Cow::Borrowed(_) => true,
        Cow::Owned(_) => false,
    }
}
/// NFC of the empty string is empty and comes back as `Cow::Borrowed`.
#[test]
fn empty_string_nfc() {
    let normalizer = NfcNormalizer;
    let normalized = normalizer.normalize("");
    assert!(normalized.is_empty());
    assert!(is_borrowed(&normalized));
}
/// NFD of the empty string is empty and comes back as `Cow::Borrowed`.
#[test]
fn empty_string_nfd() {
    let normalizer = NfdNormalizer;
    let normalized = normalizer.normalize("");
    assert!(normalized.is_empty());
    assert!(is_borrowed(&normalized));
}
/// NFKC of the empty string is empty and comes back as `Cow::Borrowed`.
#[test]
fn empty_string_nfkc() {
    let normalizer = NfkcNormalizer;
    let normalized = normalizer.normalize("");
    assert!(normalized.is_empty());
    assert!(is_borrowed(&normalized));
}
/// NFKD of the empty string is empty and comes back as `Cow::Borrowed`.
#[test]
fn empty_string_nfkd() {
    let normalizer = NfkdNormalizer;
    let normalized = normalizer.normalize("");
    assert!(normalized.is_empty());
    assert!(is_borrowed(&normalized));
}
/// A short ASCII string is unchanged by NFC and is returned borrowed.
#[test]
fn ascii_short_nfc_borrowed() {
    const TEXT: &str = "Hello, world!";
    let normalizer = NfcNormalizer;
    let normalized = normalizer.normalize(TEXT);
    assert_eq!(normalized, TEXT);
    assert!(is_borrowed(&normalized));
}
/// A short ASCII string is unchanged by NFD and is returned borrowed.
#[test]
fn ascii_short_nfd_borrowed() {
    const TEXT: &str = "Hello, world!";
    let normalizer = NfdNormalizer;
    let normalized = normalizer.normalize(TEXT);
    assert_eq!(normalized, TEXT);
    assert!(is_borrowed(&normalized));
}
/// 200 ASCII `A`s pass through NFC unchanged and borrowed.
#[test]
fn ascii_long_nfc_borrowed() {
    let normalizer = NfcNormalizer;
    let text = "A".repeat(200);
    let normalized = normalizer.normalize(&text);
    assert_eq!(normalized, text);
    assert!(is_borrowed(&normalized));
}
/// A repeated ASCII sentence passes through NFD unchanged and borrowed.
#[test]
fn ascii_long_nfd_borrowed() {
    let normalizer = NfdNormalizer;
    let text = "The quick brown fox jumps over the lazy dog. ".repeat(5);
    let normalized = normalizer.normalize(&text);
    assert_eq!(normalized, text);
    assert!(is_borrowed(&normalized));
}
/// 200 ASCII letters pass through NFKC unchanged and borrowed.
#[test]
fn ascii_long_nfkc_borrowed() {
    let normalizer = NfkcNormalizer;
    let text = "ABCDEFGHIJ".repeat(20);
    let normalized = normalizer.normalize(&text);
    assert_eq!(normalized, text);
    assert!(is_borrowed(&normalized));
}
/// 200 ASCII digits pass through NFKD unchanged and borrowed.
#[test]
fn ascii_long_nfkd_borrowed() {
    let normalizer = NfkdNormalizer;
    let text = "0123456789".repeat(20);
    let normalized = normalizer.normalize(&text);
    assert_eq!(normalized, text);
    assert!(is_borrowed(&normalized));
}
/// NFD splits U+00E9 into `e` followed by U+0301.
#[test]
fn nfd_e_acute() {
    let normalizer = NfdNormalizer;
    let decomposed = normalizer.normalize("\u{00E9}");
    assert_eq!(decomposed, "e\u{0301}");
}
/// NFD decomposes the trailing U+00E9 of "café" and keeps the ASCII prefix.
#[test]
fn nfd_cafe() {
    let normalizer = NfdNormalizer;
    let decomposed = normalizer.normalize("caf\u{00E9}");
    assert_eq!(decomposed, "cafe\u{0301}");
}
/// NFD splits U+00C0 into `A` followed by U+0300.
#[test]
fn nfd_a_grave() {
    let normalizer = NfdNormalizer;
    let decomposed = normalizer.normalize("\u{00C0}");
    assert_eq!(decomposed, "A\u{0300}");
}
/// NFD splits U+00F6 into `o` followed by U+0308.
#[test]
fn nfd_o_diaeresis() {
    let normalizer = NfdNormalizer;
    let decomposed = normalizer.normalize("\u{00F6}");
    assert_eq!(decomposed, "o\u{0308}");
}
/// NFC combines `e` + U+0301 into U+00E9.
#[test]
fn nfc_compose_e_acute() {
    let normalizer = NfcNormalizer;
    let composed = normalizer.normalize("e\u{0301}");
    assert_eq!(composed, "\u{00E9}");
}
/// NFC leaves an already-composed U+00E9 as it is.
#[test]
fn nfc_already_composed() {
    const COMPOSED: &str = "\u{00E9}";
    let normalizer = NfcNormalizer;
    let output = normalizer.normalize(COMPOSED);
    assert_eq!(output, COMPOSED);
}
/// NFC combines `a` + U+030A into U+00E5.
#[test]
fn nfc_compose_a_ring() {
    let normalizer = NfcNormalizer;
    let composed = normalizer.normalize("a\u{030A}");
    assert_eq!(composed, "\u{00E5}");
}
/// NFD splits the Hangul syllable U+AC00 into the jamo U+1100 U+1161.
#[test]
fn nfd_hangul_lv() {
    let normalizer = NfdNormalizer;
    let jamo = normalizer.normalize("\u{AC00}");
    assert_eq!(jamo, "\u{1100}\u{1161}");
}
/// NFD splits the Hangul syllable U+AC01 into the jamo U+1100 U+1161 U+11A8.
#[test]
fn nfd_hangul_lvt() {
    let normalizer = NfdNormalizer;
    let jamo = normalizer.normalize("\u{AC01}");
    assert_eq!(jamo, "\u{1100}\u{1161}\u{11A8}");
}
/// NFC combines the jamo pair U+1100 U+1161 into the syllable U+AC00.
#[test]
fn nfc_hangul_lv_composition() {
    let normalizer = NfcNormalizer;
    let syllable = normalizer.normalize("\u{1100}\u{1161}");
    assert_eq!(syllable, "\u{AC00}");
}
/// NFC combines the jamo triple U+1100 U+1161 U+11A8 into the syllable U+AC01.
#[test]
fn nfc_hangul_lvt_composition() {
    let normalizer = NfcNormalizer;
    let syllable = normalizer.normalize("\u{1100}\u{1161}\u{11A8}");
    assert_eq!(syllable, "\u{AC01}");
}
/// NFKD expands the ligature U+FB01 to the two letters "fi".
#[test]
fn nfkd_fi_ligature() {
    let normalizer = NfkdNormalizer;
    let expanded = normalizer.normalize("\u{FB01}");
    assert_eq!(expanded, "fi");
}
/// NFKD maps the superscript U+00B2 to the plain digit "2".
#[test]
fn nfkd_superscript_two() {
    let normalizer = NfkdNormalizer;
    let expanded = normalizer.normalize("\u{00B2}");
    assert_eq!(expanded, "2");
}
/// NFKD maps the fullwidth letter U+FF21 to ASCII "A".
#[test]
fn nfkd_fullwidth_a() {
    let normalizer = NfkdNormalizer;
    let expanded = normalizer.normalize("\u{FF21}");
    assert_eq!(expanded, "A");
}
/// NFKC expands the ligature U+FB01 to the two letters "fi".
#[test]
fn nfkc_fi_ligature() {
    let normalizer = NfkcNormalizer;
    let folded = normalizer.normalize("\u{FB01}");
    assert_eq!(folded, "fi");
}
/// NFKC maps the superscript U+00B2 to the plain digit "2".
#[test]
fn nfkc_superscript_two() {
    let normalizer = NfkcNormalizer;
    let folded = normalizer.normalize("\u{00B2}");
    assert_eq!(folded, "2");
}
/// NFKC maps the fullwidth letter U+FF21 to ASCII "A".
#[test]
fn nfkc_fullwidth_a() {
    let normalizer = NfkcNormalizer;
    let folded = normalizer.normalize("\u{FF21}");
    assert_eq!(folded, "A");
}
/// NFC of `o` + U+0308 + U+0327 must be exactly two scalars: the composed
/// U+00F6 followed by the U+0327 mark.
#[test]
fn nfc_ccc_reorder() {
    let normalizer = NfcNormalizer;
    let normalized = normalizer.normalize("o\u{0308}\u{0327}");
    let scalars: Vec<char> = normalized.chars().collect();
    // One comparison covers both the length and each position.
    assert_eq!(scalars, ['\u{00F6}', '\u{0327}']);
}
/// NFD of `o` + U+0308 + U+0327 must reorder the marks so that U+0327 comes
/// before U+0308, giving exactly `o`, U+0327, U+0308.
#[test]
fn nfd_ccc_reorder() {
    let n = NfdNormalizer;
    let input = "o\u{0308}\u{0327}";
    let result = n.normalize(input);
    let chars: Vec<char> = result.chars().collect();
    // Compare the whole sequence: a short result now fails with a readable
    // diff instead of an index panic, and trailing extra scalars are caught.
    assert_eq!(chars, ['o', '\u{0327}', '\u{0308}']);
}
/// 63 ASCII bytes followed by U+00E9 are already NFC, so the output equals the
/// input.
// NOTE(review): the 63-byte prefix presumably makes the two-byte scalar
// straddle a 64-byte processing chunk — confirm against the implementation.
#[test]
fn chunk_boundary_multibyte() {
    let normalizer = NfcNormalizer;
    let mut text = "a".repeat(63);
    text.push('\u{00E9}');
    let normalized = normalizer.normalize(&text);
    assert_eq!(normalized, text);
}
/// 62 ASCII bytes followed by U+00C0: NFD keeps the prefix and decomposes the
/// final scalar into `A` + U+0300.
// NOTE(review): the 62-byte prefix presumably places the two-byte scalar at the
// end of a 64-byte processing chunk — confirm against the implementation.
#[test]
fn chunk_boundary_decompose() {
    let normalizer = NfdNormalizer;
    let filler = "b".repeat(62);
    let text = [filler.as_str(), "\u{00C0}"].concat();
    let expected = [filler.as_str(), "A\u{0300}"].concat();
    let normalized = normalizer.normalize(&text);
    assert_eq!(normalized, expected);
}
/// `a` followed by twenty U+0301 marks: NFC composes the first mark into
/// U+00E1 and must keep the remaining nineteen marks, in order.
#[test]
fn long_combining_sequence() {
    let n = NfcNormalizer;
    let input = format!("a{}", "\u{0301}".repeat(20));
    let expected = format!("\u{00E1}{}", "\u{0301}".repeat(19));
    let result = n.normalize(&input);
    // Check the length first so an empty or short result fails here rather
    // than with an index panic.
    assert_eq!(result.chars().count(), 20);
    // Then pin every scalar, not just the first one.
    assert_eq!(&*result, &*expected);
}
/// `normalize_to` appends to the output buffer and keeps what was already
/// there.
#[test]
fn normalize_to_appends() {
    let normalizer = NfcNormalizer;
    let mut sink = String::new();
    sink.push_str("prefix:");
    normalizer.normalize_to("e\u{0301}", &mut sink);
    assert_eq!(sink, "prefix:\u{00E9}");
}
/// For input that is already NFC, `normalize_to` returns `true` and still
/// writes the text to the output buffer.
#[test]
fn normalize_to_already_normalized() {
    let normalizer = NfcNormalizer;
    let mut sink = String::new();
    assert!(normalizer.normalize_to("Hello", &mut sink));
    assert_eq!(sink, "Hello");
}
/// For input that is not NFC, `normalize_to` returns `false` and writes the
/// composed form to the output buffer.
#[test]
fn normalize_to_not_normalized() {
    let normalizer = NfcNormalizer;
    let mut sink = String::new();
    let already_normalized = normalizer.normalize_to("e\u{0301}", &mut sink);
    assert!(!already_normalized);
    assert_eq!(sink, "\u{00E9}");
}
/// 10 KiB of ASCII passes through NFC unchanged and borrowed.
#[test]
fn large_ascii_input() {
    let normalizer = NfcNormalizer;
    let text = "x".repeat(10240);
    let normalized = normalizer.normalize(&text);
    assert_eq!(normalized, text);
    assert!(is_borrowed(&normalized));
}
/// NFD over 500 repetitions of a chunk containing U+00E9: every repetition
/// must be decomposed, so the output is the decomposed chunk repeated 500
/// times.
#[test]
fn large_mixed_input() {
    let n = NfdNormalizer;
    let chunk = "Hello caf\u{00E9} world! ";
    let decomposed_chunk = "Hello cafe\u{0301} world! ";
    let input = chunk.repeat(500);
    let expected = decomposed_chunk.repeat(500);
    let result = n.normalize(&input);
    assert!(result.contains("cafe\u{0301}"));
    assert!(result.len() >= input.len());
    // The two checks above would still pass if only one chunk had been
    // decomposed, so pin the exact output as well. `assert!` is used instead
    // of `assert_eq!` to avoid dumping two ~10 KB strings on failure.
    assert_eq!(result.len(), expected.len(), "unexpected NFD output length");
    assert!(
        *result == *expected,
        "NFD output differs from the chunk-wise expected decomposition"
    );
}
/// NFD decomposes two adjacent precomposed letters independently.
#[test]
fn nfd_multiple_precomposed() {
    let normalizer = NfdNormalizer;
    let decomposed = normalizer.normalize("\u{00E9}\u{00F6}");
    assert_eq!(decomposed, "e\u{0301}o\u{0308}");
}
/// NFC composes two adjacent base+mark pairs independently.
#[test]
fn nfc_multiple_decomposed() {
    let normalizer = NfcNormalizer;
    let composed = normalizer.normalize("e\u{0301}o\u{0308}");
    assert_eq!(composed, "\u{00E9}\u{00F6}");
}
/// A string mixing ASCII, U+00E9, two CJK ideographs and U+1F600 is unchanged
/// by NFC.
#[test]
fn mixed_scripts_nfc() {
    const TEXT: &str = "Hello \u{00E9} \u{4E16}\u{754C} \u{1F600}";
    let normalizer = NfcNormalizer;
    let normalized = normalizer.normalize(TEXT);
    assert_eq!(normalized, TEXT);
}
/// In a mixed-script string NFD decomposes only U+00E9; the CJK ideographs and
/// U+1F600 are left as they are.
#[test]
fn mixed_scripts_nfd() {
    let normalizer = NfdNormalizer;
    let normalized = normalizer.normalize("Hello \u{00E9} \u{4E16}\u{754C} \u{1F600}");
    let expected = "Hello e\u{0301} \u{4E16}\u{754C} \u{1F600}";
    assert_eq!(normalized, expected);
}
/// Plain ASCII is reported as already NFC.
#[test]
fn is_normalized_nfc_ascii() {
    assert!(NfcNormalizer.is_normalized("Hello"));
}
/// A precomposed U+00E9 is reported as already NFC.
#[test]
fn is_normalized_nfc_precomposed() {
    assert!(NfcNormalizer.is_normalized("\u{00E9}"));
}
/// The decomposed sequence `e` + U+0301 is reported as not NFC.
#[test]
fn is_normalized_nfc_rejects_nfd() {
    let verdict = NfcNormalizer.is_normalized("e\u{0301}");
    assert!(!verdict);
}
/// The decomposed sequence `e` + U+0301 is reported as already NFD.
#[test]
fn is_normalized_nfd_decomposed() {
    assert!(NfdNormalizer.is_normalized("e\u{0301}"));
}
/// A precomposed U+00E9 is reported as not NFD.
#[test]
fn is_normalized_nfd_rejects_nfc() {
    let verdict = NfdNormalizer.is_normalized("\u{00E9}");
    assert!(!verdict);
}
/// For each sample, composing the NFD form must give the same string as
/// composing the sample directly: NFC(NFD(x)) == NFC(x).
#[test]
fn roundtrip_nfc_nfd() {
    const SAMPLES: [&str; 7] = [
        "Hello",
        "\u{00E9}",
        "caf\u{00E9}",
        "\u{AC00}",
        "\u{AC01}",
        "e\u{0301}o\u{0308}",
        "\u{1100}\u{1161}\u{11A8}",
    ];
    let composer = NfcNormalizer;
    let decomposer = NfdNormalizer;

    for sample in SAMPLES.iter().copied() {
        let decomposed = decomposer.normalize(sample);
        let direct = composer.normalize(sample);
        let via_nfd = composer.normalize(&decomposed);
        assert_eq!(
            &*direct, &*via_nfd,
            "NFC(NFD(x)) != NFC(x) for input: {:?}",
            sample
        );
    }
}
/// A single ASCII letter is unchanged by NFC and is returned borrowed.
#[test]
fn single_ascii_char() {
    let normalizer = NfcNormalizer;
    let normalized = normalizer.normalize("A");
    assert!(is_borrowed(&normalized));
    assert_eq!(normalized, "A");
}
/// A lone combining mark with no base character is unchanged by NFC.
#[test]
fn single_combining_mark() {
    const MARK: &str = "\u{0301}";
    let normalizer = NfcNormalizer;
    let normalized = normalizer.normalize(MARK);
    assert_eq!(normalized, MARK);
}
/// A Hangul syllable after 100 ASCII bytes is still decomposed by NFD, and the
/// ASCII prefix is preserved.
#[test]
fn hangul_in_long_string_nfd() {
    let normalizer = NfdNormalizer;
    let filler = "a".repeat(100);
    let text = [filler.as_str(), "\u{AC00}"].concat();
    let expected = [filler.as_str(), "\u{1100}\u{1161}"].concat();
    let normalized = normalizer.normalize(&text);
    assert_eq!(normalized, expected);
}
/// A jamo pair after 100 ASCII bytes is still composed by NFC, and the ASCII
/// prefix is preserved.
#[test]
fn hangul_in_long_string_nfc() {
    let normalizer = NfcNormalizer;
    let filler = "a".repeat(100);
    let text = [filler.as_str(), "\u{1100}\u{1161}"].concat();
    let expected = [filler.as_str(), "\u{AC00}"].concat();
    let normalized = normalizer.normalize(&text);
    assert_eq!(normalized, expected);
}
/// Renders every scalar value of `s` as `U+XXXX` (upper-case hex, at least
/// four digits), separated by single spaces. Returns an empty string for
/// empty input.
fn codepoints_debug(s: &str) -> String {
    let mut rendered: Vec<String> = Vec::new();
    for scalar in s.chars() {
        rendered.push(format!("U+{:04X}", u32::from(scalar)));
    }
    rendered.join(" ")
}
/// Reference NFC: normalizes `s` with `icu_normalizer` and returns an owned
/// `String`.
fn icu_nfc(s: &str) -> String {
    let reference = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
    reference.normalize(s).into_owned()
}
/// Reference NFD: normalizes `s` with `icu_normalizer` and returns an owned
/// `String`.
fn icu_nfd(s: &str) -> String {
    let reference = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
    reference.normalize(s).into_owned()
}
/// Reference NFKC: normalizes `s` with `icu_normalizer` and returns an owned
/// `String`.
fn icu_nfkc(s: &str) -> String {
    let reference = icu_normalizer::ComposingNormalizerBorrowed::new_nfkc();
    reference.normalize(s).into_owned()
}
/// Reference NFKD: normalizes `s` with `icu_normalizer` and returns an owned
/// `String`.
fn icu_nfkd(s: &str) -> String {
    let reference = icu_normalizer::DecomposingNormalizerBorrowed::new_nfkd();
    reference.normalize(s).into_owned()
}
/// Asserts that `ours` equals `reference`.
///
/// On mismatch the panic message names the normalization `form` and reports
/// the character counts and the first ten code points of `input`, `ours` and
/// `reference`.
fn assert_matches_icu(form: &str, input: &str, ours: &str, reference: &str) {
    // Number of scalar values (not bytes) in `text`.
    let char_len = |text: &str| -> usize { text.chars().count() };
    // First ten scalar values of `text` as space-separated `U+XXXX`.
    let head_cps = |text: &str| -> String {
        text.chars()
            .take(10)
            .map(|c| format!("U+{:04X}", c as u32))
            .collect::<Vec<_>>()
            .join(" ")
    };

    // The message arguments are only evaluated when the comparison fails.
    assert_eq!(
        ours,
        reference,
        "\n{form} divergence from icu_normalizer!\
         \n input len: {ilen} chars\
         \n ours len: {olen} chars\
         \n ref len: {rlen} chars\
         \n first 10 input cps: {input_cps}\
         \n first 10 ours cps: {ours_cps}\
         \n first 10 ref cps: {ref_cps}",
        form = form,
        ilen = char_len(input),
        olen = char_len(ours),
        rlen = char_len(reference),
        input_cps = head_cps(input),
        ours_cps = head_cps(ours),
        ref_cps = head_cps(reference),
    );
}
/// `a` followed by 33 combining marks: our NFC and NFD output must match
/// `icu_normalizer`.
// NOTE(review): per the test name, 33 marks presumably exceeds a fast-path
// limit of 32 in the implementation — confirm.
#[test]
fn nfc_33_combining_marks_fallback_path() {
    let marks: &[char] = &[
        '\u{0327}', '\u{0328}', '\u{0323}', '\u{0330}', '\u{0331}', '\u{0332}',
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0306}',
        '\u{0307}', '\u{0308}', '\u{0309}', '\u{030A}', '\u{030B}', '\u{030C}',
        '\u{0345}',
        '\u{0327}', '\u{0328}', '\u{0323}', '\u{0330}', '\u{0331}', '\u{0332}',
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0306}',
        '\u{0307}', '\u{0308}',
    ];
    assert_eq!(marks.len(), 33, "must have exactly 33 combining marks");

    let mut input = String::new();
    input.push('a');
    input.extend(marks);

    let composer = NfcNormalizer;
    let ours_nfc = composer.normalize(&input);
    let reference_nfc = icu_nfc(&input);
    assert_matches_icu("NFC-33marks", &input, &ours_nfc, &reference_nfc);

    let decomposer = NfdNormalizer;
    let ours_nfd = decomposer.normalize(&input);
    let reference_nfd = icu_nfd(&input);
    assert_matches_icu("NFD-33marks", &input, &ours_nfd, &reference_nfd);
}
/// `a` followed by 64 marks drawn cyclically from a 16-mark table: our NFC and
/// NFD output must match `icu_normalizer`.
#[test]
fn nfc_64_combining_marks_deep_fallback() {
    let mark_cycle: &[char] = &[
        '\u{0327}', '\u{0323}', '\u{0300}', '\u{0345}', '\u{0328}', '\u{0330}',
        '\u{0301}', '\u{0331}', '\u{0302}', '\u{0332}', '\u{0303}', '\u{0304}',
        '\u{0306}', '\u{0307}', '\u{0308}', '\u{0309}',
    ];

    let mut input = String::from("a");
    input.extend(mark_cycle.iter().cycle().take(64));
    assert_eq!(input.chars().count(), 65, "1 starter + 64 marks");

    let composer = NfcNormalizer;
    let ours_nfc = composer.normalize(&input);
    let reference_nfc = icu_nfc(&input);
    assert_matches_icu("NFC-64marks", &input, &ours_nfc, &reference_nfc);

    let decomposer = NfdNormalizer;
    let ours_nfd = decomposer.normalize(&input);
    let reference_nfd = icu_nfd(&input);
    assert_matches_icu("NFD-64marks", &input, &ours_nfd, &reference_nfd);
}
/// `e` followed by 100 marks drawn cyclically from a 19-mark table: our NFC
/// and NFD output must match `icu_normalizer`.
#[test]
fn nfc_100_combining_marks_extreme_fallback() {
    let mark_cycle: &[char] = &[
        '\u{0327}', '\u{0328}', '\u{0323}', '\u{0330}', '\u{0331}', '\u{0332}',
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0306}',
        '\u{0307}', '\u{0308}', '\u{0309}', '\u{030A}', '\u{030B}', '\u{030C}',
        '\u{0345}',
    ];

    let mut input = String::from("e");
    input.extend(mark_cycle.iter().cycle().take(100));
    assert_eq!(input.chars().count(), 101, "1 starter + 100 marks");

    let composer = NfcNormalizer;
    let ours_nfc = composer.normalize(&input);
    let reference_nfc = icu_nfc(&input);
    assert_matches_icu("NFC-100marks", &input, &ours_nfc, &reference_nfc);

    let decomposer = NfdNormalizer;
    let ours_nfd = decomposer.normalize(&input);
    let reference_nfd = icu_nfd(&input);
    assert_matches_icu("NFD-100marks", &input, &ours_nfd, &reference_nfd);
}
/// Three combining marks with no base character: NFC output must match
/// `icu_normalizer` and must not be empty.
#[test]
fn only_combining_marks_no_starter_nfc() {
    const MARKS_ONLY: &str = "\u{0300}\u{0301}\u{0327}";
    let normalizer = NfcNormalizer;
    let ours = normalizer.normalize(MARKS_ONLY);
    let reference = icu_nfc(MARKS_ONLY);
    assert_matches_icu("NFC-no-starter", MARKS_ONLY, &ours, &reference);
    assert!(
        !ours.is_empty(),
        "output should not be empty for combining-only input"
    );
}
/// Three combining marks with no base character: NFD output must match
/// `icu_normalizer` and must not be empty.
#[test]
fn only_combining_marks_no_starter_nfd() {
    const MARKS_ONLY: &str = "\u{0300}\u{0301}\u{0327}";
    let normalizer = NfdNormalizer;
    let ours = normalizer.normalize(MARKS_ONLY);
    let reference = icu_nfd(MARKS_ONLY);
    assert_matches_icu("NFD-no-starter", MARKS_ONLY, &ours, &reference);
    assert!(!ours.is_empty());
}
/// Three combining marks with no base character: NFKC output must match
/// `icu_normalizer` and must not be empty.
#[test]
fn only_combining_marks_no_starter_nfkc() {
    const MARKS_ONLY: &str = "\u{0300}\u{0301}\u{0327}";
    let normalizer = NfkcNormalizer;
    let ours = normalizer.normalize(MARKS_ONLY);
    let reference = icu_nfkc(MARKS_ONLY);
    assert_matches_icu("NFKC-no-starter", MARKS_ONLY, &ours, &reference);
    assert!(!ours.is_empty());
}
/// Three combining marks with no base character: NFKD output must match
/// `icu_normalizer` and must not be empty.
#[test]
fn only_combining_marks_no_starter_nfkd() {
    const MARKS_ONLY: &str = "\u{0300}\u{0301}\u{0327}";
    let normalizer = NfkdNormalizer;
    let ours = normalizer.normalize(MARKS_ONLY);
    let reference = icu_nfkd(MARKS_ONLY);
    assert_matches_icu("NFKD-no-starter", MARKS_ONLY, &ours, &reference);
    assert!(!ours.is_empty());
}
/// Ten combining marks with no base character: our NFC and NFD output must
/// match `icu_normalizer`.
#[test]
fn only_combining_marks_longer_sequence() {
    let marks: &[char] = &[
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0327}', '\u{0328}',
        '\u{0323}', '\u{0330}', '\u{0345}',
    ];
    let input: String = marks.iter().collect();

    let composer = NfcNormalizer;
    let ours_nfc = composer.normalize(&input);
    let reference_nfc = icu_nfc(&input);
    assert_matches_icu("NFC-no-starter-long", &input, &ours_nfc, &reference_nfc);

    let decomposer = NfdNormalizer;
    let ours_nfd = decomposer.normalize(&input);
    let reference_nfd = icu_nfd(&input);
    assert_matches_icu("NFD-no-starter-long", &input, &ours_nfd, &reference_nfd);
}
/// `a` followed by 33 marks: our NFC and NFD output must match
/// `icu_normalizer`.
///
/// It additionally checks that NFC yields fewer scalars than the input, and
/// that NFD keeps `a` first with the following marks in non-decreasing order
/// of `unicode_ccc_approximate`.
#[test]
fn nfc_33_marks_mixed_composable_non_composable() {
    let marks: &[char] = &[
        '\u{0327}', '\u{0328}', '\u{0323}', '\u{0330}', '\u{0331}', '\u{0332}',
        '\u{0301}', '\u{0300}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0306}',
        '\u{0307}', '\u{0308}', '\u{0309}', '\u{030A}', '\u{030B}', '\u{030C}',
        '\u{0345}',
        '\u{0327}', '\u{0328}', '\u{0323}', '\u{0330}', '\u{0331}',
        '\u{0300}', '\u{0301}', '\u{0302}', '\u{0303}', '\u{0304}', '\u{0306}',
        '\u{0307}', '\u{0308}',
        '\u{0345}',
    ];
    assert_eq!(marks.len(), 33, "must have exactly 33 combining marks");

    let mut input = String::new();
    input.push('a');
    input.extend(marks);
    let input_len = input.chars().count();

    // NFC: agree with the reference and shrink the scalar count.
    let composer = NfcNormalizer;
    let ours_nfc = composer.normalize(&input);
    let reference_nfc = icu_nfc(&input);
    assert_matches_icu("NFC-33marks-mixed", &input, &ours_nfc, &reference_nfc);
    let nfc_len = ours_nfc.chars().count();
    assert!(
        nfc_len < input_len,
        "NFC should compose at least one mark with the starter, \
         reducing char count. Got {} chars from {} input chars.\
         \n output cps: {}",
        nfc_len,
        input_len,
        codepoints_debug(&ours_nfc),
    );

    // NFD: agree with the reference, keep the starter, and order the marks.
    let decomposer = NfdNormalizer;
    let ours_nfd = decomposer.normalize(&input);
    let reference_nfd = icu_nfd(&input);
    assert_matches_icu("NFD-33marks-mixed", &input, &ours_nfd, &reference_nfd);
    let nfd_scalars: Vec<char> = ours_nfd.chars().collect();
    assert_eq!(nfd_scalars[0], 'a', "NFD: starter should remain 'a'");

    let mut previous = 0u8;
    for mark in nfd_scalars.iter().skip(1).copied() {
        let current = unicode_ccc_approximate(mark);
        assert!(
            current >= previous,
            "NFD CCC ordering violation: U+{:04X} (CCC={}) after CCC={}",
            mark as u32,
            current,
            previous,
        );
        previous = current;
    }
}
/// Hard-coded combining-class lookup for the marks used in this file.
///
/// Returns 202, 220, 230 or 240 for the listed marks and 0 for every other
/// character; the 0 fallback is a default, not necessarily that character's
/// real class.
fn unicode_ccc_approximate(ch: char) -> u8 {
    match ch {
        '\u{0327}'..='\u{0328}' => 202,
        '\u{0323}' | '\u{0330}'..='\u{0332}' => 220,
        // U+0305 is intentionally absent from the table, hence two ranges.
        '\u{0300}'..='\u{0304}' | '\u{0306}'..='\u{030C}' => 230,
        '\u{0345}' => 240,
        _ => 0,
    }
}
/// `o` followed by 50 marks drawn cyclically from an 8-mark table.
///
/// Checks NFC(NFC(x)) == NFC(x), NFD(NFD(x)) == NFD(x), and
/// NFC(NFD(x)) == NFC(x).
#[test]
fn long_combining_roundtrip_idempotency() {
    let mark_cycle: &[char] = &[
        '\u{0327}', '\u{0323}', '\u{0300}', '\u{0345}', '\u{0328}', '\u{0330}', '\u{0301}',
        '\u{0331}',
    ];
    let mut input = String::from("o");
    input.extend(mark_cycle.iter().cycle().take(50));

    let composer = NfcNormalizer;
    let decomposer = NfdNormalizer;

    // NFC applied twice must equal NFC applied once.
    let composed: String = composer.normalize(&input).into_owned();
    let recomposed: String = composer.normalize(&composed).into_owned();
    assert_eq!(
        composed,
        recomposed,
        "NFC is not idempotent for 50-mark input!\
         \n NFC(x) cps: {}\
         \n NFC(NFC(x)) cps: {}",
        codepoints_debug(&composed),
        codepoints_debug(&recomposed),
    );

    // NFD applied twice must equal NFD applied once.
    let decomposed: String = decomposer.normalize(&input).into_owned();
    let redecomposed: String = decomposer.normalize(&decomposed).into_owned();
    assert_eq!(
        decomposed,
        redecomposed,
        "NFD is not idempotent for 50-mark input!\
         \n NFD(x) cps: {}\
         \n NFD(NFD(x)) cps: {}",
        codepoints_debug(&decomposed),
        codepoints_debug(&redecomposed),
    );

    // Composing the decomposed form must give the direct NFC result.
    let composed_from_nfd: String = composer.normalize(&decomposed).into_owned();
    assert_eq!(
        composed,
        composed_from_nfd,
        "NFC(NFD(x)) != NFC(x) for 50-mark input!\
         \n NFC(x) cps: {}\
         \n NFC(NFD(x)) cps: {}",
        codepoints_debug(&composed),
        codepoints_debug(&composed_from_nfd),
    );
}