use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
use simd_normalizer::UnicodeNormalization;
/// Normalizes `input` to NFC with `simd_normalizer` and verifies the result
/// twice: against the hand-written `expected` string, and against the NFC
/// output of `icu_normalizer` for the same input.
///
/// Marked `#[track_caller]` so that a failing assertion is reported at the
/// line of the calling test instead of inside this shared helper.
///
/// # Panics
///
/// Panics if the SIMD result differs from `expected` or from the ICU result.
#[track_caller]
fn assert_nfc(input: &str, expected: &str) {
    let simd = simd_normalizer::nfc().normalize(input);
    let icu = ComposingNormalizerBorrowed::new_nfc().normalize(input);

    // The literal expectation is checked first so a wrong fixture is
    // distinguishable from a disagreement between the two implementations.
    assert_eq!(
        &*simd, expected,
        "NFC mismatch for input {:?}: got {:?}, expected {:?}",
        input, simd, expected
    );
    assert_eq!(
        &*simd, &*icu,
        "NFC cross-validation for {:?}: simd={:?}, icu={:?}",
        input, simd, icu
    );
}
/// Normalizes `input` to NFD with `simd_normalizer` and verifies the result
/// twice: against the hand-written `expected` string, and against the NFD
/// output of `icu_normalizer` for the same input.
///
/// Marked `#[track_caller]` so that a failing assertion is reported at the
/// line of the calling test instead of inside this shared helper.
///
/// # Panics
///
/// Panics if the SIMD result differs from `expected` or from the ICU result.
#[track_caller]
fn assert_nfd(input: &str, expected: &str) {
    let simd = simd_normalizer::nfd().normalize(input);
    let icu = DecomposingNormalizerBorrowed::new_nfd().normalize(input);

    // The literal expectation is checked first so a wrong fixture is
    // distinguishable from a disagreement between the two implementations.
    assert_eq!(
        &*simd, expected,
        "NFD mismatch for input {:?}: got {:?}, expected {:?}",
        input, simd, expected
    );
    assert_eq!(
        &*simd, &*icu,
        "NFD cross-validation for {:?}: simd={:?}, icu={:?}",
        input, simd, icu
    );
}
/// Normalizes `input` to NFKC with `simd_normalizer` and verifies the result
/// twice: against the hand-written `expected` string, and against the NFKC
/// output of `icu_normalizer` for the same input.
///
/// Marked `#[track_caller]` so that a failing assertion is reported at the
/// line of the calling test instead of inside this shared helper.
///
/// # Panics
///
/// Panics if the SIMD result differs from `expected` or from the ICU result.
#[track_caller]
fn assert_nfkc(input: &str, expected: &str) {
    let simd = simd_normalizer::nfkc().normalize(input);
    let icu = ComposingNormalizerBorrowed::new_nfkc().normalize(input);

    // The literal expectation is checked first so a wrong fixture is
    // distinguishable from a disagreement between the two implementations.
    assert_eq!(
        &*simd, expected,
        "NFKC mismatch for input {:?}: got {:?}, expected {:?}",
        input, simd, expected
    );
    assert_eq!(
        &*simd, &*icu,
        "NFKC cross-validation for {:?}: simd={:?}, icu={:?}",
        input, simd, icu
    );
}
/// Normalizes `input` to NFKD with `simd_normalizer` and verifies the result
/// twice: against the hand-written `expected` string, and against the NFKD
/// output of `icu_normalizer` for the same input.
///
/// Marked `#[track_caller]` so that a failing assertion is reported at the
/// line of the calling test instead of inside this shared helper.
///
/// # Panics
///
/// Panics if the SIMD result differs from `expected` or from the ICU result.
#[track_caller]
fn assert_nfkd(input: &str, expected: &str) {
    let simd = simd_normalizer::nfkd().normalize(input);
    let icu = DecomposingNormalizerBorrowed::new_nfkd().normalize(input);

    // The literal expectation is checked first so a wrong fixture is
    // distinguishable from a disagreement between the two implementations.
    assert_eq!(
        &*simd, expected,
        "NFKD mismatch for input {:?}: got {:?}, expected {:?}",
        input, simd, expected
    );
    assert_eq!(
        &*simd, &*icu,
        "NFKD cross-validation for {:?}: simd={:?}, icu={:?}",
        input, simd, icu
    );
}
/// Asserts the "is already normalized" answer for `input` under all four
/// normalization forms.
///
/// The `nfc`, `nfd`, `nfkc` and `nfkd` flags are the expected answers. Each is
/// compared first with the corresponding `is_*` method called on `input`, and
/// then with `icu_normalizer`'s `is_normalized` for the same form, so the
/// fixture itself is validated against ICU as well.
///
/// Marked `#[track_caller]` so that a failing assertion is reported at the
/// line of the calling test instead of inside this shared helper.
///
/// # Panics
///
/// Panics on the first predicate whose result differs from its expected flag.
#[track_caller]
fn assert_is_normalized_all(input: &str, nfc: bool, nfd: bool, nfkc: bool, nfkd: bool) {
    // Predicates invoked directly on the string slice.
    assert_eq!(
        input.is_nfc(),
        nfc,
        "is_nfc mismatch for {:?}: expected {}",
        input,
        nfc
    );
    assert_eq!(
        input.is_nfd(),
        nfd,
        "is_nfd mismatch for {:?}: expected {}",
        input,
        nfd
    );
    assert_eq!(
        input.is_nfkc(),
        nfkc,
        "is_nfkc mismatch for {:?}: expected {}",
        input,
        nfkc
    );
    assert_eq!(
        input.is_nfkd(),
        nfkd,
        "is_nfkd mismatch for {:?}: expected {}",
        input,
        nfkd
    );

    // The same expectations, checked against the ICU normalizers.
    assert_eq!(
        ComposingNormalizerBorrowed::new_nfc().is_normalized(input),
        nfc,
        "ICU is_nfc mismatch for {:?}",
        input
    );
    assert_eq!(
        DecomposingNormalizerBorrowed::new_nfd().is_normalized(input),
        nfd,
        "ICU is_nfd mismatch for {:?}",
        input
    );
    assert_eq!(
        ComposingNormalizerBorrowed::new_nfkc().is_normalized(input),
        nfkc,
        "ICU is_nfkc mismatch for {:?}",
        input
    );
    assert_eq!(
        DecomposingNormalizerBorrowed::new_nfkd().is_normalized(input),
        nfkd,
        "ICU is_nfkd mismatch for {:?}",
        input
    );
}
/// NFC must replace U+2126 OHM SIGN with U+03A9 GREEK CAPITAL LETTER OMEGA.
#[test]
fn ohm_sign_decomposes_to_omega_nfc() {
    let (ohm_sign, omega) = ("\u{2126}", "\u{03A9}");
    assert_nfc(ohm_sign, omega);
}
/// NFD must replace U+2126 OHM SIGN with U+03A9 GREEK CAPITAL LETTER OMEGA.
#[test]
fn ohm_sign_decomposes_to_omega_nfd() {
    let (ohm_sign, omega) = ("\u{2126}", "\u{03A9}");
    assert_nfd(ohm_sign, omega);
}
/// NFKC must replace U+2126 OHM SIGN with U+03A9 GREEK CAPITAL LETTER OMEGA.
#[test]
fn ohm_sign_decomposes_to_omega_nfkc() {
    let (ohm_sign, omega) = ("\u{2126}", "\u{03A9}");
    assert_nfkc(ohm_sign, omega);
}
/// NFKD must replace U+2126 OHM SIGN with U+03A9 GREEK CAPITAL LETTER OMEGA.
#[test]
fn ohm_sign_decomposes_to_omega_nfkd() {
    let (ohm_sign, omega) = ("\u{2126}", "\u{03A9}");
    assert_nfkd(ohm_sign, omega);
}
/// U+2126 OHM SIGN must be reported as unnormalized under every form.
#[test]
fn ohm_sign_is_not_normalized() {
    let ohm_sign = "\u{2126}";
    let (nfc, nfd, nfkc, nfkd) = (false, false, false, false);
    assert_is_normalized_all(ohm_sign, nfc, nfd, nfkc, nfkd);
}
/// U+03A9 is already in every normalization form: NFC and NFD leave it
/// untouched, so it never turns back into U+2126.
#[test]
fn omega_cannot_compose_back_to_ohm() {
    const OMEGA: &str = "\u{03A9}";

    assert_nfc(OMEGA, OMEGA);
    assert_nfd(OMEGA, OMEGA);
    assert_is_normalized_all(OMEGA, true, true, true, true);
}
/// NFC must replace U+212A KELVIN SIGN with ASCII `K`.
#[test]
fn kelvin_sign_decomposes_to_k_nfc() {
    let (kelvin_sign, ascii_k) = ("\u{212A}", "K");
    assert_nfc(kelvin_sign, ascii_k);
}
/// NFD must replace U+212A KELVIN SIGN with ASCII `K`.
#[test]
fn kelvin_sign_decomposes_to_k_nfd() {
    let (kelvin_sign, ascii_k) = ("\u{212A}", "K");
    assert_nfd(kelvin_sign, ascii_k);
}
/// NFKC must replace U+212A KELVIN SIGN with ASCII `K`.
#[test]
fn kelvin_sign_decomposes_to_k_nfkc() {
    let (kelvin_sign, ascii_k) = ("\u{212A}", "K");
    assert_nfkc(kelvin_sign, ascii_k);
}
/// NFKD must replace U+212A KELVIN SIGN with ASCII `K`.
#[test]
fn kelvin_sign_decomposes_to_k_nfkd() {
    let (kelvin_sign, ascii_k) = ("\u{212A}", "K");
    assert_nfkd(kelvin_sign, ascii_k);
}
/// U+212A KELVIN SIGN must be reported as unnormalized under every form.
#[test]
fn kelvin_sign_is_not_normalized() {
    let kelvin_sign = "\u{212A}";
    let (nfc, nfd, nfkc, nfkd) = (false, false, false, false);
    assert_is_normalized_all(kelvin_sign, nfc, nfd, nfkc, nfkd);
}
/// ASCII `K` is already in every normalization form: NFC and NFD leave it
/// untouched, so it never becomes U+212A.
#[test]
fn ascii_k_cannot_compose_to_kelvin() {
    const ASCII_K: &str = "K";

    assert_nfc(ASCII_K, ASCII_K);
    assert_nfd(ASCII_K, ASCII_K);
    assert_is_normalized_all(ASCII_K, true, true, true, true);
}
/// NFC must turn U+212B ANGSTROM SIGN into the precomposed U+00C5.
#[test]
fn angstrom_decomposes_to_a_ring_nfc() {
    let angstrom_sign = "\u{212B}";
    let precomposed_a_ring = "\u{00C5}";
    assert_nfc(angstrom_sign, precomposed_a_ring);
}
/// NFD must turn U+212B ANGSTROM SIGN into `A` followed by U+030A.
#[test]
fn angstrom_decomposes_to_a_ring_nfd() {
    let angstrom_sign = "\u{212B}";
    let decomposed_a_ring = "A\u{030A}";
    assert_nfd(angstrom_sign, decomposed_a_ring);
}
/// NFKC must turn U+212B ANGSTROM SIGN into the precomposed U+00C5.
#[test]
fn angstrom_decomposes_to_a_ring_nfkc() {
    let angstrom_sign = "\u{212B}";
    let precomposed_a_ring = "\u{00C5}";
    assert_nfkc(angstrom_sign, precomposed_a_ring);
}
/// NFKD must turn U+212B ANGSTROM SIGN into `A` followed by U+030A.
#[test]
fn angstrom_decomposes_to_a_ring_nfkd() {
    let angstrom_sign = "\u{212B}";
    let decomposed_a_ring = "A\u{030A}";
    assert_nfkd(angstrom_sign, decomposed_a_ring);
}
/// U+212B ANGSTROM SIGN must be reported as unnormalized under every form.
#[test]
fn angstrom_is_not_normalized() {
    let angstrom_sign = "\u{212B}";
    let (nfc, nfd, nfkc, nfkd) = (false, false, false, false);
    assert_is_normalized_all(angstrom_sign, nfc, nfd, nfkc, nfkd);
}
/// U+00C5 stays itself under NFC and decomposes to `A` + U+030A under NFD;
/// it is expected to be normalized for the composed forms only.
#[test]
fn a_ring_cannot_compose_to_angstrom() {
    const A_RING: &str = "\u{00C5}";
    const A_THEN_RING: &str = "A\u{030A}";

    assert_nfc(A_RING, A_RING);
    assert_nfd(A_RING, A_THEN_RING);

    // Composed forms (NFC/NFKC) accept it; decomposed forms (NFD/NFKD) do not.
    let (composed, decomposed) = (true, false);
    assert_is_normalized_all(A_RING, composed, decomposed, composed, decomposed);
}
/// The three letterlike signs (Kelvin, Ohm, Angstrom) embedded in ordinary
/// ASCII text are replaced in place under all four forms.
#[test]
fn singleton_exclusions_in_text() {
    const INPUT: &str = "Temp=300\u{212A}, R=5\u{2126}, d=3\u{212B}";
    const COMPOSED: &str = "Temp=300K, R=5\u{03A9}, d=3\u{00C5}";
    const DECOMPOSED: &str = "Temp=300K, R=5\u{03A9}, d=3A\u{030A}";

    assert_nfc(INPUT, COMPOSED);
    assert_nfd(INPUT, DECOMPOSED);
    // The compatibility forms are expected to match their canonical counterparts here.
    assert_nfkc(INPUT, COMPOSED);
    assert_nfkd(INPUT, DECOMPOSED);
}
/// Under NFC, `a` + U+0340 COMBINING GRAVE TONE MARK composes to U+00E0.
#[test]
fn combining_grave_tone_mark_nfc() {
    let a_with_tone_mark = "a\u{0340}";
    let a_grave = "\u{00E0}";
    assert_nfc(a_with_tone_mark, a_grave);
}
/// Under NFD, `a` + U+0340 COMBINING GRAVE TONE MARK becomes `a` + U+0300.
#[test]
fn combining_grave_tone_mark_nfd() {
    let a_with_tone_mark = "a\u{0340}";
    let a_then_grave = "a\u{0300}";
    assert_nfd(a_with_tone_mark, a_then_grave);
}
/// Under NFKC, `a` + U+0340 COMBINING GRAVE TONE MARK composes to U+00E0.
#[test]
fn combining_grave_tone_mark_nfkc() {
    let a_with_tone_mark = "a\u{0340}";
    let a_grave = "\u{00E0}";
    assert_nfkc(a_with_tone_mark, a_grave);
}
/// Under NFKD, `a` + U+0340 COMBINING GRAVE TONE MARK becomes `a` + U+0300.
#[test]
fn combining_grave_tone_mark_nfkd() {
    let a_with_tone_mark = "a\u{0340}";
    let a_then_grave = "a\u{0300}";
    assert_nfkd(a_with_tone_mark, a_then_grave);
}
/// A lone U+0340 must be reported as unnormalized under every form.
#[test]
fn combining_grave_tone_mark_is_not_normalized() {
    let grave_tone_mark = "\u{0340}";
    let (nfc, nfd, nfkc, nfkd) = (false, false, false, false);
    assert_is_normalized_all(grave_tone_mark, nfc, nfd, nfkc, nfkd);
}
/// Normalizing `a` + U+0300 must never introduce U+0340 COMBINING GRAVE TONE
/// MARK into the output.
///
/// The absence check is performed on the NFC output as well as the NFD
/// output: composition is the step that could wrongly produce an excluded
/// code point, so checking only the decomposed form would not exercise it.
#[test]
fn combining_grave_tone_mark_cannot_be_composition_target() {
    let grave = "a\u{0300}";
    assert_nfc(grave, "\u{00E0}");
    assert_nfd(grave, "a\u{0300}");

    let nfc_result = simd_normalizer::nfc().normalize(grave);
    assert!(
        !nfc_result.contains('\u{0340}'),
        "NFC output must not contain U+0340 COMBINING GRAVE TONE MARK"
    );

    let nfd_result = simd_normalizer::nfd().normalize(grave);
    assert!(
        !nfd_result.contains('\u{0340}'),
        "NFD output must not contain U+0340 COMBINING GRAVE TONE MARK"
    );
}
/// Under NFC, `e` + U+0341 COMBINING ACUTE TONE MARK composes to U+00E9.
#[test]
fn combining_acute_tone_mark_nfc() {
    let e_with_tone_mark = "e\u{0341}";
    let e_acute = "\u{00E9}";
    assert_nfc(e_with_tone_mark, e_acute);
}
/// Under NFD, `e` + U+0341 COMBINING ACUTE TONE MARK becomes `e` + U+0301.
#[test]
fn combining_acute_tone_mark_nfd() {
    let e_with_tone_mark = "e\u{0341}";
    let e_then_acute = "e\u{0301}";
    assert_nfd(e_with_tone_mark, e_then_acute);
}
/// Under NFKC, `e` + U+0341 COMBINING ACUTE TONE MARK composes to U+00E9.
#[test]
fn combining_acute_tone_mark_nfkc() {
    let e_with_tone_mark = "e\u{0341}";
    let e_acute = "\u{00E9}";
    assert_nfkc(e_with_tone_mark, e_acute);
}
/// Under NFKD, `e` + U+0341 COMBINING ACUTE TONE MARK becomes `e` + U+0301.
#[test]
fn combining_acute_tone_mark_nfkd() {
    let e_with_tone_mark = "e\u{0341}";
    let e_then_acute = "e\u{0301}";
    assert_nfkd(e_with_tone_mark, e_then_acute);
}
/// A lone U+0341 must be reported as unnormalized under every form.
#[test]
fn combining_acute_tone_mark_is_not_normalized() {
    let acute_tone_mark = "\u{0341}";
    let (nfc, nfd, nfkc, nfkd) = (false, false, false, false);
    assert_is_normalized_all(acute_tone_mark, nfc, nfd, nfkc, nfkd);
}
/// Normalizing `e` + U+0301 must never introduce U+0341 COMBINING ACUTE TONE
/// MARK into the output.
///
/// The absence check is performed on the NFC output as well as the NFD
/// output: composition is the step that could wrongly produce an excluded
/// code point, so checking only the decomposed form would not exercise it.
#[test]
fn combining_acute_tone_mark_cannot_be_composition_target() {
    let acute = "e\u{0301}";
    assert_nfc(acute, "\u{00E9}");
    assert_nfd(acute, "e\u{0301}");

    let nfc_result = simd_normalizer::nfc().normalize(acute);
    assert!(
        !nfc_result.contains('\u{0341}'),
        "NFC output must not contain U+0341 COMBINING ACUTE TONE MARK"
    );

    let nfd_result = simd_normalizer::nfd().normalize(acute);
    assert!(
        !nfd_result.contains('\u{0341}'),
        "NFD output must not contain U+0341 COMBINING ACUTE TONE MARK"
    );
}
/// Under NFC, iota (U+03B9) + U+0344 composes to U+0390.
#[test]
fn combining_greek_dialytika_tonos_nfc() {
    let iota_with_mark = "\u{03B9}\u{0344}";
    let iota_dialytika_tonos = "\u{0390}";
    assert_nfc(iota_with_mark, iota_dialytika_tonos);
}
/// Under NFD, iota (U+03B9) + U+0344 becomes iota + U+0308 + U+0301.
#[test]
fn combining_greek_dialytika_tonos_nfd() {
    let iota_with_mark = "\u{03B9}\u{0344}";
    let fully_decomposed = "\u{03B9}\u{0308}\u{0301}";
    assert_nfd(iota_with_mark, fully_decomposed);
}
/// Under NFKC, iota (U+03B9) + U+0344 composes to U+0390.
#[test]
fn combining_greek_dialytika_tonos_nfkc() {
    let iota_with_mark = "\u{03B9}\u{0344}";
    let iota_dialytika_tonos = "\u{0390}";
    assert_nfkc(iota_with_mark, iota_dialytika_tonos);
}
/// Under NFKD, iota (U+03B9) + U+0344 becomes iota + U+0308 + U+0301.
#[test]
fn combining_greek_dialytika_tonos_nfkd() {
    let iota_with_mark = "\u{03B9}\u{0344}";
    let fully_decomposed = "\u{03B9}\u{0308}\u{0301}";
    assert_nfkd(iota_with_mark, fully_decomposed);
}
/// A lone U+0344 must be reported as unnormalized under every form.
#[test]
fn combining_greek_dialytika_tonos_is_not_normalized() {
    let dialytika_tonos = "\u{0344}";
    let (nfc, nfd, nfkc, nfkd) = (false, false, false, false);
    assert_is_normalized_all(dialytika_tonos, nfc, nfd, nfkc, nfkd);
}
/// Upsilon (U+03C5) + U+0344 composes to U+03B0 under NFC and decomposes to
/// upsilon + U+0308 + U+0301 under NFD.
#[test]
fn combining_greek_dialytika_tonos_on_upsilon() {
    const UPSILON_WITH_MARK: &str = "\u{03C5}\u{0344}";

    assert_nfc(UPSILON_WITH_MARK, "\u{03B0}");
    assert_nfd(UPSILON_WITH_MARK, "\u{03C5}\u{0308}\u{0301}");
}
/// NFC of iota + U+0308 + U+0301 must not contain U+0344 in its output.
#[test]
fn combining_greek_dialytika_tonos_cannot_be_composition_target() {
    const EXCLUDED_MARK: char = '\u{0344}';

    let composed = simd_normalizer::nfc().normalize("\u{03B9}\u{0308}\u{0301}");
    let mark_present = composed.contains(EXCLUDED_MARK);
    assert!(
        !mark_present,
        "NFC output must not contain U+0344 COMBINING GREEK DIALYTIKA TONOS"
    );
}
/// Each excluded combining mark, with no base character in front of it,
/// maps to the same replacement under both NFC and NFD.
#[test]
fn combining_mark_singletons_in_isolation() {
    // (lone mark, expected output for both NFC and NFD)
    let cases = [
        ("\u{0340}", "\u{0300}"),
        ("\u{0341}", "\u{0301}"),
        ("\u{0344}", "\u{0308}\u{0301}"),
    ];

    for (mark, replacement) in cases {
        assert_nfc(mark, replacement);
        assert_nfd(mark, replacement);
    }
}
/// Devanagari fixture pairs `(excluded, base)`.
///
/// The tests in this file expect each `excluded` code point to normalize to
/// its `base` letter followed by U+093C.
static DEVANAGARI_EXCLUSIONS: [(char, char); 8] = [
    ('\u{0958}', '\u{0915}'),
    ('\u{0959}', '\u{0916}'),
    ('\u{095A}', '\u{0917}'),
    ('\u{095B}', '\u{091C}'),
    ('\u{095C}', '\u{0921}'),
    ('\u{095D}', '\u{0922}'),
    ('\u{095E}', '\u{092B}'),
    ('\u{095F}', '\u{092F}'),
];
/// NFC of every excluded Devanagari letter is its base letter + U+093C.
#[test]
fn devanagari_exclusions_nfc() {
    for (excluded, base) in DEVANAGARI_EXCLUSIONS.iter().copied() {
        let source = excluded.to_string();
        let base_plus_nukta = format!("{}\u{093C}", base);
        assert_nfc(source.as_str(), base_plus_nukta.as_str());
    }
}
/// NFD of every excluded Devanagari letter is its base letter + U+093C.
#[test]
fn devanagari_exclusions_nfd() {
    for (excluded, base) in DEVANAGARI_EXCLUSIONS.iter().copied() {
        let source = excluded.to_string();
        let base_plus_nukta = format!("{}\u{093C}", base);
        assert_nfd(source.as_str(), base_plus_nukta.as_str());
    }
}
/// NFKC of every excluded Devanagari letter is its base letter + U+093C.
#[test]
fn devanagari_exclusions_nfkc() {
    for (excluded, base) in DEVANAGARI_EXCLUSIONS.iter().copied() {
        let source = excluded.to_string();
        let base_plus_nukta = format!("{}\u{093C}", base);
        assert_nfkc(source.as_str(), base_plus_nukta.as_str());
    }
}
/// NFKD of every excluded Devanagari letter is its base letter + U+093C.
#[test]
fn devanagari_exclusions_nfkd() {
    for (excluded, base) in DEVANAGARI_EXCLUSIONS.iter().copied() {
        let source = excluded.to_string();
        let base_plus_nukta = format!("{}\u{093C}", base);
        assert_nfkd(source.as_str(), base_plus_nukta.as_str());
    }
}
/// Every excluded Devanagari letter on its own must be reported as
/// unnormalized under all four forms.
#[test]
fn devanagari_exclusions_is_not_normalized() {
    let excluded_letters = DEVANAGARI_EXCLUSIONS.iter().map(|&(excluded, _)| excluded);
    for letter in excluded_letters {
        let source = letter.to_string();
        assert_is_normalized_all(source.as_str(), false, false, false, false);
    }
}
/// For every table entry, NFC of base + U+093C must stay as written and must
/// not contain the excluded precomposed letter.
#[test]
fn devanagari_base_plus_nukta_cannot_compose_back() {
    for (excluded, base) in DEVANAGARI_EXCLUSIONS.iter().copied() {
        let base_plus_nukta = format!("{}\u{093C}", base);
        let composed = simd_normalizer::nfc().normalize(&base_plus_nukta);

        let recomposed = composed.contains(excluded);
        assert!(
            !recomposed,
            "NFC of base {:04X} + nukta must not produce excluded char {:04X}, got {:?}",
            base as u32,
            excluded as u32,
            composed
        );

        // The sequence is expected to be its own NFC form.
        assert_nfc(&base_plus_nukta, &base_plus_nukta);
    }
}
/// An excluded letter (U+095E) at the start of a longer Devanagari sequence
/// is replaced by U+092B + U+093C while the following characters are kept.
#[test]
fn devanagari_exclusions_in_word_context() {
    const WORD: &str = "\u{095E}\u{093E}\u{0930}\u{0938}\u{0940}";
    const NORMALIZED: &str = "\u{092B}\u{093C}\u{093E}\u{0930}\u{0938}\u{0940}";

    // The same output is expected from both canonical forms.
    assert_nfc(WORD, NORMALIZED);
    assert_nfd(WORD, NORMALIZED);
}
/// U+FB1D becomes U+05D9 + U+05B4 under all four forms and is itself
/// unnormalized under all of them.
#[test]
fn hebrew_fb1d_yod_with_hiriq() {
    const PRESENTATION_FORM: &str = "\u{FB1D}";
    const YOD_HIRIQ: &str = "\u{05D9}\u{05B4}";

    assert_nfc(PRESENTATION_FORM, YOD_HIRIQ);
    assert_nfd(PRESENTATION_FORM, YOD_HIRIQ);
    assert_nfkc(PRESENTATION_FORM, YOD_HIRIQ);
    assert_nfkd(PRESENTATION_FORM, YOD_HIRIQ);
    assert_is_normalized_all(PRESENTATION_FORM, false, false, false, false);
}
/// NFC of U+05D9 + U+05B4 must stay as written and must not contain U+FB1D.
#[test]
fn hebrew_fb1d_cannot_compose_back() {
    const YOD_HIRIQ: &str = "\u{05D9}\u{05B4}";

    let composed = simd_normalizer::nfc().normalize(YOD_HIRIQ);
    let recomposed = composed.contains('\u{FB1D}');
    assert!(
        !recomposed,
        "NFC of yod + hiriq must not produce U+FB1D, got {:?}",
        composed
    );

    assert_nfc(YOD_HIRIQ, YOD_HIRIQ);
}
/// U+FB2A becomes U+05E9 + U+05C1 under all four forms and is itself
/// unnormalized under all of them.
#[test]
fn hebrew_fb2a_shin_with_shin_dot() {
    const PRESENTATION_FORM: &str = "\u{FB2A}";
    const SHIN_SHIN_DOT: &str = "\u{05E9}\u{05C1}";

    assert_nfc(PRESENTATION_FORM, SHIN_SHIN_DOT);
    assert_nfd(PRESENTATION_FORM, SHIN_SHIN_DOT);
    assert_nfkc(PRESENTATION_FORM, SHIN_SHIN_DOT);
    assert_nfkd(PRESENTATION_FORM, SHIN_SHIN_DOT);
    assert_is_normalized_all(PRESENTATION_FORM, false, false, false, false);
}
/// U+FB2B becomes U+05E9 + U+05C2 under all four forms.
///
/// Also asserts that U+FB2B itself is unnormalized under every form, matching
/// the checks made for the sibling presentation forms U+FB1D and U+FB2A.
#[test]
fn hebrew_fb2b_shin_with_sin_dot() {
    let input = "\u{FB2B}";
    let expected = "\u{05E9}\u{05C2}";
    assert_nfc(input, expected);
    assert_nfd(input, expected);
    assert_nfkc(input, expected);
    assert_nfkd(input, expected);
    // Every form rewrites the character, so none can consider it normalized.
    assert_is_normalized_all(input, false, false, false, false);
}
/// NFC of U+05E9 + U+05C1 must stay as written and must not contain U+FB2A.
#[test]
fn hebrew_fb2a_shin_dot_cannot_compose_back() {
    const SHIN_SHIN_DOT: &str = "\u{05E9}\u{05C1}";

    let composed = simd_normalizer::nfc().normalize(SHIN_SHIN_DOT);
    let recomposed = composed.contains('\u{FB2A}');
    assert!(!recomposed, "NFC must not produce U+FB2A");

    assert_nfc(SHIN_SHIN_DOT, SHIN_SHIN_DOT);
}
/// U+FB49 becomes U+05E9 + U+05BC under all four forms.
///
/// Also asserts that U+FB49 itself is unnormalized under every form, matching
/// the checks made for the sibling presentation forms U+FB1D and U+FB2A.
#[test]
fn hebrew_fb49_shin_with_dagesh() {
    let input = "\u{FB49}";
    let expected = "\u{05E9}\u{05BC}";
    assert_nfc(input, expected);
    assert_nfd(input, expected);
    assert_nfkc(input, expected);
    assert_nfkd(input, expected);
    // Every form rewrites the character, so none can consider it normalized.
    assert_is_normalized_all(input, false, false, false, false);
}
/// U+FB20 is left alone by the canonical forms (NFC, NFD) and mapped to
/// U+05E2 by the compatibility forms (NFKC, NFKD).
#[test]
fn hebrew_compatibility_forms_nfkc() {
    const ALTERNATIVE_AYIN: &str = "\u{FB20}";
    const AYIN: &str = "\u{05E2}";

    // Canonical forms: unchanged.
    assert_nfc(ALTERNATIVE_AYIN, ALTERNATIVE_AYIN);
    assert_nfd(ALTERNATIVE_AYIN, ALTERNATIVE_AYIN);
    // Compatibility forms: replaced.
    assert_nfkc(ALTERNATIVE_AYIN, AYIN);
    assert_nfkd(ALTERNATIVE_AYIN, AYIN);
}
/// U+FB4F is left alone by the canonical forms (NFC, NFD) and expanded to
/// U+05D0 + U+05DC by the compatibility forms (NFKC, NFKD).
#[test]
fn hebrew_fb4f_ligature_alef_lamed() {
    const LIGATURE: &str = "\u{FB4F}";
    const ALEF_LAMED: &str = "\u{05D0}\u{05DC}";

    // Canonical forms: unchanged.
    assert_nfc(LIGATURE, LIGATURE);
    assert_nfd(LIGATURE, LIGATURE);
    // Compatibility forms: expanded.
    assert_nfkc(LIGATURE, ALEF_LAMED);
    assert_nfkd(LIGATURE, ALEF_LAMED);
}
/// Applying NFC a second time to an NFC result must not change it, for every
/// excluded character exercised in this file.
#[test]
fn nfc_idempotent_for_all_singletons() {
    // A fixed-size array: the list is a compile-time literal, so no `Vec`
    // allocation is needed (clippy `useless_vec`).
    let exclusions: [&str; 18] = [
        "\u{2126}", "\u{212A}", "\u{212B}", "\u{0340}", "\u{0341}", "\u{0344}", "\u{0958}",
        "\u{0959}", "\u{095A}", "\u{095B}", "\u{095C}", "\u{095D}", "\u{095E}", "\u{095F}",
        "\u{FB1D}", "\u{FB2A}", "\u{FB2B}", "\u{FB49}",
    ];
    for input in &exclusions {
        let nfc_once = simd_normalizer::nfc().normalize(input);
        let nfc_twice = simd_normalizer::nfc().normalize(&nfc_once);
        assert_eq!(
            &*nfc_once, &*nfc_twice,
            "NFC is not idempotent for {:?}: first={:?}, second={:?}",
            input, nfc_once, nfc_twice
        );
    }
}
/// Applying NFD a second time to an NFD result must not change it, for every
/// excluded character exercised in this file.
#[test]
fn nfd_idempotent_for_all_singletons() {
    // A fixed-size array: the list is a compile-time literal, so no `Vec`
    // allocation is needed (clippy `useless_vec`).
    let exclusions: [&str; 18] = [
        "\u{2126}", "\u{212A}", "\u{212B}", "\u{0340}", "\u{0341}", "\u{0344}", "\u{0958}",
        "\u{0959}", "\u{095A}", "\u{095B}", "\u{095C}", "\u{095D}", "\u{095E}", "\u{095F}",
        "\u{FB1D}", "\u{FB2A}", "\u{FB2B}", "\u{FB49}",
    ];
    for input in &exclusions {
        let nfd_once = simd_normalizer::nfd().normalize(input);
        let nfd_twice = simd_normalizer::nfd().normalize(&nfd_once);
        assert_eq!(
            &*nfd_once, &*nfd_twice,
            "NFD is not idempotent for {:?}: first={:?}, second={:?}",
            input, nfd_once, nfd_twice
        );
    }
}
/// NFC of each excluded character, taken alone, must not contain that same
/// character in its output.
#[test]
fn excluded_chars_never_appear_in_nfc_output() {
    // A fixed-size array: the list is a compile-time literal, so no `Vec`
    // allocation is needed (clippy `useless_vec`).
    let excluded_chars: [char; 18] = [
        '\u{2126}', '\u{212A}', '\u{212B}', '\u{0340}', '\u{0341}', '\u{0344}', '\u{0958}',
        '\u{0959}', '\u{095A}', '\u{095B}', '\u{095C}', '\u{095D}', '\u{095E}', '\u{095F}',
        '\u{FB1D}', '\u{FB2A}', '\u{FB2B}', '\u{FB49}',
    ];
    for &ch in &excluded_chars {
        let input = String::from(ch);
        let nfc_result = simd_normalizer::nfc().normalize(&input);
        assert!(
            !nfc_result.contains(ch),
            "NFC output for U+{:04X} must not contain the excluded character itself, got {:?}",
            ch as u32,
            nfc_result
        );
    }
}
/// U+0340 after several Latin base letters: NFC yields the precomposed
/// grave letter, NFD yields the base followed by U+0300.
///
/// Every base is checked under both forms; the `u` case previously had only
/// its NFC half.
#[test]
fn combining_grave_tone_mark_with_various_bases() {
    assert_nfc("o\u{0340}", "\u{00F2}");
    assert_nfd("o\u{0340}", "o\u{0300}");
    assert_nfc("E\u{0340}", "\u{00C8}");
    assert_nfd("E\u{0340}", "E\u{0300}");
    assert_nfc("u\u{0340}", "\u{00F9}");
    assert_nfd("u\u{0340}", "u\u{0300}");
}
/// U+0341 after several Latin base letters: NFC yields the precomposed
/// acute letter, NFD yields the base followed by U+0301.
#[test]
fn combining_acute_tone_mark_with_various_bases() {
    // (input, expected NFC, expected NFD)
    let cases = [
        ("a\u{0341}", "\u{00E1}", "a\u{0301}"),
        ("O\u{0341}", "\u{00D3}", "O\u{0301}"),
    ];

    for (input, composed, decomposed) in cases {
        assert_nfc(input, composed);
        assert_nfd(input, decomposed);
    }
}
/// `a` + U+0340 + U+0302: the tone mark is replaced (and composed with `a`
/// under NFC) while the trailing U+0302 is kept after it.
#[test]
fn tone_marks_stacked_with_other_combining() {
    const STACKED: &str = "a\u{0340}\u{0302}";

    assert_nfc(STACKED, "\u{00E0}\u{0302}");
    assert_nfd(STACKED, "a\u{0300}\u{0302}");
}
/// Checks U+0344 after iota on its own, then at the end of a longer Greek
/// sequence where the SIMD NFC output is compared with ICU's and must not
/// contain U+0344.
#[test]
fn dialytika_tonos_in_greek_text() {
    const IOTA_WITH_MARK: &str = "\u{03B9}\u{0344}";
    assert_nfc(IOTA_WITH_MARK, "\u{0390}");
    assert_nfd(IOTA_WITH_MARK, "\u{03B9}\u{0308}\u{0301}");

    const WORD: &str = "\u{03B5}\u{03BB}\u{03BB}\u{03B7}\u{03BD}\u{03B9}\u{0344}";
    let simd_nfc = simd_normalizer::nfc().normalize(WORD);
    let icu_nfc = ComposingNormalizerBorrowed::new_nfc().normalize(WORD);
    assert_eq!(&*simd_nfc, &*icu_nfc, "Greek word cross-validation");

    let mark_present = simd_nfc.contains('\u{0344}');
    assert!(!mark_present, "NFC Greek text must not contain U+0344");
}
/// For a sample of excluded characters, NFC -> NFD -> NFC must land on the
/// same string as the first NFC pass.
#[test]
fn nfc_then_nfd_roundtrip() {
    const SAMPLES: [&str; 10] = [
        "\u{2126}", "\u{212A}", "\u{212B}", "\u{0340}", "\u{0341}", "\u{0344}", "\u{0958}",
        "\u{095F}", "\u{FB1D}", "\u{FB2A}",
    ];

    for sample in SAMPLES.iter().copied() {
        let first_nfc = simd_normalizer::nfc().normalize(sample);
        let decomposed = simd_normalizer::nfd().normalize(&first_nfc);
        let second_nfc = simd_normalizer::nfc().normalize(&decomposed);
        assert_eq!(
            &*first_nfc, &*second_nfc,
            "NFC->NFD->NFC roundtrip unstable for {:?}",
            sample
        );
    }
}
/// For a sample of excluded characters, the SIMD NFKC output must equal the
/// SIMD NFC output.
#[test]
fn nfkc_agrees_with_nfc_for_canonical_exclusions() {
    const SAMPLES: [&str; 10] = [
        "\u{2126}", "\u{212A}", "\u{212B}", "\u{0340}", "\u{0341}", "\u{0344}", "\u{0958}",
        "\u{095F}", "\u{FB1D}", "\u{FB2A}",
    ];

    for sample in SAMPLES.iter().copied() {
        let canonical = simd_normalizer::nfc().normalize(sample);
        let compatibility = simd_normalizer::nfkc().normalize(sample);
        assert_eq!(
            &*canonical, &*compatibility,
            "NFC and NFKC disagree for canonical exclusion {:?}: nfc={:?}, nfkc={:?}",
            sample, canonical, compatibility
        );
    }
}
/// For a sample of excluded characters, the SIMD NFKD output must equal the
/// SIMD NFD output.
#[test]
fn nfkd_agrees_with_nfd_for_canonical_exclusions() {
    const SAMPLES: [&str; 10] = [
        "\u{2126}", "\u{212A}", "\u{212B}", "\u{0340}", "\u{0341}", "\u{0344}", "\u{0958}",
        "\u{095F}", "\u{FB1D}", "\u{FB2A}",
    ];

    for sample in SAMPLES.iter().copied() {
        let canonical = simd_normalizer::nfd().normalize(sample);
        let compatibility = simd_normalizer::nfkd().normalize(sample);
        assert_eq!(
            &*canonical, &*compatibility,
            "NFD and NFKD disagree for canonical exclusion {:?}: nfd={:?}, nfkd={:?}",
            sample, canonical, compatibility
        );
    }
}
/// Five excluded characters from different scripts placed back to back: the
/// SIMD NFC output must match ICU's and contain none of the five originals.
#[test]
fn multiple_exclusions_in_single_string() {
    const MIXED: &str = "\u{2126}\u{212A}\u{212B}\u{0958}\u{FB1D}";
    const FORBIDDEN: [char; 5] = ['\u{2126}', '\u{212A}', '\u{212B}', '\u{0958}', '\u{FB1D}'];

    let simd_nfc = simd_normalizer::nfc().normalize(MIXED);
    let icu_nfc = ComposingNormalizerBorrowed::new_nfc().normalize(MIXED);
    assert_eq!(
        &*simd_nfc, &*icu_nfc,
        "Multi-script exclusion cross-validation"
    );

    for forbidden in FORBIDDEN {
        let still_present = simd_nfc.contains(forbidden);
        assert!(
            !still_present,
            "NFC output should not contain U+{:04X}",
            forbidden as u32
        );
    }
}