mod data;
use data::normalization_tests::NORMALIZATION_TESTS;
use std::collections::HashSet;
/// Normalizes `s` with the NFC normalizer from `simd_normalizer` and returns
/// the result as an owned `String`.
fn our_nfc(s: &str) -> String {
    let normalizer = simd_normalizer::nfc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` with the NFD normalizer from `simd_normalizer` and returns
/// the result as an owned `String`.
fn our_nfd(s: &str) -> String {
    let normalizer = simd_normalizer::nfd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` with the NFKC normalizer from `simd_normalizer` and returns
/// the result as an owned `String`.
fn our_nfkc(s: &str) -> String {
    let normalizer = simd_normalizer::nfkc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Normalizes `s` with the NFKD normalizer from `simd_normalizer` and returns
/// the result as an owned `String`.
fn our_nfkd(s: &str) -> String {
    let normalizer = simd_normalizer::nfkd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
fn icu_nfc(s: &str) -> String {
use icu_normalizer::ComposingNormalizerBorrowed;
ComposingNormalizerBorrowed::new_nfc()
.normalize(s)
.into_owned()
}
fn icu_nfd(s: &str) -> String {
use icu_normalizer::DecomposingNormalizerBorrowed;
DecomposingNormalizerBorrowed::new_nfd()
.normalize(s)
.into_owned()
}
fn icu_nfkc(s: &str) -> String {
use icu_normalizer::ComposingNormalizerBorrowed;
ComposingNormalizerBorrowed::new_nfkc()
.normalize(s)
.into_owned()
}
fn icu_nfkd(s: &str) -> String {
use icu_normalizer::DecomposingNormalizerBorrowed;
DecomposingNormalizerBorrowed::new_nfkd()
.normalize(s)
.into_owned()
}
/// Renders `s` as space-separated `U+XXXX` code points for failure messages.
///
/// Each character is printed as upper-case hexadecimal, zero-padded to at
/// least four digits. An empty input yields an empty string.
fn codepoints(s: &str) -> String {
    let mut rendered = String::new();
    for (index, ch) in s.chars().enumerate() {
        // Separator goes between entries only, never before the first one.
        if index > 0 {
            rendered.push(' ');
        }
        rendered.push_str(&format!("U+{:04X}", ch as u32));
    }
    rendered
}
/// Collects the code point of the first character of every
/// `NORMALIZATION_TESTS` source string.
///
/// Entries whose source has no characters contribute nothing.
// NOTE(review): a source with several characters still contributes its first
// character, so such code points are skipped by the "unlisted" tests — confirm
// against the data module that this is intended.
fn listed_codepoints() -> HashSet<u32> {
    let mut listed = HashSet::with_capacity(NORMALIZATION_TESTS.len());
    listed.extend(
        NORMALIZATION_TESTS
            .iter()
            .filter_map(|case| case.source.chars().next())
            .map(|first| first as u32),
    );
    listed
}
/// Iterates over every Unicode scalar value in ascending order, from U+0000
/// to U+10FFFF; surrogate code points are not valid `char`s and never appear.
fn all_scalar_values() -> impl Iterator<Item = char> {
    '\u{0}'..=char::MAX
}
/// Yields a de-duplicated sample of Unicode scalar values.
///
/// The sample consists of, in this order:
/// 1. every scalar value inside a fixed list of hand-picked code point ranges
///    (always visited in full), then
/// 2. every `step`-th code point starting at U+0000 up to U+10FFFF that was
///    not already produced by step 1.
///
/// Code points that are not scalar values (surrogates) are skipped.
///
/// # Panics
///
/// Panics if `step` is zero. A zero stride can never advance through the code
/// space, and previously made this function loop forever.
fn sampled_scalar_values(step: u32) -> impl Iterator<Item = char> {
    assert!(step > 0, "step must be non-zero");

    // Inclusive `(first, last)` code point bounds that are always fully covered.
    let boundary_ranges: [(u32, u32); 21] = [
        (0x0000, 0x00FF),
        (0x0300, 0x036F), // combining diacritical marks
        (0x0590, 0x05FF),
        (0x0600, 0x06FF),
        (0x0900, 0x097F),
        (0x1100, 0x11FF), // Hangul Jamo
        (0x2000, 0x206F),
        (0x2100, 0x214F),
        (0x2150, 0x218F),
        (0x2460, 0x24FF),
        (0x3040, 0x30FF),
        (0x3300, 0x33FF),
        (0xAC00, 0xAC00 + 100), // first 101 precomposed Hangul syllables
        (0xD7A0, 0xD7FF), // ends right before the surrogate range
        (0xF900, 0xFAFF),
        (0xFB00, 0xFB06),
        (0xFE00, 0xFE0F),
        (0xFF00, 0xFFEF),
        (0x1D100, 0x1D1FF),
        (0x1F600, 0x1F64F),
        (0x10FF00, 0x10FFFF), // top of the code space
    ];

    let dense = boundary_ranges.iter().flat_map(|&(first, last)| first..=last);
    // `u32 -> usize` is lossless on the 32/64-bit targets this test suite runs on.
    let strided = (0u32..=0x10FFFF).step_by(step as usize);

    let mut seen = HashSet::new();
    let mut result = Vec::new();
    for u in dense.chain(strided) {
        if let Some(c) = char::from_u32(u) {
            // `insert` returns false for code points already emitted.
            if seen.insert(u) {
                result.push(c);
            }
        }
    }
    result.into_iter()
}
/// Exhaustive unlisted-codepoint invariant check.
///
/// For every Unicode scalar value whose code point is not in
/// `listed_codepoints()`, the one-character string must come back unchanged
/// from `our_nfc`, `our_nfd`, `our_nfkc` and `our_nfkd`.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
/// At most `MAX_FAILURES` violations are reported. The reported count covers
/// real failures only, and a truncation note is appended when more were found.
#[test]
#[ignore]
fn unlisted_codepoint_invariant_full() {
    // Cap on reported failures so a systemic bug does not flood the output.
    const MAX_FAILURES: usize = 100;

    let listed = listed_codepoints();
    let mut failures = Vec::new();
    let mut truncated = false;
    for c in all_scalar_values() {
        if listed.contains(&(c as u32)) {
            continue;
        }
        let s: String = c.to_string();
        let nfc = our_nfc(&s);
        if nfc != s {
            failures.push(format!(
                "U+{:04X}: NFC({}) = {} (expected invariant)",
                c as u32,
                codepoints(&s),
                codepoints(&nfc)
            ));
        }
        let nfd = our_nfd(&s);
        if nfd != s {
            failures.push(format!(
                "U+{:04X}: NFD({}) = {} (expected invariant)",
                c as u32,
                codepoints(&s),
                codepoints(&nfd)
            ));
        }
        let nfkc = our_nfkc(&s);
        if nfkc != s {
            failures.push(format!(
                "U+{:04X}: NFKC({}) = {} (expected invariant)",
                c as u32,
                codepoints(&s),
                codepoints(&nfkc)
            ));
        }
        let nfkd = our_nfkd(&s);
        if nfkd != s {
            failures.push(format!(
                "U+{:04X}: NFKD({}) = {} (expected invariant)",
                c as u32,
                codepoints(&s),
                codepoints(&nfkd)
            ));
        }
        // One code point can add up to four entries; trim back to the cap so
        // the note below is accurate, and stop scanning.
        if failures.len() > MAX_FAILURES {
            failures.truncate(MAX_FAILURES);
            truncated = true;
            break;
        }
    }
    // Kept out of `failures` so it is not counted as a failure.
    let truncation_note = if truncated {
        format!("\n... (truncated after {} failures)", MAX_FAILURES)
    } else {
        String::new()
    };
    assert!(
        failures.is_empty(),
        "Unlisted codepoint invariant violations ({} failures):\n{}{}",
        failures.len(),
        failures.join("\n"),
        truncation_note
    );
}
/// Sampled unlisted-codepoint invariant check.
///
/// Walks `sampled_scalar_values(100)`, skips code points present in
/// `listed_codepoints()`, and requires each remaining one-character string to
/// be returned unchanged by all four `our_*` normalization helpers.
#[test]
fn unlisted_codepoint_invariant_spot_check() {
    let listed = listed_codepoints();
    let mut failures: Vec<String> = Vec::new();
    let unlisted = sampled_scalar_values(100).filter(|ch| !listed.contains(&(*ch as u32)));
    for ch in unlisted {
        let cp = ch as u32;
        let input = ch.to_string();

        let composed = our_nfc(&input);
        if composed != input {
            failures.push(format!(
                "U+{:04X}: NFC({}) = {} (expected invariant)",
                cp,
                codepoints(&input),
                codepoints(&composed)
            ));
        }

        let decomposed = our_nfd(&input);
        if decomposed != input {
            failures.push(format!(
                "U+{:04X}: NFD({}) = {} (expected invariant)",
                cp,
                codepoints(&input),
                codepoints(&decomposed)
            ));
        }

        let compat_composed = our_nfkc(&input);
        if compat_composed != input {
            failures.push(format!(
                "U+{:04X}: NFKC({}) = {} (expected invariant)",
                cp,
                codepoints(&input),
                codepoints(&compat_composed)
            ));
        }

        let compat_decomposed = our_nfkd(&input);
        if compat_decomposed != input {
            failures.push(format!(
                "U+{:04X}: NFKD({}) = {} (expected invariant)",
                cp,
                codepoints(&input),
                codepoints(&compat_decomposed)
            ));
        }
    }

    if !failures.is_empty() {
        panic!(
            "Unlisted codepoint invariant violations ({} failures):\n{}",
            failures.len(),
            failures.join("\n")
        );
    }
}
/// Exhaustive differential test against `icu_normalizer`.
///
/// For every Unicode scalar value, the one-character string is normalized to
/// NFD, NFC, NFKD and NFKC by both the `our_*` and `icu_*` helpers, and any
/// difference is recorded.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
/// At most `MAX_FAILURES` differences are reported. The reported count covers
/// real failures only, and a truncation note is appended when more were found.
#[test]
#[ignore]
fn differential_all_codepoints_full() {
    // Cap on reported failures so a systemic bug does not flood the output.
    const MAX_FAILURES: usize = 100;

    let mut failures = Vec::new();
    let mut truncated = false;
    for c in all_scalar_values() {
        let s: String = c.to_string();
        let our = our_nfd(&s);
        let reference = icu_nfd(&s);
        if our != reference {
            failures.push(format!(
                "U+{:04X} NFD: ours=[{}] ref=[{}]",
                c as u32,
                codepoints(&our),
                codepoints(&reference)
            ));
        }
        let our = our_nfc(&s);
        let reference = icu_nfc(&s);
        if our != reference {
            failures.push(format!(
                "U+{:04X} NFC: ours=[{}] ref=[{}]",
                c as u32,
                codepoints(&our),
                codepoints(&reference)
            ));
        }
        let our = our_nfkd(&s);
        let reference = icu_nfkd(&s);
        if our != reference {
            failures.push(format!(
                "U+{:04X} NFKD: ours=[{}] ref=[{}]",
                c as u32,
                codepoints(&our),
                codepoints(&reference)
            ));
        }
        let our = our_nfkc(&s);
        let reference = icu_nfkc(&s);
        if our != reference {
            failures.push(format!(
                "U+{:04X} NFKC: ours=[{}] ref=[{}]",
                c as u32,
                codepoints(&our),
                codepoints(&reference)
            ));
        }
        // One code point can add up to four entries; trim back to the cap so
        // the note below is accurate, and stop scanning.
        if failures.len() > MAX_FAILURES {
            failures.truncate(MAX_FAILURES);
            truncated = true;
            break;
        }
    }
    // Kept out of `failures` so it is not counted as a failure.
    let truncation_note = if truncated {
        format!("\n... (truncated after {} failures)", MAX_FAILURES)
    } else {
        String::new()
    };
    assert!(
        failures.is_empty(),
        "Differential failures against icu_normalizer ({} failures):\n{}{}",
        failures.len(),
        failures.join("\n"),
        truncation_note
    );
}
/// Sampled differential test against `icu_normalizer`.
///
/// For each value from `sampled_scalar_values(100)`, the one-character string
/// is normalized to NFD, NFC, NFKD and NFKC by both the `our_*` and `icu_*`
/// helpers; every disagreement is collected and reported at the end.
#[test]
fn differential_all_codepoints_spot_check() {
    let mut failures: Vec<String> = Vec::new();
    for ch in sampled_scalar_values(100) {
        let cp = ch as u32;
        let text = ch.to_string();

        let (ours, theirs) = (our_nfd(&text), icu_nfd(&text));
        if ours != theirs {
            failures.push(format!(
                "U+{:04X} NFD: ours=[{}] ref=[{}]",
                cp,
                codepoints(&ours),
                codepoints(&theirs)
            ));
        }

        let (ours, theirs) = (our_nfc(&text), icu_nfc(&text));
        if ours != theirs {
            failures.push(format!(
                "U+{:04X} NFC: ours=[{}] ref=[{}]",
                cp,
                codepoints(&ours),
                codepoints(&theirs)
            ));
        }

        let (ours, theirs) = (our_nfkd(&text), icu_nfkd(&text));
        if ours != theirs {
            failures.push(format!(
                "U+{:04X} NFKD: ours=[{}] ref=[{}]",
                cp,
                codepoints(&ours),
                codepoints(&theirs)
            ));
        }

        let (ours, theirs) = (our_nfkc(&text), icu_nfkc(&text));
        if ours != theirs {
            failures.push(format!(
                "U+{:04X} NFKC: ours=[{}] ref=[{}]",
                cp,
                codepoints(&ours),
                codepoints(&theirs)
            ));
        }
    }

    if !failures.is_empty() {
        panic!(
            "Differential failures against icu_normalizer ({} failures):\n{}",
            failures.len(),
            failures.join("\n")
        );
    }
}
/// Exhaustive NFD-only differential test against `icu_normalizer`.
///
/// Compares `our_nfd` with `icu_nfd` on the one-character string of every
/// Unicode scalar value.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
/// At most `MAX_FAILURES` differences are reported. The reported count covers
/// real failures only, and a truncation note is appended when more were found.
// NOTE(review): the comparison performed here is the same as the NFD step of
// `differential_all_codepoints_full`; only the failure text differs.
#[test]
#[ignore]
fn ccc_nfd_differential_full() {
    // Cap on reported failures so a systemic bug does not flood the output.
    const MAX_FAILURES: usize = 100;

    let mut failures = Vec::new();
    let mut truncated = false;
    for c in all_scalar_values() {
        let s: String = c.to_string();
        let our = our_nfd(&s);
        let reference = icu_nfd(&s);
        if our != reference {
            failures.push(format!(
                "U+{:04X}: our NFD=[{}] icu NFD=[{}]",
                c as u32,
                codepoints(&our),
                codepoints(&reference)
            ));
        }
        // Stop once the cap is exceeded and drop the extra entry so the
        // reported count matches the note below.
        if failures.len() > MAX_FAILURES {
            failures.truncate(MAX_FAILURES);
            truncated = true;
            break;
        }
    }
    // Kept out of `failures` so it is not counted as a failure.
    let truncation_note = if truncated {
        format!("\n... (truncated after {} failures)", MAX_FAILURES)
    } else {
        String::new()
    };
    assert!(
        failures.is_empty(),
        "CCC/NFD differential failures ({} failures):\n{}{}",
        failures.len(),
        failures.join("\n"),
        truncation_note
    );
}
/// Sampled NFD-only differential test against `icu_normalizer`.
///
/// Compares `our_nfd` with `icu_nfd` on the one-character string of each
/// value from `sampled_scalar_values(100)` and reports every mismatch.
#[test]
fn ccc_nfd_differential_spot_check() {
    let failures: Vec<String> = sampled_scalar_values(100)
        .filter_map(|ch| {
            let text = ch.to_string();
            let ours = our_nfd(&text);
            let theirs = icu_nfd(&text);
            if ours == theirs {
                None
            } else {
                Some(format!(
                    "U+{:04X}: our NFD=[{}] icu NFD=[{}]",
                    ch as u32,
                    codepoints(&ours),
                    codepoints(&theirs)
                ))
            }
        })
        .collect();

    if !failures.is_empty() {
        panic!(
            "CCC/NFD differential failures ({} failures):\n{}",
            failures.len(),
            failures.join("\n")
        );
    }
}
/// Exhaustive consistency check between normalization and the `is_*` queries.
///
/// For every Unicode scalar value: whenever an `our_*` helper returns the
/// one-character string unchanged, the matching `is_nfc` / `is_nfd` /
/// `is_nfkc` / `is_nfkd` query on that string must return `true`. The reverse
/// direction is not checked here.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested.
/// At most `MAX_FAILURES` violations are reported. The reported count covers
/// real failures only, and a truncation note is appended when more were found.
#[test]
#[ignore]
fn is_normalized_consistency_full() {
    use simd_normalizer::UnicodeNormalization;

    // Cap on reported failures so a systemic bug does not flood the output.
    const MAX_FAILURES: usize = 100;

    let mut failures = Vec::new();
    let mut truncated = false;
    for c in all_scalar_values() {
        let s: String = c.to_string();
        let normalized = our_nfc(&s);
        if normalized == s && !s.is_nfc() {
            failures.push(format!(
                "U+{:04X}: NFC is invariant but is_nfc()=false",
                c as u32
            ));
        }
        let normalized = our_nfd(&s);
        if normalized == s && !s.is_nfd() {
            failures.push(format!(
                "U+{:04X}: NFD is invariant but is_nfd()=false",
                c as u32
            ));
        }
        let normalized = our_nfkc(&s);
        if normalized == s && !s.is_nfkc() {
            failures.push(format!(
                "U+{:04X}: NFKC is invariant but is_nfkc()=false",
                c as u32
            ));
        }
        let normalized = our_nfkd(&s);
        if normalized == s && !s.is_nfkd() {
            failures.push(format!(
                "U+{:04X}: NFKD is invariant but is_nfkd()=false",
                c as u32
            ));
        }
        // One code point can add up to four entries; trim back to the cap so
        // the note below is accurate, and stop scanning.
        if failures.len() > MAX_FAILURES {
            failures.truncate(MAX_FAILURES);
            truncated = true;
            break;
        }
    }
    // Kept out of `failures` so it is not counted as a failure.
    let truncation_note = if truncated {
        format!("\n... (truncated after {} failures)", MAX_FAILURES)
    } else {
        String::new()
    };
    assert!(
        failures.is_empty(),
        "is_normalized consistency failures ({} failures):\n{}{}",
        failures.len(),
        failures.join("\n"),
        truncation_note
    );
}
/// Sampled consistency check between normalization and the `is_*` queries.
///
/// For each value from `sampled_scalar_values(100)`: whenever an `our_*`
/// helper returns the one-character string unchanged, the matching `is_nfc` /
/// `is_nfd` / `is_nfkc` / `is_nfkd` query must return `true`.
#[test]
fn is_normalized_consistency_spot_check() {
    use simd_normalizer::UnicodeNormalization;

    let mut failures: Vec<String> = Vec::new();
    for ch in sampled_scalar_values(100) {
        let cp = ch as u32;
        let text = ch.to_string();

        // The `is_*` query is only consulted when normalization left the
        // input untouched (short-circuit `&&`).
        if our_nfc(&text) == text && !text.is_nfc() {
            failures.push(format!(
                "U+{:04X}: NFC is invariant but is_nfc()=false",
                cp
            ));
        }
        if our_nfd(&text) == text && !text.is_nfd() {
            failures.push(format!(
                "U+{:04X}: NFD is invariant but is_nfd()=false",
                cp
            ));
        }
        if our_nfkc(&text) == text && !text.is_nfkc() {
            failures.push(format!(
                "U+{:04X}: NFKC is invariant but is_nfkc()=false",
                cp
            ));
        }
        if our_nfkd(&text) == text && !text.is_nfkd() {
            failures.push(format!(
                "U+{:04X}: NFKD is invariant but is_nfkd()=false",
                cp
            ));
        }
    }

    if !failures.is_empty() {
        panic!(
            "is_normalized consistency failures ({} failures):\n{}",
            failures.len(),
            failures.join("\n")
        );
    }
}