use proptest::prelude::*;
fn ref_nfc(s: &str) -> String {
use unicode_normalization::UnicodeNormalization;
s.nfc().collect::<String>()
}
fn ref_nfd(s: &str) -> String {
use unicode_normalization::UnicodeNormalization;
s.nfd().collect::<String>()
}
fn ref_nfkc(s: &str) -> String {
use unicode_normalization::UnicodeNormalization;
s.nfkc().collect::<String>()
}
fn ref_nfkd(s: &str) -> String {
use unicode_normalization::UnicodeNormalization;
s.nfkd().collect::<String>()
}
/// NFC result for `s` from the implementation under test (`simd_normalizer`).
///
/// The value returned by `normalize` is turned into an owned `String` with
/// `into_owned()` so it can be compared directly against the reference output.
fn our_nfc(s: &str) -> String {
    let normalizer = simd_normalizer::nfc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// NFD result for `s` from the implementation under test (`simd_normalizer`).
///
/// The value returned by `normalize` is turned into an owned `String` with
/// `into_owned()` so it can be compared directly against the reference output.
fn our_nfd(s: &str) -> String {
    let normalizer = simd_normalizer::nfd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// NFKC result for `s` from the implementation under test (`simd_normalizer`).
///
/// The value returned by `normalize` is turned into an owned `String` with
/// `into_owned()` so it can be compared directly against the reference output.
fn our_nfkc(s: &str) -> String {
    let normalizer = simd_normalizer::nfkc();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// NFKD result for `s` from the implementation under test (`simd_normalizer`).
///
/// The value returned by `normalize` is turned into an owned `String` with
/// `into_owned()` so it can be compared directly against the reference output.
fn our_nfkd(s: &str) -> String {
    let normalizer = simd_normalizer::nfkd();
    let normalized = normalizer.normalize(s);
    normalized.into_owned()
}
/// Strategy producing strings of 1 to 64 characters, each character drawn
/// from a fixed set of code point ranges spanning several scripts.
fn broad_unicode_strategy() -> impl Strategy<Value = String> {
    let any_char = prop::char::ranges(
        vec![
            '\u{0020}'..='\u{007E}',   // printable ASCII
            '\u{0100}'..='\u{017F}',   // Latin Extended-A
            '\u{0180}'..='\u{024F}',   // Latin Extended-B
            '\u{0300}'..='\u{036F}',   // Combining Diacritical Marks
            '\u{0370}'..='\u{03FF}',   // Greek and Coptic
            '\u{0400}'..='\u{04FF}',   // Cyrillic
            '\u{0590}'..='\u{05FF}',   // Hebrew
            '\u{0600}'..='\u{06FF}',   // Arabic
            '\u{0900}'..='\u{097F}',   // Devanagari
            '\u{0E00}'..='\u{0E7F}',   // Thai
            '\u{1100}'..='\u{11FF}',   // Hangul Jamo
            '\u{3040}'..='\u{309F}',   // Hiragana
            '\u{30A0}'..='\u{30FF}',   // Katakana
            '\u{4E00}'..='\u{4FFF}',   // first part of CJK Unified Ideographs
            '\u{AC00}'..='\u{AD00}',   // first Hangul syllables
            '\u{1F600}'..='\u{1F64F}', // Emoticons (supplementary plane)
        ]
        .into(),
    );

    prop::collection::vec(any_char, 1..=64).prop_map(|picked| picked.iter().collect::<String>())
}
/// Strategy producing strings of 1 to 32 characters, each character drawn
/// from ranges chosen for the compatibility-form (NFKC/NFKD) tests.
fn compat_decomp_strategy() -> impl Strategy<Value = String> {
    let compat_char = prop::char::ranges(
        vec![
            '\u{2150}'..='\u{215F}', // Number Forms: fractions
            '\u{2160}'..='\u{2188}', // Number Forms: Roman numerals
            '\u{2460}'..='\u{24FF}', // Enclosed Alphanumerics
            '\u{3300}'..='\u{33FF}', // CJK Compatibility
            '\u{F900}'..='\u{F9FF}', // CJK Compatibility Ideographs (first part)
            '\u{FB00}'..='\u{FB06}', // Latin ligatures
            '\u{FF01}'..='\u{FF5E}', // fullwidth ASCII variants
            '\u{FF65}'..='\u{FF9F}', // halfwidth Katakana variants
            '\u{2070}'..='\u{209F}', // Superscripts and Subscripts
        ]
        .into(),
    );

    prop::collection::vec(compat_char, 1..=32).prop_map(|picked| picked.iter().collect::<String>())
}
/// Strategy producing one ASCII letter followed by 4 to 30 characters drawn
/// from the combining-mark ranges listed below.
fn long_combining_strategy() -> impl Strategy<Value = String> {
    let starter = prop::char::ranges(vec!['A'..='Z', 'a'..='z'].into());
    let mark = prop::char::ranges(
        vec![
            '\u{0300}'..='\u{036F}', // Combining Diacritical Marks
            '\u{1AB0}'..='\u{1AFF}', // Combining Diacritical Marks Extended
            '\u{0483}'..='\u{0489}', // Cyrillic combining marks
            '\u{0591}'..='\u{05BD}', // Hebrew accents and points
            '\u{064B}'..='\u{065F}', // Arabic marks
            '\u{0E31}'..='\u{0E3A}', // Thai vowel signs
            '\u{20D0}'..='\u{20FF}', // Combining Diacritical Marks for Symbols
        ]
        .into(),
    );
    let trailing = prop::collection::vec(mark, 4..=30);

    (starter, trailing).prop_map(|(letter, marks)| {
        // One byte for the ASCII letter plus a 4-byte upper bound per mark.
        let mut text = String::with_capacity(1 + marks.len() * 4);
        text.push(letter);
        text.extend(marks);
        text
    })
}
/// Asserts that `ours` equals `reference`, panicking with a detailed report
/// on mismatch.
///
/// The report names the normalization `form` and shows, for the input and
/// both outputs, the byte length, the `escape_unicode` rendering and the
/// `U+XXXX` code point list.
fn assert_eq_normalized(form: &str, input: &str, ours: &str, reference: &str) {
    // Matching outputs need no diagnostics at all.
    if ours == reference {
        return;
    }

    // Only reached on divergence: build the diagnostics, then let
    // `assert_eq!` raise the panic with them attached.
    let input_points = codepoints(input);
    let ours_points = codepoints(ours);
    let reference_points = codepoints(reference);
    assert_eq!(
        ours,
        reference,
        "\n{form} divergence!\
         \n input (len={ilen}): {input_escaped}\
         \n ours (len={olen}): {ours_escaped}\
         \n ref (len={rlen}): {ref_escaped}\
         \n input code points: {input_cps}\
         \n ours code points: {ours_cps}\
         \n ref code points: {ref_cps}",
        form = form,
        ilen = input.len(),
        input_escaped = input.escape_unicode(),
        olen = ours.len(),
        ours_escaped = ours.escape_unicode(),
        rlen = reference.len(),
        ref_escaped = reference.escape_unicode(),
        input_cps = input_points,
        ours_cps = ours_points,
        ref_cps = reference_points,
    );
}
/// Renders every `char` of `s` as `U+XXXX` (uppercase hex, zero-padded to at
/// least four digits), separated by single spaces.
///
/// An empty input yields an empty string.
fn codepoints(s: &str) -> String {
    let mut rendered = String::new();
    for (position, ch) in s.chars().enumerate() {
        // Separator goes between entries only, never before the first one.
        if position != 0 {
            rendered.push(' ');
        }
        rendered.push_str(&format!("U+{:04X}", ch as u32));
    }
    rendered
}
// Differential property tests over mixed-script input: for each of the four
// normalization forms, the implementation under test must produce exactly the
// reference output. Each property runs 5000 generated cases.
proptest! {
    #![proptest_config(ProptestConfig::with_cases(5000))]

    #[test]
    fn differential_nfc(s in broad_unicode_strategy()) {
        assert_eq_normalized("NFC", &s, &our_nfc(&s), &ref_nfc(&s));
    }

    #[test]
    fn differential_nfd(s in broad_unicode_strategy()) {
        assert_eq_normalized("NFD", &s, &our_nfd(&s), &ref_nfd(&s));
    }

    #[test]
    fn differential_nfkc(s in broad_unicode_strategy()) {
        assert_eq_normalized("NFKC", &s, &our_nfkc(&s), &ref_nfkc(&s));
    }

    #[test]
    fn differential_nfkd(s in broad_unicode_strategy()) {
        assert_eq_normalized("NFKD", &s, &our_nfkd(&s), &ref_nfkd(&s));
    }
}
// Differential property tests for the compatibility forms, fed with strings
// from `compat_decomp_strategy`. Each property runs 2000 generated cases.
proptest! {
    #![proptest_config(ProptestConfig::with_cases(2000))]

    #[test]
    fn differential_nfkc_compat(s in compat_decomp_strategy()) {
        assert_eq_normalized("NFKC-compat", &s, &our_nfkc(&s), &ref_nfkc(&s));
    }

    #[test]
    fn differential_nfkd_compat(s in compat_decomp_strategy()) {
        assert_eq_normalized("NFKD-compat", &s, &our_nfkd(&s), &ref_nfkd(&s));
    }
}
// Differential property tests for the canonical forms, fed with strings from
// `long_combining_strategy` (one letter followed by many marks). Each
// property runs 1000 generated cases.
proptest! {
    #![proptest_config(ProptestConfig::with_cases(1000))]

    #[test]
    fn differential_nfc_long_combining(s in long_combining_strategy()) {
        assert_eq_normalized("NFC-long-combining", &s, &our_nfc(&s), &ref_nfc(&s));
    }

    #[test]
    fn differential_nfd_long_combining(s in long_combining_strategy()) {
        assert_eq_normalized("NFD-long-combining", &s, &our_nfd(&s), &ref_nfd(&s));
    }
}
/// Checks a fixed list of hand-picked inputs against the reference
/// implementation for all four normalization forms.
///
/// For every labelled input the forms are checked in the order NFC, NFD,
/// NFKC, NFKD. The first divergence panics with the form, the case label and
/// the code points of the input and of both outputs.
#[test]
fn differential_edge_cases() {
    // Signature shared by the `our_*` and `ref_*` wrapper functions.
    type Normalize = fn(&str) -> String;

    // (form name used in failure messages, implementation under test, reference).
    // Driving the checks from this table replaces four copy-pasted assertion
    // stanzas, so the message format lives in exactly one place.
    let forms: [(&str, Normalize, Normalize); 4] = [
        ("NFC", our_nfc, ref_nfc),
        ("NFD", our_nfd, ref_nfd),
        ("NFKC", our_nfkc, ref_nfkc),
        ("NFKD", our_nfkd, ref_nfkd),
    ];

    // (label shown on failure, input string).
    let cases: &[(&str, &str)] = &[
        ("empty", ""),
        ("ascii", "Hello, World! 0123456789"),
        ("precomposed-e-acute", "\u{00E9}"),
        ("decomposed-e-acute", "\u{0065}\u{0301}"),
        ("a-ring", "\u{0041}\u{030A}"),
        ("hangul-ga", "\u{AC00}"),
        ("hangul-gag", "\u{AC01}"),
        ("jamo-lv", "\u{1100}\u{1161}"),
        ("jamo-lvt", "\u{1100}\u{1161}\u{11A8}"),
        ("fi-ligature", "\u{FB01}"),
        ("ohm-sign", "\u{2126}"),
        ("angstrom", "\u{212B}"),
        ("hiragana-ga-precomposed", "\u{304C}"),
        ("hiragana-ga-decomposed", "\u{304B}\u{3099}"),
        ("ccc-reorder-1", "\u{0065}\u{0327}\u{0301}"),
        ("ccc-reorder-2", "\u{0065}\u{0301}\u{0327}"),
        (
            "long-combining",
            "A\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030A}\u{030B}\u{030C}\u{030D}\u{030E}\u{030F}",
        ),
        (
            "mixed-scripts",
            "Hello\u{0301} \u{0410}\u{0308} \u{05D0}\u{05B0} \u{0627}\u{064E} \u{3042}\u{3099}",
        ),
        (
            "emoji-zwj",
            "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}",
        ),
        ("fullwidth", "\u{FF21}\u{FF22}\u{FF23}"),
        ("already-nfc", "Caf\u{00E9} na\u{00EF}ve"),
        ("orphan-combiners", "\u{0300}\u{0301}\u{0302}"),
        ("supplementary", "\u{1F600}"),
        ("repeated-decomposable", "\u{00C0}\u{00C0}\u{00C0}\u{00C0}"),
    ];

    for &(label, input) in cases {
        for &(form, normalize_ours, normalize_ref) in &forms {
            let ours = normalize_ours(input);
            let reference = normalize_ref(input);
            assert_eq!(
                ours,
                reference,
                "{form} edge case {label:?} failed\n input codepoints: {input_cps}\n ours: {ours_cps}\n ref: {ref_cps}",
                form = form,
                label = label,
                input_cps = codepoints(input),
                ours_cps = codepoints(&ours),
                ref_cps = codepoints(&reference),
            );
        }
    }
}