use proptest::prelude::*;
use simd_normalizer::UnicodeNormalization;
use simd_normalizer::matching::{MatchingOptions, normalize_for_matching};
use simd_normalizer::{CaseFoldMode, casefold, skeleton};
use std::borrow::Cow;
fn unicode_string_strategy() -> impl Strategy<Value = String> {
let ranges = prop::char::ranges(std::borrow::Cow::Borrowed(&[
'\u{0020}'..='\u{007E}',
'\u{0100}'..='\u{024F}',
'\u{0300}'..='\u{036F}',
'\u{0400}'..='\u{04FF}',
'\u{0600}'..='\u{06FF}',
'\u{0900}'..='\u{097F}',
'\u{1100}'..='\u{11FF}',
'\u{3040}'..='\u{309F}',
'\u{4E00}'..='\u{4FFF}',
'\u{AC00}'..='\u{D7A3}',
'\u{1F600}'..='\u{1F64F}',
]));
prop::collection::vec(ranges, 1..64).prop_map(|chars| chars.into_iter().collect::<String>())
}
fn combining_heavy_strategy() -> impl Strategy<Value = String> {
let base_chars = prop::char::ranges(std::borrow::Cow::Borrowed(&[
'\u{0041}'..='\u{005A}', '\u{0061}'..='\u{007A}', '\u{00C0}'..='\u{00FF}', ]));
let combining = prop::char::ranges(std::borrow::Cow::Borrowed(&[
'\u{0300}'..='\u{036F}', ]));
prop::collection::vec((base_chars, prop::collection::vec(combining, 1..=8)), 1..=8).prop_map(
|segments| {
let mut s = String::new();
for (base, marks) in segments {
s.push(base);
for m in marks {
s.push(m);
}
}
s
},
)
}
fn hangul_strategy() -> impl Strategy<Value = String> {
let syllables = prop::char::ranges(std::borrow::Cow::Borrowed(&[
'\u{AC00}'..='\u{D7A3}', ]));
let leading = prop::char::ranges(std::borrow::Cow::Borrowed(&[
'\u{1100}'..='\u{1112}', ]));
let vowel = prop::char::ranges(std::borrow::Cow::Borrowed(&[
'\u{1161}'..='\u{1175}', ]));
let trailing = prop::char::ranges(std::borrow::Cow::Borrowed(&[
'\u{11A8}'..='\u{11C2}', ]));
prop::collection::vec(
prop_oneof![
syllables.prop_map(|c| vec![c]),
(leading.clone(), vowel.clone()).prop_map(|(l, v)| vec![l, v]),
(leading, vowel, trailing).prop_map(|(l, v, t)| vec![l, v, t]),
],
1..=16,
)
.prop_map(|groups| {
let mut s = String::new();
for group in groups {
for ch in group {
s.push(ch);
}
}
s
})
}
// Properties of the four normalization forms over `unicode_string_strategy()`
// input, 2000 cases per property.
proptest! {
    #![proptest_config(ProptestConfig::with_cases(2000))]

    // ---- Idempotence: normalizing a normalized string changes nothing. ----

    #[test]
    fn idempotence_nfc(s in unicode_string_strategy()) {
        let first_pass = s.nfc();
        let second_pass = first_pass.nfc();
        prop_assert_eq!(&*first_pass, &*second_pass, "NFC is not idempotent");
    }

    #[test]
    fn idempotence_nfd(s in unicode_string_strategy()) {
        let first_pass = s.nfd();
        let second_pass = first_pass.nfd();
        prop_assert_eq!(&*first_pass, &*second_pass, "NFD is not idempotent");
    }

    #[test]
    fn idempotence_nfkc(s in unicode_string_strategy()) {
        let first_pass = s.nfkc();
        let second_pass = first_pass.nfkc();
        prop_assert_eq!(&*first_pass, &*second_pass, "NFKC is not idempotent");
    }

    #[test]
    fn idempotence_nfkd(s in unicode_string_strategy()) {
        let first_pass = s.nfkd();
        let second_pass = first_pass.nfkd();
        prop_assert_eq!(&*first_pass, &*second_pass, "NFKD is not idempotent");
    }

    // ---- Round trips: going through the opposite form and back must ----
    // ---- reproduce the starting form.                               ----

    #[test]
    fn roundtrip_nfc_nfd(s in unicode_string_strategy()) {
        let composed = s.nfc();
        let decomposed = composed.nfd();
        let recomposed = decomposed.nfc();
        prop_assert_eq!(&*composed, &*recomposed, "nfc(nfd(nfc(s))) != nfc(s)");
    }

    #[test]
    fn roundtrip_nfd_nfc(s in unicode_string_strategy()) {
        let decomposed = s.nfd();
        let composed = decomposed.nfc();
        let redecomposed = composed.nfd();
        prop_assert_eq!(&*decomposed, &*redecomposed, "nfd(nfc(nfd(s))) != nfd(s)");
    }

    #[test]
    fn roundtrip_nfkc_nfkd(s in unicode_string_strategy()) {
        let composed = s.nfkc();
        let decomposed = composed.nfkd();
        let recomposed = decomposed.nfkc();
        prop_assert_eq!(&*composed, &*recomposed, "nfkc(nfkd(nfkc(s))) != nfkc(s)");
    }

    #[test]
    fn roundtrip_nfkd_nfkc(s in unicode_string_strategy()) {
        let decomposed = s.nfkd();
        let composed = decomposed.nfkc();
        let redecomposed = composed.nfkd();
        prop_assert_eq!(&*decomposed, &*redecomposed, "nfkd(nfkc(nfkd(s))) != nfkd(s)");
    }

    // ---- Borrowing: when the `is_*` check accepts a string, the matching ----
    // ---- normalizer must return `Cow::Borrowed` of that exact `str`.     ----

    #[test]
    fn cow_borrowed_nfc(s in unicode_string_strategy()) {
        // Copy the normalized text into a fresh allocation so pointer identity
        // is checked against a string this test owns.
        let normalized = s.nfc();
        let owned = String::from(&*normalized);
        let source: &str = owned.as_str();

        let renormalized = source.nfc();
        if source.is_nfc() {
            match renormalized {
                Cow::Borrowed(view) => {
                    prop_assert!(
                        core::ptr::eq(view, source),
                        "NFC Cow::Borrowed pointer mismatch for already-NFC string"
                    );
                }
                Cow::Owned(_) => {
                    prop_assert!(false, "is_nfc() returned true but nfc() returned Owned");
                }
            }
        }
    }

    #[test]
    fn cow_borrowed_nfd(s in unicode_string_strategy()) {
        let normalized = s.nfd();
        let owned = String::from(&*normalized);
        let source: &str = owned.as_str();

        let renormalized = source.nfd();
        if source.is_nfd() {
            match renormalized {
                Cow::Borrowed(view) => {
                    prop_assert!(
                        core::ptr::eq(view, source),
                        "NFD Cow::Borrowed pointer mismatch for already-NFD string"
                    );
                }
                Cow::Owned(_) => {
                    prop_assert!(false, "is_nfd() returned true but nfd() returned Owned");
                }
            }
        }
    }

    #[test]
    fn cow_borrowed_nfkc(s in unicode_string_strategy()) {
        let normalized = s.nfkc();
        let owned = String::from(&*normalized);
        let source: &str = owned.as_str();

        let renormalized = source.nfkc();
        if source.is_nfkc() {
            match renormalized {
                Cow::Borrowed(view) => {
                    prop_assert!(
                        core::ptr::eq(view, source),
                        "NFKC Cow::Borrowed pointer mismatch for already-NFKC string"
                    );
                }
                Cow::Owned(_) => {
                    prop_assert!(false, "is_nfkc() returned true but nfkc() returned Owned");
                }
            }
        }
    }

    #[test]
    fn cow_borrowed_nfkd(s in unicode_string_strategy()) {
        let normalized = s.nfkd();
        let owned = String::from(&*normalized);
        let source: &str = owned.as_str();

        let renormalized = source.nfkd();
        if source.is_nfkd() {
            match renormalized {
                Cow::Borrowed(view) => {
                    prop_assert!(
                        core::ptr::eq(view, source),
                        "NFKD Cow::Borrowed pointer mismatch for already-NFKD string"
                    );
                }
                Cow::Owned(_) => {
                    prop_assert!(false, "is_nfkd() returned true but nfkd() returned Owned");
                }
            }
        }
    }

    // ---- Consistency: if normalizing leaves the input untouched, the ----
    // ---- matching `is_*` check must accept that input.               ----

    #[test]
    fn is_nfc_consistency(s in unicode_string_strategy()) {
        let normalized = s.nfc();
        let unchanged = &*normalized == s.as_str();
        if unchanged {
            prop_assert!(s.is_nfc(), "nfc(s) == s but is_nfc(s) is false");
        }
    }

    #[test]
    fn is_nfd_consistency(s in unicode_string_strategy()) {
        let normalized = s.nfd();
        let unchanged = &*normalized == s.as_str();
        if unchanged {
            prop_assert!(s.is_nfd(), "nfd(s) == s but is_nfd(s) is false");
        }
    }

    #[test]
    fn is_nfkc_consistency(s in unicode_string_strategy()) {
        let normalized = s.nfkc();
        let unchanged = &*normalized == s.as_str();
        if unchanged {
            prop_assert!(s.is_nfkc(), "nfkc(s) == s but is_nfkc(s) is false");
        }
    }

    #[test]
    fn is_nfkd_consistency(s in unicode_string_strategy()) {
        let normalized = s.nfkd();
        let unchanged = &*normalized == s.as_str();
        if unchanged {
            prop_assert!(s.is_nfkd(), "nfkd(s) == s but is_nfkd(s) is false");
        }
    }
}
// Idempotence and round-trip properties over `combining_heavy_strategy()`
// input, 1000 cases per property.
proptest! {
    #![proptest_config(ProptestConfig::with_cases(1000))]

    #[test]
    fn combining_heavy_idempotence_nfc(s in combining_heavy_strategy()) {
        let first_pass = s.nfc();
        let second_pass = first_pass.nfc();
        prop_assert_eq!(
            &*first_pass,
            &*second_pass,
            "NFC not idempotent on combining-heavy input"
        );
    }

    #[test]
    fn combining_heavy_idempotence_nfd(s in combining_heavy_strategy()) {
        let first_pass = s.nfd();
        let second_pass = first_pass.nfd();
        prop_assert_eq!(
            &*first_pass,
            &*second_pass,
            "NFD not idempotent on combining-heavy input"
        );
    }

    #[test]
    fn combining_heavy_idempotence_nfkc(s in combining_heavy_strategy()) {
        let first_pass = s.nfkc();
        let second_pass = first_pass.nfkc();
        prop_assert_eq!(
            &*first_pass,
            &*second_pass,
            "NFKC not idempotent on combining-heavy input"
        );
    }

    #[test]
    fn combining_heavy_idempotence_nfkd(s in combining_heavy_strategy()) {
        let first_pass = s.nfkd();
        let second_pass = first_pass.nfkd();
        prop_assert_eq!(
            &*first_pass,
            &*second_pass,
            "NFKD not idempotent on combining-heavy input"
        );
    }

    // NFC -> NFD -> NFC must land back on the first NFC result.
    #[test]
    fn combining_heavy_roundtrip_nfc_nfd(s in combining_heavy_strategy()) {
        let composed = s.nfc();
        let decomposed = composed.nfd();
        let recomposed = decomposed.nfc();
        prop_assert_eq!(
            &*composed,
            &*recomposed,
            "NFC->NFD->NFC round-trip failed on combining-heavy"
        );
    }

    // NFD -> NFC -> NFD must land back on the first NFD result.
    #[test]
    fn combining_heavy_roundtrip_nfd_nfc(s in combining_heavy_strategy()) {
        let decomposed = s.nfd();
        let composed = decomposed.nfc();
        let redecomposed = composed.nfd();
        prop_assert_eq!(
            &*decomposed,
            &*redecomposed,
            "NFD->NFC->NFD round-trip failed on combining-heavy"
        );
    }
}
// Idempotence and round-trip properties over `hangul_strategy()` input,
// 1000 cases per property.
proptest! {
    #![proptest_config(ProptestConfig::with_cases(1000))]

    #[test]
    fn hangul_idempotence_nfc(s in hangul_strategy()) {
        let first_pass = s.nfc();
        let second_pass = first_pass.nfc();
        prop_assert_eq!(&*first_pass, &*second_pass, "NFC not idempotent on Hangul input");
    }

    #[test]
    fn hangul_idempotence_nfd(s in hangul_strategy()) {
        let first_pass = s.nfd();
        let second_pass = first_pass.nfd();
        prop_assert_eq!(&*first_pass, &*second_pass, "NFD not idempotent on Hangul input");
    }

    // NFC -> NFD -> NFC must land back on the first NFC result.
    #[test]
    fn hangul_roundtrip_nfc_nfd(s in hangul_strategy()) {
        let composed = s.nfc();
        let decomposed = composed.nfd();
        let recomposed = decomposed.nfc();
        prop_assert_eq!(&*composed, &*recomposed, "Hangul NFC->NFD->NFC round-trip failed");
    }

    // NFD -> NFC -> NFD must land back on the first NFD result.
    #[test]
    fn hangul_roundtrip_nfd_nfc(s in hangul_strategy()) {
        let decomposed = s.nfd();
        let composed = decomposed.nfc();
        let redecomposed = composed.nfd();
        prop_assert_eq!(&*decomposed, &*redecomposed, "Hangul NFD->NFC->NFD round-trip failed");
    }
}
// Case folding applied to already-folded text must return the same text,
// checked for both the Standard and Turkish modes (1000 cases each).
proptest! {
    #![proptest_config(ProptestConfig::with_cases(1000))]

    #[test]
    fn casefold_idempotent(s in unicode_string_strategy()) {
        let folded = casefold(&s, CaseFoldMode::Standard);
        let refolded = casefold(&folded, CaseFoldMode::Standard);
        prop_assert_eq!(&*folded, &*refolded, "casefold not idempotent");
    }

    #[test]
    fn casefold_turkish_idempotent(s in unicode_string_strategy()) {
        let folded = casefold(&s, CaseFoldMode::Turkish);
        let refolded = casefold(&folded, CaseFoldMode::Turkish);
        prop_assert_eq!(&*folded, &*refolded, "Turkish casefold not idempotent");
    }
}
// `skeleton` is only required to reach a fixed point by its second
// application: the second and third passes are compared, while the first is
// just the input to the second. Runs 500 cases.
proptest! {
    #![proptest_config(ProptestConfig::with_cases(500))]

    #[test]
    fn skeleton_converges_in_two_passes(s in unicode_string_strategy()) {
        let first_pass = skeleton(&s);
        let second_pass = skeleton(&first_pass);
        let third_pass = skeleton(&second_pass);
        prop_assert_eq!(
            second_pass,
            third_pass,
            "skeleton did not converge after two passes"
        );
    }
}
// With default options, `normalize_for_matching` applied to its own output
// must return that output unchanged. Runs 500 cases.
proptest! {
    #![proptest_config(ProptestConfig::with_cases(500))]

    #[test]
    fn matching_idempotent(s in unicode_string_strategy()) {
        let options = MatchingOptions::default();
        let normalized = normalize_for_matching(&s, &options);
        let renormalized = normalize_for_matching(&normalized, &options);
        prop_assert_eq!(
            normalized,
            renormalized,
            "normalize_for_matching not idempotent"
        );
    }
}
/// Walks all 11172 code points starting at U+AC00 and, for each one, checks
/// that:
///
/// 1. its NFD form contains only characters in U+1100..=U+11FF,
/// 2. NFC of that NFD form gives back the original single-character string,
/// 3. the original string is reported as NFC by `is_nfc`.
#[test]
fn hangul_exhaustive_all_11172_syllables() {
    const S_BASE: u32 = 0xAC00;
    const S_COUNT: u32 = 11172;

    for cp in S_BASE..S_BASE + S_COUNT {
        let syllable = char::from_u32(cp).expect("valid Hangul syllable");
        let input = syllable.to_string();

        let decomposed = input.nfd();
        for piece in decomposed.chars() {
            let piece_cp = u32::from(piece);
            assert!(
                (0x1100..=0x11FF).contains(&piece_cp),
                "Hangul U+{cp:04X} decomposed to non-Jamo U+{:04X}",
                piece_cp
            );
        }

        let recomposed = decomposed.nfc();
        assert_eq!(
            &*recomposed,
            input.as_str(),
            "Hangul U+{cp:04X} didn't round-trip NFC(NFD(s))"
        );

        assert!(input.is_nfc(), "Hangul U+{cp:04X} not recognized as NFC");
    }
}