use unicode_normalization::UnicodeNormalization;
/// Checks that `form` names a supported Unicode normalization form.
///
/// The accepted names are exactly `"NFC"`, `"NFD"`, `"NFKC"` and `"NFKD"`; the comparison is
/// case-sensitive and no trimming is performed.
///
/// # Errors
///
/// Returns `ErrorRepr::InvalidNormForm` carrying an owned copy of the rejected name when
/// `form` is anything else.
#[inline]
pub(crate) fn validate_form(form: &str) -> Result<(), crate::ErrorRepr> {
    match form {
        "NFC" | "NFD" | "NFKC" | "NFKD" => Ok(()),
        unknown => Err(crate::ErrorRepr::InvalidNormForm {
            got: unknown.to_owned(),
        }),
    }
}
/// Normalizes `text` to the given `form` and returns the result as a new `String`.
///
/// This is a convenience wrapper that allocates an empty buffer and delegates to
/// [`normalize_into`].
///
/// # Errors
///
/// Propagates whatever error [`normalize_into`] reports for `form`.
pub(crate) fn normalize(text: &str, form: &str) -> Result<String, crate::ErrorRepr> {
    let mut buffer = String::new();
    // On success hand the filled buffer back; on failure the error passes through untouched.
    normalize_into(text, form, &mut buffer).map(|()| buffer)
}
/// Normalizes `text` to the given `form`, writing the result into `out`.
///
/// Any previous contents of `out` are discarded before writing; the existing `String` is
/// reused rather than replaced.
///
/// # Errors
///
/// Returns the error from [`validate_form`] when `form` is not a supported name. Validation
/// happens before `out` is cleared, so `out` is left unmodified in that case.
pub(crate) fn normalize_into(
    text: &str,
    form: &str,
    out: &mut String,
) -> Result<(), crate::ErrorRepr> {
    // Reject unknown forms first so a failed call never touches the caller's buffer.
    validate_form(form)?;
    out.clear();

    if text.is_ascii() {
        // ASCII-only input is copied through verbatim, bypassing the normalizer entirely.
        out.push_str(text);
    } else {
        match form {
            "NFC" => out.extend(text.nfc()),
            "NFD" => out.extend(text.nfd()),
            "NFKC" => out.extend(text.nfkc()),
            "NFKD" => out.extend(text.nfkd()),
            _ => unreachable!("validate_form guarantees a known normalization form"),
        }
    }

    Ok(())
}
/// Reports whether `text` is already in the Unicode normalization form named by `form`.
///
/// `form` must be one of `"NFC"`, `"NFD"`, `"NFKC"` or `"NFKD"`.
///
/// # Errors
///
/// Returns the error from [`validate_form`] when `form` is not a supported name.
pub(crate) fn is_normalized(text: &str, form: &str) -> Result<bool, crate::ErrorRepr> {
    validate_form(form)?;

    // `unicode_normalization::is_nf*` are the crate's authoritative checks: they run the
    // quick check and, when it is inconclusive, already compare `text` against its
    // normalized form. A `false` here is therefore final, and re-normalizing the whole
    // input afterwards could only repeat the work and reach the same answer.
    let normalized = match form {
        "NFC" => unicode_normalization::is_nfc(text),
        "NFD" => unicode_normalization::is_nfd(text),
        "NFKC" => unicode_normalization::is_nfkc(text),
        "NFKD" => unicode_normalization::is_nfkd(text),
        _ => unreachable!("validate_form guarantees a known normalization form"),
    };

    Ok(normalized)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// NFC composes `e` followed by U+0301 into the single code point U+00E9.
    #[test]
    fn test_nfc_roundtrip() {
        let decomposed = "caf\u{0065}\u{0301}";
        let composed = normalize(decomposed, "NFC").unwrap();
        assert_eq!(composed, "caf\u{00e9}");
    }

    /// Both a short string and a multi-megabyte string are normalized successfully.
    #[test]
    fn test_normalize_accepts_input_without_size_cap() {
        assert!(normalize("Héllo wörld", "NFKD").is_ok());

        let oversized = "é".repeat(2 * 1024 * 1024);
        assert!(normalize(&oversized, "NFKD").is_ok());
    }

    mod proptest_properties {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            // Normalizing an already-normalized string must leave it unchanged.
            #[test]
            fn normalize_idempotent(
                s in "\\PC*",
                form in prop_oneof!["NFC", "NFD", "NFKC", "NFKD"],
            ) {
                if let Ok(first_pass) = normalize(&s, &form) {
                    let second_pass = normalize(&first_pass, &form).unwrap();
                    prop_assert_eq!(&first_pass, &second_pass);
                }
            }

            // Whatever `normalize` produces must be accepted by `is_normalized` for that form.
            #[test]
            fn normalize_then_is_normalized(
                s in "\\PC*",
                form in prop_oneof!["NFC", "NFD", "NFKC", "NFKD"],
            ) {
                if let Ok(output) = normalize(&s, &form) {
                    let verdict = is_normalized(&output, &form).unwrap();
                    prop_assert!(verdict);
                }
            }

            // NFKC output must also be reported as NFC-normalized.
            #[test]
            fn nfkc_implies_nfc(s in "\\PC*") {
                if let Ok(compat_composed) = normalize(&s, "NFKC") {
                    let verdict = is_normalized(&compat_composed, "NFC").unwrap();
                    prop_assert!(verdict);
                }
            }

            // NFKD output must also be reported as NFD-normalized.
            #[test]
            fn nfkd_implies_nfd(s in "\\PC*") {
                if let Ok(compat_decomposed) = normalize(&s, "NFKD") {
                    let verdict = is_normalized(&compat_decomposed, "NFD").unwrap();
                    prop_assert!(verdict);
                }
            }
        }
    }
}