#![forbid(unsafe_code)]
use unicode_normalization::UnicodeNormalization;
/// Selects which Unicode normalization form an operation in this module
/// should apply or test for.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NormForm {
/// Canonical composition: combining sequences are composed where possible
/// (for example `"e\u{0301}"` becomes `"\u{00E9}"`).
Nfc,
/// Canonical decomposition: precomposed characters are split apart
/// (for example `"\u{00E9}"` becomes `"e\u{0301}"`).
Nfd,
/// Compatibility composition: compatibility characters are replaced as well
/// (for example the ligature `"\u{FB01}"` becomes `"fi"`).
Nfkc,
/// Compatibility decomposition: compatibility characters are replaced and
/// the result is left decomposed.
Nfkd,
}
/// Returns a newly allocated copy of `s` converted to the requested
/// normalization `form`.
///
/// The input is never modified; the result is always an owned `String`, even
/// when `s` is already normalized.
#[must_use]
pub fn normalize(s: &str, form: NormForm) -> String {
    let mut out = String::new();
    // Each arm feeds the matching normalization iterator into the buffer.
    match form {
        NormForm::Nfc => out.extend(s.nfc()),
        NormForm::Nfd => out.extend(s.nfd()),
        NormForm::Nfkc => out.extend(s.nfkc()),
        NormForm::Nfkd => out.extend(s.nfkd()),
    }
    out
}
/// Reports whether `s` is already in the given normalization `form`.
///
/// Delegates to the matching `is_nf*` predicate of the
/// `unicode_normalization` crate.
#[must_use]
pub fn is_normalized(s: &str, form: NormForm) -> bool {
    // Pick the predicate first, then apply it once.
    let check: fn(&str) -> bool = match form {
        NormForm::Nfc => unicode_normalization::is_nfc,
        NormForm::Nfd => unicode_normalization::is_nfd,
        NormForm::Nfkc => unicode_normalization::is_nfkc,
        NormForm::Nfkd => unicode_normalization::is_nfkd,
    };
    check(s)
}
/// Builds a search key for `s`: NFKC-normalized, lowercased, then
/// NFKC-normalized once more.
///
/// The second normalization pass is required because lowercasing can emit a
/// decomposed sequence (for example 'İ' U+0130 becomes "i" followed by
/// U+0307), which may leave combining marks out of canonical order. Without
/// it, two canonically equivalent inputs could produce different keys.
///
/// Case is removed with `str::to_lowercase`, not full Unicode case folding,
/// so strings that differ only by case folding (such as "ß" and "ss") still
/// yield different keys.
#[must_use]
pub fn normalize_for_search(s: &str) -> String {
    let lowered = s.nfkc().collect::<String>().to_lowercase();
    // Re-normalize so the returned key is itself in NFKC.
    lowered.nfkc().collect()
}
#[must_use]
pub fn eq_normalized(a: &str, b: &str, form: NormForm) -> bool {
normalize(a, form) == normalize(b, form)
}
/// Returns an iterator over the characters of `s` in NFC form.
///
/// The iterator borrows `s`; collecting it yields the same text as
/// `normalize(s, NormForm::Nfc)` without forcing a `String` allocation.
pub fn nfc_iter(s: &str) -> impl Iterator<Item = char> + '_ {
s.nfc()
}
/// Returns an iterator over the characters of `s` in NFD form.
///
/// The iterator borrows `s`; collecting it yields the same text as
/// `normalize(s, NormForm::Nfd)` without forcing a `String` allocation.
pub fn nfd_iter(s: &str) -> impl Iterator<Item = char> + '_ {
s.nfd()
}
/// Returns an iterator over the characters of `s` in NFKC form.
///
/// The iterator borrows `s`; collecting it yields the same text as
/// `normalize(s, NormForm::Nfkc)` without forcing a `String` allocation.
pub fn nfkc_iter(s: &str) -> impl Iterator<Item = char> + '_ {
s.nfkc()
}
/// Returns an iterator over the characters of `s` in NFKD form.
///
/// The iterator borrows `s`; collecting it yields the same text as
/// `normalize(s, NormForm::Nfkd)` without forcing a `String` allocation.
pub fn nfkd_iter(s: &str) -> impl Iterator<Item = char> + '_ {
s.nfkd()
}
/// Unit tests for the normalization helpers: the four forms, the
/// `is_normalized` predicate, normalized equality, search keys, and the
/// iterator wrappers.
#[cfg(test)]
mod tests {
    use super::*;

    // --- NFC -------------------------------------------------------------

    #[test]
    fn nfc_composes_combining_characters() {
        let input = "e\u{0301}";
        let result = normalize(input, NormForm::Nfc);
        assert_eq!(result, "\u{00E9}");
    }

    #[test]
    fn nfc_preserves_already_composed() {
        let input = "\u{00E9}";
        let result = normalize(input, NormForm::Nfc);
        assert_eq!(result, "\u{00E9}");
    }

    #[test]
    fn nfc_multiple_combining() {
        let input = "a\u{0303}\u{0301}";
        let result = normalize(input, NormForm::Nfc);
        // The tilde composes with the base letter; the acute has no
        // precomposed partner and stays as a combining mark.
        assert_eq!(result, "\u{00E3}\u{0301}");
        assert!(is_normalized(&result, NormForm::Nfc));
    }

    // --- NFD -------------------------------------------------------------

    #[test]
    fn nfd_decomposes_precomposed() {
        let input = "\u{00E9}";
        let result = normalize(input, NormForm::Nfd);
        assert_eq!(result, "e\u{0301}");
    }

    #[test]
    fn nfd_preserves_ascii() {
        let input = "hello world";
        let result = normalize(input, NormForm::Nfd);
        assert_eq!(result, "hello world");
    }

    // --- NFKC ------------------------------------------------------------

    #[test]
    fn nfkc_normalizes_compatibility() {
        let input = "\u{FB01}";
        let result = normalize(input, NormForm::Nfkc);
        assert_eq!(result, "fi");
    }

    #[test]
    fn nfkc_normalizes_fullwidth() {
        let input = "\u{FF21}";
        let result = normalize(input, NormForm::Nfkc);
        assert_eq!(result, "A");
    }

    #[test]
    fn nfkc_normalizes_superscript() {
        let input = "\u{00B2}";
        let result = normalize(input, NormForm::Nfkc);
        assert_eq!(result, "2");
    }

    // --- NFKD ------------------------------------------------------------

    #[test]
    fn nfkd_decomposes_compatibility() {
        let input = "\u{FB01}";
        let result = normalize(input, NormForm::Nfkd);
        assert_eq!(result, "fi");
    }

    #[test]
    fn nfkd_decomposes_and_does_not_compose() {
        let input = "\u{00E9}";
        let result = normalize(input, NormForm::Nfkd);
        assert_eq!(result, "e\u{0301}");
    }

    // --- is_normalized ---------------------------------------------------

    #[test]
    fn is_nfc_on_composed() {
        assert!(is_normalized("\u{00E9}", NormForm::Nfc));
    }

    #[test]
    fn is_nfc_on_decomposed() {
        assert!(!is_normalized("e\u{0301}", NormForm::Nfc));
    }

    #[test]
    fn is_nfd_on_decomposed() {
        assert!(is_normalized("e\u{0301}", NormForm::Nfd));
    }

    #[test]
    fn is_nfd_on_composed() {
        assert!(!is_normalized("\u{00E9}", NormForm::Nfd));
    }

    #[test]
    fn ascii_is_all_forms() {
        let ascii = "hello world 123";
        assert!(is_normalized(ascii, NormForm::Nfc));
        assert!(is_normalized(ascii, NormForm::Nfd));
        assert!(is_normalized(ascii, NormForm::Nfkc));
        assert!(is_normalized(ascii, NormForm::Nfkd));
    }

    // --- eq_normalized ---------------------------------------------------

    #[test]
    fn eq_normalized_composed_vs_decomposed() {
        assert!(eq_normalized("\u{00E9}", "e\u{0301}", NormForm::Nfc));
        assert!(eq_normalized("\u{00E9}", "e\u{0301}", NormForm::Nfd));
    }

    #[test]
    fn eq_normalized_different_strings() {
        assert!(!eq_normalized("a", "b", NormForm::Nfc));
    }

    // --- normalize_for_search --------------------------------------------

    #[test]
    fn search_normalization_case_folds() {
        assert_eq!(normalize_for_search("Hello"), normalize_for_search("hello"));
    }

    #[test]
    fn search_normalization_handles_accents() {
        let a = normalize_for_search("\u{00E9}");
        let b = normalize_for_search("e\u{0301}");
        assert_eq!(a, b);
    }

    #[test]
    fn search_normalization_compatibility() {
        let a = normalize_for_search("\u{FF21}");
        let b = normalize_for_search("a");
        assert_eq!(a, b);
    }

    // --- iterator wrappers -----------------------------------------------

    #[test]
    fn nfc_iter_matches_normalize() {
        let input = "e\u{0301} cafe\u{0301}";
        let iter_result: String = nfc_iter(input).collect();
        let norm_result = normalize(input, NormForm::Nfc);
        assert_eq!(iter_result, norm_result);
    }

    #[test]
    fn nfd_iter_matches_normalize() {
        let input = "\u{00E9} caf\u{00E9}";
        let iter_result: String = nfd_iter(input).collect();
        let norm_result = normalize(input, NormForm::Nfd);
        assert_eq!(iter_result, norm_result);
    }

    // --- edge cases ------------------------------------------------------

    #[test]
    fn empty_string_all_forms() {
        // The empty string is its own normalization in every form, and the
        // predicate must agree for every form as well.
        for form in [NormForm::Nfc, NormForm::Nfd, NormForm::Nfkc, NormForm::Nfkd] {
            assert_eq!(normalize("", form), "", "form: {:?}", form);
            assert!(is_normalized("", form), "form: {:?}", form);
        }
    }

    #[test]
    fn hangul_composition() {
        let decomposed = "\u{1112}\u{1161}\u{11AB}";
        let composed = normalize(decomposed, NormForm::Nfc);
        assert_eq!(composed, "\u{D55C}");
    }

    #[test]
    fn hangul_decomposition() {
        let composed = "\u{D55C}";
        let decomposed = normalize(composed, NormForm::Nfd);
        assert_eq!(decomposed, "\u{1112}\u{1161}\u{11AB}");
    }

    #[test]
    fn mixed_script_normalization() {
        let input = "Hello 世界 🌍";
        let result = normalize(input, NormForm::Nfc);
        assert_eq!(result, input);
    }

    #[test]
    fn long_combining_sequence() {
        let mut input = String::from("a");
        input.push_str(&"\u{0300}".repeat(20));

        let result = normalize(&input, NormForm::Nfc);

        // Only the first grave accent can compose with the base letter; the
        // remaining nineteen stay as combining marks.
        let mut expected = String::from("\u{00E0}");
        expected.push_str(&"\u{0300}".repeat(19));
        assert_eq!(result, expected);
        assert!(is_normalized(&result, NormForm::Nfc));
    }

    #[test]
    fn canonical_ordering() {
        let a = normalize("A\u{0327}\u{0301}", NormForm::Nfc);
        let b = normalize("A\u{0301}\u{0327}", NormForm::Nfc);
        assert_eq!(a, b, "Canonical ordering should make these equivalent");
    }

    #[test]
    fn is_nfkc_on_ascii() {
        assert!(is_normalized("hello", NormForm::Nfkc));
    }

    #[test]
    fn is_nfkc_false_for_compatibility_char() {
        assert!(!is_normalized("\u{FF21}", NormForm::Nfkc));
    }

    #[test]
    fn is_nfkd_false_for_composed() {
        assert!(!is_normalized("\u{00E9}", NormForm::Nfkd));
    }

    #[test]
    fn is_nfkd_true_for_decomposed_ascii() {
        assert!(is_normalized("abc", NormForm::Nfkd));
    }

    #[test]
    fn eq_normalized_compatibility_ligature() {
        assert!(eq_normalized("\u{FB01}", "fi", NormForm::Nfkc));
        assert!(eq_normalized("\u{FB01}", "fi", NormForm::Nfkd));
    }

    #[test]
    fn eq_normalized_fullwidth_vs_ascii() {
        assert!(eq_normalized("\u{FF21}", "A", NormForm::Nfkc));
    }

    #[test]
    fn eq_normalized_false_for_different_base() {
        assert!(!eq_normalized("a\u{0301}", "o\u{0301}", NormForm::Nfc));
    }

    #[test]
    fn nfkc_iter_matches_normalize() {
        let input = "\u{FB01}\u{FF21}\u{00B2}";
        let iter_result: String = nfkc_iter(input).collect();
        let norm_result = normalize(input, NormForm::Nfkc);
        assert_eq!(iter_result, norm_result);
    }

    #[test]
    fn nfkd_iter_matches_normalize() {
        let input = "\u{00E9}\u{FB01}";
        let iter_result: String = nfkd_iter(input).collect();
        let norm_result = normalize(input, NormForm::Nfkd);
        assert_eq!(iter_result, norm_result);
    }

    #[test]
    fn normalize_is_idempotent_nfc() {
        let input = "e\u{0301} caf\u{00E9}";
        let once = normalize(input, NormForm::Nfc);
        let twice = normalize(&once, NormForm::Nfc);
        assert_eq!(once, twice);
    }

    #[test]
    fn normalize_is_idempotent_nfkd() {
        let input = "\u{FB01}\u{00E9}";
        let once = normalize(input, NormForm::Nfkd);
        let twice = normalize(&once, NormForm::Nfkd);
        assert_eq!(once, twice);
    }

    #[test]
    fn supplementary_plane_emoji_roundtrips() {
        let input = "🦀🎉🌍";
        assert_eq!(normalize(input, NormForm::Nfc), input);
        assert_eq!(normalize(input, NormForm::Nfd), input);
        assert_eq!(normalize(input, NormForm::Nfkc), input);
        assert_eq!(normalize(input, NormForm::Nfkd), input);
    }

    #[test]
    fn mathematical_bold_a_nfkc() {
        let input = "\u{1D400}";
        let result = normalize(input, NormForm::Nfkc);
        assert_eq!(result, "A");
    }

    #[test]
    fn zero_width_joiner_preserved() {
        let input = "a\u{200D}b";
        let result = normalize(input, NormForm::Nfc);
        assert!(result.contains('\u{200D}'));
    }

    #[test]
    fn normalize_for_search_ligature_and_case() {
        let result = normalize_for_search("\u{FB01}LE");
        assert_eq!(result, "file");
    }

    #[test]
    fn normalize_for_search_empty() {
        assert_eq!(normalize_for_search(""), "");
    }

    // --- NormForm derives ------------------------------------------------

    #[test]
    fn norm_form_debug_and_clone() {
        let form = NormForm::Nfc;
        let cloned = form;
        assert_eq!(form, cloned);
        // Derived `Debug` prints the bare variant name.
        assert_eq!(format!("{form:?}"), "Nfc");
    }

    #[test]
    fn norm_form_all_variants_distinct() {
        let forms = [NormForm::Nfc, NormForm::Nfd, NormForm::Nfkc, NormForm::Nfkd];
        for (i, a) in forms.iter().enumerate() {
            for (j, b) in forms.iter().enumerate() {
                if i == j {
                    assert_eq!(a, b);
                } else {
                    assert_ne!(a, b);
                }
            }
        }
    }
}