use icu_normalizer::{ComposingNormalizer, DecomposingNormalizer};
/// Selects which of the four Unicode normalization forms to apply.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum NormalizationForm {
    /// Normalization Form C: canonical composition.
    Nfc,
    /// Normalization Form D: canonical decomposition.
    Nfd,
    /// Normalization Form KC: compatibility composition.
    Nfkc,
    /// Normalization Form KD: compatibility decomposition.
    Nfkd,
}
/// Holds one `icu_normalizer` normalizer per [`NormalizationForm`] so that any
/// form can be applied through a single value.
///
/// `Debug` is derived so that types embedding a `Normalizer` can derive
/// `Debug` themselves.
#[derive(Debug)]
pub struct Normalizer {
    /// Normalizer used for [`NormalizationForm::Nfc`].
    nfc: icu_normalizer::ComposingNormalizerBorrowed<'static>,
    /// Normalizer used for [`NormalizationForm::Nfkc`].
    nfkc: icu_normalizer::ComposingNormalizerBorrowed<'static>,
    /// Normalizer used for [`NormalizationForm::Nfd`].
    nfd: icu_normalizer::DecomposingNormalizerBorrowed<'static>,
    /// Normalizer used for [`NormalizationForm::Nfkd`].
    nfkd: icu_normalizer::DecomposingNormalizerBorrowed<'static>,
}
impl Normalizer {
    /// Builds a normalizer with all four forms ready to use.
    ///
    /// Each field comes from the matching argument-free `icu_normalizer`
    /// constructor (`new_nfc`, `new_nfkc`, `new_nfd`, `new_nfkd`).
    pub fn new() -> Self {
        Self {
            nfc: ComposingNormalizer::new_nfc(),
            nfkc: ComposingNormalizer::new_nfkc(),
            nfd: DecomposingNormalizer::new_nfd(),
            nfkd: DecomposingNormalizer::new_nfkd(),
        }
    }

    /// Returns `text` normalized to `form` as an owned `String`.
    pub fn normalize(&self, text: &str, form: NormalizationForm) -> String {
        match form {
            NormalizationForm::Nfc => self.nfc.normalize(text).into_owned(),
            NormalizationForm::Nfkc => self.nfkc.normalize(text).into_owned(),
            NormalizationForm::Nfd => self.nfd.normalize(text).into_owned(),
            NormalizationForm::Nfkd => self.nfkd.normalize(text).into_owned(),
        }
    }

    /// Reports whether `text` is already in the requested `form`.
    ///
    /// The question is passed straight to the underlying normalizer's own
    /// `is_normalized` check, so no normalized `String` is built merely to be
    /// compared against the input and thrown away.
    pub fn is_normalized(&self, text: &str, form: NormalizationForm) -> bool {
        match form {
            NormalizationForm::Nfc => self.nfc.is_normalized(text),
            NormalizationForm::Nfkc => self.nfkc.is_normalized(text),
            NormalizationForm::Nfd => self.nfd.is_normalized(text),
            NormalizationForm::Nfkd => self.nfkd.is_normalized(text),
        }
    }

    /// Shorthand for [`Normalizer::normalize`] with [`NormalizationForm::Nfc`].
    pub fn nfc(&self, text: &str) -> String {
        self.normalize(text, NormalizationForm::Nfc)
    }
}
impl Default for Normalizer {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every form exposed by the wrapper, for tests that sweep over all of them.
    const ALL_FORMS: [NormalizationForm; 4] = [
        NormalizationForm::Nfc,
        NormalizationForm::Nfd,
        NormalizationForm::Nfkc,
        NormalizationForm::Nfkd,
    ];

    /// Normalizes `text` to `form` with a freshly constructed normalizer.
    fn normalized(text: &str, form: NormalizationForm) -> String {
        Normalizer::new().normalize(text, form)
    }

    #[test]
    fn nfc_composes_combining_accent() {
        // Base letter followed by a combining acute accent collapses to one scalar.
        let composed = normalized("e\u{0301}", NormalizationForm::Nfc);
        assert_eq!(composed, "é");
        assert_eq!(composed.chars().count(), 1);
    }

    #[test]
    fn nfd_decomposes_precomposed() {
        // The precomposed letter splits into base letter + combining accent.
        let decomposed = normalized("é", NormalizationForm::Nfd);
        assert_eq!(decomposed.chars().count(), 2);
        assert_eq!(decomposed, "e\u{0301}");
    }

    #[test]
    fn nfc_nfd_round_trip() {
        let normalizer = Normalizer::new();
        let source = "Crème Brûlée";

        // Decomposing and then recomposing must land on the NFC form of the input.
        let expected = normalizer.normalize(source, NormalizationForm::Nfc);
        let taken_apart = normalizer.normalize(source, NormalizationForm::Nfd);
        let put_back = normalizer.normalize(&taken_apart, NormalizationForm::Nfc);

        assert_eq!(put_back, expected);
    }

    #[test]
    fn nfkc_folds_compatibility_chars() {
        // U+FB01 is the "fi" ligature.
        let folded = normalized("\u{FB01}", NormalizationForm::Nfkc);
        assert_eq!(folded, "fi");
    }

    #[test]
    fn nfkd_decomposes_superscript() {
        // U+00B2 is superscript two.
        let flattened = normalized("\u{00B2}", NormalizationForm::Nfkd);
        assert_eq!(flattened, "2");
    }

    #[test]
    fn is_normalized_detects_form() {
        let normalizer = Normalizer::new();
        let precomposed = "é";
        let combining = "e\u{0301}";

        assert!(normalizer.is_normalized(precomposed, NormalizationForm::Nfc));
        assert!(!normalizer.is_normalized(combining, NormalizationForm::Nfc));
        assert!(normalizer.is_normalized(combining, NormalizationForm::Nfd));
    }

    #[test]
    fn ascii_is_unchanged() {
        let normalizer = Normalizer::new();
        let ascii = "hello world 123";

        for form in ALL_FORMS {
            assert_eq!(normalizer.normalize(ascii, form), ascii);
        }
    }

    #[test]
    fn nfc_convenience_matches_explicit() {
        let normalizer = Normalizer::new();
        let input = "e\u{0301}";

        let via_shortcut = normalizer.nfc(input);
        let via_explicit = normalizer.normalize(input, NormalizationForm::Nfc);

        assert_eq!(via_shortcut, via_explicit);
    }
}