use unicode_normalization::UnicodeNormalization;
use unicode_normalization::char::is_combining_mark;
/// Stateless namespace for the string normalisation routines.
///
/// The type carries no data; every routine is an associated function
/// (for example `Normalizer::normalize_name`), so no instance is needed.
pub struct Normalizer;
impl Normalizer {
/// Folds a personal or entity name into a comparison key.
///
/// Steps, applied per character of the NFKD decomposition of `name`:
/// 1. combining marks are discarded (this is what strips diacritics),
/// 2. ASCII punctuation is discarded (not replaced by a space),
/// 3. the remaining characters are lowercased.
///
/// Finally whitespace runs are collapsed to one space and the ends trimmed.
pub fn normalize_name(name: &str) -> String {
    let folded: String = name
        .nfkd()
        // Drop accents and ASCII punctuation before case folding.
        .filter(|&c| !is_combining_mark(c) && !c.is_ascii_punctuation())
        // Lowercasing may expand one char into several, hence `flat_map`.
        .flat_map(char::to_lowercase)
        .collect();
    collapse_whitespace(&folded)
}
/// Folds free text into a comparison key while keeping punctuation.
///
/// The input is NFKD-decomposed, combining marks are removed, every
/// remaining character is lowercased, and whitespace runs are collapsed to
/// a single space with leading/trailing whitespace removed.
pub fn normalize_text(text: &str) -> String {
    let mut folded = String::with_capacity(text.len());
    folded.extend(
        text.nfkd()
            // Combining marks carry the diacritics after decomposition.
            .filter(|c| !is_combining_mark(*c))
            .flat_map(char::to_lowercase),
    );
    collapse_whitespace(&folded)
}
/// Canonicalises a URL so that equivalent spellings compare equal.
///
/// * Surrounding whitespace is trimmed and everything from the first `#`
///   (the fragment) is dropped.
/// * Input without a `://` separator is treated as opaque (e.g. `urn:`)
///   and ASCII-lowercased as a whole.
/// * Otherwise the scheme and the host (with port) are ASCII-lowercased.
///   Userinfo (`user:password@`), path and query keep their case because
///   they are case-sensitive.
/// * A path consisting solely of `/` is removed; any other path or query is
///   copied through unchanged.
pub fn normalize_url(url: &str) -> String {
    let trimmed = url.trim();
    let no_frag = trimmed.split_once('#').map_or(trimmed, |(head, _)| head);

    let Some((scheme, rest)) = no_frag.split_once("://") else {
        return no_frag.to_ascii_lowercase();
    };

    // The authority ends at the first '/' or '?'. Stopping only at '/'
    // would treat the query of `https://host?Q=V` as part of the host and
    // lowercase it.
    let authority_end = rest
        .find(|c: char| matches!(c, '/' | '?'))
        .unwrap_or(rest.len());
    let (authority, tail) = rest.split_at(authority_end);

    // Only the host is case-insensitive; keep any `userinfo@` prefix verbatim.
    // `rfind` is used because a host cannot contain '@' but sloppy userinfo can.
    let (userinfo, host) = match authority.rfind('@') {
        Some(at) => authority.split_at(at + 1),
        None => ("", authority),
    };

    let mut out = String::with_capacity(no_frag.len());
    out.push_str(&scheme.to_ascii_lowercase());
    out.push_str("://");
    out.push_str(userinfo);
    out.push_str(&host.to_ascii_lowercase());
    // A bare root path is equivalent to no path; an empty tail adds nothing.
    if tail != "/" {
        out.push_str(tail);
    }
    out
}
/// Produces a phonetic key for `name` via `soundex::american_soundex`.
///
/// The name is first passed through [`Self::normalize_name`]; any character
/// that is still non-ASCII afterwards is discarded. If nothing remains, an
/// empty string is returned and the Soundex routine is not called.
pub fn phonetic_code(name: &str) -> String {
    let ascii_only: String = Self::normalize_name(name)
        .chars()
        .filter(char::is_ascii)
        .collect();

    // An empty normalised name also ends up here, since filtering an empty
    // string yields an empty string.
    if ascii_only.is_empty() {
        String::new()
    } else {
        soundex::american_soundex(&ascii_only)
    }
}
}
/// Returns `s` with every run of whitespace replaced by a single ASCII space
/// and with leading and trailing whitespace removed.
fn collapse_whitespace(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    // `split_whitespace` yields only the non-empty, whitespace-free pieces,
    // so a separator is needed before every piece except the first.
    for word in s.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that running `normalize` on its own output changes nothing.
    fn assert_idempotent(normalize: fn(&str) -> String, cases: &[&str]) {
        for &c in cases {
            let once = normalize(c);
            let twice = normalize(&once);
            assert_eq!(once, twice, "non-idempotent for {c:?}");
        }
    }

    // ---- normalize_name ----

    #[test]
    fn normalize_name_lowercases_and_trims() {
        let got = Normalizer::normalize_name(" HELLO ");
        assert_eq!(got, "hello");
    }

    #[test]
    fn normalize_name_collapses_internal_whitespace() {
        let got = Normalizer::normalize_name("a \t b\nc");
        assert_eq!(got, "a b c");
    }

    #[test]
    fn normalize_name_drops_punctuation() {
        for (raw, expected) in [("O'Brien", "obrien"), ("Mary-Jane!", "maryjane")] {
            assert_eq!(Normalizer::normalize_name(raw), expected);
        }
    }

    #[test]
    fn normalize_name_drops_diacritics() {
        for (raw, expected) in [("Siân", "sian"), ("café", "cafe"), ("Zoë", "zoe")] {
            assert_eq!(Normalizer::normalize_name(raw), expected);
        }
    }

    #[test]
    fn normalize_name_is_idempotent() {
        assert_idempotent(
            Normalizer::normalize_name,
            &["hello", "O'Brien", " café au lait ", "JOSÉ-MARÍA"],
        );
    }

    #[test]
    fn normalize_name_empty_returns_empty() {
        for blank in ["", " "] {
            assert!(Normalizer::normalize_name(blank).is_empty());
        }
    }

    // ---- normalize_text ----

    #[test]
    fn normalize_text_preserves_punctuation() {
        let got = Normalizer::normalize_text("Hello, World!");
        assert_eq!(got, "hello, world!");
    }

    #[test]
    fn normalize_text_drops_diacritics() {
        let got = Normalizer::normalize_text("Café au lait.");
        assert_eq!(got, "cafe au lait.");
    }

    #[test]
    fn normalize_text_is_idempotent() {
        assert_idempotent(
            Normalizer::normalize_text,
            &["The Eiffel Tower, in Paris.", " multi space ", "Plain."],
        );
    }

    // ---- normalize_url ----

    #[test]
    fn normalize_url_lowercases_scheme_and_host() {
        let got = Normalizer::normalize_url("HTTPS://Example.ORG/foo");
        assert_eq!(got, "https://example.org/foo");
    }

    #[test]
    fn normalize_url_drops_root_trailing_slash() {
        let got = Normalizer::normalize_url("https://example.org/");
        assert_eq!(got, "https://example.org");
    }

    #[test]
    fn normalize_url_keeps_subpath_trailing_slash() {
        let got = Normalizer::normalize_url("https://example.org/foo/");
        assert_eq!(got, "https://example.org/foo/");
    }

    #[test]
    fn normalize_url_drops_fragment() {
        let got = Normalizer::normalize_url("https://example.org/foo#bar");
        assert_eq!(got, "https://example.org/foo");
    }

    #[test]
    fn normalize_url_handles_opaque_uri() {
        let got = Normalizer::normalize_url("URN:ISBN:0451450523");
        assert_eq!(got, "urn:isbn:0451450523");
    }

    #[test]
    fn normalize_url_is_idempotent() {
        assert_idempotent(
            Normalizer::normalize_url,
            &[
                "https://example.org/",
                "HTTPS://EXAMPLE.org/foo#frag",
                "urn:isbn:123",
            ],
        );
    }

    // ---- phonetic_code ----

    #[test]
    fn phonetic_code_matches_homophones() {
        let stephen = Normalizer::phonetic_code("Stephen");
        let steven = Normalizer::phonetic_code("Steven");
        assert_eq!(stephen, steven);
    }

    #[test]
    fn phonetic_code_distinct_for_unrelated_names() {
        let alice = Normalizer::phonetic_code("Alice");
        let zachary = Normalizer::phonetic_code("Zachary");
        assert_ne!(alice, zachary);
    }

    #[test]
    fn phonetic_code_empty_for_empty_input() {
        for blank in ["", " "] {
            assert!(Normalizer::phonetic_code(blank).is_empty());
        }
    }
}