use unicode_normalization::UnicodeNormalization;
/// Stateless namespace for the record-normalisation helpers defined in its `impl`.
///
/// The type carries no data; every helper is an associated function that is
/// called as `Normalizer::function(..)` without constructing a value.
pub struct Normalizer;
impl Normalizer {
/// Produces a canonical, comparison-friendly form of a personal name.
///
/// The input is NFKD-decomposed, after which combining marks (accents) and
/// ASCII punctuation are dropped. The remainder is lowercased, and every run
/// of whitespace is collapsed to a single space with leading and trailing
/// whitespace removed.
///
/// Non-ASCII punctuation (for example a typographic apostrophe) is kept.
pub fn normalize_name(name: &str) -> String {
    // Decompose first so that accents become separate combining marks that
    // can be filtered out together with ASCII punctuation in a single pass.
    let stripped: String = name
        .nfkd()
        .filter(|&ch| {
            !(unicode_normalization::char::is_combining_mark(ch) || ch.is_ascii_punctuation())
        })
        .collect();
    let lowered = stripped.to_lowercase();

    // Re-join the whitespace-separated words with exactly one space each.
    let mut canonical = String::with_capacity(lowered.len());
    for word in lowered.split_whitespace() {
        if !canonical.is_empty() {
            canonical.push(' ');
        }
        canonical.push_str(word);
    }
    canonical
}
/// Produces a canonical form of a postcode.
///
/// Every whitespace character (as defined by `char::is_whitespace`) is
/// removed, wherever it occurs, and the remaining text is uppercased.
pub fn normalize_postcode(postcode: &str) -> String {
    let mut compact = String::with_capacity(postcode.len());
    for ch in postcode.chars() {
        if !ch.is_whitespace() {
            compact.push(ch);
        }
    }
    compact.to_uppercase()
}
/// Reduces a UK phone number to its national significant digits.
///
/// Processing steps:
/// 1. Everything except ASCII digits is discarded.
/// 2. An international prefix is removed: `0044` when at least one digit
///    follows it, or `44` when the number has at least 12 digits in total
///    (shorter numbers beginning with `44` are left alone).
/// 3. A single leading trunk `0` is removed, unless it is the only digit.
///
/// Step 3 is applied after step 2 as well, so `+44 (0)7700 900123`,
/// `+44 7700 900123` and `07700 900123` all produce `7700900123`.
///
/// Returns an empty string when the input contains no ASCII digits.
pub fn normalize_phone(phone: &str) -> String {
    let digits: String = phone.chars().filter(char::is_ascii_digit).collect();

    // Remove the international dialling prefix, if one is present.
    let without_country = match digits.strip_prefix("0044") {
        Some(rest) if !rest.is_empty() => rest,
        _ => match digits.strip_prefix("44") {
            // The length guard keeps short local numbers that merely begin with "44".
            Some(rest) if digits.len() >= 12 => rest,
            _ => digits.as_str(),
        },
    };

    // Remove the trunk zero, including the "(0)" often written after "+44".
    match without_country.strip_prefix('0') {
        Some(rest) if !rest.is_empty() => rest.to_owned(),
        _ => without_country.to_owned(),
    }
}
/// Computes a phonetic key for `name`.
///
/// The name is first passed through `normalize_name`; if nothing is left,
/// an empty string is returned. Otherwise the normalised text is handed to
/// `soundex::american_soundex` and its result is returned unchanged.
pub fn phonetic_code(name: &str) -> String {
    match Self::normalize_name(name) {
        // Nothing to encode: skip the Soundex call entirely.
        cleaned if cleaned.is_empty() => String::new(),
        cleaned => soundex::american_soundex(&cleaned),
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `normalize` maps each `(input, expected)` pair as stated.
    fn assert_all(normalize: fn(&str) -> String, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(normalize(input), expected);
        }
    }

    // ---- normalize_name ----

    #[test]
    fn normalize_name_collapses_whitespace_and_trims() {
        assert_all(Normalizer::normalize_name, &[(" John Smith ", "john smith")]);
    }

    #[test]
    fn normalize_name_strips_ascii_punctuation() {
        assert_all(
            Normalizer::normalize_name,
            &[
                ("O'Brien", "obrien"),
                ("Mary-Jane", "maryjane"),
                ("Dr. Who", "dr who"),
            ],
        );
    }

    #[test]
    fn normalize_name_strips_diacritics() {
        assert_all(
            Normalizer::normalize_name,
            &[
                ("José", "jose"),
                ("Siân", "sian"),
                ("naïve", "naive"),
                ("crème", "creme"),
                ("Łŷŵ", "ływ"),
            ],
        );
    }

    #[test]
    fn normalize_name_handles_empty_and_whitespace() {
        assert_all(
            Normalizer::normalize_name,
            &[("", ""), (" ", ""), ("\t\n", "")],
        );
    }

    #[test]
    fn normalize_name_lowercases() {
        assert_all(
            Normalizer::normalize_name,
            &[("MARY", "mary"), ("McDONALD", "mcdonald")],
        );
    }

    #[test]
    fn normalize_name_is_idempotent() {
        let inputs = [
            " John Smith ",
            "O'Brien-Jones",
            "JOSÉ MARÍA",
            "",
            " ",
            "Siân",
        ];
        for input in inputs {
            let first_pass = Normalizer::normalize_name(input);
            let second_pass = Normalizer::normalize_name(&first_pass);
            assert_eq!(first_pass, second_pass, "not idempotent for {input:?}");
        }
    }

    #[test]
    fn normalize_name_does_not_normalise_unicode_punctuation() {
        let curly_apostrophe = '\u{2019}';
        let normalized = Normalizer::normalize_name("O\u{2019}Brien");
        assert!(normalized.contains(curly_apostrophe));
    }

    // ---- normalize_postcode ----

    #[test]
    fn normalize_postcode_uppercases() {
        assert_all(Normalizer::normalize_postcode, &[("cf10 1aa", "CF101AA")]);
    }

    #[test]
    fn normalize_postcode_strips_all_whitespace() {
        assert_all(
            Normalizer::normalize_postcode,
            &[
                ("CF10 1AA", "CF101AA"),
                (" CF10 1AA ", "CF101AA"),
                ("CF10\t1AA", "CF101AA"),
            ],
        );
    }

    #[test]
    fn normalize_postcode_handles_empty() {
        assert_all(Normalizer::normalize_postcode, &[("", ""), (" ", "")]);
    }

    #[test]
    fn normalize_postcode_is_idempotent() {
        let inputs = ["cf10 1aa", "SW1A 2AA", " EH8 9YL ", ""];
        for input in inputs {
            let first_pass = Normalizer::normalize_postcode(input);
            let second_pass = Normalizer::normalize_postcode(&first_pass);
            assert_eq!(first_pass, second_pass);
        }
    }

    // ---- normalize_phone ----

    #[test]
    fn normalize_phone_strips_uk_trunk_prefix() {
        assert_all(Normalizer::normalize_phone, &[("07700 900123", "7700900123")]);
    }

    #[test]
    fn normalize_phone_strips_plus_44_international() {
        assert_all(
            Normalizer::normalize_phone,
            &[("+44 7700 900123", "7700900123")],
        );
    }

    #[test]
    fn normalize_phone_strips_0044_international() {
        assert_all(
            Normalizer::normalize_phone,
            &[("0044 7700 900123", "7700900123")],
        );
    }

    #[test]
    fn normalize_phone_handles_brackets_and_spaces() {
        assert_all(
            Normalizer::normalize_phone,
            &[("(029) 2034 5678", "2920345678")],
        );
    }

    #[test]
    fn normalize_phone_handles_empty() {
        assert_all(Normalizer::normalize_phone, &[("", ""), ("---", "")]);
    }

    #[test]
    fn normalize_phone_does_not_strip_44_if_too_short() {
        assert_all(Normalizer::normalize_phone, &[("4412345", "4412345")]);
    }

    #[test]
    fn normalize_phone_is_idempotent() {
        let inputs = [
            "07700 900123",
            "+44 7700 900123",
            "0044 7700 900123",
            "(029) 2034 5678",
            "",
        ];
        for input in inputs {
            let first_pass = Normalizer::normalize_phone(input);
            let second_pass = Normalizer::normalize_phone(&first_pass);
            assert_eq!(first_pass, second_pass, "not idempotent for {input:?}");
        }
    }

    #[test]
    fn normalize_phone_keeps_lone_zero() {
        assert_all(Normalizer::normalize_phone, &[("0", "0")]);
    }

    // ---- phonetic_code ----

    #[test]
    fn phonetic_code_groups_smith_and_smyth() {
        let smith = Normalizer::phonetic_code("Smith");
        let smyth = Normalizer::phonetic_code("Smyth");
        assert_eq!(smith, smyth);
    }

    #[test]
    fn phonetic_code_groups_stephen_and_steven() {
        let stephen = Normalizer::phonetic_code("Stephen");
        let steven = Normalizer::phonetic_code("Steven");
        assert_eq!(stephen, steven);
    }

    #[test]
    fn phonetic_code_distinguishes_different_families() {
        let unrelated_pairs = [("Jones", "Smith"), ("Anderson", "Zimmerman")];
        for (left, right) in unrelated_pairs {
            assert_ne!(
                Normalizer::phonetic_code(left),
                Normalizer::phonetic_code(right)
            );
        }
    }

    #[test]
    fn phonetic_code_specific_values() {
        assert_all(
            Normalizer::phonetic_code,
            &[
                ("Smith", "S530"),
                ("Smyth", "S530"),
                ("Jones", "J520"),
                ("Johnson", "J525"),
            ],
        );
    }

    #[test]
    fn phonetic_code_handles_empty() {
        assert_all(Normalizer::phonetic_code, &[("", ""), (" ", "")]);
    }

    #[test]
    fn phonetic_code_is_case_insensitive() {
        let upper = Normalizer::phonetic_code("SMITH");
        let lower = Normalizer::phonetic_code("smith");
        assert_eq!(upper, lower);
    }
}