use serde::{Deserialize, Serialize};
use unicode_normalization::UnicodeNormalization;
/// Stateless namespace for the normalisation routines in this file.
///
/// The type has no fields; every routine on it is an associated function that is
/// called as `Normalizer::name(...)` without an instance.
pub struct Normalizer;
impl Normalizer {
/// Builds a canonical comparison key from a free-text name.
///
/// Processing order:
/// 1. NFKD-decompose the input and discard combining marks, which removes the
///    diacritics of any letter that decomposes into base letter + mark.
/// 2. Discard ASCII punctuation (non-ASCII punctuation is left in place).
/// 3. Lowercase the result.
/// 4. Collapse every run of whitespace into a single space and drop leading and
///    trailing whitespace.
///
/// Returns an empty string when nothing but whitespace/punctuation remains.
pub fn normalize_name(name: &str) -> String {
    let lowered = name
        .nfkd()
        .filter(|c| {
            !(unicode_normalization::char::is_combining_mark(*c) || c.is_ascii_punctuation())
        })
        .collect::<String>()
        .to_lowercase();

    // Re-join the whitespace-separated words with exactly one space between them.
    let mut normalized = String::with_capacity(lowered.len());
    for word in lowered.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
/// Canonicalises a postcode by deleting all whitespace and uppercasing the rest.
///
/// Whitespace is removed wherever it occurs (leading, trailing or embedded), so
/// `"cf10 1aa"` and `" CF10\t1AA "` both become `"CF101AA"`. No validation of the
/// postcode format is performed; an all-whitespace input yields an empty string.
pub fn normalize_postcode(postcode: &str) -> String {
    // `split_whitespace` yields only the non-whitespace runs; concatenating them
    // removes every whitespace character.
    let compact: String = postcode.split_whitespace().collect();
    compact.to_uppercase()
}
/// Reduces a UK-centric phone string to bare national digits for matching.
///
/// Every character that is not an ASCII digit is discarded. A leading
/// international prefix is then removed:
/// * `0044`, when at least one digit follows it;
/// * `44`, but only when the number has 12 or more digits, so that short numbers
///   which merely begin with `44` are left untouched.
///
/// Finally a single leading trunk `0` is dropped, provided at least one digit
/// remains after it. The trunk `0` is removed after an international prefix as
/// well, so the common `+44 (0)7700 900123` layout produces the same key as
/// `07700 900123` and `+44 7700 900123`.
///
/// Returns an empty string when the input contains no ASCII digits.
pub fn normalize_phone(phone: &str) -> String {
    let digits: String = phone.chars().filter(char::is_ascii_digit).collect();

    // Step 1: peel off the international prefix, if one is present.
    let national = match digits.strip_prefix("0044") {
        Some(rest) if !rest.is_empty() => rest,
        _ if digits.len() >= 12 && digits.starts_with("44") => &digits[2..],
        _ => digits.as_str(),
    };

    // Step 2: drop one trunk zero, but never reduce the number to nothing
    // (a lone "0" is kept as-is).
    match national.strip_prefix('0') {
        Some(rest) if !rest.is_empty() => rest.to_string(),
        _ => national.to_string(),
    }
}
pub fn normalize_phone_e164(phone: &str, default_country: Option<&str>) -> Option<String> {
let has_plus = phone.chars().any(|c| c == '+');
let digits: String = phone.chars().filter(|c| c.is_ascii_digit()).collect();
if digits.is_empty() {
return None;
}
let (info, nsn): (&CountryPhoneInfo, String) = if has_plus {
let info = lookup_by_dial_code_prefix(&digits)?;
let rest = &digits[info.dial_code.len()..];
let rest = strip_trunk_prefix(info, rest);
(info, rest.to_string())
} else if let Some(stripped) = digits.strip_prefix("00") {
let info = lookup_by_dial_code_prefix(stripped)?;
let rest = &stripped[info.dial_code.len()..];
let rest = strip_trunk_prefix(info, rest);
(info, rest.to_string())
} else {
let iso = default_country?;
let info = lookup_by_iso(iso)?;
let nsn = strip_trunk_prefix(info, &digits);
(info, nsn.to_string())
};
if nsn.len() < info.min_nsn || nsn.len() > info.max_nsn {
return None;
}
Some(format!("+{}{}", info.dial_code, nsn))
}
pub fn expand_street_abbreviations(line: &str) -> String {
line.split_whitespace()
.map(expand_one_token)
.collect::<Vec<_>>()
.join(" ")
}
/// Produces a comparison key for an address line.
///
/// Abbreviations are expanded first (`expand_street_abbreviations`) and the
/// result is then run through `normalize_name`, so `"123 High St"` and
/// `"123 High Street"` produce the same key.
pub fn normalize_address_line(line: &str) -> String {
    let expanded = Self::expand_street_abbreviations(line);
    Self::normalize_name(&expanded)
}
/// Splits an address line into unit, house number and normalised street.
///
/// The line is trimmed, then an optional unit designator (`extract_unit_prefix`)
/// and an optional leading house number (`extract_house_number`) are peeled off
/// in that order. Commas, spaces and tabs separating those parts are skipped.
/// Whatever remains is normalised with `normalize_address_line` and stored as
/// the street.
pub fn parse_address_line(line: &str) -> ParsedAddressLine {
    // Characters allowed between the unit, the house number and the street.
    const SEPARATORS: [char; 3] = [',', ' ', '\t'];

    let (unit, remainder) = extract_unit_prefix(line.trim());
    let remainder = remainder.trim_start_matches(SEPARATORS).trim();

    let (house_number, remainder) = extract_house_number(remainder);
    let remainder = remainder.trim_start_matches(SEPARATORS).trim();

    let street = Self::normalize_address_line(remainder);
    ParsedAddressLine {
        house_number,
        unit,
        street,
    }
}
/// Returns a phonetic key for `name`, for grouping similar-sounding names.
///
/// The name is first passed through `normalize_name`; if nothing is left the
/// result is an empty string, otherwise the normalised text is handed to
/// `soundex::american_soundex` and its output is returned unchanged.
pub fn phonetic_code(name: &str) -> String {
    match Self::normalize_name(name) {
        // Nothing to encode: do not call the encoder with empty input.
        key if key.is_empty() => String::new(),
        key => soundex::american_soundex(&key),
    }
}
/// Canonicalises an email address for matching.
///
/// The input is trimmed and lowercased, then split at the first `@`. The
/// address is rejected (`None`) when it has no `@`, an empty local part, an
/// empty domain, or a second `@`.
///
/// When `gmail_dot_folding` is `true` and the domain is `gmail.com` or
/// `googlemail.com`, the local part is additionally cut at the first `+` and
/// all of its dots are removed; if that leaves nothing, `None` is returned.
/// The domain itself is never rewritten. Other domains are unaffected by the
/// flag.
pub fn normalize_email(email: &str, gmail_dot_folding: bool) -> Option<String> {
    let address = email.trim().to_lowercase();

    // An empty address has no '@', so this also rejects blank input.
    let (local, domain) = address.split_once('@')?;
    if local.is_empty() || domain.is_empty() || domain.contains('@') {
        return None;
    }

    let folds_local_part = gmail_dot_folding && matches!(domain, "gmail.com" | "googlemail.com");
    if !folds_local_part {
        return Some(format!("{local}@{domain}"));
    }

    // Drop the "+tag" suffix first, then the dots within what is left.
    let mailbox = local.split('+').next().unwrap_or(local);
    let folded: String = mailbox.split('.').collect();
    if folded.is_empty() {
        None
    } else {
        Some(format!("{folded}@{domain}"))
    }
}
}
/// Structured result of `Normalizer::parse_address_line`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParsedAddressLine {
/// Leading house number, uppercased, including a single trailing letter suffix
/// when present (e.g. `10A`); `None` when the line does not start with digits.
pub house_number: Option<String>,
/// Unit designator as `"<keyword> <identifier>"`, both lowercased (e.g. `flat 2a`);
/// `None` when the line does not start with a recognised unit keyword.
pub unit: Option<String>,
/// Remainder of the line after `Normalizer::normalize_address_line`; may be empty.
pub street: String,
}
/// `(abbreviation, expansion)` pairs consulted by `expand_one_token`.
///
/// Abbreviations are stored lowercase because the lookup lowercases the token
/// before comparing; expansions are emitted exactly as written here. The table
/// covers street types first, then compass directionals.
const STREET_ABBREVIATIONS: &[(&str, &str)] = &[
("st", "street"),
("str", "street"),
("rd", "road"),
("ave", "avenue"),
("av", "avenue"),
("blvd", "boulevard"),
("bvd", "boulevard"),
("ln", "lane"),
("dr", "drive"),
("ct", "court"),
("pl", "place"),
("sq", "square"),
("ter", "terrace"),
("terr", "terrace"),
("hwy", "highway"),
("pkwy", "parkway"),
("mt", "mount"),
("mtn", "mountain"),
("cres", "crescent"),
("gdns", "gardens"),
("gdn", "garden"),
("gr", "grove"),
("cl", "close"),
("pk", "park"),
("plz", "plaza"),
("expy", "expressway"),
("trl", "trail"),
// Compass directionals.
("n", "north"),
("s", "south"),
("e", "east"),
("w", "west"),
("ne", "northeast"),
("nw", "northwest"),
("se", "southeast"),
("sw", "southwest"),
];
/// Keywords that `extract_unit_prefix` accepts as the start of a unit designator.
///
/// Stored lowercase: the candidate keyword is lowercased before it is compared
/// against this list.
const UNIT_PREFIXES: &[&str] = &[
"flat",
"apartment",
"apt",
"unit",
"suite",
"ste",
"room",
"rm",
];
/// Expands a single whitespace-free token if it is a known abbreviation.
///
/// Trailing `.` and `,` characters are ignored for the lookup, which is
/// ASCII-case-insensitive against `STREET_ABBREVIATIONS`. On a match the
/// expansion is returned on its own (the trailing punctuation is dropped);
/// otherwise the token is returned exactly as given, punctuation included.
/// Tokens containing non-ASCII characters are never expanded.
fn expand_one_token(tok: &str) -> String {
    let core = tok.trim_end_matches(['.', ',']);
    if !core.is_ascii() {
        return tok.to_string();
    }

    STREET_ABBREVIATIONS
        .iter()
        .find(|(abbrev, _)| core.eq_ignore_ascii_case(abbrev))
        .map_or_else(|| tok.to_string(), |(_, full)| (*full).to_string())
}
/// Peels a leading unit designator such as `Flat 2A` off an address line.
///
/// The first whitespace-delimited word (ignoring trailing `.`/`,`) must match
/// one of `UNIT_PREFIXES`, case-insensitively. It must be followed, after any
/// spaces, tabs or `#` characters, by an identifier of one or more ASCII
/// alphanumerics.
///
/// Returns `(Some("<keyword> <identifier>"), rest)` with both parts lowercased
/// and `rest` starting immediately after the identifier, or `(None, s)` with
/// the input untouched when no unit designator is recognised.
fn extract_unit_prefix(s: &str) -> (Option<String>, &str) {
    let body = s.trim_start();

    // Split the first word from everything after it (the tail keeps its
    // leading whitespace character).
    let (first_word, tail) = match body.find(char::is_whitespace) {
        Some(split_at) => body.split_at(split_at),
        None => (body, ""),
    };
    if first_word.is_empty() {
        return (None, s);
    }

    let keyword = first_word.trim_end_matches(['.', ',']);
    let is_unit_keyword = keyword.is_ascii()
        && UNIT_PREFIXES
            .iter()
            .any(|prefix| keyword.eq_ignore_ascii_case(prefix));
    if !is_unit_keyword {
        return (None, s);
    }

    let after_keyword = tail.trim_start_matches([' ', '\t', '#']);
    let id_len = after_keyword
        .find(|c: char| !c.is_ascii_alphanumeric())
        .unwrap_or(after_keyword.len());
    if id_len == 0 {
        // A keyword with no identifier after it is not treated as a unit.
        return (None, s);
    }

    let (id, rest) = after_keyword.split_at(id_len);
    let unit = format!(
        "{} {}",
        keyword.to_ascii_lowercase(),
        id.to_ascii_lowercase()
    );
    (Some(unit), rest)
}
/// Peels a leading house number off an address fragment.
///
/// After skipping leading whitespace, the number is the run of ASCII digits at
/// the start, plus one ASCII letter directly after the digits when that letter
/// is the last character or is followed by a non-alphanumeric character (so
/// `10A Downing` gives `10A`, while `10Apple` gives `10`).
///
/// Returns `(Some(number), rest)` with the number uppercased and `rest`
/// starting right after it, or `(None, s)` with the input untouched when the
/// fragment does not start with a digit.
fn extract_house_number(s: &str) -> (Option<String>, &str) {
    let text = s.trim_start();

    // Byte length of the leading digit run (ASCII digits are one byte each).
    let digit_len = text
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(text.len());
    if digit_len == 0 {
        return (None, s);
    }

    // Look at the two characters after the digits to decide whether the first
    // one is a stand-alone letter suffix.
    let mut following = text[digit_len..].chars();
    let suffix_len = match (following.next(), following.next()) {
        (Some(letter), after)
            if letter.is_ascii_alphabetic()
                && !after.is_some_and(|c| c.is_ascii_alphanumeric()) =>
        {
            letter.len_utf8()
        }
        _ => 0,
    };

    let (number, rest) = text.split_at(digit_len + suffix_len);
    (Some(number.to_ascii_uppercase()), rest)
}
/// Per-country numbering facts used by the E.164 normalisation helpers in this file.
struct CountryPhoneInfo {
/// Two-letter country code, written in uppercase in the table; `lookup_by_iso`
/// uppercases its argument before comparing against this field.
iso_alpha2: &'static str,
/// International dial code as digits, without a leading `+`.
dial_code: &'static str,
/// Leading digits that `strip_trunk_prefix` removes from a national-format
/// number; `None` means nothing is stripped for this country.
trunk_prefix: Option<&'static str>,
/// Smallest accepted digit count for the number after the dial code and trunk
/// prefix have been removed.
min_nsn: usize,
/// Largest accepted digit count for that same remainder.
max_nsn: usize,
}
/// Numbering facts for every country the E.164 normaliser understands.
///
/// Lookups scan this table front to back and take the first match, so when two
/// rows share a dial code (US and CA both use `1`) the earlier row is the one
/// returned for dial-code lookups.
///
/// US and CA carry the NANP national prefix `1` as their trunk prefix, so the
/// common national layout `1 (415) 555-1234` normalises to the same value as
/// `(415) 555-1234`. NANP area codes never begin with `1`, so stripping it
/// cannot damage a valid ten-digit number.
const COUNTRY_PHONE_TABLE: &[CountryPhoneInfo] = &[
    CountryPhoneInfo {
        iso_alpha2: "GB",
        dial_code: "44",
        trunk_prefix: Some("0"),
        min_nsn: 7,
        max_nsn: 11,
    },
    CountryPhoneInfo {
        iso_alpha2: "FR",
        dial_code: "33",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "DE",
        dial_code: "49",
        trunk_prefix: Some("0"),
        min_nsn: 7,
        max_nsn: 13,
    },
    CountryPhoneInfo {
        iso_alpha2: "ES",
        dial_code: "34",
        trunk_prefix: None,
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "IE",
        dial_code: "353",
        trunk_prefix: Some("0"),
        min_nsn: 7,
        max_nsn: 11,
    },
    CountryPhoneInfo {
        iso_alpha2: "IT",
        dial_code: "39",
        trunk_prefix: None,
        min_nsn: 6,
        max_nsn: 12,
    },
    CountryPhoneInfo {
        iso_alpha2: "NL",
        dial_code: "31",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "BE",
        dial_code: "32",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "PT",
        dial_code: "351",
        trunk_prefix: None,
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "CH",
        dial_code: "41",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "AT",
        dial_code: "43",
        trunk_prefix: Some("0"),
        min_nsn: 4,
        max_nsn: 13,
    },
    CountryPhoneInfo {
        iso_alpha2: "SE",
        dial_code: "46",
        trunk_prefix: Some("0"),
        min_nsn: 7,
        max_nsn: 13,
    },
    CountryPhoneInfo {
        iso_alpha2: "NO",
        dial_code: "47",
        trunk_prefix: None,
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "DK",
        dial_code: "45",
        trunk_prefix: None,
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "FI",
        dial_code: "358",
        trunk_prefix: Some("0"),
        min_nsn: 5,
        max_nsn: 12,
    },
    CountryPhoneInfo {
        iso_alpha2: "PL",
        dial_code: "48",
        trunk_prefix: None,
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "AU",
        dial_code: "61",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "NZ",
        dial_code: "64",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 10,
    },
    // NANP: the national (trunk) prefix is "1", e.g. "1 (415) 555-1234".
    CountryPhoneInfo {
        iso_alpha2: "US",
        dial_code: "1",
        trunk_prefix: Some("1"),
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "CA",
        dial_code: "1",
        trunk_prefix: Some("1"),
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "JP",
        dial_code: "81",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "CN",
        dial_code: "86",
        trunk_prefix: Some("0"),
        min_nsn: 5,
        max_nsn: 12,
    },
    CountryPhoneInfo {
        iso_alpha2: "IN",
        dial_code: "91",
        trunk_prefix: Some("0"),
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "BR",
        dial_code: "55",
        trunk_prefix: Some("0"),
        min_nsn: 10,
        max_nsn: 11,
    },
    CountryPhoneInfo {
        iso_alpha2: "MX",
        dial_code: "52",
        trunk_prefix: None,
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "ZA",
        dial_code: "27",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "BG",
        dial_code: "359",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "CZ",
        dial_code: "420",
        trunk_prefix: None,
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "EE",
        dial_code: "372",
        trunk_prefix: None,
        min_nsn: 7,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "GR",
        dial_code: "30",
        trunk_prefix: None,
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "HR",
        dial_code: "385",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "IS",
        dial_code: "354",
        trunk_prefix: None,
        min_nsn: 7,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "LI",
        dial_code: "423",
        trunk_prefix: None,
        min_nsn: 7,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "LT",
        dial_code: "370",
        trunk_prefix: Some("8"),
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "LV",
        dial_code: "371",
        trunk_prefix: None,
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "MT",
        dial_code: "356",
        trunk_prefix: None,
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "RO",
        dial_code: "40",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "SI",
        dial_code: "386",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "SK",
        dial_code: "421",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
];
/// Finds the table row for a two-letter country code, ignoring ASCII case.
///
/// Returns `None` for codes that contain non-ASCII characters or that are not
/// present in `COUNTRY_PHONE_TABLE`.
fn lookup_by_iso(iso: &str) -> Option<&'static CountryPhoneInfo> {
    if !iso.is_ascii() {
        return None;
    }
    COUNTRY_PHONE_TABLE
        .iter()
        .find(|entry| entry.iso_alpha2.eq_ignore_ascii_case(iso))
}
/// Finds the country whose dial code is a prefix of `digits`.
///
/// Candidate prefixes are tried longest first (3, then 2, then 1 characters) so
/// that, for example, `353…` resolves to the three-digit code rather than to a
/// shorter code sharing the same leading digits. Within one length the first
/// matching table row wins.
///
/// Returns `None` when no dial code matches. Prefixes are taken with `str::get`,
/// so input that is too short, or whose prefix would end inside a multi-byte
/// character, simply fails to match instead of panicking.
fn lookup_by_dial_code_prefix(digits: &str) -> Option<&'static CountryPhoneInfo> {
    [3usize, 2, 1].iter().find_map(|&len| {
        let prefix = digits.get(..len)?;
        COUNTRY_PHONE_TABLE
            .iter()
            .find(|entry| entry.dial_code == prefix)
    })
}
fn strip_trunk_prefix<'a>(info: &CountryPhoneInfo, nsn: &'a str) -> &'a str {
if let Some(tp) = info.trunk_prefix
&& let Some(rest) = nsn.strip_prefix(tp)
&& !rest.is_empty()
{
rest
} else {
nsn
}
}
// Unit tests for `Normalizer` and its helpers. Tests are grouped by the function
// under test; each group also pins idempotence and empty-input behaviour where
// the function is expected to be stable under re-application.
#[cfg(test)]
mod tests {
use super::*;
// ---- normalize_name ----
#[test]
fn normalize_name_collapses_whitespace_and_trims() {
assert_eq!(Normalizer::normalize_name(" John Smith "), "john smith");
}
#[test]
fn normalize_name_strips_ascii_punctuation() {
// Punctuation is deleted, not replaced by a space, so hyphenated and
// apostrophised names are fused into one word.
assert_eq!(Normalizer::normalize_name("O'Brien"), "obrien");
assert_eq!(Normalizer::normalize_name("Mary-Jane"), "maryjane");
assert_eq!(Normalizer::normalize_name("Dr. Who"), "dr who");
}
#[test]
fn normalize_name_strips_diacritics() {
assert_eq!(Normalizer::normalize_name("José"), "jose");
assert_eq!(Normalizer::normalize_name("Siân"), "sian");
assert_eq!(Normalizer::normalize_name("naïve"), "naive");
assert_eq!(Normalizer::normalize_name("crème"), "creme");
// `Ł` is only lowercased (its stroke is kept), while the circumflexes on
// `ŷ` and `ŵ` are removed.
assert_eq!(Normalizer::normalize_name("Łŷŵ"), "ływ");
}
#[test]
fn normalize_name_handles_empty_and_whitespace() {
assert_eq!(Normalizer::normalize_name(""), "");
assert_eq!(Normalizer::normalize_name(" "), "");
assert_eq!(Normalizer::normalize_name("\t\n"), "");
}
#[test]
fn normalize_name_lowercases() {
assert_eq!(Normalizer::normalize_name("MARY"), "mary");
assert_eq!(Normalizer::normalize_name("McDONALD"), "mcdonald");
}
#[test]
fn normalize_name_is_idempotent() {
for input in [
" John Smith ",
"O'Brien-Jones",
"JOSÉ MARÍA",
"",
" ",
"Siân",
] {
let once = Normalizer::normalize_name(input);
let twice = Normalizer::normalize_name(&once);
assert_eq!(once, twice, "not idempotent for {input:?}");
}
}
#[test]
fn normalize_name_does_not_normalise_unicode_punctuation() {
// Only ASCII punctuation is filtered, so the typographic apostrophe
// U+2019 survives normalisation.
let with_curly = Normalizer::normalize_name("O\u{2019}Brien");
assert!(with_curly.contains('\u{2019}'));
}
// ---- normalize_postcode ----
#[test]
fn normalize_postcode_uppercases() {
assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
}
#[test]
fn normalize_postcode_strips_all_whitespace() {
assert_eq!(Normalizer::normalize_postcode("CF10 1AA"), "CF101AA");
assert_eq!(Normalizer::normalize_postcode(" CF10 1AA "), "CF101AA");
assert_eq!(Normalizer::normalize_postcode("CF10\t1AA"), "CF101AA");
}
#[test]
fn normalize_postcode_handles_empty() {
assert_eq!(Normalizer::normalize_postcode(""), "");
assert_eq!(Normalizer::normalize_postcode(" "), "");
}
#[test]
fn normalize_postcode_is_idempotent() {
for input in ["cf10 1aa", "SW1A 2AA", " EH8 9YL ", ""] {
let once = Normalizer::normalize_postcode(input);
let twice = Normalizer::normalize_postcode(&once);
assert_eq!(once, twice);
}
}
// ---- normalize_phone (UK-centric national digits) ----
#[test]
fn normalize_phone_strips_uk_trunk_prefix() {
assert_eq!(Normalizer::normalize_phone("07700 900123"), "7700900123");
}
#[test]
fn normalize_phone_strips_plus_44_international() {
assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
}
#[test]
fn normalize_phone_strips_0044_international() {
assert_eq!(
Normalizer::normalize_phone("0044 7700 900123"),
"7700900123"
);
}
#[test]
fn normalize_phone_handles_brackets_and_spaces() {
assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
}
#[test]
fn normalize_phone_handles_empty() {
assert_eq!(Normalizer::normalize_phone(""), "");
assert_eq!(Normalizer::normalize_phone("---"), "");
}
#[test]
fn normalize_phone_does_not_strip_44_if_too_short() {
// A leading "44" is only treated as a country code when the number has
// at least 12 digits; this one has 7 and must be left alone.
assert_eq!(Normalizer::normalize_phone("4412345"), "4412345");
}
#[test]
fn normalize_phone_is_idempotent() {
for input in [
"07700 900123",
"+44 7700 900123",
"0044 7700 900123",
"(029) 2034 5678",
"",
] {
let once = Normalizer::normalize_phone(input);
let twice = Normalizer::normalize_phone(&once);
assert_eq!(once, twice, "not idempotent for {input:?}");
}
}
#[test]
fn normalize_phone_keeps_lone_zero() {
// The trunk zero is only stripped when digits follow it.
assert_eq!(Normalizer::normalize_phone("0"), "0");
}
// ---- phonetic_code ----
#[test]
fn phonetic_code_groups_smith_and_smyth() {
assert_eq!(
Normalizer::phonetic_code("Smith"),
Normalizer::phonetic_code("Smyth")
);
}
#[test]
fn phonetic_code_groups_stephen_and_steven() {
assert_eq!(
Normalizer::phonetic_code("Stephen"),
Normalizer::phonetic_code("Steven")
);
}
#[test]
fn phonetic_code_distinguishes_different_families() {
assert_ne!(
Normalizer::phonetic_code("Jones"),
Normalizer::phonetic_code("Smith")
);
assert_ne!(
Normalizer::phonetic_code("Anderson"),
Normalizer::phonetic_code("Zimmerman")
);
}
#[test]
fn phonetic_code_specific_values() {
assert_eq!(Normalizer::phonetic_code("Smith"), "S530");
assert_eq!(Normalizer::phonetic_code("Smyth"), "S530");
assert_eq!(Normalizer::phonetic_code("Jones"), "J520");
assert_eq!(Normalizer::phonetic_code("Johnson"), "J525");
}
#[test]
fn phonetic_code_handles_empty() {
assert_eq!(Normalizer::phonetic_code(""), "");
assert_eq!(Normalizer::phonetic_code(" "), "");
}
#[test]
fn phonetic_code_is_case_insensitive() {
assert_eq!(
Normalizer::phonetic_code("SMITH"),
Normalizer::phonetic_code("smith")
);
}
// ---- normalize_phone_e164 ----
#[test]
fn e164_uk_layouts_canonicalise_identically() {
let canonical = Some("+447700900123".to_string());
assert_eq!(
Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")),
canonical,
);
assert_eq!(
Normalizer::normalize_phone_e164("0044 7700 900123", Some("GB")),
canonical,
);
assert_eq!(
Normalizer::normalize_phone_e164("07700 900123", Some("GB")),
canonical,
);
assert_eq!(
Normalizer::normalize_phone_e164("(07700) 900-123", Some("GB")),
canonical,
);
}
#[test]
fn e164_french_layouts_canonicalise_identically() {
let canonical = Some("+33123456789".to_string());
assert_eq!(
Normalizer::normalize_phone_e164("+33 1 23 45 67 89", Some("FR")),
canonical,
);
assert_eq!(
Normalizer::normalize_phone_e164("0033 1 23 45 67 89", Some("FR")),
canonical,
);
assert_eq!(
Normalizer::normalize_phone_e164("01 23 45 67 89", Some("FR")),
canonical,
);
}
#[test]
fn e164_spain_has_no_national_trunk_prefix() {
assert_eq!(
Normalizer::normalize_phone_e164("912 345 678", Some("ES")),
Some("+34912345678".to_string()),
);
assert_eq!(
Normalizer::normalize_phone_e164("+34 912 345 678", None),
Some("+34912345678".to_string()),
);
}
#[test]
fn e164_ireland_three_digit_dial_code() {
assert_eq!(
Normalizer::normalize_phone_e164("+353 1 234 5678", None),
Some("+35312345678".to_string()),
);
assert_eq!(
Normalizer::normalize_phone_e164("01 234 5678", Some("IE")),
Some("+35312345678".to_string()),
);
}
#[test]
fn e164_nanp_handles_us_and_canada() {
assert_eq!(
Normalizer::normalize_phone_e164("(415) 555-1234", Some("US")),
Some("+14155551234".to_string()),
);
assert_eq!(
Normalizer::normalize_phone_e164("+1 415 555 1234", None),
Some("+14155551234".to_string()),
);
assert_eq!(
Normalizer::normalize_phone_e164("(416) 555-1234", Some("CA")),
Some("+14165551234".to_string()),
);
}
#[test]
fn e164_lithuania_uses_eight_as_trunk_prefix() {
assert_eq!(
Normalizer::normalize_phone_e164("8 612 34567", Some("LT")),
Some("+37061234567".to_string()),
);
assert_eq!(
Normalizer::normalize_phone_e164("+370 612 34567", None),
Some("+37061234567".to_string()),
);
}
#[test]
fn e164_greece_has_no_national_trunk_prefix() {
assert_eq!(
Normalizer::normalize_phone_e164("+30 210 123 4567", None),
Some("+302101234567".to_string()),
);
assert_eq!(
Normalizer::normalize_phone_e164("210 123 4567", Some("GR")),
Some("+302101234567".to_string()),
);
}
#[test]
fn e164_romania_strips_trunk_zero() {
assert_eq!(
Normalizer::normalize_phone_e164("0721 234 567", Some("RO")),
Some("+40721234567".to_string()),
);
assert_eq!(
Normalizer::normalize_phone_e164("+40 721 234 567", None),
Some("+40721234567".to_string()),
);
}
#[test]
fn e164_czech_no_trunk_prefix() {
assert_eq!(
Normalizer::normalize_phone_e164("+420 234 567 890", None),
Some("+420234567890".to_string()),
);
assert_eq!(
Normalizer::normalize_phone_e164("234 567 890", Some("CZ")),
Some("+420234567890".to_string()),
);
}
#[test]
fn e164_iceland_seven_digit_nsn() {
assert_eq!(
Normalizer::normalize_phone_e164("+354 412 3456", None),
Some("+3544123456".to_string()),
);
}
#[test]
fn e164_distinguishes_overlapping_three_digit_dial_codes() {
// 385 and 386 share their first two digits; the three-digit prefix must
// be matched in full.
let hr = Normalizer::normalize_phone_e164("+385 91 234 5678", None);
let si = Normalizer::normalize_phone_e164("+386 41 234 567", None);
assert!(hr.is_some());
assert!(si.is_some());
assert_ne!(hr, si);
}
#[test]
fn e164_distinguishes_countries_with_overlapping_national_digits() {
let uk = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
let fr = Normalizer::normalize_phone_e164("07700 90012", Some("FR"));
assert!(uk.is_some());
assert!(fr.is_some());
assert_ne!(uk, fr);
}
#[test]
fn e164_returns_none_when_default_country_missing_and_no_marker() {
assert_eq!(Normalizer::normalize_phone_e164("07700 900123", None), None);
}
#[test]
fn e164_returns_none_for_unknown_dial_code() {
assert_eq!(Normalizer::normalize_phone_e164("+999 1234567", None), None);
}
#[test]
fn e164_returns_none_for_empty_or_punctuation_only() {
assert_eq!(Normalizer::normalize_phone_e164("", Some("GB")), None);
assert_eq!(Normalizer::normalize_phone_e164("+", Some("GB")), None);
assert_eq!(Normalizer::normalize_phone_e164("()-", Some("GB")), None);
}
#[test]
fn e164_returns_none_for_too_short_or_too_long_nsn() {
assert_eq!(Normalizer::normalize_phone_e164("+44 1", None), None);
assert_eq!(
Normalizer::normalize_phone_e164("+44 123456789012345", None),
None,
);
}
#[test]
fn e164_rejects_unknown_default_country() {
assert_eq!(
Normalizer::normalize_phone_e164("07700 900123", Some("XX")),
None,
);
}
#[test]
fn e164_is_idempotent_on_canonical_form() {
// NOTE: every input is parsed with a GB default, so the national-format
// `(415) 555-1234` is canonicalised as a GB number here. The test checks
// stability under re-application only, not the country assignment.
for input in [
"+44 7700 900123",
"+33 1 23 45 67 89",
"(415) 555-1234",
"+353 1 234 5678",
"+34 912 345 678",
] {
let once = Normalizer::normalize_phone_e164(input, Some("GB")).expect("parses");
let twice = Normalizer::normalize_phone_e164(&once, Some("GB")).expect("idempotent");
assert_eq!(once, twice, "not idempotent for {input:?}");
}
}
#[test]
fn e164_default_country_lookup_is_case_insensitive() {
let lower = Normalizer::normalize_phone_e164("07700 900123", Some("gb"));
let upper = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
assert_eq!(lower, upper);
assert!(lower.is_some());
}
#[test]
fn e164_handles_double_zero_international_access_form() {
assert_eq!(
Normalizer::normalize_phone_e164("00 33 1 23 45 67 89", None),
Some("+33123456789".to_string()),
);
}
// ---- expand_street_abbreviations ----
// Expansions are emitted lowercase while untouched tokens keep their case.
#[test]
fn expand_street_replaces_common_abbreviations() {
assert_eq!(
Normalizer::expand_street_abbreviations("123 High St"),
"123 High street",
);
assert_eq!(
Normalizer::expand_street_abbreviations("10 Downing Rd"),
"10 Downing road",
);
assert_eq!(
Normalizer::expand_street_abbreviations("12 Sunset Blvd"),
"12 Sunset boulevard",
);
assert_eq!(
Normalizer::expand_street_abbreviations("1 Park Ave"),
"1 Park avenue",
);
assert_eq!(
Normalizer::expand_street_abbreviations("5 Cherry Ln"),
"5 Cherry lane",
);
}
#[test]
fn expand_street_replaces_directionals() {
assert_eq!(
Normalizer::expand_street_abbreviations("45 N Park Ave"),
"45 north Park avenue",
);
assert_eq!(
Normalizer::expand_street_abbreviations("100 SW 5th St"),
"100 southwest 5th street",
);
}
#[test]
fn expand_street_strips_trailing_period_or_comma() {
assert_eq!(
Normalizer::expand_street_abbreviations("123 High St."),
"123 High street",
);
assert_eq!(
Normalizer::expand_street_abbreviations("12 Sunset Blvd,"),
"12 Sunset boulevard",
);
}
#[test]
fn expand_street_passes_unknown_tokens_through() {
assert_eq!(
Normalizer::expand_street_abbreviations("Buckingham Palace"),
"Buckingham Palace",
);
}
#[test]
fn expand_street_is_idempotent_on_already_expanded_input() {
for input in [
"123 High St",
"45 N Park Ave",
"10 Downing Rd",
"Buckingham Palace",
] {
let once = Normalizer::expand_street_abbreviations(input);
let twice = Normalizer::expand_street_abbreviations(&once);
assert_eq!(once, twice, "not idempotent for {input:?}");
}
}
#[test]
fn expand_street_handles_empty_and_whitespace_only() {
assert_eq!(Normalizer::expand_street_abbreviations(""), "");
assert_eq!(Normalizer::expand_street_abbreviations(" "), "");
}
// ---- normalize_address_line ----
#[test]
fn normalize_address_line_unifies_abbreviated_and_full_forms() {
assert_eq!(
Normalizer::normalize_address_line("123 High St"),
Normalizer::normalize_address_line("123 High Street"),
);
assert_eq!(
Normalizer::normalize_address_line("45 N Park Ave"),
Normalizer::normalize_address_line("45 North Park Avenue"),
);
}
#[test]
fn normalize_address_line_handles_punctuation_and_case() {
assert_eq!(
Normalizer::normalize_address_line("10, DOWNING Street."),
"10 downing street",
);
}
#[test]
fn normalize_address_line_is_idempotent() {
for input in [
"123 High St",
" 45 N Park Ave ",
"10, Downing Street.",
"",
] {
let once = Normalizer::normalize_address_line(input);
let twice = Normalizer::normalize_address_line(&once);
assert_eq!(once, twice, "not idempotent for {input:?}");
}
}
// ---- parse_address_line ----
#[test]
fn parse_address_extracts_simple_house_number() {
let p = Normalizer::parse_address_line("123 High Street");
assert_eq!(p.house_number.as_deref(), Some("123"));
assert_eq!(p.unit, None);
assert_eq!(p.street, "high street");
}
#[test]
fn parse_address_handles_alphanumeric_house_number() {
let p = Normalizer::parse_address_line("10A Downing St");
assert_eq!(p.house_number.as_deref(), Some("10A"));
assert_eq!(p.street, "downing street");
}
#[test]
fn parse_address_does_not_greedily_consume_street_name() {
// The "A" of "Apple" must not be mistaken for a house-number suffix.
let p = Normalizer::parse_address_line("10 Apple Tree Lane");
assert_eq!(p.house_number.as_deref(), Some("10"));
assert_eq!(p.street, "apple tree lane");
}
#[test]
fn parse_address_recognises_flat_prefix() {
let p = Normalizer::parse_address_line("Flat 2A, 10 Downing Street");
assert_eq!(p.unit.as_deref(), Some("flat 2a"));
assert_eq!(p.house_number.as_deref(), Some("10"));
assert_eq!(p.street, "downing street");
}
#[test]
fn parse_address_recognises_apt_prefix() {
let p = Normalizer::parse_address_line("Apt 5, 1600 Pennsylvania Ave");
assert_eq!(p.unit.as_deref(), Some("apt 5"));
assert_eq!(p.house_number.as_deref(), Some("1600"));
assert_eq!(p.street, "pennsylvania avenue");
}
#[test]
fn parse_address_recognises_suite_and_ste_and_unit_and_room() {
for input in [
"Suite 12, 100 Main St",
"Ste 12, 100 Main St",
"Unit 12, 100 Main St",
"Room 12, 100 Main St",
] {
let p = Normalizer::parse_address_line(input);
assert!(p.unit.is_some(), "no unit for {input:?}");
assert_eq!(p.house_number.as_deref(), Some("100"));
assert_eq!(p.street, "main street");
}
}
#[test]
fn parse_address_no_leading_number_falls_back_to_street_only() {
let p = Normalizer::parse_address_line("Buckingham Palace");
assert_eq!(p.house_number, None);
assert_eq!(p.unit, None);
assert_eq!(p.street, "buckingham palace");
}
#[test]
fn parse_address_empty_input_yields_empty_street() {
let p = Normalizer::parse_address_line("");
assert_eq!(p.house_number, None);
assert_eq!(p.unit, None);
assert_eq!(p.street, "");
}
#[test]
fn parse_address_round_trips_through_serde() {
let p = Normalizer::parse_address_line("Flat 2A, 10A Downing Street");
let json = serde_json::to_string(&p).unwrap();
let back: ParsedAddressLine = serde_json::from_str(&json).unwrap();
assert_eq!(p, back);
}
#[test]
fn parse_address_uppercases_house_number_suffix() {
let p = Normalizer::parse_address_line("10a Downing St");
assert_eq!(p.house_number.as_deref(), Some("10A"));
}
// ---- normalize_email ----
#[test]
fn normalize_email_lowercases_and_trims() {
assert_eq!(
Normalizer::normalize_email(" Alice@Example.ORG ", false),
Some("alice@example.org".into()),
);
}
#[test]
fn normalize_email_preserves_well_formed_input() {
assert_eq!(
Normalizer::normalize_email("alice@example.org", false),
Some("alice@example.org".into()),
);
}
#[test]
fn normalize_email_rejects_missing_at_sign() {
assert_eq!(Normalizer::normalize_email("no-at-sign", false), None);
}
#[test]
fn normalize_email_rejects_empty_localpart_or_domain() {
assert_eq!(Normalizer::normalize_email("@example.org", false), None);
assert_eq!(Normalizer::normalize_email("alice@", false), None);
}
#[test]
fn normalize_email_rejects_multiple_at_signs() {
assert_eq!(Normalizer::normalize_email("a@b@c", false), None);
}
#[test]
fn normalize_email_rejects_empty_and_whitespace() {
assert_eq!(Normalizer::normalize_email("", false), None);
assert_eq!(Normalizer::normalize_email(" ", false), None);
}
#[test]
fn normalize_email_gmail_dot_folding_strips_dots_in_localpart() {
assert_eq!(
Normalizer::normalize_email("j.smith@gmail.com", true),
Some("jsmith@gmail.com".into()),
);
assert_eq!(
Normalizer::normalize_email("J.S.M.I.T.H@gmail.com", true),
Some("jsmith@gmail.com".into()),
);
}
#[test]
fn normalize_email_gmail_dot_folding_strips_plus_tag() {
// The domain is kept as written: googlemail.com is folded but not
// rewritten to gmail.com.
assert_eq!(
Normalizer::normalize_email("jsmith+work@gmail.com", true),
Some("jsmith@gmail.com".into()),
);
assert_eq!(
Normalizer::normalize_email("j.smith+anything@googlemail.com", true),
Some("jsmith@googlemail.com".into()),
);
}
#[test]
fn normalize_email_gmail_dot_folding_does_not_touch_other_domains() {
assert_eq!(
Normalizer::normalize_email("j.smith@example.org", true),
Some("j.smith@example.org".into()),
);
assert_eq!(
Normalizer::normalize_email("jsmith+work@example.org", true),
Some("jsmith+work@example.org".into()),
);
}
#[test]
fn normalize_email_dot_folding_off_preserves_localpart_dots() {
assert_eq!(
Normalizer::normalize_email("j.smith@gmail.com", false),
Some("j.smith@gmail.com".into()),
);
}
#[test]
fn normalize_email_is_idempotent_on_canonical_form() {
for (input, fold) in [
("Alice@Example.ORG", false),
("j.smith@gmail.com", true),
("jsmith+x@gmail.com", true),
("user@host.tld", false),
] {
let once = Normalizer::normalize_email(input, fold).expect("parses");
let twice = Normalizer::normalize_email(&once, fold).expect("idempotent");
assert_eq!(once, twice, "not idempotent for {input:?} fold={fold}");
}
}
#[test]
fn normalize_email_gmail_dot_folding_rejects_empty_localpart_after_folding() {
assert_eq!(Normalizer::normalize_email("...@gmail.com", true), None);
}
// ---- additional edge cases for earlier groups ----
#[test]
fn parse_address_does_not_treat_st_as_unit_prefix() {
// "St" is a street abbreviation, not one of the unit keywords.
let p = Normalizer::parse_address_line("St Mary's Road");
assert_eq!(p.unit, None);
}
#[test]
fn e164_strips_trunk_zero_after_country_code() {
// Covers the "+44 (0)..." style where the trunk zero is written after
// the country code.
assert_eq!(
Normalizer::normalize_phone_e164("+44 0 7700 900123", None),
Some("+447700900123".to_string()),
);
}
}