//! zer_blocking/normalize.rs
//!
//! Normalization helpers for license plates, free text, digit strings and Dutch surnames.

1use unicode_normalization::UnicodeNormalization;
2
/// Canonicalize a license plate for comparison.
///
/// Only ASCII letters and digits are kept, so hyphens, spaces, any other
/// punctuation and all non-ASCII characters are dropped. The kept characters
/// are uppercased: `"25-XKL-9"` becomes `"25XKL9"`.
pub fn normalize_plate(s: &str) -> String {
    let mut plate = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_ascii_alphanumeric() {
            plate.push(ch.to_ascii_uppercase());
        }
    }
    plate
}
11
12/// Transliterate non-Latin Unicode to ASCII via any_ascii, then apply
13/// standard normalization (NFKD diacritic stripping + uppercase + collapse whitespace).
14/// Useful for Arabic/Cyrillic name input before phonetic encoding.
15pub fn transliterate_and_normalize(s: &str) -> String {
16    let ascii = any_ascii::any_ascii(s);
17    normalize_text(&ascii)
18}
19
20pub fn normalize_text(s: &str) -> String {
21    s.nfkd()
22        .filter(|c| c.is_ascii())
23        .collect::<String>()
24        .to_ascii_uppercase()
25        .split_whitespace()
26        .collect::<Vec<_>>()
27        .join(" ")
28}
29
/// Keep only the ASCII digits (`0`-`9`) of `s`, in their original order.
///
/// Everything else (letters, punctuation, whitespace, non-ASCII characters)
/// is discarded, so `"+31 (0)20 123 4567"` becomes `"310201234567"` and an
/// input without digits yields an empty string.
pub fn normalize_digits_only(s: &str) -> String {
    let mut digits = String::new();
    digits.extend(s.chars().filter(char::is_ascii_digit));
    digits
}
33
/// Strip one leading Dutch tussenvoegsel (surname prefix) so that
/// "VAN DEN BERG" and "BERG" produce the same phonetic key.
///
/// The prefix comparison ignores ASCII case. The returned slice borrows from
/// `s`, so the remainder keeps its original casing. Only the first matching
/// prefix is removed; if none matches, `s` is returned unchanged.
pub fn strip_tussenvoegsel(s: &str) -> &str {
    // Order matters: longer prefixes precede the shorter ones they contain
    // ("VAN DER " before "VAN "), because the first match wins.
    const PREFIXES: &[&str] = &[
        "VAN DER ", "VAN DEN ", "VAN DE ", "VAN HET ", "VAN 'T ",
        "VAN T ", "VAN ", "DEN ", "DER ", "DE ", "TEN ", "TER ",
        "TE ", "IN 'T ", "IN T ", "OP DEN ", "OP DE ", "OP HET ",
        "OP ", "V/D ", "V.D. ",
    ];

    // Compare the leading bytes in place instead of allocating an uppercased
    // copy of the whole input. `get` yields `None` when `s` is shorter than
    // the prefix, so there is no out-of-range slicing.
    let bytes = s.as_bytes();
    PREFIXES
        .iter()
        .find(|prefix| {
            bytes
                .get(..prefix.len())
                .map_or(false, |head| head.eq_ignore_ascii_case(prefix.as_bytes()))
        })
        // A match means the leading bytes are all ASCII, so `prefix.len()` is
        // guaranteed to fall on a char boundary of `s`.
        .map_or(s, |prefix| &s[prefix.len()..])
}
52
53/// Return the last whitespace-delimited token of a name, stripping
54/// tussenvoegsel from single-field full-name representations.
55pub fn extract_surname_token(normalized: &str) -> &str {
56    let stripped = strip_tussenvoegsel(normalized);
57    stripped.split_whitespace().last().unwrap_or(stripped)
58}
59
#[cfg(test)]
mod tests {
    use super::*;

    /// Diacritics are removed and the result is uppercased.
    #[test]
    fn normalize_strips_diacritics() {
        let cases = [
            ("Müller", "MULLER"),
            ("Çelik", "CELIK"),
            ("van den Berg", "VAN DEN BERG"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(normalize_text(input), *expected);
        }
    }

    /// Leading, trailing and repeated whitespace collapses to single spaces.
    #[test]
    fn normalize_collapses_whitespace() {
        let collapsed = normalize_text("  Jan   de  Vries  ");
        assert_eq!(collapsed, "JAN DE VRIES");
    }

    /// Only ASCII digits survive; everything else is discarded.
    #[test]
    fn normalize_digits_only_strips_punctuation() {
        let cases = [
            ("555-123-4567", "5551234567"),
            ("+31 (0)20 123 4567", "310201234567"),
            ("no digits", ""),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(normalize_digits_only(input), *expected);
        }
    }

    /// Known prefixes are stripped; names without a prefix pass through.
    #[test]
    fn strip_tussenvoegsel_van_der() {
        let cases = [
            ("VAN DER WAL", "WAL"),
            ("VAN DEN BERG", "BERG"),
            ("DE VRIES", "VRIES"),
            ("JANSEN", "JANSEN"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(strip_tussenvoegsel(input), *expected);
        }
    }

    /// The surname is the last token of the normalized full name.
    #[test]
    fn extract_surname_token_from_full_name() {
        let cases = [
            ("Saddam Hussein Al-Tikriti", "AL-TIKRITI"),
            ("Jan de Vries", "VRIES"),
        ];
        for (raw, surname) in cases.iter() {
            let normalized = normalize_text(raw);
            assert_eq!(extract_surname_token(&normalized), *surname);
        }
    }

    /// Hyphens are removed and lowercase plates are uppercased.
    #[test]
    fn normalize_plate_strips_hyphens_and_uppercases() {
        let cases = [
            ("25-XKL-9", "25XKL9"),
            ("LD-321-F", "LD321F"),
            ("CX-180-W", "CX180W"),
            ("cx180w", "CX180W"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(normalize_plate(input), *expected);
        }
    }

    /// Latin input with diacritics comes out as plain uppercase ASCII.
    #[test]
    fn transliterate_and_normalize_handles_latin_diacritics() {
        let cases = [("Müller", "MULLER"), ("Çelik", "CELIK")];
        for (input, expected) in cases.iter() {
            assert_eq!(transliterate_and_normalize(input), *expected);
        }
    }
}