zer_blocking/
normalize.rs1use unicode_normalization::UnicodeNormalization;
2
/// Canonicalizes a licence-plate string for comparison.
///
/// Keeps only ASCII letters and digits (hyphens, spaces, punctuation and any
/// non-ASCII characters are discarded) and upper-cases the letters, so
/// `"25-xkl-9"` becomes `"25XKL9"`.
pub fn normalize_plate(s: &str) -> String {
    let mut plate = String::with_capacity(s.len());
    for ch in s.chars() {
        if ch.is_ascii_alphanumeric() {
            plate.push(ch.to_ascii_uppercase());
        }
    }
    plate
}
11
12pub fn transliterate_and_normalize(s: &str) -> String {
16 let ascii = any_ascii::any_ascii(s);
17 normalize_text(&ascii)
18}
19
20pub fn normalize_text(s: &str) -> String {
21 s.nfkd()
22 .filter(|c| c.is_ascii())
23 .collect::<String>()
24 .to_ascii_uppercase()
25 .split_whitespace()
26 .collect::<Vec<_>>()
27 .join(" ")
28}
29
/// Returns only the ASCII digits (`0`-`9`) of `s`, in their original order.
///
/// Everything else — punctuation, spaces, letters, and non-ASCII digits — is
/// removed. An input without digits yields an empty string.
pub fn normalize_digits_only(s: &str) -> String {
    // ASCII digits are always single bytes in UTF-8 and never occur inside a
    // multi-byte sequence, so filtering at byte level is equivalent to
    // filtering chars.
    s.bytes()
        .filter(u8::is_ascii_digit)
        .map(char::from)
        .collect()
}
33
/// Removes one leading Dutch surname prefix ("tussenvoegsel") from `s`.
///
/// The prefix is matched ASCII-case-insensitively at the very start of `s`
/// and must include its trailing space, so `"VAN DER WAL"` yields `"WAL"`
/// while `"VANDAAG"` and a bare `"VAN"` are returned unchanged. Only the
/// first matching entry of the table is stripped; the returned slice borrows
/// from `s`, and `s` itself is returned when nothing matches.
pub fn strip_tussenvoegsel(s: &str) -> &str {
    // Order matters: longer prefixes come before the shorter prefixes they
    // start with (e.g. "VAN DER " before "VAN "), and the first hit wins.
    const PREFIXES: &[&str] = &[
        "VAN DER ", "VAN DEN ", "VAN DE ", "VAN HET ", "VAN 'T ",
        "VAN T ", "VAN ", "DEN ", "DER ", "DE ", "TEN ", "TER ",
        "TE ", "IN 'T ", "IN T ", "OP DEN ", "OP DE ", "OP HET ",
        "OP ", "V/D ", "V.D. ",
    ];

    PREFIXES
        .iter()
        .find_map(|prefix| {
            // `get` returns `None` when `s` is too short or the cut would
            // fall inside a multi-byte character; neither case can match an
            // all-ASCII prefix, so skipping is correct and nothing panics.
            let head = s.get(..prefix.len())?;
            if head.eq_ignore_ascii_case(prefix) {
                s.get(prefix.len()..)
            } else {
                None
            }
        })
        .unwrap_or(s)
}
52
53pub fn extract_surname_token(normalized: &str) -> &str {
56 let stripped = strip_tussenvoegsel(normalized);
57 stripped.split_whitespace().last().unwrap_or(stripped)
58}
59
#[cfg(test)]
mod tests {
    use super::*;

    // Each test lists `(input, expected)` pairs and checks them in a loop.

    #[test]
    fn normalize_strips_diacritics() {
        let cases = [
            ("Müller", "MULLER"),
            ("Çelik", "CELIK"),
            ("van den Berg", "VAN DEN BERG"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(normalize_text(input), expected);
        }
    }

    #[test]
    fn normalize_collapses_whitespace() {
        let collapsed = normalize_text("  Jan   de   Vries  ");
        assert_eq!(collapsed, "JAN DE VRIES");
    }

    #[test]
    fn normalize_digits_only_strips_punctuation() {
        let cases = [
            ("555-123-4567", "5551234567"),
            ("+31 (0)20 123 4567", "310201234567"),
            ("no digits", ""),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(normalize_digits_only(input), expected);
        }
    }

    #[test]
    fn strip_tussenvoegsel_van_der() {
        let cases = [
            ("VAN DER WAL", "WAL"),
            ("VAN DEN BERG", "BERG"),
            ("DE VRIES", "VRIES"),
            ("JANSEN", "JANSEN"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(strip_tussenvoegsel(input), expected);
        }
    }

    #[test]
    fn extract_surname_token_from_full_name() {
        let cases = [
            ("Saddam Hussein Al-Tikriti", "AL-TIKRITI"),
            ("Jan de Vries", "VRIES"),
        ];
        for &(raw, expected) in cases.iter() {
            // The token is extracted from the normalized form, not the raw name.
            let normalized = normalize_text(raw);
            assert_eq!(extract_surname_token(&normalized), expected);
        }
    }

    #[test]
    fn normalize_plate_strips_hyphens_and_uppercases() {
        let cases = [
            ("25-XKL-9", "25XKL9"),
            ("LD-321-F", "LD321F"),
            ("CX-180-W", "CX180W"),
            ("cx180w", "CX180W"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(normalize_plate(input), expected);
        }
    }

    #[test]
    fn transliterate_and_normalize_handles_latin_diacritics() {
        let cases = [("Müller", "MULLER"), ("Çelik", "CELIK")];
        for &(input, expected) in cases.iter() {
            assert_eq!(transliterate_and_normalize(input), expected);
        }
    }
}
113}