worker_matcher/normalizer.rs
1//! Text normalisation for worker demographic data.
2//!
3//! Research on worker identification (see `spec.md` §5) is unanimous: most
4//! accuracy gains come from **standardising the input** before scoring, not
5//! from cleverer similarity algorithms. This module exposes the canonical
6//! transformations the matching engine applies to names, postcodes, phone
7//! numbers, and phonetic codes.
8//!
//! All transformations are intended to be **idempotent**: `f(f(x)) == f(x)`.
//! They are also **deterministic** and never mutate their input; each call
//! returns a freshly allocated value.
11//!
12//! ## Quick examples
13//!
14//! ```
15//! use worker_matcher::Normalizer;
16//!
17//! // Names: lowercase, drop diacritics, drop ASCII punctuation, collapse spaces.
18//! assert_eq!(Normalizer::normalize_name(" O'Brien "), "obrien");
19//! assert_eq!(Normalizer::normalize_name("Siân"), "sian");
20//!
21//! // Postcodes: strip whitespace, uppercase.
22//! assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
23//!
24//! // Phone numbers: keep digits, strip international and trunk prefixes.
25//! assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
26//! ```
27//!
28//! ## What this module deliberately does *not* do
29//!
30//! - It does not validate NHS numbers — that is delegated to the
31//! `nhs-number` crate at the call-site (see [`crate::matcher`]).
//! - It does not normalise middle names (see spec task OQ-1). Email
//!   addresses (spec task T-11) are handled by [`Normalizer::normalize_email`].
34//! - It does not handle non-ASCII punctuation such as the curly apostrophe
35//! `’` (U+2019). Upstream code should convert those to ASCII first.
36//!
37//! ## International phone numbers
38//!
39//! Two phone normalisers are provided:
40//!
41//! - [`Normalizer::normalize_phone`] — UK-centric national-significant form,
42//! suitable for legacy or single-jurisdiction call-sites. Idempotent and
43//! infallible.
44//! - [`Normalizer::normalize_phone_e164`] — international-aware E.164 form
45//! (`+CCNNNN…`) for jurisdictions in the supported country table. Returns
46//! `None` if the input cannot be confidently parsed.
47//!
48//! The matching engine tries E.164 first and falls back to the legacy form
49//! when either input is unparseable, so existing single-country deployments
50//! observe the same behaviour while multinational deployments gain
51//! cross-country disambiguation (a French number and a UK number that share
52//! the same trunk digits no longer collide).
53
54use serde::{Deserialize, Serialize};
55use unicode_normalization::UnicodeNormalization;
56
/// Stateless namespace for the crate's text-normalisation routines.
///
/// `Normalizer` is a unit struct (it has no fields) and every routine is an
/// associated function, so no instance is ever constructed. It is a type
/// rather than a module of free functions purely so the public API has a
/// single, discoverable entry point.
///
/// ```
/// use worker_matcher::Normalizer;
///
/// let canonical = Normalizer::normalize_name("José-María");
/// assert_eq!(canonical, "josemaria");
/// ```
pub struct Normalizer;
70
71impl Normalizer {
72 /// Normalise a human name for comparison.
73 ///
74 /// Steps, in order:
75 ///
76 /// 1. Decompose to Unicode NFKD form (`é` → `e` + combining acute).
77 /// 2. Drop combining marks (diacritics).
78 /// 3. Drop ASCII punctuation (apostrophes, hyphens, full stops, …).
79 /// 4. Lowercase.
80 /// 5. Collapse consecutive whitespace to single ASCII spaces; trim ends.
81 ///
82 /// The result is suitable for direct equality comparison or for feeding
83 /// into a string-similarity scorer.
84 ///
85 /// # Examples
86 ///
87 /// Whitespace is collapsed and trimmed:
88 ///
89 /// ```
90 /// use worker_matcher::Normalizer;
91 /// assert_eq!(Normalizer::normalize_name(" John Smith "), "john smith");
92 /// ```
93 ///
94 /// Apostrophes and hyphens are stripped:
95 ///
96 /// ```
97 /// # use worker_matcher::Normalizer;
98 /// assert_eq!(Normalizer::normalize_name("O'Brien"), "obrien");
99 /// assert_eq!(Normalizer::normalize_name("MARY-JANE"), "maryjane");
100 /// ```
101 ///
102 /// Diacritics are removed:
103 ///
104 /// ```
105 /// # use worker_matcher::Normalizer;
106 /// assert_eq!(Normalizer::normalize_name("José"), "jose");
107 /// assert_eq!(Normalizer::normalize_name("Siân"), "sian");
108 /// assert_eq!(Normalizer::normalize_name("Łukasz"), "łukasz"); // ł has no decomposition
109 /// ```
110 ///
111 /// Empty and whitespace-only input round-trip cleanly:
112 ///
113 /// ```
114 /// # use worker_matcher::Normalizer;
115 /// assert_eq!(Normalizer::normalize_name(""), "");
116 /// assert_eq!(Normalizer::normalize_name(" "), "");
117 /// ```
118 ///
119 /// The function is **idempotent**:
120 ///
121 /// ```
122 /// # use worker_matcher::Normalizer;
123 /// let once = Normalizer::normalize_name(" José-María ");
124 /// let twice = Normalizer::normalize_name(&once);
125 /// assert_eq!(once, twice);
126 /// ```
127 pub fn normalize_name(name: &str) -> String {
128 name.nfkd()
129 .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
130 .filter(|c| !c.is_ascii_punctuation())
131 .collect::<String>()
132 .to_lowercase()
133 .split_whitespace()
134 .collect::<Vec<_>>()
135 .join(" ")
136 }
137
/// Canonicalise a postcode so differently spaced or cased spellings compare equal.
///
/// Every whitespace character is removed and the remainder is uppercased.
/// No locale-specific validation is attempted; that is deliberately out of
/// scope here.
///
/// # Examples
///
/// The conventional space in a UK postcode is irrelevant:
///
/// ```
/// use worker_matcher::Normalizer;
/// assert_eq!(Normalizer::normalize_postcode("CF10 1AA"), "CF101AA");
/// assert_eq!(Normalizer::normalize_postcode("cf101aa"), "CF101AA");
/// assert_eq!(Normalizer::normalize_postcode(" cf10 1aa "), "CF101AA");
/// ```
///
/// Empty input stays empty:
///
/// ```
/// # use worker_matcher::Normalizer;
/// assert_eq!(Normalizer::normalize_postcode(""), "");
/// ```
///
/// A second application is a no-op:
///
/// ```
/// # use worker_matcher::Normalizer;
/// let once = Normalizer::normalize_postcode("sw1a 2aa");
/// let twice = Normalizer::normalize_postcode(&once);
/// assert_eq!(once, twice);
/// ```
pub fn normalize_postcode(postcode: &str) -> String {
    // Concatenating the whitespace-separated pieces removes all whitespace.
    let compact: String = postcode.split_whitespace().collect();
    compact.to_uppercase()
}
176
/// Normalise a phone number to a UK-centric national-significant form.
///
/// Steps:
///
/// 1. Keep only ASCII digits (drop spaces, brackets, hyphens, `+`, …).
/// 2. If the digits start with `0044` and more digits follow, drop those
///    four characters.
/// 3. Else, if the digits start with `44` and there are at least 12 of
///    them, drop the leading `44`.
/// 4. If step 2 or 3 removed a country code and the remainder still starts
///    with a trunk `0` (as in `+44 (0)7700 900123`), drop that `0` too.
/// 5. Otherwise, if the digits start with `0` and are longer than one
///    digit, drop the leading `0`.
///
/// This collapses the common UK layouts into one subscriber number with no
/// leading prefix. Digit strings that match none of the prefixes above are
/// returned as they are.
///
/// # Examples
///
/// ```
/// use worker_matcher::Normalizer;
///
/// // UK mobile, in four formats:
/// assert_eq!(Normalizer::normalize_phone("07700 900123"), "7700900123");
/// assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
/// assert_eq!(Normalizer::normalize_phone("0044 7700 900123"), "7700900123");
/// assert_eq!(Normalizer::normalize_phone("+44 (0)7700 900123"), "7700900123");
///
/// // UK landline with brackets and spaces:
/// assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
///
/// // Empty input is preserved (no digits to keep):
/// assert_eq!(Normalizer::normalize_phone(""), "");
/// ```
///
/// Idempotent on canonical inputs:
///
/// ```
/// # use worker_matcher::Normalizer;
/// let once = Normalizer::normalize_phone("+44 (0)7700 900123");
/// let twice = Normalizer::normalize_phone(&once);
/// assert_eq!(once, twice);
/// ```
pub fn normalize_phone(phone: &str) -> String {
    /// Drop one leading trunk `0`, unless that would leave nothing behind.
    fn strip_trunk_zero(number: &str) -> &str {
        match number.strip_prefix('0') {
            Some(rest) if !rest.is_empty() => rest,
            _ => number,
        }
    }

    let digits: String = phone.chars().filter(char::is_ascii_digit).collect();

    // Peel off an explicit UK country code. The `44` form is only trusted
    // when the string is long enough (12+ digits) to hold a full number
    // after it, so short national numbers that merely begin with 44 survive.
    let after_country_code = match digits.strip_prefix("0044") {
        Some(rest) if !rest.is_empty() => Some(rest),
        _ if digits.len() >= 12 => digits.strip_prefix("44"),
        _ => None,
    };

    // Whether or not a country code was removed, a single remaining trunk
    // zero is not part of the subscriber number. Previously the zero in
    // "+44 (0)7700 900123" survived, so that layout failed to match the
    // other UK layouts and a second pass changed the result.
    strip_trunk_zero(after_country_code.unwrap_or(&digits)).to_string()
}
234
235 /// Normalise a phone number to its E.164-style canonical form.
236 ///
237 /// E.164 is the ITU-T standard for international telephone numbers and
238 /// has the shape `+CCNNN…`, where `CC` is the country dialling code
239 /// (1–3 digits) and the remainder is the national-significant number
240 /// (NSN) with no trunk prefix.
241 ///
242 /// The function accepts a wide range of textual layouts:
243 ///
244 /// - `+CC…` (explicit international, the canonical input form).
245 /// - `00CC…` (international access code, common across Europe).
246 /// - `0…` (national format, trunk-prefix) — interpreted relative to
247 /// `default_country` when the country uses a national trunk `0`.
248 /// - `NSN…` (bare national-significant number) — interpreted relative
249 /// to `default_country`.
250 ///
251 /// Returns `Some(canonical)` if the input parses against a country in
252 /// the supported table; otherwise `None`. The supported countries are
253 /// the five jurisdictions for which the crate exposes a national
254 /// healthcare identifier (United Kingdom, France, Spain, Ireland, and
255 /// — sharing the GB dial code — UK Northern Ireland), plus the most
256 /// common worker-mobility partners (US, CA, DE, IT, NL, BE, PT, CH,
257 /// AT, SE, NO, DK, FI, PL, AU, NZ, JP, CN, IN, BR, MX, ZA). `default_country` is the
258 /// **ISO 3166-1 alpha-2 code** (e.g. `"GB"`, `"FR"`, `"US"`) of the
259 /// jurisdiction whose national format applies when the input lacks an
260 /// explicit international marker. Pass `None` to refuse to assume a
261 /// default — only explicit `+CC` / `00CC` inputs will parse.
262 ///
263 /// The function is **deterministic** and **idempotent**: feeding a
264 /// canonical `+CCNNN…` string back in returns the same string.
265 ///
266 /// # Examples
267 ///
268 /// UK mobile, three textual layouts, all canonicalise to the same E.164 form:
269 ///
270 /// ```
271 /// use worker_matcher::Normalizer;
272 /// assert_eq!(
273 /// Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")),
274 /// Some("+447700900123".to_string()),
275 /// );
276 /// assert_eq!(
277 /// Normalizer::normalize_phone_e164("0044 7700 900123", Some("GB")),
278 /// Some("+447700900123".to_string()),
279 /// );
280 /// assert_eq!(
281 /// Normalizer::normalize_phone_e164("07700 900123", Some("GB")),
282 /// Some("+447700900123".to_string()),
283 /// );
284 /// ```
285 ///
286 /// French national format vs international form:
287 ///
288 /// ```
289 /// # use worker_matcher::Normalizer;
290 /// assert_eq!(
291 /// Normalizer::normalize_phone_e164("01 23 45 67 89", Some("FR")),
292 /// Some("+33123456789".to_string()),
293 /// );
294 /// assert_eq!(
295 /// Normalizer::normalize_phone_e164("+33 1 23 45 67 89", Some("GB")),
296 /// Some("+33123456789".to_string()),
297 /// );
298 /// ```
299 ///
300 /// North American (NANP) numbers have no trunk prefix:
301 ///
302 /// ```
303 /// # use worker_matcher::Normalizer;
304 /// assert_eq!(
305 /// Normalizer::normalize_phone_e164("(415) 555-1234", Some("US")),
306 /// Some("+14155551234".to_string()),
307 /// );
308 /// assert_eq!(
309 /// Normalizer::normalize_phone_e164("+1 415 555 1234", None),
310 /// Some("+14155551234".to_string()),
311 /// );
312 /// ```
313 ///
314 /// Unparseable or ambiguous inputs return `None`:
315 ///
316 /// ```
317 /// # use worker_matcher::Normalizer;
318 /// // No default country and no international marker: ambiguous.
319 /// assert_eq!(Normalizer::normalize_phone_e164("07700 900123", None), None);
320 /// // Unknown dial code.
321 /// assert_eq!(Normalizer::normalize_phone_e164("+999 1234567", None), None);
322 /// // Empty input.
323 /// assert_eq!(Normalizer::normalize_phone_e164("", Some("GB")), None);
324 /// ```
325 ///
326 /// Idempotent on canonical inputs:
327 ///
328 /// ```
329 /// # use worker_matcher::Normalizer;
330 /// let once = Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")).unwrap();
331 /// let twice = Normalizer::normalize_phone_e164(&once, Some("GB")).unwrap();
332 /// assert_eq!(once, twice);
333 /// ```
334 pub fn normalize_phone_e164(phone: &str, default_country: Option<&str>) -> Option<String> {
335 let has_plus = phone.chars().any(|c| c == '+');
336 let digits: String = phone.chars().filter(|c| c.is_ascii_digit()).collect();
337 if digits.is_empty() {
338 return None;
339 }
340
341 let (info, nsn): (&CountryPhoneInfo, String) = if has_plus {
342 let info = lookup_by_dial_code_prefix(&digits)?;
343 let rest = &digits[info.dial_code.len()..];
344 let rest = strip_trunk_prefix(info, rest);
345 (info, rest.to_string())
346 } else if let Some(stripped) = digits.strip_prefix("00") {
347 let info = lookup_by_dial_code_prefix(stripped)?;
348 let rest = &stripped[info.dial_code.len()..];
349 let rest = strip_trunk_prefix(info, rest);
350 (info, rest.to_string())
351 } else {
352 let iso = default_country?;
353 let info = lookup_by_iso(iso)?;
354 let nsn = strip_trunk_prefix(info, &digits);
355 (info, nsn.to_string())
356 };
357
358 if nsn.len() < info.min_nsn || nsn.len() > info.max_nsn {
359 return None;
360 }
361
362 Some(format!("+{}{}", info.dial_code, nsn))
363 }
364
365 /// Expand common postal address abbreviations as whole tokens.
366 ///
367 /// The input is tokenised on whitespace and each token is matched
368 /// case-insensitively (after stripping a single trailing `.` or `,`)
369 /// against a fixed table of street-type and directional abbreviations.
370 /// Recognised tokens are replaced with their long form, lowercased;
371 /// unrecognised tokens are passed through verbatim. Tokens are then
372 /// re-joined by single spaces.
373 ///
374 /// This function is intentionally simple: it does **not** apply any
375 /// position-aware heuristics. The well-known ambiguous case `"St"` —
376 /// which can mean *Street* or *Saint* — is always expanded to
377 /// *Street*. In practice this remains useful for fuzzy matching
378 /// because the canonical form is consistent on both sides of a
379 /// comparison; pre-process upstream if you need finer disambiguation.
380 ///
381 /// # Examples
382 ///
383 /// ```
384 /// use worker_matcher::Normalizer;
385 /// assert_eq!(
386 /// Normalizer::expand_street_abbreviations("123 High St"),
387 /// "123 High street",
388 /// );
389 /// assert_eq!(
390 /// Normalizer::expand_street_abbreviations("45 N. Park Ave."),
391 /// "45 north Park avenue",
392 /// );
393 /// assert_eq!(
394 /// Normalizer::expand_street_abbreviations("12 Sunset Blvd"),
395 /// "12 Sunset boulevard",
396 /// );
397 /// ```
398 ///
399 /// Idempotent on already-expanded inputs (long forms are not
400 /// re-expanded):
401 ///
402 /// ```
403 /// # use worker_matcher::Normalizer;
404 /// let once = Normalizer::expand_street_abbreviations("10 Downing St");
405 /// let twice = Normalizer::expand_street_abbreviations(&once);
406 /// assert_eq!(once, twice);
407 /// ```
408 pub fn expand_street_abbreviations(line: &str) -> String {
409 line.split_whitespace()
410 .map(expand_one_token)
411 .collect::<Vec<_>>()
412 .join(" ")
413 }
414
415 /// Normalise an address line for comparison.
416 ///
417 /// Pipeline:
418 ///
419 /// 1. Expand street-type and directional abbreviations via
420 /// [`Normalizer::expand_street_abbreviations`] (so `"St" → "street"`,
421 /// `"Rd" → "road"`, `"N" → "north"`).
422 /// 2. Apply the name-normalisation pipeline
423 /// ([`Normalizer::normalize_name`]): NFKD-decompose, drop combining
424 /// marks, drop ASCII punctuation, lowercase, collapse whitespace.
425 ///
426 /// The result is idempotent and suitable for direct equality or
427 /// similarity comparison.
428 ///
429 /// # Examples
430 ///
431 /// Abbreviated and full forms canonicalise identically:
432 ///
433 /// ```
434 /// use worker_matcher::Normalizer;
435 /// assert_eq!(
436 /// Normalizer::normalize_address_line("123 High St"),
437 /// Normalizer::normalize_address_line("123 High Street"),
438 /// );
439 /// assert_eq!(
440 /// Normalizer::normalize_address_line("45 N Park Ave"),
441 /// Normalizer::normalize_address_line("45 North Park Avenue"),
442 /// );
443 /// ```
444 ///
445 /// Punctuation and case are normalised:
446 ///
447 /// ```
448 /// # use worker_matcher::Normalizer;
449 /// assert_eq!(
450 /// Normalizer::normalize_address_line("10, DOWNING Street."),
451 /// "10 downing street",
452 /// );
453 /// ```
454 pub fn normalize_address_line(line: &str) -> String {
455 Self::normalize_name(&Self::expand_street_abbreviations(line))
456 }
457
458 /// Parse an address line into its structured components.
459 ///
460 /// The function performs a best-effort structural decomposition of a
461 /// single-line postal address into:
462 ///
463 /// - `house_number` — the leading run of digits (with an optional
464 /// single alphabetic suffix, e.g. `"10A"`), uppercased. `None` if
465 /// no leading number is present.
466 /// - `unit` — a recognised sub-unit prefix (`Flat`, `Apt`,
467 /// `Apartment`, `Unit`, `Suite`, `Ste`) and its identifier,
468 /// lowercased and space-joined (e.g. `"flat 2a"`). `None` if no
469 /// recognised prefix is present.
470 /// - `street` — the remaining text after `unit` and `house_number`
471 /// are removed, run through [`Normalizer::normalize_address_line`].
472 ///
473 /// Parsing is **deterministic** and **format-only** — no postal
474 /// reference is consulted. Inputs that do not match the simple
475 /// regular structure (e.g. a postcode-only string, a city name)
476 /// degrade gracefully: `house_number` and `unit` are `None`, and
477 /// `street` carries the normalised input.
478 ///
479 /// # Examples
480 ///
481 /// Typical UK / US single-line addresses:
482 ///
483 /// ```
484 /// use worker_matcher::Normalizer;
485 ///
486 /// let p = Normalizer::parse_address_line("123 High Street");
487 /// assert_eq!(p.house_number.as_deref(), Some("123"));
488 /// assert_eq!(p.unit, None);
489 /// assert_eq!(p.street, "high street");
490 ///
491 /// let p = Normalizer::parse_address_line("10A Downing St");
492 /// assert_eq!(p.house_number.as_deref(), Some("10A"));
493 /// assert_eq!(p.street, "downing street");
494 ///
495 /// let p = Normalizer::parse_address_line("Flat 2A, 10 Downing Street");
496 /// assert_eq!(p.unit.as_deref(), Some("flat 2a"));
497 /// assert_eq!(p.house_number.as_deref(), Some("10"));
498 /// assert_eq!(p.street, "downing street");
499 ///
500 /// let p = Normalizer::parse_address_line("Apt 5, 1600 Pennsylvania Ave");
501 /// assert_eq!(p.unit.as_deref(), Some("apt 5"));
502 /// assert_eq!(p.house_number.as_deref(), Some("1600"));
503 /// assert_eq!(p.street, "pennsylvania avenue");
504 /// ```
505 ///
506 /// Inputs without a leading number still parse:
507 ///
508 /// ```
509 /// # use worker_matcher::Normalizer;
510 /// let p = Normalizer::parse_address_line("Buckingham Palace");
511 /// assert_eq!(p.house_number, None);
512 /// assert_eq!(p.unit, None);
513 /// assert_eq!(p.street, "buckingham palace");
514 /// ```
515 pub fn parse_address_line(line: &str) -> ParsedAddressLine {
516 let trimmed = line.trim();
517 let (unit, after_unit) = extract_unit_prefix(trimmed);
518 let after_unit = after_unit.trim_start_matches([',', ' ', '\t']).trim();
519 let (house_number, after_number) = extract_house_number(after_unit);
520 let after_number = after_number.trim_start_matches([',', ' ', '\t']).trim();
521 ParsedAddressLine {
522 house_number,
523 unit,
524 street: Self::normalize_address_line(after_number),
525 }
526 }
527
528 /// Compute a phonetic (Soundex) code for a name.
529 ///
530 /// Internally, the input is first normalised via
531 /// [`Normalizer::normalize_name`] and then encoded with the American
532 /// Soundex algorithm. Names that sound alike map to the same code, which
533 /// lets the matcher catch spelling variants such as "Smith" / "Smyth" or
534 /// "Stephen" / "Steven".
535 ///
536 /// The implementation is suitable for English-language names. Non-English
537 /// phonemes may be lost. T-9 (spec §21.4) decided to keep Soundex as the
538 /// default and expose an opt-in `MatchConfig::phonetic_encoder` enum
539 /// (Double Metaphone, Daitch-Mokotoff) gated behind a Cargo feature flag
540 /// once an empirical multinational worker corpus is available;
541 /// implementation is tracked as T-9.1.
542 ///
543 /// # Examples
544 ///
545 /// Similar-sounding spellings share a code:
546 ///
547 /// ```
548 /// use worker_matcher::Normalizer;
549 /// assert_eq!(Normalizer::phonetic_code("Smith"), Normalizer::phonetic_code("Smyth"));
550 /// assert_eq!(Normalizer::phonetic_code("Stephen"), Normalizer::phonetic_code("Steven"));
551 /// ```
552 ///
553 /// Different families produce different codes:
554 ///
555 /// ```
556 /// # use worker_matcher::Normalizer;
557 /// assert_ne!(Normalizer::phonetic_code("Jones"), Normalizer::phonetic_code("Smith"));
558 /// ```
559 ///
560 /// Empty input returns an empty string, not a default Soundex value:
561 ///
562 /// ```
563 /// # use worker_matcher::Normalizer;
564 /// assert_eq!(Normalizer::phonetic_code(""), "");
565 /// assert_eq!(Normalizer::phonetic_code(" "), "");
566 /// ```
567 pub fn phonetic_code(name: &str) -> String {
568 let normalized = Self::normalize_name(name);
569 if normalized.is_empty() {
570 return String::new();
571 }
572 soundex::american_soundex(&normalized)
573 }
574
/// Canonicalise an email address for comparison.
///
/// What happens, in order:
///
/// 1. Surrounding whitespace is trimmed.
/// 2. The whole address is lowercased. (RFC 5321 makes the domain
///    case-insensitive and most real-world deployments treat the localpart
///    the same way; case-sensitive localparts are legal but vanishingly
///    rare in healthcare data.)
/// 3. The address must contain exactly one `@`, with a non-empty localpart
///    before it and a non-empty domain after it; anything else yields
///    `None`.
/// 4. When `gmail_dot_folding` is `true` and the domain is `gmail.com` or
///    `googlemail.com`, everything from the first `+` onwards is dropped
///    from the localpart and every `.` is removed from what is left. By
///    Google's documented routing rules `j.smith@gmail.com`,
///    `js.mith@gmail.com` and `jsmith+work@gmail.com` all deliver to the
///    same mailbox as `jsmith@gmail.com`. If nothing remains of the
///    localpart the result is `None`. The domain itself is kept as given.
///
/// The function is **deterministic** and **idempotent** on successful
/// outputs.
///
/// # Examples
///
/// Case and whitespace:
///
/// ```
/// use worker_matcher::Normalizer;
/// assert_eq!(
///     Normalizer::normalize_email(" Alice@Example.ORG ", false),
///     Some("alice@example.org".to_string()),
/// );
/// ```
///
/// Malformed inputs return `None`:
///
/// ```
/// # use worker_matcher::Normalizer;
/// assert_eq!(Normalizer::normalize_email("no-at-sign", false), None);
/// assert_eq!(Normalizer::normalize_email("@example.org", false), None);
/// assert_eq!(Normalizer::normalize_email("alice@", false), None);
/// assert_eq!(Normalizer::normalize_email("a@b@c", false), None);
/// assert_eq!(Normalizer::normalize_email("", false), None);
/// ```
///
/// Optional Gmail dot-folding:
///
/// ```
/// # use worker_matcher::Normalizer;
/// assert_eq!(
///     Normalizer::normalize_email("j.smith@gmail.com", true),
///     Some("jsmith@gmail.com".to_string()),
/// );
/// assert_eq!(
///     Normalizer::normalize_email("jsmith+work@googlemail.com", true),
///     Some("jsmith@googlemail.com".to_string()),
/// );
/// // Dot-folding does not touch non-Gmail addresses.
/// assert_eq!(
///     Normalizer::normalize_email("j.smith@example.org", true),
///     Some("j.smith@example.org".to_string()),
/// );
/// ```
///
/// Canonical output is a fixed point:
///
/// ```
/// # use worker_matcher::Normalizer;
/// let once = Normalizer::normalize_email("Alice@Example.ORG", false).unwrap();
/// let twice = Normalizer::normalize_email(&once, false).unwrap();
/// assert_eq!(once, twice);
/// ```
pub fn normalize_email(email: &str, gmail_dot_folding: bool) -> Option<String> {
    let canonical = email.trim().to_lowercase();

    // Exactly two non-empty pieces around a single '@'; a third piece means
    // the address contained more than one '@'.
    let mut pieces = canonical.split('@');
    let (local, domain) = match (pieces.next(), pieces.next(), pieces.next()) {
        (Some(local), Some(domain), None) if !local.is_empty() && !domain.is_empty() => {
            (local, domain)
        }
        _ => return None,
    };

    let is_gmail = matches!(domain, "gmail.com" | "googlemail.com");
    if !(gmail_dot_folding && is_gmail) {
        return Some(format!("{local}@{domain}"));
    }

    // Gmail ignores a "+tag" suffix and any dots in the localpart.
    let untagged = local.split('+').next().unwrap_or(local);
    let mailbox = untagged.replace('.', "");
    if mailbox.is_empty() {
        None
    } else {
        Some(format!("{mailbox}@{domain}"))
    }
}
674}
675
/// Structured decomposition of a postal-address line.
///
/// Produced by [`Normalizer::parse_address_line`]. The struct derives
/// `Serialize` and `Deserialize` so it can be embedded in downstream data
/// models and written to or read from serde formats such as JSON.
///
/// All three fields are best-effort: parsing is format-only and consults
/// no postal reference. Inputs that don't follow the
/// `(unit, house_number, street)` shape degrade gracefully, with the
/// missing pieces returned as `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParsedAddressLine {
    /// Leading house / building number, including an optional single
    /// alphabetic suffix (`"10A"`), uppercased. `None` when no leading
    /// digit is present.
    pub house_number: Option<String>,
    /// Sub-unit prefix and identifier, lowercased and space-joined
    /// (e.g. `"flat 2a"`, `"apt 5"`, `"suite 12"`). `None` when no
    /// recognised prefix is present.
    pub unit: Option<String>,
    /// Remaining street portion, normalised via
    /// [`Normalizer::normalize_address_line`]. Always present; it is the
    /// empty string when nothing remains after the unit and house number.
    pub street: String,
}
700
/// Token-level expansion table used by [`Normalizer::expand_street_abbreviations`].
///
/// Each entry is `(abbreviation, long form)`. Abbreviations are stored in
/// lowercase ASCII and are matched case-insensitively against a token after
/// its trailing `.` and `,` characters have been stripped. The replacement
/// is always lowercase so the downstream name-normalisation pipeline is a
/// no-op for these tokens.
///
/// No long form is itself an abbreviation key, so an expanded token is
/// never expanded a second time.
const STREET_ABBREVIATIONS: &[(&str, &str)] = &[
    // Street types.
    ("st", "street"),
    ("str", "street"),
    ("rd", "road"),
    ("ave", "avenue"),
    ("av", "avenue"),
    ("blvd", "boulevard"),
    ("bvd", "boulevard"),
    ("ln", "lane"),
    ("dr", "drive"),
    ("ct", "court"),
    ("pl", "place"),
    ("sq", "square"),
    ("ter", "terrace"),
    ("terr", "terrace"),
    ("hwy", "highway"),
    ("pkwy", "parkway"),
    ("mt", "mount"),
    ("mtn", "mountain"),
    ("cres", "crescent"),
    ("gdns", "gardens"),
    ("gdn", "garden"),
    ("gr", "grove"),
    ("cl", "close"),
    ("pk", "park"),
    ("plz", "plaza"),
    ("expy", "expressway"),
    ("trl", "trail"),
    // Compass directionals.
    ("n", "north"),
    ("s", "south"),
    ("e", "east"),
    ("w", "west"),
    ("ne", "northeast"),
    ("nw", "northwest"),
    ("se", "southeast"),
    ("sw", "southwest"),
];
743
/// Recognised sub-unit prefix keywords for [`Normalizer::parse_address_line`].
///
/// Entries are lowercase ASCII; `extract_unit_prefix` lowercases the
/// candidate keyword before comparing it with this list, so matching is
/// case-insensitive. The matched entry becomes the first word of
/// `ParsedAddressLine::unit`.
const UNIT_PREFIXES: &[&str] = &[
    "flat",
    "apartment",
    "apt",
    "unit",
    "suite",
    "ste",
    "room",
    "rm",
];
755
756/// Expand a single whitespace-separated token if it appears in
757/// [`STREET_ABBREVIATIONS`].
758///
759/// The token is matched after stripping at most one trailing `.` or `,`;
760/// the comparison is ASCII case-insensitive. Tokens that contain non-ASCII
761/// characters short-circuit to the original input unchanged.
762fn expand_one_token(tok: &str) -> String {
763 let stripped = tok.trim_end_matches(['.', ',']);
764 if !stripped.is_ascii() {
765 return tok.to_string();
766 }
767 let lower = stripped.to_ascii_lowercase();
768 for (abbrev, full) in STREET_ABBREVIATIONS {
769 if lower == *abbrev {
770 return (*full).to_string();
771 }
772 }
773 tok.to_string()
774}
775
776/// Extract a recognised unit prefix and its identifier from the start of `s`.
777///
778/// Returns `(Some("flat 2a"), rest)` when the input begins with a
779/// recognised keyword followed by an alphanumeric identifier; otherwise
780/// `(None, s)` unchanged.
781fn extract_unit_prefix(s: &str) -> (Option<String>, &str) {
782 let trimmed = s.trim_start();
783 // Find the first whitespace; everything before is the candidate keyword.
784 let kw_end = trimmed
785 .find(|c: char| c.is_whitespace())
786 .unwrap_or(trimmed.len());
787 if kw_end == 0 {
788 return (None, s);
789 }
790 let kw_raw = &trimmed[..kw_end];
791 let kw_stripped = kw_raw.trim_end_matches(['.', ',']);
792 if !kw_stripped.is_ascii() {
793 return (None, s);
794 }
795 let kw_lower = kw_stripped.to_ascii_lowercase();
796 if !UNIT_PREFIXES.iter().any(|p| *p == kw_lower) {
797 return (None, s);
798 }
799 // Skip whitespace and `#` after the keyword.
800 let after_kw = trimmed[kw_end..].trim_start_matches([' ', '\t', '#']);
801 // Read alphanumerics as the identifier.
802 let id_end = after_kw
803 .find(|c: char| !c.is_ascii_alphanumeric())
804 .unwrap_or(after_kw.len());
805 if id_end == 0 {
806 return (None, s);
807 }
808 let id = &after_kw[..id_end];
809 let rest = &after_kw[id_end..];
810 let unit = format!("{} {}", kw_lower, id.to_ascii_lowercase());
811 (Some(unit), rest)
812}
813
/// Extract a leading house number (digits + optional single alphabetic
/// suffix) from the start of `s`.
///
/// Leading whitespace is skipped. The number is the longest run of ASCII
/// digits; one ASCII letter directly after the digits is included as a
/// suffix only when it is *not* followed by another letter or digit, so the
/// first letter of an unspaced street name is never swallowed. That
/// look-ahead is Unicode-aware: `"10aéroport"` yields `"10"`, not `"10A"`.
///
/// Returns the number uppercased plus the text that follows it, or
/// `(None, s)` with `s` untouched when `s` does not start with a digit.
///
/// `"10 Downing Street"` → `(Some("10"), " Downing Street")`.
/// `"10A High St"` → `(Some("10A"), " High St")`.
/// `"Buckingham Palace"` → `(None, "Buckingham Palace")`.
fn extract_house_number(s: &str) -> (Option<String>, &str) {
    let trimmed = s.trim_start();

    // ASCII digits are one byte each, so this offset is a char boundary.
    let digits_end = trimmed
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(trimmed.len());
    if digits_end == 0 {
        return (None, s);
    }

    // Look at the two characters after the digits: a lone ASCII letter is a
    // suffix ("10A"); a letter that starts a longer word is not ("10Apple").
    // The second character is tested with the Unicode-aware
    // `char::is_alphanumeric`, so accented letters also count as a word.
    let mut tail = trimmed[digits_end..].chars();
    let suffix_len = match (tail.next(), tail.next()) {
        (Some(letter), following)
            if letter.is_ascii_alphabetic()
                && !following.is_some_and(char::is_alphanumeric) =>
        {
            letter.len_utf8()
        }
        _ => 0,
    };

    let end = digits_end + suffix_len;
    (Some(trimmed[..end].to_ascii_uppercase()), &trimmed[end..])
}
850
/// Per-country phone metadata for [`Normalizer::normalize_phone_e164`].
///
/// `min_nsn` / `max_nsn` bound the **national-significant number** length —
/// the digits after the dial code, with the national trunk prefix removed.
/// Both bounds are inclusive: a number is rejected only when its NSN is
/// shorter than `min_nsn` or longer than `max_nsn`.
/// `trunk_prefix` is the digit string used for national dialling (`"0"` for
/// most of Europe and Asia, `"8"` for Lithuania, `None` for NANP / Spain /
/// Portugal and several others). When set, a single occurrence of the
/// string at the start of the national number is stripped before
/// canonicalisation.
// NOTE(review): the stripping itself is done by `strip_trunk_prefix`, which
// is outside this excerpt — confirm it removes only one occurrence.
struct CountryPhoneInfo {
    /// ISO 3166-1 alpha-2 country code, uppercase.
    iso_alpha2: &'static str,
    /// International dialling code, no leading `+`. Emitted verbatim after
    /// the `+` in the canonical output.
    dial_code: &'static str,
    /// National trunk prefix digit(s), if any.
    trunk_prefix: Option<&'static str>,
    /// Minimum national-significant-number length (inclusive).
    min_nsn: usize,
    /// Maximum national-significant-number length (inclusive).
    max_nsn: usize,
}
872
873/// Phone-numbering metadata for countries supported by
874/// [`Normalizer::normalize_phone_e164`].
875///
876/// Coverage: all five jurisdictions for which the crate exposes a national
877/// healthcare identifier (GB England/Wales/IoM, FR, ES, IE, plus UK NI via
878/// the GB dial code), plus the most common worker-mobility partners. New
879/// entries SHOULD follow the ISO 3166-1 alpha-2 convention and document the
880/// trunk-prefix rule explicitly.
881const COUNTRY_PHONE_TABLE: &[CountryPhoneInfo] = &[
882 CountryPhoneInfo {
883 iso_alpha2: "GB",
884 dial_code: "44",
885 trunk_prefix: Some("0"),
886 min_nsn: 7,
887 max_nsn: 11,
888 },
889 CountryPhoneInfo {
890 iso_alpha2: "FR",
891 dial_code: "33",
892 trunk_prefix: Some("0"),
893 min_nsn: 9,
894 max_nsn: 9,
895 },
896 CountryPhoneInfo {
897 iso_alpha2: "DE",
898 dial_code: "49",
899 trunk_prefix: Some("0"),
900 min_nsn: 7,
901 max_nsn: 13,
902 },
903 CountryPhoneInfo {
904 iso_alpha2: "ES",
905 dial_code: "34",
906 trunk_prefix: None,
907 min_nsn: 9,
908 max_nsn: 9,
909 },
910 CountryPhoneInfo {
911 iso_alpha2: "IE",
912 dial_code: "353",
913 trunk_prefix: Some("0"),
914 min_nsn: 7,
915 max_nsn: 11,
916 },
917 CountryPhoneInfo {
918 iso_alpha2: "IT",
919 dial_code: "39",
920 trunk_prefix: None,
921 min_nsn: 6,
922 max_nsn: 12,
923 },
924 CountryPhoneInfo {
925 iso_alpha2: "NL",
926 dial_code: "31",
927 trunk_prefix: Some("0"),
928 min_nsn: 9,
929 max_nsn: 9,
930 },
931 CountryPhoneInfo {
932 iso_alpha2: "BE",
933 dial_code: "32",
934 trunk_prefix: Some("0"),
935 min_nsn: 8,
936 max_nsn: 9,
937 },
938 CountryPhoneInfo {
939 iso_alpha2: "PT",
940 dial_code: "351",
941 trunk_prefix: None,
942 min_nsn: 9,
943 max_nsn: 9,
944 },
945 CountryPhoneInfo {
946 iso_alpha2: "CH",
947 dial_code: "41",
948 trunk_prefix: Some("0"),
949 min_nsn: 9,
950 max_nsn: 9,
951 },
952 CountryPhoneInfo {
953 iso_alpha2: "AT",
954 dial_code: "43",
955 trunk_prefix: Some("0"),
956 min_nsn: 4,
957 max_nsn: 13,
958 },
959 CountryPhoneInfo {
960 iso_alpha2: "SE",
961 dial_code: "46",
962 trunk_prefix: Some("0"),
963 min_nsn: 7,
964 max_nsn: 13,
965 },
966 CountryPhoneInfo {
967 iso_alpha2: "NO",
968 dial_code: "47",
969 trunk_prefix: None,
970 min_nsn: 8,
971 max_nsn: 8,
972 },
973 CountryPhoneInfo {
974 iso_alpha2: "DK",
975 dial_code: "45",
976 trunk_prefix: None,
977 min_nsn: 8,
978 max_nsn: 8,
979 },
980 CountryPhoneInfo {
981 iso_alpha2: "FI",
982 dial_code: "358",
983 trunk_prefix: Some("0"),
984 min_nsn: 5,
985 max_nsn: 12,
986 },
987 CountryPhoneInfo {
988 iso_alpha2: "PL",
989 dial_code: "48",
990 trunk_prefix: None,
991 min_nsn: 9,
992 max_nsn: 9,
993 },
994 CountryPhoneInfo {
995 iso_alpha2: "AU",
996 dial_code: "61",
997 trunk_prefix: Some("0"),
998 min_nsn: 9,
999 max_nsn: 9,
1000 },
1001 CountryPhoneInfo {
1002 iso_alpha2: "NZ",
1003 dial_code: "64",
1004 trunk_prefix: Some("0"),
1005 min_nsn: 8,
1006 max_nsn: 10,
1007 },
1008 CountryPhoneInfo {
1009 iso_alpha2: "US",
1010 dial_code: "1",
1011 trunk_prefix: None,
1012 min_nsn: 10,
1013 max_nsn: 10,
1014 },
1015 CountryPhoneInfo {
1016 iso_alpha2: "CA",
1017 dial_code: "1",
1018 trunk_prefix: None,
1019 min_nsn: 10,
1020 max_nsn: 10,
1021 },
1022 CountryPhoneInfo {
1023 iso_alpha2: "JP",
1024 dial_code: "81",
1025 trunk_prefix: Some("0"),
1026 min_nsn: 9,
1027 max_nsn: 10,
1028 },
1029 CountryPhoneInfo {
1030 iso_alpha2: "CN",
1031 dial_code: "86",
1032 trunk_prefix: Some("0"),
1033 min_nsn: 5,
1034 max_nsn: 12,
1035 },
1036 CountryPhoneInfo {
1037 iso_alpha2: "IN",
1038 dial_code: "91",
1039 trunk_prefix: Some("0"),
1040 min_nsn: 10,
1041 max_nsn: 10,
1042 },
1043 CountryPhoneInfo {
1044 iso_alpha2: "BR",
1045 dial_code: "55",
1046 trunk_prefix: Some("0"),
1047 min_nsn: 10,
1048 max_nsn: 11,
1049 },
1050 CountryPhoneInfo {
1051 iso_alpha2: "MX",
1052 dial_code: "52",
1053 trunk_prefix: None,
1054 min_nsn: 10,
1055 max_nsn: 10,
1056 },
1057 CountryPhoneInfo {
1058 iso_alpha2: "ZA",
1059 dial_code: "27",
1060 trunk_prefix: Some("0"),
1061 min_nsn: 9,
1062 max_nsn: 9,
1063 },
1064 // ---- T-19: coverage of remaining 35-scheme identifier jurisdictions ----
1065 CountryPhoneInfo {
1066 iso_alpha2: "BG",
1067 dial_code: "359",
1068 trunk_prefix: Some("0"),
1069 min_nsn: 8,
1070 max_nsn: 9,
1071 },
1072 CountryPhoneInfo {
1073 iso_alpha2: "CZ",
1074 dial_code: "420",
1075 trunk_prefix: None,
1076 min_nsn: 9,
1077 max_nsn: 9,
1078 },
1079 CountryPhoneInfo {
1080 iso_alpha2: "EE",
1081 dial_code: "372",
1082 trunk_prefix: None,
1083 min_nsn: 7,
1084 max_nsn: 8,
1085 },
1086 CountryPhoneInfo {
1087 iso_alpha2: "GR",
1088 dial_code: "30",
1089 trunk_prefix: None,
1090 min_nsn: 10,
1091 max_nsn: 10,
1092 },
1093 CountryPhoneInfo {
1094 iso_alpha2: "HR",
1095 dial_code: "385",
1096 trunk_prefix: Some("0"),
1097 min_nsn: 8,
1098 max_nsn: 9,
1099 },
1100 CountryPhoneInfo {
1101 iso_alpha2: "IS",
1102 dial_code: "354",
1103 trunk_prefix: None,
1104 min_nsn: 7,
1105 max_nsn: 9,
1106 },
1107 CountryPhoneInfo {
1108 iso_alpha2: "LI",
1109 dial_code: "423",
1110 trunk_prefix: None,
1111 min_nsn: 7,
1112 max_nsn: 9,
1113 },
1114 // Lithuania uses `8` (not `0`) as the national trunk prefix.
1115 CountryPhoneInfo {
1116 iso_alpha2: "LT",
1117 dial_code: "370",
1118 trunk_prefix: Some("8"),
1119 min_nsn: 8,
1120 max_nsn: 8,
1121 },
1122 CountryPhoneInfo {
1123 iso_alpha2: "LV",
1124 dial_code: "371",
1125 trunk_prefix: None,
1126 min_nsn: 8,
1127 max_nsn: 8,
1128 },
1129 CountryPhoneInfo {
1130 iso_alpha2: "MT",
1131 dial_code: "356",
1132 trunk_prefix: None,
1133 min_nsn: 8,
1134 max_nsn: 8,
1135 },
1136 CountryPhoneInfo {
1137 iso_alpha2: "RO",
1138 dial_code: "40",
1139 trunk_prefix: Some("0"),
1140 min_nsn: 9,
1141 max_nsn: 9,
1142 },
1143 CountryPhoneInfo {
1144 iso_alpha2: "SI",
1145 dial_code: "386",
1146 trunk_prefix: Some("0"),
1147 min_nsn: 8,
1148 max_nsn: 8,
1149 },
1150 CountryPhoneInfo {
1151 iso_alpha2: "SK",
1152 dial_code: "421",
1153 trunk_prefix: Some("0"),
1154 min_nsn: 9,
1155 max_nsn: 9,
1156 },
1157];
1158
1159/// Look up a country by ISO 3166-1 alpha-2 code (case-insensitive).
1160///
1161/// Returns the first match in [`COUNTRY_PHONE_TABLE`]. For NANP countries
1162/// (US/CA) which share dial code `1`, this disambiguates by the caller's
1163/// chosen default; the canonical E.164 output is identical for both.
1164fn lookup_by_iso(iso: &str) -> Option<&'static CountryPhoneInfo> {
1165 if !iso.is_ascii() {
1166 return None;
1167 }
1168 let upper = iso.to_ascii_uppercase();
1169 COUNTRY_PHONE_TABLE.iter().find(|c| c.iso_alpha2 == upper)
1170}
1171
1172/// Match the longest known dial-code prefix at the start of `digits`.
1173///
1174/// Tries 3-, 2-, then 1-digit prefixes to honour the country table's most
1175/// specific entry. For NANP (dial code `1`) the first matching entry — US —
1176/// is returned; the canonical E.164 form is the same whether the caller
1177/// later interprets the country as US or CA.
1178fn lookup_by_dial_code_prefix(digits: &str) -> Option<&'static CountryPhoneInfo> {
1179 for len in [3usize, 2, 1] {
1180 if digits.len() >= len {
1181 let prefix = &digits[..len];
1182 if let Some(info) = COUNTRY_PHONE_TABLE.iter().find(|c| c.dial_code == prefix) {
1183 return Some(info);
1184 }
1185 }
1186 }
1187 None
1188}
1189
1190/// Strip a single occurrence of the country's national trunk prefix
1191/// from `nsn` if one is configured and present.
1192fn strip_trunk_prefix<'a>(info: &CountryPhoneInfo, nsn: &'a str) -> &'a str {
1193 if let Some(tp) = info.trunk_prefix
1194 && let Some(rest) = nsn.strip_prefix(tp)
1195 && !rest.is_empty()
1196 {
1197 rest
1198 } else {
1199 nsn
1200 }
1201}
1202
#[cfg(test)]
mod tests {
    //! Unit tests for the normaliser. Sections follow the order of the
    //! public API; the final section covers the private helpers and the
    //! invariants of the hand-maintained country phone table.

    use super::*;

    // ---------- normalize_name ----------

    #[test]
    fn normalize_name_collapses_whitespace_and_trims() {
        // Runs of whitespace (leading, inner, trailing) are what this test
        // is about, so the input deliberately contains several in a row.
        assert_eq!(
            Normalizer::normalize_name("  John   Smith  "),
            "john smith"
        );
        assert_eq!(Normalizer::normalize_name(" John Smith "), "john smith");
    }

    #[test]
    fn normalize_name_strips_ascii_punctuation() {
        assert_eq!(Normalizer::normalize_name("O'Brien"), "obrien");
        assert_eq!(Normalizer::normalize_name("Mary-Jane"), "maryjane");
        assert_eq!(Normalizer::normalize_name("Dr. Who"), "dr who");
    }

    #[test]
    fn normalize_name_strips_diacritics() {
        assert_eq!(Normalizer::normalize_name("José"), "jose");
        assert_eq!(Normalizer::normalize_name("Siân"), "sian");
        // Diaeresis and grave accent are removed like any other combining mark.
        assert_eq!(Normalizer::normalize_name("naïve"), "naive");
        assert_eq!(Normalizer::normalize_name("crème"), "creme");
        // ŷ and ŵ decompose; ł has no NFKD decomposition and survives lowercased.
        assert_eq!(Normalizer::normalize_name("Łŷŵ"), "ływ");
    }

    #[test]
    fn normalize_name_handles_empty_and_whitespace() {
        assert_eq!(Normalizer::normalize_name(""), "");
        assert_eq!(Normalizer::normalize_name(" "), "");
        assert_eq!(Normalizer::normalize_name("\t\n"), "");
    }

    #[test]
    fn normalize_name_lowercases() {
        assert_eq!(Normalizer::normalize_name("MARY"), "mary");
        assert_eq!(Normalizer::normalize_name("McDONALD"), "mcdonald");
    }

    #[test]
    fn normalize_name_is_idempotent() {
        for input in [
            " John Smith ",
            "O'Brien-Jones",
            "JOSÉ MARÍA",
            "",
            " ",
            "Siân",
        ] {
            let once = Normalizer::normalize_name(input);
            let twice = Normalizer::normalize_name(&once);
            assert_eq!(once, twice, "not idempotent for {input:?}");
        }
    }

    #[test]
    fn normalize_name_does_not_normalise_unicode_punctuation() {
        // Curly apostrophe (U+2019) is intentionally not stripped.
        // This is documented in AGENTS/normalization.md as a known limitation.
        let with_curly = Normalizer::normalize_name("O\u{2019}Brien");
        assert!(with_curly.contains('\u{2019}'));
    }

    // ---------- normalize_postcode ----------

    #[test]
    fn normalize_postcode_uppercases() {
        assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
    }

    #[test]
    fn normalize_postcode_strips_all_whitespace() {
        assert_eq!(Normalizer::normalize_postcode("CF10 1AA"), "CF101AA");
        assert_eq!(Normalizer::normalize_postcode(" CF10 1AA "), "CF101AA");
        assert_eq!(Normalizer::normalize_postcode("CF10\t1AA"), "CF101AA");
    }

    #[test]
    fn normalize_postcode_handles_empty() {
        assert_eq!(Normalizer::normalize_postcode(""), "");
        assert_eq!(Normalizer::normalize_postcode(" "), "");
    }

    #[test]
    fn normalize_postcode_is_idempotent() {
        for input in ["cf10 1aa", "SW1A 2AA", " EH8 9YL ", ""] {
            let once = Normalizer::normalize_postcode(input);
            let twice = Normalizer::normalize_postcode(&once);
            assert_eq!(once, twice);
        }
    }

    // ---------- normalize_phone ----------

    #[test]
    fn normalize_phone_strips_uk_trunk_prefix() {
        assert_eq!(Normalizer::normalize_phone("07700 900123"), "7700900123");
    }

    #[test]
    fn normalize_phone_strips_plus_44_international() {
        assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
    }

    #[test]
    fn normalize_phone_strips_0044_international() {
        assert_eq!(
            Normalizer::normalize_phone("0044 7700 900123"),
            "7700900123"
        );
    }

    #[test]
    fn normalize_phone_handles_brackets_and_spaces() {
        assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
    }

    #[test]
    fn normalize_phone_handles_empty() {
        assert_eq!(Normalizer::normalize_phone(""), "");
        assert_eq!(Normalizer::normalize_phone("---"), "");
    }

    #[test]
    fn normalize_phone_does_not_strip_44_if_too_short() {
        // 44 followed by fewer than 10 more digits: keep the 44 (not international prefix).
        assert_eq!(Normalizer::normalize_phone("4412345"), "4412345");
    }

    #[test]
    fn normalize_phone_is_idempotent() {
        for input in [
            "07700 900123",
            "+44 7700 900123",
            "0044 7700 900123",
            "(029) 2034 5678",
            "",
        ] {
            let once = Normalizer::normalize_phone(input);
            let twice = Normalizer::normalize_phone(&once);
            assert_eq!(once, twice, "not idempotent for {input:?}");
        }
    }

    #[test]
    fn normalize_phone_keeps_lone_zero() {
        // A bare "0" is not stripped (guard: len > 1).
        assert_eq!(Normalizer::normalize_phone("0"), "0");
    }

    // ---------- phonetic_code ----------

    #[test]
    fn phonetic_code_groups_smith_and_smyth() {
        assert_eq!(
            Normalizer::phonetic_code("Smith"),
            Normalizer::phonetic_code("Smyth")
        );
    }

    #[test]
    fn phonetic_code_groups_stephen_and_steven() {
        assert_eq!(
            Normalizer::phonetic_code("Stephen"),
            Normalizer::phonetic_code("Steven")
        );
    }

    #[test]
    fn phonetic_code_distinguishes_different_families() {
        assert_ne!(
            Normalizer::phonetic_code("Jones"),
            Normalizer::phonetic_code("Smith")
        );
        assert_ne!(
            Normalizer::phonetic_code("Anderson"),
            Normalizer::phonetic_code("Zimmerman")
        );
    }

    #[test]
    fn phonetic_code_specific_values() {
        // Pinned values from the underlying soundex crate; act as a regression net.
        assert_eq!(Normalizer::phonetic_code("Smith"), "S530");
        assert_eq!(Normalizer::phonetic_code("Smyth"), "S530");
        assert_eq!(Normalizer::phonetic_code("Jones"), "J520");
        assert_eq!(Normalizer::phonetic_code("Johnson"), "J525");
    }

    #[test]
    fn phonetic_code_handles_empty() {
        assert_eq!(Normalizer::phonetic_code(""), "");
        assert_eq!(Normalizer::phonetic_code(" "), "");
    }

    #[test]
    fn phonetic_code_is_case_insensitive() {
        assert_eq!(
            Normalizer::phonetic_code("SMITH"),
            Normalizer::phonetic_code("smith")
        );
    }

    // ---------- normalize_phone_e164 ----------

    #[test]
    fn e164_uk_layouts_canonicalise_identically() {
        let canonical = Some("+447700900123".to_string());
        assert_eq!(
            Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")),
            canonical,
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("0044 7700 900123", Some("GB")),
            canonical,
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("07700 900123", Some("GB")),
            canonical,
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("(07700) 900-123", Some("GB")),
            canonical,
        );
    }

    #[test]
    fn e164_french_layouts_canonicalise_identically() {
        let canonical = Some("+33123456789".to_string());
        assert_eq!(
            Normalizer::normalize_phone_e164("+33 1 23 45 67 89", Some("FR")),
            canonical,
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("0033 1 23 45 67 89", Some("FR")),
            canonical,
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("01 23 45 67 89", Some("FR")),
            canonical,
        );
    }

    #[test]
    fn e164_spain_has_no_national_trunk_prefix() {
        // Spain switched to no trunk-0 in 1998; a bare 9-digit national
        // number is the canonical form.
        assert_eq!(
            Normalizer::normalize_phone_e164("912 345 678", Some("ES")),
            Some("+34912345678".to_string()),
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("+34 912 345 678", None),
            Some("+34912345678".to_string()),
        );
    }

    #[test]
    fn e164_ireland_three_digit_dial_code() {
        assert_eq!(
            Normalizer::normalize_phone_e164("+353 1 234 5678", None),
            Some("+35312345678".to_string()),
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("01 234 5678", Some("IE")),
            Some("+35312345678".to_string()),
        );
    }

    #[test]
    fn e164_nanp_handles_us_and_canada() {
        assert_eq!(
            Normalizer::normalize_phone_e164("(415) 555-1234", Some("US")),
            Some("+14155551234".to_string()),
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("+1 415 555 1234", None),
            Some("+14155551234".to_string()),
        );
        // Canada uses the same dial code; canonical form is identical.
        assert_eq!(
            Normalizer::normalize_phone_e164("(416) 555-1234", Some("CA")),
            Some("+14165551234".to_string()),
        );
    }

    // ---------- T-19: 35-scheme jurisdiction coverage ----------

    #[test]
    fn e164_lithuania_uses_eight_as_trunk_prefix() {
        // Lithuania's national trunk prefix is `8`, not `0`. National
        // dialling form `8 612 34567` (mobile) canonicalises to the same
        // E.164 string as the explicit `+370 612 34567` form.
        assert_eq!(
            Normalizer::normalize_phone_e164("8 612 34567", Some("LT")),
            Some("+37061234567".to_string()),
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("+370 612 34567", None),
            Some("+37061234567".to_string()),
        );
    }

    #[test]
    fn e164_greece_has_no_national_trunk_prefix() {
        // GR national-significant numbers begin with the area code (the
        // leading zero seen in older publications is no longer a trunk).
        assert_eq!(
            Normalizer::normalize_phone_e164("+30 210 123 4567", None),
            Some("+302101234567".to_string()),
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("210 123 4567", Some("GR")),
            Some("+302101234567".to_string()),
        );
    }

    #[test]
    fn e164_romania_strips_trunk_zero() {
        assert_eq!(
            Normalizer::normalize_phone_e164("0721 234 567", Some("RO")),
            Some("+40721234567".to_string()),
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("+40 721 234 567", None),
            Some("+40721234567".to_string()),
        );
    }

    #[test]
    fn e164_czech_no_trunk_prefix() {
        assert_eq!(
            Normalizer::normalize_phone_e164("+420 234 567 890", None),
            Some("+420234567890".to_string()),
        );
        assert_eq!(
            Normalizer::normalize_phone_e164("234 567 890", Some("CZ")),
            Some("+420234567890".to_string()),
        );
    }

    #[test]
    fn e164_iceland_seven_digit_nsn() {
        assert_eq!(
            Normalizer::normalize_phone_e164("+354 412 3456", None),
            Some("+3544123456".to_string()),
        );
    }

    #[test]
    fn e164_distinguishes_overlapping_three_digit_dial_codes() {
        // Croatia (385) vs Slovenia (386): adjacent dial codes, both
        // use a `0` trunk, but the canonical E.164 forms remain distinct.
        let hr = Normalizer::normalize_phone_e164("+385 91 234 5678", None);
        let si = Normalizer::normalize_phone_e164("+386 41 234 567", None);
        assert!(hr.is_some());
        assert!(si.is_some());
        assert_ne!(hr, si);
    }

    #[test]
    fn e164_distinguishes_countries_with_overlapping_national_digits() {
        // Near-identical national-format digits interpreted under two
        // different default countries must yield different E.164 strings.
        // The French input is one digit shorter because the table fixes the
        // French national-significant number at exactly nine digits.
        let uk = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
        let fr = Normalizer::normalize_phone_e164("07700 90012", Some("FR"));
        assert!(uk.is_some());
        assert!(fr.is_some());
        assert_ne!(uk, fr);
    }

    #[test]
    fn e164_returns_none_when_default_country_missing_and_no_marker() {
        // Ambiguous national-format input with no default country.
        assert_eq!(Normalizer::normalize_phone_e164("07700 900123", None), None);
    }

    #[test]
    fn e164_returns_none_for_unknown_dial_code() {
        assert_eq!(Normalizer::normalize_phone_e164("+999 1234567", None), None);
    }

    #[test]
    fn e164_returns_none_for_empty_or_punctuation_only() {
        assert_eq!(Normalizer::normalize_phone_e164("", Some("GB")), None);
        assert_eq!(Normalizer::normalize_phone_e164("+", Some("GB")), None);
        assert_eq!(Normalizer::normalize_phone_e164("()-", Some("GB")), None);
    }

    #[test]
    fn e164_returns_none_for_too_short_or_too_long_nsn() {
        // GB NSN must be 7..=11 digits.
        assert_eq!(Normalizer::normalize_phone_e164("+44 1", None), None);
        assert_eq!(
            Normalizer::normalize_phone_e164("+44 123456789012345", None),
            None,
        );
    }

    #[test]
    fn e164_rejects_unknown_default_country() {
        // "XX" is not in the table; without an explicit international
        // marker the function cannot guess.
        assert_eq!(
            Normalizer::normalize_phone_e164("07700 900123", Some("XX")),
            None,
        );
    }

    #[test]
    fn e164_is_idempotent_on_canonical_form() {
        for input in [
            "+44 7700 900123",
            "+33 1 23 45 67 89",
            "(415) 555-1234",
            "+353 1 234 5678",
            "+34 912 345 678",
        ] {
            let once = Normalizer::normalize_phone_e164(input, Some("GB")).expect("parses");
            let twice = Normalizer::normalize_phone_e164(&once, Some("GB")).expect("idempotent");
            assert_eq!(once, twice, "not idempotent for {input:?}");
        }
    }

    #[test]
    fn e164_default_country_lookup_is_case_insensitive() {
        let lower = Normalizer::normalize_phone_e164("07700 900123", Some("gb"));
        let upper = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
        assert_eq!(lower, upper);
        assert!(lower.is_some());
    }

    #[test]
    fn e164_handles_double_zero_international_access_form() {
        assert_eq!(
            Normalizer::normalize_phone_e164("00 33 1 23 45 67 89", None),
            Some("+33123456789".to_string()),
        );
    }

    #[test]
    fn e164_strips_trunk_zero_after_country_code() {
        // Some entry systems mistakenly keep the national trunk 0 after
        // the country code (e.g. "+44 0 7700 900123"). The normaliser
        // tolerates this.
        assert_eq!(
            Normalizer::normalize_phone_e164("+44 0 7700 900123", None),
            Some("+447700900123".to_string()),
        );
    }

    // ---------- expand_street_abbreviations ----------

    #[test]
    fn expand_street_replaces_common_abbreviations() {
        assert_eq!(
            Normalizer::expand_street_abbreviations("123 High St"),
            "123 High street",
        );
        assert_eq!(
            Normalizer::expand_street_abbreviations("10 Downing Rd"),
            "10 Downing road",
        );
        assert_eq!(
            Normalizer::expand_street_abbreviations("12 Sunset Blvd"),
            "12 Sunset boulevard",
        );
        assert_eq!(
            Normalizer::expand_street_abbreviations("1 Park Ave"),
            "1 Park avenue",
        );
        assert_eq!(
            Normalizer::expand_street_abbreviations("5 Cherry Ln"),
            "5 Cherry lane",
        );
    }

    #[test]
    fn expand_street_replaces_directionals() {
        assert_eq!(
            Normalizer::expand_street_abbreviations("45 N Park Ave"),
            "45 north Park avenue",
        );
        assert_eq!(
            Normalizer::expand_street_abbreviations("100 SW 5th St"),
            "100 southwest 5th street",
        );
    }

    #[test]
    fn expand_street_strips_trailing_period_or_comma() {
        assert_eq!(
            Normalizer::expand_street_abbreviations("123 High St."),
            "123 High street",
        );
        assert_eq!(
            Normalizer::expand_street_abbreviations("12 Sunset Blvd,"),
            "12 Sunset boulevard",
        );
    }

    #[test]
    fn expand_street_passes_unknown_tokens_through() {
        assert_eq!(
            Normalizer::expand_street_abbreviations("Buckingham Palace"),
            "Buckingham Palace",
        );
    }

    #[test]
    fn expand_street_is_idempotent_on_already_expanded_input() {
        for input in [
            "123 High St",
            "45 N Park Ave",
            "10 Downing Rd",
            "Buckingham Palace",
        ] {
            let once = Normalizer::expand_street_abbreviations(input);
            let twice = Normalizer::expand_street_abbreviations(&once);
            assert_eq!(once, twice, "not idempotent for {input:?}");
        }
    }

    #[test]
    fn expand_street_handles_empty_and_whitespace_only() {
        assert_eq!(Normalizer::expand_street_abbreviations(""), "");
        assert_eq!(Normalizer::expand_street_abbreviations(" "), "");
    }

    // ---------- normalize_address_line ----------

    #[test]
    fn normalize_address_line_unifies_abbreviated_and_full_forms() {
        assert_eq!(
            Normalizer::normalize_address_line("123 High St"),
            Normalizer::normalize_address_line("123 High Street"),
        );
        assert_eq!(
            Normalizer::normalize_address_line("45 N Park Ave"),
            Normalizer::normalize_address_line("45 North Park Avenue"),
        );
    }

    #[test]
    fn normalize_address_line_handles_punctuation_and_case() {
        assert_eq!(
            Normalizer::normalize_address_line("10, DOWNING Street."),
            "10 downing street",
        );
    }

    #[test]
    fn normalize_address_line_is_idempotent() {
        for input in [
            "123 High St",
            " 45 N Park Ave ",
            "10, Downing Street.",
            "",
        ] {
            let once = Normalizer::normalize_address_line(input);
            let twice = Normalizer::normalize_address_line(&once);
            assert_eq!(once, twice, "not idempotent for {input:?}");
        }
    }

    // ---------- parse_address_line ----------

    #[test]
    fn parse_address_extracts_simple_house_number() {
        let p = Normalizer::parse_address_line("123 High Street");
        assert_eq!(p.house_number.as_deref(), Some("123"));
        assert_eq!(p.unit, None);
        assert_eq!(p.street, "high street");
    }

    #[test]
    fn parse_address_handles_alphanumeric_house_number() {
        let p = Normalizer::parse_address_line("10A Downing St");
        assert_eq!(p.house_number.as_deref(), Some("10A"));
        assert_eq!(p.street, "downing street");
    }

    #[test]
    fn parse_address_does_not_greedily_consume_street_name() {
        // "10 Apple Tree Lane" — `Apple` must not be absorbed into the
        // house number because two consecutive alphabetic characters
        // signal it's part of the street name.
        let p = Normalizer::parse_address_line("10 Apple Tree Lane");
        assert_eq!(p.house_number.as_deref(), Some("10"));
        assert_eq!(p.street, "apple tree lane");
    }

    #[test]
    fn parse_address_recognises_flat_prefix() {
        let p = Normalizer::parse_address_line("Flat 2A, 10 Downing Street");
        assert_eq!(p.unit.as_deref(), Some("flat 2a"));
        assert_eq!(p.house_number.as_deref(), Some("10"));
        assert_eq!(p.street, "downing street");
    }

    #[test]
    fn parse_address_recognises_apt_prefix() {
        let p = Normalizer::parse_address_line("Apt 5, 1600 Pennsylvania Ave");
        assert_eq!(p.unit.as_deref(), Some("apt 5"));
        assert_eq!(p.house_number.as_deref(), Some("1600"));
        assert_eq!(p.street, "pennsylvania avenue");
    }

    #[test]
    fn parse_address_recognises_suite_and_ste_and_unit_and_room() {
        for input in [
            "Suite 12, 100 Main St",
            "Ste 12, 100 Main St",
            "Unit 12, 100 Main St",
            "Room 12, 100 Main St",
        ] {
            let p = Normalizer::parse_address_line(input);
            assert!(p.unit.is_some(), "no unit for {input:?}");
            assert_eq!(p.house_number.as_deref(), Some("100"));
            assert_eq!(p.street, "main street");
        }
    }

    #[test]
    fn parse_address_does_not_treat_st_as_unit_prefix() {
        // The "St" street-type abbreviation must not be confused with the
        // "Ste" unit prefix. Only a literal "ste" token triggers the unit
        // path.
        let p = Normalizer::parse_address_line("St Mary's Road");
        assert_eq!(p.unit, None);
    }

    #[test]
    fn parse_address_no_leading_number_falls_back_to_street_only() {
        let p = Normalizer::parse_address_line("Buckingham Palace");
        assert_eq!(p.house_number, None);
        assert_eq!(p.unit, None);
        assert_eq!(p.street, "buckingham palace");
    }

    #[test]
    fn parse_address_empty_input_yields_empty_street() {
        let p = Normalizer::parse_address_line("");
        assert_eq!(p.house_number, None);
        assert_eq!(p.unit, None);
        assert_eq!(p.street, "");
    }

    #[test]
    fn parse_address_round_trips_through_serde() {
        let p = Normalizer::parse_address_line("Flat 2A, 10A Downing Street");
        let json = serde_json::to_string(&p).unwrap();
        let back: ParsedAddressLine = serde_json::from_str(&json).unwrap();
        assert_eq!(p, back);
    }

    #[test]
    fn parse_address_uppercases_house_number_suffix() {
        let p = Normalizer::parse_address_line("10a Downing St");
        assert_eq!(p.house_number.as_deref(), Some("10A"));
    }

    // ---------- normalize_email ----------

    #[test]
    fn normalize_email_lowercases_and_trims() {
        assert_eq!(
            Normalizer::normalize_email(" Alice@Example.ORG ", false),
            Some("alice@example.org".into()),
        );
    }

    #[test]
    fn normalize_email_preserves_well_formed_input() {
        assert_eq!(
            Normalizer::normalize_email("alice@example.org", false),
            Some("alice@example.org".into()),
        );
    }

    #[test]
    fn normalize_email_rejects_missing_at_sign() {
        assert_eq!(Normalizer::normalize_email("no-at-sign", false), None);
    }

    #[test]
    fn normalize_email_rejects_empty_localpart_or_domain() {
        assert_eq!(Normalizer::normalize_email("@example.org", false), None);
        assert_eq!(Normalizer::normalize_email("alice@", false), None);
    }

    #[test]
    fn normalize_email_rejects_multiple_at_signs() {
        assert_eq!(Normalizer::normalize_email("a@b@c", false), None);
    }

    #[test]
    fn normalize_email_rejects_empty_and_whitespace() {
        assert_eq!(Normalizer::normalize_email("", false), None);
        assert_eq!(Normalizer::normalize_email(" ", false), None);
    }

    #[test]
    fn normalize_email_gmail_dot_folding_strips_dots_in_localpart() {
        assert_eq!(
            Normalizer::normalize_email("j.smith@gmail.com", true),
            Some("jsmith@gmail.com".into()),
        );
        assert_eq!(
            Normalizer::normalize_email("J.S.M.I.T.H@gmail.com", true),
            Some("jsmith@gmail.com".into()),
        );
    }

    #[test]
    fn normalize_email_gmail_dot_folding_strips_plus_tag() {
        assert_eq!(
            Normalizer::normalize_email("jsmith+work@gmail.com", true),
            Some("jsmith@gmail.com".into()),
        );
        assert_eq!(
            Normalizer::normalize_email("j.smith+anything@googlemail.com", true),
            Some("jsmith@googlemail.com".into()),
        );
    }

    #[test]
    fn normalize_email_gmail_dot_folding_does_not_touch_other_domains() {
        assert_eq!(
            Normalizer::normalize_email("j.smith@example.org", true),
            Some("j.smith@example.org".into()),
        );
        assert_eq!(
            Normalizer::normalize_email("jsmith+work@example.org", true),
            Some("jsmith+work@example.org".into()),
        );
    }

    #[test]
    fn normalize_email_dot_folding_off_preserves_localpart_dots() {
        assert_eq!(
            Normalizer::normalize_email("j.smith@gmail.com", false),
            Some("j.smith@gmail.com".into()),
        );
    }

    #[test]
    fn normalize_email_is_idempotent_on_canonical_form() {
        for (input, fold) in [
            ("Alice@Example.ORG", false),
            ("j.smith@gmail.com", true),
            ("jsmith+x@gmail.com", true),
            ("user@host.tld", false),
        ] {
            let once = Normalizer::normalize_email(input, fold).expect("parses");
            let twice = Normalizer::normalize_email(&once, fold).expect("idempotent");
            assert_eq!(once, twice, "not idempotent for {input:?} fold={fold}");
        }
    }

    #[test]
    fn normalize_email_gmail_dot_folding_rejects_empty_localpart_after_folding() {
        // A localpart that is entirely dots (or all stripped to empty) is
        // not a valid address.
        assert_eq!(Normalizer::normalize_email("...@gmail.com", true), None);
    }

    // ---------- private helpers and country table ----------

    #[test]
    fn extract_house_number_matches_documented_examples() {
        assert_eq!(
            extract_house_number("10 Downing Street"),
            (Some("10".to_string()), " Downing Street"),
        );
        assert_eq!(
            extract_house_number("10A High St"),
            (Some("10A".to_string()), " High St"),
        );
        assert_eq!(
            extract_house_number("Buckingham Palace"),
            (None, "Buckingham Palace"),
        );
        // Leading whitespace is skipped and the suffix is uppercased.
        assert_eq!(
            extract_house_number("  10a"),
            (Some("10A".to_string()), ""),
        );
    }

    #[test]
    fn country_table_entries_are_well_formed() {
        let mut seen_iso = std::collections::HashSet::new();
        for c in COUNTRY_PHONE_TABLE {
            let iso = c.iso_alpha2;
            assert!(
                iso.len() == 2 && iso.bytes().all(|b| b.is_ascii_uppercase()),
                "bad ISO code {iso:?}",
            );
            assert!(seen_iso.insert(iso), "duplicate ISO code {iso:?}");
            // `lookup_by_dial_code_prefix` only probes 1..=3 digit prefixes.
            assert!(
                (1..=3).contains(&c.dial_code.len())
                    && c.dial_code.bytes().all(|b| b.is_ascii_digit()),
                "bad dial code for {iso}",
            );
            if let Some(tp) = c.trunk_prefix {
                assert!(
                    !tp.is_empty() && tp.bytes().all(|b| b.is_ascii_digit()),
                    "bad trunk prefix for {iso}",
                );
            }
            assert!(
                c.min_nsn >= 1 && c.min_nsn <= c.max_nsn,
                "bad NSN bounds for {iso}",
            );
            // E.164 numbers are at most 15 digits including the dial code.
            assert!(
                c.dial_code.len() + c.max_nsn <= 15,
                "{iso} can exceed the 15-digit E.164 limit",
            );
        }
    }

    #[test]
    fn lookup_by_iso_ignores_case_and_rejects_unknown_codes() {
        assert_eq!(lookup_by_iso("gb").map(|c| c.dial_code), Some("44"));
        assert_eq!(lookup_by_iso("Ie").map(|c| c.dial_code), Some("353"));
        assert!(lookup_by_iso("XX").is_none());
        assert!(lookup_by_iso("").is_none());
        assert!(lookup_by_iso("G\u{00df}").is_none());
    }

    #[test]
    fn lookup_by_dial_code_prefix_prefers_longest_match() {
        let iso = |digits: &str| lookup_by_dial_code_prefix(digits).map(|c| c.iso_alpha2);
        assert_eq!(iso("35312345678"), Some("IE"));
        assert_eq!(iso("33123456789"), Some("FR"));
        // NANP: the first table entry with dial code `1` wins.
        assert_eq!(iso("14155551234"), Some("US"));
        assert_eq!(iso("9991234567"), None);
        assert_eq!(iso(""), None);
    }

    #[test]
    fn strip_trunk_prefix_removes_only_a_configured_leading_prefix() {
        let gb = lookup_by_iso("GB").expect("GB is in the table");
        assert_eq!(strip_trunk_prefix(gb, "07700900123"), "7700900123");
        assert_eq!(strip_trunk_prefix(gb, "7700900123"), "7700900123");
        // Only one occurrence is removed, and a bare prefix is kept.
        assert_eq!(strip_trunk_prefix(gb, "007700"), "07700");
        assert_eq!(strip_trunk_prefix(gb, "0"), "0");

        let lt = lookup_by_iso("LT").expect("LT is in the table");
        assert_eq!(strip_trunk_prefix(lt, "861234567"), "61234567");

        let es = lookup_by_iso("ES").expect("ES is in the table");
        assert_eq!(strip_trunk_prefix(es, "0912345678"), "0912345678");
    }
}