worker_matcher/
normalizer.rs

1//! Text normalisation for worker demographic data.
2//!
3//! Research on worker identification (see `spec.md` §5) is unanimous: most
4//! accuracy gains come from **standardising the input** before scoring, not
5//! from cleverer similarity algorithms. This module exposes the canonical
//! transformations the matching engine applies to names, postcodes, phone
//! numbers, email addresses, postal-address lines, and phonetic codes.
8//!
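//! All transformations are **deterministic**, and the normalisers are
//! designed to be **idempotent** — `f(f(x)) == f(x)` — on well-formed input;
//! see each function's documentation for the exact guarantee. They borrow
//! their input and return newly allocated values (some build short-lived
//! intermediate buffers along the way).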
11//!
12//! ## Quick examples
13//!
14//! ```
15//! use worker_matcher::Normalizer;
16//!
17//! // Names: lowercase, drop diacritics, drop ASCII punctuation, collapse spaces.
18//! assert_eq!(Normalizer::normalize_name("  O'Brien  "), "obrien");
19//! assert_eq!(Normalizer::normalize_name("Siân"),         "sian");
20//!
21//! // Postcodes: strip whitespace, uppercase.
22//! assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
23//!
24//! // Phone numbers: keep digits, strip international and trunk prefixes.
25//! assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
26//! ```
27//!
28//! ## What this module deliberately does *not* do
29//!
30//! - It does not validate NHS numbers — that is delegated to the
31//!   `nhs-number` crate at the call-site (see [`crate::matcher`]).
//! - It does not normalise middle names (see spec item OQ-1). Email
//!   addresses (spec task T-11) are handled by
//!   [`Normalizer::normalize_email`].
34//! - It does not handle non-ASCII punctuation such as the curly apostrophe
35//!   `’` (U+2019). Upstream code should convert those to ASCII first.
36//!
37//! ## International phone numbers
38//!
39//! Two phone normalisers are provided:
40//!
41//! - [`Normalizer::normalize_phone`] — UK-centric national-significant form,
42//!   suitable for legacy or single-jurisdiction call-sites. Idempotent and
43//!   infallible.
44//! - [`Normalizer::normalize_phone_e164`] — international-aware E.164 form
45//!   (`+CCNNNN…`) for jurisdictions in the supported country table. Returns
46//!   `None` if the input cannot be confidently parsed.
47//!
48//! The matching engine tries E.164 first and falls back to the legacy form
49//! when either input is unparseable, so existing single-country deployments
50//! observe the same behaviour while multinational deployments gain
51//! cross-country disambiguation (a French number and a UK number that share
52//! the same trunk digits no longer collide).
53
54use serde::{Deserialize, Serialize};
55use unicode_normalization::UnicodeNormalization;
56
/// Stateless namespace for text normalisation routines.
///
/// `Normalizer` is a unit type with no fields; every method is an associated
/// function, so no instance is ever needed to call them. It is a struct
/// rather than a module of free functions purely so the public API has a
/// single, discoverable entry point.
///
/// The type derives `Debug`, `Clone`, `Copy`, and `Default` so that it can be
/// stored inside downstream types that derive those traits themselves.
///
/// ```
/// use worker_matcher::Normalizer;
///
/// let canonical = Normalizer::normalize_name("José-María");
/// assert_eq!(canonical, "josemaria");
/// ```
#[derive(Debug, Clone, Copy, Default)]
pub struct Normalizer;
70
71impl Normalizer {
72    /// Normalise a human name for comparison.
73    ///
74    /// Steps, in order:
75    ///
76    /// 1. Decompose to Unicode NFKD form (`é` → `e` + combining acute).
77    /// 2. Drop combining marks (diacritics).
78    /// 3. Drop ASCII punctuation (apostrophes, hyphens, full stops, …).
79    /// 4. Lowercase.
80    /// 5. Collapse consecutive whitespace to single ASCII spaces; trim ends.
81    ///
82    /// The result is suitable for direct equality comparison or for feeding
83    /// into a string-similarity scorer.
84    ///
85    /// # Examples
86    ///
87    /// Whitespace is collapsed and trimmed:
88    ///
89    /// ```
90    /// use worker_matcher::Normalizer;
91    /// assert_eq!(Normalizer::normalize_name("  John  Smith  "), "john smith");
92    /// ```
93    ///
94    /// Apostrophes and hyphens are stripped:
95    ///
96    /// ```
97    /// # use worker_matcher::Normalizer;
98    /// assert_eq!(Normalizer::normalize_name("O'Brien"),    "obrien");
99    /// assert_eq!(Normalizer::normalize_name("MARY-JANE"),  "maryjane");
100    /// ```
101    ///
102    /// Diacritics are removed:
103    ///
104    /// ```
105    /// # use worker_matcher::Normalizer;
106    /// assert_eq!(Normalizer::normalize_name("José"),  "jose");
107    /// assert_eq!(Normalizer::normalize_name("Siân"),  "sian");
108    /// assert_eq!(Normalizer::normalize_name("Łukasz"), "łukasz");  // ł has no decomposition
109    /// ```
110    ///
111    /// Empty and whitespace-only input round-trip cleanly:
112    ///
113    /// ```
114    /// # use worker_matcher::Normalizer;
115    /// assert_eq!(Normalizer::normalize_name(""),       "");
116    /// assert_eq!(Normalizer::normalize_name("    "),   "");
117    /// ```
118    ///
119    /// The function is **idempotent**:
120    ///
121    /// ```
122    /// # use worker_matcher::Normalizer;
123    /// let once = Normalizer::normalize_name("  José-María  ");
124    /// let twice = Normalizer::normalize_name(&once);
125    /// assert_eq!(once, twice);
126    /// ```
127    pub fn normalize_name(name: &str) -> String {
128        name.nfkd()
129            .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
130            .filter(|c| !c.is_ascii_punctuation())
131            .collect::<String>()
132            .to_lowercase()
133            .split_whitespace()
134            .collect::<Vec<_>>()
135            .join(" ")
136    }
137
138    /// Normalise a postcode for comparison.
139    ///
140    /// Steps: drop all whitespace, then uppercase. No locale-specific
141    /// validation — that is intentionally out of scope.
142    ///
143    /// # Examples
144    ///
145    /// UK postcodes with and without the conventional space are equivalent:
146    ///
147    /// ```
148    /// use worker_matcher::Normalizer;
149    /// assert_eq!(Normalizer::normalize_postcode("CF10 1AA"),    "CF101AA");
150    /// assert_eq!(Normalizer::normalize_postcode("cf101aa"),     "CF101AA");
151    /// assert_eq!(Normalizer::normalize_postcode("  cf10 1aa "), "CF101AA");
152    /// ```
153    ///
154    /// Empty input is preserved:
155    ///
156    /// ```
157    /// # use worker_matcher::Normalizer;
158    /// assert_eq!(Normalizer::normalize_postcode(""), "");
159    /// ```
160    ///
161    /// Idempotent:
162    ///
163    /// ```
164    /// # use worker_matcher::Normalizer;
165    /// let once = Normalizer::normalize_postcode("sw1a 2aa");
166    /// let twice = Normalizer::normalize_postcode(&once);
167    /// assert_eq!(once, twice);
168    /// ```
169    pub fn normalize_postcode(postcode: &str) -> String {
170        postcode
171            .chars()
172            .filter(|c| !c.is_whitespace())
173            .collect::<String>()
174            .to_uppercase()
175    }
176
177    /// Normalise a phone number for comparison.
178    ///
179    /// Steps:
180    ///
181    /// 1. Keep only ASCII digits (drop spaces, brackets, hyphens, `+`, …).
182    /// 2. If the result starts with `0044`, drop those four characters.
183    /// 3. Else, if the result starts with `44` and is at least 12 digits long,
184    ///    drop the leading `44`.
185    /// 4. Else, if the result starts with `0` and is longer than one digit,
186    ///    drop the leading `0`.
187    ///
188    /// This canonicalises the common UK formats into a single subscriber
189    /// number with no leading prefix. International numbers from other
190    /// countries pass through unchanged.
191    ///
192    /// # Examples
193    ///
194    /// ```
195    /// use worker_matcher::Normalizer;
196    ///
197    /// // UK mobile, in three formats:
198    /// assert_eq!(Normalizer::normalize_phone("07700 900123"),    "7700900123");
199    /// assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
200    /// assert_eq!(Normalizer::normalize_phone("0044 7700 900123"), "7700900123");
201    ///
202    /// // UK landline with brackets and spaces:
203    /// assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
204    ///
205    /// // Empty input is preserved (no digits to keep):
206    /// assert_eq!(Normalizer::normalize_phone(""), "");
207    /// ```
208    ///
209    /// Idempotent on canonical inputs:
210    ///
211    /// ```
212    /// # use worker_matcher::Normalizer;
213    /// let once = Normalizer::normalize_phone("07700 900123");
214    /// let twice = Normalizer::normalize_phone(&once);
215    /// assert_eq!(once, twice);
216    /// ```
217    pub fn normalize_phone(phone: &str) -> String {
218        let digits: String = phone.chars().filter(|c| c.is_ascii_digit()).collect();
219
220        if digits.starts_with("0044") && digits.len() > 4 {
221            return digits[4..].to_string();
222        }
223
224        if digits.starts_with("44") && digits.len() >= 12 {
225            return digits[2..].to_string();
226        }
227
228        if digits.starts_with('0') && digits.len() > 1 {
229            return digits[1..].to_string();
230        }
231
232        digits
233    }
234
235    /// Normalise a phone number to its E.164-style canonical form.
236    ///
237    /// E.164 is the ITU-T standard for international telephone numbers and
238    /// has the shape `+CCNNN…`, where `CC` is the country dialling code
239    /// (1–3 digits) and the remainder is the national-significant number
240    /// (NSN) with no trunk prefix.
241    ///
242    /// The function accepts a wide range of textual layouts:
243    ///
244    /// - `+CC…` (explicit international, the canonical input form).
245    /// - `00CC…` (international access code, common across Europe).
246    /// - `0…` (national format, trunk-prefix) — interpreted relative to
247    ///   `default_country` when the country uses a national trunk `0`.
248    /// - `NSN…` (bare national-significant number) — interpreted relative
249    ///   to `default_country`.
250    ///
251    /// Returns `Some(canonical)` if the input parses against a country in
252    /// the supported table; otherwise `None`. The supported countries are
253    /// the five jurisdictions for which the crate exposes a national
254    /// healthcare identifier (United Kingdom, France, Spain, Ireland, and
255    /// — sharing the GB dial code — UK Northern Ireland), plus the most
256    /// common worker-mobility partners (US, CA, DE, IT, NL, BE, PT, CH,
    /// AT, SE, NO, DK, FI, PL, AU, NZ, JP, CN, IN, BR, MX, ZA).
    ///
    /// `default_country` is the **ISO 3166-1 alpha-2 code** (e.g. `"GB"`,
    /// `"FR"`, `"US"`) of the jurisdiction whose national format applies
    /// when the input lacks an explicit international marker. Pass `None`
    /// to refuse to assume a default — only explicit `+CC` / `00CC` inputs
    /// will parse.
262    ///
263    /// The function is **deterministic** and **idempotent**: feeding a
264    /// canonical `+CCNNN…` string back in returns the same string.
265    ///
266    /// # Examples
267    ///
268    /// UK mobile, three textual layouts, all canonicalise to the same E.164 form:
269    ///
270    /// ```
271    /// use worker_matcher::Normalizer;
272    /// assert_eq!(
273    ///     Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")),
274    ///     Some("+447700900123".to_string()),
275    /// );
276    /// assert_eq!(
277    ///     Normalizer::normalize_phone_e164("0044 7700 900123", Some("GB")),
278    ///     Some("+447700900123".to_string()),
279    /// );
280    /// assert_eq!(
281    ///     Normalizer::normalize_phone_e164("07700 900123", Some("GB")),
282    ///     Some("+447700900123".to_string()),
283    /// );
284    /// ```
285    ///
286    /// French national format vs international form:
287    ///
288    /// ```
289    /// # use worker_matcher::Normalizer;
290    /// assert_eq!(
291    ///     Normalizer::normalize_phone_e164("01 23 45 67 89", Some("FR")),
292    ///     Some("+33123456789".to_string()),
293    /// );
294    /// assert_eq!(
295    ///     Normalizer::normalize_phone_e164("+33 1 23 45 67 89", Some("GB")),
296    ///     Some("+33123456789".to_string()),
297    /// );
298    /// ```
299    ///
300    /// North American (NANP) numbers have no trunk prefix:
301    ///
302    /// ```
303    /// # use worker_matcher::Normalizer;
304    /// assert_eq!(
305    ///     Normalizer::normalize_phone_e164("(415) 555-1234", Some("US")),
306    ///     Some("+14155551234".to_string()),
307    /// );
308    /// assert_eq!(
309    ///     Normalizer::normalize_phone_e164("+1 415 555 1234", None),
310    ///     Some("+14155551234".to_string()),
311    /// );
312    /// ```
313    ///
314    /// Unparseable or ambiguous inputs return `None`:
315    ///
316    /// ```
317    /// # use worker_matcher::Normalizer;
318    /// // No default country and no international marker: ambiguous.
319    /// assert_eq!(Normalizer::normalize_phone_e164("07700 900123", None), None);
320    /// // Unknown dial code.
321    /// assert_eq!(Normalizer::normalize_phone_e164("+999 1234567", None), None);
322    /// // Empty input.
323    /// assert_eq!(Normalizer::normalize_phone_e164("", Some("GB")), None);
324    /// ```
325    ///
326    /// Idempotent on canonical inputs:
327    ///
328    /// ```
329    /// # use worker_matcher::Normalizer;
330    /// let once = Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")).unwrap();
331    /// let twice = Normalizer::normalize_phone_e164(&once, Some("GB")).unwrap();
332    /// assert_eq!(once, twice);
333    /// ```
334    pub fn normalize_phone_e164(phone: &str, default_country: Option<&str>) -> Option<String> {
335        let has_plus = phone.chars().any(|c| c == '+');
336        let digits: String = phone.chars().filter(|c| c.is_ascii_digit()).collect();
337        if digits.is_empty() {
338            return None;
339        }
340
341        let (info, nsn): (&CountryPhoneInfo, String) = if has_plus {
342            let info = lookup_by_dial_code_prefix(&digits)?;
343            let rest = &digits[info.dial_code.len()..];
344            let rest = strip_trunk_prefix(info, rest);
345            (info, rest.to_string())
346        } else if let Some(stripped) = digits.strip_prefix("00") {
347            let info = lookup_by_dial_code_prefix(stripped)?;
348            let rest = &stripped[info.dial_code.len()..];
349            let rest = strip_trunk_prefix(info, rest);
350            (info, rest.to_string())
351        } else {
352            let iso = default_country?;
353            let info = lookup_by_iso(iso)?;
354            let nsn = strip_trunk_prefix(info, &digits);
355            (info, nsn.to_string())
356        };
357
358        if nsn.len() < info.min_nsn || nsn.len() > info.max_nsn {
359            return None;
360        }
361
362        Some(format!("+{}{}", info.dial_code, nsn))
363    }
364
365    /// Expand common postal address abbreviations as whole tokens.
366    ///
367    /// The input is tokenised on whitespace and each token is matched
    /// case-insensitively (after stripping any trailing `.` or `,` characters)
369    /// against a fixed table of street-type and directional abbreviations.
370    /// Recognised tokens are replaced with their long form, lowercased;
371    /// unrecognised tokens are passed through verbatim. Tokens are then
372    /// re-joined by single spaces.
373    ///
374    /// This function is intentionally simple: it does **not** apply any
375    /// position-aware heuristics. The well-known ambiguous case `"St"` —
376    /// which can mean *Street* or *Saint* — is always expanded to
377    /// *Street*. In practice this remains useful for fuzzy matching
378    /// because the canonical form is consistent on both sides of a
379    /// comparison; pre-process upstream if you need finer disambiguation.
380    ///
381    /// # Examples
382    ///
383    /// ```
384    /// use worker_matcher::Normalizer;
385    /// assert_eq!(
386    ///     Normalizer::expand_street_abbreviations("123 High St"),
387    ///     "123 High street",
388    /// );
389    /// assert_eq!(
390    ///     Normalizer::expand_street_abbreviations("45 N. Park Ave."),
391    ///     "45 north Park avenue",
392    /// );
393    /// assert_eq!(
394    ///     Normalizer::expand_street_abbreviations("12 Sunset Blvd"),
395    ///     "12 Sunset boulevard",
396    /// );
397    /// ```
398    ///
399    /// Idempotent on already-expanded inputs (long forms are not
400    /// re-expanded):
401    ///
402    /// ```
403    /// # use worker_matcher::Normalizer;
404    /// let once = Normalizer::expand_street_abbreviations("10 Downing St");
405    /// let twice = Normalizer::expand_street_abbreviations(&once);
406    /// assert_eq!(once, twice);
407    /// ```
408    pub fn expand_street_abbreviations(line: &str) -> String {
409        line.split_whitespace()
410            .map(expand_one_token)
411            .collect::<Vec<_>>()
412            .join(" ")
413    }
414
415    /// Normalise an address line for comparison.
416    ///
417    /// Pipeline:
418    ///
419    /// 1. Expand street-type and directional abbreviations via
420    ///    [`Normalizer::expand_street_abbreviations`] (so `"St" → "street"`,
421    ///    `"Rd" → "road"`, `"N" → "north"`).
422    /// 2. Apply the name-normalisation pipeline
423    ///    ([`Normalizer::normalize_name`]): NFKD-decompose, drop combining
424    ///    marks, drop ASCII punctuation, lowercase, collapse whitespace.
425    ///
426    /// The result is idempotent and suitable for direct equality or
427    /// similarity comparison.
428    ///
429    /// # Examples
430    ///
431    /// Abbreviated and full forms canonicalise identically:
432    ///
433    /// ```
434    /// use worker_matcher::Normalizer;
435    /// assert_eq!(
436    ///     Normalizer::normalize_address_line("123 High St"),
437    ///     Normalizer::normalize_address_line("123 High Street"),
438    /// );
439    /// assert_eq!(
440    ///     Normalizer::normalize_address_line("45 N Park Ave"),
441    ///     Normalizer::normalize_address_line("45 North Park Avenue"),
442    /// );
443    /// ```
444    ///
445    /// Punctuation and case are normalised:
446    ///
447    /// ```
448    /// # use worker_matcher::Normalizer;
449    /// assert_eq!(
450    ///     Normalizer::normalize_address_line("10, DOWNING Street."),
451    ///     "10 downing street",
452    /// );
453    /// ```
454    pub fn normalize_address_line(line: &str) -> String {
455        Self::normalize_name(&Self::expand_street_abbreviations(line))
456    }
457
458    /// Parse an address line into its structured components.
459    ///
460    /// The function performs a best-effort structural decomposition of a
461    /// single-line postal address into:
462    ///
463    /// - `house_number` — the leading run of digits (with an optional
464    ///   single alphabetic suffix, e.g. `"10A"`), uppercased. `None` if
465    ///   no leading number is present.
    /// - `unit` — a recognised sub-unit prefix (`Flat`, `Apt`,
    ///   `Apartment`, `Unit`, `Suite`, `Ste`, `Room`, `Rm`) and its
    ///   identifier, lowercased and space-joined (e.g. `"flat 2a"`).
    ///   `None` if no recognised prefix is present.
470    /// - `street` — the remaining text after `unit` and `house_number`
471    ///   are removed, run through [`Normalizer::normalize_address_line`].
472    ///
473    /// Parsing is **deterministic** and **format-only** — no postal
474    /// reference is consulted. Inputs that do not match the simple
475    /// regular structure (e.g. a postcode-only string, a city name)
476    /// degrade gracefully: `house_number` and `unit` are `None`, and
477    /// `street` carries the normalised input.
478    ///
479    /// # Examples
480    ///
481    /// Typical UK / US single-line addresses:
482    ///
483    /// ```
484    /// use worker_matcher::Normalizer;
485    ///
486    /// let p = Normalizer::parse_address_line("123 High Street");
487    /// assert_eq!(p.house_number.as_deref(), Some("123"));
488    /// assert_eq!(p.unit, None);
489    /// assert_eq!(p.street, "high street");
490    ///
491    /// let p = Normalizer::parse_address_line("10A Downing St");
492    /// assert_eq!(p.house_number.as_deref(), Some("10A"));
493    /// assert_eq!(p.street, "downing street");
494    ///
495    /// let p = Normalizer::parse_address_line("Flat 2A, 10 Downing Street");
496    /// assert_eq!(p.unit.as_deref(), Some("flat 2a"));
497    /// assert_eq!(p.house_number.as_deref(), Some("10"));
498    /// assert_eq!(p.street, "downing street");
499    ///
500    /// let p = Normalizer::parse_address_line("Apt 5, 1600 Pennsylvania Ave");
501    /// assert_eq!(p.unit.as_deref(), Some("apt 5"));
502    /// assert_eq!(p.house_number.as_deref(), Some("1600"));
503    /// assert_eq!(p.street, "pennsylvania avenue");
504    /// ```
505    ///
506    /// Inputs without a leading number still parse:
507    ///
508    /// ```
509    /// # use worker_matcher::Normalizer;
510    /// let p = Normalizer::parse_address_line("Buckingham Palace");
511    /// assert_eq!(p.house_number, None);
512    /// assert_eq!(p.unit, None);
513    /// assert_eq!(p.street, "buckingham palace");
514    /// ```
515    pub fn parse_address_line(line: &str) -> ParsedAddressLine {
516        let trimmed = line.trim();
517        let (unit, after_unit) = extract_unit_prefix(trimmed);
518        let after_unit = after_unit.trim_start_matches([',', ' ', '\t']).trim();
519        let (house_number, after_number) = extract_house_number(after_unit);
520        let after_number = after_number.trim_start_matches([',', ' ', '\t']).trim();
521        ParsedAddressLine {
522            house_number,
523            unit,
524            street: Self::normalize_address_line(after_number),
525        }
526    }
527
528    /// Compute a phonetic (Soundex) code for a name.
529    ///
530    /// Internally, the input is first normalised via
531    /// [`Normalizer::normalize_name`] and then encoded with the American
532    /// Soundex algorithm. Names that sound alike map to the same code, which
533    /// lets the matcher catch spelling variants such as "Smith" / "Smyth" or
534    /// "Stephen" / "Steven".
535    ///
536    /// The implementation is suitable for English-language names. Non-English
537    /// phonemes may be lost. T-9 (spec §21.4) decided to keep Soundex as the
538    /// default and expose an opt-in `MatchConfig::phonetic_encoder` enum
539    /// (Double Metaphone, Daitch-Mokotoff) gated behind a Cargo feature flag
540    /// once an empirical multinational worker corpus is available;
541    /// implementation is tracked as T-9.1.
542    ///
543    /// # Examples
544    ///
545    /// Similar-sounding spellings share a code:
546    ///
547    /// ```
548    /// use worker_matcher::Normalizer;
549    /// assert_eq!(Normalizer::phonetic_code("Smith"), Normalizer::phonetic_code("Smyth"));
550    /// assert_eq!(Normalizer::phonetic_code("Stephen"), Normalizer::phonetic_code("Steven"));
551    /// ```
552    ///
553    /// Different families produce different codes:
554    ///
555    /// ```
556    /// # use worker_matcher::Normalizer;
557    /// assert_ne!(Normalizer::phonetic_code("Jones"), Normalizer::phonetic_code("Smith"));
558    /// ```
559    ///
560    /// Empty input returns an empty string, not a default Soundex value:
561    ///
562    /// ```
563    /// # use worker_matcher::Normalizer;
564    /// assert_eq!(Normalizer::phonetic_code(""),       "");
565    /// assert_eq!(Normalizer::phonetic_code("   "),    "");
566    /// ```
567    pub fn phonetic_code(name: &str) -> String {
568        let normalized = Self::normalize_name(name);
569        if normalized.is_empty() {
570            return String::new();
571        }
572        soundex::american_soundex(&normalized)
573    }
574
575    /// Normalise an email address for comparison.
576    ///
577    /// Steps:
578    ///
579    /// 1. Trim surrounding whitespace.
580    /// 2. Lowercase the entire address (RFC 5321 makes the domain
581    ///    case-insensitive and most real-world deployments treat the
582    ///    localpart case-insensitively too; case-sensitive localparts
583    ///    are technically legal but vanishingly rare in healthcare data).
584    /// 3. Reject inputs that lack exactly one `@` or that have an empty
585    ///    localpart or domain by returning `None`.
586    /// 4. If `gmail_dot_folding` is `true` and the domain is `gmail.com`
587    ///    or `googlemail.com`, strip every `.` from the localpart and
    ///    drop any `+tag` suffix. Both transformations are safe for Gmail
    ///    addresses under Google's documented routing rules:
590    ///    `j.smith@gmail.com`, `js.mith@gmail.com`, and
591    ///    `jsmith+work@gmail.com` all deliver to the same mailbox as
592    ///    `jsmith@gmail.com`.
593    ///
594    /// The function is **deterministic** and **idempotent** on
595    /// successful outputs.
596    ///
597    /// # Examples
598    ///
599    /// Common case-and-whitespace normalisation:
600    ///
601    /// ```
602    /// use worker_matcher::Normalizer;
603    /// assert_eq!(
604    ///     Normalizer::normalize_email("  Alice@Example.ORG  ", false),
605    ///     Some("alice@example.org".to_string()),
606    /// );
607    /// ```
608    ///
609    /// Malformed inputs return `None`:
610    ///
611    /// ```
612    /// # use worker_matcher::Normalizer;
613    /// assert_eq!(Normalizer::normalize_email("no-at-sign", false), None);
614    /// assert_eq!(Normalizer::normalize_email("@example.org", false), None);
615    /// assert_eq!(Normalizer::normalize_email("alice@", false), None);
616    /// assert_eq!(Normalizer::normalize_email("a@b@c", false), None);
617    /// assert_eq!(Normalizer::normalize_email("", false), None);
618    /// ```
619    ///
620    /// Optional Gmail dot-folding:
621    ///
622    /// ```
623    /// # use worker_matcher::Normalizer;
624    /// assert_eq!(
625    ///     Normalizer::normalize_email("j.smith@gmail.com", true),
626    ///     Some("jsmith@gmail.com".to_string()),
627    /// );
628    /// assert_eq!(
629    ///     Normalizer::normalize_email("jsmith+work@googlemail.com", true),
630    ///     Some("jsmith@googlemail.com".to_string()),
631    /// );
632    /// // Dot-folding does not touch non-Gmail addresses.
633    /// assert_eq!(
634    ///     Normalizer::normalize_email("j.smith@example.org", true),
635    ///     Some("j.smith@example.org".to_string()),
636    /// );
637    /// ```
638    ///
639    /// Idempotent on canonical inputs:
640    ///
641    /// ```
642    /// # use worker_matcher::Normalizer;
643    /// let once = Normalizer::normalize_email("Alice@Example.ORG", false).unwrap();
644    /// let twice = Normalizer::normalize_email(&once, false).unwrap();
645    /// assert_eq!(once, twice);
646    /// ```
647    pub fn normalize_email(email: &str, gmail_dot_folding: bool) -> Option<String> {
648        let trimmed = email.trim().to_lowercase();
649        if trimmed.is_empty() {
650            return None;
651        }
652        // Require exactly one '@'.
653        let (local, domain) = trimmed.split_once('@')?;
654        if local.is_empty() || domain.is_empty() {
655            return None;
656        }
657        // Reject any further '@' in the domain side.
658        if domain.contains('@') {
659            return None;
660        }
661        if gmail_dot_folding && (domain == "gmail.com" || domain == "googlemail.com") {
662            let local_no_plus = match local.find('+') {
663                Some(i) => &local[..i],
664                None => local,
665            };
666            let local_folded: String = local_no_plus.chars().filter(|c| *c != '.').collect();
667            if local_folded.is_empty() {
668                return None;
669            }
670            return Some(format!("{local_folded}@{domain}"));
671        }
672        Some(format!("{local}@{domain}"))
673    }
674}
675
676/// Structured decomposition of a postal-address line.
677///
678/// Produced by [`Normalizer::parse_address_line`]. The struct is
679/// `Serialize + Deserialize` so it round-trips through JSON and can be
680/// embedded in downstream data models.
681///
682/// All three fields are best-effort: parsing is format-only and consults
683/// no postal reference. Inputs that don't follow the
684/// `(unit, house_number, street)` shape degrade gracefully, with the
685/// missing pieces returned as `None`.
686#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
687pub struct ParsedAddressLine {
688    /// Leading house / building number, including an optional single
689    /// alphabetic suffix (`"10A"`), uppercased. `None` when no leading
690    /// digit is present.
691    pub house_number: Option<String>,
692    /// Sub-unit prefix and identifier, lowercased and space-joined
693    /// (e.g. `"flat 2a"`, `"apt 5"`, `"suite 12"`). `None` when no
694    /// recognised prefix is present.
695    pub unit: Option<String>,
696    /// Remaining street portion, normalised via
697    /// [`Normalizer::normalize_address_line`].
698    pub street: String,
699}
700
/// Abbreviation → long-form table behind
/// [`Normalizer::expand_street_abbreviations`].
///
/// Keys are lowercase ASCII; a token is compared against them
/// case-insensitively once its trailing `.` / `,` characters have been
/// removed. Long forms are lowercase as well, so the later name-normalisation
/// pass leaves them untouched.
const STREET_ABBREVIATIONS: &[(&str, &str)] = &[
    // Street types.
    ("st", "street"),
    ("str", "street"),
    ("rd", "road"),
    ("ave", "avenue"),
    ("av", "avenue"),
    ("blvd", "boulevard"),
    ("bvd", "boulevard"),
    ("ln", "lane"),
    ("dr", "drive"),
    ("ct", "court"),
    ("pl", "place"),
    ("sq", "square"),
    ("ter", "terrace"),
    ("terr", "terrace"),
    ("hwy", "highway"),
    ("pkwy", "parkway"),
    ("mt", "mount"),
    ("mtn", "mountain"),
    ("cres", "crescent"),
    ("gdns", "gardens"),
    ("gdn", "garden"),
    ("gr", "grove"),
    ("cl", "close"),
    ("pk", "park"),
    ("plz", "plaza"),
    ("expy", "expressway"),
    ("trl", "trail"),
    // Compass directions.
    ("n", "north"),
    ("s", "south"),
    ("e", "east"),
    ("w", "west"),
    ("ne", "northeast"),
    ("nw", "northwest"),
    ("se", "southeast"),
    ("sw", "southwest"),
];
743
/// Lowercase keywords that [`Normalizer::parse_address_line`] accepts as the
/// start of a sub-unit designation (e.g. the `flat` in `"Flat 2A"`).
const UNIT_PREFIXES: &[&str] = &[
    // Dwellings.
    "flat",
    "apartment",
    "apt",
    // Commercial units.
    "unit",
    "suite",
    "ste",
    // Rooms.
    "room",
    "rm",
];
755
756/// Expand a single whitespace-separated token if it appears in
757/// [`STREET_ABBREVIATIONS`].
758///
759/// The token is matched after stripping at most one trailing `.` or `,`;
760/// the comparison is ASCII case-insensitive. Tokens that contain non-ASCII
761/// characters short-circuit to the original input unchanged.
762fn expand_one_token(tok: &str) -> String {
763    let stripped = tok.trim_end_matches(['.', ',']);
764    if !stripped.is_ascii() {
765        return tok.to_string();
766    }
767    let lower = stripped.to_ascii_lowercase();
768    for (abbrev, full) in STREET_ABBREVIATIONS {
769        if lower == *abbrev {
770            return (*full).to_string();
771        }
772    }
773    tok.to_string()
774}
775
776/// Extract a recognised unit prefix and its identifier from the start of `s`.
777///
778/// Returns `(Some("flat 2a"), rest)` when the input begins with a
779/// recognised keyword followed by an alphanumeric identifier; otherwise
780/// `(None, s)` unchanged.
781fn extract_unit_prefix(s: &str) -> (Option<String>, &str) {
782    let trimmed = s.trim_start();
783    // Find the first whitespace; everything before is the candidate keyword.
784    let kw_end = trimmed
785        .find(|c: char| c.is_whitespace())
786        .unwrap_or(trimmed.len());
787    if kw_end == 0 {
788        return (None, s);
789    }
790    let kw_raw = &trimmed[..kw_end];
791    let kw_stripped = kw_raw.trim_end_matches(['.', ',']);
792    if !kw_stripped.is_ascii() {
793        return (None, s);
794    }
795    let kw_lower = kw_stripped.to_ascii_lowercase();
796    if !UNIT_PREFIXES.iter().any(|p| *p == kw_lower) {
797        return (None, s);
798    }
799    // Skip whitespace and `#` after the keyword.
800    let after_kw = trimmed[kw_end..].trim_start_matches([' ', '\t', '#']);
801    // Read alphanumerics as the identifier.
802    let id_end = after_kw
803        .find(|c: char| !c.is_ascii_alphanumeric())
804        .unwrap_or(after_kw.len());
805    if id_end == 0 {
806        return (None, s);
807    }
808    let id = &after_kw[..id_end];
809    let rest = &after_kw[id_end..];
810    let unit = format!("{} {}", kw_lower, id.to_ascii_lowercase());
811    (Some(unit), rest)
812}
813
/// Split a leading house number off the front of `s`.
///
/// A house number is a run of ASCII digits, optionally followed by exactly
/// one ASCII letter (returned uppercased). Leading whitespace is ignored.
///
/// * `"10 Downing Street"` → `(Some("10"), " Downing Street")`
/// * `"10a High St"` → `(Some("10A"), " High St")`
/// * `"10th Avenue"` → `(Some("10"), "th Avenue")` — the letter is followed
///   by another alphanumeric, so it is not treated as a suffix
/// * `"Buckingham Palace"` → `(None, "Buckingham Palace")`
///
/// When no digits lead the input, the original `s` is returned untouched
/// (leading whitespace included). Otherwise the second element is the
/// untrimmed text right after the number.
fn extract_house_number(s: &str) -> (Option<String>, &str) {
    let trimmed = s.trim_start();

    // ASCII digits are one byte each, so the count doubles as a byte offset.
    let digit_len = trimmed.bytes().take_while(u8::is_ascii_digit).count();
    if digit_len == 0 {
        return (None, s);
    }

    // Accept one trailing letter only when it stands alone: it must be the
    // last character or be followed by something that is not an ASCII
    // alphanumeric. This keeps "10Apple" / "10th" from losing a letter.
    let mut tail = trimmed[digit_len..].chars();
    let suffix_len = match (tail.next(), tail.next()) {
        (Some(letter), follower)
            if letter.is_ascii_alphabetic()
                && !follower.is_some_and(|c| c.is_ascii_alphanumeric()) =>
        {
            letter.len_utf8()
        }
        _ => 0,
    };

    let (number, rest) = trimmed.split_at(digit_len + suffix_len);
    (Some(number.to_ascii_uppercase()), rest)
}
850
/// Per-country phone metadata for [`Normalizer::normalize_phone_e164`].
///
/// `min_nsn` / `max_nsn` bound the **national-significant number** length —
/// the digits after the dial code, with the national trunk prefix removed.
/// `trunk_prefix` is the digit string used for national dialling (for
/// example `"0"` for GB, FR and DE, `"8"` for LT, and `None` for US, CA, ES
/// and PT in [`COUNTRY_PHONE_TABLE`]). When set, a single occurrence of the
/// string at the start of the national number is stripped before
/// canonicalisation.
///
/// The type is plain `'static` data, so it derives the cheap value traits:
/// `Debug` for diagnostics and test failure output, `Clone`/`Copy` because
/// every field is `Copy`, and `PartialEq`/`Eq` so entries can be compared.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct CountryPhoneInfo {
    /// ISO 3166-1 alpha-2 country code, uppercase.
    iso_alpha2: &'static str,
    /// International dialling code, no leading `+`.
    dial_code: &'static str,
    /// National trunk prefix digit(s), if any.
    trunk_prefix: Option<&'static str>,
    /// Minimum national-significant-number length, in digits (inclusive).
    min_nsn: usize,
    /// Maximum national-significant-number length, in digits (inclusive).
    max_nsn: usize,
}
872
873/// Phone-numbering metadata for countries supported by
874/// [`Normalizer::normalize_phone_e164`].
875///
876/// Coverage: all five jurisdictions for which the crate exposes a national
877/// healthcare identifier (GB England/Wales/IoM, FR, ES, IE, plus UK NI via
878/// the GB dial code), plus the most common worker-mobility partners. New
879/// entries SHOULD follow the ISO 3166-1 alpha-2 convention and document the
880/// trunk-prefix rule explicitly.
881const COUNTRY_PHONE_TABLE: &[CountryPhoneInfo] = &[
882    CountryPhoneInfo {
883        iso_alpha2: "GB",
884        dial_code: "44",
885        trunk_prefix: Some("0"),
886        min_nsn: 7,
887        max_nsn: 11,
888    },
889    CountryPhoneInfo {
890        iso_alpha2: "FR",
891        dial_code: "33",
892        trunk_prefix: Some("0"),
893        min_nsn: 9,
894        max_nsn: 9,
895    },
896    CountryPhoneInfo {
897        iso_alpha2: "DE",
898        dial_code: "49",
899        trunk_prefix: Some("0"),
900        min_nsn: 7,
901        max_nsn: 13,
902    },
903    CountryPhoneInfo {
904        iso_alpha2: "ES",
905        dial_code: "34",
906        trunk_prefix: None,
907        min_nsn: 9,
908        max_nsn: 9,
909    },
910    CountryPhoneInfo {
911        iso_alpha2: "IE",
912        dial_code: "353",
913        trunk_prefix: Some("0"),
914        min_nsn: 7,
915        max_nsn: 11,
916    },
917    CountryPhoneInfo {
918        iso_alpha2: "IT",
919        dial_code: "39",
920        trunk_prefix: None,
921        min_nsn: 6,
922        max_nsn: 12,
923    },
924    CountryPhoneInfo {
925        iso_alpha2: "NL",
926        dial_code: "31",
927        trunk_prefix: Some("0"),
928        min_nsn: 9,
929        max_nsn: 9,
930    },
931    CountryPhoneInfo {
932        iso_alpha2: "BE",
933        dial_code: "32",
934        trunk_prefix: Some("0"),
935        min_nsn: 8,
936        max_nsn: 9,
937    },
938    CountryPhoneInfo {
939        iso_alpha2: "PT",
940        dial_code: "351",
941        trunk_prefix: None,
942        min_nsn: 9,
943        max_nsn: 9,
944    },
945    CountryPhoneInfo {
946        iso_alpha2: "CH",
947        dial_code: "41",
948        trunk_prefix: Some("0"),
949        min_nsn: 9,
950        max_nsn: 9,
951    },
952    CountryPhoneInfo {
953        iso_alpha2: "AT",
954        dial_code: "43",
955        trunk_prefix: Some("0"),
956        min_nsn: 4,
957        max_nsn: 13,
958    },
959    CountryPhoneInfo {
960        iso_alpha2: "SE",
961        dial_code: "46",
962        trunk_prefix: Some("0"),
963        min_nsn: 7,
964        max_nsn: 13,
965    },
966    CountryPhoneInfo {
967        iso_alpha2: "NO",
968        dial_code: "47",
969        trunk_prefix: None,
970        min_nsn: 8,
971        max_nsn: 8,
972    },
973    CountryPhoneInfo {
974        iso_alpha2: "DK",
975        dial_code: "45",
976        trunk_prefix: None,
977        min_nsn: 8,
978        max_nsn: 8,
979    },
980    CountryPhoneInfo {
981        iso_alpha2: "FI",
982        dial_code: "358",
983        trunk_prefix: Some("0"),
984        min_nsn: 5,
985        max_nsn: 12,
986    },
987    CountryPhoneInfo {
988        iso_alpha2: "PL",
989        dial_code: "48",
990        trunk_prefix: None,
991        min_nsn: 9,
992        max_nsn: 9,
993    },
994    CountryPhoneInfo {
995        iso_alpha2: "AU",
996        dial_code: "61",
997        trunk_prefix: Some("0"),
998        min_nsn: 9,
999        max_nsn: 9,
1000    },
1001    CountryPhoneInfo {
1002        iso_alpha2: "NZ",
1003        dial_code: "64",
1004        trunk_prefix: Some("0"),
1005        min_nsn: 8,
1006        max_nsn: 10,
1007    },
1008    CountryPhoneInfo {
1009        iso_alpha2: "US",
1010        dial_code: "1",
1011        trunk_prefix: None,
1012        min_nsn: 10,
1013        max_nsn: 10,
1014    },
1015    CountryPhoneInfo {
1016        iso_alpha2: "CA",
1017        dial_code: "1",
1018        trunk_prefix: None,
1019        min_nsn: 10,
1020        max_nsn: 10,
1021    },
1022    CountryPhoneInfo {
1023        iso_alpha2: "JP",
1024        dial_code: "81",
1025        trunk_prefix: Some("0"),
1026        min_nsn: 9,
1027        max_nsn: 10,
1028    },
1029    CountryPhoneInfo {
1030        iso_alpha2: "CN",
1031        dial_code: "86",
1032        trunk_prefix: Some("0"),
1033        min_nsn: 5,
1034        max_nsn: 12,
1035    },
1036    CountryPhoneInfo {
1037        iso_alpha2: "IN",
1038        dial_code: "91",
1039        trunk_prefix: Some("0"),
1040        min_nsn: 10,
1041        max_nsn: 10,
1042    },
1043    CountryPhoneInfo {
1044        iso_alpha2: "BR",
1045        dial_code: "55",
1046        trunk_prefix: Some("0"),
1047        min_nsn: 10,
1048        max_nsn: 11,
1049    },
1050    CountryPhoneInfo {
1051        iso_alpha2: "MX",
1052        dial_code: "52",
1053        trunk_prefix: None,
1054        min_nsn: 10,
1055        max_nsn: 10,
1056    },
1057    CountryPhoneInfo {
1058        iso_alpha2: "ZA",
1059        dial_code: "27",
1060        trunk_prefix: Some("0"),
1061        min_nsn: 9,
1062        max_nsn: 9,
1063    },
1064    // ---- T-19: coverage of remaining 35-scheme identifier jurisdictions ----
1065    CountryPhoneInfo {
1066        iso_alpha2: "BG",
1067        dial_code: "359",
1068        trunk_prefix: Some("0"),
1069        min_nsn: 8,
1070        max_nsn: 9,
1071    },
1072    CountryPhoneInfo {
1073        iso_alpha2: "CZ",
1074        dial_code: "420",
1075        trunk_prefix: None,
1076        min_nsn: 9,
1077        max_nsn: 9,
1078    },
1079    CountryPhoneInfo {
1080        iso_alpha2: "EE",
1081        dial_code: "372",
1082        trunk_prefix: None,
1083        min_nsn: 7,
1084        max_nsn: 8,
1085    },
1086    CountryPhoneInfo {
1087        iso_alpha2: "GR",
1088        dial_code: "30",
1089        trunk_prefix: None,
1090        min_nsn: 10,
1091        max_nsn: 10,
1092    },
1093    CountryPhoneInfo {
1094        iso_alpha2: "HR",
1095        dial_code: "385",
1096        trunk_prefix: Some("0"),
1097        min_nsn: 8,
1098        max_nsn: 9,
1099    },
1100    CountryPhoneInfo {
1101        iso_alpha2: "IS",
1102        dial_code: "354",
1103        trunk_prefix: None,
1104        min_nsn: 7,
1105        max_nsn: 9,
1106    },
1107    CountryPhoneInfo {
1108        iso_alpha2: "LI",
1109        dial_code: "423",
1110        trunk_prefix: None,
1111        min_nsn: 7,
1112        max_nsn: 9,
1113    },
1114    // Lithuania uses `8` (not `0`) as the national trunk prefix.
1115    CountryPhoneInfo {
1116        iso_alpha2: "LT",
1117        dial_code: "370",
1118        trunk_prefix: Some("8"),
1119        min_nsn: 8,
1120        max_nsn: 8,
1121    },
1122    CountryPhoneInfo {
1123        iso_alpha2: "LV",
1124        dial_code: "371",
1125        trunk_prefix: None,
1126        min_nsn: 8,
1127        max_nsn: 8,
1128    },
1129    CountryPhoneInfo {
1130        iso_alpha2: "MT",
1131        dial_code: "356",
1132        trunk_prefix: None,
1133        min_nsn: 8,
1134        max_nsn: 8,
1135    },
1136    CountryPhoneInfo {
1137        iso_alpha2: "RO",
1138        dial_code: "40",
1139        trunk_prefix: Some("0"),
1140        min_nsn: 9,
1141        max_nsn: 9,
1142    },
1143    CountryPhoneInfo {
1144        iso_alpha2: "SI",
1145        dial_code: "386",
1146        trunk_prefix: Some("0"),
1147        min_nsn: 8,
1148        max_nsn: 8,
1149    },
1150    CountryPhoneInfo {
1151        iso_alpha2: "SK",
1152        dial_code: "421",
1153        trunk_prefix: Some("0"),
1154        min_nsn: 9,
1155        max_nsn: 9,
1156    },
1157];
1158
1159/// Look up a country by ISO 3166-1 alpha-2 code (case-insensitive).
1160///
1161/// Returns the first match in [`COUNTRY_PHONE_TABLE`]. For NANP countries
1162/// (US/CA) which share dial code `1`, this disambiguates by the caller's
1163/// chosen default; the canonical E.164 output is identical for both.
1164fn lookup_by_iso(iso: &str) -> Option<&'static CountryPhoneInfo> {
1165    if !iso.is_ascii() {
1166        return None;
1167    }
1168    let upper = iso.to_ascii_uppercase();
1169    COUNTRY_PHONE_TABLE.iter().find(|c| c.iso_alpha2 == upper)
1170}
1171
1172/// Match the longest known dial-code prefix at the start of `digits`.
1173///
1174/// Tries 3-, 2-, then 1-digit prefixes to honour the country table's most
1175/// specific entry. For NANP (dial code `1`) the first matching entry — US —
1176/// is returned; the canonical E.164 form is the same whether the caller
1177/// later interprets the country as US or CA.
1178fn lookup_by_dial_code_prefix(digits: &str) -> Option<&'static CountryPhoneInfo> {
1179    for len in [3usize, 2, 1] {
1180        if digits.len() >= len {
1181            let prefix = &digits[..len];
1182            if let Some(info) = COUNTRY_PHONE_TABLE.iter().find(|c| c.dial_code == prefix) {
1183                return Some(info);
1184            }
1185        }
1186    }
1187    None
1188}
1189
1190/// Strip a single occurrence of the country's national trunk prefix
1191/// from `nsn` if one is configured and present.
1192fn strip_trunk_prefix<'a>(info: &CountryPhoneInfo, nsn: &'a str) -> &'a str {
1193    if let Some(tp) = info.trunk_prefix
1194        && let Some(rest) = nsn.strip_prefix(tp)
1195        && !rest.is_empty()
1196    {
1197        rest
1198    } else {
1199        nsn
1200    }
1201}
1202
1203#[cfg(test)]
1204mod tests {
1205    use super::*;
1206
1207    // ---------- normalize_name ----------
1208
1209    #[test]
1210    fn normalize_name_collapses_whitespace_and_trims() {
1211        assert_eq!(Normalizer::normalize_name("  John  Smith  "), "john smith");
1212    }
1213
1214    #[test]
1215    fn normalize_name_strips_ascii_punctuation() {
1216        assert_eq!(Normalizer::normalize_name("O'Brien"), "obrien");
1217        assert_eq!(Normalizer::normalize_name("Mary-Jane"), "maryjane");
1218        assert_eq!(Normalizer::normalize_name("Dr. Who"), "dr who");
1219    }
1220
1221    #[test]
1222    fn normalize_name_strips_diacritics() {
1223        assert_eq!(Normalizer::normalize_name("José"), "jose");
1224        assert_eq!(Normalizer::normalize_name("Siân"), "sian");
1225        // common test cases
1226        assert_eq!(Normalizer::normalize_name("naïve"), "naive");
1227        assert_eq!(Normalizer::normalize_name("crème"), "creme");
1228        // ŷ and ŵ decompose; ł has no NFKD decomposition and survives lowercased.
1229        assert_eq!(Normalizer::normalize_name("Łŷŵ"), "ływ");
1230    }
1231
1232    #[test]
1233    fn normalize_name_handles_empty_and_whitespace() {
1234        assert_eq!(Normalizer::normalize_name(""), "");
1235        assert_eq!(Normalizer::normalize_name("   "), "");
1236        assert_eq!(Normalizer::normalize_name("\t\n"), "");
1237    }
1238
1239    #[test]
1240    fn normalize_name_lowercases() {
1241        assert_eq!(Normalizer::normalize_name("MARY"), "mary");
1242        assert_eq!(Normalizer::normalize_name("McDONALD"), "mcdonald");
1243    }
1244
1245    #[test]
1246    fn normalize_name_is_idempotent() {
1247        for input in [
1248            "  John  Smith  ",
1249            "O'Brien-Jones",
1250            "JOSÉ MARÍA",
1251            "",
1252            "  ",
1253            "Siân",
1254        ] {
1255            let once = Normalizer::normalize_name(input);
1256            let twice = Normalizer::normalize_name(&once);
1257            assert_eq!(once, twice, "not idempotent for {input:?}");
1258        }
1259    }
1260
1261    #[test]
1262    fn normalize_name_does_not_normalise_unicode_punctuation() {
1263        // Curly apostrophe (U+2019) is intentionally not stripped.
1264        // This is documented in AGENTS/normalization.md as a known limitation.
1265        let with_curly = Normalizer::normalize_name("O\u{2019}Brien");
1266        assert!(with_curly.contains('\u{2019}'));
1267    }
1268
1269    // ---------- normalize_postcode ----------
1270
1271    #[test]
1272    fn normalize_postcode_uppercases() {
1273        assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
1274    }
1275
1276    #[test]
1277    fn normalize_postcode_strips_all_whitespace() {
1278        assert_eq!(Normalizer::normalize_postcode("CF10 1AA"), "CF101AA");
1279        assert_eq!(Normalizer::normalize_postcode(" CF10  1AA "), "CF101AA");
1280        assert_eq!(Normalizer::normalize_postcode("CF10\t1AA"), "CF101AA");
1281    }
1282
1283    #[test]
1284    fn normalize_postcode_handles_empty() {
1285        assert_eq!(Normalizer::normalize_postcode(""), "");
1286        assert_eq!(Normalizer::normalize_postcode("   "), "");
1287    }
1288
1289    #[test]
1290    fn normalize_postcode_is_idempotent() {
1291        for input in ["cf10 1aa", "SW1A 2AA", "  EH8 9YL  ", ""] {
1292            let once = Normalizer::normalize_postcode(input);
1293            let twice = Normalizer::normalize_postcode(&once);
1294            assert_eq!(once, twice);
1295        }
1296    }
1297
1298    // ---------- normalize_phone ----------
1299
1300    #[test]
1301    fn normalize_phone_strips_uk_trunk_prefix() {
1302        assert_eq!(Normalizer::normalize_phone("07700 900123"), "7700900123");
1303    }
1304
1305    #[test]
1306    fn normalize_phone_strips_plus_44_international() {
1307        assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
1308    }
1309
1310    #[test]
1311    fn normalize_phone_strips_0044_international() {
1312        assert_eq!(
1313            Normalizer::normalize_phone("0044 7700 900123"),
1314            "7700900123"
1315        );
1316    }
1317
1318    #[test]
1319    fn normalize_phone_handles_brackets_and_spaces() {
1320        assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
1321    }
1322
1323    #[test]
1324    fn normalize_phone_handles_empty() {
1325        assert_eq!(Normalizer::normalize_phone(""), "");
1326        assert_eq!(Normalizer::normalize_phone("---"), "");
1327    }
1328
1329    #[test]
1330    fn normalize_phone_does_not_strip_44_if_too_short() {
1331        // 44 followed by fewer than 10 more digits: keep the 44 (not international prefix).
1332        assert_eq!(Normalizer::normalize_phone("4412345"), "4412345");
1333    }
1334
1335    #[test]
1336    fn normalize_phone_is_idempotent() {
1337        for input in [
1338            "07700 900123",
1339            "+44 7700 900123",
1340            "0044 7700 900123",
1341            "(029) 2034 5678",
1342            "",
1343        ] {
1344            let once = Normalizer::normalize_phone(input);
1345            let twice = Normalizer::normalize_phone(&once);
1346            assert_eq!(once, twice, "not idempotent for {input:?}");
1347        }
1348    }
1349
1350    #[test]
1351    fn normalize_phone_keeps_lone_zero() {
1352        // A bare "0" is not stripped (guard: len > 1).
1353        assert_eq!(Normalizer::normalize_phone("0"), "0");
1354    }
1355
1356    // ---------- phonetic_code ----------
1357
1358    #[test]
1359    fn phonetic_code_groups_smith_and_smyth() {
1360        assert_eq!(
1361            Normalizer::phonetic_code("Smith"),
1362            Normalizer::phonetic_code("Smyth")
1363        );
1364    }
1365
1366    #[test]
1367    fn phonetic_code_groups_stephen_and_steven() {
1368        assert_eq!(
1369            Normalizer::phonetic_code("Stephen"),
1370            Normalizer::phonetic_code("Steven")
1371        );
1372    }
1373
1374    #[test]
1375    fn phonetic_code_distinguishes_different_families() {
1376        assert_ne!(
1377            Normalizer::phonetic_code("Jones"),
1378            Normalizer::phonetic_code("Smith")
1379        );
1380        assert_ne!(
1381            Normalizer::phonetic_code("Anderson"),
1382            Normalizer::phonetic_code("Zimmerman")
1383        );
1384    }
1385
1386    #[test]
1387    fn phonetic_code_specific_values() {
1388        // Pinned values from the underlying soundex crate; act as a regression net.
1389        assert_eq!(Normalizer::phonetic_code("Smith"), "S530");
1390        assert_eq!(Normalizer::phonetic_code("Smyth"), "S530");
1391        assert_eq!(Normalizer::phonetic_code("Jones"), "J520");
1392        assert_eq!(Normalizer::phonetic_code("Johnson"), "J525");
1393    }
1394
1395    #[test]
1396    fn phonetic_code_handles_empty() {
1397        assert_eq!(Normalizer::phonetic_code(""), "");
1398        assert_eq!(Normalizer::phonetic_code("   "), "");
1399    }
1400
1401    #[test]
1402    fn phonetic_code_is_case_insensitive() {
1403        assert_eq!(
1404            Normalizer::phonetic_code("SMITH"),
1405            Normalizer::phonetic_code("smith")
1406        );
1407    }
1408
1409    // ---------- normalize_phone_e164 ----------
1410
1411    #[test]
1412    fn e164_uk_layouts_canonicalise_identically() {
1413        let canonical = Some("+447700900123".to_string());
1414        assert_eq!(
1415            Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")),
1416            canonical,
1417        );
1418        assert_eq!(
1419            Normalizer::normalize_phone_e164("0044 7700 900123", Some("GB")),
1420            canonical,
1421        );
1422        assert_eq!(
1423            Normalizer::normalize_phone_e164("07700 900123", Some("GB")),
1424            canonical,
1425        );
1426        assert_eq!(
1427            Normalizer::normalize_phone_e164("(07700) 900-123", Some("GB")),
1428            canonical,
1429        );
1430    }
1431
1432    #[test]
1433    fn e164_french_layouts_canonicalise_identically() {
1434        let canonical = Some("+33123456789".to_string());
1435        assert_eq!(
1436            Normalizer::normalize_phone_e164("+33 1 23 45 67 89", Some("FR")),
1437            canonical,
1438        );
1439        assert_eq!(
1440            Normalizer::normalize_phone_e164("0033 1 23 45 67 89", Some("FR")),
1441            canonical,
1442        );
1443        assert_eq!(
1444            Normalizer::normalize_phone_e164("01 23 45 67 89", Some("FR")),
1445            canonical,
1446        );
1447    }
1448
1449    #[test]
1450    fn e164_spain_has_no_national_trunk_prefix() {
1451        // Spain switched to no trunk-0 in 1998; a bare 9-digit national
1452        // number is the canonical form.
1453        assert_eq!(
1454            Normalizer::normalize_phone_e164("912 345 678", Some("ES")),
1455            Some("+34912345678".to_string()),
1456        );
1457        assert_eq!(
1458            Normalizer::normalize_phone_e164("+34 912 345 678", None),
1459            Some("+34912345678".to_string()),
1460        );
1461    }
1462
1463    #[test]
1464    fn e164_ireland_three_digit_dial_code() {
1465        assert_eq!(
1466            Normalizer::normalize_phone_e164("+353 1 234 5678", None),
1467            Some("+35312345678".to_string()),
1468        );
1469        assert_eq!(
1470            Normalizer::normalize_phone_e164("01 234 5678", Some("IE")),
1471            Some("+35312345678".to_string()),
1472        );
1473    }
1474
1475    #[test]
1476    fn e164_nanp_handles_us_and_canada() {
1477        assert_eq!(
1478            Normalizer::normalize_phone_e164("(415) 555-1234", Some("US")),
1479            Some("+14155551234".to_string()),
1480        );
1481        assert_eq!(
1482            Normalizer::normalize_phone_e164("+1 415 555 1234", None),
1483            Some("+14155551234".to_string()),
1484        );
1485        // Canada uses the same dial code; canonical form is identical.
1486        assert_eq!(
1487            Normalizer::normalize_phone_e164("(416) 555-1234", Some("CA")),
1488            Some("+14165551234".to_string()),
1489        );
1490    }
1491
1492    // ---------- T-19: 35-scheme jurisdiction coverage ----------
1493
1494    #[test]
1495    fn e164_lithuania_uses_eight_as_trunk_prefix() {
1496        // Lithuania's national trunk prefix is `8`, not `0`. National
1497        // dialling form `8 612 34567` (mobile) canonicalises to the same
1498        // E.164 string as the explicit `+370 612 34567` form.
1499        assert_eq!(
1500            Normalizer::normalize_phone_e164("8 612 34567", Some("LT")),
1501            Some("+37061234567".to_string()),
1502        );
1503        assert_eq!(
1504            Normalizer::normalize_phone_e164("+370 612 34567", None),
1505            Some("+37061234567".to_string()),
1506        );
1507    }
1508
1509    #[test]
1510    fn e164_greece_has_no_national_trunk_prefix() {
1511        // GR national-significant numbers begin with the area code (the
1512        // leading zero seen in older publications is no longer a trunk).
1513        assert_eq!(
1514            Normalizer::normalize_phone_e164("+30 210 123 4567", None),
1515            Some("+302101234567".to_string()),
1516        );
1517        assert_eq!(
1518            Normalizer::normalize_phone_e164("210 123 4567", Some("GR")),
1519            Some("+302101234567".to_string()),
1520        );
1521    }
1522
1523    #[test]
1524    fn e164_romania_strips_trunk_zero() {
1525        assert_eq!(
1526            Normalizer::normalize_phone_e164("0721 234 567", Some("RO")),
1527            Some("+40721234567".to_string()),
1528        );
1529        assert_eq!(
1530            Normalizer::normalize_phone_e164("+40 721 234 567", None),
1531            Some("+40721234567".to_string()),
1532        );
1533    }
1534
1535    #[test]
1536    fn e164_czech_no_trunk_prefix() {
1537        assert_eq!(
1538            Normalizer::normalize_phone_e164("+420 234 567 890", None),
1539            Some("+420234567890".to_string()),
1540        );
1541        assert_eq!(
1542            Normalizer::normalize_phone_e164("234 567 890", Some("CZ")),
1543            Some("+420234567890".to_string()),
1544        );
1545    }
1546
1547    #[test]
1548    fn e164_iceland_seven_digit_nsn() {
1549        assert_eq!(
1550            Normalizer::normalize_phone_e164("+354 412 3456", None),
1551            Some("+3544123456".to_string()),
1552        );
1553    }
1554
1555    #[test]
1556    fn e164_distinguishes_overlapping_three_digit_dial_codes() {
1557        // Croatia (385) vs Slovenia (386): adjacent dial codes, both
1558        // use a `0` trunk, but the canonical E.164 forms remain distinct.
1559        let hr = Normalizer::normalize_phone_e164("+385 91 234 5678", None);
1560        let si = Normalizer::normalize_phone_e164("+386 41 234 567", None);
1561        assert!(hr.is_some());
1562        assert!(si.is_some());
1563        assert_ne!(hr, si);
1564    }
1565
1566    #[test]
1567    fn e164_distinguishes_countries_with_overlapping_national_digits() {
1568        // The "same" national-format digits in two countries must yield
1569        // different E.164 strings — this is precisely the disambiguation
1570        // the new normaliser provides.
1571        let uk = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
1572        let fr = Normalizer::normalize_phone_e164("07700 90012", Some("FR"));
1573        assert!(uk.is_some());
1574        assert!(fr.is_some());
1575        assert_ne!(uk, fr);
1576    }
1577
1578    #[test]
1579    fn e164_returns_none_when_default_country_missing_and_no_marker() {
1580        // Ambiguous national-format input with no default country.
1581        assert_eq!(Normalizer::normalize_phone_e164("07700 900123", None), None);
1582    }
1583
1584    #[test]
1585    fn e164_returns_none_for_unknown_dial_code() {
1586        assert_eq!(Normalizer::normalize_phone_e164("+999 1234567", None), None);
1587    }
1588
1589    #[test]
1590    fn e164_returns_none_for_empty_or_punctuation_only() {
1591        assert_eq!(Normalizer::normalize_phone_e164("", Some("GB")), None);
1592        assert_eq!(Normalizer::normalize_phone_e164("+", Some("GB")), None);
1593        assert_eq!(Normalizer::normalize_phone_e164("()-", Some("GB")), None);
1594    }
1595
1596    #[test]
1597    fn e164_returns_none_for_too_short_or_too_long_nsn() {
1598        // GB NSN must be 7..=11 digits.
1599        assert_eq!(Normalizer::normalize_phone_e164("+44 1", None), None);
1600        assert_eq!(
1601            Normalizer::normalize_phone_e164("+44 123456789012345", None),
1602            None,
1603        );
1604    }
1605
1606    #[test]
1607    fn e164_rejects_unknown_default_country() {
1608        // "XX" is not in the table; without an explicit international
1609        // marker the function cannot guess.
1610        assert_eq!(
1611            Normalizer::normalize_phone_e164("07700 900123", Some("XX")),
1612            None,
1613        );
1614    }
1615
1616    #[test]
1617    fn e164_is_idempotent_on_canonical_form() {
1618        for input in [
1619            "+44 7700 900123",
1620            "+33 1 23 45 67 89",
1621            "(415) 555-1234",
1622            "+353 1 234 5678",
1623            "+34 912 345 678",
1624        ] {
1625            let once = Normalizer::normalize_phone_e164(input, Some("GB")).expect("parses");
1626            let twice = Normalizer::normalize_phone_e164(&once, Some("GB")).expect("idempotent");
1627            assert_eq!(once, twice, "not idempotent for {input:?}");
1628        }
1629    }
1630
1631    #[test]
1632    fn e164_default_country_lookup_is_case_insensitive() {
1633        let lower = Normalizer::normalize_phone_e164("07700 900123", Some("gb"));
1634        let upper = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
1635        assert_eq!(lower, upper);
1636        assert!(lower.is_some());
1637    }
1638
1639    #[test]
1640    fn e164_handles_double_zero_international_access_form() {
1641        assert_eq!(
1642            Normalizer::normalize_phone_e164("00 33 1 23 45 67 89", None),
1643            Some("+33123456789".to_string()),
1644        );
1645    }
1646
1647    // ---------- expand_street_abbreviations ----------
1648
1649    #[test]
1650    fn expand_street_replaces_common_abbreviations() {
1651        assert_eq!(
1652            Normalizer::expand_street_abbreviations("123 High St"),
1653            "123 High street",
1654        );
1655        assert_eq!(
1656            Normalizer::expand_street_abbreviations("10 Downing Rd"),
1657            "10 Downing road",
1658        );
1659        assert_eq!(
1660            Normalizer::expand_street_abbreviations("12 Sunset Blvd"),
1661            "12 Sunset boulevard",
1662        );
1663        assert_eq!(
1664            Normalizer::expand_street_abbreviations("1 Park Ave"),
1665            "1 Park avenue",
1666        );
1667        assert_eq!(
1668            Normalizer::expand_street_abbreviations("5 Cherry Ln"),
1669            "5 Cherry lane",
1670        );
1671    }
1672
1673    #[test]
1674    fn expand_street_replaces_directionals() {
1675        assert_eq!(
1676            Normalizer::expand_street_abbreviations("45 N Park Ave"),
1677            "45 north Park avenue",
1678        );
1679        assert_eq!(
1680            Normalizer::expand_street_abbreviations("100 SW 5th St"),
1681            "100 southwest 5th street",
1682        );
1683    }
1684
1685    #[test]
1686    fn expand_street_strips_trailing_period_or_comma() {
1687        assert_eq!(
1688            Normalizer::expand_street_abbreviations("123 High St."),
1689            "123 High street",
1690        );
1691        assert_eq!(
1692            Normalizer::expand_street_abbreviations("12 Sunset Blvd,"),
1693            "12 Sunset boulevard",
1694        );
1695    }
1696
1697    #[test]
1698    fn expand_street_passes_unknown_tokens_through() {
1699        assert_eq!(
1700            Normalizer::expand_street_abbreviations("Buckingham Palace"),
1701            "Buckingham Palace",
1702        );
1703    }
1704
1705    #[test]
1706    fn expand_street_is_idempotent_on_already_expanded_input() {
1707        for input in [
1708            "123 High St",
1709            "45 N Park Ave",
1710            "10 Downing Rd",
1711            "Buckingham Palace",
1712        ] {
1713            let once = Normalizer::expand_street_abbreviations(input);
1714            let twice = Normalizer::expand_street_abbreviations(&once);
1715            assert_eq!(once, twice, "not idempotent for {input:?}");
1716        }
1717    }
1718
1719    #[test]
1720    fn expand_street_handles_empty_and_whitespace_only() {
1721        assert_eq!(Normalizer::expand_street_abbreviations(""), "");
1722        assert_eq!(Normalizer::expand_street_abbreviations("   "), "");
1723    }
1724
1725    // ---------- normalize_address_line ----------
1726
#[test]
fn normalize_address_line_unifies_abbreviated_and_full_forms() {
    // Each pair writes the same address two ways; both spellings must
    // normalise to one and the same string.
    let pairs = [
        ("123 High St", "123 High Street"),
        ("45 N Park Ave", "45 North Park Avenue"),
    ];
    for (abbreviated, spelled_out) in pairs {
        let from_short = Normalizer::normalize_address_line(abbreviated);
        let from_long = Normalizer::normalize_address_line(spelled_out);
        assert_eq!(from_short, from_long);
    }
}
1738
#[test]
fn normalize_address_line_handles_punctuation_and_case() {
    // The comma and full stop disappear and every letter is lower-cased.
    let normalised = Normalizer::normalize_address_line("10, DOWNING Street.");
    assert_eq!(normalised, "10 downing street");
}
1746
#[test]
fn normalize_address_line_is_idempotent() {
    // Covers abbreviated, padded, punctuated and empty inputs: a second
    // normalisation pass must reproduce the first pass exactly.
    let samples = ["123 High St", "  45 N Park Ave  ", "10, Downing Street.", ""];
    for &input in &samples {
        let first_pass = Normalizer::normalize_address_line(input);
        let second_pass = Normalizer::normalize_address_line(&first_pass);
        assert_eq!(first_pass, second_pass, "not idempotent for {input:?}");
    }
}
1760
1761    // ---------- parse_address_line ----------
1762
#[test]
fn parse_address_extracts_simple_house_number() {
    // A plain "<number> <street>" line: number split off, no unit, and the
    // remaining street text lower-cased.
    let ParsedAddressLine {
        house_number,
        unit,
        street,
        ..
    } = Normalizer::parse_address_line("123 High Street");

    assert_eq!(house_number.as_deref(), Some("123"));
    assert_eq!(unit, None);
    assert_eq!(street, "high street");
}
1770
#[test]
fn parse_address_handles_alphanumeric_house_number() {
    // The letter suffix stays attached to the number ("10A"), and the
    // street abbreviation is expanded.
    let parsed = Normalizer::parse_address_line("10A Downing St");
    let number = parsed.house_number.as_deref();
    assert_eq!(number, Some("10A"));
    assert_eq!(parsed.street, "downing street");
}
1777
#[test]
fn parse_address_does_not_greedily_consume_street_name() {
    // In "10 Apple Tree Lane" the word `Apple` belongs to the street, not
    // to the house number: two alphabetic characters in a row mark the
    // start of the street name rather than a number suffix.
    let parsed = Normalizer::parse_address_line("10 Apple Tree Lane");
    let number = parsed.house_number.as_deref();
    assert_eq!(number, Some("10"));
    assert_eq!(parsed.street, "apple tree lane");
}
1787
#[test]
fn parse_address_recognises_flat_prefix() {
    // "Flat 2A," ahead of the house number is captured as the unit, in
    // lower case; number and street are parsed from what follows.
    let ParsedAddressLine {
        house_number,
        unit,
        street,
        ..
    } = Normalizer::parse_address_line("Flat 2A, 10 Downing Street");

    assert_eq!(unit.as_deref(), Some("flat 2a"));
    assert_eq!(house_number.as_deref(), Some("10"));
    assert_eq!(street, "downing street");
}
1795
#[test]
fn parse_address_recognises_apt_prefix() {
    // "Apt 5," is treated as the unit; the street suffix is expanded.
    let parsed = Normalizer::parse_address_line("Apt 5, 1600 Pennsylvania Ave");
    let (unit, number) = (parsed.unit.as_deref(), parsed.house_number.as_deref());
    assert_eq!(unit, Some("apt 5"));
    assert_eq!(number, Some("1600"));
    assert_eq!(parsed.street, "pennsylvania avenue");
}
1803
#[test]
fn parse_address_recognises_suite_and_ste_and_unit_and_room() {
    // Four spellings of a unit prefix in front of the same address. Each
    // must yield *some* unit while leaving number and street identical.
    let variants = [
        "Suite 12, 100 Main St",
        "Ste 12, 100 Main St",
        "Unit 12, 100 Main St",
        "Room 12, 100 Main St",
    ];
    variants.iter().for_each(|&input| {
        let parsed = Normalizer::parse_address_line(input);
        assert!(parsed.unit.is_some(), "no unit for {input:?}");
        assert_eq!(parsed.house_number.as_deref(), Some("100"));
        assert_eq!(parsed.street, "main street");
    });
}
1818
#[test]
fn parse_address_no_leading_number_falls_back_to_street_only() {
    // Without a leading number or unit prefix, the whole line becomes the
    // (lower-cased) street.
    let ParsedAddressLine {
        house_number,
        unit,
        street,
        ..
    } = Normalizer::parse_address_line("Buckingham Palace");

    assert_eq!(house_number, None);
    assert_eq!(unit, None);
    assert_eq!(street, "buckingham palace");
}
1826
#[test]
fn parse_address_empty_input_yields_empty_street() {
    // Empty input parses to "nothing found": no number, no unit, and an
    // empty street string.
    let parsed = Normalizer::parse_address_line("");
    assert_eq!(parsed.house_number, None);
    assert_eq!(parsed.unit, None);
    assert_eq!(parsed.street, "");
}
1834
#[test]
fn parse_address_round_trips_through_serde() {
    // Serialise to JSON and back; the decoded value must compare equal to
    // the value we started from.
    let original = Normalizer::parse_address_line("Flat 2A, 10A Downing Street");
    let encoded = serde_json::to_string(&original).unwrap();
    let decoded = serde_json::from_str::<ParsedAddressLine>(&encoded).unwrap();
    assert_eq!(original, decoded);
}
1842
#[test]
fn parse_address_uppercases_house_number_suffix() {
    // A lower-case suffix ("10a") is reported in upper case ("10A").
    let parsed = Normalizer::parse_address_line("10a Downing St");
    let number = parsed.house_number.as_deref();
    assert_eq!(number, Some("10A"));
}
1848
1849    // ---------- normalize_email ----------
1850
#[test]
fn normalize_email_lowercases_and_trims() {
    // Surrounding whitespace is removed and the whole address is folded
    // to lower case.
    let cleaned = Normalizer::normalize_email("  Alice@Example.ORG  ", false);
    assert_eq!(cleaned.as_deref(), Some("alice@example.org"));
}
1858
#[test]
fn normalize_email_preserves_well_formed_input() {
    // An address already in canonical form is returned unchanged.
    let canonical = "alice@example.org";
    let result = Normalizer::normalize_email(canonical, false);
    assert_eq!(result.as_deref(), Some(canonical));
}
1866
#[test]
fn normalize_email_rejects_missing_at_sign() {
    // Input with no `@` is rejected with `None`.
    let outcome = Normalizer::normalize_email("no-at-sign", false);
    assert_eq!(outcome, None);
}
1871
#[test]
fn normalize_email_rejects_empty_localpart_or_domain() {
    // Nothing before the `@`, or nothing after it, is rejected.
    for lopsided in ["@example.org", "alice@"] {
        assert_eq!(Normalizer::normalize_email(lopsided, false), None);
    }
}
1877
#[test]
fn normalize_email_rejects_multiple_at_signs() {
    // More than one `@` is rejected.
    let outcome = Normalizer::normalize_email("a@b@c", false);
    assert_eq!(outcome, None);
}
1882
#[test]
fn normalize_email_rejects_empty_and_whitespace() {
    // Empty and whitespace-only inputs are both rejected.
    for blank in ["", "   "] {
        assert_eq!(Normalizer::normalize_email(blank, false), None);
    }
}
1888
#[test]
fn normalize_email_gmail_dot_folding_strips_dots_in_localpart() {
    // With folding on, dots in a gmail.com local part are removed, so both
    // spellings collapse to the same address.
    for dotted in ["j.smith@gmail.com", "J.S.M.I.T.H@gmail.com"] {
        let folded = Normalizer::normalize_email(dotted, true);
        assert_eq!(folded.as_deref(), Some("jsmith@gmail.com"));
    }
}
1900
#[test]
fn normalize_email_gmail_dot_folding_strips_plus_tag() {
    // With folding on, a `+tag` suffix in the local part is dropped for
    // both gmail.com and googlemail.com; the domain itself is kept.
    let cases = [
        ("jsmith+work@gmail.com", "jsmith@gmail.com"),
        ("j.smith+anything@googlemail.com", "jsmith@googlemail.com"),
    ];
    for (tagged, canonical) in cases {
        let folded = Normalizer::normalize_email(tagged, true);
        assert_eq!(folded.as_deref(), Some(canonical));
    }
}
1912
#[test]
fn normalize_email_gmail_dot_folding_does_not_touch_other_domains() {
    // Even with folding on, dots and `+tag` in the local part are kept
    // for a non-Gmail domain, so each address comes back unchanged.
    for address in ["j.smith@example.org", "jsmith+work@example.org"] {
        let result = Normalizer::normalize_email(address, true);
        assert_eq!(result.as_deref(), Some(address));
    }
}
1924
#[test]
fn normalize_email_dot_folding_off_preserves_localpart_dots() {
    // With folding off, a gmail.com local part keeps its dots.
    let kept = Normalizer::normalize_email("j.smith@gmail.com", false);
    assert_eq!(kept.as_deref(), Some("j.smith@gmail.com"));
}
1932
#[test]
fn normalize_email_is_idempotent_on_canonical_form() {
    // For each (address, folding flag) pair, normalising the already
    // normalised address with the same flag must return it unchanged.
    let cases = [
        ("Alice@Example.ORG", false),
        ("j.smith@gmail.com", true),
        ("jsmith+x@gmail.com", true),
        ("user@host.tld", false),
    ];
    for &(input, fold) in &cases {
        let first_pass = Normalizer::normalize_email(input, fold).expect("parses");
        let second_pass = Normalizer::normalize_email(&first_pass, fold).expect("idempotent");
        assert_eq!(
            first_pass, second_pass,
            "not idempotent for {input:?} fold={fold}"
        );
    }
}
1946
#[test]
fn normalize_email_gmail_dot_folding_rejects_empty_localpart_after_folding() {
    // A local part made only of dots is empty once folding has stripped
    // them, and an empty local part is not a valid address.
    let outcome = Normalizer::normalize_email("...@gmail.com", true);
    assert_eq!(outcome, None);
}
1953
#[test]
fn parse_address_does_not_treat_st_as_unit_prefix() {
    // "St" (the street-type abbreviation) must not be mistaken for the
    // "Ste" unit prefix; only a literal "ste" token takes the unit path.
    let parsed = Normalizer::parse_address_line("St Mary's Road");
    assert_eq!(parsed.unit, None);
}
1962
#[test]
fn e164_strips_trunk_zero_after_country_code() {
    // Some data-entry systems wrongly keep the national trunk `0` after
    // the country code, as in "+44 0 7700 900123". The normaliser accepts
    // that form and drops the stray zero.
    let e164 = Normalizer::normalize_phone_e164("+44 0 7700 900123", None);
    assert_eq!(e164.as_deref(), Some("+447700900123"));
}
1973}
worker_matcher/normalizer.rs

worker_matcher/
normalizer.rs