event_matcher/
normalizer.rs

1//! Text normalisation for event records.
2//!
3//! Research on entity identification (see `spec.md` §5) is unanimous: most
4//! accuracy gains come from **standardising the input** before scoring, not
5//! from cleverer similarity algorithms. This module exposes the canonical
6//! transformations the matching engine applies to names, postcodes,
7//! addresses, phonetic codes, and ISO 8601 date-times.
8//!
//! All transformations are **idempotent**: `f(f(x)) == f(x)`. They are also
//! **deterministic** and free of side effects: inputs are only borrowed,
//! never mutated, and every result is a newly built value.
11//!
12//! ## Quick examples
13//!
14//! ```
15//! use event_matcher::Normalizer;
16//!
17//! // Names: lowercase, drop diacritics, drop ASCII punctuation, collapse spaces.
18//! assert_eq!(Normalizer::normalize_name("  O'Brien  "), "obrien");
19//! assert_eq!(Normalizer::normalize_name("Siân"),         "sian");
20//!
21//! // Postcodes: strip whitespace, uppercase.
22//! assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
23//!
24//! // Phone numbers: keep digits, strip international and trunk prefixes.
25//! assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
26//! ```
27//!
28//! ## What this module deliberately does *not* do
29//!
30//! - It does not handle non-ASCII punctuation such as the curly apostrophe
31//!   `’` (U+2019). Upstream code should convert those to ASCII first.
32//!
33//! ## International phone numbers
34//!
35//! Two phone normalisers are provided:
36//!
37//! - [`Normalizer::normalize_phone`] — UK-centric national-significant form,
38//!   suitable for legacy or single-jurisdiction call-sites. Idempotent and
39//!   infallible.
40//! - [`Normalizer::normalize_phone_e164`] — international-aware E.164 form
41//!   (`+CCNNNN…`) for jurisdictions in the supported country table. Returns
42//!   `None` if the input cannot be confidently parsed.
43//!
44//! The matching engine tries E.164 first and falls back to the legacy form
45//! when either input is unparseable, so existing single-country deployments
46//! observe the same behaviour while multinational deployments gain
47//! cross-country disambiguation (a French number and a UK number that share
48//! the same trunk digits no longer collide).
49
50use serde::{Deserialize, Serialize};
51use unicode_normalization::UnicodeNormalization;
52
/// Stateless namespace that groups the crate's text-normalisation routines.
///
/// `Normalizer` is a unit struct: it has no fields and is never meant to be
/// instantiated. Every routine is an associated function called as
/// `Normalizer::…`, which gives the public API one discoverable entry point
/// instead of a loose collection of free functions.
///
/// ```
/// use event_matcher::Normalizer;
///
/// let canonical = Normalizer::normalize_name("José-María");
/// assert_eq!(canonical, "josemaria");
/// ```
pub struct Normalizer;
66
67impl Normalizer {
68    /// Normalise a human name for comparison.
69    ///
70    /// Steps, in order:
71    ///
72    /// 1. Decompose to Unicode NFKD form (`é` → `e` + combining acute).
73    /// 2. Drop combining marks (diacritics).
74    /// 3. Drop ASCII punctuation (apostrophes, hyphens, full stops, …).
75    /// 4. Lowercase.
76    /// 5. Collapse consecutive whitespace to single ASCII spaces; trim ends.
77    ///
78    /// The result is suitable for direct equality comparison or for feeding
79    /// into a string-similarity scorer.
80    ///
81    /// # Examples
82    ///
83    /// Whitespace is collapsed and trimmed:
84    ///
85    /// ```
86    /// use event_matcher::Normalizer;
87    /// assert_eq!(Normalizer::normalize_name("  John  Smith  "), "john smith");
88    /// ```
89    ///
90    /// Apostrophes and hyphens are stripped:
91    ///
92    /// ```
93    /// # use event_matcher::Normalizer;
94    /// assert_eq!(Normalizer::normalize_name("O'Brien"),    "obrien");
95    /// assert_eq!(Normalizer::normalize_name("MARY-JANE"),  "maryjane");
96    /// ```
97    ///
98    /// Diacritics are removed:
99    ///
100    /// ```
101    /// # use event_matcher::Normalizer;
102    /// assert_eq!(Normalizer::normalize_name("José"),  "jose");
103    /// assert_eq!(Normalizer::normalize_name("Siân"),  "sian");
104    /// assert_eq!(Normalizer::normalize_name("Łukasz"), "łukasz");  // ł has no decomposition
105    /// ```
106    ///
107    /// Empty and whitespace-only input round-trip cleanly:
108    ///
109    /// ```
110    /// # use event_matcher::Normalizer;
111    /// assert_eq!(Normalizer::normalize_name(""),       "");
112    /// assert_eq!(Normalizer::normalize_name("    "),   "");
113    /// ```
114    ///
115    /// The function is **idempotent**:
116    ///
117    /// ```
118    /// # use event_matcher::Normalizer;
119    /// let once = Normalizer::normalize_name("  José-María  ");
120    /// let twice = Normalizer::normalize_name(&once);
121    /// assert_eq!(once, twice);
122    /// ```
123    #[must_use]
124    pub fn normalize_name(name: &str) -> String {
125        name.nfkd()
126            .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
127            .filter(|c| !c.is_ascii_punctuation())
128            .collect::<String>()
129            .to_lowercase()
130            .split_whitespace()
131            .collect::<Vec<_>>()
132            .join(" ")
133    }
134
135    /// Normalise a postcode for comparison.
136    ///
137    /// Steps: drop all whitespace, then uppercase. No locale-specific
138    /// validation — that is intentionally out of scope.
139    ///
140    /// # Examples
141    ///
142    /// UK postcodes with and without the conventional space are equivalent:
143    ///
144    /// ```
145    /// use event_matcher::Normalizer;
146    /// assert_eq!(Normalizer::normalize_postcode("CF10 1AA"),    "CF101AA");
147    /// assert_eq!(Normalizer::normalize_postcode("cf101aa"),     "CF101AA");
148    /// assert_eq!(Normalizer::normalize_postcode("  cf10 1aa "), "CF101AA");
149    /// ```
150    ///
151    /// Empty input is preserved:
152    ///
153    /// ```
154    /// # use event_matcher::Normalizer;
155    /// assert_eq!(Normalizer::normalize_postcode(""), "");
156    /// ```
157    ///
158    /// Idempotent:
159    ///
160    /// ```
161    /// # use event_matcher::Normalizer;
162    /// let once = Normalizer::normalize_postcode("sw1a 2aa");
163    /// let twice = Normalizer::normalize_postcode(&once);
164    /// assert_eq!(once, twice);
165    /// ```
166    #[must_use]
167    pub fn normalize_postcode(postcode: &str) -> String {
168        postcode
169            .chars()
170            .filter(|c| !c.is_whitespace())
171            .collect::<String>()
172            .to_uppercase()
173    }
174
175    /// Normalise a phone number for comparison.
176    ///
177    /// Steps:
178    ///
179    /// 1. Keep only ASCII digits (drop spaces, brackets, hyphens, `+`, …).
180    /// 2. If the result starts with `0044`, drop those four characters.
181    /// 3. Else, if the result starts with `44` and is at least 12 digits long,
182    ///    drop the leading `44`.
183    /// 4. Else, if the result starts with `0` and is longer than one digit,
184    ///    drop the leading `0`.
185    ///
186    /// This canonicalises the common UK formats into a single subscriber
187    /// number with no leading prefix. International numbers from other
188    /// countries pass through unchanged.
189    ///
190    /// # Examples
191    ///
192    /// ```
193    /// use event_matcher::Normalizer;
194    ///
195    /// // UK mobile, in three formats:
196    /// assert_eq!(Normalizer::normalize_phone("07700 900123"),    "7700900123");
197    /// assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
198    /// assert_eq!(Normalizer::normalize_phone("0044 7700 900123"), "7700900123");
199    ///
200    /// // UK landline with brackets and spaces:
201    /// assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
202    ///
203    /// // Empty input is preserved (no digits to keep):
204    /// assert_eq!(Normalizer::normalize_phone(""), "");
205    /// ```
206    ///
207    /// Idempotent on canonical inputs:
208    ///
209    /// ```
210    /// # use event_matcher::Normalizer;
211    /// let once = Normalizer::normalize_phone("07700 900123");
212    /// let twice = Normalizer::normalize_phone(&once);
213    /// assert_eq!(once, twice);
214    /// ```
215    #[must_use]
216    pub fn normalize_phone(phone: &str) -> String {
217        let digits: String = phone.chars().filter(char::is_ascii_digit).collect();
218
219        if digits.starts_with("0044") && digits.len() > 4 {
220            return digits[4..].to_string();
221        }
222
223        if digits.starts_with("44") && digits.len() >= 12 {
224            return digits[2..].to_string();
225        }
226
227        if digits.starts_with('0') && digits.len() > 1 {
228            return digits[1..].to_string();
229        }
230
231        digits
232    }
233
234    /// Normalise a phone number to its E.164-style canonical form.
235    ///
236    /// E.164 is the ITU-T standard for international telephone numbers and
237    /// has the shape `+CCNNN…`, where `CC` is the country dialling code
238    /// (1–3 digits) and the remainder is the national-significant number
239    /// (NSN) with no trunk prefix.
240    ///
241    /// The function accepts a wide range of textual layouts:
242    ///
243    /// - `+CC…` (explicit international, the canonical input form).
244    /// - `00CC…` (international access code, common across Europe).
245    /// - `0…` (national format, trunk-prefix) — interpreted relative to
246    ///   `default_country` when the country uses a national trunk `0`.
247    /// - `NSN…` (bare national-significant number) — interpreted relative
248    ///   to `default_country`.
249    ///
250    /// Returns `Some(canonical)` if the input parses against a country in
251    /// the supported table; otherwise `None`. The supported countries are
252    /// the five jurisdictions for which the crate exposes a national
253    /// healthcare identifier (United Kingdom, France, Spain, Ireland, and
254    /// — sharing the GB dial code — UK Northern Ireland), plus the most
255    /// common international partners (US, CA, DE, IT, NL, BE, PT, CH,
256    /// AT, SE, NO, DK, FI, PL, AU, NZ, JP, CN, IN, BR, MX, ZA). `default_country` is the
257    /// **ISO 3166-1 alpha-2 code** (e.g. `"GB"`, `"FR"`, `"US"`) of the
258    /// jurisdiction whose national format applies when the input lacks an
259    /// explicit international marker. Pass `None` to refuse to assume a
260    /// default — only explicit `+CC` / `00CC` inputs will parse.
261    ///
262    /// The function is **deterministic** and **idempotent**: feeding a
263    /// canonical `+CCNNN…` string back in returns the same string.
264    ///
265    /// # Examples
266    ///
267    /// UK mobile, three textual layouts, all canonicalise to the same E.164 form:
268    ///
269    /// ```
270    /// use event_matcher::Normalizer;
271    /// assert_eq!(
272    ///     Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")),
273    ///     Some("+447700900123".to_string()),
274    /// );
275    /// assert_eq!(
276    ///     Normalizer::normalize_phone_e164("0044 7700 900123", Some("GB")),
277    ///     Some("+447700900123".to_string()),
278    /// );
279    /// assert_eq!(
280    ///     Normalizer::normalize_phone_e164("07700 900123", Some("GB")),
281    ///     Some("+447700900123".to_string()),
282    /// );
283    /// ```
284    ///
285    /// French national format vs international form:
286    ///
287    /// ```
288    /// # use event_matcher::Normalizer;
289    /// assert_eq!(
290    ///     Normalizer::normalize_phone_e164("01 23 45 67 89", Some("FR")),
291    ///     Some("+33123456789".to_string()),
292    /// );
293    /// assert_eq!(
294    ///     Normalizer::normalize_phone_e164("+33 1 23 45 67 89", Some("GB")),
295    ///     Some("+33123456789".to_string()),
296    /// );
297    /// ```
298    ///
299    /// North American (NANP) numbers have no trunk prefix:
300    ///
301    /// ```
302    /// # use event_matcher::Normalizer;
303    /// assert_eq!(
304    ///     Normalizer::normalize_phone_e164("(415) 555-1234", Some("US")),
305    ///     Some("+14155551234".to_string()),
306    /// );
307    /// assert_eq!(
308    ///     Normalizer::normalize_phone_e164("+1 415 555 1234", None),
309    ///     Some("+14155551234".to_string()),
310    /// );
311    /// ```
312    ///
313    /// Unparseable or ambiguous inputs return `None`:
314    ///
315    /// ```
316    /// # use event_matcher::Normalizer;
317    /// // No default country and no international marker: ambiguous.
318    /// assert_eq!(Normalizer::normalize_phone_e164("07700 900123", None), None);
319    /// // Unknown dial code.
320    /// assert_eq!(Normalizer::normalize_phone_e164("+999 1234567", None), None);
321    /// // Empty input.
322    /// assert_eq!(Normalizer::normalize_phone_e164("", Some("GB")), None);
323    /// ```
324    ///
325    /// Idempotent on canonical inputs:
326    ///
327    /// ```
328    /// # use event_matcher::Normalizer;
329    /// let once = Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")).unwrap();
330    /// let twice = Normalizer::normalize_phone_e164(&once, Some("GB")).unwrap();
331    /// assert_eq!(once, twice);
332    /// ```
333    #[must_use]
334    pub fn normalize_phone_e164(phone: &str, default_country: Option<&str>) -> Option<String> {
335        let has_plus = phone.chars().any(|c| c == '+');
336        let digits: String = phone.chars().filter(char::is_ascii_digit).collect();
337        if digits.is_empty() {
338            return None;
339        }
340
341        let (info, nsn): (&CountryPhoneInfo, String) = if has_plus {
342            let info = lookup_by_dial_code_prefix(&digits)?;
343            let rest = &digits[info.dial_code.len()..];
344            let rest = strip_trunk_prefix(info, rest);
345            (info, rest.to_string())
346        } else if let Some(stripped) = digits.strip_prefix("00") {
347            let info = lookup_by_dial_code_prefix(stripped)?;
348            let rest = &stripped[info.dial_code.len()..];
349            let rest = strip_trunk_prefix(info, rest);
350            (info, rest.to_string())
351        } else {
352            let iso = default_country?;
353            let info = lookup_by_iso(iso)?;
354            let nsn = strip_trunk_prefix(info, &digits);
355            (info, nsn.to_string())
356        };
357
358        if nsn.len() < info.min_nsn || nsn.len() > info.max_nsn {
359            return None;
360        }
361
362        Some(format!("+{}{}", info.dial_code, nsn))
363    }
364
365    /// Expand common postal address abbreviations as whole tokens.
366    ///
367    /// The input is tokenised on whitespace and each token is matched
368    /// case-insensitively (after stripping a single trailing `.` or `,`)
369    /// against a fixed table of street-type and directional abbreviations.
370    /// Recognised tokens are replaced with their long form, lowercased;
371    /// unrecognised tokens are passed through verbatim. Tokens are then
372    /// re-joined by single spaces.
373    ///
374    /// This function is intentionally simple: it does **not** apply any
375    /// position-aware heuristics. The well-known ambiguous case `"St"` —
376    /// which can mean *Street* or *Saint* — is always expanded to
377    /// *Street*. In practice this remains useful for fuzzy matching
378    /// because the canonical form is consistent on both sides of a
379    /// comparison; pre-process upstream if you need finer disambiguation.
380    ///
381    /// # Examples
382    ///
383    /// ```
384    /// use event_matcher::Normalizer;
385    /// assert_eq!(
386    ///     Normalizer::expand_street_abbreviations("123 High St"),
387    ///     "123 High street",
388    /// );
389    /// assert_eq!(
390    ///     Normalizer::expand_street_abbreviations("45 N. Park Ave."),
391    ///     "45 north Park avenue",
392    /// );
393    /// assert_eq!(
394    ///     Normalizer::expand_street_abbreviations("12 Sunset Blvd"),
395    ///     "12 Sunset boulevard",
396    /// );
397    /// ```
398    ///
399    /// Idempotent on already-expanded inputs (long forms are not
400    /// re-expanded):
401    ///
402    /// ```
403    /// # use event_matcher::Normalizer;
404    /// let once = Normalizer::expand_street_abbreviations("10 Downing St");
405    /// let twice = Normalizer::expand_street_abbreviations(&once);
406    /// assert_eq!(once, twice);
407    /// ```
408    pub fn expand_street_abbreviations(line: &str) -> String {
409        line.split_whitespace()
410            .map(expand_one_token)
411            .collect::<Vec<_>>()
412            .join(" ")
413    }
414
415    /// Normalise an address line for comparison.
416    ///
417    /// Pipeline:
418    ///
419    /// 1. Expand street-type and directional abbreviations via
420    ///    [`Normalizer::expand_street_abbreviations`] (so `"St" → "street"`,
421    ///    `"Rd" → "road"`, `"N" → "north"`).
422    /// 2. Apply the name-normalisation pipeline
423    ///    ([`Normalizer::normalize_name`]): NFKD-decompose, drop combining
424    ///    marks, drop ASCII punctuation, lowercase, collapse whitespace.
425    ///
426    /// The result is idempotent and suitable for direct equality or
427    /// similarity comparison.
428    ///
429    /// # Examples
430    ///
431    /// Abbreviated and full forms canonicalise identically:
432    ///
433    /// ```
434    /// use event_matcher::Normalizer;
435    /// assert_eq!(
436    ///     Normalizer::normalize_address_line("123 High St"),
437    ///     Normalizer::normalize_address_line("123 High Street"),
438    /// );
439    /// assert_eq!(
440    ///     Normalizer::normalize_address_line("45 N Park Ave"),
441    ///     Normalizer::normalize_address_line("45 North Park Avenue"),
442    /// );
443    /// ```
444    ///
445    /// Punctuation and case are normalised:
446    ///
447    /// ```
448    /// # use event_matcher::Normalizer;
449    /// assert_eq!(
450    ///     Normalizer::normalize_address_line("10, DOWNING Street."),
451    ///     "10 downing street",
452    /// );
453    /// ```
454    #[must_use]
455    pub fn normalize_address_line(line: &str) -> String {
456        Self::normalize_name(&Self::expand_street_abbreviations(line))
457    }
458
459    /// Parse an address line into its structured components.
460    ///
461    /// The function performs a best-effort structural decomposition of a
462    /// single-line postal address into:
463    ///
464    /// - `house_number` — the leading run of digits (with an optional
465    ///   single alphabetic suffix, e.g. `"10A"`), uppercased. `None` if
466    ///   no leading number is present.
467    /// - `unit` — a recognised sub-unit prefix (`Flat`, `Apt`,
468    ///   `Apartment`, `Unit`, `Suite`, `Ste`) and its identifier,
469    ///   lowercased and space-joined (e.g. `"flat 2a"`). `None` if no
470    ///   recognised prefix is present.
471    /// - `street` — the remaining text after `unit` and `house_number`
472    ///   are removed, run through [`Normalizer::normalize_address_line`].
473    ///
474    /// Parsing is **deterministic** and **format-only** — no postal
475    /// reference is consulted. Inputs that do not match the simple
476    /// regular structure (e.g. a postcode-only string, a city name)
477    /// degrade gracefully: `house_number` and `unit` are `None`, and
478    /// `street` carries the normalised input.
479    ///
480    /// # Examples
481    ///
482    /// Typical UK / US single-line addresses:
483    ///
484    /// ```
485    /// use event_matcher::Normalizer;
486    ///
487    /// let p = Normalizer::parse_address_line("123 High Street");
488    /// assert_eq!(p.house_number.as_deref(), Some("123"));
489    /// assert_eq!(p.unit, None);
490    /// assert_eq!(p.street, "high street");
491    ///
492    /// let p = Normalizer::parse_address_line("10A Downing St");
493    /// assert_eq!(p.house_number.as_deref(), Some("10A"));
494    /// assert_eq!(p.street, "downing street");
495    ///
496    /// let p = Normalizer::parse_address_line("Flat 2A, 10 Downing Street");
497    /// assert_eq!(p.unit.as_deref(), Some("flat 2a"));
498    /// assert_eq!(p.house_number.as_deref(), Some("10"));
499    /// assert_eq!(p.street, "downing street");
500    ///
501    /// let p = Normalizer::parse_address_line("Apt 5, 1600 Pennsylvania Ave");
502    /// assert_eq!(p.unit.as_deref(), Some("apt 5"));
503    /// assert_eq!(p.house_number.as_deref(), Some("1600"));
504    /// assert_eq!(p.street, "pennsylvania avenue");
505    /// ```
506    ///
507    /// Inputs without a leading number still parse:
508    ///
509    /// ```
510    /// # use event_matcher::Normalizer;
511    /// let p = Normalizer::parse_address_line("Buckingham Palace");
512    /// assert_eq!(p.house_number, None);
513    /// assert_eq!(p.unit, None);
514    /// assert_eq!(p.street, "buckingham palace");
515    /// ```
516    #[must_use]
517    pub fn parse_address_line(line: &str) -> ParsedAddressLine {
518        let trimmed = line.trim();
519        let (unit, after_unit) = extract_unit_prefix(trimmed);
520        let after_unit = after_unit.trim_start_matches([',', ' ', '\t']).trim();
521        let (house_number, after_number) = extract_house_number(after_unit);
522        let after_number = after_number.trim_start_matches([',', ' ', '\t']).trim();
523        ParsedAddressLine {
524            house_number,
525            unit,
526            street: Self::normalize_address_line(after_number),
527        }
528    }
529
530    /// Compute a phonetic (Soundex) code for a name.
531    ///
532    /// Internally, the input is first normalised via
533    /// [`Normalizer::normalize_name`] and then encoded with the American
534    /// Soundex algorithm. Names that sound alike map to the same code, which
535    /// lets the matcher catch spelling variants such as "Smith" / "Smyth" or
536    /// "Stephen" / "Steven".
537    ///
538    /// The implementation is suitable for English-language names. Non-English
539    /// phonemes may be lost. T-9 (spec §21.4) decided to keep Soundex as the
540    /// default and expose an opt-in `MatchConfig::phonetic_encoder` enum
541    /// (Double Metaphone, Daitch-Mokotoff) gated behind a Cargo feature flag
542    /// once an empirical multinational event corpus is available;
543    /// implementation is tracked as T-9.1.
544    ///
545    /// # Examples
546    ///
547    /// Similar-sounding spellings share a code:
548    ///
549    /// ```
550    /// use event_matcher::Normalizer;
551    /// assert_eq!(Normalizer::phonetic_code("Smith"), Normalizer::phonetic_code("Smyth"));
552    /// assert_eq!(Normalizer::phonetic_code("Stephen"), Normalizer::phonetic_code("Steven"));
553    /// ```
554    ///
555    /// Different families produce different codes:
556    ///
557    /// ```
558    /// # use event_matcher::Normalizer;
559    /// assert_ne!(Normalizer::phonetic_code("Jones"), Normalizer::phonetic_code("Smith"));
560    /// ```
561    ///
562    /// Empty input returns an empty string, not a default Soundex value:
563    ///
564    /// ```
565    /// # use event_matcher::Normalizer;
566    /// assert_eq!(Normalizer::phonetic_code(""),       "");
567    /// assert_eq!(Normalizer::phonetic_code("   "),    "");
568    /// ```
569    #[must_use]
570    pub fn phonetic_code(name: &str) -> String {
571        let normalized = Self::normalize_name(name);
572        if normalized.is_empty() {
573            return String::new();
574        }
575        soundex::american_soundex(&normalized)
576    }
577
578    /// Normalise an email address for comparison.
579    ///
580    /// Steps:
581    ///
582    /// 1. Trim surrounding whitespace.
583    /// 2. Lowercase the entire address (RFC 5321 makes the domain
584    ///    case-insensitive and most real-world deployments treat the
585    ///    localpart case-insensitively too; case-sensitive localparts
586    ///    are technically legal but vanishingly rare in healthcare data).
587    /// 3. Reject inputs that lack exactly one `@` or that have an empty
588    ///    localpart or domain by returning `None`.
589    /// 4. If `gmail_dot_folding` is `true` and the domain is `gmail.com`
590    ///    or `googlemail.com`, strip every `.` from the localpart and
591    ///    drop any `+tag` suffix. Both transformations are reversible
592    ///    for Gmail addresses by Google's documented routing rules:
593    ///    `j.smith@gmail.com`, `js.mith@gmail.com`, and
594    ///    `jsmith+work@gmail.com` all deliver to the same mailbox as
595    ///    `jsmith@gmail.com`.
596    ///
597    /// The function is **deterministic** and **idempotent** on
598    /// successful outputs.
599    ///
600    /// # Examples
601    ///
602    /// Common case-and-whitespace normalisation:
603    ///
604    /// ```
605    /// use event_matcher::Normalizer;
606    /// assert_eq!(
607    ///     Normalizer::normalize_email("  Alice@Example.ORG  ", false),
608    ///     Some("alice@example.org".to_string()),
609    /// );
610    /// ```
611    ///
612    /// Malformed inputs return `None`:
613    ///
614    /// ```
615    /// # use event_matcher::Normalizer;
616    /// assert_eq!(Normalizer::normalize_email("no-at-sign", false), None);
617    /// assert_eq!(Normalizer::normalize_email("@example.org", false), None);
618    /// assert_eq!(Normalizer::normalize_email("alice@", false), None);
619    /// assert_eq!(Normalizer::normalize_email("a@b@c", false), None);
620    /// assert_eq!(Normalizer::normalize_email("", false), None);
621    /// ```
622    ///
623    /// Optional Gmail dot-folding:
624    ///
625    /// ```
626    /// # use event_matcher::Normalizer;
627    /// assert_eq!(
628    ///     Normalizer::normalize_email("j.smith@gmail.com", true),
629    ///     Some("jsmith@gmail.com".to_string()),
630    /// );
631    /// assert_eq!(
632    ///     Normalizer::normalize_email("jsmith+work@googlemail.com", true),
633    ///     Some("jsmith@googlemail.com".to_string()),
634    /// );
635    /// // Dot-folding does not touch non-Gmail addresses.
636    /// assert_eq!(
637    ///     Normalizer::normalize_email("j.smith@example.org", true),
638    ///     Some("j.smith@example.org".to_string()),
639    /// );
640    /// ```
641    ///
642    /// Idempotent on canonical inputs:
643    ///
644    /// ```
645    /// # use event_matcher::Normalizer;
646    /// let once = Normalizer::normalize_email("Alice@Example.ORG", false).unwrap();
647    /// let twice = Normalizer::normalize_email(&once, false).unwrap();
648    /// assert_eq!(once, twice);
649    /// ```
650    #[must_use]
651    pub fn normalize_email(email: &str, gmail_dot_folding: bool) -> Option<String> {
652        let trimmed = email.trim().to_lowercase();
653        if trimmed.is_empty() {
654            return None;
655        }
656        // Require exactly one '@'.
657        let (local, domain) = trimmed.split_once('@')?;
658        if local.is_empty() || domain.is_empty() {
659            return None;
660        }
661        // Reject any further '@' in the domain side.
662        if domain.contains('@') {
663            return None;
664        }
665        if gmail_dot_folding && (domain == "gmail.com" || domain == "googlemail.com") {
666            let local_no_plus = match local.find('+') {
667                Some(i) => &local[..i],
668                None => local,
669            };
670            let local_folded: String = local_no_plus.chars().filter(|c| *c != '.').collect();
671            if local_folded.is_empty() {
672                return None;
673            }
674            return Some(format!("{local_folded}@{domain}"));
675        }
676        Some(format!("{local}@{domain}"))
677    }
678
679    /// Parse an ISO 8601 / RFC 3339 date or date-time string and return
680    /// the number of seconds since the Unix epoch (`1970-01-01T00:00:00Z`).
681    ///
682    /// Accepted shapes:
683    ///
684    /// - **Date only**: `YYYY-MM-DD` — interpreted as `00:00:00 UTC`.
685    /// - **Date-time, naive**: `YYYY-MM-DDTHH:MM[:SS]` — interpreted as
686    ///   `UTC`. Fractional seconds (`.NNN`) are accepted and truncated to
687    ///   whole seconds.
688    /// - **Date-time, UTC marker**: `…Z` or `…+00:00`.
689    /// - **Date-time, fixed offset**: `…±HH:MM` (or `…±HHMM`).
690    ///
691    /// Returns `None` if the input does not fit any of these shapes or if
692    /// the components are out of range (month not in `1..=12`, day not in
693    /// `1..=31` for the calendar month, hour not in `0..=23`, minute not in
694    /// `0..=59`, second not in `0..=60` — leap seconds permitted).
695    ///
696    /// The function is **deterministic** and **idempotent under
697    /// canonicalisation**: distinct textual layouts that denote the same
698    /// instant (e.g. `2024-06-26T09:00:00Z` and `2024-06-26T11:00:00+02:00`)
699    /// return the same number.
700    ///
701    /// # Examples
702    ///
703    /// ```
704    /// use event_matcher::Normalizer;
705    ///
706    /// // Unix epoch.
707    /// assert_eq!(Normalizer::parse_iso8601_unix_seconds("1970-01-01T00:00:00Z"), Some(0));
708    /// // Same date with no time component anchors at midnight UTC.
709    /// assert_eq!(Normalizer::parse_iso8601_unix_seconds("1970-01-01"), Some(0));
710    /// // Two textual layouts for the same instant.
711    /// let a = Normalizer::parse_iso8601_unix_seconds("2024-06-26T09:00:00Z").unwrap();
712    /// let b = Normalizer::parse_iso8601_unix_seconds("2024-06-26T11:00:00+02:00").unwrap();
713    /// assert_eq!(a, b);
714    /// ```
715    ///
716    /// Junk input is rejected:
717    ///
718    /// ```
719    /// # use event_matcher::Normalizer;
720    /// assert!(Normalizer::parse_iso8601_unix_seconds("not a date").is_none());
721    /// assert!(Normalizer::parse_iso8601_unix_seconds("2024-13-01").is_none());
722    /// assert!(Normalizer::parse_iso8601_unix_seconds("2024-02-30").is_none());
723    /// ```
724    #[must_use]
725    pub fn parse_iso8601_unix_seconds(input: &str) -> Option<i64> {
726        let s = input.trim();
727        if s.len() < 10 {
728            return None;
729        }
730
731        // YYYY-MM-DD prefix is required.
732        let year: i64 = s.get(..4)?.parse().ok()?;
733        if s.as_bytes().get(4)? != &b'-' {
734            return None;
735        }
736        let month: u32 = s.get(5..7)?.parse().ok()?;
737        if s.as_bytes().get(7)? != &b'-' {
738            return None;
739        }
740        let day: u32 = s.get(8..10)?.parse().ok()?;
741        if !(1..=12).contains(&month) {
742            return None;
743        }
744        if day < 1 || day > days_in_month(year, month) {
745            return None;
746        }
747
748        // Optional time component starts with `T` or a single space.
749        let (hour, minute, second, tz_offset_seconds) = if s.len() == 10 {
750            (0u32, 0u32, 0u32, 0i64)
751        } else {
752            let separator = s.as_bytes().get(10).copied()?;
753            if separator != b'T' && separator != b't' && separator != b' ' {
754                return None;
755            }
756            let rest = &s[11..];
757            parse_time_and_offset(rest)?
758        };
759
760        if hour > 23 || minute > 59 || second > 60 {
761            return None;
762        }
763
764        let days = days_from_civil(year, month, day);
765        let day_seconds = i64::from(hour) * 3600 + i64::from(minute) * 60 + i64::from(second);
766        Some(days * 86_400 + day_seconds - tz_offset_seconds)
767    }
768}
769
/// Number of days in `month` (1 = January … 12 = December) of the given
/// Gregorian `year`.
///
/// February has 29 days in leap years: years divisible by 4 but not by
/// 100, plus years divisible by 400. The rule is applied to any year
/// value, including years before the Gregorian calendar was adopted.
///
/// Returns `0` for a month outside `1..=12`, so a caller's
/// `day > days_in_month(..)` check rejects every day of an invalid month.
fn days_in_month(year: i64, month: u32) -> u32 {
    // Month lengths for a non-leap year, January first.
    const COMMON_YEAR: [u32; 12] = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];

    let is_leap = year % 400 == 0 || (year % 4 == 0 && year % 100 != 0);

    let common_length = usize::try_from(month)
        .ok()
        .and_then(|m| m.checked_sub(1))
        .and_then(|index| COMMON_YEAR.get(index).copied());

    match common_length {
        None => 0,
        // Only February has 28 days, so this arm is the leap-day adjustment.
        Some(28) if is_leap => 29,
        Some(length) => length,
    }
}
783
/// Day count from `1970-01-01` to the given proleptic Gregorian date,
/// following Howard Hinnant's `days_from_civil` algorithm.
///
/// The result is negative for dates before the epoch. `month` is expected
/// in `1..=12` and `day` within that month; callers validate this first.
fn days_from_civil(year: i64, month: u32, day: u32) -> i64 {
    // Shift the year so it starts on 1 March; the leap day then falls at
    // the very end of the shifted year.
    let shifted_year = if month > 2 { year } else { year - 1 };
    let era = shifted_year.div_euclid(400);
    let year_of_era = shifted_year.rem_euclid(400); // [0, 399]

    let month = i64::from(month);
    let shifted_month = if month > 2 { month - 3 } else { month + 9 }; // Mar = 0 … Feb = 11
    let day_of_year = (153 * shifted_month + 2) / 5 + i64::from(day) - 1; // [0, 365]
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; // [0, 146096]

    // 146_097 days per 400-year era; 719_468 days from 0000-03-01 to 1970-01-01.
    era * 146_097 + day_of_era - 719_468
}
798
/// Parse the time and optional timezone-offset portion of an ISO 8601
/// date-time.
///
/// `rest` is everything after the date/time separator (`T`, `t`, or a
/// space). Accepted shape: `HH:MM[:SS][.fraction][Z|z|±HH|±HH:MM|±HHMM]`.
///
/// Returns `(hour, minute, second, tz_offset_seconds)`, where the offset is
/// positive east of UTC. Fractional digits are validated and then discarded
/// (truncation). Every numeric field must be exactly two ASCII digits, so
/// signed spellings such as `+1:30` or an offset of `+-1:00` are rejected.
/// Offset hours above 23 or offset minutes above 59 are rejected as well.
///
/// The ranges of `hour`, `minute` and `second` are *not* checked here; the
/// caller is responsible for that. Returns `None` on any malformed or
/// trailing input.
fn parse_time_and_offset(rest: &str) -> Option<(u32, u32, u32, i64)> {
    // Read exactly two ASCII digits starting at byte offset `at`.
    // Hand-rolled rather than `str::parse`, which also accepts a leading
    // sign and would let `+5` pass as a two-character field.
    fn two_digits(bytes: &[u8], at: usize) -> Option<u32> {
        match bytes.get(at..at + 2)? {
            &[hi, lo] if hi.is_ascii_digit() && lo.is_ascii_digit() => {
                Some(u32::from(hi - b'0') * 10 + u32::from(lo - b'0'))
            }
            _ => None,
        }
    }

    let bytes = rest.as_bytes();

    // Mandatory `HH:MM`.
    let hour = two_digits(bytes, 0)?;
    if bytes.get(2) != Some(&b':') {
        return None;
    }
    let minute = two_digits(bytes, 3)?;
    let mut idx = 5;

    // Optional `:SS`.
    let mut second = 0;
    if bytes.get(idx) == Some(&b':') {
        second = two_digits(bytes, idx + 1)?;
        idx += 3;
    }

    // Optional fractional part `.NNN…`: at least one digit, value truncated.
    if bytes.get(idx) == Some(&b'.') {
        let frac_len = bytes[idx + 1..]
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        if frac_len == 0 {
            return None;
        }
        idx += 1 + frac_len;
    }

    // Optional timezone designator: `Z`, `±HH`, `±HH:MM`, or `±HHMM`.
    let tz_offset_seconds = match bytes.get(idx).copied() {
        None => 0,
        Some(b'Z' | b'z') => {
            idx += 1;
            0
        }
        Some(sign_byte @ (b'+' | b'-')) => {
            let sign: i64 = if sign_byte == b'+' { 1 } else { -1 };
            idx += 1;
            let offset_hours = two_digits(bytes, idx)?;
            idx += 2;
            let offset_minutes = if bytes.get(idx) == Some(&b':') {
                // A colon commits us to the extended form: minutes required.
                let m = two_digits(bytes, idx + 1)?;
                idx += 3;
                m
            } else if let Some(m) = two_digits(bytes, idx) {
                idx += 2;
                m
            } else {
                // Hours-only offset (`±HH`).
                0
            };
            if offset_hours > 23 || offset_minutes > 59 {
                return None;
            }
            sign * (i64::from(offset_hours) * 3600 + i64::from(offset_minutes) * 60)
        }
        Some(_) => return None,
    };

    // Reject anything left over after the recognised components.
    if idx != bytes.len() {
        return None;
    }
    Some((hour, minute, second, tz_offset_seconds))
}
868
/// Structured decomposition of a postal-address line.
///
/// Produced by [`Normalizer::parse_address_line`]. The struct is
/// `Serialize + Deserialize` so it round-trips through JSON and can be
/// embedded in downstream data models.
///
/// Parsing is best-effort and format-only: it consults no postal
/// reference. Inputs that don't follow the
/// `(unit, house_number, street)` shape degrade gracefully: the optional
/// `house_number` and `unit` components are returned as `None` when they
/// are absent, while `street` is not optional and is always a `String`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParsedAddressLine {
    /// Leading house / building number, including an optional single
    /// alphabetic suffix (`"10A"`), uppercased. `None` when no leading
    /// digit is present.
    pub house_number: Option<String>,
    /// Sub-unit prefix and identifier, lowercased and space-joined
    /// (e.g. `"flat 2a"`, `"apt 5"`, `"suite 12"`). `None` when no
    /// recognised prefix is present.
    pub unit: Option<String>,
    /// Remaining street portion, normalised via
    /// [`Normalizer::normalize_address_line`].
    pub street: String,
}
893
/// Token-level expansion table used by [`Normalizer::expand_street_abbreviations`].
///
/// Each entry is `(abbreviation, expansion)`. Entries are matched ASCII
/// case-insensitively against a whole token after all of its trailing `.`
/// and `,` characters have been stripped (`expand_one_token` uses
/// `trim_end_matches`, so `"St.,"` matches `"st"`). The replacement is
/// always lowercase so the downstream name-normalisation pipeline is a
/// no-op for these tokens.
///
/// Abbreviations must be lowercase ASCII; the lookup returns the first
/// matching entry.
const STREET_ABBREVIATIONS: &[(&str, &str)] = &[
    ("st", "street"),
    ("str", "street"),
    ("rd", "road"),
    ("ave", "avenue"),
    ("av", "avenue"),
    ("blvd", "boulevard"),
    ("bvd", "boulevard"),
    ("ln", "lane"),
    ("dr", "drive"),
    ("ct", "court"),
    ("pl", "place"),
    ("sq", "square"),
    ("ter", "terrace"),
    ("terr", "terrace"),
    ("hwy", "highway"),
    ("pkwy", "parkway"),
    ("mt", "mount"),
    ("mtn", "mountain"),
    ("cres", "crescent"),
    ("gdns", "gardens"),
    ("gdn", "garden"),
    ("gr", "grove"),
    ("cl", "close"),
    ("pk", "park"),
    ("plz", "plaza"),
    ("expy", "expressway"),
    ("trl", "trail"),
    // Compass directions.
    ("n", "north"),
    ("s", "south"),
    ("e", "east"),
    ("w", "west"),
    ("ne", "northeast"),
    ("nw", "northwest"),
    ("se", "southeast"),
    ("sw", "southwest"),
];
936
/// Recognised sub-unit prefix keywords for [`Normalizer::parse_address_line`].
///
/// Entries are lowercase ASCII. `extract_unit_prefix` compares them against
/// the first whitespace-delimited word of the input after stripping its
/// trailing `.` / `,` characters and lowercasing it, so `"Apt."` matches
/// `"apt"`. The matched keyword is emitted verbatim in the resulting unit
/// string (no canonicalisation of `"apartment"` to `"apt"` happens here).
const UNIT_PREFIXES: &[&str] = &[
    "flat",
    "apartment",
    "apt",
    "unit",
    "suite",
    "ste",
    "room",
    "rm",
];
948
949/// Expand a single whitespace-separated token if it appears in
950/// [`STREET_ABBREVIATIONS`].
951///
952/// The token is matched after stripping at most one trailing `.` or `,`;
953/// the comparison is ASCII case-insensitive. Tokens that contain non-ASCII
954/// characters short-circuit to the original input unchanged.
955fn expand_one_token(tok: &str) -> String {
956    let stripped = tok.trim_end_matches(['.', ',']);
957    if !stripped.is_ascii() {
958        return tok.to_string();
959    }
960    let lower = stripped.to_ascii_lowercase();
961    for (abbrev, full) in STREET_ABBREVIATIONS {
962        if lower == *abbrev {
963            return (*full).to_string();
964        }
965    }
966    tok.to_string()
967}
968
969/// Extract a recognised unit prefix and its identifier from the start of `s`.
970///
971/// Returns `(Some("flat 2a"), rest)` when the input begins with a
972/// recognised keyword followed by an alphanumeric identifier; otherwise
973/// `(None, s)` unchanged.
974fn extract_unit_prefix(s: &str) -> (Option<String>, &str) {
975    let trimmed = s.trim_start();
976    // Find the first whitespace; everything before is the candidate keyword.
977    let kw_end = trimmed
978        .find(|c: char| c.is_whitespace())
979        .unwrap_or(trimmed.len());
980    if kw_end == 0 {
981        return (None, s);
982    }
983    let kw_raw = &trimmed[..kw_end];
984    let kw_stripped = kw_raw.trim_end_matches(['.', ',']);
985    if !kw_stripped.is_ascii() {
986        return (None, s);
987    }
988    let kw_lower = kw_stripped.to_ascii_lowercase();
989    if !UNIT_PREFIXES.iter().any(|p| *p == kw_lower) {
990        return (None, s);
991    }
992    // Skip whitespace and `#` after the keyword.
993    let after_kw = trimmed[kw_end..].trim_start_matches([' ', '\t', '#']);
994    // Read alphanumerics as the identifier.
995    let id_end = after_kw
996        .find(|c: char| !c.is_ascii_alphanumeric())
997        .unwrap_or(after_kw.len());
998    if id_end == 0 {
999        return (None, s);
1000    }
1001    let id = &after_kw[..id_end];
1002    let rest = &after_kw[id_end..];
1003    let unit = format!("{} {}", kw_lower, id.to_ascii_lowercase());
1004    (Some(unit), rest)
1005}
1006
/// Split a leading house number off `s`.
///
/// After skipping leading whitespace, the number is a run of ASCII digits
/// optionally followed by one ASCII letter. The letter is only taken when
/// the character after it is not ASCII-alphanumeric (or the input ends), so
/// the first letter of a street name glued to the number is left alone.
/// The returned number is uppercased.
///
/// `"10 Downing Street"` → `(Some("10"), " Downing Street")`.
/// `"10A High St"` → `(Some("10A"), " High St")`.
/// `"Buckingham Palace"` → `(None, "Buckingham Palace")`.
///
/// When there is no leading digit the original `s` is returned unchanged,
/// including its leading whitespace.
fn extract_house_number(s: &str) -> (Option<String>, &str) {
    let body = s.trim_start();
    let digit_len = body
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(body.len());
    if digit_len == 0 {
        return (None, s);
    }

    // Look at the two characters after the digits to decide whether a
    // single-letter suffix such as the `A` in "10A" belongs to the number.
    let mut lookahead = body[digit_len..].chars();
    let suffix_len = match (lookahead.next(), lookahead.next()) {
        (Some(letter), follower)
            if letter.is_ascii_alphabetic()
                && !matches!(follower, Some(c) if c.is_ascii_alphanumeric()) =>
        {
            letter.len_utf8()
        }
        _ => 0,
    };

    let (number, rest) = body.split_at(digit_len + suffix_len);
    (Some(number.to_ascii_uppercase()), rest)
}
1043
/// Per-country phone metadata for [`Normalizer::normalize_phone_e164`].
///
/// One value describes one row of [`COUNTRY_PHONE_TABLE`].
///
/// `min_nsn` / `max_nsn` bound the **national-significant number** length —
/// the digits after the dial code, with the national trunk prefix removed.
/// `trunk_prefix` is the digit string used for national dialling (`"0"` for
/// most of Europe and Asia, `"8"` for Lithuania, `None` for NANP / Spain /
/// Portugal and several others). When set, a single occurrence of the
/// string at the start of the national number is stripped before
/// canonicalisation, provided at least one digit remains afterwards
/// (see `strip_trunk_prefix`).
struct CountryPhoneInfo {
    /// ISO 3166-1 alpha-2 country code, uppercase.
    iso_alpha2: &'static str,
    /// International dialling code, no leading `+`.
    dial_code: &'static str,
    /// National trunk prefix digit(s), if any.
    trunk_prefix: Option<&'static str>,
    /// Minimum national-significant-number length.
    min_nsn: usize,
    /// Maximum national-significant-number length.
    max_nsn: usize,
}
1065
/// Phone-numbering metadata for countries supported by
/// [`Normalizer::normalize_phone_e164`].
///
/// Coverage: all five jurisdictions for which the crate exposes a national
/// healthcare identifier (GB England/Wales/IoM, FR, ES, IE, plus UK NI via
/// the GB dial code), plus the most common international partners. New
/// entries SHOULD follow the ISO 3166-1 alpha-2 convention and document the
/// trunk-prefix rule explicitly.
///
/// Entry order is significant: both `lookup_by_iso` and
/// `lookup_by_dial_code_prefix` return the *first* matching row, so for a
/// shared dial code (US and CA both use `1`) the earlier row wins a
/// dial-code lookup.
const COUNTRY_PHONE_TABLE: &[CountryPhoneInfo] = &[
    CountryPhoneInfo {
        iso_alpha2: "GB",
        dial_code: "44",
        trunk_prefix: Some("0"),
        min_nsn: 7,
        max_nsn: 11,
    },
    CountryPhoneInfo {
        iso_alpha2: "FR",
        dial_code: "33",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "DE",
        dial_code: "49",
        trunk_prefix: Some("0"),
        min_nsn: 7,
        max_nsn: 13,
    },
    CountryPhoneInfo {
        iso_alpha2: "ES",
        dial_code: "34",
        trunk_prefix: None,
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "IE",
        dial_code: "353",
        trunk_prefix: Some("0"),
        min_nsn: 7,
        max_nsn: 11,
    },
    CountryPhoneInfo {
        iso_alpha2: "IT",
        dial_code: "39",
        trunk_prefix: None,
        min_nsn: 6,
        max_nsn: 12,
    },
    CountryPhoneInfo {
        iso_alpha2: "NL",
        dial_code: "31",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "BE",
        dial_code: "32",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "PT",
        dial_code: "351",
        trunk_prefix: None,
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "CH",
        dial_code: "41",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "AT",
        dial_code: "43",
        trunk_prefix: Some("0"),
        min_nsn: 4,
        max_nsn: 13,
    },
    CountryPhoneInfo {
        iso_alpha2: "SE",
        dial_code: "46",
        trunk_prefix: Some("0"),
        min_nsn: 7,
        max_nsn: 13,
    },
    CountryPhoneInfo {
        iso_alpha2: "NO",
        dial_code: "47",
        trunk_prefix: None,
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "DK",
        dial_code: "45",
        trunk_prefix: None,
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "FI",
        dial_code: "358",
        trunk_prefix: Some("0"),
        min_nsn: 5,
        max_nsn: 12,
    },
    CountryPhoneInfo {
        iso_alpha2: "PL",
        dial_code: "48",
        trunk_prefix: None,
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "AU",
        dial_code: "61",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "NZ",
        dial_code: "64",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 10,
    },
    // NANP: US and CA share dial code `1`. US is listed first, so it is the
    // row returned by a dial-code lookup.
    CountryPhoneInfo {
        iso_alpha2: "US",
        dial_code: "1",
        trunk_prefix: None,
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "CA",
        dial_code: "1",
        trunk_prefix: None,
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "JP",
        dial_code: "81",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "CN",
        dial_code: "86",
        trunk_prefix: Some("0"),
        min_nsn: 5,
        max_nsn: 12,
    },
    CountryPhoneInfo {
        iso_alpha2: "IN",
        dial_code: "91",
        trunk_prefix: Some("0"),
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "BR",
        dial_code: "55",
        trunk_prefix: Some("0"),
        min_nsn: 10,
        max_nsn: 11,
    },
    CountryPhoneInfo {
        iso_alpha2: "MX",
        dial_code: "52",
        trunk_prefix: None,
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "ZA",
        dial_code: "27",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    // ---- T-19: coverage of remaining 35-scheme identifier jurisdictions ----
    CountryPhoneInfo {
        iso_alpha2: "BG",
        dial_code: "359",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "CZ",
        dial_code: "420",
        trunk_prefix: None,
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "EE",
        dial_code: "372",
        trunk_prefix: None,
        min_nsn: 7,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "GR",
        dial_code: "30",
        trunk_prefix: None,
        min_nsn: 10,
        max_nsn: 10,
    },
    CountryPhoneInfo {
        iso_alpha2: "HR",
        dial_code: "385",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "IS",
        dial_code: "354",
        trunk_prefix: None,
        min_nsn: 7,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "LI",
        dial_code: "423",
        trunk_prefix: None,
        min_nsn: 7,
        max_nsn: 9,
    },
    // Lithuania uses `8` (not `0`) as the national trunk prefix.
    CountryPhoneInfo {
        iso_alpha2: "LT",
        dial_code: "370",
        trunk_prefix: Some("8"),
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "LV",
        dial_code: "371",
        trunk_prefix: None,
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "MT",
        dial_code: "356",
        trunk_prefix: None,
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "RO",
        dial_code: "40",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
    CountryPhoneInfo {
        iso_alpha2: "SI",
        dial_code: "386",
        trunk_prefix: Some("0"),
        min_nsn: 8,
        max_nsn: 8,
    },
    CountryPhoneInfo {
        iso_alpha2: "SK",
        dial_code: "421",
        trunk_prefix: Some("0"),
        min_nsn: 9,
        max_nsn: 9,
    },
];
1351
1352/// Look up a country by ISO 3166-1 alpha-2 code (case-insensitive).
1353///
1354/// Returns the first match in [`COUNTRY_PHONE_TABLE`]. For NANP countries
1355/// (US/CA) which share dial code `1`, this disambiguates by the caller's
1356/// chosen default; the canonical E.164 output is identical for both.
1357fn lookup_by_iso(iso: &str) -> Option<&'static CountryPhoneInfo> {
1358    if !iso.is_ascii() {
1359        return None;
1360    }
1361    let upper = iso.to_ascii_uppercase();
1362    COUNTRY_PHONE_TABLE.iter().find(|c| c.iso_alpha2 == upper)
1363}
1364
1365/// Match the longest known dial-code prefix at the start of `digits`.
1366///
1367/// Tries 3-, 2-, then 1-digit prefixes to honour the country table's most
1368/// specific entry. For NANP (dial code `1`) the first matching entry — US —
1369/// is returned; the canonical E.164 form is the same whether the caller
1370/// later interprets the country as US or CA.
1371fn lookup_by_dial_code_prefix(digits: &str) -> Option<&'static CountryPhoneInfo> {
1372    for len in [3usize, 2, 1] {
1373        if digits.len() >= len {
1374            let prefix = &digits[..len];
1375            if let Some(info) = COUNTRY_PHONE_TABLE.iter().find(|c| c.dial_code == prefix) {
1376                return Some(info);
1377            }
1378        }
1379    }
1380    None
1381}
1382
1383/// Strip a single occurrence of the country's national trunk prefix
1384/// from `nsn` if one is configured and present.
1385fn strip_trunk_prefix<'a>(info: &CountryPhoneInfo, nsn: &'a str) -> &'a str {
1386    if let Some(tp) = info.trunk_prefix
1387        && let Some(rest) = nsn.strip_prefix(tp)
1388        && !rest.is_empty()
1389    {
1390        rest
1391    } else {
1392        nsn
1393    }
1394}
1395
1396#[cfg(test)]
1397mod tests {
1398    use super::*;
1399
1400    // ---------- normalize_name ----------
1401
1402    #[test]
1403    fn normalize_name_collapses_whitespace_and_trims() {
1404        assert_eq!(Normalizer::normalize_name("  John  Smith  "), "john smith");
1405    }
1406
1407    #[test]
1408    fn normalize_name_strips_ascii_punctuation() {
1409        assert_eq!(Normalizer::normalize_name("O'Brien"), "obrien");
1410        assert_eq!(Normalizer::normalize_name("Mary-Jane"), "maryjane");
1411        assert_eq!(Normalizer::normalize_name("Dr. Who"), "dr who");
1412    }
1413
1414    #[test]
1415    fn normalize_name_strips_diacritics() {
1416        assert_eq!(Normalizer::normalize_name("José"), "jose");
1417        assert_eq!(Normalizer::normalize_name("Siân"), "sian");
1418        // common test cases
1419        assert_eq!(Normalizer::normalize_name("naïve"), "naive");
1420        assert_eq!(Normalizer::normalize_name("crème"), "creme");
1421        // ŷ and ŵ decompose; ł has no NFKD decomposition and survives lowercased.
1422        assert_eq!(Normalizer::normalize_name("Łŷŵ"), "ływ");
1423    }
1424
1425    #[test]
1426    fn normalize_name_handles_empty_and_whitespace() {
1427        assert_eq!(Normalizer::normalize_name(""), "");
1428        assert_eq!(Normalizer::normalize_name("   "), "");
1429        assert_eq!(Normalizer::normalize_name("\t\n"), "");
1430    }
1431
1432    #[test]
1433    fn normalize_name_lowercases() {
1434        assert_eq!(Normalizer::normalize_name("MARY"), "mary");
1435        assert_eq!(Normalizer::normalize_name("McDONALD"), "mcdonald");
1436    }
1437
1438    #[test]
1439    fn normalize_name_is_idempotent() {
1440        for input in [
1441            "  John  Smith  ",
1442            "O'Brien-Jones",
1443            "JOSÉ MARÍA",
1444            "",
1445            "  ",
1446            "Siân",
1447        ] {
1448            let once = Normalizer::normalize_name(input);
1449            let twice = Normalizer::normalize_name(&once);
1450            assert_eq!(once, twice, "not idempotent for {input:?}");
1451        }
1452    }
1453
1454    #[test]
1455    fn normalize_name_does_not_normalise_unicode_punctuation() {
1456        // Curly apostrophe (U+2019) is intentionally not stripped.
1457        // This is documented in AGENTS/normalization.md as a known limitation.
1458        let with_curly = Normalizer::normalize_name("O\u{2019}Brien");
1459        assert!(with_curly.contains('\u{2019}'));
1460    }
1461
1462    // ---------- normalize_postcode ----------
1463
1464    #[test]
1465    fn normalize_postcode_uppercases() {
1466        assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
1467    }
1468
1469    #[test]
1470    fn normalize_postcode_strips_all_whitespace() {
1471        assert_eq!(Normalizer::normalize_postcode("CF10 1AA"), "CF101AA");
1472        assert_eq!(Normalizer::normalize_postcode(" CF10  1AA "), "CF101AA");
1473        assert_eq!(Normalizer::normalize_postcode("CF10\t1AA"), "CF101AA");
1474    }
1475
1476    #[test]
1477    fn normalize_postcode_handles_empty() {
1478        assert_eq!(Normalizer::normalize_postcode(""), "");
1479        assert_eq!(Normalizer::normalize_postcode("   "), "");
1480    }
1481
1482    #[test]
1483    fn normalize_postcode_is_idempotent() {
1484        for input in ["cf10 1aa", "SW1A 2AA", "  EH8 9YL  ", ""] {
1485            let once = Normalizer::normalize_postcode(input);
1486            let twice = Normalizer::normalize_postcode(&once);
1487            assert_eq!(once, twice);
1488        }
1489    }
1490
1491    // ---------- normalize_phone ----------
1492
1493    #[test]
1494    fn normalize_phone_strips_uk_trunk_prefix() {
1495        assert_eq!(Normalizer::normalize_phone("07700 900123"), "7700900123");
1496    }
1497
1498    #[test]
1499    fn normalize_phone_strips_plus_44_international() {
1500        assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
1501    }
1502
1503    #[test]
1504    fn normalize_phone_strips_0044_international() {
1505        assert_eq!(
1506            Normalizer::normalize_phone("0044 7700 900123"),
1507            "7700900123"
1508        );
1509    }
1510
1511    #[test]
1512    fn normalize_phone_handles_brackets_and_spaces() {
1513        assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
1514    }
1515
1516    #[test]
1517    fn normalize_phone_handles_empty() {
1518        assert_eq!(Normalizer::normalize_phone(""), "");
1519        assert_eq!(Normalizer::normalize_phone("---"), "");
1520    }
1521
1522    #[test]
1523    fn normalize_phone_does_not_strip_44_if_too_short() {
1524        // 44 followed by fewer than 10 more digits: keep the 44 (not international prefix).
1525        assert_eq!(Normalizer::normalize_phone("4412345"), "4412345");
1526    }
1527
1528    #[test]
1529    fn normalize_phone_is_idempotent() {
1530        for input in [
1531            "07700 900123",
1532            "+44 7700 900123",
1533            "0044 7700 900123",
1534            "(029) 2034 5678",
1535            "",
1536        ] {
1537            let once = Normalizer::normalize_phone(input);
1538            let twice = Normalizer::normalize_phone(&once);
1539            assert_eq!(once, twice, "not idempotent for {input:?}");
1540        }
1541    }
1542
1543    #[test]
1544    fn normalize_phone_keeps_lone_zero() {
1545        // A bare "0" is not stripped (guard: len > 1).
1546        assert_eq!(Normalizer::normalize_phone("0"), "0");
1547    }
1548
1549    // ---------- phonetic_code ----------
1550
1551    #[test]
1552    fn phonetic_code_groups_smith_and_smyth() {
1553        assert_eq!(
1554            Normalizer::phonetic_code("Smith"),
1555            Normalizer::phonetic_code("Smyth")
1556        );
1557    }
1558
1559    #[test]
1560    fn phonetic_code_groups_stephen_and_steven() {
1561        assert_eq!(
1562            Normalizer::phonetic_code("Stephen"),
1563            Normalizer::phonetic_code("Steven")
1564        );
1565    }
1566
1567    #[test]
1568    fn phonetic_code_distinguishes_different_families() {
1569        assert_ne!(
1570            Normalizer::phonetic_code("Jones"),
1571            Normalizer::phonetic_code("Smith")
1572        );
1573        assert_ne!(
1574            Normalizer::phonetic_code("Anderson"),
1575            Normalizer::phonetic_code("Zimmerman")
1576        );
1577    }
1578
1579    #[test]
1580    fn phonetic_code_specific_values() {
1581        // Pinned values from the underlying soundex crate; act as a regression net.
1582        assert_eq!(Normalizer::phonetic_code("Smith"), "S530");
1583        assert_eq!(Normalizer::phonetic_code("Smyth"), "S530");
1584        assert_eq!(Normalizer::phonetic_code("Jones"), "J520");
1585        assert_eq!(Normalizer::phonetic_code("Johnson"), "J525");
1586    }
1587
1588    #[test]
1589    fn phonetic_code_handles_empty() {
1590        assert_eq!(Normalizer::phonetic_code(""), "");
1591        assert_eq!(Normalizer::phonetic_code("   "), "");
1592    }
1593
1594    #[test]
1595    fn phonetic_code_is_case_insensitive() {
1596        assert_eq!(
1597            Normalizer::phonetic_code("SMITH"),
1598            Normalizer::phonetic_code("smith")
1599        );
1600    }
1601
1602    // ---------- normalize_phone_e164 ----------
1603
1604    #[test]
1605    fn e164_uk_layouts_canonicalise_identically() {
1606        let canonical = Some("+447700900123".to_string());
1607        assert_eq!(
1608            Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")),
1609            canonical,
1610        );
1611        assert_eq!(
1612            Normalizer::normalize_phone_e164("0044 7700 900123", Some("GB")),
1613            canonical,
1614        );
1615        assert_eq!(
1616            Normalizer::normalize_phone_e164("07700 900123", Some("GB")),
1617            canonical,
1618        );
1619        assert_eq!(
1620            Normalizer::normalize_phone_e164("(07700) 900-123", Some("GB")),
1621            canonical,
1622        );
1623    }
1624
1625    #[test]
1626    fn e164_french_layouts_canonicalise_identically() {
1627        let canonical = Some("+33123456789".to_string());
1628        assert_eq!(
1629            Normalizer::normalize_phone_e164("+33 1 23 45 67 89", Some("FR")),
1630            canonical,
1631        );
1632        assert_eq!(
1633            Normalizer::normalize_phone_e164("0033 1 23 45 67 89", Some("FR")),
1634            canonical,
1635        );
1636        assert_eq!(
1637            Normalizer::normalize_phone_e164("01 23 45 67 89", Some("FR")),
1638            canonical,
1639        );
1640    }
1641
1642    #[test]
1643    fn e164_spain_has_no_national_trunk_prefix() {
1644        // Spain switched to no trunk-0 in 1998; a bare 9-digit national
1645        // number is the canonical form.
1646        assert_eq!(
1647            Normalizer::normalize_phone_e164("912 345 678", Some("ES")),
1648            Some("+34912345678".to_string()),
1649        );
1650        assert_eq!(
1651            Normalizer::normalize_phone_e164("+34 912 345 678", None),
1652            Some("+34912345678".to_string()),
1653        );
1654    }
1655
1656    #[test]
1657    fn e164_ireland_three_digit_dial_code() {
1658        assert_eq!(
1659            Normalizer::normalize_phone_e164("+353 1 234 5678", None),
1660            Some("+35312345678".to_string()),
1661        );
1662        assert_eq!(
1663            Normalizer::normalize_phone_e164("01 234 5678", Some("IE")),
1664            Some("+35312345678".to_string()),
1665        );
1666    }
1667
1668    #[test]
1669    fn e164_nanp_handles_us_and_canada() {
1670        assert_eq!(
1671            Normalizer::normalize_phone_e164("(415) 555-1234", Some("US")),
1672            Some("+14155551234".to_string()),
1673        );
1674        assert_eq!(
1675            Normalizer::normalize_phone_e164("+1 415 555 1234", None),
1676            Some("+14155551234".to_string()),
1677        );
1678        // Canada uses the same dial code; canonical form is identical.
1679        assert_eq!(
1680            Normalizer::normalize_phone_e164("(416) 555-1234", Some("CA")),
1681            Some("+14165551234".to_string()),
1682        );
1683    }
1684
1685    // ---------- T-19: 35-scheme jurisdiction coverage ----------
1686
1687    #[test]
1688    fn e164_lithuania_uses_eight_as_trunk_prefix() {
1689        // Lithuania's national trunk prefix is `8`, not `0`. National
1690        // dialling form `8 612 34567` (mobile) canonicalises to the same
1691        // E.164 string as the explicit `+370 612 34567` form.
1692        assert_eq!(
1693            Normalizer::normalize_phone_e164("8 612 34567", Some("LT")),
1694            Some("+37061234567".to_string()),
1695        );
1696        assert_eq!(
1697            Normalizer::normalize_phone_e164("+370 612 34567", None),
1698            Some("+37061234567".to_string()),
1699        );
1700    }
1701
1702    #[test]
1703    fn e164_greece_has_no_national_trunk_prefix() {
1704        // GR national-significant numbers begin with the area code (the
1705        // leading zero seen in older publications is no longer a trunk).
1706        assert_eq!(
1707            Normalizer::normalize_phone_e164("+30 210 123 4567", None),
1708            Some("+302101234567".to_string()),
1709        );
1710        assert_eq!(
1711            Normalizer::normalize_phone_e164("210 123 4567", Some("GR")),
1712            Some("+302101234567".to_string()),
1713        );
1714    }
1715
1716    #[test]
1717    fn e164_romania_strips_trunk_zero() {
1718        assert_eq!(
1719            Normalizer::normalize_phone_e164("0721 234 567", Some("RO")),
1720            Some("+40721234567".to_string()),
1721        );
1722        assert_eq!(
1723            Normalizer::normalize_phone_e164("+40 721 234 567", None),
1724            Some("+40721234567".to_string()),
1725        );
1726    }
1727
1728    #[test]
1729    fn e164_czech_no_trunk_prefix() {
1730        assert_eq!(
1731            Normalizer::normalize_phone_e164("+420 234 567 890", None),
1732            Some("+420234567890".to_string()),
1733        );
1734        assert_eq!(
1735            Normalizer::normalize_phone_e164("234 567 890", Some("CZ")),
1736            Some("+420234567890".to_string()),
1737        );
1738    }
1739
1740    #[test]
1741    fn e164_iceland_seven_digit_nsn() {
1742        assert_eq!(
1743            Normalizer::normalize_phone_e164("+354 412 3456", None),
1744            Some("+3544123456".to_string()),
1745        );
1746    }
1747
1748    #[test]
1749    fn e164_distinguishes_overlapping_three_digit_dial_codes() {
1750        // Croatia (385) vs Slovenia (386): adjacent dial codes, both
1751        // use a `0` trunk, but the canonical E.164 forms remain distinct.
1752        let hr = Normalizer::normalize_phone_e164("+385 91 234 5678", None);
1753        let si = Normalizer::normalize_phone_e164("+386 41 234 567", None);
1754        assert!(hr.is_some());
1755        assert!(si.is_some());
1756        assert_ne!(hr, si);
1757    }
1758
1759    #[test]
1760    fn e164_distinguishes_countries_with_overlapping_national_digits() {
1761        // The "same" national-format digits in two countries must yield
1762        // different E.164 strings — this is precisely the disambiguation
1763        // the new normaliser provides.
1764        let uk = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
1765        let fr = Normalizer::normalize_phone_e164("07700 90012", Some("FR"));
1766        assert!(uk.is_some());
1767        assert!(fr.is_some());
1768        assert_ne!(uk, fr);
1769    }
1770
1771    #[test]
1772    fn e164_returns_none_when_default_country_missing_and_no_marker() {
1773        // Ambiguous national-format input with no default country.
1774        assert_eq!(Normalizer::normalize_phone_e164("07700 900123", None), None);
1775    }
1776
1777    #[test]
1778    fn e164_returns_none_for_unknown_dial_code() {
1779        assert_eq!(Normalizer::normalize_phone_e164("+999 1234567", None), None);
1780    }
1781
1782    #[test]
1783    fn e164_returns_none_for_empty_or_punctuation_only() {
1784        assert_eq!(Normalizer::normalize_phone_e164("", Some("GB")), None);
1785        assert_eq!(Normalizer::normalize_phone_e164("+", Some("GB")), None);
1786        assert_eq!(Normalizer::normalize_phone_e164("()-", Some("GB")), None);
1787    }
1788
1789    #[test]
1790    fn e164_returns_none_for_too_short_or_too_long_nsn() {
1791        // GB NSN must be 7..=11 digits.
1792        assert_eq!(Normalizer::normalize_phone_e164("+44 1", None), None);
1793        assert_eq!(
1794            Normalizer::normalize_phone_e164("+44 123456789012345", None),
1795            None,
1796        );
1797    }
1798
1799    #[test]
1800    fn e164_rejects_unknown_default_country() {
1801        // "XX" is not in the table; without an explicit international
1802        // marker the function cannot guess.
1803        assert_eq!(
1804            Normalizer::normalize_phone_e164("07700 900123", Some("XX")),
1805            None,
1806        );
1807    }
1808
1809    #[test]
1810    fn e164_is_idempotent_on_canonical_form() {
1811        for input in [
1812            "+44 7700 900123",
1813            "+33 1 23 45 67 89",
1814            "(415) 555-1234",
1815            "+353 1 234 5678",
1816            "+34 912 345 678",
1817        ] {
1818            let once = Normalizer::normalize_phone_e164(input, Some("GB")).expect("parses");
1819            let twice = Normalizer::normalize_phone_e164(&once, Some("GB")).expect("idempotent");
1820            assert_eq!(once, twice, "not idempotent for {input:?}");
1821        }
1822    }
1823
1824    #[test]
1825    fn e164_default_country_lookup_is_case_insensitive() {
1826        let lower = Normalizer::normalize_phone_e164("07700 900123", Some("gb"));
1827        let upper = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
1828        assert_eq!(lower, upper);
1829        assert!(lower.is_some());
1830    }
1831
1832    #[test]
1833    fn e164_handles_double_zero_international_access_form() {
1834        assert_eq!(
1835            Normalizer::normalize_phone_e164("00 33 1 23 45 67 89", None),
1836            Some("+33123456789".to_string()),
1837        );
1838    }
1839
1840    // ---------- expand_street_abbreviations ----------
1841
1842    #[test]
1843    fn expand_street_replaces_common_abbreviations() {
1844        assert_eq!(
1845            Normalizer::expand_street_abbreviations("123 High St"),
1846            "123 High street",
1847        );
1848        assert_eq!(
1849            Normalizer::expand_street_abbreviations("10 Downing Rd"),
1850            "10 Downing road",
1851        );
1852        assert_eq!(
1853            Normalizer::expand_street_abbreviations("12 Sunset Blvd"),
1854            "12 Sunset boulevard",
1855        );
1856        assert_eq!(
1857            Normalizer::expand_street_abbreviations("1 Park Ave"),
1858            "1 Park avenue",
1859        );
1860        assert_eq!(
1861            Normalizer::expand_street_abbreviations("5 Cherry Ln"),
1862            "5 Cherry lane",
1863        );
1864    }
1865
1866    #[test]
1867    fn expand_street_replaces_directionals() {
1868        assert_eq!(
1869            Normalizer::expand_street_abbreviations("45 N Park Ave"),
1870            "45 north Park avenue",
1871        );
1872        assert_eq!(
1873            Normalizer::expand_street_abbreviations("100 SW 5th St"),
1874            "100 southwest 5th street",
1875        );
1876    }
1877
1878    #[test]
1879    fn expand_street_strips_trailing_period_or_comma() {
1880        assert_eq!(
1881            Normalizer::expand_street_abbreviations("123 High St."),
1882            "123 High street",
1883        );
1884        assert_eq!(
1885            Normalizer::expand_street_abbreviations("12 Sunset Blvd,"),
1886            "12 Sunset boulevard",
1887        );
1888    }
1889
1890    #[test]
1891    fn expand_street_passes_unknown_tokens_through() {
1892        assert_eq!(
1893            Normalizer::expand_street_abbreviations("Buckingham Palace"),
1894            "Buckingham Palace",
1895        );
1896    }
1897
1898    #[test]
1899    fn expand_street_is_idempotent_on_already_expanded_input() {
1900        for input in [
1901            "123 High St",
1902            "45 N Park Ave",
1903            "10 Downing Rd",
1904            "Buckingham Palace",
1905        ] {
1906            let once = Normalizer::expand_street_abbreviations(input);
1907            let twice = Normalizer::expand_street_abbreviations(&once);
1908            assert_eq!(once, twice, "not idempotent for {input:?}");
1909        }
1910    }
1911
1912    #[test]
1913    fn expand_street_handles_empty_and_whitespace_only() {
1914        assert_eq!(Normalizer::expand_street_abbreviations(""), "");
1915        assert_eq!(Normalizer::expand_street_abbreviations("   "), "");
1916    }
1917
1918    // ---------- normalize_address_line ----------
1919
1920    #[test]
1921    fn normalize_address_line_unifies_abbreviated_and_full_forms() {
1922        assert_eq!(
1923            Normalizer::normalize_address_line("123 High St"),
1924            Normalizer::normalize_address_line("123 High Street"),
1925        );
1926        assert_eq!(
1927            Normalizer::normalize_address_line("45 N Park Ave"),
1928            Normalizer::normalize_address_line("45 North Park Avenue"),
1929        );
1930    }
1931
1932    #[test]
1933    fn normalize_address_line_handles_punctuation_and_case() {
1934        assert_eq!(
1935            Normalizer::normalize_address_line("10, DOWNING Street."),
1936            "10 downing street",
1937        );
1938    }
1939
1940    #[test]
1941    fn normalize_address_line_is_idempotent() {
1942        for input in [
1943            "123 High St",
1944            "  45 N Park Ave  ",
1945            "10, Downing Street.",
1946            "",
1947        ] {
1948            let once = Normalizer::normalize_address_line(input);
1949            let twice = Normalizer::normalize_address_line(&once);
1950            assert_eq!(once, twice, "not idempotent for {input:?}");
1951        }
1952    }
1953
1954    // ---------- parse_address_line ----------
1955
1956    #[test]
1957    fn parse_address_extracts_simple_house_number() {
1958        let p = Normalizer::parse_address_line("123 High Street");
1959        assert_eq!(p.house_number.as_deref(), Some("123"));
1960        assert_eq!(p.unit, None);
1961        assert_eq!(p.street, "high street");
1962    }
1963
1964    #[test]
1965    fn parse_address_handles_alphanumeric_house_number() {
1966        let p = Normalizer::parse_address_line("10A Downing St");
1967        assert_eq!(p.house_number.as_deref(), Some("10A"));
1968        assert_eq!(p.street, "downing street");
1969    }
1970
1971    #[test]
1972    fn parse_address_does_not_greedily_consume_street_name() {
1973        // "10 Apple Tree Lane" — `Apple` must not be absorbed into the
1974        // house number because two consecutive alphabetic characters
1975        // signal it's part of the street name.
1976        let p = Normalizer::parse_address_line("10 Apple Tree Lane");
1977        assert_eq!(p.house_number.as_deref(), Some("10"));
1978        assert_eq!(p.street, "apple tree lane");
1979    }
1980
1981    #[test]
1982    fn parse_address_recognises_flat_prefix() {
1983        let p = Normalizer::parse_address_line("Flat 2A, 10 Downing Street");
1984        assert_eq!(p.unit.as_deref(), Some("flat 2a"));
1985        assert_eq!(p.house_number.as_deref(), Some("10"));
1986        assert_eq!(p.street, "downing street");
1987    }
1988
1989    #[test]
1990    fn parse_address_recognises_apt_prefix() {
1991        let p = Normalizer::parse_address_line("Apt 5, 1600 Pennsylvania Ave");
1992        assert_eq!(p.unit.as_deref(), Some("apt 5"));
1993        assert_eq!(p.house_number.as_deref(), Some("1600"));
1994        assert_eq!(p.street, "pennsylvania avenue");
1995    }
1996
1997    #[test]
1998    fn parse_address_recognises_suite_and_ste_and_unit_and_room() {
1999        for input in [
2000            "Suite 12, 100 Main St",
2001            "Ste 12, 100 Main St",
2002            "Unit 12, 100 Main St",
2003            "Room 12, 100 Main St",
2004        ] {
2005            let p = Normalizer::parse_address_line(input);
2006            assert!(p.unit.is_some(), "no unit for {input:?}");
2007            assert_eq!(p.house_number.as_deref(), Some("100"));
2008            assert_eq!(p.street, "main street");
2009        }
2010    }
2011
2012    #[test]
2013    fn parse_address_no_leading_number_falls_back_to_street_only() {
2014        let p = Normalizer::parse_address_line("Buckingham Palace");
2015        assert_eq!(p.house_number, None);
2016        assert_eq!(p.unit, None);
2017        assert_eq!(p.street, "buckingham palace");
2018    }
2019
2020    #[test]
2021    fn parse_address_empty_input_yields_empty_street() {
2022        let p = Normalizer::parse_address_line("");
2023        assert_eq!(p.house_number, None);
2024        assert_eq!(p.unit, None);
2025        assert_eq!(p.street, "");
2026    }
2027
2028    #[test]
2029    fn parse_address_round_trips_through_serde() {
2030        let p = Normalizer::parse_address_line("Flat 2A, 10A Downing Street");
2031        let json = serde_json::to_string(&p).unwrap();
2032        let back: ParsedAddressLine = serde_json::from_str(&json).unwrap();
2033        assert_eq!(p, back);
2034    }
2035
2036    #[test]
2037    fn parse_address_uppercases_house_number_suffix() {
2038        let p = Normalizer::parse_address_line("10a Downing St");
2039        assert_eq!(p.house_number.as_deref(), Some("10A"));
2040    }
2041
2042    // ---------- normalize_email ----------
2043
2044    #[test]
2045    fn normalize_email_lowercases_and_trims() {
2046        assert_eq!(
2047            Normalizer::normalize_email("  Alice@Example.ORG  ", false),
2048            Some("alice@example.org".into()),
2049        );
2050    }
2051
2052    #[test]
2053    fn normalize_email_preserves_well_formed_input() {
2054        assert_eq!(
2055            Normalizer::normalize_email("alice@example.org", false),
2056            Some("alice@example.org".into()),
2057        );
2058    }
2059
2060    #[test]
2061    fn normalize_email_rejects_missing_at_sign() {
2062        assert_eq!(Normalizer::normalize_email("no-at-sign", false), None);
2063    }
2064
2065    #[test]
2066    fn normalize_email_rejects_empty_localpart_or_domain() {
2067        assert_eq!(Normalizer::normalize_email("@example.org", false), None);
2068        assert_eq!(Normalizer::normalize_email("alice@", false), None);
2069    }
2070
2071    #[test]
2072    fn normalize_email_rejects_multiple_at_signs() {
2073        assert_eq!(Normalizer::normalize_email("a@b@c", false), None);
2074    }
2075
2076    #[test]
2077    fn normalize_email_rejects_empty_and_whitespace() {
2078        assert_eq!(Normalizer::normalize_email("", false), None);
2079        assert_eq!(Normalizer::normalize_email("   ", false), None);
2080    }
2081
2082    #[test]
2083    fn normalize_email_gmail_dot_folding_strips_dots_in_localpart() {
2084        assert_eq!(
2085            Normalizer::normalize_email("j.smith@gmail.com", true),
2086            Some("jsmith@gmail.com".into()),
2087        );
2088        assert_eq!(
2089            Normalizer::normalize_email("J.S.M.I.T.H@gmail.com", true),
2090            Some("jsmith@gmail.com".into()),
2091        );
2092    }
2093
2094    #[test]
2095    fn normalize_email_gmail_dot_folding_strips_plus_tag() {
2096        assert_eq!(
2097            Normalizer::normalize_email("jsmith+work@gmail.com", true),
2098            Some("jsmith@gmail.com".into()),
2099        );
2100        assert_eq!(
2101            Normalizer::normalize_email("j.smith+anything@googlemail.com", true),
2102            Some("jsmith@googlemail.com".into()),
2103        );
2104    }
2105
2106    #[test]
2107    fn normalize_email_gmail_dot_folding_does_not_touch_other_domains() {
2108        assert_eq!(
2109            Normalizer::normalize_email("j.smith@example.org", true),
2110            Some("j.smith@example.org".into()),
2111        );
2112        assert_eq!(
2113            Normalizer::normalize_email("jsmith+work@example.org", true),
2114            Some("jsmith+work@example.org".into()),
2115        );
2116    }
2117
2118    #[test]
2119    fn normalize_email_dot_folding_off_preserves_localpart_dots() {
2120        assert_eq!(
2121            Normalizer::normalize_email("j.smith@gmail.com", false),
2122            Some("j.smith@gmail.com".into()),
2123        );
2124    }
2125
2126    #[test]
2127    fn normalize_email_is_idempotent_on_canonical_form() {
2128        for (input, fold) in [
2129            ("Alice@Example.ORG", false),
2130            ("j.smith@gmail.com", true),
2131            ("jsmith+x@gmail.com", true),
2132            ("user@host.tld", false),
2133        ] {
2134            let once = Normalizer::normalize_email(input, fold).expect("parses");
2135            let twice = Normalizer::normalize_email(&once, fold).expect("idempotent");
2136            assert_eq!(once, twice, "not idempotent for {input:?} fold={fold}");
2137        }
2138    }
2139
2140    #[test]
2141    fn normalize_email_gmail_dot_folding_rejects_empty_localpart_after_folding() {
2142        // A localpart that is entirely dots (or all stripped to empty) is
2143        // not a valid address.
2144        assert_eq!(Normalizer::normalize_email("...@gmail.com", true), None);
2145    }
2146
2147    #[test]
2148    fn parse_address_does_not_treat_st_as_unit_prefix() {
2149        // The "St" street-type abbreviation must not be confused with the
2150        // "Ste" unit prefix. Only a literal "ste" token triggers the unit
2151        // path.
2152        let p = Normalizer::parse_address_line("St Mary's Road");
2153        assert_eq!(p.unit, None);
2154    }
2155
2156    #[test]
2157    fn e164_strips_trunk_zero_after_country_code() {
2158        // Some entry systems mistakenly keep the national trunk 0 after
2159        // the country code (e.g. "+44 0 7700 900123"). The normaliser
2160        // tolerates this.
2161        assert_eq!(
2162            Normalizer::normalize_phone_e164("+44 0 7700 900123", None),
2163            Some("+447700900123".to_string()),
2164        );
2165    }
2166}
event_matcher/normalizer.rs

event_matcher/
normalizer.rs