event_matcher/normalizer.rs
1//! Text normalisation for event records.
2//!
3//! Research on entity identification (see `spec.md` §5) is unanimous: most
4//! accuracy gains come from **standardising the input** before scoring, not
5//! from cleverer similarity algorithms. This module exposes the canonical
6//! transformations the matching engine applies to names, postcodes,
7//! addresses, phonetic codes, and ISO 8601 date-times.
8//!
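//! The string normalisers are designed to be **idempotent** on their own
//! canonical output: `f(f(x)) == f(x)` (each function documents its exact
//! guarantee). They are also **deterministic**. Note that some of them
//! allocate intermediate buffers in addition to the returned `String`.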
11//!
12//! ## Quick examples
13//!
14//! ```
15//! use event_matcher::Normalizer;
16//!
17//! // Names: lowercase, drop diacritics, drop ASCII punctuation, collapse spaces.
18//! assert_eq!(Normalizer::normalize_name(" O'Brien "), "obrien");
19//! assert_eq!(Normalizer::normalize_name("Siân"), "sian");
20//!
21//! // Postcodes: strip whitespace, uppercase.
22//! assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
23//!
24//! // Phone numbers: keep digits, strip international and trunk prefixes.
25//! assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
26//! ```
27//!
28//! ## What this module deliberately does *not* do
29//!
30//! - It does not handle non-ASCII punctuation such as the curly apostrophe
31//! `’` (U+2019). Upstream code should convert those to ASCII first.
32//!
33//! ## International phone numbers
34//!
35//! Two phone normalisers are provided:
36//!
//! - [`Normalizer::normalize_phone`] — UK-centric national-significant form,
//!   suitable for legacy or single-jurisdiction call-sites. Infallible, and
//!   idempotent on canonical inputs.
40//! - [`Normalizer::normalize_phone_e164`] — international-aware E.164 form
41//! (`+CCNNNN…`) for jurisdictions in the supported country table. Returns
42//! `None` if the input cannot be confidently parsed.
43//!
44//! The matching engine tries E.164 first and falls back to the legacy form
45//! when either input is unparseable, so existing single-country deployments
46//! observe the same behaviour while multinational deployments gain
47//! cross-country disambiguation (a French number and a UK number that share
48//! the same trunk digits no longer collide).
49
50use serde::{Deserialize, Serialize};
51use unicode_normalization::UnicodeNormalization;
52
/// Stateless namespace for the crate's text-normalisation routines.
///
/// `Normalizer` is a field-less unit type and every routine on it is an
/// associated function, so no instance is ever required. It exists as a type
/// (rather than as a module of free functions) purely so the public API has
/// a single, discoverable entry point.
///
/// The common marker traits are derived so the type meets the usual
/// expectations for a public type: it can be printed with `{:?}`, copied,
/// and built through `Default` when embedded in generic code.
///
/// ```
/// use event_matcher::Normalizer;
///
/// let canonical = Normalizer::normalize_name("José-María");
/// assert_eq!(canonical, "josemaria");
/// ```
#[derive(Debug, Clone, Copy, Default)]
pub struct Normalizer;
66
67impl Normalizer {
68 /// Normalise a human name for comparison.
69 ///
70 /// Steps, in order:
71 ///
72 /// 1. Decompose to Unicode NFKD form (`é` → `e` + combining acute).
73 /// 2. Drop combining marks (diacritics).
74 /// 3. Drop ASCII punctuation (apostrophes, hyphens, full stops, …).
75 /// 4. Lowercase.
76 /// 5. Collapse consecutive whitespace to single ASCII spaces; trim ends.
77 ///
78 /// The result is suitable for direct equality comparison or for feeding
79 /// into a string-similarity scorer.
80 ///
81 /// # Examples
82 ///
83 /// Whitespace is collapsed and trimmed:
84 ///
85 /// ```
86 /// use event_matcher::Normalizer;
87 /// assert_eq!(Normalizer::normalize_name(" John Smith "), "john smith");
88 /// ```
89 ///
90 /// Apostrophes and hyphens are stripped:
91 ///
92 /// ```
93 /// # use event_matcher::Normalizer;
94 /// assert_eq!(Normalizer::normalize_name("O'Brien"), "obrien");
95 /// assert_eq!(Normalizer::normalize_name("MARY-JANE"), "maryjane");
96 /// ```
97 ///
98 /// Diacritics are removed:
99 ///
100 /// ```
101 /// # use event_matcher::Normalizer;
102 /// assert_eq!(Normalizer::normalize_name("José"), "jose");
103 /// assert_eq!(Normalizer::normalize_name("Siân"), "sian");
104 /// assert_eq!(Normalizer::normalize_name("Łukasz"), "łukasz"); // ł has no decomposition
105 /// ```
106 ///
107 /// Empty and whitespace-only input round-trip cleanly:
108 ///
109 /// ```
110 /// # use event_matcher::Normalizer;
111 /// assert_eq!(Normalizer::normalize_name(""), "");
112 /// assert_eq!(Normalizer::normalize_name(" "), "");
113 /// ```
114 ///
115 /// The function is **idempotent**:
116 ///
117 /// ```
118 /// # use event_matcher::Normalizer;
119 /// let once = Normalizer::normalize_name(" José-María ");
120 /// let twice = Normalizer::normalize_name(&once);
121 /// assert_eq!(once, twice);
122 /// ```
123 #[must_use]
124 pub fn normalize_name(name: &str) -> String {
125 name.nfkd()
126 .filter(|c| !unicode_normalization::char::is_combining_mark(*c))
127 .filter(|c| !c.is_ascii_punctuation())
128 .collect::<String>()
129 .to_lowercase()
130 .split_whitespace()
131 .collect::<Vec<_>>()
132 .join(" ")
133 }
134
/// Canonicalise a postcode so differently formatted copies compare equal.
///
/// Every whitespace character is removed and the remainder is uppercased.
/// No locale-specific validation is attempted — that is intentionally out
/// of scope.
///
/// # Examples
///
/// UK postcodes with and without the conventional space are equivalent:
///
/// ```
/// use event_matcher::Normalizer;
/// assert_eq!(Normalizer::normalize_postcode("CF10 1AA"), "CF101AA");
/// assert_eq!(Normalizer::normalize_postcode("cf101aa"), "CF101AA");
/// assert_eq!(Normalizer::normalize_postcode(" cf10 1aa "), "CF101AA");
/// ```
///
/// Empty input stays empty:
///
/// ```
/// # use event_matcher::Normalizer;
/// assert_eq!(Normalizer::normalize_postcode(""), "");
/// ```
///
/// Idempotent:
///
/// ```
/// # use event_matcher::Normalizer;
/// let once = Normalizer::normalize_postcode("sw1a 2aa");
/// let twice = Normalizer::normalize_postcode(&once);
/// assert_eq!(once, twice);
/// ```
#[must_use]
pub fn normalize_postcode(postcode: &str) -> String {
    // Concatenating the whitespace-separated pieces drops all whitespace.
    let compact: String = postcode.split_whitespace().collect();
    compact.to_uppercase()
}
174
/// Normalise a phone number to a UK-centric national-significant form.
///
/// Steps:
///
/// 1. Keep only ASCII digits (spaces, brackets, hyphens, `+`, … are
///    dropped).
/// 2. Remove a UK international prefix, if present: a leading `0044`
///    (provided digits follow it), or a leading `44` when the number is at
///    least 12 digits long.
/// 3. Remove a single leading trunk `0`, provided at least one digit
///    remains. This applies whether or not step 2 removed anything, so the
///    widespread `+44 (0)…` layout canonicalises like every other UK
///    layout.
///
/// The result is the subscriber number with no leading prefix. Numbers from
/// other countries pass through with only step 1 (and, if they start with
/// `0`, step 3) applied.
///
/// # Examples
///
/// ```
/// use event_matcher::Normalizer;
///
/// // UK mobile, in four formats:
/// assert_eq!(Normalizer::normalize_phone("07700 900123"), "7700900123");
/// assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
/// assert_eq!(Normalizer::normalize_phone("0044 7700 900123"), "7700900123");
/// assert_eq!(Normalizer::normalize_phone("+44 (0)7700 900123"), "7700900123");
///
/// // UK landline with brackets and spaces:
/// assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
///
/// // Empty input is preserved (no digits to keep):
/// assert_eq!(Normalizer::normalize_phone(""), "");
/// ```
///
/// Idempotent on canonical inputs:
///
/// ```
/// # use event_matcher::Normalizer;
/// let once = Normalizer::normalize_phone("07700 900123");
/// let twice = Normalizer::normalize_phone(&once);
/// assert_eq!(once, twice);
/// ```
#[must_use]
pub fn normalize_phone(phone: &str) -> String {
    let digits: String = phone.chars().filter(char::is_ascii_digit).collect();

    // Step 2: peel off the UK international prefix. A bare "0044" with
    // nothing after it is not treated as a prefix, and "44" only counts when
    // the number is long enough to be country code + subscriber number.
    let national = match digits.strip_prefix("0044") {
        Some(rest) if !rest.is_empty() => rest,
        _ if digits.len() >= 12 && digits.starts_with("44") => &digits[2..],
        _ => digits.as_str(),
    };

    // Step 3: drop one trunk `0`, but never reduce the number to nothing.
    match national.strip_prefix('0') {
        Some(rest) if !rest.is_empty() => rest.to_owned(),
        _ => national.to_owned(),
    }
}
233
234 /// Normalise a phone number to its E.164-style canonical form.
235 ///
236 /// E.164 is the ITU-T standard for international telephone numbers and
237 /// has the shape `+CCNNN…`, where `CC` is the country dialling code
238 /// (1–3 digits) and the remainder is the national-significant number
239 /// (NSN) with no trunk prefix.
240 ///
241 /// The function accepts a wide range of textual layouts:
242 ///
243 /// - `+CC…` (explicit international, the canonical input form).
244 /// - `00CC…` (international access code, common across Europe).
245 /// - `0…` (national format, trunk-prefix) — interpreted relative to
246 /// `default_country` when the country uses a national trunk `0`.
247 /// - `NSN…` (bare national-significant number) — interpreted relative
248 /// to `default_country`.
249 ///
250 /// Returns `Some(canonical)` if the input parses against a country in
251 /// the supported table; otherwise `None`. The supported countries are
252 /// the five jurisdictions for which the crate exposes a national
253 /// healthcare identifier (United Kingdom, France, Spain, Ireland, and
254 /// — sharing the GB dial code — UK Northern Ireland), plus the most
255 /// common international partners (US, CA, DE, IT, NL, BE, PT, CH,
256 /// AT, SE, NO, DK, FI, PL, AU, NZ, JP, CN, IN, BR, MX, ZA). `default_country` is the
257 /// **ISO 3166-1 alpha-2 code** (e.g. `"GB"`, `"FR"`, `"US"`) of the
258 /// jurisdiction whose national format applies when the input lacks an
259 /// explicit international marker. Pass `None` to refuse to assume a
260 /// default — only explicit `+CC` / `00CC` inputs will parse.
261 ///
262 /// The function is **deterministic** and **idempotent**: feeding a
263 /// canonical `+CCNNN…` string back in returns the same string.
264 ///
265 /// # Examples
266 ///
267 /// UK mobile, three textual layouts, all canonicalise to the same E.164 form:
268 ///
269 /// ```
270 /// use event_matcher::Normalizer;
271 /// assert_eq!(
272 /// Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")),
273 /// Some("+447700900123".to_string()),
274 /// );
275 /// assert_eq!(
276 /// Normalizer::normalize_phone_e164("0044 7700 900123", Some("GB")),
277 /// Some("+447700900123".to_string()),
278 /// );
279 /// assert_eq!(
280 /// Normalizer::normalize_phone_e164("07700 900123", Some("GB")),
281 /// Some("+447700900123".to_string()),
282 /// );
283 /// ```
284 ///
285 /// French national format vs international form:
286 ///
287 /// ```
288 /// # use event_matcher::Normalizer;
289 /// assert_eq!(
290 /// Normalizer::normalize_phone_e164("01 23 45 67 89", Some("FR")),
291 /// Some("+33123456789".to_string()),
292 /// );
293 /// assert_eq!(
294 /// Normalizer::normalize_phone_e164("+33 1 23 45 67 89", Some("GB")),
295 /// Some("+33123456789".to_string()),
296 /// );
297 /// ```
298 ///
299 /// North American (NANP) numbers have no trunk prefix:
300 ///
301 /// ```
302 /// # use event_matcher::Normalizer;
303 /// assert_eq!(
304 /// Normalizer::normalize_phone_e164("(415) 555-1234", Some("US")),
305 /// Some("+14155551234".to_string()),
306 /// );
307 /// assert_eq!(
308 /// Normalizer::normalize_phone_e164("+1 415 555 1234", None),
309 /// Some("+14155551234".to_string()),
310 /// );
311 /// ```
312 ///
313 /// Unparseable or ambiguous inputs return `None`:
314 ///
315 /// ```
316 /// # use event_matcher::Normalizer;
317 /// // No default country and no international marker: ambiguous.
318 /// assert_eq!(Normalizer::normalize_phone_e164("07700 900123", None), None);
319 /// // Unknown dial code.
320 /// assert_eq!(Normalizer::normalize_phone_e164("+999 1234567", None), None);
321 /// // Empty input.
322 /// assert_eq!(Normalizer::normalize_phone_e164("", Some("GB")), None);
323 /// ```
324 ///
325 /// Idempotent on canonical inputs:
326 ///
327 /// ```
328 /// # use event_matcher::Normalizer;
329 /// let once = Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")).unwrap();
330 /// let twice = Normalizer::normalize_phone_e164(&once, Some("GB")).unwrap();
331 /// assert_eq!(once, twice);
332 /// ```
333 #[must_use]
334 pub fn normalize_phone_e164(phone: &str, default_country: Option<&str>) -> Option<String> {
335 let has_plus = phone.chars().any(|c| c == '+');
336 let digits: String = phone.chars().filter(char::is_ascii_digit).collect();
337 if digits.is_empty() {
338 return None;
339 }
340
341 let (info, nsn): (&CountryPhoneInfo, String) = if has_plus {
342 let info = lookup_by_dial_code_prefix(&digits)?;
343 let rest = &digits[info.dial_code.len()..];
344 let rest = strip_trunk_prefix(info, rest);
345 (info, rest.to_string())
346 } else if let Some(stripped) = digits.strip_prefix("00") {
347 let info = lookup_by_dial_code_prefix(stripped)?;
348 let rest = &stripped[info.dial_code.len()..];
349 let rest = strip_trunk_prefix(info, rest);
350 (info, rest.to_string())
351 } else {
352 let iso = default_country?;
353 let info = lookup_by_iso(iso)?;
354 let nsn = strip_trunk_prefix(info, &digits);
355 (info, nsn.to_string())
356 };
357
358 if nsn.len() < info.min_nsn || nsn.len() > info.max_nsn {
359 return None;
360 }
361
362 Some(format!("+{}{}", info.dial_code, nsn))
363 }
364
365 /// Expand common postal address abbreviations as whole tokens.
366 ///
367 /// The input is tokenised on whitespace and each token is matched
368 /// case-insensitively (after stripping a single trailing `.` or `,`)
369 /// against a fixed table of street-type and directional abbreviations.
370 /// Recognised tokens are replaced with their long form, lowercased;
371 /// unrecognised tokens are passed through verbatim. Tokens are then
372 /// re-joined by single spaces.
373 ///
374 /// This function is intentionally simple: it does **not** apply any
375 /// position-aware heuristics. The well-known ambiguous case `"St"` —
376 /// which can mean *Street* or *Saint* — is always expanded to
377 /// *Street*. In practice this remains useful for fuzzy matching
378 /// because the canonical form is consistent on both sides of a
379 /// comparison; pre-process upstream if you need finer disambiguation.
380 ///
381 /// # Examples
382 ///
383 /// ```
384 /// use event_matcher::Normalizer;
385 /// assert_eq!(
386 /// Normalizer::expand_street_abbreviations("123 High St"),
387 /// "123 High street",
388 /// );
389 /// assert_eq!(
390 /// Normalizer::expand_street_abbreviations("45 N. Park Ave."),
391 /// "45 north Park avenue",
392 /// );
393 /// assert_eq!(
394 /// Normalizer::expand_street_abbreviations("12 Sunset Blvd"),
395 /// "12 Sunset boulevard",
396 /// );
397 /// ```
398 ///
399 /// Idempotent on already-expanded inputs (long forms are not
400 /// re-expanded):
401 ///
402 /// ```
403 /// # use event_matcher::Normalizer;
404 /// let once = Normalizer::expand_street_abbreviations("10 Downing St");
405 /// let twice = Normalizer::expand_street_abbreviations(&once);
406 /// assert_eq!(once, twice);
407 /// ```
408 pub fn expand_street_abbreviations(line: &str) -> String {
409 line.split_whitespace()
410 .map(expand_one_token)
411 .collect::<Vec<_>>()
412 .join(" ")
413 }
414
415 /// Normalise an address line for comparison.
416 ///
417 /// Pipeline:
418 ///
419 /// 1. Expand street-type and directional abbreviations via
420 /// [`Normalizer::expand_street_abbreviations`] (so `"St" → "street"`,
421 /// `"Rd" → "road"`, `"N" → "north"`).
422 /// 2. Apply the name-normalisation pipeline
423 /// ([`Normalizer::normalize_name`]): NFKD-decompose, drop combining
424 /// marks, drop ASCII punctuation, lowercase, collapse whitespace.
425 ///
426 /// The result is idempotent and suitable for direct equality or
427 /// similarity comparison.
428 ///
429 /// # Examples
430 ///
431 /// Abbreviated and full forms canonicalise identically:
432 ///
433 /// ```
434 /// use event_matcher::Normalizer;
435 /// assert_eq!(
436 /// Normalizer::normalize_address_line("123 High St"),
437 /// Normalizer::normalize_address_line("123 High Street"),
438 /// );
439 /// assert_eq!(
440 /// Normalizer::normalize_address_line("45 N Park Ave"),
441 /// Normalizer::normalize_address_line("45 North Park Avenue"),
442 /// );
443 /// ```
444 ///
445 /// Punctuation and case are normalised:
446 ///
447 /// ```
448 /// # use event_matcher::Normalizer;
449 /// assert_eq!(
450 /// Normalizer::normalize_address_line("10, DOWNING Street."),
451 /// "10 downing street",
452 /// );
453 /// ```
454 #[must_use]
455 pub fn normalize_address_line(line: &str) -> String {
456 Self::normalize_name(&Self::expand_street_abbreviations(line))
457 }
458
459 /// Parse an address line into its structured components.
460 ///
461 /// The function performs a best-effort structural decomposition of a
462 /// single-line postal address into:
463 ///
464 /// - `house_number` — the leading run of digits (with an optional
465 /// single alphabetic suffix, e.g. `"10A"`), uppercased. `None` if
466 /// no leading number is present.
467 /// - `unit` — a recognised sub-unit prefix (`Flat`, `Apt`,
468 /// `Apartment`, `Unit`, `Suite`, `Ste`) and its identifier,
469 /// lowercased and space-joined (e.g. `"flat 2a"`). `None` if no
470 /// recognised prefix is present.
471 /// - `street` — the remaining text after `unit` and `house_number`
472 /// are removed, run through [`Normalizer::normalize_address_line`].
473 ///
474 /// Parsing is **deterministic** and **format-only** — no postal
475 /// reference is consulted. Inputs that do not match the simple
476 /// regular structure (e.g. a postcode-only string, a city name)
477 /// degrade gracefully: `house_number` and `unit` are `None`, and
478 /// `street` carries the normalised input.
479 ///
480 /// # Examples
481 ///
482 /// Typical UK / US single-line addresses:
483 ///
484 /// ```
485 /// use event_matcher::Normalizer;
486 ///
487 /// let p = Normalizer::parse_address_line("123 High Street");
488 /// assert_eq!(p.house_number.as_deref(), Some("123"));
489 /// assert_eq!(p.unit, None);
490 /// assert_eq!(p.street, "high street");
491 ///
492 /// let p = Normalizer::parse_address_line("10A Downing St");
493 /// assert_eq!(p.house_number.as_deref(), Some("10A"));
494 /// assert_eq!(p.street, "downing street");
495 ///
496 /// let p = Normalizer::parse_address_line("Flat 2A, 10 Downing Street");
497 /// assert_eq!(p.unit.as_deref(), Some("flat 2a"));
498 /// assert_eq!(p.house_number.as_deref(), Some("10"));
499 /// assert_eq!(p.street, "downing street");
500 ///
501 /// let p = Normalizer::parse_address_line("Apt 5, 1600 Pennsylvania Ave");
502 /// assert_eq!(p.unit.as_deref(), Some("apt 5"));
503 /// assert_eq!(p.house_number.as_deref(), Some("1600"));
504 /// assert_eq!(p.street, "pennsylvania avenue");
505 /// ```
506 ///
507 /// Inputs without a leading number still parse:
508 ///
509 /// ```
510 /// # use event_matcher::Normalizer;
511 /// let p = Normalizer::parse_address_line("Buckingham Palace");
512 /// assert_eq!(p.house_number, None);
513 /// assert_eq!(p.unit, None);
514 /// assert_eq!(p.street, "buckingham palace");
515 /// ```
516 #[must_use]
517 pub fn parse_address_line(line: &str) -> ParsedAddressLine {
518 let trimmed = line.trim();
519 let (unit, after_unit) = extract_unit_prefix(trimmed);
520 let after_unit = after_unit.trim_start_matches([',', ' ', '\t']).trim();
521 let (house_number, after_number) = extract_house_number(after_unit);
522 let after_number = after_number.trim_start_matches([',', ' ', '\t']).trim();
523 ParsedAddressLine {
524 house_number,
525 unit,
526 street: Self::normalize_address_line(after_number),
527 }
528 }
529
530 /// Compute a phonetic (Soundex) code for a name.
531 ///
532 /// Internally, the input is first normalised via
533 /// [`Normalizer::normalize_name`] and then encoded with the American
534 /// Soundex algorithm. Names that sound alike map to the same code, which
535 /// lets the matcher catch spelling variants such as "Smith" / "Smyth" or
536 /// "Stephen" / "Steven".
537 ///
538 /// The implementation is suitable for English-language names. Non-English
539 /// phonemes may be lost. T-9 (spec §21.4) decided to keep Soundex as the
540 /// default and expose an opt-in `MatchConfig::phonetic_encoder` enum
541 /// (Double Metaphone, Daitch-Mokotoff) gated behind a Cargo feature flag
542 /// once an empirical multinational event corpus is available;
543 /// implementation is tracked as T-9.1.
544 ///
545 /// # Examples
546 ///
547 /// Similar-sounding spellings share a code:
548 ///
549 /// ```
550 /// use event_matcher::Normalizer;
551 /// assert_eq!(Normalizer::phonetic_code("Smith"), Normalizer::phonetic_code("Smyth"));
552 /// assert_eq!(Normalizer::phonetic_code("Stephen"), Normalizer::phonetic_code("Steven"));
553 /// ```
554 ///
555 /// Different families produce different codes:
556 ///
557 /// ```
558 /// # use event_matcher::Normalizer;
559 /// assert_ne!(Normalizer::phonetic_code("Jones"), Normalizer::phonetic_code("Smith"));
560 /// ```
561 ///
562 /// Empty input returns an empty string, not a default Soundex value:
563 ///
564 /// ```
565 /// # use event_matcher::Normalizer;
566 /// assert_eq!(Normalizer::phonetic_code(""), "");
567 /// assert_eq!(Normalizer::phonetic_code(" "), "");
568 /// ```
569 #[must_use]
570 pub fn phonetic_code(name: &str) -> String {
571 let normalized = Self::normalize_name(name);
572 if normalized.is_empty() {
573 return String::new();
574 }
575 soundex::american_soundex(&normalized)
576 }
577
/// Canonicalise an email address for comparison.
///
/// Steps:
///
/// 1. Trim surrounding whitespace.
/// 2. Lowercase the whole address (RFC 5321 makes the domain
///    case-insensitive and most real-world deployments treat the localpart
///    case-insensitively too; case-sensitive localparts are technically
///    legal but vanishingly rare in healthcare data).
/// 3. Return `None` unless the address contains exactly one `@` with a
///    non-empty localpart and a non-empty domain.
/// 4. If `gmail_dot_folding` is `true` and the domain is `gmail.com` or
///    `googlemail.com`, drop any `+tag` suffix from the localpart and strip
///    every `.` from what remains. Neither change alters where the mail is
///    delivered under Google's documented routing rules:
///    `j.smith@gmail.com`, `js.mith@gmail.com`, and `jsmith+work@gmail.com`
///    all reach the same mailbox as `jsmith@gmail.com`. If nothing is left
///    of the localpart after folding, `None` is returned.
///
/// The function is **deterministic** and **idempotent** on successful
/// outputs.
///
/// # Examples
///
/// Common case-and-whitespace normalisation:
///
/// ```
/// use event_matcher::Normalizer;
/// assert_eq!(
///     Normalizer::normalize_email(" Alice@Example.ORG ", false),
///     Some("alice@example.org".to_string()),
/// );
/// ```
///
/// Malformed inputs return `None`:
///
/// ```
/// # use event_matcher::Normalizer;
/// assert_eq!(Normalizer::normalize_email("no-at-sign", false), None);
/// assert_eq!(Normalizer::normalize_email("@example.org", false), None);
/// assert_eq!(Normalizer::normalize_email("alice@", false), None);
/// assert_eq!(Normalizer::normalize_email("a@b@c", false), None);
/// assert_eq!(Normalizer::normalize_email("", false), None);
/// ```
///
/// Optional Gmail dot-folding:
///
/// ```
/// # use event_matcher::Normalizer;
/// assert_eq!(
///     Normalizer::normalize_email("j.smith@gmail.com", true),
///     Some("jsmith@gmail.com".to_string()),
/// );
/// assert_eq!(
///     Normalizer::normalize_email("jsmith+work@googlemail.com", true),
///     Some("jsmith@googlemail.com".to_string()),
/// );
/// // Dot-folding does not touch non-Gmail addresses.
/// assert_eq!(
///     Normalizer::normalize_email("j.smith@example.org", true),
///     Some("j.smith@example.org".to_string()),
/// );
/// ```
///
/// Idempotent on canonical inputs:
///
/// ```
/// # use event_matcher::Normalizer;
/// let once = Normalizer::normalize_email("Alice@Example.ORG", false).unwrap();
/// let twice = Normalizer::normalize_email(&once, false).unwrap();
/// assert_eq!(once, twice);
/// ```
#[must_use]
pub fn normalize_email(email: &str, gmail_dot_folding: bool) -> Option<String> {
    let lowered = email.trim().to_lowercase();

    // Exactly one '@' means splitting on it yields exactly two pieces.
    let mut pieces = lowered.split('@');
    let (Some(local), Some(domain), None) = (pieces.next(), pieces.next(), pieces.next()) else {
        return None;
    };
    if local.is_empty() || domain.is_empty() {
        return None;
    }

    let fold = gmail_dot_folding && matches!(domain, "gmail.com" | "googlemail.com");
    if !fold {
        return Some(format!("{local}@{domain}"));
    }

    // Gmail folding: cut at the first '+', then remove every dot.
    let mailbox = local.split('+').next().unwrap_or(local).replace('.', "");
    if mailbox.is_empty() {
        None
    } else {
        Some(format!("{mailbox}@{domain}"))
    }
}
678
679 /// Parse an ISO 8601 / RFC 3339 date or date-time string and return
680 /// the number of seconds since the Unix epoch (`1970-01-01T00:00:00Z`).
681 ///
682 /// Accepted shapes:
683 ///
684 /// - **Date only**: `YYYY-MM-DD` — interpreted as `00:00:00 UTC`.
685 /// - **Date-time, naive**: `YYYY-MM-DDTHH:MM[:SS]` — interpreted as
686 /// `UTC`. Fractional seconds (`.NNN`) are accepted and truncated to
687 /// whole seconds.
688 /// - **Date-time, UTC marker**: `…Z` or `…+00:00`.
689 /// - **Date-time, fixed offset**: `…±HH:MM` (or `…±HHMM`).
690 ///
691 /// Returns `None` if the input does not fit any of these shapes or if
692 /// the components are out of range (month not in `1..=12`, day not in
693 /// `1..=31` for the calendar month, hour not in `0..=23`, minute not in
694 /// `0..=59`, second not in `0..=60` — leap seconds permitted).
695 ///
696 /// The function is **deterministic** and **idempotent under
697 /// canonicalisation**: distinct textual layouts that denote the same
698 /// instant (e.g. `2024-06-26T09:00:00Z` and `2024-06-26T11:00:00+02:00`)
699 /// return the same number.
700 ///
701 /// # Examples
702 ///
703 /// ```
704 /// use event_matcher::Normalizer;
705 ///
706 /// // Unix epoch.
707 /// assert_eq!(Normalizer::parse_iso8601_unix_seconds("1970-01-01T00:00:00Z"), Some(0));
708 /// // Same date with no time component anchors at midnight UTC.
709 /// assert_eq!(Normalizer::parse_iso8601_unix_seconds("1970-01-01"), Some(0));
710 /// // Two textual layouts for the same instant.
711 /// let a = Normalizer::parse_iso8601_unix_seconds("2024-06-26T09:00:00Z").unwrap();
712 /// let b = Normalizer::parse_iso8601_unix_seconds("2024-06-26T11:00:00+02:00").unwrap();
713 /// assert_eq!(a, b);
714 /// ```
715 ///
716 /// Junk input is rejected:
717 ///
718 /// ```
719 /// # use event_matcher::Normalizer;
720 /// assert!(Normalizer::parse_iso8601_unix_seconds("not a date").is_none());
721 /// assert!(Normalizer::parse_iso8601_unix_seconds("2024-13-01").is_none());
722 /// assert!(Normalizer::parse_iso8601_unix_seconds("2024-02-30").is_none());
723 /// ```
724 #[must_use]
725 pub fn parse_iso8601_unix_seconds(input: &str) -> Option<i64> {
726 let s = input.trim();
727 if s.len() < 10 {
728 return None;
729 }
730
731 // YYYY-MM-DD prefix is required.
732 let year: i64 = s.get(..4)?.parse().ok()?;
733 if s.as_bytes().get(4)? != &b'-' {
734 return None;
735 }
736 let month: u32 = s.get(5..7)?.parse().ok()?;
737 if s.as_bytes().get(7)? != &b'-' {
738 return None;
739 }
740 let day: u32 = s.get(8..10)?.parse().ok()?;
741 if !(1..=12).contains(&month) {
742 return None;
743 }
744 if day < 1 || day > days_in_month(year, month) {
745 return None;
746 }
747
748 // Optional time component starts with `T` or a single space.
749 let (hour, minute, second, tz_offset_seconds) = if s.len() == 10 {
750 (0u32, 0u32, 0u32, 0i64)
751 } else {
752 let separator = s.as_bytes().get(10).copied()?;
753 if separator != b'T' && separator != b't' && separator != b' ' {
754 return None;
755 }
756 let rest = &s[11..];
757 parse_time_and_offset(rest)?
758 };
759
760 if hour > 23 || minute > 59 || second > 60 {
761 return None;
762 }
763
764 let days = days_from_civil(year, month, day);
765 let day_seconds = i64::from(hour) * 3600 + i64::from(minute) * 60 + i64::from(second);
766 Some(days * 86_400 + day_seconds - tz_offset_seconds)
767 }
768}
769
/// Number of days in `month` (1 = January … 12 = December) of the Gregorian
/// `year`.
///
/// February is 29 days long in leap years (divisible by 4 and not by 100,
/// or divisible by 400) and 28 otherwise. A `month` outside `1..=12` yields
/// `0`, so any day-of-month range check against the result fails.
///
/// The crate only uses this for years 1582 and later, but the rule is
/// applied uniformly to any year.
fn days_in_month(year: i64, month: u32) -> u32 {
    let is_leap = year % 400 == 0 || (year % 100 != 0 && year % 4 == 0);
    match (month, is_leap) {
        (2, true) => 29,
        (2, false) => 28,
        (4 | 6 | 9 | 11, _) => 30,
        (1..=12, _) => 31,
        _ => 0,
    }
}
783
/// Hinnant's `days_from_civil`: the number of days between `1970-01-01` and
/// the given Gregorian date. The result is negative for earlier dates.
///
/// The computation treats the year as starting on 1 March, so January and
/// February are counted as the last two months of the previous year. Years
/// are grouped into 400-year eras of 146 097 days each. Callers are
/// expected to pass a `(year, month, day)` triple that has already passed
/// the per-month range check.
fn days_from_civil(year: i64, month: u32, day: u32) -> i64 {
    let jan_or_feb = month <= 2;

    // Shift January/February into the preceding (March-based) year.
    let shifted_year = if jan_or_feb { year - 1 } else { year };

    // Floor division by 400 that also works for negative years.
    let floor_base = if shifted_year >= 0 {
        shifted_year
    } else {
        shifted_year - 399
    };
    let era = floor_base / 400;
    let year_of_era = shifted_year - era * 400; // [0, 399]

    // Month index counted from March (March = 0 … February = 11).
    let month_index = if jan_or_feb {
        i64::from(month) + 9
    } else {
        i64::from(month) - 3
    };
    let day_of_year = (153 * month_index + 2) / 5 + i64::from(day) - 1; // [0, 365]
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; // [0, 146096]

    // 719 468 is the day count from 0000-03-01 to 1970-01-01.
    era * 146_097 + day_of_era - 719_468
}
798
/// Parse the time and optional timezone-offset portion of an ISO 8601
/// date-time. `rest` is everything after the date/time separator.
///
/// Accepted grammar: `HH:MM[:SS][.fraction][Z | ±HH[[:]MM]]`, with nothing
/// before or after it.
///
/// Returns `(hour, minute, second, tz_offset_seconds)` on success, where
/// `tz_offset_seconds` is positive east of UTC and `0` when no designator
/// (or `Z`/`z`) is present. Fractional seconds are validated and then
/// discarded. The hour, minute and second values are **not** range-checked
/// here.
///
/// Returns `None` when:
///
/// - any numeric field is not exactly two ASCII digits (so signed fields
///   such as `+9:00` or an offset of `+-2:00` are rejected);
/// - a `.` is not followed by at least one digit;
/// - the offset hour exceeds 23 or the offset minute exceeds 59;
/// - any unconsumed text follows the recognised components.
fn parse_time_and_offset(rest: &str) -> Option<(u32, u32, u32, i64)> {
    /// Read exactly two ASCII digits starting at byte `at`.
    fn two_digits(bytes: &[u8], at: usize) -> Option<u32> {
        match bytes.get(at..at + 2)? {
            [hi, lo] if hi.is_ascii_digit() && lo.is_ascii_digit() => {
                Some(u32::from(*hi - b'0') * 10 + u32::from(*lo - b'0'))
            }
            _ => None,
        }
    }

    let bytes = rest.as_bytes();

    // Mandatory `HH:MM`.
    let hour = two_digits(bytes, 0)?;
    if bytes.get(2) != Some(&b':') {
        return None;
    }
    let minute = two_digits(bytes, 3)?;

    // Optional `:SS`.
    let mut idx = 5;
    let mut second = 0;
    if bytes.get(idx) == Some(&b':') {
        second = two_digits(bytes, 6)?;
        idx = 8;
    }

    // Optional fractional part `.NNN…`: at least one digit, value ignored.
    if bytes.get(idx) == Some(&b'.') {
        let digits = bytes[idx + 1..]
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        if digits == 0 {
            return None;
        }
        idx += 1 + digits;
    }

    // Optional timezone designator: `Z`, `±HH:MM`, `±HHMM`, or `±HH`.
    let tz_offset_seconds = match bytes.get(idx).copied() {
        None => 0,
        Some(b'Z' | b'z') => {
            idx += 1;
            0
        }
        Some(sign_byte @ (b'+' | b'-')) => {
            let sign: i64 = if sign_byte == b'+' { 1 } else { -1 };
            idx += 1;
            let offset_hours = two_digits(bytes, idx)?;
            idx += 2;

            let offset_minutes = if bytes.get(idx) == Some(&b':') {
                // After a colon the minutes are mandatory.
                idx += 1;
                let minutes = two_digits(bytes, idx)?;
                idx += 2;
                minutes
            } else if let Some(minutes) = two_digits(bytes, idx) {
                idx += 2;
                minutes
            } else {
                // Hour-only offset; trailing junk is caught below.
                0
            };

            if offset_hours > 23 || offset_minutes > 59 {
                return None;
            }
            sign * (i64::from(offset_hours) * 3600 + i64::from(offset_minutes) * 60)
        }
        Some(_) => return None,
    };

    // Everything must have been consumed.
    if idx != bytes.len() {
        return None;
    }
    Some((hour, minute, second, tz_offset_seconds))
}
868
/// Structured decomposition of a single postal-address line.
///
/// Produced by [`Normalizer::parse_address_line`]. The struct derives
/// `Serialize` and `Deserialize`, so it can be embedded in downstream data
/// models that are serialised with serde, and derives `PartialEq`/`Eq` so
/// two parses can be compared directly.
///
/// All three fields are best-effort: parsing is format-only and consults no
/// postal reference. Inputs that don't follow the
/// `(unit, house_number, street)` shape degrade gracefully, with the missing
/// pieces returned as `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ParsedAddressLine {
    /// Leading house / building number, including an optional single
    /// alphabetic suffix (`"10A"`), uppercased. `None` when no leading
    /// digit is present.
    pub house_number: Option<String>,
    /// Sub-unit prefix and identifier, lowercased and space-joined
    /// (e.g. `"flat 2a"`, `"apt 5"`, `"suite 12"`). `None` when no
    /// recognised prefix is present.
    pub unit: Option<String>,
    /// Remaining street portion, normalised via
    /// [`Normalizer::normalize_address_line`]. Always present; it is the
    /// only field filled in when neither a unit nor a house number is found.
    pub street: String,
}
893
/// Abbreviation → full-word table behind
/// [`Normalizer::expand_street_abbreviations`].
///
/// Each pair is `(abbreviation, expansion)`. A token is compared against the
/// abbreviations ASCII-case-insensitively after its trailing `.` and `,`
/// characters have been removed. Expansions are written in lowercase so the
/// downstream name-normalisation pipeline has nothing left to do for them.
///
/// Abbreviations are unique and no expansion is itself an abbreviation, so
/// expanding already-expanded text changes nothing.
const STREET_ABBREVIATIONS: &[(&str, &str)] = &[
    // Thoroughfare and place types.
    ("st", "street"),
    ("str", "street"),
    ("rd", "road"),
    ("ave", "avenue"),
    ("av", "avenue"),
    ("blvd", "boulevard"),
    ("bvd", "boulevard"),
    ("ln", "lane"),
    ("dr", "drive"),
    ("ct", "court"),
    ("pl", "place"),
    ("sq", "square"),
    ("ter", "terrace"),
    ("terr", "terrace"),
    ("hwy", "highway"),
    ("pkwy", "parkway"),
    ("mt", "mount"),
    ("mtn", "mountain"),
    ("cres", "crescent"),
    ("gdns", "gardens"),
    ("gdn", "garden"),
    ("gr", "grove"),
    ("cl", "close"),
    ("pk", "park"),
    ("plz", "plaza"),
    ("expy", "expressway"),
    ("trl", "trail"),
    // Cardinal directions.
    ("n", "north"),
    ("s", "south"),
    ("e", "east"),
    ("w", "west"),
    // Intercardinal directions.
    ("ne", "northeast"),
    ("nw", "northwest"),
    ("se", "southeast"),
    ("sw", "southwest"),
];
936
/// Keywords that introduce a sub-unit (flat, apartment, suite, room, …) at
/// the start of an address line, as recognised by
/// [`Normalizer::parse_address_line`].
///
/// All entries are lowercase ASCII; long forms sit next to their short forms.
const UNIT_PREFIXES: &[&str] = &[
    "flat",
    // "apartment" and its abbreviation.
    "apartment",
    "apt",
    "unit",
    // "suite" and its abbreviation.
    "suite",
    "ste",
    // "room" and its abbreviation.
    "room",
    "rm",
];
948
949/// Expand a single whitespace-separated token if it appears in
950/// [`STREET_ABBREVIATIONS`].
951///
952/// The token is matched after stripping at most one trailing `.` or `,`;
953/// the comparison is ASCII case-insensitive. Tokens that contain non-ASCII
954/// characters short-circuit to the original input unchanged.
955fn expand_one_token(tok: &str) -> String {
956 let stripped = tok.trim_end_matches(['.', ',']);
957 if !stripped.is_ascii() {
958 return tok.to_string();
959 }
960 let lower = stripped.to_ascii_lowercase();
961 for (abbrev, full) in STREET_ABBREVIATIONS {
962 if lower == *abbrev {
963 return (*full).to_string();
964 }
965 }
966 tok.to_string()
967}
968
969/// Extract a recognised unit prefix and its identifier from the start of `s`.
970///
971/// Returns `(Some("flat 2a"), rest)` when the input begins with a
972/// recognised keyword followed by an alphanumeric identifier; otherwise
973/// `(None, s)` unchanged.
974fn extract_unit_prefix(s: &str) -> (Option<String>, &str) {
975 let trimmed = s.trim_start();
976 // Find the first whitespace; everything before is the candidate keyword.
977 let kw_end = trimmed
978 .find(|c: char| c.is_whitespace())
979 .unwrap_or(trimmed.len());
980 if kw_end == 0 {
981 return (None, s);
982 }
983 let kw_raw = &trimmed[..kw_end];
984 let kw_stripped = kw_raw.trim_end_matches(['.', ',']);
985 if !kw_stripped.is_ascii() {
986 return (None, s);
987 }
988 let kw_lower = kw_stripped.to_ascii_lowercase();
989 if !UNIT_PREFIXES.iter().any(|p| *p == kw_lower) {
990 return (None, s);
991 }
992 // Skip whitespace and `#` after the keyword.
993 let after_kw = trimmed[kw_end..].trim_start_matches([' ', '\t', '#']);
994 // Read alphanumerics as the identifier.
995 let id_end = after_kw
996 .find(|c: char| !c.is_ascii_alphanumeric())
997 .unwrap_or(after_kw.len());
998 if id_end == 0 {
999 return (None, s);
1000 }
1001 let id = &after_kw[..id_end];
1002 let rest = &after_kw[id_end..];
1003 let unit = format!("{} {}", kw_lower, id.to_ascii_lowercase());
1004 (Some(unit), rest)
1005}
1006
/// Extract a leading house number (digits + optional single alphabetic
/// suffix) from the start of `s`.
///
/// Leading whitespace is ignored. On success the number is returned
/// uppercased together with the remainder of the input that follows it; when
/// `s` does not start with an ASCII digit the result is `(None, s)` with `s`
/// exactly as passed in.
///
/// `"10 Downing Street"` → `(Some("10"), " Downing Street")`.
/// `"10A High St"` → `(Some("10A"), " High St")`.
/// `"10Apple Tree Lane"` → `(Some("10"), "Apple Tree Lane")`.
/// `"Buckingham Palace"` → `(None, "Buckingham Palace")`.
fn extract_house_number(s: &str) -> (Option<String>, &str) {
    let trimmed = s.trim_start();
    let digits_end = trimmed
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(trimmed.len());
    if digits_end == 0 {
        return (None, s);
    }
    // One ASCII letter directly after the digits counts as a suffix ("10A"),
    // but only when the token ends there. If another letter or digit follows
    // — in any script — the letter starts a word glued to the number
    // ("10Apple", "10Bürostraße") and must stay with the street.
    let mut tail = trimmed[digits_end..].chars();
    let suffix_len = match (tail.next(), tail.next()) {
        (Some(letter), following)
            if letter.is_ascii_alphabetic()
                && !following.is_some_and(char::is_alphanumeric) =>
        {
            letter.len_utf8()
        }
        _ => 0,
    };
    let (number, rest) = trimmed.split_at(digits_end + suffix_len);
    (Some(number.to_ascii_uppercase()), rest)
}
1043
/// Numbering-plan facts about one country, as needed by
/// [`Normalizer::normalize_phone_e164`].
///
/// The **national-significant number** (NSN) is what follows the dial code
/// once the national trunk prefix has been removed; `min_nsn` and `max_nsn`
/// give its permitted length range.
///
/// `trunk_prefix` holds the digits dialled in front of a number for national
/// calls: `"0"` across most of Europe and Asia, `"8"` in Lithuania, and
/// `None` for NANP, Spain, Portugal and several others. When it is set, one
/// occurrence at the start of the national number is stripped before
/// canonicalisation.
struct CountryPhoneInfo {
    /// Uppercase ISO 3166-1 alpha-2 country code.
    iso_alpha2: &'static str,
    /// International dialling code, written without the leading `+`.
    dial_code: &'static str,
    /// National trunk prefix digit(s); `None` if the country has none.
    trunk_prefix: Option<&'static str>,
    /// Shortest permitted national-significant-number length.
    min_nsn: usize,
    /// Longest permitted national-significant-number length.
    max_nsn: usize,
}
1065
1066/// Phone-numbering metadata for countries supported by
1067/// [`Normalizer::normalize_phone_e164`].
1068///
1069/// Coverage: all five jurisdictions for which the crate exposes a national
1070/// healthcare identifier (GB England/Wales/IoM, FR, ES, IE, plus UK NI via
1071/// the GB dial code), plus the most common international partners. New
1072/// entries SHOULD follow the ISO 3166-1 alpha-2 convention and document the
1073/// trunk-prefix rule explicitly.
1074const COUNTRY_PHONE_TABLE: &[CountryPhoneInfo] = &[
1075 CountryPhoneInfo {
1076 iso_alpha2: "GB",
1077 dial_code: "44",
1078 trunk_prefix: Some("0"),
1079 min_nsn: 7,
1080 max_nsn: 11,
1081 },
1082 CountryPhoneInfo {
1083 iso_alpha2: "FR",
1084 dial_code: "33",
1085 trunk_prefix: Some("0"),
1086 min_nsn: 9,
1087 max_nsn: 9,
1088 },
1089 CountryPhoneInfo {
1090 iso_alpha2: "DE",
1091 dial_code: "49",
1092 trunk_prefix: Some("0"),
1093 min_nsn: 7,
1094 max_nsn: 13,
1095 },
1096 CountryPhoneInfo {
1097 iso_alpha2: "ES",
1098 dial_code: "34",
1099 trunk_prefix: None,
1100 min_nsn: 9,
1101 max_nsn: 9,
1102 },
1103 CountryPhoneInfo {
1104 iso_alpha2: "IE",
1105 dial_code: "353",
1106 trunk_prefix: Some("0"),
1107 min_nsn: 7,
1108 max_nsn: 11,
1109 },
1110 CountryPhoneInfo {
1111 iso_alpha2: "IT",
1112 dial_code: "39",
1113 trunk_prefix: None,
1114 min_nsn: 6,
1115 max_nsn: 12,
1116 },
1117 CountryPhoneInfo {
1118 iso_alpha2: "NL",
1119 dial_code: "31",
1120 trunk_prefix: Some("0"),
1121 min_nsn: 9,
1122 max_nsn: 9,
1123 },
1124 CountryPhoneInfo {
1125 iso_alpha2: "BE",
1126 dial_code: "32",
1127 trunk_prefix: Some("0"),
1128 min_nsn: 8,
1129 max_nsn: 9,
1130 },
1131 CountryPhoneInfo {
1132 iso_alpha2: "PT",
1133 dial_code: "351",
1134 trunk_prefix: None,
1135 min_nsn: 9,
1136 max_nsn: 9,
1137 },
1138 CountryPhoneInfo {
1139 iso_alpha2: "CH",
1140 dial_code: "41",
1141 trunk_prefix: Some("0"),
1142 min_nsn: 9,
1143 max_nsn: 9,
1144 },
1145 CountryPhoneInfo {
1146 iso_alpha2: "AT",
1147 dial_code: "43",
1148 trunk_prefix: Some("0"),
1149 min_nsn: 4,
1150 max_nsn: 13,
1151 },
1152 CountryPhoneInfo {
1153 iso_alpha2: "SE",
1154 dial_code: "46",
1155 trunk_prefix: Some("0"),
1156 min_nsn: 7,
1157 max_nsn: 13,
1158 },
1159 CountryPhoneInfo {
1160 iso_alpha2: "NO",
1161 dial_code: "47",
1162 trunk_prefix: None,
1163 min_nsn: 8,
1164 max_nsn: 8,
1165 },
1166 CountryPhoneInfo {
1167 iso_alpha2: "DK",
1168 dial_code: "45",
1169 trunk_prefix: None,
1170 min_nsn: 8,
1171 max_nsn: 8,
1172 },
1173 CountryPhoneInfo {
1174 iso_alpha2: "FI",
1175 dial_code: "358",
1176 trunk_prefix: Some("0"),
1177 min_nsn: 5,
1178 max_nsn: 12,
1179 },
1180 CountryPhoneInfo {
1181 iso_alpha2: "PL",
1182 dial_code: "48",
1183 trunk_prefix: None,
1184 min_nsn: 9,
1185 max_nsn: 9,
1186 },
1187 CountryPhoneInfo {
1188 iso_alpha2: "AU",
1189 dial_code: "61",
1190 trunk_prefix: Some("0"),
1191 min_nsn: 9,
1192 max_nsn: 9,
1193 },
1194 CountryPhoneInfo {
1195 iso_alpha2: "NZ",
1196 dial_code: "64",
1197 trunk_prefix: Some("0"),
1198 min_nsn: 8,
1199 max_nsn: 10,
1200 },
1201 CountryPhoneInfo {
1202 iso_alpha2: "US",
1203 dial_code: "1",
1204 trunk_prefix: None,
1205 min_nsn: 10,
1206 max_nsn: 10,
1207 },
1208 CountryPhoneInfo {
1209 iso_alpha2: "CA",
1210 dial_code: "1",
1211 trunk_prefix: None,
1212 min_nsn: 10,
1213 max_nsn: 10,
1214 },
1215 CountryPhoneInfo {
1216 iso_alpha2: "JP",
1217 dial_code: "81",
1218 trunk_prefix: Some("0"),
1219 min_nsn: 9,
1220 max_nsn: 10,
1221 },
1222 CountryPhoneInfo {
1223 iso_alpha2: "CN",
1224 dial_code: "86",
1225 trunk_prefix: Some("0"),
1226 min_nsn: 5,
1227 max_nsn: 12,
1228 },
1229 CountryPhoneInfo {
1230 iso_alpha2: "IN",
1231 dial_code: "91",
1232 trunk_prefix: Some("0"),
1233 min_nsn: 10,
1234 max_nsn: 10,
1235 },
1236 CountryPhoneInfo {
1237 iso_alpha2: "BR",
1238 dial_code: "55",
1239 trunk_prefix: Some("0"),
1240 min_nsn: 10,
1241 max_nsn: 11,
1242 },
1243 CountryPhoneInfo {
1244 iso_alpha2: "MX",
1245 dial_code: "52",
1246 trunk_prefix: None,
1247 min_nsn: 10,
1248 max_nsn: 10,
1249 },
1250 CountryPhoneInfo {
1251 iso_alpha2: "ZA",
1252 dial_code: "27",
1253 trunk_prefix: Some("0"),
1254 min_nsn: 9,
1255 max_nsn: 9,
1256 },
1257 // ---- T-19: coverage of remaining 35-scheme identifier jurisdictions ----
1258 CountryPhoneInfo {
1259 iso_alpha2: "BG",
1260 dial_code: "359",
1261 trunk_prefix: Some("0"),
1262 min_nsn: 8,
1263 max_nsn: 9,
1264 },
1265 CountryPhoneInfo {
1266 iso_alpha2: "CZ",
1267 dial_code: "420",
1268 trunk_prefix: None,
1269 min_nsn: 9,
1270 max_nsn: 9,
1271 },
1272 CountryPhoneInfo {
1273 iso_alpha2: "EE",
1274 dial_code: "372",
1275 trunk_prefix: None,
1276 min_nsn: 7,
1277 max_nsn: 8,
1278 },
1279 CountryPhoneInfo {
1280 iso_alpha2: "GR",
1281 dial_code: "30",
1282 trunk_prefix: None,
1283 min_nsn: 10,
1284 max_nsn: 10,
1285 },
1286 CountryPhoneInfo {
1287 iso_alpha2: "HR",
1288 dial_code: "385",
1289 trunk_prefix: Some("0"),
1290 min_nsn: 8,
1291 max_nsn: 9,
1292 },
1293 CountryPhoneInfo {
1294 iso_alpha2: "IS",
1295 dial_code: "354",
1296 trunk_prefix: None,
1297 min_nsn: 7,
1298 max_nsn: 9,
1299 },
1300 CountryPhoneInfo {
1301 iso_alpha2: "LI",
1302 dial_code: "423",
1303 trunk_prefix: None,
1304 min_nsn: 7,
1305 max_nsn: 9,
1306 },
1307 // Lithuania uses `8` (not `0`) as the national trunk prefix.
1308 CountryPhoneInfo {
1309 iso_alpha2: "LT",
1310 dial_code: "370",
1311 trunk_prefix: Some("8"),
1312 min_nsn: 8,
1313 max_nsn: 8,
1314 },
1315 CountryPhoneInfo {
1316 iso_alpha2: "LV",
1317 dial_code: "371",
1318 trunk_prefix: None,
1319 min_nsn: 8,
1320 max_nsn: 8,
1321 },
1322 CountryPhoneInfo {
1323 iso_alpha2: "MT",
1324 dial_code: "356",
1325 trunk_prefix: None,
1326 min_nsn: 8,
1327 max_nsn: 8,
1328 },
1329 CountryPhoneInfo {
1330 iso_alpha2: "RO",
1331 dial_code: "40",
1332 trunk_prefix: Some("0"),
1333 min_nsn: 9,
1334 max_nsn: 9,
1335 },
1336 CountryPhoneInfo {
1337 iso_alpha2: "SI",
1338 dial_code: "386",
1339 trunk_prefix: Some("0"),
1340 min_nsn: 8,
1341 max_nsn: 8,
1342 },
1343 CountryPhoneInfo {
1344 iso_alpha2: "SK",
1345 dial_code: "421",
1346 trunk_prefix: Some("0"),
1347 min_nsn: 9,
1348 max_nsn: 9,
1349 },
1350];
1351
1352/// Look up a country by ISO 3166-1 alpha-2 code (case-insensitive).
1353///
1354/// Returns the first match in [`COUNTRY_PHONE_TABLE`]. For NANP countries
1355/// (US/CA) which share dial code `1`, this disambiguates by the caller's
1356/// chosen default; the canonical E.164 output is identical for both.
1357fn lookup_by_iso(iso: &str) -> Option<&'static CountryPhoneInfo> {
1358 if !iso.is_ascii() {
1359 return None;
1360 }
1361 let upper = iso.to_ascii_uppercase();
1362 COUNTRY_PHONE_TABLE.iter().find(|c| c.iso_alpha2 == upper)
1363}
1364
1365/// Match the longest known dial-code prefix at the start of `digits`.
1366///
1367/// Tries 3-, 2-, then 1-digit prefixes to honour the country table's most
1368/// specific entry. For NANP (dial code `1`) the first matching entry — US —
1369/// is returned; the canonical E.164 form is the same whether the caller
1370/// later interprets the country as US or CA.
1371fn lookup_by_dial_code_prefix(digits: &str) -> Option<&'static CountryPhoneInfo> {
1372 for len in [3usize, 2, 1] {
1373 if digits.len() >= len {
1374 let prefix = &digits[..len];
1375 if let Some(info) = COUNTRY_PHONE_TABLE.iter().find(|c| c.dial_code == prefix) {
1376 return Some(info);
1377 }
1378 }
1379 }
1380 None
1381}
1382
1383/// Strip a single occurrence of the country's national trunk prefix
1384/// from `nsn` if one is configured and present.
1385fn strip_trunk_prefix<'a>(info: &CountryPhoneInfo, nsn: &'a str) -> &'a str {
1386 if let Some(tp) = info.trunk_prefix
1387 && let Some(rest) = nsn.strip_prefix(tp)
1388 && !rest.is_empty()
1389 {
1390 rest
1391 } else {
1392 nsn
1393 }
1394}
1395
1396#[cfg(test)]
1397mod tests {
1398 use super::*;
1399
1400 // ---------- normalize_name ----------
1401
1402 #[test]
1403 fn normalize_name_collapses_whitespace_and_trims() {
1404 assert_eq!(Normalizer::normalize_name(" John Smith "), "john smith");
1405 }
1406
1407 #[test]
1408 fn normalize_name_strips_ascii_punctuation() {
1409 assert_eq!(Normalizer::normalize_name("O'Brien"), "obrien");
1410 assert_eq!(Normalizer::normalize_name("Mary-Jane"), "maryjane");
1411 assert_eq!(Normalizer::normalize_name("Dr. Who"), "dr who");
1412 }
1413
1414 #[test]
1415 fn normalize_name_strips_diacritics() {
1416 assert_eq!(Normalizer::normalize_name("José"), "jose");
1417 assert_eq!(Normalizer::normalize_name("Siân"), "sian");
1418 // common test cases
1419 assert_eq!(Normalizer::normalize_name("naïve"), "naive");
1420 assert_eq!(Normalizer::normalize_name("crème"), "creme");
1421 // ŷ and ŵ decompose; ł has no NFKD decomposition and survives lowercased.
1422 assert_eq!(Normalizer::normalize_name("Łŷŵ"), "ływ");
1423 }
1424
1425 #[test]
1426 fn normalize_name_handles_empty_and_whitespace() {
1427 assert_eq!(Normalizer::normalize_name(""), "");
1428 assert_eq!(Normalizer::normalize_name(" "), "");
1429 assert_eq!(Normalizer::normalize_name("\t\n"), "");
1430 }
1431
1432 #[test]
1433 fn normalize_name_lowercases() {
1434 assert_eq!(Normalizer::normalize_name("MARY"), "mary");
1435 assert_eq!(Normalizer::normalize_name("McDONALD"), "mcdonald");
1436 }
1437
1438 #[test]
1439 fn normalize_name_is_idempotent() {
1440 for input in [
1441 " John Smith ",
1442 "O'Brien-Jones",
1443 "JOSÉ MARÍA",
1444 "",
1445 " ",
1446 "Siân",
1447 ] {
1448 let once = Normalizer::normalize_name(input);
1449 let twice = Normalizer::normalize_name(&once);
1450 assert_eq!(once, twice, "not idempotent for {input:?}");
1451 }
1452 }
1453
1454 #[test]
1455 fn normalize_name_does_not_normalise_unicode_punctuation() {
1456 // Curly apostrophe (U+2019) is intentionally not stripped.
1457 // This is documented in AGENTS/normalization.md as a known limitation.
1458 let with_curly = Normalizer::normalize_name("O\u{2019}Brien");
1459 assert!(with_curly.contains('\u{2019}'));
1460 }
1461
1462 // ---------- normalize_postcode ----------
1463
1464 #[test]
1465 fn normalize_postcode_uppercases() {
1466 assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
1467 }
1468
1469 #[test]
1470 fn normalize_postcode_strips_all_whitespace() {
1471 assert_eq!(Normalizer::normalize_postcode("CF10 1AA"), "CF101AA");
1472 assert_eq!(Normalizer::normalize_postcode(" CF10 1AA "), "CF101AA");
1473 assert_eq!(Normalizer::normalize_postcode("CF10\t1AA"), "CF101AA");
1474 }
1475
1476 #[test]
1477 fn normalize_postcode_handles_empty() {
1478 assert_eq!(Normalizer::normalize_postcode(""), "");
1479 assert_eq!(Normalizer::normalize_postcode(" "), "");
1480 }
1481
1482 #[test]
1483 fn normalize_postcode_is_idempotent() {
1484 for input in ["cf10 1aa", "SW1A 2AA", " EH8 9YL ", ""] {
1485 let once = Normalizer::normalize_postcode(input);
1486 let twice = Normalizer::normalize_postcode(&once);
1487 assert_eq!(once, twice);
1488 }
1489 }
1490
1491 // ---------- normalize_phone ----------
1492
1493 #[test]
1494 fn normalize_phone_strips_uk_trunk_prefix() {
1495 assert_eq!(Normalizer::normalize_phone("07700 900123"), "7700900123");
1496 }
1497
1498 #[test]
1499 fn normalize_phone_strips_plus_44_international() {
1500 assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
1501 }
1502
1503 #[test]
1504 fn normalize_phone_strips_0044_international() {
1505 assert_eq!(
1506 Normalizer::normalize_phone("0044 7700 900123"),
1507 "7700900123"
1508 );
1509 }
1510
1511 #[test]
1512 fn normalize_phone_handles_brackets_and_spaces() {
1513 assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
1514 }
1515
1516 #[test]
1517 fn normalize_phone_handles_empty() {
1518 assert_eq!(Normalizer::normalize_phone(""), "");
1519 assert_eq!(Normalizer::normalize_phone("---"), "");
1520 }
1521
1522 #[test]
1523 fn normalize_phone_does_not_strip_44_if_too_short() {
1524 // 44 followed by fewer than 10 more digits: keep the 44 (not international prefix).
1525 assert_eq!(Normalizer::normalize_phone("4412345"), "4412345");
1526 }
1527
1528 #[test]
1529 fn normalize_phone_is_idempotent() {
1530 for input in [
1531 "07700 900123",
1532 "+44 7700 900123",
1533 "0044 7700 900123",
1534 "(029) 2034 5678",
1535 "",
1536 ] {
1537 let once = Normalizer::normalize_phone(input);
1538 let twice = Normalizer::normalize_phone(&once);
1539 assert_eq!(once, twice, "not idempotent for {input:?}");
1540 }
1541 }
1542
1543 #[test]
1544 fn normalize_phone_keeps_lone_zero() {
1545 // A bare "0" is not stripped (guard: len > 1).
1546 assert_eq!(Normalizer::normalize_phone("0"), "0");
1547 }
1548
1549 // ---------- phonetic_code ----------
1550
1551 #[test]
1552 fn phonetic_code_groups_smith_and_smyth() {
1553 assert_eq!(
1554 Normalizer::phonetic_code("Smith"),
1555 Normalizer::phonetic_code("Smyth")
1556 );
1557 }
1558
1559 #[test]
1560 fn phonetic_code_groups_stephen_and_steven() {
1561 assert_eq!(
1562 Normalizer::phonetic_code("Stephen"),
1563 Normalizer::phonetic_code("Steven")
1564 );
1565 }
1566
1567 #[test]
1568 fn phonetic_code_distinguishes_different_families() {
1569 assert_ne!(
1570 Normalizer::phonetic_code("Jones"),
1571 Normalizer::phonetic_code("Smith")
1572 );
1573 assert_ne!(
1574 Normalizer::phonetic_code("Anderson"),
1575 Normalizer::phonetic_code("Zimmerman")
1576 );
1577 }
1578
1579 #[test]
1580 fn phonetic_code_specific_values() {
1581 // Pinned values from the underlying soundex crate; act as a regression net.
1582 assert_eq!(Normalizer::phonetic_code("Smith"), "S530");
1583 assert_eq!(Normalizer::phonetic_code("Smyth"), "S530");
1584 assert_eq!(Normalizer::phonetic_code("Jones"), "J520");
1585 assert_eq!(Normalizer::phonetic_code("Johnson"), "J525");
1586 }
1587
1588 #[test]
1589 fn phonetic_code_handles_empty() {
1590 assert_eq!(Normalizer::phonetic_code(""), "");
1591 assert_eq!(Normalizer::phonetic_code(" "), "");
1592 }
1593
1594 #[test]
1595 fn phonetic_code_is_case_insensitive() {
1596 assert_eq!(
1597 Normalizer::phonetic_code("SMITH"),
1598 Normalizer::phonetic_code("smith")
1599 );
1600 }
1601
1602 // ---------- normalize_phone_e164 ----------
1603
1604 #[test]
1605 fn e164_uk_layouts_canonicalise_identically() {
1606 let canonical = Some("+447700900123".to_string());
1607 assert_eq!(
1608 Normalizer::normalize_phone_e164("+44 7700 900123", Some("GB")),
1609 canonical,
1610 );
1611 assert_eq!(
1612 Normalizer::normalize_phone_e164("0044 7700 900123", Some("GB")),
1613 canonical,
1614 );
1615 assert_eq!(
1616 Normalizer::normalize_phone_e164("07700 900123", Some("GB")),
1617 canonical,
1618 );
1619 assert_eq!(
1620 Normalizer::normalize_phone_e164("(07700) 900-123", Some("GB")),
1621 canonical,
1622 );
1623 }
1624
1625 #[test]
1626 fn e164_french_layouts_canonicalise_identically() {
1627 let canonical = Some("+33123456789".to_string());
1628 assert_eq!(
1629 Normalizer::normalize_phone_e164("+33 1 23 45 67 89", Some("FR")),
1630 canonical,
1631 );
1632 assert_eq!(
1633 Normalizer::normalize_phone_e164("0033 1 23 45 67 89", Some("FR")),
1634 canonical,
1635 );
1636 assert_eq!(
1637 Normalizer::normalize_phone_e164("01 23 45 67 89", Some("FR")),
1638 canonical,
1639 );
1640 }
1641
1642 #[test]
1643 fn e164_spain_has_no_national_trunk_prefix() {
1644 // Spain switched to no trunk-0 in 1998; a bare 9-digit national
1645 // number is the canonical form.
1646 assert_eq!(
1647 Normalizer::normalize_phone_e164("912 345 678", Some("ES")),
1648 Some("+34912345678".to_string()),
1649 );
1650 assert_eq!(
1651 Normalizer::normalize_phone_e164("+34 912 345 678", None),
1652 Some("+34912345678".to_string()),
1653 );
1654 }
1655
1656 #[test]
1657 fn e164_ireland_three_digit_dial_code() {
1658 assert_eq!(
1659 Normalizer::normalize_phone_e164("+353 1 234 5678", None),
1660 Some("+35312345678".to_string()),
1661 );
1662 assert_eq!(
1663 Normalizer::normalize_phone_e164("01 234 5678", Some("IE")),
1664 Some("+35312345678".to_string()),
1665 );
1666 }
1667
1668 #[test]
1669 fn e164_nanp_handles_us_and_canada() {
1670 assert_eq!(
1671 Normalizer::normalize_phone_e164("(415) 555-1234", Some("US")),
1672 Some("+14155551234".to_string()),
1673 );
1674 assert_eq!(
1675 Normalizer::normalize_phone_e164("+1 415 555 1234", None),
1676 Some("+14155551234".to_string()),
1677 );
1678 // Canada uses the same dial code; canonical form is identical.
1679 assert_eq!(
1680 Normalizer::normalize_phone_e164("(416) 555-1234", Some("CA")),
1681 Some("+14165551234".to_string()),
1682 );
1683 }
1684
1685 // ---------- T-19: 35-scheme jurisdiction coverage ----------
1686
1687 #[test]
1688 fn e164_lithuania_uses_eight_as_trunk_prefix() {
1689 // Lithuania's national trunk prefix is `8`, not `0`. National
1690 // dialling form `8 612 34567` (mobile) canonicalises to the same
1691 // E.164 string as the explicit `+370 612 34567` form.
1692 assert_eq!(
1693 Normalizer::normalize_phone_e164("8 612 34567", Some("LT")),
1694 Some("+37061234567".to_string()),
1695 );
1696 assert_eq!(
1697 Normalizer::normalize_phone_e164("+370 612 34567", None),
1698 Some("+37061234567".to_string()),
1699 );
1700 }
1701
1702 #[test]
1703 fn e164_greece_has_no_national_trunk_prefix() {
1704 // GR national-significant numbers begin with the area code (the
1705 // leading zero seen in older publications is no longer a trunk).
1706 assert_eq!(
1707 Normalizer::normalize_phone_e164("+30 210 123 4567", None),
1708 Some("+302101234567".to_string()),
1709 );
1710 assert_eq!(
1711 Normalizer::normalize_phone_e164("210 123 4567", Some("GR")),
1712 Some("+302101234567".to_string()),
1713 );
1714 }
1715
1716 #[test]
1717 fn e164_romania_strips_trunk_zero() {
1718 assert_eq!(
1719 Normalizer::normalize_phone_e164("0721 234 567", Some("RO")),
1720 Some("+40721234567".to_string()),
1721 );
1722 assert_eq!(
1723 Normalizer::normalize_phone_e164("+40 721 234 567", None),
1724 Some("+40721234567".to_string()),
1725 );
1726 }
1727
1728 #[test]
1729 fn e164_czech_no_trunk_prefix() {
1730 assert_eq!(
1731 Normalizer::normalize_phone_e164("+420 234 567 890", None),
1732 Some("+420234567890".to_string()),
1733 );
1734 assert_eq!(
1735 Normalizer::normalize_phone_e164("234 567 890", Some("CZ")),
1736 Some("+420234567890".to_string()),
1737 );
1738 }
1739
1740 #[test]
1741 fn e164_iceland_seven_digit_nsn() {
1742 assert_eq!(
1743 Normalizer::normalize_phone_e164("+354 412 3456", None),
1744 Some("+3544123456".to_string()),
1745 );
1746 }
1747
1748 #[test]
1749 fn e164_distinguishes_overlapping_three_digit_dial_codes() {
1750 // Croatia (385) vs Slovenia (386): adjacent dial codes, both
1751 // use a `0` trunk, but the canonical E.164 forms remain distinct.
1752 let hr = Normalizer::normalize_phone_e164("+385 91 234 5678", None);
1753 let si = Normalizer::normalize_phone_e164("+386 41 234 567", None);
1754 assert!(hr.is_some());
1755 assert!(si.is_some());
1756 assert_ne!(hr, si);
1757 }
1758
1759 #[test]
1760 fn e164_distinguishes_countries_with_overlapping_national_digits() {
1761 // The "same" national-format digits in two countries must yield
1762 // different E.164 strings — this is precisely the disambiguation
1763 // the new normaliser provides.
1764 let uk = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
1765 let fr = Normalizer::normalize_phone_e164("07700 90012", Some("FR"));
1766 assert!(uk.is_some());
1767 assert!(fr.is_some());
1768 assert_ne!(uk, fr);
1769 }
1770
1771 #[test]
1772 fn e164_returns_none_when_default_country_missing_and_no_marker() {
1773 // Ambiguous national-format input with no default country.
1774 assert_eq!(Normalizer::normalize_phone_e164("07700 900123", None), None);
1775 }
1776
1777 #[test]
1778 fn e164_returns_none_for_unknown_dial_code() {
1779 assert_eq!(Normalizer::normalize_phone_e164("+999 1234567", None), None);
1780 }
1781
1782 #[test]
1783 fn e164_returns_none_for_empty_or_punctuation_only() {
1784 assert_eq!(Normalizer::normalize_phone_e164("", Some("GB")), None);
1785 assert_eq!(Normalizer::normalize_phone_e164("+", Some("GB")), None);
1786 assert_eq!(Normalizer::normalize_phone_e164("()-", Some("GB")), None);
1787 }
1788
1789 #[test]
1790 fn e164_returns_none_for_too_short_or_too_long_nsn() {
1791 // GB NSN must be 7..=11 digits.
1792 assert_eq!(Normalizer::normalize_phone_e164("+44 1", None), None);
1793 assert_eq!(
1794 Normalizer::normalize_phone_e164("+44 123456789012345", None),
1795 None,
1796 );
1797 }
1798
1799 #[test]
1800 fn e164_rejects_unknown_default_country() {
1801 // "XX" is not in the table; without an explicit international
1802 // marker the function cannot guess.
1803 assert_eq!(
1804 Normalizer::normalize_phone_e164("07700 900123", Some("XX")),
1805 None,
1806 );
1807 }
1808
1809 #[test]
1810 fn e164_is_idempotent_on_canonical_form() {
1811 for input in [
1812 "+44 7700 900123",
1813 "+33 1 23 45 67 89",
1814 "(415) 555-1234",
1815 "+353 1 234 5678",
1816 "+34 912 345 678",
1817 ] {
1818 let once = Normalizer::normalize_phone_e164(input, Some("GB")).expect("parses");
1819 let twice = Normalizer::normalize_phone_e164(&once, Some("GB")).expect("idempotent");
1820 assert_eq!(once, twice, "not idempotent for {input:?}");
1821 }
1822 }
1823
1824 #[test]
1825 fn e164_default_country_lookup_is_case_insensitive() {
1826 let lower = Normalizer::normalize_phone_e164("07700 900123", Some("gb"));
1827 let upper = Normalizer::normalize_phone_e164("07700 900123", Some("GB"));
1828 assert_eq!(lower, upper);
1829 assert!(lower.is_some());
1830 }
1831
1832 #[test]
1833 fn e164_handles_double_zero_international_access_form() {
1834 assert_eq!(
1835 Normalizer::normalize_phone_e164("00 33 1 23 45 67 89", None),
1836 Some("+33123456789".to_string()),
1837 );
1838 }
1839
1840 // ---------- expand_street_abbreviations ----------
1841
1842 #[test]
1843 fn expand_street_replaces_common_abbreviations() {
1844 assert_eq!(
1845 Normalizer::expand_street_abbreviations("123 High St"),
1846 "123 High street",
1847 );
1848 assert_eq!(
1849 Normalizer::expand_street_abbreviations("10 Downing Rd"),
1850 "10 Downing road",
1851 );
1852 assert_eq!(
1853 Normalizer::expand_street_abbreviations("12 Sunset Blvd"),
1854 "12 Sunset boulevard",
1855 );
1856 assert_eq!(
1857 Normalizer::expand_street_abbreviations("1 Park Ave"),
1858 "1 Park avenue",
1859 );
1860 assert_eq!(
1861 Normalizer::expand_street_abbreviations("5 Cherry Ln"),
1862 "5 Cherry lane",
1863 );
1864 }
1865
1866 #[test]
1867 fn expand_street_replaces_directionals() {
1868 assert_eq!(
1869 Normalizer::expand_street_abbreviations("45 N Park Ave"),
1870 "45 north Park avenue",
1871 );
1872 assert_eq!(
1873 Normalizer::expand_street_abbreviations("100 SW 5th St"),
1874 "100 southwest 5th street",
1875 );
1876 }
1877
1878 #[test]
1879 fn expand_street_strips_trailing_period_or_comma() {
1880 assert_eq!(
1881 Normalizer::expand_street_abbreviations("123 High St."),
1882 "123 High street",
1883 );
1884 assert_eq!(
1885 Normalizer::expand_street_abbreviations("12 Sunset Blvd,"),
1886 "12 Sunset boulevard",
1887 );
1888 }
1889
1890 #[test]
1891 fn expand_street_passes_unknown_tokens_through() {
1892 assert_eq!(
1893 Normalizer::expand_street_abbreviations("Buckingham Palace"),
1894 "Buckingham Palace",
1895 );
1896 }
1897
1898 #[test]
1899 fn expand_street_is_idempotent_on_already_expanded_input() {
1900 for input in [
1901 "123 High St",
1902 "45 N Park Ave",
1903 "10 Downing Rd",
1904 "Buckingham Palace",
1905 ] {
1906 let once = Normalizer::expand_street_abbreviations(input);
1907 let twice = Normalizer::expand_street_abbreviations(&once);
1908 assert_eq!(once, twice, "not idempotent for {input:?}");
1909 }
1910 }
1911
1912 #[test]
1913 fn expand_street_handles_empty_and_whitespace_only() {
1914 assert_eq!(Normalizer::expand_street_abbreviations(""), "");
1915 assert_eq!(Normalizer::expand_street_abbreviations(" "), "");
1916 }
1917
1918 // ---------- normalize_address_line ----------
1919
1920 #[test]
1921 fn normalize_address_line_unifies_abbreviated_and_full_forms() {
1922 assert_eq!(
1923 Normalizer::normalize_address_line("123 High St"),
1924 Normalizer::normalize_address_line("123 High Street"),
1925 );
1926 assert_eq!(
1927 Normalizer::normalize_address_line("45 N Park Ave"),
1928 Normalizer::normalize_address_line("45 North Park Avenue"),
1929 );
1930 }
1931
1932 #[test]
1933 fn normalize_address_line_handles_punctuation_and_case() {
1934 assert_eq!(
1935 Normalizer::normalize_address_line("10, DOWNING Street."),
1936 "10 downing street",
1937 );
1938 }
1939
1940 #[test]
1941 fn normalize_address_line_is_idempotent() {
1942 for input in [
1943 "123 High St",
1944 " 45 N Park Ave ",
1945 "10, Downing Street.",
1946 "",
1947 ] {
1948 let once = Normalizer::normalize_address_line(input);
1949 let twice = Normalizer::normalize_address_line(&once);
1950 assert_eq!(once, twice, "not idempotent for {input:?}");
1951 }
1952 }
1953
1954 // ---------- parse_address_line ----------
1955
1956 #[test]
1957 fn parse_address_extracts_simple_house_number() {
1958 let p = Normalizer::parse_address_line("123 High Street");
1959 assert_eq!(p.house_number.as_deref(), Some("123"));
1960 assert_eq!(p.unit, None);
1961 assert_eq!(p.street, "high street");
1962 }
1963
1964 #[test]
1965 fn parse_address_handles_alphanumeric_house_number() {
1966 let p = Normalizer::parse_address_line("10A Downing St");
1967 assert_eq!(p.house_number.as_deref(), Some("10A"));
1968 assert_eq!(p.street, "downing street");
1969 }
1970
#[test]
fn parse_address_does_not_greedily_consume_street_name() {
    // In "10 Apple Tree Lane" the word `Apple` belongs to the street:
    // two consecutive alphabetic characters signal a street-name word
    // rather than a house-number suffix, so the number is just "10".
    let parsed = Normalizer::parse_address_line("10 Apple Tree Lane");
    let number = parsed.house_number.as_deref();
    assert_eq!(number, Some("10"));
    assert_eq!(parsed.street, "apple tree lane");
}
1980
#[test]
fn parse_address_recognises_flat_prefix() {
    // A leading "Flat 2A," is reported as the (lowercased) unit; the rest
    // of the line is parsed as house number plus street.
    let parsed = Normalizer::parse_address_line("Flat 2A, 10 Downing Street");
    let unit = parsed.unit.as_deref();
    let number = parsed.house_number.as_deref();
    assert_eq!(unit, Some("flat 2a"));
    assert_eq!(number, Some("10"));
    assert_eq!(parsed.street, "downing street");
}
1988
#[test]
fn parse_address_recognises_apt_prefix() {
    // "Apt 5," is split off as the unit; "Ave" is expanded in the street.
    let parsed = Normalizer::parse_address_line("Apt 5, 1600 Pennsylvania Ave");
    let unit = parsed.unit.as_deref();
    let number = parsed.house_number.as_deref();
    assert_eq!(unit, Some("apt 5"));
    assert_eq!(number, Some("1600"));
    assert_eq!(parsed.street, "pennsylvania avenue");
}
1996
#[test]
fn parse_address_recognises_suite_and_ste_and_unit_and_room() {
    // The same address written with four different unit prefixes; each
    // must yield some unit plus the identical number and street.
    let prefixed_lines = [
        "Suite 12, 100 Main St",
        "Ste 12, 100 Main St",
        "Unit 12, 100 Main St",
        "Room 12, 100 Main St",
    ];
    for input in prefixed_lines {
        let parsed = Normalizer::parse_address_line(input);
        assert!(parsed.unit.is_some(), "no unit for {input:?}");
        assert_eq!(parsed.house_number.as_deref(), Some("100"));
        assert_eq!(parsed.street, "main street");
    }
}
2011
#[test]
fn parse_address_no_leading_number_falls_back_to_street_only() {
    // Without a leading number the whole (lowercased) line is the street
    // and the other two components stay unset.
    let parsed = Normalizer::parse_address_line("Buckingham Palace");
    assert_eq!(parsed.street, "buckingham palace");
    assert_eq!(parsed.unit, None);
    assert_eq!(parsed.house_number, None);
}
2019
#[test]
fn parse_address_empty_input_yields_empty_street() {
    // Empty input: empty street, no house number, no unit.
    let parsed = Normalizer::parse_address_line("");
    assert_eq!(parsed.street, "");
    assert_eq!(parsed.unit, None);
    assert_eq!(parsed.house_number, None);
}
2027
#[test]
fn parse_address_round_trips_through_serde() {
    // Serialise a parsed line to JSON and read it back; the decoded value
    // must compare equal to the one we started from.
    let original = Normalizer::parse_address_line("Flat 2A, 10A Downing Street");
    let encoded = serde_json::to_string(&original).unwrap();
    let decoded = serde_json::from_str::<ParsedAddressLine>(&encoded).unwrap();
    assert_eq!(original, decoded);
}
2035
#[test]
fn parse_address_uppercases_house_number_suffix() {
    // A lowercase suffix letter ("10a") is reported in uppercase ("10A").
    let parsed = Normalizer::parse_address_line("10a Downing St");
    let number = parsed.house_number.as_deref();
    assert_eq!(number, Some("10A"));
}
2041
2042 // ---------- normalize_email ----------
2043
#[test]
fn normalize_email_lowercases_and_trims() {
    // Surrounding whitespace is dropped and both parts are lowercased.
    let normalized = Normalizer::normalize_email(" Alice@Example.ORG ", false);
    assert_eq!(normalized.as_deref(), Some("alice@example.org"));
}
2051
#[test]
fn normalize_email_preserves_well_formed_input() {
    // An address already in canonical form comes back unchanged.
    let canonical = "alice@example.org";
    let normalized = Normalizer::normalize_email(canonical, false);
    assert_eq!(normalized.as_deref(), Some(canonical));
}
2059
#[test]
fn normalize_email_rejects_missing_at_sign() {
    // No '@' in the input means it is not accepted as an address.
    let outcome = Normalizer::normalize_email("no-at-sign", false);
    assert_eq!(outcome, None);
}
2064
#[test]
fn normalize_email_rejects_empty_localpart_or_domain() {
    // Nothing before the '@', or nothing after it, is rejected.
    for lopsided in ["@example.org", "alice@"] {
        assert_eq!(Normalizer::normalize_email(lopsided, false), None);
    }
}
2070
#[test]
fn normalize_email_rejects_multiple_at_signs() {
    // More than one '@' is rejected.
    let outcome = Normalizer::normalize_email("a@b@c", false);
    assert_eq!(outcome, None);
}
2075
#[test]
fn normalize_email_rejects_empty_and_whitespace() {
    // Blank input, with or without whitespace, is rejected.
    for blank in ["", " "] {
        assert_eq!(Normalizer::normalize_email(blank, false), None);
    }
}
2081
#[test]
fn normalize_email_gmail_dot_folding_strips_dots_in_localpart() {
    // With folding enabled, dots in a gmail.com localpart are removed
    // (the second case also exercises lowercasing).
    for (raw, folded) in [
        ("j.smith@gmail.com", "jsmith@gmail.com"),
        ("J.S.M.I.T.H@gmail.com", "jsmith@gmail.com"),
    ] {
        assert_eq!(
            Normalizer::normalize_email(raw, true).as_deref(),
            Some(folded),
        );
    }
}
2093
#[test]
fn normalize_email_gmail_dot_folding_strips_plus_tag() {
    // With folding enabled, a "+tag" suffix is dropped for both the
    // gmail.com and googlemail.com domains.
    for (raw, folded) in [
        ("jsmith+work@gmail.com", "jsmith@gmail.com"),
        ("j.smith+anything@googlemail.com", "jsmith@googlemail.com"),
    ] {
        assert_eq!(
            Normalizer::normalize_email(raw, true).as_deref(),
            Some(folded),
        );
    }
}
2105
#[test]
fn normalize_email_gmail_dot_folding_does_not_touch_other_domains() {
    // Folding is requested, but on a non-Gmail domain the dots and the
    // "+tag" must survive untouched.
    for untouched in ["j.smith@example.org", "jsmith+work@example.org"] {
        assert_eq!(
            Normalizer::normalize_email(untouched, true).as_deref(),
            Some(untouched),
        );
    }
}
2117
#[test]
fn normalize_email_dot_folding_off_preserves_localpart_dots() {
    // With folding disabled, even a gmail.com localpart keeps its dots.
    let dotted = "j.smith@gmail.com";
    let normalized = Normalizer::normalize_email(dotted, false);
    assert_eq!(normalized.as_deref(), Some(dotted));
}
2125
#[test]
fn normalize_email_is_idempotent_on_canonical_form() {
    // (address, dot-folding flag) pairs. Normalising the output again
    // with the same flag must succeed and leave it unchanged.
    let cases = [
        ("Alice@Example.ORG", false),
        ("j.smith@gmail.com", true),
        ("jsmith+x@gmail.com", true),
        ("user@host.tld", false),
    ];
    for (input, fold) in cases {
        let first_pass = Normalizer::normalize_email(input, fold).expect("parses");
        let second_pass = Normalizer::normalize_email(&first_pass, fold).expect("idempotent");
        assert_eq!(
            first_pass, second_pass,
            "not idempotent for {input:?} fold={fold}"
        );
    }
}
2139
#[test]
fn normalize_email_gmail_dot_folding_rejects_empty_localpart_after_folding() {
    // A localpart made only of dots has nothing left once folded, so the
    // address as a whole is rejected.
    let outcome = Normalizer::normalize_email("...@gmail.com", true);
    assert_eq!(outcome, None);
}
2146
#[test]
fn parse_address_does_not_treat_st_as_unit_prefix() {
    // A leading "St" (as in "St Mary's Road") must not be mistaken for
    // the "Ste" unit prefix: no unit may be reported for this line.
    let parsed = Normalizer::parse_address_line("St Mary's Road");
    assert_eq!(parsed.unit, None);
}
2155
#[test]
fn e164_strips_trunk_zero_after_country_code() {
    // Some entry systems keep the national trunk 0 after the country
    // code (e.g. "+44 0 7700 900123"); the expected E.164 output has
    // that zero removed.
    let normalized = Normalizer::normalize_phone_e164("+44 0 7700 900123", None);
    assert_eq!(normalized.as_deref(), Some("+447700900123"));
}
2166}