worker_matcher/
matcher.rs

1//! Worker matcher engine: deterministic and probabilistic algorithms.
2//!
3//! This is the orchestration layer of the crate. It pulls together the data
4//! types from [`crate::models`], the text transformations from
5//! [`crate::normalizer`], and the similarity primitives from
6//! [`crate::scorer`] to produce a single answer about whether two worker
7//! records refer to the same individual.
8//!
9//! ## Two strategies, one engine
10//!
11//! - [`MatchingEngine::deterministic_match`] — fast, binary, defensible.
12//!   Returns `true` iff either both NHS numbers parse to the same value or
13//!   the normalised name + DOB + gender all match exactly.
14//! - [`MatchingEngine::match_workers`] — weighted probabilistic scoring,
15//!   returning a [`MatchResult`] with per-field [`MatchBreakdown`].
16//!
17//! ## Example
18//!
19//! ```
20//! use worker_matcher::{MatchingEngine, Worker};
21//! use chrono::NaiveDate;
22//!
23//! let a = Worker::builder()
24//!     .given_name("John")
25//!     .family_name("Smith")
26//!     .date_of_birth(NaiveDate::from_ymd_opt(1980, 5, 15).unwrap())
27//!     .build();
28//!
29//! let b = Worker::builder()
30//!     .given_name("Jon")          // typo
31//!     .family_name("Smith")
32//!     .date_of_birth(NaiveDate::from_ymd_opt(1980, 5, 15).unwrap())
33//!     .build();
34//!
35//! let engine = MatchingEngine::default_config();
36//! let result = engine.match_workers(&a, &b);
37//! assert!(result.is_match);
38//! ```
39
40use crate::identifiers;
41use crate::models::{Address, PassportBook, Worker};
42use crate::nicknames::NicknameTable;
43use crate::normalizer::Normalizer;
44use crate::scorer::{Scorer, SimilarityAlgorithm};
45use chrono::{Datelike, NaiveDate};
46use serde::{Deserialize, Serialize};
47
/// Tunable configuration for the matching engine.
///
/// All weights are dimensionless and contribute to a renormalised weighted
/// sum — they do not need to add to `1.0`. The matching pipeline divides the
/// weighted sum by the sum of *participating* weights so that missing fields
/// neither contribute nor penalise. The score is then compared against
/// [`MatchConfig::match_threshold`] to produce the `is_match` boolean.
///
/// The struct carries a container-level `#[serde(default)]`, so any field
/// absent from a serialised configuration is filled in from
/// [`MatchConfig::default`] instead of causing a deserialisation error.
///
/// Two presets cover most needs:
///
/// - [`MatchConfig::strict`]  — `match_threshold = 0.95`, `strict_mode = true`.
/// - [`MatchConfig::lenient`] — `match_threshold = 0.75`, phonetic on.
///
/// # Example
///
/// ```
/// use worker_matcher::{MatchConfig, SimilarityAlgorithm};
///
/// let custom = MatchConfig {
///     match_threshold: 0.80,
///     uk_nhs_number_weight: 0.30,
///     fr_nir_weight: 0.30,
///     es_tsi_weight: 0.30,
///     ie_ihi_weight: 0.30,
///     uk_hc_number_weight: 0.30,
///     us_ssn_weight: 0.30,
///     au_ihi_weight: 0.30,
///     de_kvnr_weight: 0.30,
///     it_cf_weight: 0.30,
///     nl_bsn_weight: 0.30,
///     se_workernummer_weight: 0.30,
///     uk_chi_number_weight: 0.30,
///     be_nn_weight: 0.30,
///     bg_egn_weight: 0.30,
///     cz_rc_weight: 0.30,
///     dk_cpr_weight: 0.30,
///     ee_ik_weight: 0.30,
///     es_dni_weight: 0.30,
///     fi_hetu_weight: 0.30,
///     hr_oib_weight: 0.30,
///     is_kt_weight: 0.30,
///     lt_ak_weight: 0.30,
///     lv_pk_weight: 0.30,
///     mt_id_weight: 0.30,
///     no_fnr_weight: 0.30,
///     pl_pesel_weight: 0.30,
///     ro_cnp_weight: 0.30,
///     si_emso_weight: 0.30,
///     sk_rc_weight: 0.30,
///     uk_nino_weight: 0.30,
///     gr_dss_weight: 0.30,
///     li_id_weight: 0.30,
///     nl_id_weight: 0.30,
///     pl_nip_weight: 0.30,
///     pt_nif_weight: 0.30,
///     br_cpf_weight: 0.30,
///     cn_rrn_weight: 0.30,
///     in_aadhaar_weight: 0.30,
///     jp_my_number_weight: 0.30,
///     mx_curp_weight: 0.30,
///     nz_nhi_weight: 0.30,
///     za_id_weight: 0.30,
///     passport_book_weight: 0.30,
///     given_name_weight: 0.15,
///     family_name_weight: 0.20,
///     date_of_birth_weight: 0.15,
///     gender_weight: 0.05,
///     blood_type_weight: 0.05,
///     multiple_birth_weight: 0.05,
///     address_weight: 0.025,
///     birth_place_weight: 0.05,
///     death_date_weight: 0.10,
///     death_place_weight: 0.05,
///     phone_weight: 0.025,
///     email_weight: 0.05,
///     use_phonetic_matching: true,
///     name_algorithm: SimilarityAlgorithm::JaroWinkler,
///     strict_mode: false,
///     nickname_table: worker_matcher::NicknameTable::empty(),
///     gmail_dot_folding: false,
///     phone_default_country: Some("GB".into()),
/// };
/// assert_eq!(custom.match_threshold, 0.80);
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct MatchConfig {
    /// Threshold score for considering two workers a match (`0.0..=1.0`).
    pub match_threshold: f64,

    /// Weight for UK NHS Number match (only contributes if both parse).
    pub uk_nhs_number_weight: f64,

    /// Weight for France NIR match (only contributes if both parse).
    pub fr_nir_weight: f64,

    /// Weight for Spain TSI / CIP-SNS match (only contributes if both parse).
    pub es_tsi_weight: f64,

    /// Weight for Ireland IHI match (only contributes if both parse).
    pub ie_ihi_weight: f64,

    /// Weight for United Kingdom (Northern Ireland) H&C Number match (only contributes if both parse).
    pub uk_hc_number_weight: f64,

    /// Weight for United States Social Security Number match (only contributes if both parse).
    pub us_ssn_weight: f64,

    /// Weight for Australia IHI match (only contributes if both parse).
    pub au_ihi_weight: f64,

    /// Weight for Germany KVNR match (only contributes if both parse).
    pub de_kvnr_weight: f64,

    /// Weight for Italy *Codice Fiscale* match (only contributes if both parse).
    pub it_cf_weight: f64,

    /// Weight for Netherlands BSN match (only contributes if both parse).
    pub nl_bsn_weight: f64,

    /// Weight for Sweden *Personnummer* match (only contributes if both parse).
    /// The field name keeps the crate's `workernummer` spelling.
    pub se_workernummer_weight: f64,

    /// Weight for United Kingdom (Scotland) CHI Number match (only contributes if both parse).
    pub uk_chi_number_weight: f64,

    /// Weight for Belgium National Number match (only contributes if both parse).
    pub be_nn_weight: f64,
    /// Weight for Bulgaria EGN match (only contributes if both parse).
    pub bg_egn_weight: f64,
    /// Weight for Czech *Rodné číslo* match (only contributes if both parse).
    pub cz_rc_weight: f64,
    /// Weight for Denmark CPR match (only contributes if both parse).
    pub dk_cpr_weight: f64,
    /// Weight for Estonia *Isikukood* match (only contributes if both parse).
    pub ee_ik_weight: f64,
    /// Weight for Spain DNI / NIE match (only contributes if both parse).
    pub es_dni_weight: f64,
    /// Weight for Finland HETU match (only contributes if both parse).
    pub fi_hetu_weight: f64,
    /// Weight for Croatia OIB match (only contributes if both parse).
    pub hr_oib_weight: f64,
    /// Weight for Iceland *Kennitala* match (only contributes if both parse).
    pub is_kt_weight: f64,
    /// Weight for Lithuania *Asmens kodas* match (only contributes if both parse).
    pub lt_ak_weight: f64,
    /// Weight for Latvia *Personas kods* match (only contributes if both parse).
    pub lv_pk_weight: f64,
    /// Weight for Malta National ID match (only contributes if both parse).
    pub mt_id_weight: f64,
    /// Weight for Norway *Fødselsnummer* match (only contributes if both parse).
    pub no_fnr_weight: f64,
    /// Weight for Poland PESEL match (only contributes if both parse).
    pub pl_pesel_weight: f64,
    /// Weight for Romania CNP match (only contributes if both parse).
    pub ro_cnp_weight: f64,
    /// Weight for Slovenia EMŠO match (only contributes if both parse).
    pub si_emso_weight: f64,
    /// Weight for Slovakia *Rodné číslo* match (only contributes if both parse).
    pub sk_rc_weight: f64,
    /// Weight for UK NINO match (only contributes if both parse).
    pub uk_nino_weight: f64,
    /// Weight for Greece DSS investor-share match (only contributes if both parse).
    pub gr_dss_weight: f64,
    /// Weight for Liechtenstein National ID match (only contributes if both parse).
    pub li_id_weight: f64,
    /// Weight for Netherlands National ID match (only contributes if both parse).
    pub nl_id_weight: f64,
    /// Weight for Poland NIP match (only contributes if both parse).
    pub pl_nip_weight: f64,
    /// Weight for Portugal NIF match (only contributes if both parse).
    pub pt_nif_weight: f64,
    /// Weight for Brazil CPF match (only contributes if both parse).
    pub br_cpf_weight: f64,
    /// Weight for China Resident Identity Card match (only contributes if both parse).
    pub cn_rrn_weight: f64,
    /// Weight for India Aadhaar match (only contributes if both parse).
    pub in_aadhaar_weight: f64,
    /// Weight for Japan My Number match (only contributes if both parse).
    pub jp_my_number_weight: f64,
    /// Weight for Mexico CURP match (only contributes if both parse).
    pub mx_curp_weight: f64,
    /// Weight for New Zealand NHI match (only contributes if both parse).
    pub nz_nhi_weight: f64,
    /// Weight for South Africa ID Number match (only contributes if both parse).
    pub za_id_weight: f64,

    /// Weight for passport-book match (contributes when both workers
    /// have at least one [`crate::PassportBook`] recorded). See spec
    /// §6.4a / FR-51.
    pub passport_book_weight: f64,

    /// Weight for given-name similarity.
    pub given_name_weight: f64,

    /// Weight for family-name similarity.
    pub family_name_weight: f64,

    /// Weight for date-of-birth match.
    ///
    /// NOTE(review): the documentation of [`Self::death_date_weight`]
    /// refers to a "DOB transposition heuristic", which suggests the
    /// date-of-birth comparison is not purely exact — confirm against
    /// the scoring code.
    pub date_of_birth_weight: f64,

    /// Weight for gender exact match.
    pub gender_weight: f64,

    /// Weight for ABO+RhD blood-type exact match (see [`crate::BloodType`]).
    /// Defaults to `0.05` — blood type is a weak positive signal but a
    /// strong negative signal (disagreement is reliable evidence of
    /// non-match because blood type doesn't change over a lifetime).
    pub blood_type_weight: f64,

    /// Weight for multiple-birth indicator exact match (FHIR
    /// `Patient.multipleBirth`). Defaults to `0.05` — a weak positive
    /// signal in general, but a strong negative signal for
    /// distinguishing identical twins who otherwise share name, DOB,
    /// and address.
    pub multiple_birth_weight: f64,

    /// Weight for address similarity.
    pub address_weight: f64,

    /// Weight for place-of-birth match (FHIR `Patient.birthPlace`).
    /// Defaults to `0.05` — stable for life so disagreement is
    /// informative, but agreement alone is weak because many people
    /// are born in the same place. Scored against the `city` and
    /// `country` sub-fields of [`crate::Address`].
    pub birth_place_weight: f64,

    /// Weight for date-of-death match (FHIR
    /// `Patient.deceasedDateTime`). Defaults to `0.10` — exact
    /// agreement on a recorded death date is strong evidence the
    /// records refer to the same worker, scored with the same DOB
    /// transposition heuristic as [`Self::date_of_birth_weight`].
    pub death_date_weight: f64,

    /// Weight for place-of-death match. Defaults to `0.05` — analogous
    /// to [`Self::birth_place_weight`]: stable per record (someone only
    /// dies once) so disagreement is informative, but agreement alone
    /// is weak. Scored against the `city` and `country` sub-fields of
    /// [`crate::Address`].
    pub death_place_weight: f64,

    /// Weight for phone-number exact match (after normalisation).
    pub phone_weight: f64,

    /// Weight for email-address exact match (after normalisation via
    /// [`crate::Normalizer::normalize_email`]).
    pub email_weight: f64,

    /// Whether to add a phonetic-name bonus when both names sound alike.
    pub use_phonetic_matching: bool,

    /// Similarity algorithm to use when comparing given and family names.
    pub name_algorithm: SimilarityAlgorithm,

    /// Reserved flag for stricter deterministic enforcement. See spec OQ-5.
    pub strict_mode: bool,

    /// Fold Gmail-specific localpart dots and `+tag` suffixes during
    /// email normalisation. When `true`, addresses on `gmail.com` /
    /// `googlemail.com` have every `.` in the localpart removed and any
    /// `+anything` suffix dropped before comparison, mirroring Gmail's
    /// documented routing rules. Defaults to `false` so non-Gmail
    /// addresses are unaffected and the canonical form is unsurprising.
    pub gmail_dot_folding: bool,

    /// Optional table of nickname equivalence classes consulted by name
    /// scoring.
    ///
    /// When a name pair is equivalent under this table — e.g.
    /// `("Michael", "Mike")` — the matcher **lifts the given-name (and
    /// family-name) similarity score to at least `0.9`**, ensuring the
    /// table-driven equivalence is not undone by a low Jaro-Winkler /
    /// Levenshtein score on dissimilar forms. The boost never lowers a
    /// score.
    ///
    /// Defaults to [`NicknameTable::empty`] so existing behaviour is
    /// preserved. Opt in with [`NicknameTable::english`] (a built-in
    /// English-language dictionary) or build a custom table via
    /// [`NicknameTable::with_class`].
    pub nickname_table: NicknameTable,

    /// ISO 3166-1 alpha-2 country code applied to phone numbers that lack
    /// an explicit international marker (`+CC` or `00CC`).
    ///
    /// When `Some(cc)`, the matcher converts each phone to E.164 form via
    /// [`crate::Normalizer::normalize_phone_e164`] using `cc` as the
    /// fallback jurisdiction. Numbers from different countries with
    /// overlapping national digits will no longer collide. When `None`,
    /// only inputs carrying an explicit international marker reach E.164;
    /// every other comparison falls back to the legacy
    /// [`crate::Normalizer::normalize_phone`] form.
    ///
    /// Defaults to `Some("GB")` to preserve the crate's historical
    /// UK-centric behaviour. Set to the worker population's predominant
    /// jurisdiction in production deployments.
    pub phone_default_country: Option<String>,
}
345
346impl Default for MatchConfig {
347    /// Production-ready defaults tuned per spec §13.1.
348    ///
349    /// ```
350    /// use worker_matcher::{MatchConfig, SimilarityAlgorithm};
351    /// let c = MatchConfig::default();
352    /// assert!((c.match_threshold - 0.85).abs() < 1e-9);
353    /// assert!(c.use_phonetic_matching);
354    /// assert!(matches!(c.name_algorithm, SimilarityAlgorithm::Combined));
355    /// ```
356    fn default() -> Self {
357        Self {
358            match_threshold: 0.85,
359            uk_nhs_number_weight: 0.30,
360            fr_nir_weight: 0.30,
361            es_tsi_weight: 0.30,
362            ie_ihi_weight: 0.30,
363            uk_hc_number_weight: 0.30,
364            us_ssn_weight: 0.30,
365            au_ihi_weight: 0.30,
366            de_kvnr_weight: 0.30,
367            it_cf_weight: 0.30,
368            nl_bsn_weight: 0.30,
369            se_workernummer_weight: 0.30,
370            uk_chi_number_weight: 0.30,
371            be_nn_weight: 0.30,
372            bg_egn_weight: 0.30,
373            cz_rc_weight: 0.30,
374            dk_cpr_weight: 0.30,
375            ee_ik_weight: 0.30,
376            es_dni_weight: 0.30,
377            fi_hetu_weight: 0.30,
378            hr_oib_weight: 0.30,
379            is_kt_weight: 0.30,
380            lt_ak_weight: 0.30,
381            lv_pk_weight: 0.30,
382            mt_id_weight: 0.30,
383            no_fnr_weight: 0.30,
384            pl_pesel_weight: 0.30,
385            ro_cnp_weight: 0.30,
386            si_emso_weight: 0.30,
387            sk_rc_weight: 0.30,
388            uk_nino_weight: 0.30,
389            gr_dss_weight: 0.30,
390            li_id_weight: 0.30,
391            nl_id_weight: 0.30,
392            pl_nip_weight: 0.30,
393            pt_nif_weight: 0.30,
394            br_cpf_weight: 0.30,
395            cn_rrn_weight: 0.30,
396            in_aadhaar_weight: 0.30,
397            jp_my_number_weight: 0.30,
398            mx_curp_weight: 0.30,
399            nz_nhi_weight: 0.30,
400            za_id_weight: 0.30,
401            passport_book_weight: 0.30,
402            given_name_weight: 0.15,
403            family_name_weight: 0.20,
404            date_of_birth_weight: 0.20,
405            gender_weight: 0.05,
406            blood_type_weight: 0.05,
407            multiple_birth_weight: 0.05,
408            address_weight: 0.05,
409            birth_place_weight: 0.05,
410            death_date_weight: 0.10,
411            death_place_weight: 0.05,
412            phone_weight: 0.05,
413            email_weight: 0.05,
414            use_phonetic_matching: true,
415            name_algorithm: SimilarityAlgorithm::Combined,
416            strict_mode: false,
417            nickname_table: NicknameTable::empty(),
418            gmail_dot_folding: false,
419            phone_default_country: Some("GB".to_string()),
420        }
421    }
422}
423
424impl MatchConfig {
425    /// A stricter preset: `match_threshold = 0.95`, `strict_mode = true`.
426    ///
427    /// Use when a clinician must rely on the answer and false positives are
428    /// more dangerous than false negatives.
429    ///
430    /// ```
431    /// use worker_matcher::MatchConfig;
432    /// let c = MatchConfig::strict();
433    /// assert!((c.match_threshold - 0.95).abs() < 1e-9);
434    /// assert!(c.strict_mode);
435    /// ```
436    pub fn strict() -> Self {
437        Self {
438            match_threshold: 0.95,
439            strict_mode: true,
440            ..Default::default()
441        }
442    }
443
444    /// A more forgiving preset: `match_threshold = 0.75`, phonetic matching on.
445    ///
446    /// Use when triaging large candidate sets where false negatives are
447    /// worse than false positives.
448    ///
449    /// ```
450    /// use worker_matcher::MatchConfig;
451    /// let c = MatchConfig::lenient();
452    /// assert!((c.match_threshold - 0.75).abs() < 1e-9);
453    /// assert!(c.use_phonetic_matching);
454    /// ```
455    pub fn lenient() -> Self {
456        Self {
457            match_threshold: 0.75,
458            use_phonetic_matching: true,
459            ..Default::default()
460        }
461    }
462}
463
/// Qualitative confidence band derived from the probabilistic
/// [`MatchResult::score`].
///
/// The bands are fixed across all `MatchConfig` presets — they do **not**
/// follow `match_threshold`. They are intended for triage UIs and audit
/// logs where a coarse High/Medium/Low summary is more useful than the
/// raw float. The `is_match` boolean remains the authoritative go/no-go
/// signal because it incorporates the configured threshold.
///
/// Use [`Confidence::from_score`] to bucket a raw score into a band.
///
/// Boundaries (per spec §12.5):
///
/// | Score range | Band |
/// |---|---|
/// | `score >= 0.90` | `High` |
/// | `0.75 <= score < 0.90` | `Medium` |
/// | `score < 0.75` | `Low` |
///
/// # Examples
///
/// ```
/// use worker_matcher::Confidence;
///
/// assert_eq!(Confidence::from_score(0.99), Confidence::High);
/// assert_eq!(Confidence::from_score(0.90), Confidence::High);   // inclusive
/// assert_eq!(Confidence::from_score(0.85), Confidence::Medium);
/// assert_eq!(Confidence::from_score(0.75), Confidence::Medium); // inclusive
/// assert_eq!(Confidence::from_score(0.50), Confidence::Low);
/// assert_eq!(Confidence::from_score(0.00), Confidence::Low);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Confidence {
    /// Score is at or above `0.90`. Strong match; safe to act on with
    /// minimal review.
    High,
    /// Score is at least `0.75` and below `0.90`. Medium-confidence
    /// match; the per-field [`MatchBreakdown`] should be inspected
    /// before clinical use.
    Medium,
    /// Score is below `0.75`. Treat as a candidate at best; require
    /// additional evidence before treating as the same worker.
    Low,
}
505
506impl Confidence {
507    /// Bucket a probabilistic score into one of the three bands.
508    ///
509    /// The function is total over `f64`: NaN inputs degrade to `Low`,
510    /// negative scores degrade to `Low`, scores above `1.0` are treated
511    /// as `High`. In practice the matcher only ever produces values in
512    /// `[0.0, 1.0]`, so callers shouldn't encounter the degenerate
513    /// inputs.
514    ///
515    /// ```
516    /// use worker_matcher::Confidence;
517    ///
518    /// assert_eq!(Confidence::from_score(f64::NAN), Confidence::Low);
519    /// assert_eq!(Confidence::from_score(-0.5),     Confidence::Low);
520    /// assert_eq!(Confidence::from_score(2.0),      Confidence::High);
521    /// ```
522    pub fn from_score(score: f64) -> Self {
523        if score >= 0.90 {
524            Confidence::High
525        } else if score >= 0.75 {
526            Confidence::Medium
527        } else {
528            Confidence::Low
529        }
530    }
531}
532
/// Outcome of a probabilistic worker match.
///
/// Contains the overall renormalised `score`, the threshold-derived
/// `is_match` boolean, a coarse [`Confidence`] band, and a per-field
/// [`MatchBreakdown`] for audit.
///
/// `MatchResult` implements `Serialize + Deserialize` so it can be persisted
/// or returned over an API.
///
/// ```
/// use worker_matcher::{Confidence, MatchingEngine, Worker};
///
/// let p = Worker::builder().given_name("Ada").family_name("Lovelace").build();
/// let q = p.clone();
/// let result = MatchingEngine::default_config().match_workers(&p, &q);
/// assert_eq!(result.confidence, Confidence::High);
///
/// // Round-trip through JSON.
/// let json = serde_json::to_string(&result).unwrap();
/// let back: worker_matcher::MatchResult = serde_json::from_str(&json).unwrap();
/// assert!((result.score - back.score).abs() < 1e-12);
/// assert_eq!(result.is_match, back.is_match);
/// assert_eq!(result.confidence, back.confidence);
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchResult {
    /// Overall match score in `[0.0, 1.0]`.
    pub score: f64,

    /// `true` if `score >= MatchConfig::match_threshold`.
    pub is_match: bool,

    /// Coarse confidence band derived from `score` per spec §12.5.
    /// Defaults to [`Confidence::Low`] on legacy JSON payloads that
    /// pre-date the field. For such payloads the band may disagree with
    /// `score`; it can be recomputed with [`Confidence::from_score`].
    #[serde(default = "default_confidence")]
    pub confidence: Confidence,

    /// Per-field score contributions for explainability.
    pub breakdown: MatchBreakdown,
}
574
575/// Backstop for legacy `MatchResult` JSON payloads that lack the
576/// `confidence` field. Returns `Confidence::Low` so a deserialised
577/// payload that pre-dates the field is unambiguously flagged as
578/// "needs re-scoring".
579fn default_confidence() -> Confidence {
580    Confidence::Low
581}
582
/// Per-field score breakdown returned with every [`MatchResult`].
///
/// Each field is `Option<f64>`:
///
/// - `Some(score)` — the field was scored; the value is in `[0.0, 1.0]`.
/// - `None` — the field was missing on at least one side and so did not
///   participate in the weighted sum.
///
/// Fields tagged `#[serde(default)]` deserialise to `None` when they are
/// absent from the payload.
///
/// The breakdown exists so a clinician or auditor can see *why* a match was
/// flagged. Do not throw it away in downstream services.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchBreakdown {
    /// Score for UK NHS Number equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub uk_nhs_number_score: Option<f64>,
    /// Score for France NIR equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub fr_nir_score: Option<f64>,
    /// Score for Spain TSI / CIP-SNS equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub es_tsi_score: Option<f64>,
    /// Score for Ireland IHI equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub ie_ihi_score: Option<f64>,
    /// Score for United Kingdom (Northern Ireland) H&C Number equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub uk_hc_number_score: Option<f64>,
    /// Score for United States Social Security Number equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub us_ssn_score: Option<f64>,
    /// Score for Australia IHI equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub au_ihi_score: Option<f64>,
    /// Score for Germany KVNR equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub de_kvnr_score: Option<f64>,
    /// Score for Italy *Codice Fiscale* equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub it_cf_score: Option<f64>,
    /// Score for Netherlands BSN equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub nl_bsn_score: Option<f64>,
    /// Score for Sweden *Personnummer* equality (`1.0` or `0.0`), or `None` if either side did not parse.
    /// The field name keeps the crate's `workernummer` spelling.
    #[serde(default)]
    pub se_workernummer_score: Option<f64>,
    /// Score for United Kingdom (Scotland) CHI Number equality (`1.0` or `0.0`), or `None` if either side did not parse.
    #[serde(default)]
    pub uk_chi_number_score: Option<f64>,
    /// Score for Belgium National Number equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub be_nn_score: Option<f64>,
    /// Score for Bulgaria EGN equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub bg_egn_score: Option<f64>,
    /// Score for Czech *Rodné číslo* equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub cz_rc_score: Option<f64>,
    /// Score for Denmark CPR equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub dk_cpr_score: Option<f64>,
    /// Score for Estonia *Isikukood* equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub ee_ik_score: Option<f64>,
    /// Score for Spain DNI / NIE equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub es_dni_score: Option<f64>,
    /// Score for Finland HETU equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub fi_hetu_score: Option<f64>,
    /// Score for Croatia OIB equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub hr_oib_score: Option<f64>,
    /// Score for Iceland *Kennitala* equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub is_kt_score: Option<f64>,
    /// Score for Lithuania *Asmens kodas* equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub lt_ak_score: Option<f64>,
    /// Score for Latvia *Personas kods* equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub lv_pk_score: Option<f64>,
    /// Score for Malta National ID equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub mt_id_score: Option<f64>,
    /// Score for Norway *Fødselsnummer* equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub no_fnr_score: Option<f64>,
    /// Score for Poland PESEL equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub pl_pesel_score: Option<f64>,
    /// Score for Romania CNP equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub ro_cnp_score: Option<f64>,
    /// Score for Slovenia EMŠO equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub si_emso_score: Option<f64>,
    /// Score for Slovakia *Rodné číslo* equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub sk_rc_score: Option<f64>,
    /// Score for UK NINO equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub uk_nino_score: Option<f64>,
    /// Score for Greece DSS investor-share equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub gr_dss_score: Option<f64>,
    /// Score for Liechtenstein National ID equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub li_id_score: Option<f64>,
    /// Score for Netherlands National ID equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub nl_id_score: Option<f64>,
    /// Score for Poland NIP equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub pl_nip_score: Option<f64>,
    /// Score for Portugal NIF equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub pt_nif_score: Option<f64>,
    /// Score for Brazil CPF equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub br_cpf_score: Option<f64>,
    /// Score for China Resident Identity Card equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub cn_rrn_score: Option<f64>,
    /// Score for India Aadhaar equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub in_aadhaar_score: Option<f64>,
    /// Score for Japan My Number equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub jp_my_number_score: Option<f64>,
    /// Score for Mexico CURP equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub mx_curp_score: Option<f64>,
    /// Score for New Zealand NHI equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub nz_nhi_score: Option<f64>,
    /// Score for South Africa ID Number equality (`1.0` or `0.0`), or `None`.
    #[serde(default)]
    pub za_id_score: Option<f64>,
    /// Score for passport-book match: `Some(1.0)` when at least one
    /// `(country, number)` pair is shared across the two workers'
    /// passport lists, `Some(0.0)` when both have at least one book
    /// but no pair is shared, `None` when either side has no books.
    /// See spec §6.4a / FR-51.
    #[serde(default)]
    pub passport_book_score: Option<f64>,
    /// Score for given-name similarity using the configured algorithm.
    pub given_name_score: Option<f64>,
    /// Score for family-name similarity using the configured algorithm.
    pub family_name_score: Option<f64>,
    /// Score for date-of-birth agreement.
    ///
    /// NOTE(review): [`Self::death_date_score`] is documented as using
    /// the same DOB transposition heuristic as this field (`1.0` on
    /// exact equality, `0.5` on day/month swap, `0.0` otherwise), which
    /// suggests this score can also be `0.5` rather than only `1.0` or
    /// `0.0` — confirm against the scoring code.
    pub date_of_birth_score: Option<f64>,
    /// Score for gender equality (`1.0` or `0.0`).
    pub gender_score: Option<f64>,
    /// Score for ABO+RhD blood-type equality (`1.0` or `0.0`), or
    /// `None` if either side is missing. See [`crate::BloodType`].
    #[serde(default)]
    pub blood_type_score: Option<f64>,
    /// Score for multiple-birth indicator equality (`1.0` for matching
    /// birth order, `0.0` for different birth orders within the same
    /// multiple-birth set), or `None` if either side is missing.
    #[serde(default)]
    pub multiple_birth_score: Option<f64>,
    /// Score for address similarity (weighted blend of postcode, city, line 1).
    pub address_score: Option<f64>,
    /// Score for place-of-birth similarity (city Jaro-Winkler blended
    /// with country exact match), or `None` if either side is absent
    /// or carries no city / country sub-fields.
    #[serde(default)]
    pub birth_place_score: Option<f64>,
    /// Score for date-of-death equality, using the same DOB
    /// transposition heuristic as [`Self::date_of_birth_score`]:
    /// `1.0` on exact equality, `0.5` on day/month swap, `0.0`
    /// otherwise. `None` if either side has no recorded death date.
    #[serde(default)]
    pub death_date_score: Option<f64>,
    /// Score for place-of-death similarity (analogous to
    /// [`Self::birth_place_score`]: city Jaro-Winkler blended with
    /// country exact match), or `None` if either side is absent or
    /// carries no city / country sub-fields.
    #[serde(default)]
    pub death_place_score: Option<f64>,
    /// Score for normalised phone-number equality (`1.0` or `0.0`).
    pub phone_score: Option<f64>,
    /// Score for normalised email-address equality (`1.0` or `0.0`).
    /// `None` if either side is absent or fails to parse as an email.
    #[serde(default)]
    pub email_score: Option<f64>,
    /// Mean Soundex match across given and family names (`0.0`, `0.5`, or `1.0`).
    pub phonetic_name_score: Option<f64>,
}
773
/// Worker matcher engine.
///
/// The engine is **immutable after construction** and cheap to clone (it
/// owns only a [`MatchConfig`]). Construct one and call its methods from any
/// thread.
///
/// ```
/// use worker_matcher::{MatchConfig, MatchingEngine};
///
/// let engine_a = MatchingEngine::default_config();
/// let engine_b = MatchingEngine::new(MatchConfig::strict());
/// # let _ = (engine_a, engine_b);
/// ```
// NOTE(review): no `#[derive(Clone)]` is visible on this definition, so the
// "cheap to clone" wording above only holds if a `Clone` impl exists
// elsewhere in the crate — confirm, or add the derive.
pub struct MatchingEngine {
    /// Weights, match threshold, and mode flags (`strict_mode`,
    /// `use_phonetic_matching`) read by the engine's matching methods.
    config: MatchConfig,
}
790
791impl MatchingEngine {
792    /// Construct an engine with the given configuration.
793    ///
794    /// ```
795    /// use worker_matcher::{MatchConfig, MatchingEngine};
796    /// let engine = MatchingEngine::new(MatchConfig::lenient());
797    /// # let _ = engine;
798    /// ```
799    pub fn new(config: MatchConfig) -> Self {
800        Self { config }
801    }
802
803    /// Construct an engine with [`MatchConfig::default`].
804    ///
805    /// ```
806    /// use worker_matcher::MatchingEngine;
807    /// let engine = MatchingEngine::default_config();
808    /// # let _ = engine;
809    /// ```
810    pub fn default_config() -> Self {
811        Self::new(MatchConfig::default())
812    }
813
814    /// Compare two workers probabilistically and return a [`MatchResult`].
815    ///
816    /// The score is the weight-renormalised sum of every component that
817    /// scored on both records. Missing fields are skipped, not penalised.
818    ///
819    /// ```
820    /// use worker_matcher::{MatchingEngine, Worker};
821    /// use chrono::NaiveDate;
822    ///
823    /// let p = Worker::builder()
824    ///     .given_name("Carys")
825    ///     .family_name("Pritchard")
826    ///     .date_of_birth(NaiveDate::from_ymd_opt(1985, 1, 1).unwrap())
827    ///     .build();
828    ///
829    /// let result = MatchingEngine::default_config().match_workers(&p, &p);
830    /// assert!(result.is_match);
831    /// assert!(result.score > 0.99);
832    /// ```
833    pub fn match_workers(&self, worker1: &Worker, worker2: &Worker) -> MatchResult {
834        let breakdown = self.calculate_breakdown(worker1, worker2);
835        let score = self.calculate_weighted_score(&breakdown);
836        let above_threshold = score >= self.config.match_threshold;
837        // Under strict mode, `is_match` ALSO requires a deterministic
838        // match (any identifier scheme agrees, or the full demographic
839        // tuple agrees). This narrows the false-positive surface in
840        // clinical workflows where the threshold alone is not enough.
841        // See spec FR-47 / §13.2 / OQ-5.
842        let is_match = if self.config.strict_mode {
843            above_threshold && self.deterministic_match(worker1, worker2)
844        } else {
845            above_threshold
846        };
847        let confidence = Confidence::from_score(score);
848
849        MatchResult {
850            score,
851            is_match,
852            confidence,
853            breakdown,
854        }
855    }
856
857    /// Score a single query against many candidates in parallel-friendly
858    /// fashion. Returns one [`MatchResult`] per candidate, in the same
859    /// order as the input slice.
860    ///
861    /// This is the building block for a Master Worker Index workflow:
862    /// given a new record and the existing population, produce a fully
863    /// audited score for each potential link. The engine is immutable
864    /// and `Send + Sync`, so call-sites that want parallel evaluation
865    /// can wrap the call in `rayon::par_iter` or similar without
866    /// further changes to this crate.
867    ///
868    /// For sparse / large populations consider blocking — pre-filter
869    /// `candidates` with a cheap predicate (e.g. matching family-name
870    /// Soundex or postcode prefix) before passing the survivors to this
871    /// function. Blocking is a consumer concern and is intentionally
872    /// not baked into the API; the crate stays a pure scoring library.
873    ///
874    /// # Examples
875    ///
876    /// ```
877    /// use worker_matcher::{MatchingEngine, Worker};
878    ///
879    /// let query = Worker::builder()
880    ///     .given_name("Ada")
881    ///     .family_name("Lovelace")
882    ///     .build();
883    /// let candidates = vec![
884    ///     Worker::builder().given_name("Ada").family_name("Lovelace").build(),
885    ///     Worker::builder().given_name("Alan").family_name("Turing").build(),
886    ///     Worker::builder().given_name("Grace").family_name("Hopper").build(),
887    /// ];
888    ///
889    /// let engine = MatchingEngine::default_config();
890    /// let results = engine.match_one_to_many(&query, &candidates);
891    ///
892    /// assert_eq!(results.len(), 3);
893    /// assert!(results[0].is_match);
894    /// assert!(!results[1].is_match);
895    /// ```
896    ///
897    /// Empty candidates yield an empty result:
898    ///
899    /// ```
900    /// # use worker_matcher::{MatchingEngine, Worker};
901    /// let q = Worker::builder().given_name("Solo").build();
902    /// let r = MatchingEngine::default_config().match_one_to_many(&q, &[]);
903    /// assert!(r.is_empty());
904    /// ```
905    pub fn match_one_to_many(&self, query: &Worker, candidates: &[Worker]) -> Vec<MatchResult> {
906        candidates
907            .iter()
908            .map(|c| self.match_workers(query, c))
909            .collect()
910    }
911
912    /// Score and rank: return `(original_index, MatchResult)` tuples
913    /// sorted by descending score. Ties are broken by ascending original
914    /// index, so the result is deterministic.
915    ///
916    /// Convenience wrapper around [`MatchingEngine::match_one_to_many`]
917    /// for the common "give me the best matches first" workflow.
918    /// Consumers that need a filtered view (e.g. only `is_match == true`)
919    /// can drop entries off the front, while consumers that need to
920    /// pair results with external metadata can use the preserved
921    /// original index.
922    ///
923    /// # Examples
924    ///
925    /// ```
926    /// use worker_matcher::{MatchingEngine, Worker};
927    ///
928    /// let query = Worker::builder().given_name("Ada").family_name("Lovelace").build();
929    /// let candidates = vec![
930    ///     Worker::builder().given_name("Grace").family_name("Hopper").build(),   // index 0
931    ///     Worker::builder().given_name("Ada").family_name("Lovelace").build(),   // index 1 — best match
932    ///     Worker::builder().given_name("Alan").family_name("Turing").build(),    // index 2
933    /// ];
934    ///
935    /// let ranked = MatchingEngine::default_config().rank_one_to_many(&query, &candidates);
936    /// assert_eq!(ranked.len(), 3);
937    /// assert_eq!(ranked[0].0, 1);                  // best match's original index
938    /// assert!(ranked[0].1.score >= ranked[1].1.score);
939    /// assert!(ranked[1].1.score >= ranked[2].1.score);
940    /// ```
941    pub fn rank_one_to_many(
942        &self,
943        query: &Worker,
944        candidates: &[Worker],
945    ) -> Vec<(usize, MatchResult)> {
946        let mut indexed: Vec<(usize, MatchResult)> = self
947            .match_one_to_many(query, candidates)
948            .into_iter()
949            .enumerate()
950            .collect();
951        indexed.sort_by(|a, b| {
952            b.1.score
953                .partial_cmp(&a.1.score)
954                .unwrap_or(std::cmp::Ordering::Equal)
955                .then_with(|| a.0.cmp(&b.0))
956        });
957        indexed
958    }
959
960    /// Compare two workers deterministically and return a single boolean.
961    ///
962    /// Returns `true` iff **any** of the following hold:
963    ///
964    /// - Both UK NHS Numbers parse and are equal.
965    /// - Both France NIRs parse and are equal.
966    /// - Both España TSIs parse and are equal.
967    /// - Both Éire IHIs parse and are equal.
968    /// - Both UK Northern Ireland H&C Numbers parse and are equal.
969    /// - Both US Social Security Numbers parse and are equal.
970    /// - Both Australia IHIs parse and are equal.
971    /// - Both Germany KVNRs parse and are equal.
972    /// - Both Italy *Codice Fiscale* values parse and are equal.
973    /// - Both Netherlands BSNs parse and are equal.
974    /// - Both Sweden *Workernummer* values parse and are equal.
975    /// - Both UK Scotland CHI Numbers parse and are equal.
976    /// - The workers share at least one `(country, number)` passport-book
977    ///   pair after canonicalisation (see [`crate::PassportBook`]).
978    /// - Normalised given name matches, **and** normalised family name
979    ///   matches, **and** date of birth matches, **and** gender matches (or
980    ///   is missing on at least one side).
981    ///
982    /// National identifiers from different schemes never cross-match: an
983    /// NHS Number is only ever compared against another NHS Number, never
984    /// against an H&C Number that happens to share the same 10 digits.
985    ///
986    /// ```
987    /// use worker_matcher::{MatchingEngine, Worker};
988    ///
989    /// // Same NHS number, different formatting → match.
990    /// let a = Worker::builder().uk_nhs_number("943 476 5919").build();
991    /// let b = Worker::builder().uk_nhs_number("9434765919").build();
992    /// assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
993    /// ```
994    pub fn deterministic_match(&self, worker1: &Worker, worker2: &Worker) -> bool {
995        if identifier_equal(
996            &worker1.uk_nhs_number,
997            &worker2.uk_nhs_number,
998            identifiers::parse_uk_nhs_number,
999        ) {
1000            return true;
1001        }
1002        if identifier_equal(&worker1.fr_nir, &worker2.fr_nir, identifiers::parse_fr_nir) {
1003            return true;
1004        }
1005        if identifier_equal(&worker1.es_tsi, &worker2.es_tsi, identifiers::parse_es_tsi) {
1006            return true;
1007        }
1008        if identifier_equal(&worker1.ie_ihi, &worker2.ie_ihi, identifiers::parse_ie_ihi) {
1009            return true;
1010        }
1011        if identifier_equal(
1012            &worker1.uk_hc_number,
1013            &worker2.uk_hc_number,
1014            identifiers::parse_uk_hc_number,
1015        ) {
1016            return true;
1017        }
1018        if identifier_equal(&worker1.us_ssn, &worker2.us_ssn, identifiers::parse_us_ssn) {
1019            return true;
1020        }
1021        if identifier_equal(&worker1.au_ihi, &worker2.au_ihi, identifiers::parse_au_ihi) {
1022            return true;
1023        }
1024        if identifier_equal(
1025            &worker1.de_kvnr,
1026            &worker2.de_kvnr,
1027            identifiers::parse_de_kvnr,
1028        ) {
1029            return true;
1030        }
1031        if identifier_equal(&worker1.it_cf, &worker2.it_cf, identifiers::parse_it_cf) {
1032            return true;
1033        }
1034        if identifier_equal(&worker1.nl_bsn, &worker2.nl_bsn, identifiers::parse_nl_bsn) {
1035            return true;
1036        }
1037        if identifier_equal(
1038            &worker1.se_workernummer,
1039            &worker2.se_workernummer,
1040            identifiers::parse_se_workernummer,
1041        ) {
1042            return true;
1043        }
1044        if identifier_equal(
1045            &worker1.uk_chi_number,
1046            &worker2.uk_chi_number,
1047            identifiers::parse_uk_chi_number,
1048        ) {
1049            return true;
1050        }
1051        if identifier_equal(&worker1.be_nn, &worker2.be_nn, identifiers::parse_be_nn) {
1052            return true;
1053        }
1054        if identifier_equal(&worker1.bg_egn, &worker2.bg_egn, identifiers::parse_bg_egn) {
1055            return true;
1056        }
1057        if identifier_equal(&worker1.cz_rc, &worker2.cz_rc, identifiers::parse_cz_rc) {
1058            return true;
1059        }
1060        if identifier_equal(&worker1.dk_cpr, &worker2.dk_cpr, identifiers::parse_dk_cpr) {
1061            return true;
1062        }
1063        if identifier_equal(&worker1.ee_ik, &worker2.ee_ik, identifiers::parse_ee_ik) {
1064            return true;
1065        }
1066        if identifier_equal(&worker1.es_dni, &worker2.es_dni, identifiers::parse_es_dni) {
1067            return true;
1068        }
1069        if identifier_equal(
1070            &worker1.fi_hetu,
1071            &worker2.fi_hetu,
1072            identifiers::parse_fi_hetu,
1073        ) {
1074            return true;
1075        }
1076        if identifier_equal(&worker1.hr_oib, &worker2.hr_oib, identifiers::parse_hr_oib) {
1077            return true;
1078        }
1079        if identifier_equal(&worker1.is_kt, &worker2.is_kt, identifiers::parse_is_kt) {
1080            return true;
1081        }
1082        if identifier_equal(&worker1.lt_ak, &worker2.lt_ak, identifiers::parse_lt_ak) {
1083            return true;
1084        }
1085        if identifier_equal(&worker1.lv_pk, &worker2.lv_pk, identifiers::parse_lv_pk) {
1086            return true;
1087        }
1088        if identifier_equal(&worker1.mt_id, &worker2.mt_id, identifiers::parse_mt_id) {
1089            return true;
1090        }
1091        if identifier_equal(&worker1.no_fnr, &worker2.no_fnr, identifiers::parse_no_fnr) {
1092            return true;
1093        }
1094        if identifier_equal(
1095            &worker1.pl_pesel,
1096            &worker2.pl_pesel,
1097            identifiers::parse_pl_pesel,
1098        ) {
1099            return true;
1100        }
1101        if identifier_equal(&worker1.ro_cnp, &worker2.ro_cnp, identifiers::parse_ro_cnp) {
1102            return true;
1103        }
1104        if identifier_equal(
1105            &worker1.si_emso,
1106            &worker2.si_emso,
1107            identifiers::parse_si_emso,
1108        ) {
1109            return true;
1110        }
1111        if identifier_equal(&worker1.sk_rc, &worker2.sk_rc, identifiers::parse_sk_rc) {
1112            return true;
1113        }
1114        if identifier_equal(
1115            &worker1.uk_nino,
1116            &worker2.uk_nino,
1117            identifiers::parse_uk_nino,
1118        ) {
1119            return true;
1120        }
1121        if identifier_equal(&worker1.gr_dss, &worker2.gr_dss, identifiers::parse_gr_dss) {
1122            return true;
1123        }
1124        if identifier_equal(&worker1.li_id, &worker2.li_id, identifiers::parse_li_id) {
1125            return true;
1126        }
1127        if identifier_equal(&worker1.nl_id, &worker2.nl_id, identifiers::parse_nl_id) {
1128            return true;
1129        }
1130        if identifier_equal(&worker1.pl_nip, &worker2.pl_nip, identifiers::parse_pl_nip) {
1131            return true;
1132        }
1133        if identifier_equal(&worker1.pt_nif, &worker2.pt_nif, identifiers::parse_pt_nif) {
1134            return true;
1135        }
1136        if identifier_equal(&worker1.br_cpf, &worker2.br_cpf, identifiers::parse_br_cpf) {
1137            return true;
1138        }
1139        if identifier_equal(&worker1.cn_rrn, &worker2.cn_rrn, identifiers::parse_cn_rrn) {
1140            return true;
1141        }
1142        if identifier_equal(
1143            &worker1.in_aadhaar,
1144            &worker2.in_aadhaar,
1145            identifiers::parse_in_aadhaar,
1146        ) {
1147            return true;
1148        }
1149        if identifier_equal(
1150            &worker1.jp_my_number,
1151            &worker2.jp_my_number,
1152            identifiers::parse_jp_my_number,
1153        ) {
1154            return true;
1155        }
1156        if identifier_equal(
1157            &worker1.mx_curp,
1158            &worker2.mx_curp,
1159            identifiers::parse_mx_curp,
1160        ) {
1161            return true;
1162        }
1163        if identifier_equal(&worker1.nz_nhi, &worker2.nz_nhi, identifiers::parse_nz_nhi) {
1164            return true;
1165        }
1166        if identifier_equal(&worker1.za_id, &worker2.za_id, identifiers::parse_za_id) {
1167            return true;
1168        }
1169        if passport_books_share_pair(&worker1.passport_books, &worker2.passport_books) {
1170            return true;
1171        }
1172
1173        let name_match = match (&worker1.given_name, &worker2.given_name) {
1174            (Some(f1), Some(f2)) => {
1175                Normalizer::normalize_name(f1) == Normalizer::normalize_name(f2)
1176            }
1177            _ => false,
1178        } && match (&worker1.family_name, &worker2.family_name) {
1179            (Some(l1), Some(l2)) => {
1180                Normalizer::normalize_name(l1) == Normalizer::normalize_name(l2)
1181            }
1182            _ => false,
1183        };
1184
1185        let dob_match = match (worker1.date_of_birth, worker2.date_of_birth) {
1186            (Some(d1), Some(d2)) => d1 == d2,
1187            _ => false,
1188        };
1189
1190        let gender_match = match (worker1.gender, worker2.gender) {
1191            (Some(g1), Some(g2)) => g1 == g2,
1192            _ => true,
1193        };
1194
1195        name_match && dob_match && gender_match
1196    }
1197
1198    fn calculate_breakdown(&self, worker1: &Worker, worker2: &Worker) -> MatchBreakdown {
1199        MatchBreakdown {
1200            uk_nhs_number_score: identifier_score(
1201                &worker1.uk_nhs_number,
1202                &worker2.uk_nhs_number,
1203                identifiers::parse_uk_nhs_number,
1204            ),
1205            fr_nir_score: identifier_score(
1206                &worker1.fr_nir,
1207                &worker2.fr_nir,
1208                identifiers::parse_fr_nir,
1209            ),
1210            es_tsi_score: identifier_score(
1211                &worker1.es_tsi,
1212                &worker2.es_tsi,
1213                identifiers::parse_es_tsi,
1214            ),
1215            ie_ihi_score: identifier_score(
1216                &worker1.ie_ihi,
1217                &worker2.ie_ihi,
1218                identifiers::parse_ie_ihi,
1219            ),
1220            uk_hc_number_score: identifier_score(
1221                &worker1.uk_hc_number,
1222                &worker2.uk_hc_number,
1223                identifiers::parse_uk_hc_number,
1224            ),
1225            us_ssn_score: identifier_score(
1226                &worker1.us_ssn,
1227                &worker2.us_ssn,
1228                identifiers::parse_us_ssn,
1229            ),
1230            au_ihi_score: identifier_score(
1231                &worker1.au_ihi,
1232                &worker2.au_ihi,
1233                identifiers::parse_au_ihi,
1234            ),
1235            de_kvnr_score: identifier_score(
1236                &worker1.de_kvnr,
1237                &worker2.de_kvnr,
1238                identifiers::parse_de_kvnr,
1239            ),
1240            it_cf_score: identifier_score(&worker1.it_cf, &worker2.it_cf, identifiers::parse_it_cf),
1241            nl_bsn_score: identifier_score(
1242                &worker1.nl_bsn,
1243                &worker2.nl_bsn,
1244                identifiers::parse_nl_bsn,
1245            ),
1246            se_workernummer_score: identifier_score(
1247                &worker1.se_workernummer,
1248                &worker2.se_workernummer,
1249                identifiers::parse_se_workernummer,
1250            ),
1251            uk_chi_number_score: identifier_score(
1252                &worker1.uk_chi_number,
1253                &worker2.uk_chi_number,
1254                identifiers::parse_uk_chi_number,
1255            ),
1256            be_nn_score: identifier_score(&worker1.be_nn, &worker2.be_nn, identifiers::parse_be_nn),
1257            bg_egn_score: identifier_score(
1258                &worker1.bg_egn,
1259                &worker2.bg_egn,
1260                identifiers::parse_bg_egn,
1261            ),
1262            cz_rc_score: identifier_score(&worker1.cz_rc, &worker2.cz_rc, identifiers::parse_cz_rc),
1263            dk_cpr_score: identifier_score(
1264                &worker1.dk_cpr,
1265                &worker2.dk_cpr,
1266                identifiers::parse_dk_cpr,
1267            ),
1268            ee_ik_score: identifier_score(&worker1.ee_ik, &worker2.ee_ik, identifiers::parse_ee_ik),
1269            es_dni_score: identifier_score(
1270                &worker1.es_dni,
1271                &worker2.es_dni,
1272                identifiers::parse_es_dni,
1273            ),
1274            fi_hetu_score: identifier_score(
1275                &worker1.fi_hetu,
1276                &worker2.fi_hetu,
1277                identifiers::parse_fi_hetu,
1278            ),
1279            hr_oib_score: identifier_score(
1280                &worker1.hr_oib,
1281                &worker2.hr_oib,
1282                identifiers::parse_hr_oib,
1283            ),
1284            is_kt_score: identifier_score(&worker1.is_kt, &worker2.is_kt, identifiers::parse_is_kt),
1285            lt_ak_score: identifier_score(&worker1.lt_ak, &worker2.lt_ak, identifiers::parse_lt_ak),
1286            lv_pk_score: identifier_score(&worker1.lv_pk, &worker2.lv_pk, identifiers::parse_lv_pk),
1287            mt_id_score: identifier_score(&worker1.mt_id, &worker2.mt_id, identifiers::parse_mt_id),
1288            no_fnr_score: identifier_score(
1289                &worker1.no_fnr,
1290                &worker2.no_fnr,
1291                identifiers::parse_no_fnr,
1292            ),
1293            pl_pesel_score: identifier_score(
1294                &worker1.pl_pesel,
1295                &worker2.pl_pesel,
1296                identifiers::parse_pl_pesel,
1297            ),
1298            ro_cnp_score: identifier_score(
1299                &worker1.ro_cnp,
1300                &worker2.ro_cnp,
1301                identifiers::parse_ro_cnp,
1302            ),
1303            si_emso_score: identifier_score(
1304                &worker1.si_emso,
1305                &worker2.si_emso,
1306                identifiers::parse_si_emso,
1307            ),
1308            sk_rc_score: identifier_score(&worker1.sk_rc, &worker2.sk_rc, identifiers::parse_sk_rc),
1309            uk_nino_score: identifier_score(
1310                &worker1.uk_nino,
1311                &worker2.uk_nino,
1312                identifiers::parse_uk_nino,
1313            ),
1314            gr_dss_score: identifier_score(
1315                &worker1.gr_dss,
1316                &worker2.gr_dss,
1317                identifiers::parse_gr_dss,
1318            ),
1319            li_id_score: identifier_score(&worker1.li_id, &worker2.li_id, identifiers::parse_li_id),
1320            nl_id_score: identifier_score(&worker1.nl_id, &worker2.nl_id, identifiers::parse_nl_id),
1321            pl_nip_score: identifier_score(
1322                &worker1.pl_nip,
1323                &worker2.pl_nip,
1324                identifiers::parse_pl_nip,
1325            ),
1326            pt_nif_score: identifier_score(
1327                &worker1.pt_nif,
1328                &worker2.pt_nif,
1329                identifiers::parse_pt_nif,
1330            ),
1331            br_cpf_score: identifier_score(
1332                &worker1.br_cpf,
1333                &worker2.br_cpf,
1334                identifiers::parse_br_cpf,
1335            ),
1336            cn_rrn_score: identifier_score(
1337                &worker1.cn_rrn,
1338                &worker2.cn_rrn,
1339                identifiers::parse_cn_rrn,
1340            ),
1341            in_aadhaar_score: identifier_score(
1342                &worker1.in_aadhaar,
1343                &worker2.in_aadhaar,
1344                identifiers::parse_in_aadhaar,
1345            ),
1346            jp_my_number_score: identifier_score(
1347                &worker1.jp_my_number,
1348                &worker2.jp_my_number,
1349                identifiers::parse_jp_my_number,
1350            ),
1351            mx_curp_score: identifier_score(
1352                &worker1.mx_curp,
1353                &worker2.mx_curp,
1354                identifiers::parse_mx_curp,
1355            ),
1356            nz_nhi_score: identifier_score(
1357                &worker1.nz_nhi,
1358                &worker2.nz_nhi,
1359                identifiers::parse_nz_nhi,
1360            ),
1361            za_id_score: identifier_score(&worker1.za_id, &worker2.za_id, identifiers::parse_za_id),
1362            passport_book_score: score_passport_books(
1363                &worker1.passport_books,
1364                &worker2.passport_books,
1365            ),
1366            given_name_score: self.score_given_name(worker1, worker2),
1367            family_name_score: self.score_family_name(worker1, worker2),
1368            date_of_birth_score: self.score_date_of_birth(worker1, worker2),
1369            gender_score: self.score_gender(worker1, worker2),
1370            blood_type_score: self.score_blood_type(worker1, worker2),
1371            multiple_birth_score: self.score_multiple_birth(worker1, worker2),
1372            address_score: self.score_address(worker1, worker2),
1373            birth_place_score: self.score_birth_place(worker1, worker2),
1374            death_date_score: self.score_death_date(worker1, worker2),
1375            death_place_score: self.score_death_place(worker1, worker2),
1376            phone_score: self.score_phone(worker1, worker2),
1377            email_score: self.score_email(worker1, worker2),
1378            phonetic_name_score: if self.config.use_phonetic_matching {
1379                self.score_phonetic_names(worker1, worker2)
1380            } else {
1381                None
1382            },
1383        }
1384    }
1385
1386    fn calculate_weighted_score(&self, breakdown: &MatchBreakdown) -> f64 {
1387        let mut total_weight = 0.0;
1388        let mut weighted_sum = 0.0;
1389
1390        if let Some(score) = breakdown.uk_nhs_number_score {
1391            weighted_sum += score * self.config.uk_nhs_number_weight;
1392            total_weight += self.config.uk_nhs_number_weight;
1393        }
1394        if let Some(score) = breakdown.fr_nir_score {
1395            weighted_sum += score * self.config.fr_nir_weight;
1396            total_weight += self.config.fr_nir_weight;
1397        }
1398        if let Some(score) = breakdown.es_tsi_score {
1399            weighted_sum += score * self.config.es_tsi_weight;
1400            total_weight += self.config.es_tsi_weight;
1401        }
1402        if let Some(score) = breakdown.ie_ihi_score {
1403            weighted_sum += score * self.config.ie_ihi_weight;
1404            total_weight += self.config.ie_ihi_weight;
1405        }
1406        if let Some(score) = breakdown.uk_hc_number_score {
1407            weighted_sum += score * self.config.uk_hc_number_weight;
1408            total_weight += self.config.uk_hc_number_weight;
1409        }
1410        if let Some(score) = breakdown.us_ssn_score {
1411            weighted_sum += score * self.config.us_ssn_weight;
1412            total_weight += self.config.us_ssn_weight;
1413        }
1414        if let Some(score) = breakdown.au_ihi_score {
1415            weighted_sum += score * self.config.au_ihi_weight;
1416            total_weight += self.config.au_ihi_weight;
1417        }
1418        if let Some(score) = breakdown.de_kvnr_score {
1419            weighted_sum += score * self.config.de_kvnr_weight;
1420            total_weight += self.config.de_kvnr_weight;
1421        }
1422        if let Some(score) = breakdown.it_cf_score {
1423            weighted_sum += score * self.config.it_cf_weight;
1424            total_weight += self.config.it_cf_weight;
1425        }
1426        if let Some(score) = breakdown.nl_bsn_score {
1427            weighted_sum += score * self.config.nl_bsn_weight;
1428            total_weight += self.config.nl_bsn_weight;
1429        }
1430        if let Some(score) = breakdown.se_workernummer_score {
1431            weighted_sum += score * self.config.se_workernummer_weight;
1432            total_weight += self.config.se_workernummer_weight;
1433        }
1434        if let Some(score) = breakdown.uk_chi_number_score {
1435            weighted_sum += score * self.config.uk_chi_number_weight;
1436            total_weight += self.config.uk_chi_number_weight;
1437        }
1438        if let Some(score) = breakdown.be_nn_score {
1439            weighted_sum += score * self.config.be_nn_weight;
1440            total_weight += self.config.be_nn_weight;
1441        }
1442        if let Some(score) = breakdown.bg_egn_score {
1443            weighted_sum += score * self.config.bg_egn_weight;
1444            total_weight += self.config.bg_egn_weight;
1445        }
1446        if let Some(score) = breakdown.cz_rc_score {
1447            weighted_sum += score * self.config.cz_rc_weight;
1448            total_weight += self.config.cz_rc_weight;
1449        }
1450        if let Some(score) = breakdown.dk_cpr_score {
1451            weighted_sum += score * self.config.dk_cpr_weight;
1452            total_weight += self.config.dk_cpr_weight;
1453        }
1454        if let Some(score) = breakdown.ee_ik_score {
1455            weighted_sum += score * self.config.ee_ik_weight;
1456            total_weight += self.config.ee_ik_weight;
1457        }
1458        if let Some(score) = breakdown.es_dni_score {
1459            weighted_sum += score * self.config.es_dni_weight;
1460            total_weight += self.config.es_dni_weight;
1461        }
1462        if let Some(score) = breakdown.fi_hetu_score {
1463            weighted_sum += score * self.config.fi_hetu_weight;
1464            total_weight += self.config.fi_hetu_weight;
1465        }
1466        if let Some(score) = breakdown.hr_oib_score {
1467            weighted_sum += score * self.config.hr_oib_weight;
1468            total_weight += self.config.hr_oib_weight;
1469        }
1470        if let Some(score) = breakdown.is_kt_score {
1471            weighted_sum += score * self.config.is_kt_weight;
1472            total_weight += self.config.is_kt_weight;
1473        }
1474        if let Some(score) = breakdown.lt_ak_score {
1475            weighted_sum += score * self.config.lt_ak_weight;
1476            total_weight += self.config.lt_ak_weight;
1477        }
1478        if let Some(score) = breakdown.lv_pk_score {
1479            weighted_sum += score * self.config.lv_pk_weight;
1480            total_weight += self.config.lv_pk_weight;
1481        }
1482        if let Some(score) = breakdown.mt_id_score {
1483            weighted_sum += score * self.config.mt_id_weight;
1484            total_weight += self.config.mt_id_weight;
1485        }
1486        if let Some(score) = breakdown.no_fnr_score {
1487            weighted_sum += score * self.config.no_fnr_weight;
1488            total_weight += self.config.no_fnr_weight;
1489        }
1490        if let Some(score) = breakdown.pl_pesel_score {
1491            weighted_sum += score * self.config.pl_pesel_weight;
1492            total_weight += self.config.pl_pesel_weight;
1493        }
1494        if let Some(score) = breakdown.ro_cnp_score {
1495            weighted_sum += score * self.config.ro_cnp_weight;
1496            total_weight += self.config.ro_cnp_weight;
1497        }
1498        if let Some(score) = breakdown.si_emso_score {
1499            weighted_sum += score * self.config.si_emso_weight;
1500            total_weight += self.config.si_emso_weight;
1501        }
1502        if let Some(score) = breakdown.sk_rc_score {
1503            weighted_sum += score * self.config.sk_rc_weight;
1504            total_weight += self.config.sk_rc_weight;
1505        }
1506        if let Some(score) = breakdown.uk_nino_score {
1507            weighted_sum += score * self.config.uk_nino_weight;
1508            total_weight += self.config.uk_nino_weight;
1509        }
1510        if let Some(score) = breakdown.gr_dss_score {
1511            weighted_sum += score * self.config.gr_dss_weight;
1512            total_weight += self.config.gr_dss_weight;
1513        }
1514        if let Some(score) = breakdown.li_id_score {
1515            weighted_sum += score * self.config.li_id_weight;
1516            total_weight += self.config.li_id_weight;
1517        }
1518        if let Some(score) = breakdown.nl_id_score {
1519            weighted_sum += score * self.config.nl_id_weight;
1520            total_weight += self.config.nl_id_weight;
1521        }
1522        if let Some(score) = breakdown.pl_nip_score {
1523            weighted_sum += score * self.config.pl_nip_weight;
1524            total_weight += self.config.pl_nip_weight;
1525        }
1526        if let Some(score) = breakdown.pt_nif_score {
1527            weighted_sum += score * self.config.pt_nif_weight;
1528            total_weight += self.config.pt_nif_weight;
1529        }
1530        if let Some(score) = breakdown.br_cpf_score {
1531            weighted_sum += score * self.config.br_cpf_weight;
1532            total_weight += self.config.br_cpf_weight;
1533        }
1534        if let Some(score) = breakdown.cn_rrn_score {
1535            weighted_sum += score * self.config.cn_rrn_weight;
1536            total_weight += self.config.cn_rrn_weight;
1537        }
1538        if let Some(score) = breakdown.in_aadhaar_score {
1539            weighted_sum += score * self.config.in_aadhaar_weight;
1540            total_weight += self.config.in_aadhaar_weight;
1541        }
1542        if let Some(score) = breakdown.jp_my_number_score {
1543            weighted_sum += score * self.config.jp_my_number_weight;
1544            total_weight += self.config.jp_my_number_weight;
1545        }
1546        if let Some(score) = breakdown.mx_curp_score {
1547            weighted_sum += score * self.config.mx_curp_weight;
1548            total_weight += self.config.mx_curp_weight;
1549        }
1550        if let Some(score) = breakdown.nz_nhi_score {
1551            weighted_sum += score * self.config.nz_nhi_weight;
1552            total_weight += self.config.nz_nhi_weight;
1553        }
1554        if let Some(score) = breakdown.za_id_score {
1555            weighted_sum += score * self.config.za_id_weight;
1556            total_weight += self.config.za_id_weight;
1557        }
1558        if let Some(score) = breakdown.passport_book_score {
1559            weighted_sum += score * self.config.passport_book_weight;
1560            total_weight += self.config.passport_book_weight;
1561        }
1562        if let Some(score) = breakdown.given_name_score {
1563            weighted_sum += score * self.config.given_name_weight;
1564            total_weight += self.config.given_name_weight;
1565        }
1566        if let Some(score) = breakdown.family_name_score {
1567            weighted_sum += score * self.config.family_name_weight;
1568            total_weight += self.config.family_name_weight;
1569        }
1570        if let Some(score) = breakdown.date_of_birth_score {
1571            weighted_sum += score * self.config.date_of_birth_weight;
1572            total_weight += self.config.date_of_birth_weight;
1573        }
1574        if let Some(score) = breakdown.gender_score {
1575            weighted_sum += score * self.config.gender_weight;
1576            total_weight += self.config.gender_weight;
1577        }
1578        if let Some(score) = breakdown.blood_type_score {
1579            weighted_sum += score * self.config.blood_type_weight;
1580            total_weight += self.config.blood_type_weight;
1581        }
1582        if let Some(score) = breakdown.multiple_birth_score {
1583            weighted_sum += score * self.config.multiple_birth_weight;
1584            total_weight += self.config.multiple_birth_weight;
1585        }
1586        if let Some(score) = breakdown.address_score {
1587            weighted_sum += score * self.config.address_weight;
1588            total_weight += self.config.address_weight;
1589        }
1590        if let Some(score) = breakdown.birth_place_score {
1591            weighted_sum += score * self.config.birth_place_weight;
1592            total_weight += self.config.birth_place_weight;
1593        }
1594        if let Some(score) = breakdown.death_date_score {
1595            weighted_sum += score * self.config.death_date_weight;
1596            total_weight += self.config.death_date_weight;
1597        }
1598        if let Some(score) = breakdown.death_place_score {
1599            weighted_sum += score * self.config.death_place_weight;
1600            total_weight += self.config.death_place_weight;
1601        }
1602        if let Some(score) = breakdown.phone_score {
1603            weighted_sum += score * self.config.phone_weight;
1604            total_weight += self.config.phone_weight;
1605        }
1606        if let Some(score) = breakdown.email_score {
1607            weighted_sum += score * self.config.email_weight;
1608            total_weight += self.config.email_weight;
1609        }
1610
1611        // Phonetic match is a bonus only — never lowers the score.
1612        if let Some(score) = breakdown.phonetic_name_score
1613            && score > 0.9
1614        {
1615            weighted_sum += score * 0.05;
1616            total_weight += 0.05;
1617        }
1618
1619        if total_weight > 0.0 {
1620            weighted_sum / total_weight
1621        } else {
1622            0.0
1623        }
1624    }
1625
1626    fn score_given_name(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1627        let (g1, g2) = match (&worker1.given_name, &worker2.given_name) {
1628            (Some(a), Some(b)) => (a.as_str(), b.as_str()),
1629            _ => return None,
1630        };
1631        let given = self.score_name(g1, g2);
1632        // Middle-name contribution: when both workers have a middle
1633        // name, blend 5% of a middle-name similarity into the
1634        // given-name component. The same `score_name` helper is used,
1635        // so the configured similarity algorithm and nickname boost
1636        // apply to middle names as well. Per spec FR-49 / §12.2;
1637        // resolves OQ-1.
1638        let blended = match (&worker1.middle_name, &worker2.middle_name) {
1639            (Some(m1), Some(m2)) => {
1640                let middle = self.score_name(m1, m2);
1641                0.95 * given + 0.05 * middle
1642            }
1643            _ => given,
1644        };
1645        Some(blended)
1646    }
1647
1648    fn score_family_name(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1649        match (&worker1.family_name, &worker2.family_name) {
1650            (Some(name1), Some(name2)) => Some(self.score_name(name1, name2)),
1651            _ => None,
1652        }
1653    }
1654
1655    fn score_name(&self, name1: &str, name2: &str) -> f64 {
1656        let norm1 = Normalizer::normalize_name(name1);
1657        let norm2 = Normalizer::normalize_name(name2);
1658        let base = match self.config.name_algorithm {
1659            SimilarityAlgorithm::JaroWinkler => Scorer::jaro_winkler_similarity(&norm1, &norm2),
1660            SimilarityAlgorithm::Levenshtein => Scorer::levenshtein_similarity(&norm1, &norm2),
1661            SimilarityAlgorithm::Exact => Scorer::exact_match(&norm1, &norm2),
1662            SimilarityAlgorithm::Combined => Scorer::combined_similarity(&norm1, &norm2),
1663        };
1664        // Nickname boost: when both normalised forms appear in the same
1665        // equivalence class of the configured table, lift the score so
1666        // table-driven equivalence is not undone by a low Jaro-Winkler /
1667        // Levenshtein result on dissimilar surface forms. The boost only
1668        // ever raises the score; it never lowers it.
1669        if !self.config.nickname_table.is_empty()
1670            && self.config.nickname_table.are_equivalent(&norm1, &norm2)
1671        {
1672            base.max(0.9)
1673        } else {
1674            base
1675        }
1676    }
1677
1678    fn score_date_of_birth(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1679        match (worker1.date_of_birth, worker2.date_of_birth) {
1680            (Some(dob1), Some(dob2)) => Some(score_dob_pair(dob1, dob2)),
1681            _ => None,
1682        }
1683    }
1684
1685    fn score_gender(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1686        match (worker1.gender, worker2.gender) {
1687            (Some(g1), Some(g2)) => Some(if g1 == g2 { 1.0 } else { 0.0 }),
1688            _ => None,
1689        }
1690    }
1691
1692    fn score_blood_type(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1693        match (worker1.blood_type, worker2.blood_type) {
1694            (Some(b1), Some(b2)) => Some(if b1 == b2 { 1.0 } else { 0.0 }),
1695            _ => None,
1696        }
1697    }
1698
1699    fn score_multiple_birth(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1700        match (worker1.multiple_birth, worker2.multiple_birth) {
1701            (Some(m1), Some(m2)) => Some(f64::from(m1 == m2)),
1702            _ => None,
1703        }
1704    }
1705
1706    fn score_address(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1707        // Consider the current address plus any historical addresses
1708        // recorded on each worker. The reported `address_score` is the
1709        // best (highest) score across the cartesian product of the two
1710        // address lists. This catches the "worker moved house"
1711        // failure mode where the current addresses no longer agree
1712        // but a prior address on one side still matches the other
1713        // side's current. Per spec §12.4 / FR-48; resolves OQ-3.
1714        let all_p1: Vec<&Address> = worker1
1715            .address
1716            .as_ref()
1717            .into_iter()
1718            .chain(worker1.previous_addresses.iter())
1719            .collect();
1720        let all_p2: Vec<&Address> = worker2
1721            .address
1722            .as_ref()
1723            .into_iter()
1724            .chain(worker2.previous_addresses.iter())
1725            .collect();
1726        if all_p1.is_empty() || all_p2.is_empty() {
1727            return None;
1728        }
1729        let mut best = f64::NEG_INFINITY;
1730        for a1 in &all_p1 {
1731            for a2 in &all_p2 {
1732                let s = self.compare_addresses(a1, a2);
1733                if s > best {
1734                    best = s;
1735                }
1736            }
1737        }
1738        Some(best)
1739    }
1740
1741    /// Compare two place-of-birth `Address` values.
1742    ///
1743    /// Delegates to [`score_named_place`], which scores city via
1744    /// Jaro-Winkler and country via exact equality on the normalised
1745    /// string, blending them as `0.7 × city + 0.3 × country`.
1746    fn score_birth_place(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1747        score_named_place(worker1.birth_place.as_ref()?, worker2.birth_place.as_ref()?)
1748    }
1749
1750    /// Compare two place-of-death `Address` values.
1751    ///
1752    /// Symmetrical to [`Self::score_birth_place`] — death-place data
1753    /// in practice carries `city` and `country` only; we reuse the
1754    /// shared [`score_named_place`] helper.
1755    fn score_death_place(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1756        score_named_place(worker1.death_place.as_ref()?, worker2.death_place.as_ref()?)
1757    }
1758
1759    /// Compare two recorded dates of death using the same DOB
1760    /// transposition heuristic as [`Self::score_date_of_birth`]
1761    /// — exact equality scores `1.0`, day/month swap scores `0.5`,
1762    /// otherwise `0.0`. Day-month swaps are just as common a
1763    /// data-entry mistake on death certificates as on birth records.
1764    fn score_death_date(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1765        Some(score_dob_pair(worker1.death_date?, worker2.death_date?))
1766    }
1767
1768    fn compare_addresses(&self, addr1: &Address, addr2: &Address) -> f64 {
1769        // Each sub-component contributes its own raw score in `[0.0, 1.0]`
1770        // and a weight. The final sub-score is the weight-renormalised
1771        // average — `Σ(score × weight) / Σ(weight)` — over the
1772        // sub-components that actually fired. This matches the
1773        // probabilistic outer pipeline (§12.3) and resolves §22 OQ-4:
1774        // postcode dominates (`0.5`), then city (`0.3`), then line 1
1775        // (`0.2`), regardless of how many components are populated.
1776        let mut weighted_sum = 0.0_f64;
1777        let mut total_weight = 0.0_f64;
1778
1779        if let (Some(pc1), Some(pc2)) = (&addr1.postcode, &addr2.postcode) {
1780            let norm1 = Normalizer::normalize_postcode(pc1);
1781            let norm2 = Normalizer::normalize_postcode(pc2);
1782            weighted_sum += f64::from(norm1 == norm2) * 0.5;
1783            total_weight += 0.5;
1784        }
1785
1786        if let (Some(city1), Some(city2)) = (&addr1.city, &addr2.city) {
1787            let norm1 = Normalizer::normalize_name(city1);
1788            let norm2 = Normalizer::normalize_name(city2);
1789            weighted_sum += Scorer::jaro_winkler_similarity(&norm1, &norm2) * 0.3;
1790            total_weight += 0.3;
1791        }
1792
1793        if let (Some(line1), Some(line2)) = (&addr1.line1, &addr2.line1) {
1794            let parsed1 = Normalizer::parse_address_line(line1);
1795            let parsed2 = Normalizer::parse_address_line(line2);
1796            let street_sim = Scorer::jaro_winkler_similarity(&parsed1.street, &parsed2.street);
1797            let house_score = match (&parsed1.house_number, &parsed2.house_number) {
1798                (Some(a), Some(b)) => Some(f64::from(a == b)),
1799                _ => None,
1800            };
1801            let line1_score = match house_score {
1802                Some(h) => 0.6 * street_sim + 0.4 * h,
1803                None => street_sim,
1804            };
1805            weighted_sum += line1_score * 0.2;
1806            total_weight += 0.2;
1807        }
1808
1809        if total_weight == 0.0 {
1810            0.5
1811        } else {
1812            weighted_sum / total_weight
1813        }
1814    }
1815
1816    fn score_phone(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1817        let phone1 = worker1.phone.as_ref().or(worker1.mobile.as_ref())?;
1818        let phone2 = worker2.phone.as_ref().or(worker2.mobile.as_ref())?;
1819
1820        let default = self.config.phone_default_country.as_deref();
1821        let e164_1 = Normalizer::normalize_phone_e164(phone1, default);
1822        let e164_2 = Normalizer::normalize_phone_e164(phone2, default);
1823
1824        // Prefer the international-aware comparison: a French and a UK
1825        // number that share the same national-significant digits must not
1826        // collide. Fall back to the legacy national-significant form when
1827        // either side cannot be parsed to E.164 — preserving prior
1828        // behaviour for inputs the country table does not cover.
1829        if let (Some(a), Some(b)) = (&e164_1, &e164_2) {
1830            return Some(f64::from(a == b));
1831        }
1832
1833        let norm1 = Normalizer::normalize_phone(phone1);
1834        let norm2 = Normalizer::normalize_phone(phone2);
1835        Some(f64::from(norm1 == norm2))
1836    }
1837
1838    fn score_email(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1839        let email1 = worker1.email.as_ref()?;
1840        let email2 = worker2.email.as_ref()?;
1841        let fold = self.config.gmail_dot_folding;
1842        let canonical1 = Normalizer::normalize_email(email1, fold)?;
1843        let canonical2 = Normalizer::normalize_email(email2, fold)?;
1844        Some(f64::from(canonical1 == canonical2))
1845    }
1846
1847    fn score_phonetic_names(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
1848        let p1_given_name = worker1.given_name.as_ref()?;
1849        let p1_given_name_phonetic = Normalizer::phonetic_code(p1_given_name);
1850        let p1_family_name = worker1.family_name.as_ref()?;
1851        let p1_family_name_phonetic = Normalizer::phonetic_code(p1_family_name);
1852
1853        let p2_given_name = worker2.given_name.as_ref()?;
1854        let p2_given_name_phonetic = Normalizer::phonetic_code(p2_given_name);
1855        let p2_family_name = worker2.family_name.as_ref()?;
1856        let p2_family_name_phonetic = Normalizer::phonetic_code(p2_family_name);
1857
1858        let given_name_match = f64::from(p1_given_name_phonetic == p2_given_name_phonetic);
1859        let family_name_match = f64::from(p1_family_name_phonetic == p2_family_name_phonetic);
1860        Some((given_name_match + family_name_match) / 2.0)
1861    }
1862}
1863
/// Deterministic identifier comparison.
///
/// Returns `true` only when both raw identifier strings are present, both
/// parse via `parser`, and the two canonical forms are equal. A missing
/// value on either side, or a parse failure on either side, yields
/// `false`.
fn identifier_equal<F>(a: &Option<String>, b: &Option<String>, parser: F) -> bool
where
    F: Fn(&str) -> Option<String>,
{
    let (Some(raw_a), Some(raw_b)) = (a, b) else {
        return false;
    };
    // Both sides are parsed before either result is inspected.
    matches!(
        (parser(raw_a), parser(raw_b)),
        (Some(canon_a), Some(canon_b)) if canon_a == canon_b
    )
}
1879
/// Probabilistic identifier comparison, following the per-field scoring
/// contract used elsewhere in this file.
///
/// - `Some(1.0)` when both inputs parse via `parser` to equal values.
/// - `Some(0.0)` when both parse but the values differ.
/// - `None` when either input is absent or fails to parse.
fn identifier_score<F>(a: &Option<String>, b: &Option<String>, parser: F) -> Option<f64>
where
    F: Fn(&str) -> Option<String>,
{
    let raw_a = a.as_deref()?;
    let raw_b = b.as_deref()?;
    // Both sides are parsed before either result is inspected.
    match (parser(raw_a), parser(raw_b)) {
        (Some(canon_a), Some(canon_b)) => Some(if canon_a == canon_b { 1.0 } else { 0.0 }),
        _ => None,
    }
}
1894
1895/// `true` iff the two passport-book lists share at least one
1896/// `(country, number)` pair after the canonicalisation performed by
1897/// [`crate::PassportBook::new`].
1898///
1899/// A single shared pair is sufficient: a worker with passports from
1900/// multiple countries matches another worker who carries any one of
1901/// those `(country, number)` pairs, regardless of issue date. This
1902/// captures the multi-country and time-varying nature of passport
1903/// book numbers (see [`crate::PassportBook`] for the design
1904/// rationale).
1905fn passport_books_share_pair(a: &[PassportBook], b: &[PassportBook]) -> bool {
1906    for ba in a {
1907        for bb in b {
1908            if ba.country == bb.country && ba.number == bb.number {
1909                return true;
1910            }
1911        }
1912    }
1913    false
1914}
1915
1916/// Score a pair of passport-book lists for the probabilistic
1917/// breakdown. Returns:
1918///
1919/// - `None` if either side has no books at all (the field is
1920///   irrelevant for this pair).
1921/// - `Some(1.0)` if at least one `(country, number)` pair is shared.
1922/// - `Some(0.0)` if both sides carry books but none share a pair.
1923fn score_passport_books(a: &[PassportBook], b: &[PassportBook]) -> Option<f64> {
1924    if a.is_empty() || b.is_empty() {
1925        return None;
1926    }
1927    Some(f64::from(passport_books_share_pair(a, b)))
1928}
1929
1930/// Score a pair of `NaiveDate` values for the date-of-birth component.
1931///
1932/// - `1.0` when the dates are exactly equal.
1933/// - `0.5` when **swapping the day and month on one side** yields the
1934///   other side, and the years agree. This catches the DD/MM ↔ MM/DD
1935///   data-entry bug that is common in cross-border or multi-system data
1936///   flows; `1995-01-10` vs `1995-10-01` is the canonical example.
1937/// - `0.0` otherwise.
1938///
1939/// The transposition path is conservative by design: it requires the
1940/// years to match, and it relies on `NaiveDate::from_ymd_opt` to validate
1941/// the swapped form (so it cannot fire on a day greater than 12, on a
1942/// month longer than the original day's day-count, or across years).
1943/// Compare two named-place [`Address`] values (typically `city` and
1944/// `country` populated). Used by both birth-place and death-place
1945/// scoring — Jaro-Winkler on the normalised city blended with exact
1946/// equality on the normalised country: `0.7 × city + 0.3 × country`
1947/// when both are present, otherwise whichever single signal is
1948/// available. Returns `None` if neither sub-field is populated on
1949/// both sides.
1950fn score_named_place(a: &Address, b: &Address) -> Option<f64> {
1951    let city = match (&a.city, &b.city) {
1952        (Some(c1), Some(c2)) => Some(Scorer::jaro_winkler_similarity(
1953            &Normalizer::normalize_name(c1),
1954            &Normalizer::normalize_name(c2),
1955        )),
1956        _ => None,
1957    };
1958    let country = match (&a.country, &b.country) {
1959        (Some(c1), Some(c2)) => Some(f64::from(
1960            Normalizer::normalize_name(c1) == Normalizer::normalize_name(c2),
1961        )),
1962        _ => None,
1963    };
1964    match (city, country) {
1965        (Some(c), Some(co)) => Some(0.7 * c + 0.3 * co),
1966        (Some(c), None) => Some(c),
1967        (None, Some(co)) => Some(co),
1968        (None, None) => None,
1969    }
1970}
1971
1972fn score_dob_pair(dob1: NaiveDate, dob2: NaiveDate) -> f64 {
1973    if dob1 == dob2 {
1974        return 1.0;
1975    }
1976    if dob1.year() == dob2.year()
1977        && let Some(swapped) = NaiveDate::from_ymd_opt(dob1.year(), dob1.day(), dob1.month())
1978        && swapped == dob2
1979    {
1980        return 0.5;
1981    }
1982    0.0
1983}
1984
1985#[cfg(test)]
1986mod tests {
1987    use super::*;
1988    use crate::models::Gender;
1989    use chrono::NaiveDate;
1990
1991    fn dob(y: i32, m: u32, d: u32) -> NaiveDate {
1992        NaiveDate::from_ymd_opt(y, m, d).expect("valid date")
1993    }
1994
1995    // ---------- MatchConfig presets ----------
1996
1997    #[test]
1998    fn config_default_values() {
1999        let c = MatchConfig::default();
2000        assert!((c.match_threshold - 0.85).abs() < 1e-9);
2001        assert!((c.uk_nhs_number_weight - 0.30).abs() < 1e-9);
2002        assert!(c.use_phonetic_matching);
2003        assert!(!c.strict_mode);
2004    }
2005
2006    #[test]
2007    fn config_strict_raises_threshold_and_sets_flag() {
2008        let c = MatchConfig::strict();
2009        assert!((c.match_threshold - 0.95).abs() < 1e-9);
2010        assert!(c.strict_mode);
2011    }
2012
2013    // ---------- MatchConfig serde ----------
2014
2015    #[test]
2016    fn config_default_round_trips_through_json() {
2017        let cfg = MatchConfig::default();
2018        let json = serde_json::to_string(&cfg).expect("serialise");
2019        let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
2020        assert!((cfg.match_threshold - back.match_threshold).abs() < 1e-12);
2021        assert!((cfg.uk_nhs_number_weight - back.uk_nhs_number_weight).abs() < 1e-12);
2022        assert_eq!(cfg.use_phonetic_matching, back.use_phonetic_matching);
2023        assert!(matches!(back.name_algorithm, SimilarityAlgorithm::Combined));
2024        assert_eq!(cfg.strict_mode, back.strict_mode);
2025        assert_eq!(cfg.nickname_table, back.nickname_table);
2026        assert_eq!(cfg.gmail_dot_folding, back.gmail_dot_folding);
2027        assert_eq!(cfg.phone_default_country, back.phone_default_country);
2028    }
2029
2030    #[test]
2031    fn config_strict_round_trips_through_json() {
2032        let cfg = MatchConfig::strict();
2033        let json = serde_json::to_string(&cfg).expect("serialise");
2034        let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
2035        assert!((back.match_threshold - 0.95).abs() < 1e-12);
2036        assert!(back.strict_mode);
2037    }
2038
2039    #[test]
2040    fn config_lenient_round_trips_through_json() {
2041        let cfg = MatchConfig::lenient();
2042        let json = serde_json::to_string(&cfg).expect("serialise");
2043        let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
2044        assert!((back.match_threshold - 0.75).abs() < 1e-12);
2045    }
2046
2047    #[test]
2048    fn config_partial_json_fills_missing_fields_from_default() {
2049        // `#[serde(default)]` on the struct: a JSON document carrying
2050        // only the fields the caller cares about deserialises with the
2051        // rest filled in from `MatchConfig::default()`.
2052        let partial = r#"{"match_threshold": 0.80, "gmail_dot_folding": true}"#;
2053        let cfg: MatchConfig = serde_json::from_str(partial).expect("partial json");
2054        assert!((cfg.match_threshold - 0.80).abs() < 1e-12);
2055        assert!(cfg.gmail_dot_folding);
2056        // Other fields come from default().
2057        assert!((cfg.uk_nhs_number_weight - 0.30).abs() < 1e-12);
2058        assert!(matches!(cfg.name_algorithm, SimilarityAlgorithm::Combined));
2059        assert_eq!(cfg.phone_default_country.as_deref(), Some("GB"));
2060    }
2061
2062    #[test]
2063    fn similarity_algorithm_round_trips_through_json() {
2064        for alg in [
2065            SimilarityAlgorithm::JaroWinkler,
2066            SimilarityAlgorithm::Levenshtein,
2067            SimilarityAlgorithm::Exact,
2068            SimilarityAlgorithm::Combined,
2069        ] {
2070            let json = serde_json::to_string(&alg).expect("serialise");
2071            let back: SimilarityAlgorithm = serde_json::from_str(&json).expect("deserialise");
2072            assert_eq!(alg, back);
2073        }
2074    }
2075
2076    #[test]
2077    fn config_lenient_lowers_threshold() {
2078        let c = MatchConfig::lenient();
2079        assert!((c.match_threshold - 0.75).abs() < 1e-9);
2080        assert!(c.use_phonetic_matching);
2081    }
2082
2083    // ---------- probabilistic match ----------
2084
2085    #[test]
2086    fn exact_clone_is_a_match() {
2087        let p = Worker::builder()
2088            .given_name("John")
2089            .family_name("Smith")
2090            .date_of_birth(dob(1980, 5, 15))
2091            .gender(Gender::Male)
2092            .uk_nhs_number("9434765919")
2093            .build();
2094        let result = MatchingEngine::default_config().match_workers(&p, &p.clone());
2095        assert!(result.is_match);
2096        assert!(result.score > 0.95);
2097    }
2098
2099    #[test]
2100    fn fuzzy_given_name_still_matches() {
2101        let a = Worker::builder()
2102            .given_name("John")
2103            .family_name("Smith")
2104            .date_of_birth(dob(1980, 5, 15))
2105            .gender(Gender::Male)
2106            .build();
2107        let b = Worker::builder()
2108            .given_name("Jon")
2109            .family_name("Smith")
2110            .date_of_birth(dob(1980, 5, 15))
2111            .gender(Gender::Male)
2112            .build();
2113        let r = MatchingEngine::default_config().match_workers(&a, &b);
2114        assert!(r.is_match);
2115        assert!(r.score > 0.85);
2116    }
2117
2118    #[test]
2119    fn completely_different_patients_do_not_match() {
2120        let a = Worker::builder()
2121            .given_name("John")
2122            .family_name("Smith")
2123            .date_of_birth(dob(1980, 5, 15))
2124            .gender(Gender::Male)
2125            .build();
2126        let b = Worker::builder()
2127            .given_name("Jane")
2128            .family_name("Doe")
2129            .date_of_birth(dob(1990, 3, 20))
2130            .gender(Gender::Female)
2131            .build();
2132        let r = MatchingEngine::default_config().match_workers(&a, &b);
2133        assert!(!r.is_match);
2134        assert!(r.score < 0.5);
2135    }
2136
2137    #[test]
2138    fn no_overlapping_fields_returns_zero_score() {
2139        // Neither side has any scoreable field on both records.
2140        let a = Worker::builder().given_name("Solo").build();
2141        let b = Worker::builder().family_name("Only").build();
2142        let r = MatchingEngine::default_config().match_workers(&a, &b);
2143        assert_eq!(r.score, 0.0);
2144        assert!(!r.is_match);
2145    }
2146
2147    #[test]
2148    fn unparseable_uk_nhs_number_is_none_not_zero() {
2149        let a = Worker::builder()
2150            .uk_nhs_number("not-a-number")
2151            .given_name("John")
2152            .family_name("Smith")
2153            .date_of_birth(dob(1980, 5, 15))
2154            .build();
2155        let b = Worker::builder()
2156            .uk_nhs_number("also-not-a-number")
2157            .given_name("John")
2158            .family_name("Smith")
2159            .date_of_birth(dob(1980, 5, 15))
2160            .build();
2161        let r = MatchingEngine::default_config().match_workers(&a, &b);
2162        assert_eq!(
2163            r.breakdown.uk_nhs_number_score, None,
2164            "unparseable NHS numbers should not produce a 0.0 penalty"
2165        );
2166        assert!(r.is_match, "should still match on demographics");
2167    }
2168
2169    #[test]
2170    fn missing_field_yields_none_in_breakdown() {
2171        let a = Worker::builder().given_name("Ada").build();
2172        let b = Worker::builder()
2173            .given_name("Ada")
2174            .family_name("Lovelace")
2175            .build();
2176        let r = MatchingEngine::default_config().match_workers(&a, &b);
2177        assert!(r.breakdown.given_name_score.is_some());
2178        assert!(r.breakdown.family_name_score.is_none());
2179    }
2180
2181    #[test]
2182    fn phonetic_match_is_a_bonus_not_a_penalty() {
2183        // Identical names should not be hurt when phonetic matching is on.
2184        let p = Worker::builder()
2185            .given_name("Stephen")
2186            .family_name("Jones")
2187            .build();
2188        let with_phon = MatchingEngine::new(MatchConfig {
2189            use_phonetic_matching: true,
2190            ..MatchConfig::default()
2191        })
2192        .match_workers(&p, &p.clone());
2193        let without_phon = MatchingEngine::new(MatchConfig {
2194            use_phonetic_matching: false,
2195            ..MatchConfig::default()
2196        })
2197        .match_workers(&p, &p.clone());
2198        assert!(with_phon.score >= without_phon.score);
2199    }
2200
2201    #[test]
2202    fn phonetic_score_disabled_when_config_off() {
2203        let p = Worker::builder()
2204            .given_name("Steven")
2205            .family_name("Smith")
2206            .build();
2207        let q = Worker::builder()
2208            .given_name("Stephen")
2209            .family_name("Smyth")
2210            .build();
2211        let r = MatchingEngine::new(MatchConfig {
2212            use_phonetic_matching: false,
2213            ..MatchConfig::default()
2214        })
2215        .match_workers(&p, &q);
2216        assert_eq!(r.breakdown.phonetic_name_score, None);
2217    }
2218
2219    #[test]
2220    fn address_with_no_subfields_is_neutral_half() {
2221        let a = Address::new();
2222        let b = Address::new();
2223        let engine = MatchingEngine::default_config();
2224        let score = engine.compare_addresses(&a, &b);
2225        assert!(
2226            (score - 0.5).abs() < 1e-9,
2227            "empty addresses must be neutral (0.5), got {score}"
2228        );
2229    }
2230
2231    #[test]
2232    fn address_postcode_dominates() {
2233        let mut a = Address::new();
2234        a.postcode = Some("CF10 1AA".into());
2235        let mut b = Address::new();
2236        b.postcode = Some("CF10 1AA".into());
2237        let s = MatchingEngine::default_config().compare_addresses(&a, &b);
2238        assert!(s > 0.0);
2239    }
2240
2241    // ---------- deterministic match ----------
2242
2243    #[test]
2244    fn deterministic_uk_nhs_match_overrides_demographics() {
2245        let a = Worker::builder()
2246            .uk_nhs_number("943 476 5919")
2247            .given_name("Bob")
2248            .build();
2249        let b = Worker::builder()
2250            .uk_nhs_number("9434765919")
2251            .given_name("Alice") // intentionally different
2252            .build();
2253        assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
2254    }
2255
2256    #[test]
2257    fn deterministic_demographics_match_when_all_align() {
2258        let p = Worker::builder()
2259            .given_name("John")
2260            .family_name("Smith")
2261            .date_of_birth(dob(1980, 5, 15))
2262            .gender(Gender::Male)
2263            .build();
2264        assert!(MatchingEngine::default_config().deterministic_match(&p, &p.clone()));
2265    }
2266
2267    #[test]
2268    fn deterministic_demographics_tolerates_missing_gender() {
2269        let a = Worker::builder()
2270            .given_name("John")
2271            .family_name("Smith")
2272            .date_of_birth(dob(1980, 5, 15))
2273            .build();
2274        let b = Worker::builder()
2275            .given_name("John")
2276            .family_name("Smith")
2277            .date_of_birth(dob(1980, 5, 15))
2278            .gender(Gender::Male)
2279            .build();
2280        assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
2281    }
2282
2283    #[test]
2284    fn deterministic_rejects_when_dob_differs() {
2285        let a = Worker::builder()
2286            .given_name("John")
2287            .family_name("Smith")
2288            .date_of_birth(dob(1980, 5, 15))
2289            .gender(Gender::Male)
2290            .build();
2291        let b = Worker::builder()
2292            .given_name("John")
2293            .family_name("Smith")
2294            .date_of_birth(dob(1980, 5, 16)) // off by one day
2295            .gender(Gender::Male)
2296            .build();
2297        assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
2298    }
2299
2300    #[test]
2301    fn deterministic_rejects_when_gender_differs() {
2302        let a = Worker::builder()
2303            .given_name("John")
2304            .family_name("Smith")
2305            .date_of_birth(dob(1980, 5, 15))
2306            .gender(Gender::Male)
2307            .build();
2308        let b = Worker::builder()
2309            .given_name("John")
2310            .family_name("Smith")
2311            .date_of_birth(dob(1980, 5, 15))
2312            .gender(Gender::Female)
2313            .build();
2314        assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
2315    }
2316
2317    // ---------- strict_mode enforcement ----------
2318
2319    #[test]
2320    fn strict_mode_requires_deterministic_for_is_match() {
2321        // A fuzzy match that lifts the score above the strict threshold
2322        // (0.95) but is NOT a deterministic match must produce
2323        // `is_match = false` under strict mode.
2324        let cfg = MatchConfig {
2325            // Lower threshold so the fuzzy score clears it.
2326            match_threshold: 0.85,
2327            strict_mode: true,
2328            ..MatchConfig::default()
2329        };
2330        let p1 = Worker::builder()
2331            .given_name("John")
2332            .family_name("Smith")
2333            .date_of_birth(dob(1980, 5, 15))
2334            .gender(Gender::Male)
2335            .build();
2336        let p2 = Worker::builder()
2337            .given_name("Jon") // typo — fuzzy match
2338            .family_name("Smith")
2339            .date_of_birth(dob(1980, 5, 15))
2340            .gender(Gender::Male)
2341            .build();
2342        let engine = MatchingEngine::new(cfg);
2343        let r = engine.match_workers(&p1, &p2);
2344        assert!(
2345            r.score >= 0.85,
2346            "fuzzy score should clear lowered threshold"
2347        );
2348        // Normalised given names differ ("john" vs "jon"), so the
2349        // demographic-tuple deterministic branch fails.
2350        assert!(!engine.deterministic_match(&p1, &p2));
2351        assert!(
2352            !r.is_match,
2353            "strict mode must reject fuzzy-only matches even above threshold"
2354        );
2355    }
2356
2357    #[test]
2358    fn strict_mode_accepts_when_deterministic_holds() {
2359        // An identifier-driven deterministic match also clears any
2360        // sensible threshold. Strict mode must allow it.
2361        let cfg = MatchConfig::strict();
2362        let p1 = Worker::builder()
2363            .uk_nhs_number("9434765919")
2364            .given_name("John")
2365            .family_name("Smith")
2366            .date_of_birth(dob(1980, 5, 15))
2367            .gender(Gender::Male)
2368            .build();
2369        let p2 = p1.clone();
2370        let r = MatchingEngine::new(cfg).match_workers(&p1, &p2);
2371        assert!(r.is_match);
2372    }
2373
2374    #[test]
2375    fn non_strict_mode_accepts_fuzzy_match_above_threshold() {
2376        // Sanity: in default (non-strict) config, a fuzzy match above
2377        // the threshold is still accepted. This pins that the strict
2378        // logic only activates when explicitly opted in.
2379        let p1 = Worker::builder()
2380            .given_name("John")
2381            .family_name("Smith")
2382            .date_of_birth(dob(1980, 5, 15))
2383            .gender(Gender::Male)
2384            .build();
2385        let p2 = Worker::builder()
2386            .given_name("Jon")
2387            .family_name("Smith")
2388            .date_of_birth(dob(1980, 5, 15))
2389            .gender(Gender::Male)
2390            .build();
2391        let r = MatchingEngine::default_config().match_workers(&p1, &p2);
2392        assert!(r.is_match);
2393    }
2394
2395    // ---------- batch API: match_one_to_many / rank_one_to_many ----------
2396
2397    #[test]
2398    fn match_one_to_many_empty_candidates_yields_empty_vec() {
2399        let engine = MatchingEngine::default_config();
2400        let q = Worker::builder().given_name("Solo").build();
2401        assert!(engine.match_one_to_many(&q, &[]).is_empty());
2402    }
2403
2404    #[test]
2405    fn match_one_to_many_preserves_order() {
2406        let engine = MatchingEngine::default_config();
2407        let q = Worker::builder()
2408            .given_name("Ada")
2409            .family_name("Lovelace")
2410            .build();
2411        let candidates = vec![
2412            Worker::builder()
2413                .given_name("Grace")
2414                .family_name("Hopper")
2415                .build(),
2416            q.clone(),
2417            Worker::builder()
2418                .given_name("Alan")
2419                .family_name("Turing")
2420                .build(),
2421        ];
2422        let r = engine.match_one_to_many(&q, &candidates);
2423        assert_eq!(r.len(), 3);
2424        // Index 1 is the perfect clone → highest score.
2425        assert!(r[1].score > r[0].score);
2426        assert!(r[1].score > r[2].score);
2427        assert!(r[1].is_match);
2428    }
2429
2430    #[test]
2431    fn match_one_to_many_matches_individual_scoring() {
2432        // Calling match_one_to_many must produce the same per-candidate
2433        // score as calling match_workers individually.
2434        let engine = MatchingEngine::default_config();
2435        let q = Worker::builder()
2436            .given_name("Ada")
2437            .family_name("Lovelace")
2438            .build();
2439        let candidates = vec![
2440            Worker::builder()
2441                .given_name("Ada")
2442                .family_name("Lovelace")
2443                .build(),
2444            Worker::builder()
2445                .given_name("Alan")
2446                .family_name("Turing")
2447                .build(),
2448        ];
2449        let batch = engine.match_one_to_many(&q, &candidates);
2450        for (i, c) in candidates.iter().enumerate() {
2451            let individual = engine.match_workers(&q, c);
2452            assert!((batch[i].score - individual.score).abs() < 1e-12);
2453            assert_eq!(batch[i].is_match, individual.is_match);
2454            assert_eq!(batch[i].confidence, individual.confidence);
2455        }
2456    }
2457
2458    #[test]
2459    fn rank_one_to_many_sorts_by_score_descending() {
2460        let engine = MatchingEngine::default_config();
2461        let q = Worker::builder()
2462            .given_name("Ada")
2463            .family_name("Lovelace")
2464            .build();
2465        let candidates = vec![
2466            Worker::builder()
2467                .given_name("Grace")
2468                .family_name("Hopper")
2469                .build(),
2470            q.clone(),
2471            Worker::builder()
2472                .given_name("Alan")
2473                .family_name("Turing")
2474                .build(),
2475        ];
2476        let ranked = engine.rank_one_to_many(&q, &candidates);
2477        assert_eq!(ranked.len(), 3);
2478        // Best match must be index 1 (the clone).
2479        assert_eq!(ranked[0].0, 1);
2480        // Ordering invariant.
2481        for w in ranked.windows(2) {
2482            assert!(w[0].1.score >= w[1].1.score);
2483        }
2484    }
2485
2486    #[test]
2487    fn rank_one_to_many_breaks_ties_by_ascending_original_index() {
2488        // Two candidates that produce the same score: ranking must keep
2489        // the earlier original index first for determinism.
2490        let engine = MatchingEngine::default_config();
2491        let q = Worker::builder()
2492            .given_name("Ada")
2493            .family_name("Lovelace")
2494            .build();
2495        let twin = q.clone();
2496        let candidates = vec![twin.clone(), twin.clone(), twin.clone()];
2497        let ranked = engine.rank_one_to_many(&q, &candidates);
2498        assert_eq!(ranked.len(), 3);
2499        // All three score identically; order MUST be 0, 1, 2.
2500        assert_eq!(ranked[0].0, 0);
2501        assert_eq!(ranked[1].0, 1);
2502        assert_eq!(ranked[2].0, 2);
2503    }
2504
2505    #[test]
2506    fn match_one_to_many_is_deterministic_across_calls() {
2507        let engine = MatchingEngine::default_config();
2508        let q = Worker::builder().given_name("X").family_name("Y").build();
2509        let candidates = vec![
2510            Worker::builder().given_name("X").family_name("Y").build(),
2511            Worker::builder().given_name("A").family_name("B").build(),
2512        ];
2513        let a = engine.match_one_to_many(&q, &candidates);
2514        let b = engine.match_one_to_many(&q, &candidates);
2515        for i in 0..a.len() {
2516            assert!((a[i].score - b[i].score).abs() < 1e-12);
2517        }
2518    }
2519
2520    // ---------- score_dob_pair (transposition heuristic) ----------
2521
2522    #[test]
2523    fn dob_pair_exact_equal_scores_one() {
2524        assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1995, 1, 10)), 1.0);
2525    }
2526
2527    #[test]
2528    fn dob_pair_day_month_swap_scores_half() {
2529        // 1995-01-10 vs 1995-10-01 — classic DD/MM ↔ MM/DD bug.
2530        assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1995, 10, 1)), 0.5);
2531        // Symmetric.
2532        assert_eq!(score_dob_pair(dob(1995, 10, 1), dob(1995, 1, 10)), 0.5);
2533    }
2534
2535    #[test]
2536    fn dob_pair_swap_requires_year_to_match() {
2537        // Same day/month layout but different year — not a transposition.
2538        assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1996, 10, 1)), 0.0);
2539    }
2540
2541    #[test]
2542    fn dob_pair_swap_skipped_when_day_exceeds_12() {
2543        // 1995-01-25: the swap target (1995, month=25, day=1) is not a
2544        // valid calendar date, so `NaiveDate::from_ymd_opt` returns None
2545        // and the heuristic does not fire. Compared against any other
2546        // valid date — including the same month with a different day —
2547        // the score must be 0.0.
2548        assert_eq!(score_dob_pair(dob(1995, 1, 25), dob(1995, 1, 26)), 0.0);
2549        assert_eq!(score_dob_pair(dob(1995, 1, 25), dob(1995, 2, 25)), 0.0);
2550    }
2551
2552    #[test]
2553    fn dob_pair_unrelated_dates_score_zero() {
2554        assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1980, 6, 30)), 0.0);
2555        assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1995, 1, 11)), 0.0);
2556    }
2557
2558    #[test]
2559    fn dob_pair_day_equals_month_collapses_to_exact() {
2560        // 1995-05-05: swap is a no-op; exact-match branch wins.
2561        assert_eq!(score_dob_pair(dob(1995, 5, 5), dob(1995, 5, 5)), 1.0);
2562        // 1995-05-05 vs 1995-05-06: not a transposition; 0.0.
2563        assert_eq!(score_dob_pair(dob(1995, 5, 5), dob(1995, 5, 6)), 0.0);
2564    }
2565
2566    #[test]
2567    fn dob_pair_invalid_swap_target_does_not_panic() {
2568        // 2003-02-30 is never constructed (chrono rejects it), so the
2569        // helper only ever receives valid dates. But test a near-edge:
2570        // swap of Feb 29 (leap) vs swap target.
2571        assert_eq!(score_dob_pair(dob(2000, 2, 29), dob(2000, 2, 29)), 1.0);
2572        // 2000-02-12 swap = 2000-12-02, valid.
2573        assert_eq!(score_dob_pair(dob(2000, 2, 12), dob(2000, 12, 2)), 0.5);
2574    }
2575
2576    #[test]
2577    fn deterministic_match_still_rejects_transposed_dob() {
2578        // The transposition heuristic only lifts the probabilistic
2579        // score; deterministic matching by demographics still requires
2580        // exact DOB equality.
2581        let a = Worker::builder()
2582            .given_name("Thomas")
2583            .family_name("Price")
2584            .date_of_birth(dob(1995, 1, 10))
2585            .gender(Gender::Male)
2586            .build();
2587        let b = Worker::builder()
2588            .given_name("Thomas")
2589            .family_name("Price")
2590            .date_of_birth(dob(1995, 10, 1))
2591            .gender(Gender::Male)
2592            .build();
2593        assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
2594    }
2595
2596    #[test]
2597    fn transposed_dob_lifts_probabilistic_score_above_zero() {
2598        let a = Worker::builder()
2599            .given_name("Thomas")
2600            .family_name("Price")
2601            .date_of_birth(dob(1995, 1, 10))
2602            .gender(Gender::Male)
2603            .build();
2604        let b = Worker::builder()
2605            .given_name("Thomas")
2606            .family_name("Price")
2607            .date_of_birth(dob(1995, 10, 1))
2608            .gender(Gender::Male)
2609            .build();
2610        let r = MatchingEngine::default_config().match_workers(&a, &b);
2611        assert_eq!(r.breakdown.date_of_birth_score, Some(0.5));
2612        // Demographics agree on name + gender; DOB contributes 0.5
2613        // partial credit; overall score must lift above what a 0.0 DOB
2614        // would have produced.
2615        assert!(r.score > 0.6);
2616    }
2617
2618    // ---------- Confidence ----------
2619
2620    #[test]
2621    fn confidence_band_boundaries_are_inclusive_on_the_low_side() {
2622        assert_eq!(Confidence::from_score(0.90), Confidence::High);
2623        assert_eq!(Confidence::from_score(0.89), Confidence::Medium);
2624        assert_eq!(Confidence::from_score(0.75), Confidence::Medium);
2625        assert_eq!(Confidence::from_score(0.74), Confidence::Low);
2626    }
2627
2628    #[test]
2629    fn confidence_handles_degenerate_inputs_gracefully() {
2630        assert_eq!(Confidence::from_score(f64::NAN), Confidence::Low);
2631        assert_eq!(Confidence::from_score(-1.0), Confidence::Low);
2632        assert_eq!(Confidence::from_score(2.0), Confidence::High);
2633    }
2634
2635    #[test]
2636    fn confidence_is_independent_of_match_threshold() {
2637        // Strict (threshold 0.95) and lenient (threshold 0.75) produce
2638        // the same Confidence band for the same score — bands are
2639        // fixed, only is_match follows the threshold.
2640        let p = Worker::builder()
2641            .given_name("Ada")
2642            .family_name("Lovelace")
2643            .build();
2644        let strict = MatchingEngine::new(MatchConfig::strict()).match_workers(&p, &p.clone());
2645        let lenient = MatchingEngine::new(MatchConfig::lenient()).match_workers(&p, &p.clone());
2646        assert_eq!(strict.confidence, lenient.confidence);
2647        // And the exact-clone score should land in High.
2648        assert_eq!(strict.confidence, Confidence::High);
2649    }
2650
2651    #[test]
2652    fn match_result_carries_confidence() {
2653        let p = Worker::builder()
2654            .given_name("Ada")
2655            .family_name("Lovelace")
2656            .build();
2657        let r = MatchingEngine::default_config().match_workers(&p, &p.clone());
2658        assert_eq!(r.confidence, Confidence::High);
2659    }
2660
2661    #[test]
2662    fn match_result_confidence_round_trips_via_serde() {
2663        let p = Worker::builder()
2664            .given_name("Ada")
2665            .family_name("Lovelace")
2666            .build();
2667        let r = MatchingEngine::default_config().match_workers(&p, &p.clone());
2668        let json = serde_json::to_string(&r).unwrap();
2669        let back: MatchResult = serde_json::from_str(&json).unwrap();
2670        assert_eq!(r.confidence, back.confidence);
2671    }
2672
2673    #[test]
2674    fn deterministic_rejects_when_names_missing() {
2675        let a = Worker::builder()
2676            .date_of_birth(dob(1980, 5, 15))
2677            .gender(Gender::Male)
2678            .build();
2679        let b = a.clone();
2680        assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
2681    }
2682
2683    // ---------- score_named_place ----------
2684
2685    #[test]
2686    fn score_named_place_both_subfields_blend_seven_three() {
2687        let a = Address::new().with_city("Paris").with_country("France");
2688        let b = Address::new().with_city("Paris").with_country("France");
2689        assert_eq!(score_named_place(&a, &b), Some(1.0));
2690    }
2691
2692    #[test]
2693    fn score_named_place_city_only_matches_returns_city_score() {
2694        let a = Address::new().with_city("Cardiff");
2695        let b = Address::new().with_city("Cardiff");
2696        assert_eq!(score_named_place(&a, &b), Some(1.0));
2697    }
2698
2699    #[test]
2700    fn score_named_place_country_only_matches_returns_country_score() {
2701        let a = Address::new().with_country("Wales");
2702        let b = Address::new().with_country("Wales");
2703        assert_eq!(score_named_place(&a, &b), Some(1.0));
2704    }
2705
2706    #[test]
2707    fn score_named_place_empty_returns_none() {
2708        let a = Address::new();
2709        let b = Address::new();
2710        assert_eq!(score_named_place(&a, &b), None);
2711    }
2712
2713    #[test]
2714    fn score_named_place_city_partial_country_mismatch_blends() {
2715        let a = Address::new().with_city("Paris").with_country("France");
2716        let b = Address::new().with_city("Paris").with_country("USA");
2717        let s = score_named_place(&a, &b).unwrap();
2718        assert!((s - 0.7).abs() < 1e-9);
2719    }
2720
2721    // ---------- death_date / death_place wiring ----------
2722
2723    #[test]
2724    fn match_config_default_carries_death_weights() {
2725        let c = MatchConfig::default();
2726        assert!((c.death_date_weight - 0.10).abs() < 1e-9);
2727        assert!((c.death_place_weight - 0.05).abs() < 1e-9);
2728    }
2729
2730    #[test]
2731    fn breakdown_carries_death_date_score_when_both_sides_present() {
2732        let p1 = Worker::builder()
2733            .given_name("X")
2734            .family_name("Y")
2735            .death_date(dob(2020, 3, 14))
2736            .build();
2737        let p2 = Worker::builder()
2738            .given_name("X")
2739            .family_name("Y")
2740            .death_date(dob(2020, 3, 14))
2741            .build();
2742        let r = MatchingEngine::default_config().match_workers(&p1, &p2);
2743        assert_eq!(r.breakdown.death_date_score, Some(1.0));
2744    }
2745
2746    // ---------- T-3 / OQ-4 address sub-score arithmetic ----------
2747
2748    #[test]
2749    fn address_subscore_exact_postcode_plus_slightly_different_street_clears_seven_tenths() {
2750        // OQ-4 acceptance: an identical postcode plus a slightly
2751        // different street MUST score ≥ 0.7. Under the old
2752        // `sum / count` formula this would have scored ~ 0.33.
2753        let engine = MatchingEngine::default_config();
2754        let a = Address::new();
2755        let a = Address {
2756            line1: Some("10 High Street".into()),
2757            postcode: Some("CF10 1AA".into()),
2758            city: Some("Cardiff".into()),
2759            ..a
2760        };
2761        let b = Address {
2762            line1: Some("10 High Road".into()),
2763            postcode: Some("CF10 1AA".into()),
2764            city: Some("Cardiff".into()),
2765            ..Address::new()
2766        };
2767        let s = engine.compare_addresses(&a, &b);
2768        assert!(
2769            s >= 0.7,
2770            "exact postcode + slight street typo should score ≥ 0.7: {s}"
2771        );
2772    }
2773
2774    #[test]
2775    fn address_subscore_postcode_only_match_returns_one() {
2776        // Only postcode populated on both sides → weighted average
2777        // collapses to the postcode score alone.
2778        let engine = MatchingEngine::default_config();
2779        let a = Address {
2780            postcode: Some("CF10 1AA".into()),
2781            ..Address::new()
2782        };
2783        let b = Address {
2784            postcode: Some("CF10 1AA".into()),
2785            ..Address::new()
2786        };
2787        let s = engine.compare_addresses(&a, &b);
2788        assert!((s - 1.0).abs() < 1e-9, "postcode-only match: {s}");
2789    }
2790
2791    #[test]
2792    fn address_subscore_no_comparable_fields_returns_neutral_half() {
2793        // Neither side populated on any sub-field — neutral 0.5.
2794        let engine = MatchingEngine::default_config();
2795        let s = engine.compare_addresses(&Address::new(), &Address::new());
2796        assert!((s - 0.5).abs() < 1e-9, "neutral fallback: {s}");
2797    }
2798
2799    #[test]
2800    fn address_subscore_postcode_match_plus_street_mismatch_dominated_by_postcode() {
2801        // Postcode (0.5) match + line1 (0.2) mismatch → 0.5/0.7 ≈ 0.714.
2802        let engine = MatchingEngine::default_config();
2803        let a = Address {
2804            postcode: Some("CF10 1AA".into()),
2805            line1: Some("Wholly Different".into()),
2806            ..Address::new()
2807        };
2808        let b = Address {
2809            postcode: Some("CF10 1AA".into()),
2810            line1: Some("Completely Other".into()),
2811            ..Address::new()
2812        };
2813        let s = engine.compare_addresses(&a, &b);
2814        assert!(s >= 0.5, "postcode should still dominate: {s}");
2815    }
2816
2817    #[test]
2818    fn breakdown_omits_death_place_score_when_one_side_absent() {
2819        let p1 = Worker::builder()
2820            .given_name("X")
2821            .family_name("Y")
2822            .death_place(Address::new().with_city("Cambridge"))
2823            .build();
2824        let p2 = Worker::builder().given_name("X").family_name("Y").build();
2825        let r = MatchingEngine::default_config().match_workers(&p1, &p2);
2826        assert_eq!(r.breakdown.death_place_score, None);
2827    }
2828}
worker_matcher/matcher.rs

worker_matcher/
matcher.rs