use crate::identifiers;
use crate::models::{Address, PassportBook, Worker};
use crate::nicknames::NicknameTable;
use crate::normalizer::Normalizer;
use crate::scorer::{Scorer, SimilarityAlgorithm};
use chrono::{Datelike, NaiveDate};
use serde::{Deserialize, Serialize};
/// Tunable parameters for [`MatchingEngine`].
///
/// Because of the container-level `#[serde(default)]`, any field missing from
/// serialized input is taken from [`MatchConfig::default`].
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct MatchConfig {
/// Minimum weighted score at which `match_workers` reports a match.
pub match_threshold: f64,
// Identifier weights: each `<id>_weight` scales the `<id>_score` of the same
// name in `MatchBreakdown` when `calculate_weighted_score` averages them.
pub uk_nhs_number_weight: f64,
pub fr_nir_weight: f64,
pub es_tsi_weight: f64,
pub ie_ihi_weight: f64,
pub uk_hc_number_weight: f64,
pub us_ssn_weight: f64,
pub au_ihi_weight: f64,
pub de_kvnr_weight: f64,
pub it_cf_weight: f64,
pub nl_bsn_weight: f64,
pub se_workernummer_weight: f64,
pub uk_chi_number_weight: f64,
pub be_nn_weight: f64,
pub bg_egn_weight: f64,
pub cz_rc_weight: f64,
pub dk_cpr_weight: f64,
pub ee_ik_weight: f64,
pub es_dni_weight: f64,
pub fi_hetu_weight: f64,
pub hr_oib_weight: f64,
pub is_kt_weight: f64,
pub lt_ak_weight: f64,
pub lv_pk_weight: f64,
pub mt_id_weight: f64,
pub no_fnr_weight: f64,
pub pl_pesel_weight: f64,
pub ro_cnp_weight: f64,
pub si_emso_weight: f64,
pub sk_rc_weight: f64,
pub uk_nino_weight: f64,
pub gr_dss_weight: f64,
pub li_id_weight: f64,
pub nl_id_weight: f64,
pub pl_nip_weight: f64,
pub pt_nif_weight: f64,
pub br_cpf_weight: f64,
pub cn_rrn_weight: f64,
pub in_aadhaar_weight: f64,
pub jp_my_number_weight: f64,
pub mx_curp_weight: f64,
pub nz_nhi_weight: f64,
pub za_id_weight: f64,
pub passport_book_weight: f64,
// Weights for the demographic and contact components of the breakdown.
pub given_name_weight: f64,
pub family_name_weight: f64,
pub date_of_birth_weight: f64,
pub gender_weight: f64,
pub blood_type_weight: f64,
pub multiple_birth_weight: f64,
pub address_weight: f64,
pub birth_place_weight: f64,
pub death_date_weight: f64,
pub death_place_weight: f64,
pub phone_weight: f64,
pub email_weight: f64,
/// When true, `calculate_breakdown` also computes `phonetic_name_score`.
pub use_phonetic_matching: bool,
/// Similarity function that `score_name` applies to normalized names.
pub name_algorithm: SimilarityAlgorithm,
/// When true, a match additionally requires `deterministic_match` to succeed.
pub strict_mode: bool,
/// Forwarded to `Normalizer::normalize_email` by `score_email`; presumably
/// makes Gmail addresses compare dot-insensitively — confirm in the normalizer.
pub gmail_dot_folding: bool,
/// When non-empty, names this table reports as equivalent score at least 0.9.
pub nickname_table: NicknameTable,
/// Default country handed to `Normalizer::normalize_phone_e164` by `score_phone`.
pub phone_default_country: Option<String>,
}
impl Default for MatchConfig {
    /// Baseline configuration: threshold 0.85, every identifier (and the
    /// passport book) weighted 0.30, phonetic matching on, strict mode off,
    /// an empty nickname table and `"GB"` as the default phone country.
    fn default() -> Self {
        // All identifier-style components share one weight.
        const IDENTIFIER: f64 = 0.30;
        // Low-signal demographic and contact components share another.
        const MINOR: f64 = 0.05;

        Self {
            match_threshold: 0.85,

            uk_nhs_number_weight: IDENTIFIER,
            fr_nir_weight: IDENTIFIER,
            es_tsi_weight: IDENTIFIER,
            ie_ihi_weight: IDENTIFIER,
            uk_hc_number_weight: IDENTIFIER,
            us_ssn_weight: IDENTIFIER,
            au_ihi_weight: IDENTIFIER,
            de_kvnr_weight: IDENTIFIER,
            it_cf_weight: IDENTIFIER,
            nl_bsn_weight: IDENTIFIER,
            se_workernummer_weight: IDENTIFIER,
            uk_chi_number_weight: IDENTIFIER,
            be_nn_weight: IDENTIFIER,
            bg_egn_weight: IDENTIFIER,
            cz_rc_weight: IDENTIFIER,
            dk_cpr_weight: IDENTIFIER,
            ee_ik_weight: IDENTIFIER,
            es_dni_weight: IDENTIFIER,
            fi_hetu_weight: IDENTIFIER,
            hr_oib_weight: IDENTIFIER,
            is_kt_weight: IDENTIFIER,
            lt_ak_weight: IDENTIFIER,
            lv_pk_weight: IDENTIFIER,
            mt_id_weight: IDENTIFIER,
            no_fnr_weight: IDENTIFIER,
            pl_pesel_weight: IDENTIFIER,
            ro_cnp_weight: IDENTIFIER,
            si_emso_weight: IDENTIFIER,
            sk_rc_weight: IDENTIFIER,
            uk_nino_weight: IDENTIFIER,
            gr_dss_weight: IDENTIFIER,
            li_id_weight: IDENTIFIER,
            nl_id_weight: IDENTIFIER,
            pl_nip_weight: IDENTIFIER,
            pt_nif_weight: IDENTIFIER,
            br_cpf_weight: IDENTIFIER,
            cn_rrn_weight: IDENTIFIER,
            in_aadhaar_weight: IDENTIFIER,
            jp_my_number_weight: IDENTIFIER,
            mx_curp_weight: IDENTIFIER,
            nz_nhi_weight: IDENTIFIER,
            za_id_weight: IDENTIFIER,
            passport_book_weight: IDENTIFIER,

            given_name_weight: 0.15,
            family_name_weight: 0.20,
            date_of_birth_weight: 0.20,
            gender_weight: MINOR,
            blood_type_weight: MINOR,
            multiple_birth_weight: MINOR,
            address_weight: MINOR,
            birth_place_weight: MINOR,
            death_date_weight: 0.10,
            death_place_weight: MINOR,
            phone_weight: MINOR,
            email_weight: MINOR,

            use_phonetic_matching: true,
            name_algorithm: SimilarityAlgorithm::Combined,
            strict_mode: false,
            gmail_dot_folding: false,
            nickname_table: NicknameTable::empty(),
            phone_default_country: Some("GB".to_string()),
        }
    }
}
impl MatchConfig {
    /// Default configuration tightened to a 0.95 threshold with strict mode on.
    pub fn strict() -> Self {
        let mut config = Self::default();
        config.match_threshold = 0.95;
        config.strict_mode = true;
        config
    }

    /// Default configuration relaxed to a 0.75 threshold, with phonetic
    /// matching explicitly enabled.
    pub fn lenient() -> Self {
        let mut config = Self::default();
        config.match_threshold = 0.75;
        config.use_phonetic_matching = true;
        config
    }
}
/// Coarse confidence band derived from a match score by `Confidence::from_score`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Confidence {
/// Score of at least 0.90.
High,
/// Score of at least 0.75 but below 0.90.
Medium,
/// Any other score.
Low,
}
impl Confidence {
pub fn from_score(score: f64) -> Self {
if score >= 0.90 {
Confidence::High
} else if score >= 0.75 {
Confidence::Medium
} else {
Confidence::Low
}
}
}
/// Outcome of comparing two workers with `MatchingEngine::match_workers`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchResult {
/// Weighted average of the comparable components of `breakdown`.
pub score: f64,
/// Whether `score` reached the configured threshold and, in strict mode,
/// `deterministic_match` also succeeded.
pub is_match: bool,
/// Band derived from `score`; deserializes to `Confidence::Low` when absent.
#[serde(default = "default_confidence")]
pub confidence: Confidence,
/// Per-component scores from which `score` was computed.
pub breakdown: MatchBreakdown,
}
/// Serde fallback for `MatchResult::confidence` when the field is missing
/// from the input: the lowest band.
fn default_confidence() -> Confidence {
Confidence::Low
}
/// Per-component scores behind a `MatchResult`.
///
/// A field is `None` when that component could not be compared for the pair
/// (for example the value is missing on either record); `None` components are
/// left out of the weighted average entirely.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchBreakdown {
// Identifier components. As produced by `identifier_score`, each is 1.0 when
// both parsed values are equal and 0.0 when they differ.
#[serde(default)]
pub uk_nhs_number_score: Option<f64>,
#[serde(default)]
pub fr_nir_score: Option<f64>,
#[serde(default)]
pub es_tsi_score: Option<f64>,
#[serde(default)]
pub ie_ihi_score: Option<f64>,
#[serde(default)]
pub uk_hc_number_score: Option<f64>,
#[serde(default)]
pub us_ssn_score: Option<f64>,
#[serde(default)]
pub au_ihi_score: Option<f64>,
#[serde(default)]
pub de_kvnr_score: Option<f64>,
#[serde(default)]
pub it_cf_score: Option<f64>,
#[serde(default)]
pub nl_bsn_score: Option<f64>,
#[serde(default)]
pub se_workernummer_score: Option<f64>,
#[serde(default)]
pub uk_chi_number_score: Option<f64>,
#[serde(default)]
pub be_nn_score: Option<f64>,
#[serde(default)]
pub bg_egn_score: Option<f64>,
#[serde(default)]
pub cz_rc_score: Option<f64>,
#[serde(default)]
pub dk_cpr_score: Option<f64>,
#[serde(default)]
pub ee_ik_score: Option<f64>,
#[serde(default)]
pub es_dni_score: Option<f64>,
#[serde(default)]
pub fi_hetu_score: Option<f64>,
#[serde(default)]
pub hr_oib_score: Option<f64>,
#[serde(default)]
pub is_kt_score: Option<f64>,
#[serde(default)]
pub lt_ak_score: Option<f64>,
#[serde(default)]
pub lv_pk_score: Option<f64>,
#[serde(default)]
pub mt_id_score: Option<f64>,
#[serde(default)]
pub no_fnr_score: Option<f64>,
#[serde(default)]
pub pl_pesel_score: Option<f64>,
#[serde(default)]
pub ro_cnp_score: Option<f64>,
#[serde(default)]
pub si_emso_score: Option<f64>,
#[serde(default)]
pub sk_rc_score: Option<f64>,
#[serde(default)]
pub uk_nino_score: Option<f64>,
#[serde(default)]
pub gr_dss_score: Option<f64>,
#[serde(default)]
pub li_id_score: Option<f64>,
#[serde(default)]
pub nl_id_score: Option<f64>,
#[serde(default)]
pub pl_nip_score: Option<f64>,
#[serde(default)]
pub pt_nif_score: Option<f64>,
#[serde(default)]
pub br_cpf_score: Option<f64>,
#[serde(default)]
pub cn_rrn_score: Option<f64>,
#[serde(default)]
pub in_aadhaar_score: Option<f64>,
#[serde(default)]
pub jp_my_number_score: Option<f64>,
#[serde(default)]
pub mx_curp_score: Option<f64>,
#[serde(default)]
pub nz_nhi_score: Option<f64>,
#[serde(default)]
pub za_id_score: Option<f64>,
#[serde(default)]
pub passport_book_score: Option<f64>,
// Demographic and contact components.
pub given_name_score: Option<f64>,
pub family_name_score: Option<f64>,
pub date_of_birth_score: Option<f64>,
pub gender_score: Option<f64>,
#[serde(default)]
pub blood_type_score: Option<f64>,
#[serde(default)]
pub multiple_birth_score: Option<f64>,
pub address_score: Option<f64>,
#[serde(default)]
pub birth_place_score: Option<f64>,
#[serde(default)]
pub death_date_score: Option<f64>,
#[serde(default)]
pub death_place_score: Option<f64>,
pub phone_score: Option<f64>,
#[serde(default)]
pub email_score: Option<f64>,
/// Only computed when `MatchConfig::use_phonetic_matching` is set.
pub phonetic_name_score: Option<f64>,
}
/// Compares pairs of `Worker` records and scores how likely they are to be
/// the same person, according to a `MatchConfig`.
pub struct MatchingEngine {
// Weights, threshold and feature switches consulted by every comparison.
config: MatchConfig,
}
impl MatchingEngine {
/// Creates an engine that uses `config` for every comparison.
pub fn new(config: MatchConfig) -> Self {
Self { config }
}
pub fn default_config() -> Self {
Self::new(MatchConfig::default())
}
/// Compares two workers and returns the score, the match decision, the
/// confidence band and the per-component breakdown.
///
/// The pair is a match when the weighted score reaches
/// `config.match_threshold`; in strict mode `deterministic_match` must also
/// succeed.
pub fn match_workers(&self, worker1: &Worker, worker2: &Worker) -> MatchResult {
    let breakdown = self.calculate_breakdown(worker1, worker2);
    let score = self.calculate_weighted_score(&breakdown);

    // `deterministic_match` is only consulted for strict-mode pairs that
    // already cleared the threshold.
    let is_match = score >= self.config.match_threshold
        && (!self.config.strict_mode || self.deterministic_match(worker1, worker2));

    MatchResult {
        score,
        is_match,
        confidence: Confidence::from_score(score),
        breakdown,
    }
}
/// Compares `query` against each candidate, returning one result per
/// candidate in the same order as `candidates`.
pub fn match_one_to_many(&self, query: &Worker, candidates: &[Worker]) -> Vec<MatchResult> {
    let mut results = Vec::with_capacity(candidates.len());
    for candidate in candidates {
        results.push(self.match_workers(query, candidate));
    }
    results
}
/// Scores `query` against every candidate and returns the results ranked
/// best-first.
///
/// Each element pairs the candidate's index in `candidates` with its
/// `MatchResult`. Results are ordered by descending score; equal scores keep
/// ascending index order. A NaN score ranks last.
pub fn rank_one_to_many(
    &self,
    query: &Worker,
    candidates: &[Worker],
) -> Vec<(usize, MatchResult)> {
    // `partial_cmp(..).unwrap_or(Equal)` stops being a total order as soon as
    // one score is NaN, and `sort_by` may then panic or return an unspecified
    // order. Comparing a sanitized key with `total_cmp` is always total:
    // NaN is ranked as the lowest possible score, and `+ 0.0` folds -0.0 into
    // +0.0 so the two zeros still tie exactly as they did under `partial_cmp`.
    fn ranking_key(score: f64) -> f64 {
        if score.is_nan() {
            f64::NEG_INFINITY
        } else {
            score + 0.0
        }
    }

    let mut indexed: Vec<(usize, MatchResult)> = self
        .match_one_to_many(query, candidates)
        .into_iter()
        .enumerate()
        .collect();
    indexed.sort_by(|a, b| {
        ranking_key(b.1.score)
            .total_cmp(&ranking_key(a.1.score))
            .then_with(|| a.0.cmp(&b.0))
    });
    indexed
}
pub fn deterministic_match(&self, worker1: &Worker, worker2: &Worker) -> bool {
if identifier_equal(
&worker1.uk_nhs_number,
&worker2.uk_nhs_number,
identifiers::parse_uk_nhs_number,
) {
return true;
}
if identifier_equal(&worker1.fr_nir, &worker2.fr_nir, identifiers::parse_fr_nir) {
return true;
}
if identifier_equal(&worker1.es_tsi, &worker2.es_tsi, identifiers::parse_es_tsi) {
return true;
}
if identifier_equal(&worker1.ie_ihi, &worker2.ie_ihi, identifiers::parse_ie_ihi) {
return true;
}
if identifier_equal(
&worker1.uk_hc_number,
&worker2.uk_hc_number,
identifiers::parse_uk_hc_number,
) {
return true;
}
if identifier_equal(&worker1.us_ssn, &worker2.us_ssn, identifiers::parse_us_ssn) {
return true;
}
if identifier_equal(&worker1.au_ihi, &worker2.au_ihi, identifiers::parse_au_ihi) {
return true;
}
if identifier_equal(
&worker1.de_kvnr,
&worker2.de_kvnr,
identifiers::parse_de_kvnr,
) {
return true;
}
if identifier_equal(&worker1.it_cf, &worker2.it_cf, identifiers::parse_it_cf) {
return true;
}
if identifier_equal(&worker1.nl_bsn, &worker2.nl_bsn, identifiers::parse_nl_bsn) {
return true;
}
if identifier_equal(
&worker1.se_workernummer,
&worker2.se_workernummer,
identifiers::parse_se_workernummer,
) {
return true;
}
if identifier_equal(
&worker1.uk_chi_number,
&worker2.uk_chi_number,
identifiers::parse_uk_chi_number,
) {
return true;
}
if identifier_equal(&worker1.be_nn, &worker2.be_nn, identifiers::parse_be_nn) {
return true;
}
if identifier_equal(&worker1.bg_egn, &worker2.bg_egn, identifiers::parse_bg_egn) {
return true;
}
if identifier_equal(&worker1.cz_rc, &worker2.cz_rc, identifiers::parse_cz_rc) {
return true;
}
if identifier_equal(&worker1.dk_cpr, &worker2.dk_cpr, identifiers::parse_dk_cpr) {
return true;
}
if identifier_equal(&worker1.ee_ik, &worker2.ee_ik, identifiers::parse_ee_ik) {
return true;
}
if identifier_equal(&worker1.es_dni, &worker2.es_dni, identifiers::parse_es_dni) {
return true;
}
if identifier_equal(
&worker1.fi_hetu,
&worker2.fi_hetu,
identifiers::parse_fi_hetu,
) {
return true;
}
if identifier_equal(&worker1.hr_oib, &worker2.hr_oib, identifiers::parse_hr_oib) {
return true;
}
if identifier_equal(&worker1.is_kt, &worker2.is_kt, identifiers::parse_is_kt) {
return true;
}
if identifier_equal(&worker1.lt_ak, &worker2.lt_ak, identifiers::parse_lt_ak) {
return true;
}
if identifier_equal(&worker1.lv_pk, &worker2.lv_pk, identifiers::parse_lv_pk) {
return true;
}
if identifier_equal(&worker1.mt_id, &worker2.mt_id, identifiers::parse_mt_id) {
return true;
}
if identifier_equal(&worker1.no_fnr, &worker2.no_fnr, identifiers::parse_no_fnr) {
return true;
}
if identifier_equal(
&worker1.pl_pesel,
&worker2.pl_pesel,
identifiers::parse_pl_pesel,
) {
return true;
}
if identifier_equal(&worker1.ro_cnp, &worker2.ro_cnp, identifiers::parse_ro_cnp) {
return true;
}
if identifier_equal(
&worker1.si_emso,
&worker2.si_emso,
identifiers::parse_si_emso,
) {
return true;
}
if identifier_equal(&worker1.sk_rc, &worker2.sk_rc, identifiers::parse_sk_rc) {
return true;
}
if identifier_equal(
&worker1.uk_nino,
&worker2.uk_nino,
identifiers::parse_uk_nino,
) {
return true;
}
if identifier_equal(&worker1.gr_dss, &worker2.gr_dss, identifiers::parse_gr_dss) {
return true;
}
if identifier_equal(&worker1.li_id, &worker2.li_id, identifiers::parse_li_id) {
return true;
}
if identifier_equal(&worker1.nl_id, &worker2.nl_id, identifiers::parse_nl_id) {
return true;
}
if identifier_equal(&worker1.pl_nip, &worker2.pl_nip, identifiers::parse_pl_nip) {
return true;
}
if identifier_equal(&worker1.pt_nif, &worker2.pt_nif, identifiers::parse_pt_nif) {
return true;
}
if identifier_equal(&worker1.br_cpf, &worker2.br_cpf, identifiers::parse_br_cpf) {
return true;
}
if identifier_equal(&worker1.cn_rrn, &worker2.cn_rrn, identifiers::parse_cn_rrn) {
return true;
}
if identifier_equal(
&worker1.in_aadhaar,
&worker2.in_aadhaar,
identifiers::parse_in_aadhaar,
) {
return true;
}
if identifier_equal(
&worker1.jp_my_number,
&worker2.jp_my_number,
identifiers::parse_jp_my_number,
) {
return true;
}
if identifier_equal(
&worker1.mx_curp,
&worker2.mx_curp,
identifiers::parse_mx_curp,
) {
return true;
}
if identifier_equal(&worker1.nz_nhi, &worker2.nz_nhi, identifiers::parse_nz_nhi) {
return true;
}
if identifier_equal(&worker1.za_id, &worker2.za_id, identifiers::parse_za_id) {
return true;
}
if passport_books_share_pair(&worker1.passport_books, &worker2.passport_books) {
return true;
}
let name_match = match (&worker1.given_name, &worker2.given_name) {
(Some(f1), Some(f2)) => {
Normalizer::normalize_name(f1) == Normalizer::normalize_name(f2)
}
_ => false,
} && match (&worker1.family_name, &worker2.family_name) {
(Some(l1), Some(l2)) => {
Normalizer::normalize_name(l1) == Normalizer::normalize_name(l2)
}
_ => false,
};
let dob_match = match (worker1.date_of_birth, worker2.date_of_birth) {
(Some(d1), Some(d2)) => d1 == d2,
_ => false,
};
let gender_match = match (worker1.gender, worker2.gender) {
(Some(g1), Some(g2)) => g1 == g2,
_ => true,
};
name_match && dob_match && gender_match
}
fn calculate_breakdown(&self, worker1: &Worker, worker2: &Worker) -> MatchBreakdown {
MatchBreakdown {
uk_nhs_number_score: identifier_score(
&worker1.uk_nhs_number,
&worker2.uk_nhs_number,
identifiers::parse_uk_nhs_number,
),
fr_nir_score: identifier_score(
&worker1.fr_nir,
&worker2.fr_nir,
identifiers::parse_fr_nir,
),
es_tsi_score: identifier_score(
&worker1.es_tsi,
&worker2.es_tsi,
identifiers::parse_es_tsi,
),
ie_ihi_score: identifier_score(
&worker1.ie_ihi,
&worker2.ie_ihi,
identifiers::parse_ie_ihi,
),
uk_hc_number_score: identifier_score(
&worker1.uk_hc_number,
&worker2.uk_hc_number,
identifiers::parse_uk_hc_number,
),
us_ssn_score: identifier_score(
&worker1.us_ssn,
&worker2.us_ssn,
identifiers::parse_us_ssn,
),
au_ihi_score: identifier_score(
&worker1.au_ihi,
&worker2.au_ihi,
identifiers::parse_au_ihi,
),
de_kvnr_score: identifier_score(
&worker1.de_kvnr,
&worker2.de_kvnr,
identifiers::parse_de_kvnr,
),
it_cf_score: identifier_score(&worker1.it_cf, &worker2.it_cf, identifiers::parse_it_cf),
nl_bsn_score: identifier_score(
&worker1.nl_bsn,
&worker2.nl_bsn,
identifiers::parse_nl_bsn,
),
se_workernummer_score: identifier_score(
&worker1.se_workernummer,
&worker2.se_workernummer,
identifiers::parse_se_workernummer,
),
uk_chi_number_score: identifier_score(
&worker1.uk_chi_number,
&worker2.uk_chi_number,
identifiers::parse_uk_chi_number,
),
be_nn_score: identifier_score(&worker1.be_nn, &worker2.be_nn, identifiers::parse_be_nn),
bg_egn_score: identifier_score(
&worker1.bg_egn,
&worker2.bg_egn,
identifiers::parse_bg_egn,
),
cz_rc_score: identifier_score(&worker1.cz_rc, &worker2.cz_rc, identifiers::parse_cz_rc),
dk_cpr_score: identifier_score(
&worker1.dk_cpr,
&worker2.dk_cpr,
identifiers::parse_dk_cpr,
),
ee_ik_score: identifier_score(&worker1.ee_ik, &worker2.ee_ik, identifiers::parse_ee_ik),
es_dni_score: identifier_score(
&worker1.es_dni,
&worker2.es_dni,
identifiers::parse_es_dni,
),
fi_hetu_score: identifier_score(
&worker1.fi_hetu,
&worker2.fi_hetu,
identifiers::parse_fi_hetu,
),
hr_oib_score: identifier_score(
&worker1.hr_oib,
&worker2.hr_oib,
identifiers::parse_hr_oib,
),
is_kt_score: identifier_score(&worker1.is_kt, &worker2.is_kt, identifiers::parse_is_kt),
lt_ak_score: identifier_score(&worker1.lt_ak, &worker2.lt_ak, identifiers::parse_lt_ak),
lv_pk_score: identifier_score(&worker1.lv_pk, &worker2.lv_pk, identifiers::parse_lv_pk),
mt_id_score: identifier_score(&worker1.mt_id, &worker2.mt_id, identifiers::parse_mt_id),
no_fnr_score: identifier_score(
&worker1.no_fnr,
&worker2.no_fnr,
identifiers::parse_no_fnr,
),
pl_pesel_score: identifier_score(
&worker1.pl_pesel,
&worker2.pl_pesel,
identifiers::parse_pl_pesel,
),
ro_cnp_score: identifier_score(
&worker1.ro_cnp,
&worker2.ro_cnp,
identifiers::parse_ro_cnp,
),
si_emso_score: identifier_score(
&worker1.si_emso,
&worker2.si_emso,
identifiers::parse_si_emso,
),
sk_rc_score: identifier_score(&worker1.sk_rc, &worker2.sk_rc, identifiers::parse_sk_rc),
uk_nino_score: identifier_score(
&worker1.uk_nino,
&worker2.uk_nino,
identifiers::parse_uk_nino,
),
gr_dss_score: identifier_score(
&worker1.gr_dss,
&worker2.gr_dss,
identifiers::parse_gr_dss,
),
li_id_score: identifier_score(&worker1.li_id, &worker2.li_id, identifiers::parse_li_id),
nl_id_score: identifier_score(&worker1.nl_id, &worker2.nl_id, identifiers::parse_nl_id),
pl_nip_score: identifier_score(
&worker1.pl_nip,
&worker2.pl_nip,
identifiers::parse_pl_nip,
),
pt_nif_score: identifier_score(
&worker1.pt_nif,
&worker2.pt_nif,
identifiers::parse_pt_nif,
),
br_cpf_score: identifier_score(
&worker1.br_cpf,
&worker2.br_cpf,
identifiers::parse_br_cpf,
),
cn_rrn_score: identifier_score(
&worker1.cn_rrn,
&worker2.cn_rrn,
identifiers::parse_cn_rrn,
),
in_aadhaar_score: identifier_score(
&worker1.in_aadhaar,
&worker2.in_aadhaar,
identifiers::parse_in_aadhaar,
),
jp_my_number_score: identifier_score(
&worker1.jp_my_number,
&worker2.jp_my_number,
identifiers::parse_jp_my_number,
),
mx_curp_score: identifier_score(
&worker1.mx_curp,
&worker2.mx_curp,
identifiers::parse_mx_curp,
),
nz_nhi_score: identifier_score(
&worker1.nz_nhi,
&worker2.nz_nhi,
identifiers::parse_nz_nhi,
),
za_id_score: identifier_score(&worker1.za_id, &worker2.za_id, identifiers::parse_za_id),
passport_book_score: score_passport_books(
&worker1.passport_books,
&worker2.passport_books,
),
given_name_score: self.score_given_name(worker1, worker2),
family_name_score: self.score_family_name(worker1, worker2),
date_of_birth_score: self.score_date_of_birth(worker1, worker2),
gender_score: self.score_gender(worker1, worker2),
blood_type_score: self.score_blood_type(worker1, worker2),
multiple_birth_score: self.score_multiple_birth(worker1, worker2),
address_score: self.score_address(worker1, worker2),
birth_place_score: self.score_birth_place(worker1, worker2),
death_date_score: self.score_death_date(worker1, worker2),
death_place_score: self.score_death_place(worker1, worker2),
phone_score: self.score_phone(worker1, worker2),
email_score: self.score_email(worker1, worker2),
phonetic_name_score: if self.config.use_phonetic_matching {
self.score_phonetic_names(worker1, worker2)
} else {
None
},
}
}
fn calculate_weighted_score(&self, breakdown: &MatchBreakdown) -> f64 {
let mut total_weight = 0.0;
let mut weighted_sum = 0.0;
if let Some(score) = breakdown.uk_nhs_number_score {
weighted_sum += score * self.config.uk_nhs_number_weight;
total_weight += self.config.uk_nhs_number_weight;
}
if let Some(score) = breakdown.fr_nir_score {
weighted_sum += score * self.config.fr_nir_weight;
total_weight += self.config.fr_nir_weight;
}
if let Some(score) = breakdown.es_tsi_score {
weighted_sum += score * self.config.es_tsi_weight;
total_weight += self.config.es_tsi_weight;
}
if let Some(score) = breakdown.ie_ihi_score {
weighted_sum += score * self.config.ie_ihi_weight;
total_weight += self.config.ie_ihi_weight;
}
if let Some(score) = breakdown.uk_hc_number_score {
weighted_sum += score * self.config.uk_hc_number_weight;
total_weight += self.config.uk_hc_number_weight;
}
if let Some(score) = breakdown.us_ssn_score {
weighted_sum += score * self.config.us_ssn_weight;
total_weight += self.config.us_ssn_weight;
}
if let Some(score) = breakdown.au_ihi_score {
weighted_sum += score * self.config.au_ihi_weight;
total_weight += self.config.au_ihi_weight;
}
if let Some(score) = breakdown.de_kvnr_score {
weighted_sum += score * self.config.de_kvnr_weight;
total_weight += self.config.de_kvnr_weight;
}
if let Some(score) = breakdown.it_cf_score {
weighted_sum += score * self.config.it_cf_weight;
total_weight += self.config.it_cf_weight;
}
if let Some(score) = breakdown.nl_bsn_score {
weighted_sum += score * self.config.nl_bsn_weight;
total_weight += self.config.nl_bsn_weight;
}
if let Some(score) = breakdown.se_workernummer_score {
weighted_sum += score * self.config.se_workernummer_weight;
total_weight += self.config.se_workernummer_weight;
}
if let Some(score) = breakdown.uk_chi_number_score {
weighted_sum += score * self.config.uk_chi_number_weight;
total_weight += self.config.uk_chi_number_weight;
}
if let Some(score) = breakdown.be_nn_score {
weighted_sum += score * self.config.be_nn_weight;
total_weight += self.config.be_nn_weight;
}
if let Some(score) = breakdown.bg_egn_score {
weighted_sum += score * self.config.bg_egn_weight;
total_weight += self.config.bg_egn_weight;
}
if let Some(score) = breakdown.cz_rc_score {
weighted_sum += score * self.config.cz_rc_weight;
total_weight += self.config.cz_rc_weight;
}
if let Some(score) = breakdown.dk_cpr_score {
weighted_sum += score * self.config.dk_cpr_weight;
total_weight += self.config.dk_cpr_weight;
}
if let Some(score) = breakdown.ee_ik_score {
weighted_sum += score * self.config.ee_ik_weight;
total_weight += self.config.ee_ik_weight;
}
if let Some(score) = breakdown.es_dni_score {
weighted_sum += score * self.config.es_dni_weight;
total_weight += self.config.es_dni_weight;
}
if let Some(score) = breakdown.fi_hetu_score {
weighted_sum += score * self.config.fi_hetu_weight;
total_weight += self.config.fi_hetu_weight;
}
if let Some(score) = breakdown.hr_oib_score {
weighted_sum += score * self.config.hr_oib_weight;
total_weight += self.config.hr_oib_weight;
}
if let Some(score) = breakdown.is_kt_score {
weighted_sum += score * self.config.is_kt_weight;
total_weight += self.config.is_kt_weight;
}
if let Some(score) = breakdown.lt_ak_score {
weighted_sum += score * self.config.lt_ak_weight;
total_weight += self.config.lt_ak_weight;
}
if let Some(score) = breakdown.lv_pk_score {
weighted_sum += score * self.config.lv_pk_weight;
total_weight += self.config.lv_pk_weight;
}
if let Some(score) = breakdown.mt_id_score {
weighted_sum += score * self.config.mt_id_weight;
total_weight += self.config.mt_id_weight;
}
if let Some(score) = breakdown.no_fnr_score {
weighted_sum += score * self.config.no_fnr_weight;
total_weight += self.config.no_fnr_weight;
}
if let Some(score) = breakdown.pl_pesel_score {
weighted_sum += score * self.config.pl_pesel_weight;
total_weight += self.config.pl_pesel_weight;
}
if let Some(score) = breakdown.ro_cnp_score {
weighted_sum += score * self.config.ro_cnp_weight;
total_weight += self.config.ro_cnp_weight;
}
if let Some(score) = breakdown.si_emso_score {
weighted_sum += score * self.config.si_emso_weight;
total_weight += self.config.si_emso_weight;
}
if let Some(score) = breakdown.sk_rc_score {
weighted_sum += score * self.config.sk_rc_weight;
total_weight += self.config.sk_rc_weight;
}
if let Some(score) = breakdown.uk_nino_score {
weighted_sum += score * self.config.uk_nino_weight;
total_weight += self.config.uk_nino_weight;
}
if let Some(score) = breakdown.gr_dss_score {
weighted_sum += score * self.config.gr_dss_weight;
total_weight += self.config.gr_dss_weight;
}
if let Some(score) = breakdown.li_id_score {
weighted_sum += score * self.config.li_id_weight;
total_weight += self.config.li_id_weight;
}
if let Some(score) = breakdown.nl_id_score {
weighted_sum += score * self.config.nl_id_weight;
total_weight += self.config.nl_id_weight;
}
if let Some(score) = breakdown.pl_nip_score {
weighted_sum += score * self.config.pl_nip_weight;
total_weight += self.config.pl_nip_weight;
}
if let Some(score) = breakdown.pt_nif_score {
weighted_sum += score * self.config.pt_nif_weight;
total_weight += self.config.pt_nif_weight;
}
if let Some(score) = breakdown.br_cpf_score {
weighted_sum += score * self.config.br_cpf_weight;
total_weight += self.config.br_cpf_weight;
}
if let Some(score) = breakdown.cn_rrn_score {
weighted_sum += score * self.config.cn_rrn_weight;
total_weight += self.config.cn_rrn_weight;
}
if let Some(score) = breakdown.in_aadhaar_score {
weighted_sum += score * self.config.in_aadhaar_weight;
total_weight += self.config.in_aadhaar_weight;
}
if let Some(score) = breakdown.jp_my_number_score {
weighted_sum += score * self.config.jp_my_number_weight;
total_weight += self.config.jp_my_number_weight;
}
if let Some(score) = breakdown.mx_curp_score {
weighted_sum += score * self.config.mx_curp_weight;
total_weight += self.config.mx_curp_weight;
}
if let Some(score) = breakdown.nz_nhi_score {
weighted_sum += score * self.config.nz_nhi_weight;
total_weight += self.config.nz_nhi_weight;
}
if let Some(score) = breakdown.za_id_score {
weighted_sum += score * self.config.za_id_weight;
total_weight += self.config.za_id_weight;
}
if let Some(score) = breakdown.passport_book_score {
weighted_sum += score * self.config.passport_book_weight;
total_weight += self.config.passport_book_weight;
}
if let Some(score) = breakdown.given_name_score {
weighted_sum += score * self.config.given_name_weight;
total_weight += self.config.given_name_weight;
}
if let Some(score) = breakdown.family_name_score {
weighted_sum += score * self.config.family_name_weight;
total_weight += self.config.family_name_weight;
}
if let Some(score) = breakdown.date_of_birth_score {
weighted_sum += score * self.config.date_of_birth_weight;
total_weight += self.config.date_of_birth_weight;
}
if let Some(score) = breakdown.gender_score {
weighted_sum += score * self.config.gender_weight;
total_weight += self.config.gender_weight;
}
if let Some(score) = breakdown.blood_type_score {
weighted_sum += score * self.config.blood_type_weight;
total_weight += self.config.blood_type_weight;
}
if let Some(score) = breakdown.multiple_birth_score {
weighted_sum += score * self.config.multiple_birth_weight;
total_weight += self.config.multiple_birth_weight;
}
if let Some(score) = breakdown.address_score {
weighted_sum += score * self.config.address_weight;
total_weight += self.config.address_weight;
}
if let Some(score) = breakdown.birth_place_score {
weighted_sum += score * self.config.birth_place_weight;
total_weight += self.config.birth_place_weight;
}
if let Some(score) = breakdown.death_date_score {
weighted_sum += score * self.config.death_date_weight;
total_weight += self.config.death_date_weight;
}
if let Some(score) = breakdown.death_place_score {
weighted_sum += score * self.config.death_place_weight;
total_weight += self.config.death_place_weight;
}
if let Some(score) = breakdown.phone_score {
weighted_sum += score * self.config.phone_weight;
total_weight += self.config.phone_weight;
}
if let Some(score) = breakdown.email_score {
weighted_sum += score * self.config.email_weight;
total_weight += self.config.email_weight;
}
if let Some(score) = breakdown.phonetic_name_score
&& score > 0.9
{
weighted_sum += score * 0.05;
total_weight += 0.05;
}
if total_weight > 0.0 {
weighted_sum / total_weight
} else {
0.0
}
}
/// Scores the given names, or `None` when either worker has none.
///
/// When both workers also have a middle name, the result is 95% given-name
/// similarity and 5% middle-name similarity.
fn score_given_name(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let first1 = worker1.given_name.as_ref()?.as_str();
    let first2 = worker2.given_name.as_ref()?.as_str();
    let given = self.score_name(first1, first2);

    let middles = worker1
        .middle_name
        .as_ref()
        .zip(worker2.middle_name.as_ref());
    Some(if let Some((middle1, middle2)) = middles {
        0.95 * given + 0.05 * self.score_name(middle1, middle2)
    } else {
        given
    })
}
/// Scores the family names, or `None` when either worker has none.
fn score_family_name(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let family1 = worker1.family_name.as_ref()?;
    let family2 = worker2.family_name.as_ref()?;
    Some(self.score_name(family1, family2))
}
/// Similarity of two names after normalization, using the configured
/// algorithm. If a non-empty nickname table reports the normalized names as
/// equivalent, the result is raised to at least 0.9.
fn score_name(&self, name1: &str, name2: &str) -> f64 {
    let left = Normalizer::normalize_name(name1);
    let right = Normalizer::normalize_name(name2);

    let similarity = match self.config.name_algorithm {
        SimilarityAlgorithm::JaroWinkler => Scorer::jaro_winkler_similarity(&left, &right),
        SimilarityAlgorithm::Levenshtein => Scorer::levenshtein_similarity(&left, &right),
        SimilarityAlgorithm::Exact => Scorer::exact_match(&left, &right),
        SimilarityAlgorithm::Combined => Scorer::combined_similarity(&left, &right),
    };

    let nicknames = &self.config.nickname_table;
    let known_equivalent = !nicknames.is_empty() && nicknames.are_equivalent(&left, &right);
    if known_equivalent {
        similarity.max(0.9)
    } else {
        similarity
    }
}
/// Scores the dates of birth via `score_dob_pair`, or `None` when either
/// worker has no date of birth.
fn score_date_of_birth(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let born1 = worker1.date_of_birth?;
    let born2 = worker2.date_of_birth?;
    Some(score_dob_pair(born1, born2))
}
/// 1.0 when both genders are present and equal, 0.0 when they differ,
/// `None` when either is missing.
fn score_gender(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let gender1 = worker1.gender?;
    let gender2 = worker2.gender?;
    Some(f64::from(gender1 == gender2))
}
/// 1.0 when both blood types are present and equal, 0.0 when they differ,
/// `None` when either is missing.
fn score_blood_type(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let blood1 = worker1.blood_type?;
    let blood2 = worker2.blood_type?;
    Some(f64::from(blood1 == blood2))
}
/// 1.0 when both multiple-birth values are present and equal, 0.0 when they
/// differ, `None` when either is missing.
fn score_multiple_birth(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let birth1 = worker1.multiple_birth?;
    let birth2 = worker2.multiple_birth?;
    Some(if birth1 == birth2 { 1.0 } else { 0.0 })
}
/// Best address similarity across every pairing of the two workers' current
/// and previous addresses, or `None` when either worker has no address.
fn score_address(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    // Current address (if any) followed by the previous ones.
    fn known_addresses(worker: &Worker) -> Vec<&Address> {
        worker
            .address
            .iter()
            .chain(worker.previous_addresses.iter())
            .collect()
    }

    let ours = known_addresses(worker1);
    let theirs = known_addresses(worker2);
    if ours.is_empty() || theirs.is_empty() {
        return None;
    }

    let best = ours
        .iter()
        .flat_map(|left| theirs.iter().map(move |right| (left, right)))
        .map(|(left, right)| self.compare_addresses(left, right))
        // Strict `>` keeps the running best unless a later pair beats it.
        .fold(f64::NEG_INFINITY, |best, candidate| {
            if candidate > best {
                candidate
            } else {
                best
            }
        });
    Some(best)
}
/// Scores the birth places via `score_named_place`, or `None` when either
/// worker has no birth place.
fn score_birth_place(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let place1 = worker1.birth_place.as_ref()?;
    let place2 = worker2.birth_place.as_ref()?;
    score_named_place(place1, place2)
}
/// Scores the death places via `score_named_place`, or `None` when either
/// worker has no death place.
fn score_death_place(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let place1 = worker1.death_place.as_ref()?;
    let place2 = worker2.death_place.as_ref()?;
    score_named_place(place1, place2)
}
/// Scores the death dates with the same pair scorer used for dates of birth,
/// or `None` when either worker has no death date.
fn score_death_date(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    match (worker1.death_date, worker2.death_date) {
        (Some(died1), Some(died2)) => Some(score_dob_pair(died1, died2)),
        _ => None,
    }
}
/// Similarity of two addresses as a weighted average over the parts present
/// on both sides: postcode (0.5, exact after normalization), city (0.3,
/// Jaro-Winkler on normalized names) and first line (0.2).
///
/// The first-line score is the street similarity, blended 60/40 with house
/// number equality when both lines yield a house number. Returns 0.5 when
/// the two addresses have no part in common to compare.
fn compare_addresses(&self, addr1: &Address, addr2: &Address) -> f64 {
    const POSTCODE_WEIGHT: f64 = 0.5;
    const CITY_WEIGHT: f64 = 0.3;
    const LINE1_WEIGHT: f64 = 0.2;

    let postcode = addr1
        .postcode
        .as_ref()
        .zip(addr2.postcode.as_ref())
        .map(|(a, b)| {
            f64::from(Normalizer::normalize_postcode(a) == Normalizer::normalize_postcode(b))
        });

    let city = addr1.city.as_ref().zip(addr2.city.as_ref()).map(|(a, b)| {
        let left = Normalizer::normalize_name(a);
        let right = Normalizer::normalize_name(b);
        Scorer::jaro_winkler_similarity(&left, &right)
    });

    let line1 = addr1.line1.as_ref().zip(addr2.line1.as_ref()).map(|(a, b)| {
        let left = Normalizer::parse_address_line(a);
        let right = Normalizer::parse_address_line(b);
        let street = Scorer::jaro_winkler_similarity(&left.street, &right.street);
        match (&left.house_number, &right.house_number) {
            (Some(x), Some(y)) => 0.6 * street + 0.4 * f64::from(x == y),
            _ => street,
        }
    });

    let mut weighted_sum = 0.0_f64;
    let mut total_weight = 0.0_f64;
    for (part, weight) in [
        (postcode, POSTCODE_WEIGHT),
        (city, CITY_WEIGHT),
        (line1, LINE1_WEIGHT),
    ] {
        if let Some(score) = part {
            weighted_sum += score * weight;
            total_weight += weight;
        }
    }

    if total_weight == 0.0 {
        // Nothing comparable: neither evidence for nor against.
        0.5
    } else {
        weighted_sum / total_weight
    }
}
/// Compares one number per worker (`phone`, falling back to `mobile`);
/// `None` when either worker has neither.
///
/// If both numbers normalize through `normalize_phone_e164`, those forms are
/// compared; otherwise the `normalize_phone` forms are compared. The result
/// is 1.0 for equal and 0.0 for different.
fn score_phone(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let raw1 = worker1.phone.as_ref().or(worker1.mobile.as_ref())?;
    let raw2 = worker2.phone.as_ref().or(worker2.mobile.as_ref())?;
    let region = self.config.phone_default_country.as_deref();

    let same_number = match (
        Normalizer::normalize_phone_e164(raw1, region),
        Normalizer::normalize_phone_e164(raw2, region),
    ) {
        (Some(first), Some(second)) => first == second,
        // At least one side failed E.164 normalization: compare the looser form.
        _ => Normalizer::normalize_phone(raw1) == Normalizer::normalize_phone(raw2),
    };
    Some(f64::from(same_number))
}
/// 1.0 when both e-mail addresses normalize to the same value, 0.0 when they
/// normalize to different values, and `None` when either address is missing
/// or fails to normalize.
fn score_email(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let (Some(email1), Some(email2)) = (worker1.email.as_ref(), worker2.email.as_ref()) else {
        return None;
    };
    let fold_dots = self.config.gmail_dot_folding;

    match (
        Normalizer::normalize_email(email1, fold_dots),
        Normalizer::normalize_email(email2, fold_dots),
    ) {
        (Some(first), Some(second)) => Some(f64::from(first == second)),
        _ => None,
    }
}
/// Compares the phonetic codes of both name parts.
///
/// Returns `None` unless both workers have a given name and a family name.
/// Otherwise each part contributes half of the score: `1.0` when both
/// phonetic codes agree, `0.5` when exactly one agrees, `0.0` when neither does.
fn score_phonetic_names(&self, worker1: &Worker, worker2: &Worker) -> Option<f64> {
    let given1 = Normalizer::phonetic_code(worker1.given_name.as_ref()?);
    let family1 = Normalizer::phonetic_code(worker1.family_name.as_ref()?);
    let given2 = Normalizer::phonetic_code(worker2.given_name.as_ref()?);
    let family2 = Normalizer::phonetic_code(worker2.family_name.as_ref()?);

    let score = match (given1 == given2, family1 == family2) {
        (true, true) => 1.0,
        (false, false) => 0.0,
        _ => 0.5,
    };
    Some(score)
}
}
/// Returns `true` only when both identifiers are present, both are accepted
/// by `parser`, and their parsed (canonical) forms are equal.
///
/// A missing identifier or one that `parser` rejects makes the result `false`.
fn identifier_equal<F>(a: &Option<String>, b: &Option<String>, parser: F) -> bool
where
    F: Fn(&str) -> Option<String>,
{
    let (Some(raw_a), Some(raw_b)) = (a.as_deref(), b.as_deref()) else {
        return false;
    };
    // Both sides are parsed before comparing.
    let canonical_a = parser(raw_a);
    let canonical_b = parser(raw_b);
    matches!((canonical_a, canonical_b), (Some(left), Some(right)) if left == right)
}
/// Scores a pair of identifiers after parsing them with `parser`.
///
/// Returns `Some(1.0)` when the parsed forms are equal and `Some(0.0)` when
/// they differ. Returns `None` when either identifier is missing or is
/// rejected by `parser`, so that no score is produced for that pair.
fn identifier_score<F>(a: &Option<String>, b: &Option<String>, parser: F) -> Option<f64>
where
    F: Fn(&str) -> Option<String>,
{
    let (raw_a, raw_b) = a.as_deref().zip(b.as_deref())?;
    match (parser(raw_a), parser(raw_b)) {
        (Some(canonical_a), Some(canonical_b)) => {
            Some(if canonical_a == canonical_b { 1.0 } else { 0.0 })
        }
        _ => None,
    }
}
/// Returns `true` when some book in `a` and some book in `b` have the same
/// `country` *and* the same `number`. Either slice being empty yields `false`.
fn passport_books_share_pair(a: &[PassportBook], b: &[PassportBook]) -> bool {
    a.iter().any(|left| {
        b.iter()
            .any(|right| left.country == right.country && left.number == right.number)
    })
}
/// Scores two passport-book lists: `None` when either list is empty,
/// otherwise `Some(1.0)` if `passport_books_share_pair` finds a common book
/// and `Some(0.0)` if it does not.
fn score_passport_books(a: &[PassportBook], b: &[PassportBook]) -> Option<f64> {
    match (a.is_empty(), b.is_empty()) {
        (false, false) => Some(if passport_books_share_pair(a, b) { 1.0 } else { 0.0 }),
        _ => None,
    }
}
/// Scores two named places (such as a birth or death place) by city and country.
///
/// * City: Jaro-Winkler similarity of the normalised names, when both sides
///   have a city.
/// * Country: `1.0` / `0.0` for equality of the normalised names, when both
///   sides have a country.
///
/// With both components available the result is `0.7 * city + 0.3 * country`.
/// With only one, that component is returned as-is. With neither, `None`.
fn score_named_place(a: &Address, b: &Address) -> Option<f64> {
    let city_similarity = a.city.as_ref().zip(b.city.as_ref()).map(|(left, right)| {
        Scorer::jaro_winkler_similarity(
            &Normalizer::normalize_name(left),
            &Normalizer::normalize_name(right),
        )
    });
    let country_agreement = a
        .country
        .as_ref()
        .zip(b.country.as_ref())
        .map(|(left, right)| {
            f64::from(Normalizer::normalize_name(left) == Normalizer::normalize_name(right))
        });

    match (city_similarity, country_agreement) {
        (Some(city), Some(country)) => Some(0.7 * city + 0.3 * country),
        // At most one component is present: return it, or `None` if neither is.
        (city, country) => city.or(country),
    }
}
/// Scores two dates: `1.0` when identical, `0.5` when they share a year and
/// `dob2` equals `dob1` with day and month transposed, `0.0` otherwise.
///
/// The transposed date is built with `NaiveDate::from_ymd_opt`, so a swap that
/// is not a real calendar date (for example a day above 12 used as a month)
/// simply scores `0.0`.
fn score_dob_pair(dob1: NaiveDate, dob2: NaiveDate) -> f64 {
    if dob1 == dob2 {
        return 1.0;
    }
    if dob1.year() != dob2.year() {
        return 0.0;
    }
    // Reinterpret dob1's day as the month and its month as the day.
    match NaiveDate::from_ymd_opt(dob1.year(), dob1.day(), dob1.month()) {
        Some(transposed) if transposed == dob2 => 0.5,
        _ => 0.0,
    }
}
// Unit tests for the MatchConfig presets, the MatchingEngine entry points
// (match_workers, deterministic_match, match_one_to_many, rank_one_to_many,
// compare_addresses), the Confidence bands and the free scoring helpers.
#[cfg(test)]
mod tests {
use super::*;
use crate::models::Gender;
use chrono::NaiveDate;
// Fixture shorthand for a calendar date; panics with "valid date" when
// year/month/day do not form a real date.
fn dob(y: i32, m: u32, d: u32) -> NaiveDate {
NaiveDate::from_ymd_opt(y, m, d).expect("valid date")
}
// --- MatchConfig presets and serde round-trips ---
#[test]
fn config_default_values() {
let c = MatchConfig::default();
assert!((c.match_threshold - 0.85).abs() < 1e-9);
assert!((c.uk_nhs_number_weight - 0.30).abs() < 1e-9);
assert!(c.use_phonetic_matching);
assert!(!c.strict_mode);
}
#[test]
fn config_strict_raises_threshold_and_sets_flag() {
let c = MatchConfig::strict();
assert!((c.match_threshold - 0.95).abs() < 1e-9);
assert!(c.strict_mode);
}
#[test]
fn config_default_round_trips_through_json() {
let cfg = MatchConfig::default();
let json = serde_json::to_string(&cfg).expect("serialise");
let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
assert!((cfg.match_threshold - back.match_threshold).abs() < 1e-12);
assert!((cfg.uk_nhs_number_weight - back.uk_nhs_number_weight).abs() < 1e-12);
assert_eq!(cfg.use_phonetic_matching, back.use_phonetic_matching);
assert!(matches!(back.name_algorithm, SimilarityAlgorithm::Combined));
assert_eq!(cfg.strict_mode, back.strict_mode);
assert_eq!(cfg.nickname_table, back.nickname_table);
assert_eq!(cfg.gmail_dot_folding, back.gmail_dot_folding);
assert_eq!(cfg.phone_default_country, back.phone_default_country);
}
#[test]
fn config_strict_round_trips_through_json() {
let cfg = MatchConfig::strict();
let json = serde_json::to_string(&cfg).expect("serialise");
let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
assert!((back.match_threshold - 0.95).abs() < 1e-12);
assert!(back.strict_mode);
}
#[test]
fn config_lenient_round_trips_through_json() {
let cfg = MatchConfig::lenient();
let json = serde_json::to_string(&cfg).expect("serialise");
let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
assert!((back.match_threshold - 0.75).abs() < 1e-12);
}
// MatchConfig is #[serde(default)], so keys absent from the JSON must take
// the values of MatchConfig::default().
#[test]
fn config_partial_json_fills_missing_fields_from_default() {
let partial = r#"{"match_threshold": 0.80, "gmail_dot_folding": true}"#;
let cfg: MatchConfig = serde_json::from_str(partial).expect("partial json");
assert!((cfg.match_threshold - 0.80).abs() < 1e-12);
assert!(cfg.gmail_dot_folding);
assert!((cfg.uk_nhs_number_weight - 0.30).abs() < 1e-12);
assert!(matches!(cfg.name_algorithm, SimilarityAlgorithm::Combined));
assert_eq!(cfg.phone_default_country.as_deref(), Some("GB"));
}
#[test]
fn similarity_algorithm_round_trips_through_json() {
for alg in [
SimilarityAlgorithm::JaroWinkler,
SimilarityAlgorithm::Levenshtein,
SimilarityAlgorithm::Exact,
SimilarityAlgorithm::Combined,
] {
let json = serde_json::to_string(&alg).expect("serialise");
let back: SimilarityAlgorithm = serde_json::from_str(&json).expect("deserialise");
assert_eq!(alg, back);
}
}
#[test]
fn config_lenient_lowers_threshold() {
let c = MatchConfig::lenient();
assert!((c.match_threshold - 0.75).abs() < 1e-9);
assert!(c.use_phonetic_matching);
}
// --- Probabilistic scoring via match_workers ---
#[test]
fn exact_clone_is_a_match() {
let p = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.uk_nhs_number("9434765919")
.build();
let result = MatchingEngine::default_config().match_workers(&p, &p.clone());
assert!(result.is_match);
assert!(result.score > 0.95);
}
#[test]
fn fuzzy_given_name_still_matches() {
let a = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let b = Worker::builder()
.given_name("Jon")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let r = MatchingEngine::default_config().match_workers(&a, &b);
assert!(r.is_match);
assert!(r.score > 0.85);
}
// NOTE(review): the name says "patients" but the fixtures are Worker records.
#[test]
fn completely_different_patients_do_not_match() {
let a = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let b = Worker::builder()
.given_name("Jane")
.family_name("Doe")
.date_of_birth(dob(1990, 3, 20))
.gender(Gender::Female)
.build();
let r = MatchingEngine::default_config().match_workers(&a, &b);
assert!(!r.is_match);
assert!(r.score < 0.5);
}
// One side has only a given name and the other only a family name, so no
// field is populated on both records.
#[test]
fn no_overlapping_fields_returns_zero_score() {
let a = Worker::builder().given_name("Solo").build();
let b = Worker::builder().family_name("Only").build();
let r = MatchingEngine::default_config().match_workers(&a, &b);
assert_eq!(r.score, 0.0);
assert!(!r.is_match);
}
#[test]
fn unparseable_uk_nhs_number_is_none_not_zero() {
let a = Worker::builder()
.uk_nhs_number("not-a-number")
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.build();
let b = Worker::builder()
.uk_nhs_number("also-not-a-number")
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.build();
let r = MatchingEngine::default_config().match_workers(&a, &b);
assert_eq!(
r.breakdown.uk_nhs_number_score, None,
"unparseable NHS numbers should not produce a 0.0 penalty"
);
assert!(r.is_match, "should still match on demographics");
}
#[test]
fn missing_field_yields_none_in_breakdown() {
let a = Worker::builder().given_name("Ada").build();
let b = Worker::builder()
.given_name("Ada")
.family_name("Lovelace")
.build();
let r = MatchingEngine::default_config().match_workers(&a, &b);
assert!(r.breakdown.given_name_score.is_some());
assert!(r.breakdown.family_name_score.is_none());
}
// Scores the same record against itself with phonetic matching on and off;
// enabling it must never lower the score.
#[test]
fn phonetic_match_is_a_bonus_not_a_penalty() {
let p = Worker::builder()
.given_name("Stephen")
.family_name("Jones")
.build();
let with_phon = MatchingEngine::new(MatchConfig {
use_phonetic_matching: true,
..MatchConfig::default()
})
.match_workers(&p, &p.clone());
let without_phon = MatchingEngine::new(MatchConfig {
use_phonetic_matching: false,
..MatchConfig::default()
})
.match_workers(&p, &p.clone());
assert!(with_phon.score >= without_phon.score);
}
#[test]
fn phonetic_score_disabled_when_config_off() {
let p = Worker::builder()
.given_name("Steven")
.family_name("Smith")
.build();
let q = Worker::builder()
.given_name("Stephen")
.family_name("Smyth")
.build();
let r = MatchingEngine::new(MatchConfig {
use_phonetic_matching: false,
..MatchConfig::default()
})
.match_workers(&p, &q);
assert_eq!(r.breakdown.phonetic_name_score, None);
}
// --- compare_addresses (more cases near the end of this module) ---
#[test]
fn address_with_no_subfields_is_neutral_half() {
let a = Address::new();
let b = Address::new();
let engine = MatchingEngine::default_config();
let score = engine.compare_addresses(&a, &b);
assert!(
(score - 0.5).abs() < 1e-9,
"empty addresses must be neutral (0.5), got {score}"
);
}
// NOTE(review): only asserts s > 0.0; the exact value for this input is
// pinned by address_subscore_postcode_only_match_returns_one.
#[test]
fn address_postcode_dominates() {
let mut a = Address::new();
a.postcode = Some("CF10 1AA".into());
let mut b = Address::new();
b.postcode = Some("CF10 1AA".into());
let s = MatchingEngine::default_config().compare_addresses(&a, &b);
assert!(s > 0.0);
}
// --- deterministic_match ---
// Same NHS number written with and without spaces; the given names differ
// on purpose.
#[test]
fn deterministic_uk_nhs_match_overrides_demographics() {
let a = Worker::builder()
.uk_nhs_number("943 476 5919")
.given_name("Bob")
.build();
let b = Worker::builder()
.uk_nhs_number("9434765919")
.given_name("Alice") .build();
assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
}
#[test]
fn deterministic_demographics_match_when_all_align() {
let p = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
assert!(MatchingEngine::default_config().deterministic_match(&p, &p.clone()));
}
#[test]
fn deterministic_demographics_tolerates_missing_gender() {
let a = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.build();
let b = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
}
#[test]
fn deterministic_rejects_when_dob_differs() {
let a = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let b = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 16)) .gender(Gender::Male)
.build();
assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
}
#[test]
fn deterministic_rejects_when_gender_differs() {
let a = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let b = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Female)
.build();
assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
}
// --- strict_mode interaction with is_match ---
// "Jon" vs "John" must score at or above the 0.85 threshold yet fail
// deterministic_match, so strict mode has to report is_match == false.
#[test]
fn strict_mode_requires_deterministic_for_is_match() {
let cfg = MatchConfig {
match_threshold: 0.85,
strict_mode: true,
..MatchConfig::default()
};
let p1 = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let p2 = Worker::builder()
.given_name("Jon") .family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let engine = MatchingEngine::new(cfg);
let r = engine.match_workers(&p1, &p2);
assert!(
r.score >= 0.85,
"fuzzy score should clear lowered threshold"
);
assert!(!engine.deterministic_match(&p1, &p2));
assert!(
!r.is_match,
"strict mode must reject fuzzy-only matches even above threshold"
);
}
#[test]
fn strict_mode_accepts_when_deterministic_holds() {
let cfg = MatchConfig::strict();
let p1 = Worker::builder()
.uk_nhs_number("9434765919")
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let p2 = p1.clone();
let r = MatchingEngine::new(cfg).match_workers(&p1, &p2);
assert!(r.is_match);
}
#[test]
fn non_strict_mode_accepts_fuzzy_match_above_threshold() {
let p1 = Worker::builder()
.given_name("John")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let p2 = Worker::builder()
.given_name("Jon")
.family_name("Smith")
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let r = MatchingEngine::default_config().match_workers(&p1, &p2);
assert!(r.is_match);
}
// --- Batch APIs: match_one_to_many / rank_one_to_many ---
#[test]
fn match_one_to_many_empty_candidates_yields_empty_vec() {
let engine = MatchingEngine::default_config();
let q = Worker::builder().given_name("Solo").build();
assert!(engine.match_one_to_many(&q, &[]).is_empty());
}
// Results must line up with the candidate order: index 1 is the clone of
// the query and therefore the best score.
#[test]
fn match_one_to_many_preserves_order() {
let engine = MatchingEngine::default_config();
let q = Worker::builder()
.given_name("Ada")
.family_name("Lovelace")
.build();
let candidates = vec![
Worker::builder()
.given_name("Grace")
.family_name("Hopper")
.build(),
q.clone(),
Worker::builder()
.given_name("Alan")
.family_name("Turing")
.build(),
];
let r = engine.match_one_to_many(&q, &candidates);
assert_eq!(r.len(), 3);
assert!(r[1].score > r[0].score);
assert!(r[1].score > r[2].score);
assert!(r[1].is_match);
}
#[test]
fn match_one_to_many_matches_individual_scoring() {
let engine = MatchingEngine::default_config();
let q = Worker::builder()
.given_name("Ada")
.family_name("Lovelace")
.build();
let candidates = vec![
Worker::builder()
.given_name("Ada")
.family_name("Lovelace")
.build(),
Worker::builder()
.given_name("Alan")
.family_name("Turing")
.build(),
];
let batch = engine.match_one_to_many(&q, &candidates);
for (i, c) in candidates.iter().enumerate() {
let individual = engine.match_workers(&q, c);
assert!((batch[i].score - individual.score).abs() < 1e-12);
assert_eq!(batch[i].is_match, individual.is_match);
assert_eq!(batch[i].confidence, individual.confidence);
}
}
// Ranked entries are (original candidate index, result) pairs; the clone
// of the query sits at original index 1 and must come first.
#[test]
fn rank_one_to_many_sorts_by_score_descending() {
let engine = MatchingEngine::default_config();
let q = Worker::builder()
.given_name("Ada")
.family_name("Lovelace")
.build();
let candidates = vec![
Worker::builder()
.given_name("Grace")
.family_name("Hopper")
.build(),
q.clone(),
Worker::builder()
.given_name("Alan")
.family_name("Turing")
.build(),
];
let ranked = engine.rank_one_to_many(&q, &candidates);
assert_eq!(ranked.len(), 3);
assert_eq!(ranked[0].0, 1);
for w in ranked.windows(2) {
assert!(w[0].1.score >= w[1].1.score);
}
}
#[test]
fn rank_one_to_many_breaks_ties_by_ascending_original_index() {
let engine = MatchingEngine::default_config();
let q = Worker::builder()
.given_name("Ada")
.family_name("Lovelace")
.build();
let twin = q.clone();
let candidates = vec![twin.clone(), twin.clone(), twin.clone()];
let ranked = engine.rank_one_to_many(&q, &candidates);
assert_eq!(ranked.len(), 3);
assert_eq!(ranked[0].0, 0);
assert_eq!(ranked[1].0, 1);
assert_eq!(ranked[2].0, 2);
}
#[test]
fn match_one_to_many_is_deterministic_across_calls() {
let engine = MatchingEngine::default_config();
let q = Worker::builder().given_name("X").family_name("Y").build();
let candidates = vec![
Worker::builder().given_name("X").family_name("Y").build(),
Worker::builder().given_name("A").family_name("B").build(),
];
let a = engine.match_one_to_many(&q, &candidates);
let b = engine.match_one_to_many(&q, &candidates);
for i in 0..a.len() {
assert!((a[i].score - b[i].score).abs() < 1e-12);
}
}
// --- score_dob_pair ---
#[test]
fn dob_pair_exact_equal_scores_one() {
assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1995, 1, 10)), 1.0);
}
#[test]
fn dob_pair_day_month_swap_scores_half() {
assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1995, 10, 1)), 0.5);
assert_eq!(score_dob_pair(dob(1995, 10, 1), dob(1995, 1, 10)), 0.5);
}
#[test]
fn dob_pair_swap_requires_year_to_match() {
assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1996, 10, 1)), 0.0);
}
// Day 25 cannot be used as a month, so the first date has no valid
// day/month transposition.
#[test]
fn dob_pair_swap_skipped_when_day_exceeds_12() {
assert_eq!(score_dob_pair(dob(1995, 1, 25), dob(1995, 1, 26)), 0.0);
assert_eq!(score_dob_pair(dob(1995, 1, 25), dob(1995, 2, 25)), 0.0);
}
#[test]
fn dob_pair_unrelated_dates_score_zero() {
assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1980, 6, 30)), 0.0);
assert_eq!(score_dob_pair(dob(1995, 1, 10), dob(1995, 1, 11)), 0.0);
}
#[test]
fn dob_pair_day_equals_month_collapses_to_exact() {
assert_eq!(score_dob_pair(dob(1995, 5, 5), dob(1995, 5, 5)), 1.0);
assert_eq!(score_dob_pair(dob(1995, 5, 5), dob(1995, 5, 6)), 0.0);
}
// NOTE(review): in the first assertion both dates are equal, so
// score_dob_pair returns 1.0 before it attempts the swap; the invalid-swap
// path is exercised by dob_pair_swap_skipped_when_day_exceeds_12 instead.
#[test]
fn dob_pair_invalid_swap_target_does_not_panic() {
assert_eq!(score_dob_pair(dob(2000, 2, 29), dob(2000, 2, 29)), 1.0);
assert_eq!(score_dob_pair(dob(2000, 2, 12), dob(2000, 12, 2)), 0.5);
}
// A transposed date of birth earns a partial probabilistic score (next
// test) but must not count as a deterministic match.
#[test]
fn deterministic_match_still_rejects_transposed_dob() {
let a = Worker::builder()
.given_name("Thomas")
.family_name("Price")
.date_of_birth(dob(1995, 1, 10))
.gender(Gender::Male)
.build();
let b = Worker::builder()
.given_name("Thomas")
.family_name("Price")
.date_of_birth(dob(1995, 10, 1))
.gender(Gender::Male)
.build();
assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
}
#[test]
fn transposed_dob_lifts_probabilistic_score_above_zero() {
let a = Worker::builder()
.given_name("Thomas")
.family_name("Price")
.date_of_birth(dob(1995, 1, 10))
.gender(Gender::Male)
.build();
let b = Worker::builder()
.given_name("Thomas")
.family_name("Price")
.date_of_birth(dob(1995, 10, 1))
.gender(Gender::Male)
.build();
let r = MatchingEngine::default_config().match_workers(&a, &b);
assert_eq!(r.breakdown.date_of_birth_score, Some(0.5));
assert!(r.score > 0.6);
}
// --- Confidence bands ---
#[test]
fn confidence_band_boundaries_are_inclusive_on_the_low_side() {
assert_eq!(Confidence::from_score(0.90), Confidence::High);
assert_eq!(Confidence::from_score(0.89), Confidence::Medium);
assert_eq!(Confidence::from_score(0.75), Confidence::Medium);
assert_eq!(Confidence::from_score(0.74), Confidence::Low);
}
#[test]
fn confidence_handles_degenerate_inputs_gracefully() {
assert_eq!(Confidence::from_score(f64::NAN), Confidence::Low);
assert_eq!(Confidence::from_score(-1.0), Confidence::Low);
assert_eq!(Confidence::from_score(2.0), Confidence::High);
}
#[test]
fn confidence_is_independent_of_match_threshold() {
let p = Worker::builder()
.given_name("Ada")
.family_name("Lovelace")
.build();
let strict = MatchingEngine::new(MatchConfig::strict()).match_workers(&p, &p.clone());
let lenient = MatchingEngine::new(MatchConfig::lenient()).match_workers(&p, &p.clone());
assert_eq!(strict.confidence, lenient.confidence);
assert_eq!(strict.confidence, Confidence::High);
}
#[test]
fn match_result_carries_confidence() {
let p = Worker::builder()
.given_name("Ada")
.family_name("Lovelace")
.build();
let r = MatchingEngine::default_config().match_workers(&p, &p.clone());
assert_eq!(r.confidence, Confidence::High);
}
#[test]
fn match_result_confidence_round_trips_via_serde() {
let p = Worker::builder()
.given_name("Ada")
.family_name("Lovelace")
.build();
let r = MatchingEngine::default_config().match_workers(&p, &p.clone());
let json = serde_json::to_string(&r).unwrap();
let back: MatchResult = serde_json::from_str(&json).unwrap();
assert_eq!(r.confidence, back.confidence);
}
// Identical date of birth and gender but no names on either record.
#[test]
fn deterministic_rejects_when_names_missing() {
let a = Worker::builder()
.date_of_birth(dob(1980, 5, 15))
.gender(Gender::Male)
.build();
let b = a.clone();
assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
}
// --- score_named_place ---
#[test]
fn score_named_place_both_subfields_blend_seven_three() {
let a = Address::new().with_city("Paris").with_country("France");
let b = Address::new().with_city("Paris").with_country("France");
assert_eq!(score_named_place(&a, &b), Some(1.0));
}
#[test]
fn score_named_place_city_only_matches_returns_city_score() {
let a = Address::new().with_city("Cardiff");
let b = Address::new().with_city("Cardiff");
assert_eq!(score_named_place(&a, &b), Some(1.0));
}
#[test]
fn score_named_place_country_only_matches_returns_country_score() {
let a = Address::new().with_country("Wales");
let b = Address::new().with_country("Wales");
assert_eq!(score_named_place(&a, &b), Some(1.0));
}
#[test]
fn score_named_place_empty_returns_none() {
let a = Address::new();
let b = Address::new();
assert_eq!(score_named_place(&a, &b), None);
}
// The matching city supplies the 0.7 share; the mismatching country adds 0.0.
#[test]
fn score_named_place_city_partial_country_mismatch_blends() {
let a = Address::new().with_city("Paris").with_country("France");
let b = Address::new().with_city("Paris").with_country("USA");
let s = score_named_place(&a, &b).unwrap();
assert!((s - 0.7).abs() < 1e-9);
}
// --- Death date / death place fields ---
#[test]
fn match_config_default_carries_death_weights() {
let c = MatchConfig::default();
assert!((c.death_date_weight - 0.10).abs() < 1e-9);
assert!((c.death_place_weight - 0.05).abs() < 1e-9);
}
#[test]
fn breakdown_carries_death_date_score_when_both_sides_present() {
let p1 = Worker::builder()
.given_name("X")
.family_name("Y")
.death_date(dob(2020, 3, 14))
.build();
let p2 = Worker::builder()
.given_name("X")
.family_name("Y")
.death_date(dob(2020, 3, 14))
.build();
let r = MatchingEngine::default_config().match_workers(&p1, &p2);
assert_eq!(r.breakdown.death_date_score, Some(1.0));
}
// --- compare_addresses sub-score blending ---
// Same postcode, city and house number; only the street name differs
// ("High Street" vs "High Road").
#[test]
fn address_subscore_exact_postcode_plus_slightly_different_street_clears_seven_tenths() {
let engine = MatchingEngine::default_config();
let a = Address::new();
let a = Address {
line1: Some("10 High Street".into()),
postcode: Some("CF10 1AA".into()),
city: Some("Cardiff".into()),
..a
};
let b = Address {
line1: Some("10 High Road".into()),
postcode: Some("CF10 1AA".into()),
city: Some("Cardiff".into()),
..Address::new()
};
let s = engine.compare_addresses(&a, &b);
assert!(
s >= 0.7,
"exact postcode + slight street typo should score ≥ 0.7: {s}"
);
}
// With the postcode as the only comparable sub-field, the weighted average
// is just the postcode score.
#[test]
fn address_subscore_postcode_only_match_returns_one() {
let engine = MatchingEngine::default_config();
let a = Address {
postcode: Some("CF10 1AA".into()),
..Address::new()
};
let b = Address {
postcode: Some("CF10 1AA".into()),
..Address::new()
};
let s = engine.compare_addresses(&a, &b);
assert!((s - 1.0).abs() < 1e-9, "postcode-only match: {s}");
}
// NOTE(review): asserts the same thing as
// address_with_no_subfields_is_neutral_half above.
#[test]
fn address_subscore_no_comparable_fields_returns_neutral_half() {
let engine = MatchingEngine::default_config();
let s = engine.compare_addresses(&Address::new(), &Address::new());
assert!((s - 0.5).abs() < 1e-9, "neutral fallback: {s}");
}
#[test]
fn address_subscore_postcode_match_plus_street_mismatch_dominated_by_postcode() {
let engine = MatchingEngine::default_config();
let a = Address {
postcode: Some("CF10 1AA".into()),
line1: Some("Wholly Different".into()),
..Address::new()
};
let b = Address {
postcode: Some("CF10 1AA".into()),
line1: Some("Completely Other".into()),
..Address::new()
};
let s = engine.compare_addresses(&a, &b);
assert!(s >= 0.5, "postcode should still dominate: {s}");
}
// Only p1 carries a death place, so the breakdown must not contain a
// death-place score.
#[test]
fn breakdown_omits_death_place_score_when_one_side_absent() {
let p1 = Worker::builder()
.given_name("X")
.family_name("Y")
.death_place(Address::new().with_city("Cambridge"))
.build();
let p2 = Worker::builder().given_name("X").family_name("Y").build();
let r = MatchingEngine::default_config().match_workers(&p1, &p2);
assert_eq!(r.breakdown.death_place_score, None);
}
}