Skip to main content

zer_blocking/keys/
phonetic_initial.rs

1use rphonetic::{DoubleMetaphone, Encoder, Soundex};
2use zer_core::{record::Record, schema::Schema};
3
4use crate::normalize::{extract_surname_token, normalize_text};
5use super::{BlockingKey, phonetic::PhoneticAlgo};
6
7/// Composite blocking key: `"{phonetic(surname)}:{initial(firstname)}:{year}"`.
8///
9/// Compared to `PhoneticNameDobKey`, splitting by first-name initial reduces
10/// bucket size ~20× for high-frequency surnames (e.g. Dutch "De Jong", "Jansen"),
11/// dramatically cutting false candidate pairs while preserving recall for true
12/// matches that share surname phonetic code, first-name initial, and birth year.
13pub struct PhoneticNameDobInitialKey {
14    algo:          PhoneticAlgo,
15    name_field:    String,
16    initial_field: String,
17    dob_field:     String,
18}
19
20impl PhoneticNameDobInitialKey {
21    pub fn new(name_field: &str, initial_field: &str, dob_field: &str) -> Self {
22        Self {
23            algo:          PhoneticAlgo::DoubleMetaphone,
24            name_field:    name_field.into(),
25            initial_field: initial_field.into(),
26            dob_field:     dob_field.into(),
27        }
28    }
29
30    pub fn with_algo(mut self, algo: PhoneticAlgo) -> Self {
31        self.algo = algo;
32        self
33    }
34
35    fn encode(&self, s: &str) -> String {
36        match self.algo {
37            PhoneticAlgo::DoubleMetaphone => DoubleMetaphone::default().encode(s),
38            PhoneticAlgo::Soundex        => Soundex::default().encode(s),
39        }
40    }
41}
42
43impl BlockingKey for PhoneticNameDobInitialKey {
44    fn name(&self) -> &str {
45        "phonetic_initial_dob"
46    }
47
48    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
49        let surname_cow = record.field_as_str(&self.name_field);
50        let surname_raw = match surname_cow.as_deref() {
51            Some(s) => s,
52            None    => return vec![],
53        };
54        let initial_cow = record.field_as_str(&self.initial_field);
55        let initial_raw = match initial_cow.as_deref() {
56            Some(s) => s,
57            None    => return vec![],
58        };
59        let dob_cow = record.field_as_str(&self.dob_field);
60        let dob_raw = match dob_cow.as_deref() {
61            Some(s) => s,
62            None    => return vec![],
63        };
64
65        let normalized = normalize_text(surname_raw);
66        let surname    = extract_surname_token(&normalized);
67        if surname.is_empty() {
68            return vec![];
69        }
70
71        let code = self.encode(surname);
72        if code.is_empty() {
73            return vec![];
74        }
75
76        let initial = initial_raw
77            .trim()
78            .chars()
79            .next()
80            .map(|c| c.to_uppercase().to_string())
81            .unwrap_or_default();
82        if initial.is_empty() {
83            return vec![];
84        }
85
86        let year = dob_raw.trim().get(..4).unwrap_or("").to_string();
87        if year.len() < 4 || !year.chars().all(|c| c.is_ascii_digit()) {
88            return vec![];
89        }
90
91        vec![format!("{}:{}:{}", code, initial, year)]
92    }
93}
94
#[cfg(test)]
mod tests {
    use super::*;
    use zer_core::{
        record::FieldValue,
        schema::{FieldKind, SchemaBuilder},
    };

    /// Schema declaring the three fields the key under test reads.
    fn make_schema() -> Schema {
        let builder = SchemaBuilder::new()
            .field("last_name", FieldKind::Name)
            .field("first_name", FieldKind::Name)
            .field("dob", FieldKind::Date);
        builder.build().unwrap()
    }

    /// Record with all three fields populated as text values.
    fn make_record(id: u64, last: &str, first: &str, dob: &str) -> Record {
        let text = |s: &str| FieldValue::Text(s.into());
        Record::new(id)
            .insert("last_name", text(last))
            .insert("first_name", text(first))
            .insert("dob", text(dob))
    }

    /// Schema plus a key wired to the field names used by `make_record`.
    fn fixture() -> (Schema, PhoneticNameDobInitialKey) {
        (
            make_schema(),
            PhoneticNameDobInitialKey::new("last_name", "first_name", "dob"),
        )
    }

    /// Extracts keys for two records using one shared schema/key fixture.
    fn keys_for_pair(first: &Record, second: &Record) -> (Vec<String>, Vec<String>) {
        let (schema, key) = fixture();
        (key.extract(first, &schema), key.extract(second, &schema))
    }

    #[test]
    fn same_surname_same_initial_same_year_collide() {
        let anna = make_record(1, "Jong", "Anna", "1990-03-01");
        let annelies = make_record(2, "Jong", "Annelies", "1990-07-15");

        let (k1, k2) = keys_for_pair(&anna, &annelies);
        assert!(!k1.is_empty());
        assert_eq!(k1, k2, "Same surname/initial/year should collide");
    }

    #[test]
    fn same_surname_different_initial_no_collision() {
        let anna = make_record(1, "Jong", "Anna", "1990-03-01");
        let pieter = make_record(2, "Jong", "Pieter", "1990-03-01");

        let (k1, k2) = keys_for_pair(&anna, &pieter);
        assert_ne!(k1, k2, "Different initials should not collide");
    }

    #[test]
    fn different_dob_year_no_collision() {
        let born_1970 = make_record(1, "Berg", "Anna", "1970-01-01");
        let born_1985 = make_record(2, "Berg", "Anna", "1985-01-01");

        let (k1, k2) = keys_for_pair(&born_1970, &born_1985);
        assert_ne!(k1, k2, "Different DOB years should not collide");
    }

    #[test]
    fn missing_initial_field_returns_empty() {
        let (schema, key) = fixture();

        // No "first_name" field is inserted here.
        let record = Record::new(1)
            .insert("last_name", FieldValue::Text("Berg".into()))
            .insert("dob", FieldValue::Text("1990-01-01".into()));

        let keys = key.extract(&record, &schema);
        assert!(keys.is_empty());
    }

    #[test]
    fn tussenvoegsel_stripped_from_surname() {
        let with_prefix = make_record(1, "van den Berg", "Anna", "1990-06-15");
        let without_prefix = make_record(2, "Berg", "Anna", "1990-06-15");

        let (k1, k2) = keys_for_pair(&with_prefix, &without_prefix);
        assert!(!k1.is_empty());
        assert_eq!(k1, k2, "van den Berg and Berg should collide after prefix stripping");
    }
}