1use rphonetic::{DoubleMetaphone, Encoder};
2use zer_core::{record::Record, schema::Schema};
3
4use super::BlockingKey;
5use crate::normalize::{extract_surname_token, normalize_text};
6
/// Blocking key built from a record's aliases and the year of its date of birth.
///
/// The key only stores which record fields to read; all work happens in its
/// `BlockingKey` implementation.
pub struct AliasPhoneticKey {
    /// Name of the record field holding the `|`-separated list of aliases.
    alias_field: String,
    /// Name of the record field holding the date of birth; its first four
    /// characters are used as the year.
    dob_field: String,
}

impl AliasPhoneticKey {
    /// Creates a key that reads aliases from `alias_field` and the date of
    /// birth from `dob_field`.
    ///
    /// Both names are copied into owned strings.
    pub fn new(alias_field: &str, dob_field: &str) -> Self {
        let alias_field = String::from(alias_field);
        let dob_field = String::from(dob_field);
        Self {
            alias_field,
            dob_field,
        }
    }
}
28
29impl BlockingKey for AliasPhoneticKey {
30 fn name(&self) -> &str {
31 "phonetic_dob"
34 }
35
36 fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
37 let dob_cow = record.field_as_str(&self.dob_field);
38 let dob = match dob_cow.as_deref() {
39 Some(s) => s,
40 None => return vec![],
41 };
42 let year = match dob.get(..4) {
43 Some(y) if y.len() == 4 && y.chars().all(|c| c.is_ascii_digit()) => y,
44 _ => return vec![],
45 };
46
47 let aliases_cow = record.field_as_str(&self.alias_field);
48 let aliases_raw = match aliases_cow.as_deref() {
49 Some(s) if !s.is_empty() => s,
50 _ => return vec![],
51 };
52
53 let dm = DoubleMetaphone::default();
54 let mut keys: Vec<String> = vec![];
55
56 for alias in aliases_raw.split('|') {
57 let alias = alias.trim();
58 if alias.is_empty() {
59 continue;
60 }
61 let norm = normalize_text(alias);
62 let surname = extract_surname_token(&norm);
63 if surname.is_empty() {
64 continue;
65 }
66 let code = dm.encode(surname);
67 if code.is_empty() {
68 continue;
69 }
70 keys.push(format!("{}:{}", code, year));
71 }
72
73 keys.sort();
74 keys.dedup();
75 keys
76 }
77}
78
/// Blocking key that spreads a record over a window of neighbouring birth years.
///
/// The key only stores its configuration; all work happens in its
/// `BlockingKey` implementation.
pub struct FuzzyYearKey {
    /// Name of the record field the surname token is extracted from.
    name_field: String,
    /// Name of the record field holding the date of birth.
    dob_field: String,
    /// Number of years on either side of the parsed year to emit keys for.
    fuzzy_range: u32,
}

impl FuzzyYearKey {
    /// Creates a key that reads the name from `name_field`, the date of birth
    /// from `dob_field`, and emits keys for `fuzzy_range` years on either side
    /// of the parsed year.
    ///
    /// Both field names are copied into owned strings.
    pub fn new(name_field: &str, dob_field: &str, fuzzy_range: u32) -> Self {
        let name_field = String::from(name_field);
        let dob_field = String::from(dob_field);
        Self {
            name_field,
            dob_field,
            fuzzy_range,
        }
    }
}
100
/// Reports whether `dob` has `-01-01` in byte positions 4 through 9, i.e.
/// looks like `YYYY-01-01` (anything after the tenth byte is ignored).
///
/// Such dates are treated as estimated by the caller — presumably because
/// 1 January is used as a placeholder when only the year is known (confirm
/// against the data source).
///
/// Never panics: values shorter than ten bytes, or values where byte 4 or 10
/// falls inside a multi-byte UTF-8 character, simply return `false`.
fn is_estimated_dob(dob: &str) -> bool {
    // `str::get` returns `None` for out-of-range or non-boundary indices,
    // whereas `&dob[4..10]` would panic on non-ASCII input.
    dob.get(4..10) == Some("-01-01")
}
104
105impl BlockingKey for FuzzyYearKey {
106 fn name(&self) -> &str {
107 "phonetic_dob"
108 }
109
110 fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
111 let dob_cow = record.field_as_str(&self.dob_field);
112 let dob = match dob_cow.as_deref() {
113 Some(s) => s,
114 None => return vec![],
115 };
116
117 if !is_estimated_dob(dob) {
118 return vec![];
119 }
120
121 let year: i32 = match dob.get(..4).and_then(|y| y.parse().ok()) {
122 Some(y) => y,
123 None => return vec![],
124 };
125
126 let surname_cow = record.field_as_str(&self.name_field);
127 let surname_raw = match surname_cow.as_deref() {
128 Some(s) => s,
129 None => return vec![],
130 };
131
132 let norm = normalize_text(surname_raw);
133 let surname = extract_surname_token(&norm);
134 if surname.is_empty() {
135 return vec![];
136 }
137
138 let code = DoubleMetaphone::default().encode(surname);
139 if code.is_empty() {
140 return vec![];
141 }
142
143 let r = self.fuzzy_range as i32;
144 ((-r)..=r)
145 .map(|d| format!("{}:{}", code, year + d))
146 .collect()
147 }
148}
149
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::*;
    use zer_core::{
        record::FieldValue,
        schema::{FieldKind, SchemaBuilder},
    };

    // Field names shared by the schema, the records and the keys under test.
    const NAME_FIELD: &str = "achternaam";
    const ALIAS_FIELD: &str = "alias_namen";
    const DOB_FIELD: &str = "dob";

    /// Schema with a name, an alias and a date field.
    fn schema() -> Schema {
        let builder = SchemaBuilder::new()
            .field(NAME_FIELD, FieldKind::Name)
            .field(ALIAS_FIELD, FieldKind::Alias)
            .field(DOB_FIELD, FieldKind::Date);
        builder.build().unwrap()
    }

    /// Record with all three fields stored as text.
    fn rec(id: u64, achternaam: &str, aliases: &str, dob: &str) -> Record {
        let text = |value: &str| FieldValue::Text(value.into());
        Record::new(id)
            .insert(NAME_FIELD, text(achternaam))
            .insert(ALIAS_FIELD, text(aliases))
            .insert(DOB_FIELD, text(dob))
    }

    fn alias_key() -> AliasPhoneticKey {
        AliasPhoneticKey::new(ALIAS_FIELD, DOB_FIELD)
    }

    fn fuzzy_key() -> FuzzyYearKey {
        FuzzyYearKey::new(NAME_FIELD, DOB_FIELD, 1)
    }

    #[test]
    fn alias_key_emits_phonetic_for_each_alias() {
        let record = rec(
            1,
            "Benabdallah",
            "Benabdallah Fatima|F. Benabdallah",
            "1999-06-14",
        );

        let keys = alias_key().extract(&record, &schema());

        assert!(!keys.is_empty(), "should emit at least one alias key");
    }

    #[test]
    fn alias_key_empty_aliases_returns_empty() {
        let record = rec(1, "Jong", "", "1985-01-01");

        let keys = alias_key().extract(&record, &schema());

        assert!(keys.is_empty());
    }

    #[test]
    fn alias_key_cross_record_collision() {
        let schema = schema();
        let key = alias_key();
        let canonical = rec(1, "Benabdallah", "Benabdallah Fatima", "1999-06-14");
        let alias_rec = rec(2, "Fatima", "Fatima Benabdallah", "1999-06-14");

        let canonical_keys = key.extract(&canonical, &schema);
        let alias_keys = key.extract(&alias_rec, &schema);

        assert!(!canonical_keys.is_empty());
        assert!(!alias_keys.is_empty());
    }

    #[test]
    fn fuzzy_year_key_emits_range_for_estimated_dob() {
        let record = rec(1, "Yilmaz", "", "1985-01-01");

        let keys = fuzzy_key().extract(&record, &schema());

        assert_eq!(keys.len(), 3, "should emit year-1, year, year+1");
        for suffix in &[":1984", ":1985", ":1986"] {
            assert!(keys.iter().any(|k| k.ends_with(*suffix)));
        }
    }

    #[test]
    fn fuzzy_year_key_emits_nothing_for_precise_dob() {
        let record = rec(1, "Yilmaz", "", "1985-03-15");

        let keys = fuzzy_key().extract(&record, &schema());

        assert!(keys.is_empty(), "precise DOB → no fuzzy keys");
    }

    #[test]
    fn fuzzy_year_key_pairs_cross_year_estimated_dobs() {
        let schema = schema();
        let key = fuzzy_key();
        let earlier = rec(1, "Yilmaz", "", "1985-01-01");
        let later = rec(2, "Yilmaz", "", "1986-01-01");

        let earlier_keys: HashSet<String> = key.extract(&earlier, &schema).into_iter().collect();
        let later_keys: HashSet<String> = key.extract(&later, &schema).into_iter().collect();

        assert!(
            !earlier_keys.is_disjoint(&later_keys),
            "neighbouring estimated years should share a fuzzy key"
        );
    }
}