1use rphonetic::{DoubleMetaphone, Encoder};
2use zer_core::{record::Record, schema::Schema};
3
4use crate::normalize::{extract_surname_token, normalize_text};
5use super::BlockingKey;
6
/// Blocking key built from a record's aliases and its birth year.
///
/// The `BlockingKey` implementation for this type emits one
/// `<phonetic code>:<year>` string per alias found in the `|`-separated
/// alias field, so records whose aliases sound alike and share a birth year
/// produce a common key.
#[derive(Debug, Clone)]
pub struct AliasPhoneticKey {
    /// Name of the record field holding the `|`-separated alias list.
    alias_field: String,
    /// Name of the record field holding the date of birth; its first four
    /// characters are read as the year.
    dob_field: String,
}

impl AliasPhoneticKey {
    /// Creates a key that reads aliases from `alias_field` and the date of
    /// birth from `dob_field`.
    ///
    /// Both arguments are field *names*; they are copied into the key.
    pub fn new(alias_field: &str, dob_field: &str) -> Self {
        Self {
            alias_field: alias_field.to_owned(),
            dob_field: dob_field.to_owned(),
        }
    }
}
28
29impl BlockingKey for AliasPhoneticKey {
30 fn name(&self) -> &str {
31 "phonetic_dob"
34 }
35
36 fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
37 let dob_cow = record.field_as_str(&self.dob_field);
38 let dob = match dob_cow.as_deref() {
39 Some(s) => s,
40 None => return vec![],
41 };
42 let year = match dob.get(..4) {
43 Some(y) if y.len() == 4 && y.chars().all(|c| c.is_ascii_digit()) => y,
44 _ => return vec![],
45 };
46
47 let aliases_cow = record.field_as_str(&self.alias_field);
48 let aliases_raw = match aliases_cow.as_deref() {
49 Some(s) if !s.is_empty() => s,
50 _ => return vec![],
51 };
52
53 let dm = DoubleMetaphone::default();
54 let mut keys: Vec<String> = vec![];
55
56 for alias in aliases_raw.split('|') {
57 let alias = alias.trim();
58 if alias.is_empty() { continue; }
59 let norm = normalize_text(alias);
60 let surname = extract_surname_token(&norm);
61 if surname.is_empty() { continue; }
62 let code = dm.encode(surname);
63 if code.is_empty() { continue; }
64 keys.push(format!("{}:{}", code, year));
65 }
66
67 keys.sort();
68 keys.dedup();
69 keys
70 }
71}
72
/// Blocking key that widens the birth year of estimated dates of birth.
///
/// The `BlockingKey` implementation for this type only fires for DOBs of the
/// form `YYYY-01-01` and then emits `<phonetic code>:<year>` for every year
/// within `fuzzy_range` of the recorded one.
#[derive(Debug, Clone)]
pub struct FuzzyYearKey {
    /// Name of the record field the surname token is extracted from.
    name_field: String,
    /// Name of the record field holding the date of birth.
    dob_field: String,
    /// Number of years on each side of the recorded year to emit keys for.
    fuzzy_range: u32,
}

impl FuzzyYearKey {
    /// Creates a key that reads the name from `name_field`, the date of birth
    /// from `dob_field`, and spreads keys over `year ± fuzzy_range`.
    ///
    /// The two string arguments are field *names*; they are copied into the key.
    pub fn new(name_field: &str, dob_field: &str, fuzzy_range: u32) -> Self {
        Self {
            name_field: name_field.to_owned(),
            dob_field: dob_field.to_owned(),
            fuzzy_range,
        }
    }
}
94
/// Reports whether `dob` looks like an estimated date of birth, i.e. whether
/// bytes 4..10 are exactly `-01-01` (a `YYYY-01-01` style value).
///
/// Returns `false` for strings shorter than ten bytes. Anything after the
/// tenth byte is ignored, and the first four bytes are not inspected.
fn is_estimated_dob(dob: &str) -> bool {
    // `str::get` yields `None` when the range is out of bounds or does not
    // fall on UTF-8 character boundaries, so malformed or non-ASCII input is
    // rejected instead of panicking the way direct slicing would.
    dob.get(4..10) == Some("-01-01")
}
98
99impl BlockingKey for FuzzyYearKey {
100 fn name(&self) -> &str {
101 "phonetic_dob"
102 }
103
104 fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
105 let dob_cow = record.field_as_str(&self.dob_field);
106 let dob = match dob_cow.as_deref() {
107 Some(s) => s,
108 None => return vec![],
109 };
110
111 if !is_estimated_dob(dob) {
112 return vec![];
113 }
114
115 let year: i32 = match dob.get(..4).and_then(|y| y.parse().ok()) {
116 Some(y) => y,
117 None => return vec![],
118 };
119
120 let surname_cow = record.field_as_str(&self.name_field);
121 let surname_raw = match surname_cow.as_deref() {
122 Some(s) => s,
123 None => return vec![],
124 };
125
126 let norm = normalize_text(surname_raw);
127 let surname = extract_surname_token(&norm);
128 if surname.is_empty() { return vec![]; }
129
130 let code = DoubleMetaphone::default().encode(surname);
131 if code.is_empty() { return vec![]; }
132
133 let r = self.fuzzy_range as i32;
134 ((-r)..=r)
135 .map(|d| format!("{}:{}", code, year + d))
136 .collect()
137 }
138}
139
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use zer_core::{
        record::FieldValue,
        schema::{FieldKind, SchemaBuilder},
    };

    /// Schema with the three fields the blocking keys under test read.
    fn test_schema() -> Schema {
        SchemaBuilder::new()
            .field("achternaam", FieldKind::Name)
            .field("alias_namen", FieldKind::Alias)
            .field("dob", FieldKind::Date)
            .build()
            .unwrap()
    }

    /// Builds a record with a surname, a `|`-separated alias list and a DOB.
    fn make_record(id: u64, achternaam: &str, aliases: &str, dob: &str) -> Record {
        Record::new(id)
            .insert("achternaam", FieldValue::Text(achternaam.into()))
            .insert("alias_namen", FieldValue::Text(aliases.into()))
            .insert("dob", FieldValue::Text(dob.into()))
    }

    /// Alias key wired to the test schema's alias and DOB fields.
    fn alias_key() -> AliasPhoneticKey {
        AliasPhoneticKey::new("alias_namen", "dob")
    }

    /// Fuzzy-year key with a one-year window on the test schema's fields.
    fn fuzzy_key() -> FuzzyYearKey {
        FuzzyYearKey::new("achternaam", "dob", 1)
    }

    #[test]
    fn alias_key_emits_phonetic_for_each_alias() {
        let schema = test_schema();
        let record = make_record(
            1,
            "Benabdallah",
            "Benabdallah Fatima|F. Benabdallah",
            "1999-06-14",
        );

        let keys = alias_key().extract(&record, &schema);
        assert!(!keys.is_empty(), "should emit at least one alias key");
    }

    #[test]
    fn alias_key_empty_aliases_returns_empty() {
        let schema = test_schema();
        let record = make_record(1, "Jong", "", "1985-01-01");

        let keys = alias_key().extract(&record, &schema);
        assert!(keys.is_empty());
    }

    #[test]
    fn alias_key_cross_record_collision() {
        let schema = test_schema();
        let key = alias_key();

        let canonical = make_record(1, "Benabdallah", "Benabdallah Fatima", "1999-06-14");
        let alias_rec = make_record(2, "Fatima", "Fatima Benabdallah", "1999-06-14");

        let canonical_keys = key.extract(&canonical, &schema);
        let alias_keys = key.extract(&alias_rec, &schema);

        // Only non-emptiness is checked; whether the two key sets overlap
        // depends on how the surname token is chosen.
        assert!(!canonical_keys.is_empty());
        assert!(!alias_keys.is_empty());
    }

    #[test]
    fn fuzzy_year_key_emits_range_for_estimated_dob() {
        let schema = test_schema();
        let record = make_record(1, "Yilmaz", "", "1985-01-01");

        let keys = fuzzy_key().extract(&record, &schema);
        assert_eq!(keys.len(), 3, "should emit year-1, year, year+1");
        for suffix in [":1984", ":1985", ":1986"] {
            assert!(keys.iter().any(|k| k.ends_with(suffix)));
        }
    }

    #[test]
    fn fuzzy_year_key_emits_nothing_for_precise_dob() {
        let schema = test_schema();
        let record = make_record(1, "Yilmaz", "", "1985-03-15");

        let keys = fuzzy_key().extract(&record, &schema);
        assert!(keys.is_empty(), "precise DOB → no fuzzy keys");
    }

    #[test]
    fn fuzzy_year_key_pairs_cross_year_estimated_dobs() {
        let schema = test_schema();
        let key = fuzzy_key();

        let earlier = make_record(1, "Yilmaz", "", "1985-01-01");
        let later = make_record(2, "Yilmaz", "", "1986-01-01");

        let earlier_keys: HashSet<String> = key.extract(&earlier, &schema).into_iter().collect();
        let later_keys: HashSet<String> = key.extract(&later, &schema).into_iter().collect();

        assert!(
            !earlier_keys.is_disjoint(&later_keys),
            "neighbouring estimated years should share a fuzzy key"
        );
    }
}