use rphonetic::{DoubleMetaphone, Encoder};
use zer_core::{record::Record, schema::Schema};
use crate::normalize::{extract_surname_token, normalize_text};
use super::BlockingKey;
/// Blocking key built from a record's alias names and its date of birth.
///
/// Holds only the names of the two record fields to read; the actual key
/// extraction lives in the `BlockingKey` implementation for this type.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AliasPhoneticKey {
    /// Name of the record field containing the `|`-separated alias list.
    alias_field: String,
    /// Name of the record field containing the date of birth.
    dob_field: String,
}

impl AliasPhoneticKey {
    /// Creates a key that reads aliases from `alias_field` and the date of
    /// birth from `dob_field`.
    pub fn new(alias_field: &str, dob_field: &str) -> Self {
        Self {
            alias_field: alias_field.into(),
            dob_field: dob_field.into(),
        }
    }
}
impl BlockingKey for AliasPhoneticKey {
    /// Name reported for this blocking key.
    fn name(&self) -> &str {
        "phonetic_dob"
    }

    /// Produces one `<phonetic code>:<year>` key per distinct alias surname.
    ///
    /// The year is the first four characters of the date-of-birth field and
    /// must consist solely of ASCII digits. The alias field is split on `|`;
    /// each non-blank alias is normalized, reduced to its surname token and
    /// encoded with Double Metaphone. Aliases that yield an empty surname or
    /// an empty code are skipped.
    ///
    /// Returns an empty vector when the date of birth is missing or has no
    /// valid year prefix, or when the alias field is missing or empty.
    /// The returned keys are sorted and contain no duplicates.
    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
        let dob_value = record.field_as_str(&self.dob_field);
        let birth_year = match dob_value.as_deref().and_then(|d| d.get(..4)) {
            // `get(..4)` already guarantees exactly four bytes; only the
            // digit check is left to do.
            Some(prefix) if prefix.bytes().all(|b| b.is_ascii_digit()) => prefix,
            _ => return Vec::new(),
        };

        let alias_value = record.field_as_str(&self.alias_field);
        let alias_list = match alias_value.as_deref() {
            None | Some("") => return Vec::new(),
            Some(list) => list,
        };

        let encoder = DoubleMetaphone::default();
        let mut keys: Vec<String> = alias_list
            .split('|')
            .map(str::trim)
            .filter(|alias| !alias.is_empty())
            .filter_map(|alias| {
                let normalized = normalize_text(alias);
                let surname = extract_surname_token(&normalized);
                if surname.is_empty() {
                    return None;
                }
                let code = encoder.encode(surname);
                if code.is_empty() {
                    None
                } else {
                    Some(format!("{}:{}", code, birth_year))
                }
            })
            .collect();

        // Several aliases frequently share a surname; collapse the repeats.
        keys.sort();
        keys.dedup();
        keys
    }
}
/// Blocking key that widens the birth year of records whose date of birth
/// looks estimated.
///
/// Holds the names of the record fields to read and the number of years to
/// spread on either side; the actual key extraction lives in the
/// `BlockingKey` implementation for this type.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FuzzyYearKey {
    /// Name of the record field containing the name to encode phonetically.
    name_field: String,
    /// Name of the record field containing the date of birth.
    dob_field: String,
    /// Number of years emitted before and after the recorded birth year.
    fuzzy_range: u32,
}

impl FuzzyYearKey {
    /// Creates a key that reads the name from `name_field`, the date of birth
    /// from `dob_field`, and emits keys for every year within `fuzzy_range`
    /// years of the recorded one.
    pub fn new(name_field: &str, dob_field: &str, fuzzy_range: u32) -> Self {
        Self {
            name_field: name_field.into(),
            dob_field: dob_field.into(),
            fuzzy_range,
        }
    }
}
/// Returns `true` when `dob` has `-01-01` at byte positions 4 through 9,
/// i.e. a `YYYY-01-01` style date (optionally followed by more text).
///
/// Uses checked slicing so that input which is too short, or which has a
/// multi-byte character straddling either slice boundary, yields `false`
/// instead of panicking.
fn is_estimated_dob(dob: &str) -> bool {
    dob.get(4..10) == Some("-01-01")
}
impl BlockingKey for FuzzyYearKey {
    /// Name reported for this blocking key.
    ///
    /// NOTE(review): this is the same string `AliasPhoneticKey` reports;
    /// presumably deliberate so both emit into one key namespace — confirm.
    fn name(&self) -> &str {
        "phonetic_dob"
    }

    /// Produces `<phonetic code>:<year>` keys for every year within
    /// `fuzzy_range` of the recorded birth year, but only for records whose
    /// date of birth passes `is_estimated_dob`.
    ///
    /// The year must be four ASCII digits (the same rule `AliasPhoneticKey`
    /// applies) and is written zero-padded to four digits so that keys have
    /// the same textual shape as the ones `AliasPhoneticKey` emits. Emitted
    /// years are clamped to `0..=9999`.
    ///
    /// Returns an empty vector when the date of birth or name field is
    /// missing, the date is not an estimated one, the year prefix is invalid,
    /// or the name yields an empty surname token or phonetic code.
    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
        let dob_cow = record.field_as_str(&self.dob_field);
        let dob = match dob_cow.as_deref() {
            Some(s) => s,
            None => return Vec::new(),
        };
        if !is_estimated_dob(dob) {
            return Vec::new();
        }

        // `str::parse` on its own would also accept sign-prefixed input such
        // as "+198" or "-198", so insist on plain digits first.
        let year: i64 = match dob
            .get(..4)
            .filter(|y| y.bytes().all(|b| b.is_ascii_digit()))
            .and_then(|y| y.parse().ok())
        {
            Some(y) => y,
            None => return Vec::new(),
        };

        let surname_cow = record.field_as_str(&self.name_field);
        let surname_raw = match surname_cow.as_deref() {
            Some(s) => s,
            None => return Vec::new(),
        };
        let norm = normalize_text(surname_raw);
        let surname = extract_surname_token(&norm);
        if surname.is_empty() {
            return Vec::new();
        }
        let code = DoubleMetaphone::default().encode(surname);
        if code.is_empty() {
            return Vec::new();
        }

        // Work in i64 so that neither the u32 -> signed conversion nor the
        // year arithmetic can wrap or overflow, then clamp to four-digit
        // years so an oversized range cannot explode the key count.
        let spread = i64::from(self.fuzzy_range);
        let first = (year - spread).max(0);
        let last = (year + spread).min(9999);
        (first..=last)
            .map(|y| format!("{}:{:04}", code, y))
            .collect()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use zer_core::{
        record::FieldValue,
        schema::{FieldKind, SchemaBuilder},
    };

    /// Schema with the three fields the keys under test read.
    fn schema() -> Schema {
        SchemaBuilder::new()
            .field("achternaam", FieldKind::Name)
            .field("alias_namen", FieldKind::Alias)
            .field("dob", FieldKind::Date)
            .build()
            .unwrap()
    }

    /// Builds a record with text values for surname, alias list and DOB.
    fn rec(id: u64, achternaam: &str, aliases: &str, dob: &str) -> Record {
        Record::new(id)
            .insert("achternaam", FieldValue::Text(achternaam.into()))
            .insert("alias_namen", FieldValue::Text(aliases.into()))
            .insert("dob", FieldValue::Text(dob.into()))
    }

    #[test]
    fn alias_key_emits_phonetic_for_each_alias() {
        let schema = schema();
        let alias_key = AliasPhoneticKey::new("alias_namen", "dob");
        let record = rec(1, "Benabdallah", "Benabdallah Fatima|F. Benabdallah", "1999-06-14");

        let emitted = alias_key.extract(&record, &schema);

        assert!(!emitted.is_empty(), "should emit at least one alias key");
    }

    #[test]
    fn alias_key_empty_aliases_returns_empty() {
        let schema = schema();
        let alias_key = AliasPhoneticKey::new("alias_namen", "dob");
        let record = rec(1, "Jong", "", "1985-01-01");

        let emitted = alias_key.extract(&record, &schema);

        assert!(emitted.is_empty());
    }

    // NOTE(review): despite its name, this test only checks that both
    // records emit keys; it never checks that the two key sets overlap.
    #[test]
    fn alias_key_cross_record_collision() {
        let schema = schema();
        let alias_key = AliasPhoneticKey::new("alias_namen", "dob");
        let canonical = rec(1, "Benabdallah", "Benabdallah Fatima", "1999-06-14");
        let alias_rec = rec(2, "Fatima", "Fatima Benabdallah", "1999-06-14");

        let canonical_keys = alias_key.extract(&canonical, &schema);
        let alias_keys = alias_key.extract(&alias_rec, &schema);

        assert!(!canonical_keys.is_empty());
        assert!(!alias_keys.is_empty());
    }

    #[test]
    fn fuzzy_year_key_emits_range_for_estimated_dob() {
        let schema = schema();
        let fuzzy_key = FuzzyYearKey::new("achternaam", "dob", 1);
        let record = rec(1, "Yilmaz", "", "1985-01-01");

        let emitted = fuzzy_key.extract(&record, &schema);

        assert_eq!(emitted.len(), 3, "should emit year-1, year, year+1");
        for suffix in [":1984", ":1985", ":1986"] {
            assert!(emitted.iter().any(|k| k.ends_with(suffix)));
        }
    }

    #[test]
    fn fuzzy_year_key_emits_nothing_for_precise_dob() {
        let schema = schema();
        let fuzzy_key = FuzzyYearKey::new("achternaam", "dob", 1);
        let record = rec(1, "Yilmaz", "", "1985-03-15");

        let emitted = fuzzy_key.extract(&record, &schema);

        assert!(emitted.is_empty(), "precise DOB → no fuzzy keys");
    }

    #[test]
    fn fuzzy_year_key_pairs_cross_year_estimated_dobs() {
        let schema = schema();
        let fuzzy_key = FuzzyYearKey::new("achternaam", "dob", 1);
        let earlier = rec(1, "Yilmaz", "", "1985-01-01");
        let later = rec(2, "Yilmaz", "", "1986-01-01");

        let earlier_keys: HashSet<String> =
            fuzzy_key.extract(&earlier, &schema).into_iter().collect();
        let later_keys: HashSet<String> =
            fuzzy_key.extract(&later, &schema).into_iter().collect();

        assert!(
            !earlier_keys.is_disjoint(&later_keys),
            "neighbouring estimated years should share a fuzzy key"
        );
    }
}