Skip to main content

zer_blocking/keys/
phonetic.rs

1use rphonetic::{DoubleMetaphone, Encoder, Soundex};
2use zer_core::{record::Record, schema::Schema};
3
4use crate::normalize::{extract_surname_token, normalize_text};
5use super::BlockingKey;
6
/// Phonetic encoding algorithm used to turn a surname into a blocking code.
///
/// The type is a plain, fieldless selector, so it is cheap to copy and can be
/// compared, hashed, and used as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PhoneticAlgo {
    /// Encode with the Double Metaphone algorithm.
    DoubleMetaphone,
    /// Encode with the Soundex algorithm.
    Soundex,
}
13
14/// Blocking key that encodes the surname phonetically combined with the birth year.
15pub struct PhoneticNameDobKey {
16    algo:       PhoneticAlgo,
17    name_field: String,
18    dob_field:  String,
19}
20
21impl PhoneticNameDobKey {
22    pub fn new(name_field: &str, dob_field: &str) -> Self {
23        Self {
24            algo:       PhoneticAlgo::DoubleMetaphone,
25            name_field: name_field.into(),
26            dob_field:  dob_field.into(),
27        }
28    }
29
30    pub fn with_algo(mut self, algo: PhoneticAlgo) -> Self {
31        self.algo = algo;
32        self
33    }
34
35    fn encode(&self, s: &str) -> String {
36        match self.algo {
37            PhoneticAlgo::DoubleMetaphone => DoubleMetaphone::default().encode(s),
38            PhoneticAlgo::Soundex        => Soundex::default().encode(s),
39        }
40    }
41}
42
43impl BlockingKey for PhoneticNameDobKey {
44    fn name(&self) -> &str {
45        "phonetic_dob"
46    }
47
48    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
49        let surname_cow = record.field_as_str(&self.name_field);
50        let surname_raw = match surname_cow.as_deref() {
51            Some(s) => s,
52            None    => return vec![],
53        };
54        let dob_cow = record.field_as_str(&self.dob_field);
55        let dob_raw = match dob_cow.as_deref() {
56            Some(s) => s,
57            None    => return vec![],
58        };
59
60        let normalized = normalize_text(surname_raw);
61        let surname    = extract_surname_token(&normalized);
62        if surname.is_empty() {
63            return vec![];
64        }
65
66        let code = self.encode(surname);
67        if code.is_empty() {
68            return vec![];
69        }
70
71        let year = dob_raw.trim().get(..4).unwrap_or("").to_string();
72        if year.len() < 4 || !year.chars().all(|c| c.is_ascii_digit()) {
73            return vec![];
74        }
75
76        vec![format!("{}:{}", code, year)]
77    }
78}
79
#[cfg(test)]
mod tests {
    use super::*;
    use zer_core::{record::FieldValue, schema::FieldKind, schema::SchemaBuilder};

    /// Schema with a surname field and a date-of-birth field, shared by all tests.
    fn test_schema() -> Schema {
        let builder = SchemaBuilder::new()
            .field("last_name", FieldKind::Name)
            .field("dob", FieldKind::Date);
        builder.build().unwrap()
    }

    /// Wraps a string slice as a text field value.
    fn text(value: &str) -> FieldValue {
        FieldValue::Text(value.into())
    }

    /// Record carrying both the surname and the date of birth as text.
    fn person(id: u64, last_name: &str, dob: &str) -> Record {
        Record::new(id)
            .insert("last_name", text(last_name))
            .insert("dob", text(dob))
    }

    /// Extracts the keys of `record` with a default-configured key over the test schema.
    fn keys_of(record: &Record) -> Vec<String> {
        let schema = test_schema();
        PhoneticNameDobKey::new("last_name", "dob").extract(record, &schema)
    }

    #[test]
    fn phonetic_key_same_surname_variants_collide() {
        let birth = "1985-03-01";
        let smith = keys_of(&person(1, "Smith", birth));
        let smyth = keys_of(&person(2, "Smyth", birth));
        let smythe = keys_of(&person(3, "Smythe", birth));

        assert!(!smith.is_empty(), "SMITH should produce a key");
        assert_eq!(smith, smyth, "SMITH and SMYTH should share a phonetic key");
        assert_eq!(smith, smythe, "SMITH and SMYTHE should share a phonetic key");
    }

    #[test]
    fn phonetic_key_different_dob_year_no_collision() {
        let born_1970 = keys_of(&person(1, "Berg", "1970-01-01"));
        let born_1985 = keys_of(&person(2, "Berg", "1985-01-01"));

        assert_ne!(born_1970, born_1985, "Different DOB years should produce different keys");
    }

    #[test]
    fn phonetic_key_missing_field_returns_empty() {
        // Only the surname is present; the date-of-birth field is never inserted.
        let without_dob = Record::new(1).insert("last_name", text("Berg"));

        assert!(keys_of(&without_dob).is_empty());
    }

    #[test]
    fn phonetic_key_tussenvoegsel_stripped() {
        let birth = "1990-06-15";
        let prefixed = keys_of(&person(1, "van den Berg", birth));
        let plain = keys_of(&person(2, "Berg", birth));

        assert!(!prefixed.is_empty());
        assert_eq!(prefixed, plain, "van den Berg and Berg should collide after prefix stripping");
    }
}
148        let k2 = key.extract(&r2, &schema);
149        assert!(!k1.is_empty());
150        assert_eq!(k1, k2, "van den Berg and Berg should collide after prefix stripping");
151    }
152}