zer_blocking/keys/
phonetic.rs1use rphonetic::{DoubleMetaphone, Encoder, Soundex};
2use zer_core::{record::Record, schema::Schema};
3
4use crate::normalize::{extract_surname_token, normalize_text};
5use super::BlockingKey;
6
/// Selects which phonetic encoder is used to turn a surname into a code.
///
/// The type is a plain fieldless enum, so it is cheap to copy and can be
/// compared or used as a hash-map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PhoneticAlgo {
    /// Encode with the Double Metaphone encoder.
    DoubleMetaphone,
    /// Encode with the Soundex encoder.
    Soundex,
}
13
14pub struct PhoneticNameDobKey {
16 algo: PhoneticAlgo,
17 name_field: String,
18 dob_field: String,
19}
20
21impl PhoneticNameDobKey {
22 pub fn new(name_field: &str, dob_field: &str) -> Self {
23 Self {
24 algo: PhoneticAlgo::DoubleMetaphone,
25 name_field: name_field.into(),
26 dob_field: dob_field.into(),
27 }
28 }
29
30 pub fn with_algo(mut self, algo: PhoneticAlgo) -> Self {
31 self.algo = algo;
32 self
33 }
34
35 fn encode(&self, s: &str) -> String {
36 match self.algo {
37 PhoneticAlgo::DoubleMetaphone => DoubleMetaphone::default().encode(s),
38 PhoneticAlgo::Soundex => Soundex::default().encode(s),
39 }
40 }
41}
42
43impl BlockingKey for PhoneticNameDobKey {
44 fn name(&self) -> &str {
45 "phonetic_dob"
46 }
47
48 fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
49 let surname_cow = record.field_as_str(&self.name_field);
50 let surname_raw = match surname_cow.as_deref() {
51 Some(s) => s,
52 None => return vec![],
53 };
54 let dob_cow = record.field_as_str(&self.dob_field);
55 let dob_raw = match dob_cow.as_deref() {
56 Some(s) => s,
57 None => return vec![],
58 };
59
60 let normalized = normalize_text(surname_raw);
61 let surname = extract_surname_token(&normalized);
62 if surname.is_empty() {
63 return vec![];
64 }
65
66 let code = self.encode(surname);
67 if code.is_empty() {
68 return vec![];
69 }
70
71 let year = dob_raw.trim().get(..4).unwrap_or("").to_string();
72 if year.len() < 4 || !year.chars().all(|c| c.is_ascii_digit()) {
73 return vec![];
74 }
75
76 vec![format!("{}:{}", code, year)]
77 }
78}
79
#[cfg(test)]
mod tests {
    use super::*;
    use zer_core::{
        record::FieldValue,
        schema::{FieldKind, SchemaBuilder},
    };

    /// Schema with a `last_name` name field and a `dob` date field.
    fn make_schema() -> Schema {
        SchemaBuilder::new()
            .field("last_name", FieldKind::Name)
            .field("dob", FieldKind::Date)
            .build()
            .unwrap()
    }

    /// Record carrying both fields as text values.
    fn make_record(id: u64, last_name: &str, dob: &str) -> Record {
        let surname = FieldValue::Text(last_name.into());
        let birth = FieldValue::Text(dob.into());
        Record::new(id)
            .insert("last_name", surname)
            .insert("dob", birth)
    }

    #[test]
    fn phonetic_key_same_surname_variants_collide() {
        let schema = make_schema();
        let blocker = PhoneticNameDobKey::new("last_name", "dob");
        let keys_for = |id: u64, surname: &str| {
            blocker.extract(&make_record(id, surname, "1985-03-01"), &schema)
        };

        let smith = keys_for(1, "Smith");
        let smyth = keys_for(2, "Smyth");
        let smythe = keys_for(3, "Smythe");

        assert!(!smith.is_empty(), "SMITH should produce a key");
        assert_eq!(smith, smyth, "SMITH and SMYTH should share a phonetic key");
        assert_eq!(smith, smythe, "SMITH and SMYTHE should share a phonetic key");
    }

    #[test]
    fn phonetic_key_different_dob_year_no_collision() {
        let schema = make_schema();
        let blocker = PhoneticNameDobKey::new("last_name", "dob");
        let keys_for =
            |id: u64, dob: &str| blocker.extract(&make_record(id, "Berg", dob), &schema);

        let born_1970 = keys_for(1, "1970-01-01");
        let born_1985 = keys_for(2, "1985-01-01");

        assert_ne!(born_1970, born_1985, "Different DOB years should produce different keys");
    }

    #[test]
    fn phonetic_key_missing_field_returns_empty() {
        let schema = make_schema();
        let blocker = PhoneticNameDobKey::new("last_name", "dob");

        // Only the surname is present; the date of birth is deliberately absent.
        let without_dob = Record::new(1).insert("last_name", FieldValue::Text("Berg".into()));
        let keys = blocker.extract(&without_dob, &schema);

        assert!(keys.is_empty());
    }

    #[test]
    fn phonetic_key_tussenvoegsel_stripped() {
        let schema = make_schema();
        let blocker = PhoneticNameDobKey::new("last_name", "dob");
        let keys_for = |id: u64, surname: &str| {
            blocker.extract(&make_record(id, surname, "1990-06-15"), &schema)
        };

        let prefixed = keys_for(1, "van den Berg");
        let bare = keys_for(2, "Berg");

        assert!(!prefixed.is_empty());
        assert_eq!(prefixed, bare, "van den Berg and Berg should collide after prefix stripping");
    }
}