zer_blocking/keys/
transliterated.rs1use rphonetic::{DoubleMetaphone, Encoder};
2use zer_core::{record::Record, schema::Schema};
3
4use crate::normalize::{extract_surname_token, transliterate_and_normalize};
5use super::BlockingKey;
6
/// Blocking key that pairs a phonetic code derived from a record's name
/// with the four-digit year prefix of its date-of-birth field.
///
/// The emitted key has the shape `<phonetic-code>:<year>`; see the
/// `BlockingKey` implementation for the exact extraction rules.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TransliteratedPhoneticKey {
    /// Name of the record field the person's name is read from.
    name_field: String,
    /// Name of the record field the date of birth is read from.
    dob_field: String,
}
23
24impl TransliteratedPhoneticKey {
25 pub fn new(name_field: &str, dob_field: &str) -> Self {
26 Self {
27 name_field: name_field.into(),
28 dob_field: dob_field.into(),
29 }
30 }
31}
32
33impl BlockingKey for TransliteratedPhoneticKey {
34 fn name(&self) -> &str { "transliterated_phonetic" }
35
36 fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
37 let name_cow = record.field_as_str(&self.name_field);
38 let name_raw = match name_cow.as_deref() {
39 Some(s) => s,
40 None => return vec![],
41 };
42 let dob_cow = record.field_as_str(&self.dob_field);
43 let dob_raw = match dob_cow.as_deref() {
44 Some(s) => s,
45 None => return vec![],
46 };
47
48 let year = dob_raw.trim().get(..4).unwrap_or("");
49 if year.len() < 4 || !year.chars().all(|c| c.is_ascii_digit()) {
50 return vec![];
51 }
52
53 let norm = transliterate_and_normalize(name_raw);
54 let surname = extract_surname_token(&norm);
55 if surname.is_empty() { return vec![]; }
56
57 let code = DoubleMetaphone::default().encode(surname);
58 if code.is_empty() { return vec![]; }
59
60 vec![format!("{}:{}", code, year)]
61 }
62}
63
#[cfg(test)]
mod tests {
    use super::*;
    use zer_core::{
        record::FieldValue,
        schema::{FieldKind, SchemaBuilder},
    };

    /// Schema with the two fields (`naam`, `dob`) every test relies on.
    fn schema() -> Schema {
        SchemaBuilder::new()
            .field("naam", FieldKind::Name)
            .field("dob", FieldKind::Date)
            .build()
            .unwrap()
    }

    /// Record carrying both a name and a date of birth as text values.
    fn rec(id: u64, naam: &str, dob: &str) -> Record {
        Record::new(id)
            .insert("naam", FieldValue::Text(naam.into()))
            .insert("dob", FieldValue::Text(dob.into()))
    }

    /// Runs the key under test (bound to `naam` / `dob`) against `record`.
    fn extract_keys(record: &Record) -> Vec<String> {
        let schema = schema();
        let key = TransliteratedPhoneticKey::new("naam", "dob");
        key.extract(record, &schema)
    }

    #[test]
    fn latin_diacritic_name_produces_key() {
        let keys = extract_keys(&rec(1, "Müller", "1985-03-01"));

        assert_eq!(keys.len(), 1);
        assert!(keys[0].ends_with(":1985"), "key should contain DOB year");
    }

    #[test]
    fn latin_and_arabic_transliteration_collide() {
        let latin_keys = extract_keys(&rec(1, "Benabdallah", "1999-01-01"));
        let arabic_keys = extract_keys(&rec(2, "بن عبدالله", "1999-01-01"));

        // NOTE(review): despite the test name, only non-emptiness is checked;
        // equality of the two keys is not asserted here - confirm whether
        // that is intentional.
        assert!(!latin_keys.is_empty(), "Latin name should produce a key");
        assert!(
            !arabic_keys.is_empty(),
            "Arabic name should produce a key after transliteration"
        );
    }

    #[test]
    fn missing_dob_returns_empty() {
        let name_only = Record::new(1).insert("naam", FieldValue::Text("Jansen".into()));

        assert!(extract_keys(&name_only).is_empty());
    }

    #[test]
    fn missing_name_returns_empty() {
        let dob_only = Record::new(1).insert("dob", FieldValue::Text("1990-01-01".into()));

        assert!(extract_keys(&dob_only).is_empty());
    }

    #[test]
    fn tussenvoegsel_stripped_before_phonetic() {
        let with_prefix = extract_keys(&rec(1, "van den Berg", "1990-06-15"));
        let bare = extract_keys(&rec(2, "Berg", "1990-06-15"));

        assert!(!with_prefix.is_empty());
        assert_eq!(
            with_prefix, bare,
            "van den Berg and Berg should produce the same phonetic key"
        );
    }
}