Skip to main content

zer_blocking/keys/
transliterated.rs

1use rphonetic::{DoubleMetaphone, Encoder};
2use zer_core::{record::Record, schema::Schema};
3
4use crate::normalize::{extract_surname_token, transliterate_and_normalize};
5use super::BlockingKey;
6
7// ── TransliteratedPhoneticKey ─────────────────────────────────────────────────
8
/// Phonetic blocking key that first transliterates non-Latin script (Arabic,
/// Cyrillic, Greek, etc.) to ASCII via `any_ascii`, then applies NFKD
/// diacritic stripping and DoubleMetaphone encoding, combined with the DOB
/// year.
///
/// Key format: `"PHONETIC_CODE:YEAR"`
///
/// Use alongside `PhoneticNameDobKey` (which only handles already-Latin
/// input) when your dataset may contain non-Latin name entries, e.g. persons
/// registered in Arabic script by one Schengen state and in Latin by another.
///
/// The type carries only the two field names it was configured with, so it
/// derives the common value traits: it can be logged with `{:?}`, cloned, and
/// compared for equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TransliteratedPhoneticKey {
    /// Name of the record field the person's name is read from.
    name_field: String,
    /// Name of the record field the date of birth is read from; the first
    /// four characters of its trimmed value are used as the year.
    dob_field: String,
}
23
24impl TransliteratedPhoneticKey {
25    pub fn new(name_field: &str, dob_field: &str) -> Self {
26        Self {
27            name_field: name_field.into(),
28            dob_field:  dob_field.into(),
29        }
30    }
31}
32
33impl BlockingKey for TransliteratedPhoneticKey {
34    fn name(&self) -> &str { "transliterated_phonetic" }
35
36    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
37        let name_cow = record.field_as_str(&self.name_field);
38        let name_raw = match name_cow.as_deref() {
39            Some(s) => s,
40            None    => return vec![],
41        };
42        let dob_cow = record.field_as_str(&self.dob_field);
43        let dob_raw = match dob_cow.as_deref() {
44            Some(s) => s,
45            None    => return vec![],
46        };
47
48        let year = dob_raw.trim().get(..4).unwrap_or("");
49        if year.len() < 4 || !year.chars().all(|c| c.is_ascii_digit()) {
50            return vec![];
51        }
52
53        let norm    = transliterate_and_normalize(name_raw);
54        let surname = extract_surname_token(&norm);
55        if surname.is_empty() { return vec![]; }
56
57        let code = DoubleMetaphone::default().encode(surname);
58        if code.is_empty() { return vec![]; }
59
60        vec![format!("{}:{}", code, year)]
61    }
62}
63
64// ── Tests ─────────────────────────────────────────────────────────────────────
65
#[cfg(test)]
mod tests {
    use super::*;
    use zer_core::{record::FieldValue, schema::{FieldKind, SchemaBuilder}};

    /// Two-field schema (`naam`, `dob`) shared by every test in this module.
    fn schema() -> Schema {
        let builder = SchemaBuilder::new()
            .field("naam", FieldKind::Name)
            .field("dob", FieldKind::Date);
        builder.build().unwrap()
    }

    /// Record carrying both a `naam` and a `dob` text value.
    fn rec(id: u64, naam: &str, dob: &str) -> Record {
        let with_name = Record::new(id).insert("naam", FieldValue::Text(naam.into()));
        with_name.insert("dob", FieldValue::Text(dob.into()))
    }

    /// Runs a freshly built `TransliteratedPhoneticKey` over `record`.
    fn extract_keys(record: &Record) -> Vec<String> {
        let schema = schema();
        TransliteratedPhoneticKey::new("naam", "dob").extract(record, &schema)
    }

    #[test]
    fn latin_diacritic_name_produces_key() {
        // Expected pipeline per the key's docs: "Müller" is transliterated and
        // normalised, then phonetically encoded and suffixed with the year.
        let keys = extract_keys(&rec(1, "Müller", "1985-03-01"));
        assert_eq!(keys.len(), 1);
        assert!(keys[0].ends_with(":1985"), "key should contain DOB year");
    }

    #[test]
    fn latin_and_arabic_transliteration_collide() {
        // Smoke test only: the Latin and the Arabic spelling must each yield
        // a key for the same DOB. Whether the two keys are equal depends on
        // the transliteration output, so equality is deliberately not asserted.
        let from_latin = extract_keys(&rec(1, "Benabdallah", "1999-01-01"));
        let from_arabic = extract_keys(&rec(2, "بن عبدالله", "1999-01-01"));

        assert!(!from_latin.is_empty(), "Latin name should produce a key");
        assert!(!from_arabic.is_empty(), "Arabic name should produce a key after transliteration");
    }

    #[test]
    fn missing_dob_returns_empty() {
        let name_only = Record::new(1).insert("naam", FieldValue::Text("Jansen".into()));
        assert!(extract_keys(&name_only).is_empty());
    }

    #[test]
    fn missing_name_returns_empty() {
        let dob_only = Record::new(1).insert("dob", FieldValue::Text("1990-01-01".into()));
        assert!(extract_keys(&dob_only).is_empty());
    }

    #[test]
    fn tussenvoegsel_stripped_before_phonetic() {
        // A Dutch surname prefix ("van den") must not influence the key.
        let prefixed = extract_keys(&rec(1, "van den Berg", "1990-06-15"));
        let bare = extract_keys(&rec(2, "Berg", "1990-06-15"));

        assert!(!prefixed.is_empty());
        assert_eq!(prefixed, bare, "van den Berg and Berg should produce the same phonetic key");
    }
}