Skip to main content

zer_compare/similarity/
name.rs

1use std::collections::HashSet;
2
3use rphonetic::{DoubleMetaphone, Encoder};
4use unicode_normalization::UnicodeNormalization;
5use zer_core::{record::FieldValue, schema::FieldKind};
6
7use crate::similarity::SimilarityFn;
8
9/// Strip diacritics via NFKD decomposition and keep only ASCII characters,
10/// then uppercase. Prevents rphonetic from panicking on multi-byte chars.
11fn to_ascii_upper(s: &str) -> String {
12    s.nfkd()
13        .filter(|c| c.is_ascii())
14        .collect::<String>()
15        .to_ascii_uppercase()
16}
17
18// ── Jaro-Winkler ─────────────────────────────────────────────────────────────
19
20pub struct JaroWinklerSimilarity;
21
22impl SimilarityFn for JaroWinklerSimilarity {
23    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
24        match (a, b) {
25            (FieldValue::Text(a), FieldValue::Text(b)) => {
26                strsim::jaro_winkler(a, b) as f32
27            }
28            _ => 0.0,
29        }
30    }
31    fn similarity_str(&self, a: &str, b: &str) -> f32 {
32        strsim::jaro_winkler(a, b) as f32
33    }
34    fn field_kind(&self) -> FieldKind { FieldKind::Name }
35}
36
37// ── Phonetic equality (Double Metaphone) ─────────────────────────────────────
38
39pub struct PhoneticEqualitySimilarity;
40
41impl SimilarityFn for PhoneticEqualitySimilarity {
42    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
43        match (a, b) {
44            (FieldValue::Text(a), FieldValue::Text(b)) => {
45                let dm     = DoubleMetaphone::default();
46                let norm_a = to_ascii_upper(a);
47                let norm_b = to_ascii_upper(b);
48                if norm_a.is_empty() || norm_b.is_empty() { return 0.0; }
49                let code_a = dm.encode(&norm_a);
50                let code_b = dm.encode(&norm_b);
51                if code_a.is_empty() || code_b.is_empty() { return 0.0; }
52                if code_a == code_b { 1.0 } else { 0.0 }
53            }
54            _ => 0.0,
55        }
56    }
57    fn similarity_str(&self, a: &str, b: &str) -> f32 {
58        let dm     = DoubleMetaphone::default();
59        let norm_a = to_ascii_upper(a);
60        let norm_b = to_ascii_upper(b);
61        if norm_a.is_empty() || norm_b.is_empty() { return 0.0; }
62        let code_a = dm.encode(&norm_a);
63        let code_b = dm.encode(&norm_b);
64        if code_a.is_empty() || code_b.is_empty() { return 0.0; }
65        if code_a == code_b { 1.0 } else { 0.0 }
66    }
67    fn field_kind(&self) -> FieldKind { FieldKind::Name }
68}
69
70// ── Token overlap (Jaccard) ───────────────────────────────────────────────────
71
72pub struct TokenOverlapSimilarity;
73
74impl SimilarityFn for TokenOverlapSimilarity {
75    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
76        match (a, b) {
77            (FieldValue::Text(a), FieldValue::Text(b)) => jaccard_tokens(a, b),
78            _ => 0.0,
79        }
80    }
81    fn similarity_str(&self, a: &str, b: &str) -> f32 { jaccard_tokens(a, b) }
82    fn field_kind(&self) -> FieldKind { FieldKind::Name }
83}
84
/// Jaccard coefficient of the whitespace-separated token sets of `a` and `b`.
///
/// Each string is split on Unicode whitespace and its tokens are
/// de-duplicated; the score is `|A ∩ B| / |A ∪ B|`. Tokens are compared
/// exactly, so the comparison is case-sensitive.
///
/// Two inputs that both contain no tokens (empty or whitespace-only) are
/// treated as identical and score 1.0. If only one side has no tokens the
/// score is 0.0.
pub(crate) fn jaccard_tokens(a: &str, b: &str) -> f32 {
    let tokens_a: HashSet<&str> = a.split_whitespace().collect();
    let tokens_b: HashSet<&str> = b.split_whitespace().collect();

    let shared = tokens_a.intersection(&tokens_b).count();
    // |A ∪ B| = |A| + |B| - |A ∩ B|; this is zero only when both sets are empty.
    let total = tokens_a.len() + tokens_b.len() - shared;
    if total == 0 {
        return 1.0;
    }

    shared as f32 / total as f32
}
95
96// ── Levenshtein edit distance ─────────────────────────────────────────────────
97
98/// Normalised Levenshtein similarity in [0.0, 1.0].
99///
100/// The raw edit distance is clipped at `max_distance`; distances above that
101/// return 0.0.  Within the allowed range similarity is
102/// `1.0 - dist / max_distance`.
103///
104/// ```
105/// use zer_compare::similarity::name::LevenshteinSimilarity;
106/// use zer_compare::similarity::SimilarityFn;
107/// use zer_core::record::FieldValue;
108///
109/// let sim = LevenshteinSimilarity { max_distance: 3 };
110/// let a = FieldValue::Text("Jansen".into());
111/// let b = FieldValue::Text("Jansen".into());
112/// assert_eq!(sim.similarity(&a, &b), 1.0); // exact match
113/// ```
114pub struct LevenshteinSimilarity {
115    /// Maximum edit distance above which similarity is 0.0.
116    pub max_distance: usize,
117}
118
119impl SimilarityFn for LevenshteinSimilarity {
120    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
121        let (sa, sb) = match (a, b) {
122            (FieldValue::Text(a), FieldValue::Text(b)) => (a.as_str(), b.as_str()),
123            _ => return 0.0,
124        };
125        let dist = edit_distance::edit_distance(sa, sb);
126        if dist > self.max_distance {
127            0.0
128        } else {
129            1.0 - (dist as f32 / self.max_distance.max(1) as f32)
130        }
131    }
132    fn similarity_str(&self, a: &str, b: &str) -> f32 {
133        let dist = edit_distance::edit_distance(a, b);
134        if dist > self.max_distance { 0.0 }
135        else { 1.0 - (dist as f32 / self.max_distance.max(1) as f32) }
136    }
137    fn field_kind(&self) -> zer_core::schema::FieldKind { zer_core::schema::FieldKind::Name }
138}
139
140// ── Alias token overlap (pipe-delimited multi-value field) ────────────────────
141
142/// Compares pipe-delimited alias lists by taking the maximum Jaccard score
143/// across all cross-product pairs of individual alias values.
144pub struct AliasTokenOverlapSimilarity;
145
146impl SimilarityFn for AliasTokenOverlapSimilarity {
147    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
148        match (a, b) {
149            (FieldValue::Text(a), FieldValue::Text(b)) => {
150                let aliases_a: Vec<&str> = a.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
151                let aliases_b: Vec<&str> = b.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
152                if aliases_a.is_empty() || aliases_b.is_empty() { return 0.0; }
153                aliases_a.iter()
154                    .flat_map(|aa| aliases_b.iter().map(move |ab| jaccard_tokens(aa, ab)))
155                    .fold(0.0_f32, f32::max)
156            }
157            _ => 0.0,
158        }
159    }
160    fn similarity_str(&self, a: &str, b: &str) -> f32 {
161        let aliases_a: Vec<&str> = a.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
162        let aliases_b: Vec<&str> = b.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
163        if aliases_a.is_empty() || aliases_b.is_empty() { return 0.0; }
164        aliases_a.iter()
165            .flat_map(|aa| aliases_b.iter().map(move |ab| jaccard_tokens(aa, ab)))
166            .fold(0.0_f32, f32::max)
167    }
168    fn field_kind(&self) -> FieldKind { FieldKind::Alias }
169}
170
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps a string slice in a `FieldValue::Text`.
    fn text(raw: &str) -> FieldValue {
        FieldValue::Text(raw.into())
    }

    // ── Jaro-Winkler ──

    #[test]
    fn jaro_winkler_similar_names() {
        let s = JaroWinklerSimilarity.similarity(&text("JOHN SMITH"), &text("JON SMYTH"));
        assert!(s > 0.8, "similar names should score > 0.8, got {s}");
    }

    #[test]
    fn jaro_winkler_different_names() {
        let s = JaroWinklerSimilarity.similarity(&text("JOHN SMITH"), &text("JANE DOE"));
        assert!(s < 0.6, "very different names should score < 0.6, got {s}");
    }

    // ── Phonetic equality ──

    #[test]
    fn phonetic_equality_sound_alikes() {
        let score = PhoneticEqualitySimilarity.similarity(&text("Smith"), &text("Smyth"));
        assert_eq!(score, 1.0, "Smith and Smyth should be phonetically equal");
    }

    #[test]
    fn phonetic_equality_different() {
        let score = PhoneticEqualitySimilarity.similarity(&text("Jansen"), &text("Bakker"));
        assert_eq!(score, 0.0, "Jansen and Bakker should not be phonetically equal");
    }

    // ── Token overlap ──

    #[test]
    fn token_overlap_swapped_name() {
        let score = TokenOverlapSimilarity.similarity(&text("John Smith"), &text("Smith John"));
        assert_eq!(score, 1.0, "token overlap should be 1.0 for swapped tokens");
    }

    #[test]
    fn token_overlap_partial() {
        // Two of the three distinct tokens ("Alice", "Berg") are shared,
        // so the expected score is 2/3, comfortably above the 0.3 floor.
        let s = TokenOverlapSimilarity.similarity(&text("Alice van Berg"), &text("Alice Berg"));
        assert!(s > 0.3, "partial name overlap should produce > 0.3, got {s}");
    }

    // ── Alias token overlap ──

    #[test]
    fn alias_overlap_cross_product() {
        // The first alias on the left has the same tokens as the right-hand value.
        let left = text("Benabdallah Fatima|F. Benabdallah");
        let right = text("Fatima Benabdallah");
        let s = AliasTokenOverlapSimilarity.similarity(&left, &right);
        assert!(s > 0.5, "alias cross-product should find overlap, got {s}");
    }

    #[test]
    fn alias_overlap_empty_field() {
        let score = AliasTokenOverlapSimilarity.similarity(&text(""), &text("Jansen"));
        assert_eq!(score, 0.0);
    }

    // ── Non-text operands ──

    #[test]
    fn similarity_null_fields_return_zero() {
        let sim = JaroWinklerSimilarity;
        let value = text("test");
        assert_eq!(sim.similarity(&FieldValue::Null, &value), 0.0);
        assert_eq!(sim.similarity(&value, &FieldValue::Null), 0.0);
    }

    // ── Levenshtein ──

    #[test]
    fn levenshtein_exact_match() {
        let sim = LevenshteinSimilarity { max_distance: 3 };
        let score = sim.similarity(&text("Jansen"), &text("Jansen"));
        assert_eq!(score, 1.0, "edit distance 0 must yield 1.0");
    }

    #[test]
    fn levenshtein_over_max() {
        let sim = LevenshteinSimilarity { max_distance: 2 };
        // "hello" -> "world" takes 4 edits, which exceeds the cap of 2.
        let score = sim.similarity(&text("hello"), &text("world"));
        assert_eq!(score, 0.0, "edit distance > max_distance must yield 0.0");
    }

    #[test]
    fn levenshtein_partial() {
        let sim = LevenshteinSimilarity { max_distance: 3 };
        // A single substitution: edit distance 1.
        let s = sim.similarity(&text("Jansen"), &text("Jansem"));
        assert!(s > 0.0 && s < 1.0, "partial distance must yield value in (0.0, 1.0), got {s}");
    }

    #[test]
    fn levenshtein_null_fields_return_zero() {
        let sim = LevenshteinSimilarity { max_distance: 3 };
        let value = text("test");
        assert_eq!(sim.similarity(&FieldValue::Null, &value), 0.0);
        assert_eq!(sim.similarity(&value, &FieldValue::Null), 0.0);
    }
}
283}