1use std::collections::HashSet;
2
3use rphonetic::{DoubleMetaphone, Encoder};
4use unicode_normalization::UnicodeNormalization;
5use zer_core::{record::FieldValue, schema::FieldKind};
6
7use crate::similarity::SimilarityFn;
8
9fn to_ascii_upper(s: &str) -> String {
12 s.nfkd()
13 .filter(|c| c.is_ascii())
14 .collect::<String>()
15 .to_ascii_uppercase()
16}
17
18pub struct JaroWinklerSimilarity;
21
22impl SimilarityFn for JaroWinklerSimilarity {
23 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
24 match (a, b) {
25 (FieldValue::Text(a), FieldValue::Text(b)) => {
26 strsim::jaro_winkler(a, b) as f32
27 }
28 _ => 0.0,
29 }
30 }
31 fn similarity_str(&self, a: &str, b: &str) -> f32 {
32 strsim::jaro_winkler(a, b) as f32
33 }
34 fn field_kind(&self) -> FieldKind { FieldKind::Name }
35}
36
37pub struct PhoneticEqualitySimilarity;
40
41impl SimilarityFn for PhoneticEqualitySimilarity {
42 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
43 match (a, b) {
44 (FieldValue::Text(a), FieldValue::Text(b)) => {
45 let dm = DoubleMetaphone::default();
46 let norm_a = to_ascii_upper(a);
47 let norm_b = to_ascii_upper(b);
48 if norm_a.is_empty() || norm_b.is_empty() { return 0.0; }
49 let code_a = dm.encode(&norm_a);
50 let code_b = dm.encode(&norm_b);
51 if code_a.is_empty() || code_b.is_empty() { return 0.0; }
52 if code_a == code_b { 1.0 } else { 0.0 }
53 }
54 _ => 0.0,
55 }
56 }
57 fn similarity_str(&self, a: &str, b: &str) -> f32 {
58 let dm = DoubleMetaphone::default();
59 let norm_a = to_ascii_upper(a);
60 let norm_b = to_ascii_upper(b);
61 if norm_a.is_empty() || norm_b.is_empty() { return 0.0; }
62 let code_a = dm.encode(&norm_a);
63 let code_b = dm.encode(&norm_b);
64 if code_a.is_empty() || code_b.is_empty() { return 0.0; }
65 if code_a == code_b { 1.0 } else { 0.0 }
66 }
67 fn field_kind(&self) -> FieldKind { FieldKind::Name }
68}
69
70pub struct TokenOverlapSimilarity;
73
74impl SimilarityFn for TokenOverlapSimilarity {
75 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
76 match (a, b) {
77 (FieldValue::Text(a), FieldValue::Text(b)) => jaccard_tokens(a, b),
78 _ => 0.0,
79 }
80 }
81 fn similarity_str(&self, a: &str, b: &str) -> f32 { jaccard_tokens(a, b) }
82 fn field_kind(&self) -> FieldKind { FieldKind::Name }
83}
84
/// Computes the Jaccard index of the whitespace-separated tokens of `a` and `b`.
///
/// Each input is split with `split_whitespace` and de-duplicated into a set,
/// so token order and repetition do not affect the score. Tokens are compared
/// exactly (case-sensitive).
///
/// Returns `|A ∩ B| / |A ∪ B|` in `[0.0, 1.0]`. When both inputs contain no
/// tokens at all the result is `1.0`; when exactly one is empty it is `0.0`.
pub(crate) fn jaccard_tokens(a: &str, b: &str) -> f32 {
    let tokens_a: HashSet<&str> = a.split_whitespace().collect();
    let tokens_b: HashSet<&str> = b.split_whitespace().collect();

    if tokens_a.is_empty() && tokens_b.is_empty() {
        return 1.0;
    }

    let shared = tokens_a.intersection(&tokens_b).count();
    // At least one set is non-empty past the guard above, so the union size
    // is always >= 1 and the division below cannot be by zero.
    let union = tokens_a.len() + tokens_b.len() - shared;
    shared as f32 / union as f32
}
95
96pub struct LevenshteinSimilarity {
115 pub max_distance: usize,
117}
118
119impl SimilarityFn for LevenshteinSimilarity {
120 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
121 let (sa, sb) = match (a, b) {
122 (FieldValue::Text(a), FieldValue::Text(b)) => (a.as_str(), b.as_str()),
123 _ => return 0.0,
124 };
125 let dist = edit_distance::edit_distance(sa, sb);
126 if dist > self.max_distance {
127 0.0
128 } else {
129 1.0 - (dist as f32 / self.max_distance.max(1) as f32)
130 }
131 }
132 fn similarity_str(&self, a: &str, b: &str) -> f32 {
133 let dist = edit_distance::edit_distance(a, b);
134 if dist > self.max_distance { 0.0 }
135 else { 1.0 - (dist as f32 / self.max_distance.max(1) as f32) }
136 }
137 fn field_kind(&self) -> zer_core::schema::FieldKind { zer_core::schema::FieldKind::Name }
138}
139
140pub struct AliasTokenOverlapSimilarity;
145
146impl SimilarityFn for AliasTokenOverlapSimilarity {
147 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
148 match (a, b) {
149 (FieldValue::Text(a), FieldValue::Text(b)) => {
150 let aliases_a: Vec<&str> = a.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
151 let aliases_b: Vec<&str> = b.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
152 if aliases_a.is_empty() || aliases_b.is_empty() { return 0.0; }
153 aliases_a.iter()
154 .flat_map(|aa| aliases_b.iter().map(move |ab| jaccard_tokens(aa, ab)))
155 .fold(0.0_f32, f32::max)
156 }
157 _ => 0.0,
158 }
159 }
160 fn similarity_str(&self, a: &str, b: &str) -> f32 {
161 let aliases_a: Vec<&str> = a.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
162 let aliases_b: Vec<&str> = b.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
163 if aliases_a.is_empty() || aliases_b.is_empty() { return 0.0; }
164 aliases_a.iter()
165 .flat_map(|aa| aliases_b.iter().map(move |ab| jaccard_tokens(aa, ab)))
166 .fold(0.0_f32, f32::max)
167 }
168 fn field_kind(&self) -> FieldKind { FieldKind::Alias }
169}
170
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps a string literal in `FieldValue::Text` to keep the tests terse.
    fn text(s: &'static str) -> FieldValue {
        FieldValue::Text(s.into())
    }

    #[test]
    fn jaro_winkler_similar_names() {
        let s = JaroWinklerSimilarity.similarity(&text("JOHN SMITH"), &text("JON SMYTH"));
        assert!(s > 0.8, "similar names should score > 0.8, got {s}");
    }

    #[test]
    fn jaro_winkler_different_names() {
        let s = JaroWinklerSimilarity.similarity(&text("JOHN SMITH"), &text("JANE DOE"));
        assert!(s < 0.6, "very different names should score < 0.6, got {s}");
    }

    #[test]
    fn phonetic_equality_sound_alikes() {
        let score = PhoneticEqualitySimilarity.similarity(&text("Smith"), &text("Smyth"));
        assert_eq!(score, 1.0, "Smith and Smyth should be phonetically equal");
    }

    #[test]
    fn phonetic_equality_different() {
        let score = PhoneticEqualitySimilarity.similarity(&text("Jansen"), &text("Bakker"));
        assert_eq!(score, 0.0, "Jansen and Bakker should not be phonetically equal");
    }

    #[test]
    fn token_overlap_swapped_name() {
        let score = TokenOverlapSimilarity.similarity(&text("John Smith"), &text("Smith John"));
        assert_eq!(score, 1.0, "token overlap should be 1.0 for swapped tokens");
    }

    #[test]
    fn token_overlap_partial() {
        let s = TokenOverlapSimilarity.similarity(&text("Alice van Berg"), &text("Alice Berg"));
        assert!(s > 0.3, "partial name overlap should produce > 0.3, got {s}");
    }

    #[test]
    fn alias_overlap_cross_product() {
        let aliases = text("Benabdallah Fatima|F. Benabdallah");
        let candidate = text("Fatima Benabdallah");
        let s = AliasTokenOverlapSimilarity.similarity(&aliases, &candidate);
        assert!(s > 0.5, "alias cross-product should find overlap, got {s}");
    }

    #[test]
    fn alias_overlap_empty_field() {
        let score = AliasTokenOverlapSimilarity.similarity(&text(""), &text("Jansen"));
        assert_eq!(score, 0.0);
    }

    #[test]
    fn similarity_null_fields_return_zero() {
        let sim = JaroWinklerSimilarity;
        let value = text("test");
        assert_eq!(sim.similarity(&FieldValue::Null, &value), 0.0);
        assert_eq!(sim.similarity(&value, &FieldValue::Null), 0.0);
    }

    #[test]
    fn levenshtein_exact_match() {
        let sim = LevenshteinSimilarity { max_distance: 3 };
        let score = sim.similarity(&text("Jansen"), &text("Jansen"));
        assert_eq!(score, 1.0, "edit distance 0 must yield 1.0");
    }

    #[test]
    fn levenshtein_over_max() {
        let sim = LevenshteinSimilarity { max_distance: 2 };
        let score = sim.similarity(&text("hello"), &text("world"));
        assert_eq!(score, 0.0, "edit distance > max_distance must yield 0.0");
    }

    #[test]
    fn levenshtein_partial() {
        let sim = LevenshteinSimilarity { max_distance: 3 };
        let s = sim.similarity(&text("Jansen"), &text("Jansem"));
        assert!(s > 0.0 && s < 1.0, "partial distance must yield value in (0.0, 1.0), got {s}");
    }

    #[test]
    fn levenshtein_null_fields_return_zero() {
        let sim = LevenshteinSimilarity { max_distance: 3 };
        let value = text("test");
        assert_eq!(sim.similarity(&FieldValue::Null, &value), 0.0);
        assert_eq!(sim.similarity(&value, &FieldValue::Null), 0.0);
    }
}