1use strsim::{jaro_winkler, levenshtein};
10use tracing::debug;
11use unicode_normalization::UnicodeNormalization;
12
/// Portuguese connector words stripped during normalization ("de", "da", …).
/// NOTE(review): the "&" entry can never match — `normalize` keeps only
/// ASCII alphanumerics and whitespace before this list is consulted, so
/// "&" is removed earlier; the entry is effectively dead.
const PREPOSITIONS: &[&str] = &["de", "da", "do", "das", "dos", "e", "&"];
15
#[derive(Debug, Clone)]
/// Fuzzy matcher for person names, with connector-word handling tuned for
/// Portuguese (see `PREPOSITIONS`). Combines per-part Jaro-Winkler scores
/// with a full-string score and abbreviation bonuses.
pub struct NameMatcher {
    /// Minimum similarity (0.0..=1.0) for `matches` to return `true`.
    pub threshold: f64,
    /// Weight of the first-name score when both names have a last name.
    pub first_name_weight: f64,
    /// Weight of the last-name score when both names have a last name.
    pub last_name_weight: f64,
}
26
27impl Default for NameMatcher {
28 fn default() -> Self {
29 Self {
30 threshold: 0.85,
31 first_name_weight: 0.4,
32 last_name_weight: 0.6,
33 }
34 }
35}
36
37impl NameMatcher {
38 pub fn new() -> Self {
40 Self::default()
41 }
42
43 pub fn with_threshold(mut self, threshold: f64) -> Self {
45 self.threshold = threshold;
46 self
47 }
48
49 pub fn normalize(&self, name: &str) -> String {
56 let normalized: String = name
58 .nfkd()
59 .filter(|c| !c.is_ascii() || c.is_alphanumeric() || c.is_whitespace())
60 .collect::<String>()
61 .to_lowercase();
62
63 let ascii: String = normalized
65 .chars()
66 .filter(|c| c.is_ascii_alphanumeric() || c.is_whitespace())
67 .collect();
68
69 let words: Vec<&str> = ascii
71 .split_whitespace()
72 .filter(|word| !PREPOSITIONS.contains(word))
73 .collect();
74
75 words.join(" ")
76 }
77
78 pub fn extract_parts(&self, name: &str) -> NameParts {
80 let normalized = self.normalize(name);
81 let words: Vec<&str> = normalized.split_whitespace().collect();
82
83 match words.len() {
84 0 => NameParts {
85 first: String::new(),
86 middle: Vec::new(),
87 last: String::new(),
88 full_normalized: normalized,
89 },
90 1 => NameParts {
91 first: words[0].to_string(),
92 middle: Vec::new(),
93 last: String::new(),
94 full_normalized: normalized,
95 },
96 2 => NameParts {
97 first: words[0].to_string(),
98 middle: Vec::new(),
99 last: words[1].to_string(),
100 full_normalized: normalized,
101 },
102 _ => NameParts {
103 first: words[0].to_string(),
104 middle: words[1..words.len() - 1]
105 .iter()
106 .map(|s| s.to_string())
107 .collect(),
108 last: words[words.len() - 1].to_string(),
109 full_normalized: normalized,
110 },
111 }
112 }
113
114 pub fn similarity(&self, name1: &str, name2: &str) -> f64 {
118 let parts1 = self.extract_parts(name1);
119 let parts2 = self.extract_parts(name2);
120
121 if parts1.full_normalized.is_empty() || parts2.full_normalized.is_empty() {
123 return 0.0;
124 }
125
126 let first_score = self.compare_parts(&parts1.first, &parts2.first);
128 let last_score = self.compare_parts(&parts1.last, &parts2.last);
129
130 let full_score = jaro_winkler(&parts1.full_normalized, &parts2.full_normalized);
132
133 let abbrev_bonus = self.abbreviation_bonus(&parts1, &parts2);
135
136 let base_score = if parts1.last.is_empty() || parts2.last.is_empty() {
138 first_score * 0.6 + full_score * 0.4
140 } else {
141 first_score * self.first_name_weight + last_score * self.last_name_weight
142 };
143
144 let combined = (base_score * 0.7 + full_score * 0.3 + abbrev_bonus).min(1.0);
146
147 debug!(
148 name1 = name1,
149 name2 = name2,
150 first_score,
151 last_score,
152 full_score,
153 abbrev_bonus,
154 combined,
155 "Name similarity calculated"
156 );
157
158 combined
159 }
160
161 fn compare_parts(&self, part1: &str, part2: &str) -> f64 {
163 if part1.is_empty() || part2.is_empty() {
164 return 0.0;
165 }
166
167 if self.is_abbreviation(part1, part2) || self.is_abbreviation(part2, part1) {
169 return 0.95;
170 }
171
172 jaro_winkler(part1, part2)
174 }
175
176 fn is_abbreviation(&self, short: &str, long: &str) -> bool {
178 if short.len() > long.len() {
179 return false;
180 }
181
182 let short_clean = short.trim_end_matches('.');
184 if short_clean.len() <= 2 && long.starts_with(short_clean) {
185 return true;
186 }
187
188 false
189 }
190
191 fn abbreviation_bonus(&self, parts1: &NameParts, parts2: &NameParts) -> f64 {
193 let mut bonus: f64 = 0.0;
194
195 for m1 in &parts1.middle {
197 for m2 in &parts2.middle {
198 if self.is_abbreviation(m1, m2) || self.is_abbreviation(m2, m1) {
199 bonus += 0.05;
200 }
201 }
202 }
203
204 for m1 in &parts1.middle {
206 if self.is_abbreviation(m1, &parts2.first) || self.is_abbreviation(m1, &parts2.last) {
207 bonus += 0.03;
208 }
209 }
210
211 bonus.min(0.15)
212 }
213
214 pub fn matches(&self, name1: &str, name2: &str) -> bool {
216 self.similarity(name1, name2) >= self.threshold
217 }
218
219 pub fn to_entity_id(&self, name: &str) -> String {
221 self.normalize(name).replace(' ', "_")
222 }
223
224 pub fn distance(&self, name1: &str, name2: &str) -> usize {
226 let n1 = self.normalize(name1);
227 let n2 = self.normalize(name2);
228 levenshtein(&n1, &n2)
229 }
230}
231
#[derive(Debug, Clone)]
/// A name decomposed into its parts by `NameMatcher::extract_parts`.
/// All fields hold normalized (lowercase, accent-free) text.
pub struct NameParts {
    /// First word of the normalized name; empty if the name was empty.
    pub first: String,
    /// Interior words (between first and last); empty for names of
    /// fewer than three words.
    pub middle: Vec<String>,
    /// Last word of the normalized name; empty for single-word names.
    pub last: String,
    /// The complete normalized name, words joined by single spaces.
    pub full_normalized: String,
}
244
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize() {
        // Accents stripped, lowercased, connector words removed.
        let m = NameMatcher::new();

        assert_eq!(
            m.normalize("Lucas Melo de Oliveira"),
            "lucas melo oliveira"
        );
        assert_eq!(m.normalize("ANA CLARA DA SILVA"), "ana clara silva");
        assert_eq!(
            m.normalize("João dos Santos & Filhos"),
            "joao santos filhos"
        );
        assert_eq!(m.normalize("JOSÉ MARÍA"), "jose maria");
    }

    #[test]
    fn test_extract_parts() {
        let m = NameMatcher::new();

        // Three-plus words: interior words land in `middle`.
        let full = m.extract_parts("Lucas Melo de Oliveira");
        assert_eq!(full.first, "lucas");
        assert_eq!(full.middle, vec!["melo"]);
        assert_eq!(full.last, "oliveira");

        // Exactly two words: no middle names.
        let pair = m.extract_parts("Ana Silva");
        assert_eq!(pair.first, "ana");
        assert!(pair.middle.is_empty());
        assert_eq!(pair.last, "silva");
    }

    #[test]
    fn test_high_similarity() {
        let m = NameMatcher::new();

        let s = m.similarity("Lucas Melo Oliveira", "Lucas M. Oliveira");
        assert!(s >= 0.85, "Expected >= 0.85, got {}", s);

        let s = m.similarity("Ana Clara Silva", "Ana C Silva");
        assert!(s >= 0.85, "Expected >= 0.85, got {}", s);

        // Identical names should be essentially a perfect score.
        let s = m.similarity("João Santos", "João Santos");
        assert!(s >= 0.99, "Expected >= 0.99, got {}", s);
    }

    #[test]
    fn test_low_similarity() {
        let m = NameMatcher::new();

        let s = m.similarity("Lucas Oliveira", "Maria Souza");
        assert!(s < 0.6, "Expected < 0.6, got {}", s);

        let s = m.similarity("Pedro Silva", "Ana Santos");
        assert!(s < 0.6, "Expected < 0.6, got {}", s);

        let s = m.similarity("Xyz Abc", "Qrs Tuv");
        assert!(s < 0.5, "Expected < 0.5, got {}", s);
    }

    #[test]
    fn test_abbreviation_handling() {
        let m = NameMatcher::new();

        // Single-letter middle initials should match the spelled-out form.
        let s = m.similarity("Lucas M. Oliveira", "Lucas Melo Oliveira");
        assert!(s >= 0.90, "Abbreviation should score high: {}", s);

        let s = m.similarity("Ana C. Silva", "Ana Clara Silva");
        assert!(s >= 0.88, "Abbreviation should score high: {}", s);
    }

    #[test]
    fn test_matches() {
        let m = NameMatcher::new().with_threshold(0.85);

        assert!(m.matches("Lucas Oliveira", "Lucas M. Oliveira"));
        assert!(!m.matches("Lucas Oliveira", "Pedro Santos"));
    }

    #[test]
    fn test_to_entity_id() {
        let m = NameMatcher::new();

        assert_eq!(
            m.to_entity_id("Lucas Melo de Oliveira"),
            "lucas_melo_oliveira"
        );
    }

    #[test]
    fn test_distance() {
        let m = NameMatcher::new();

        assert_eq!(m.distance("Lucas", "Lucas"), 0);
        assert!(m.distance("Lucas", "Lucaz") <= 2);
    }
}