Skip to main content

cortexai_data/
name.rs

//! Name matching with Brazilian conventions
//!
//! Fuzzy matching that handles:
//! - Prepositions (de, da, do, das, dos)
//! - Abbreviations (M. for Melo)
//! - Accents and special characters
//! - Case insensitivity
8
9use strsim::{jaro_winkler, levenshtein};
10use tracing::debug;
11use unicode_normalization::UnicodeNormalization;
12
/// Portuguese prepositions and connectives commonly found in names.
///
/// `NameMatcher::normalize` drops every whitespace-separated word that is
/// exactly equal to one of these entries. The comparison happens after the
/// name has been lowercased, which is why all entries are lowercase.
// NOTE(review): `normalize` removes ASCII punctuation before it consults this
// list, so the "&" entry looks unreachable from that path — confirm before
// removing it.
const PREPOSITIONS: &[&str] = &["de", "da", "do", "das", "dos", "e", "&"];
15
/// Name matcher with Brazilian conventions.
///
/// Holds the tunable parameters used when scoring two names against each
/// other. Values compare field by field, so two matchers with the same
/// configuration are equal.
#[derive(Debug, Clone, PartialEq)]
pub struct NameMatcher {
    /// Minimum similarity score (inclusive) at which two names are
    /// considered a match.
    pub threshold: f64,
    /// Weight applied to the first-name score when both names have a last
    /// name.
    pub first_name_weight: f64,
    /// Weight applied to the last-name score when both names have a last
    /// name.
    pub last_name_weight: f64,
}

impl Default for NameMatcher {
    /// Builds a matcher with a threshold of `0.85`, a first-name weight of
    /// `0.4` and a last-name weight of `0.6`.
    fn default() -> Self {
        Self {
            threshold: 0.85,
            first_name_weight: 0.4,
            last_name_weight: 0.6,
        }
    }
}
36
37impl NameMatcher {
38    /// Create a new name matcher
39    pub fn new() -> Self {
40        Self::default()
41    }
42
43    /// Set minimum similarity threshold
44    pub fn with_threshold(mut self, threshold: f64) -> Self {
45        self.threshold = threshold;
46        self
47    }
48
49    /// Normalize a name for comparison
50    ///
51    /// - Converts to lowercase
52    /// - Removes accents
53    /// - Removes prepositions
54    /// - Normalizes whitespace
55    pub fn normalize(&self, name: &str) -> String {
56        // Normalize unicode and convert to lowercase
57        let normalized: String = name
58            .nfkd()
59            .filter(|c| !c.is_ascii() || c.is_alphanumeric() || c.is_whitespace())
60            .collect::<String>()
61            .to_lowercase();
62
63        // Remove accents (keep only ASCII)
64        let ascii: String = normalized
65            .chars()
66            .filter(|c| c.is_ascii_alphanumeric() || c.is_whitespace())
67            .collect();
68
69        // Split into words and filter prepositions
70        let words: Vec<&str> = ascii
71            .split_whitespace()
72            .filter(|word| !PREPOSITIONS.contains(word))
73            .collect();
74
75        words.join(" ")
76    }
77
78    /// Extract name parts (first name, middle names, last name)
79    pub fn extract_parts(&self, name: &str) -> NameParts {
80        let normalized = self.normalize(name);
81        let words: Vec<&str> = normalized.split_whitespace().collect();
82
83        match words.len() {
84            0 => NameParts {
85                first: String::new(),
86                middle: Vec::new(),
87                last: String::new(),
88                full_normalized: normalized,
89            },
90            1 => NameParts {
91                first: words[0].to_string(),
92                middle: Vec::new(),
93                last: String::new(),
94                full_normalized: normalized,
95            },
96            2 => NameParts {
97                first: words[0].to_string(),
98                middle: Vec::new(),
99                last: words[1].to_string(),
100                full_normalized: normalized,
101            },
102            _ => NameParts {
103                first: words[0].to_string(),
104                middle: words[1..words.len() - 1]
105                    .iter()
106                    .map(|s| s.to_string())
107                    .collect(),
108                last: words[words.len() - 1].to_string(),
109                full_normalized: normalized,
110            },
111        }
112    }
113
114    /// Calculate similarity between two names
115    ///
116    /// Returns a score between 0.0 and 1.0
117    pub fn similarity(&self, name1: &str, name2: &str) -> f64 {
118        let parts1 = self.extract_parts(name1);
119        let parts2 = self.extract_parts(name2);
120
121        // If either name is empty, no match
122        if parts1.full_normalized.is_empty() || parts2.full_normalized.is_empty() {
123            return 0.0;
124        }
125
126        // Calculate component scores
127        let first_score = self.compare_parts(&parts1.first, &parts2.first);
128        let last_score = self.compare_parts(&parts1.last, &parts2.last);
129
130        // Full name comparison using Jaro-Winkler
131        let full_score = jaro_winkler(&parts1.full_normalized, &parts2.full_normalized);
132
133        // Check for abbreviation matches
134        let abbrev_bonus = self.abbreviation_bonus(&parts1, &parts2);
135
136        // Weighted combination
137        let base_score = if parts1.last.is_empty() || parts2.last.is_empty() {
138            // Only first name available
139            first_score * 0.6 + full_score * 0.4
140        } else {
141            first_score * self.first_name_weight + last_score * self.last_name_weight
142        };
143
144        // Combine with full name score and abbreviation bonus
145        let combined = (base_score * 0.7 + full_score * 0.3 + abbrev_bonus).min(1.0);
146
147        debug!(
148            name1 = name1,
149            name2 = name2,
150            first_score,
151            last_score,
152            full_score,
153            abbrev_bonus,
154            combined,
155            "Name similarity calculated"
156        );
157
158        combined
159    }
160
161    /// Compare two name parts
162    fn compare_parts(&self, part1: &str, part2: &str) -> f64 {
163        if part1.is_empty() || part2.is_empty() {
164            return 0.0;
165        }
166
167        // Check for abbreviation
168        if self.is_abbreviation(part1, part2) || self.is_abbreviation(part2, part1) {
169            return 0.95;
170        }
171
172        // Use Jaro-Winkler for fuzzy matching
173        jaro_winkler(part1, part2)
174    }
175
176    /// Check if one string is an abbreviation of another
177    fn is_abbreviation(&self, short: &str, long: &str) -> bool {
178        if short.len() > long.len() {
179            return false;
180        }
181
182        // Check if short is just the first letter(s) of long
183        let short_clean = short.trim_end_matches('.');
184        if short_clean.len() <= 2 && long.starts_with(short_clean) {
185            return true;
186        }
187
188        false
189    }
190
191    /// Calculate bonus for abbreviation matches
192    fn abbreviation_bonus(&self, parts1: &NameParts, parts2: &NameParts) -> f64 {
193        let mut bonus: f64 = 0.0;
194
195        // Check middle name abbreviations
196        for m1 in &parts1.middle {
197            for m2 in &parts2.middle {
198                if self.is_abbreviation(m1, m2) || self.is_abbreviation(m2, m1) {
199                    bonus += 0.05;
200                }
201            }
202        }
203
204        // Check if middle name in one matches full in other
205        for m1 in &parts1.middle {
206            if self.is_abbreviation(m1, &parts2.first) || self.is_abbreviation(m1, &parts2.last) {
207                bonus += 0.03;
208            }
209        }
210
211        bonus.min(0.15)
212    }
213
214    /// Check if two names match above threshold
215    pub fn matches(&self, name1: &str, name2: &str) -> bool {
216        self.similarity(name1, name2) >= self.threshold
217    }
218
219    /// Generate a normalized entity ID from a name
220    pub fn to_entity_id(&self, name: &str) -> String {
221        self.normalize(name).replace(' ', "_")
222    }
223
224    /// Calculate Levenshtein distance between names
225    pub fn distance(&self, name1: &str, name2: &str) -> usize {
226        let n1 = self.normalize(name1);
227        let n2 = self.normalize(name2);
228        levenshtein(&n1, &n2)
229    }
230}
231
/// Extracted name parts.
///
/// Produced by `NameMatcher::extract_parts` from a normalized name. Parts
/// that are not present in the name are left empty, which is also what
/// `NameParts::default()` yields.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct NameParts {
    /// First name (first word of the normalized name).
    pub first: String,
    /// Middle names (every word between the first and the last).
    pub middle: Vec<String>,
    /// Last name (final word; empty when the name has fewer than two words).
    pub last: String,
    /// Full normalized name.
    pub full_normalized: String,
}
244
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize() {
        let matcher = NameMatcher::new();

        assert_eq!(
            matcher.normalize("Lucas Melo de Oliveira"),
            "lucas melo oliveira"
        );

        assert_eq!(matcher.normalize("ANA CLARA DA SILVA"), "ana clara silva");

        assert_eq!(
            matcher.normalize("João dos Santos & Filhos"),
            "joao santos filhos"
        );

        assert_eq!(matcher.normalize("JOSÉ MARÍA"), "jose maria");
    }

    // Input that is empty, or that consists only of prepositions, must
    // normalize to the empty string.
    #[test]
    fn test_normalize_empty_input() {
        let matcher = NameMatcher::new();

        assert_eq!(matcher.normalize(""), "");
        assert_eq!(matcher.normalize("   "), "");
        assert_eq!(matcher.normalize("de da dos"), "");
    }

    #[test]
    fn test_extract_parts() {
        let matcher = NameMatcher::new();

        let parts = matcher.extract_parts("Lucas Melo de Oliveira");
        assert_eq!(parts.first, "lucas");
        assert_eq!(parts.middle, vec!["melo"]);
        assert_eq!(parts.last, "oliveira");

        let parts2 = matcher.extract_parts("Ana Silva");
        assert_eq!(parts2.first, "ana");
        assert!(parts2.middle.is_empty());
        assert_eq!(parts2.last, "silva");
    }

    // Names with fewer than two words leave the missing parts empty.
    #[test]
    fn test_extract_parts_short_names() {
        let matcher = NameMatcher::new();

        let single = matcher.extract_parts("Madonna");
        assert_eq!(single.first, "madonna");
        assert!(single.middle.is_empty());
        assert!(single.last.is_empty());
        assert_eq!(single.full_normalized, "madonna");

        let empty = matcher.extract_parts("");
        assert!(empty.first.is_empty());
        assert!(empty.middle.is_empty());
        assert!(empty.last.is_empty());
        assert!(empty.full_normalized.is_empty());
    }

    #[test]
    fn test_high_similarity() {
        let matcher = NameMatcher::new();

        // Very similar names
        let sim1 = matcher.similarity("Lucas Melo Oliveira", "Lucas M. Oliveira");
        assert!(sim1 >= 0.85, "Expected >= 0.85, got {}", sim1);

        let sim2 = matcher.similarity("Ana Clara Silva", "Ana C Silva");
        assert!(sim2 >= 0.85, "Expected >= 0.85, got {}", sim2);

        // Exact match
        let sim3 = matcher.similarity("João Santos", "João Santos");
        assert!(sim3 >= 0.99, "Expected >= 0.99, got {}", sim3);
    }

    #[test]
    fn test_low_similarity() {
        let matcher = NameMatcher::new();

        // Very different names should have low similarity
        let sim = matcher.similarity("Lucas Oliveira", "Maria Souza");
        assert!(sim < 0.6, "Expected < 0.6, got {}", sim);

        let sim2 = matcher.similarity("Pedro Silva", "Ana Santos");
        assert!(sim2 < 0.6, "Expected < 0.6, got {}", sim2);

        // Completely unrelated names
        let sim3 = matcher.similarity("Xyz Abc", "Qrs Tuv");
        assert!(sim3 < 0.5, "Expected < 0.5, got {}", sim3);
    }

    // An empty name on either side can never match anything.
    #[test]
    fn test_similarity_with_empty_name() {
        let matcher = NameMatcher::new();

        assert_eq!(matcher.similarity("", "Ana Silva"), 0.0);
        assert_eq!(matcher.similarity("Ana Silva", ""), 0.0);
        assert!(!matcher.matches("", ""));
    }

    #[test]
    fn test_abbreviation_handling() {
        let matcher = NameMatcher::new();

        // M. should match Melo
        let sim = matcher.similarity("Lucas M. Oliveira", "Lucas Melo Oliveira");
        assert!(sim >= 0.90, "Abbreviation should score high: {}", sim);

        // C. should match Clara
        let sim2 = matcher.similarity("Ana C. Silva", "Ana Clara Silva");
        assert!(sim2 >= 0.88, "Abbreviation should score high: {}", sim2);
    }

    #[test]
    fn test_matches() {
        let matcher = NameMatcher::new().with_threshold(0.85);

        assert!(matcher.matches("Lucas Oliveira", "Lucas M. Oliveira"));
        assert!(!matcher.matches("Lucas Oliveira", "Pedro Santos"));
    }

    #[test]
    fn test_to_entity_id() {
        let matcher = NameMatcher::new();

        assert_eq!(
            matcher.to_entity_id("Lucas Melo de Oliveira"),
            "lucas_melo_oliveira"
        );
    }

    #[test]
    fn test_distance() {
        let matcher = NameMatcher::new();

        assert_eq!(matcher.distance("Lucas", "Lucas"), 0);
        assert!(matcher.distance("Lucas", "Lucaz") <= 2);
    }
}