fuzzy_cmp/fuzzy/
compare.rs

1// use crate::prelude::*;
2
3/// Returns levenshtein distance between strings
4pub fn distance(s1: &str, s2: &str) -> usize {
5    distance::levenshtein(&s1, &s2)
6}
7
8/// Compares 2 string by levenshtein distance
9pub fn compare(s1: &str, s2: &str) -> f32 {
10    let s1 = s1.trim().to_lowercase();
11    let s2 = s2.trim().to_lowercase();
12    
13    if s1 == s2 { return 1.0; }
14    else if s1.is_empty() || s2.is_empty() { return 0.0; }
15    
16    let dist = distance(&s1, &s2);
17    let max_len = s1.chars().count().max(s2.chars().count()).max(1);
18    let coof = 1.0 - (dist as f32 / max_len as f32);
19
20    coof.max(0.0).min(1.0)
21}
22
23/// Deep comparison using word-by-word scoring with positional bonus
24pub fn deep_compare(s1: &str, s2: &str, min_coef: f32) -> f32 {
25    let s1 = s1.trim().to_lowercase();
26    let s2 = s2.trim().to_lowercase();
27    if s1.is_empty() || s2.is_empty() { return 0.0; }
28
29    // split by words:
30    let words1: Vec<&str> = s1.split_whitespace().collect();
31    let words2: Vec<&str> = s2.split_whitespace().collect();
32    
33    // get len's:
34    let s1_len = words1.len();
35    let s2_len = words2.len();
36    let max_len = s1_len.max(s2_len);
37    
38    let mut scores = vec![0f32; max_len];
39    let mut covered = vec![false; max_len];
40    
41    // compare words (s1 vs s2):
42    for (i, word1) in words1.iter().take(max_len).enumerate() {
43        let mut best_score = 0.0f32;
44        
45        for (j, word2) in words2.iter().enumerate() {
46            let coef = compare(word1, word2);
47            let mut score = coef / 5.0;
48            
49            // add position bonus: |i - j| <= 1
50            if (i as i32 - j as i32).abs() <= 1 {
51                score += 1.0;
52            }
53            
54            // update best score:
55            best_score = best_score.max(score);
56
57            // add coverage score:
58            if coef >= min_coef {
59                covered[j] = true;
60            }
61        }
62        
63        scores[i] = best_score.min(6.0);
64    }
65    
66    // calc compare scores:
67    let max_score = 6.0 * max_len as f32;
68    let compare_coef = scores.iter().sum::<f32>() / max_score;
69
70    // calc coverage scores:
71    let covered_count = covered.into_iter().filter(|&x| x).count();
72    let coverage_coef = (covered_count as f32 / s2_len as f32).min(1.0);
73    
74    compare_coef + coverage_coef
75}