use strsim::{jaro_winkler, levenshtein};
/// Stateless namespace for string-similarity scoring functions.
///
/// Every operation is an associated function (none takes `self`), so the type
/// carries no data; the derives only make it convenient to print, copy and
/// default-construct.
#[derive(Debug, Clone, Copy, Default)]
pub struct Scorer;
impl Scorer {
    /// Jaro-Winkler similarity of `s1` and `s2`.
    ///
    /// Two empty strings score `1.0`; exactly one empty string scores `0.0`.
    /// Every other pair is delegated to `strsim::jaro_winkler`.
    pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 {
        match (s1.is_empty(), s2.is_empty()) {
            (true, true) => 1.0,
            (true, false) | (false, true) => 0.0,
            (false, false) => jaro_winkler(s1, s2),
        }
    }

    /// Levenshtein distance normalised into a similarity score.
    ///
    /// Computed as `1 - distance / max_len`, where `max_len` is the length of
    /// the longer input measured in `char`s. Two empty strings score `1.0`;
    /// exactly one empty string scores `0.0`.
    pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
        if s1.is_empty() && s2.is_empty() {
            return 1.0;
        }
        if s1.is_empty() || s2.is_empty() {
            return 0.0;
        }
        let distance = levenshtein(s1, s2);
        // The edit distance is counted in chars, so the normaliser must be
        // too. `str::len` is a byte count and would overstate the similarity
        // of any input containing multi-byte characters.
        let max_len = s1.chars().count().max(s2.chars().count());
        1.0 - (distance as f64 / max_len as f64)
    }

    /// Returns `1.0` when the two strings are equal (`==`, so the comparison
    /// is case-sensitive) and `0.0` otherwise.
    pub fn exact_match(s1: &str, s2: &str) -> f64 {
        if s1 == s2 {
            1.0
        } else {
            0.0
        }
    }

    /// Weighted blend of the two fuzzy measures: 70% Jaro-Winkler plus
    /// 30% normalised Levenshtein.
    pub fn combined_similarity(s1: &str, s2: &str) -> f64 {
        const JARO_WINKLER_WEIGHT: f64 = 0.7;
        const LEVENSHTEIN_WEIGHT: f64 = 0.3;

        let jw = Self::jaro_winkler_similarity(s1, s2);
        let lev = Self::levenshtein_similarity(s1, s2);
        JARO_WINKLER_WEIGHT * jw + LEVENSHTEIN_WEIGHT * lev
    }

    /// Scores a pair of optional fields with the chosen `algorithm`.
    ///
    /// * both `None` -> `1.0`
    /// * exactly one `None` -> `0.0`
    /// * both `Some` -> the score produced by `algorithm` on the two values
    pub fn optional_field_score(
        field1: &Option<String>,
        field2: &Option<String>,
        algorithm: SimilarityAlgorithm,
    ) -> f64 {
        match (field1, field2) {
            (None, None) => 1.0,
            (None, Some(_)) | (Some(_), None) => 0.0,
            (Some(s1), Some(s2)) => match algorithm {
                SimilarityAlgorithm::JaroWinkler => Self::jaro_winkler_similarity(s1, s2),
                SimilarityAlgorithm::Levenshtein => Self::levenshtein_similarity(s1, s2),
                SimilarityAlgorithm::Exact => Self::exact_match(s1, s2),
                SimilarityAlgorithm::Combined => Self::combined_similarity(s1, s2),
            },
        }
    }
}
/// Selects which similarity measure `Scorer::optional_field_score` applies
/// when both fields are present.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum SimilarityAlgorithm {
/// Dispatches to `Scorer::jaro_winkler_similarity`.
JaroWinkler,
/// Dispatches to `Scorer::levenshtein_similarity`.
Levenshtein,
/// Dispatches to `Scorer::exact_match` (1.0 on equality, otherwise 0.0).
Exact,
/// Dispatches to `Scorer::combined_similarity`
/// (0.7 * Jaro-Winkler + 0.3 * Levenshtein).
Combined,
}
#[cfg(test)]
mod tests {
    use super::*;

    // ----- Jaro-Winkler -----

    /// Identical inputs score essentially one.
    #[test]
    fn jaro_winkler_identical() {
        let score = Scorer::jaro_winkler_similarity("smith", "smith");
        assert!(score > 0.99);
    }

    /// A single-letter spelling variation still scores high.
    #[test]
    fn jaro_winkler_close_typo() {
        let score = Scorer::jaro_winkler_similarity("smith", "smyth");
        assert!(score > 0.85);
    }

    /// Different names of the same length stay below the 0.8 mark.
    #[test]
    fn jaro_winkler_distant() {
        let score = Scorer::jaro_winkler_similarity("jones", "james");
        assert!(score < 0.8);
    }

    /// Two empty strings count as a perfect match.
    #[test]
    fn jaro_winkler_empty_pair_is_one() {
        let score = Scorer::jaro_winkler_similarity("", "");
        assert_eq!(score, 1.0);
    }

    /// An empty string never matches a non-empty one, in either position.
    #[test]
    fn jaro_winkler_single_empty_is_zero() {
        for (left, right) in [("smith", ""), ("", "smith")] {
            assert_eq!(Scorer::jaro_winkler_similarity(left, right), 0.0);
        }
    }

    /// Scores always land inside the closed interval [0, 1].
    #[test]
    fn jaro_winkler_in_unit_interval() {
        let pairs = [("a", "b"), ("smith", "smyth"), ("abc", "xyz")];
        pairs.iter().for_each(|&(a, b)| {
            let s = Scorer::jaro_winkler_similarity(a, b);
            assert!((0.0..=1.0).contains(&s), "out of range: {s}");
        });
    }

    // ----- Levenshtein -----

    /// Identical inputs score exactly one.
    #[test]
    fn levenshtein_identical() {
        let score = Scorer::levenshtein_similarity("smith", "smith");
        assert_eq!(score, 1.0);
    }

    /// One substitution in five characters yields 0.8.
    #[test]
    fn levenshtein_one_edit() {
        let expected = 0.8;
        let s = Scorer::levenshtein_similarity("smith", "smyth");
        assert!((s - expected).abs() < 1e-9, "got {s}");
    }

    /// Strings sharing no characters score below one half.
    #[test]
    fn levenshtein_completely_different() {
        let score = Scorer::levenshtein_similarity("abc", "xyz");
        assert!(score < 0.5);
    }

    /// Two empty strings count as a perfect match.
    #[test]
    fn levenshtein_empty_pair_is_one() {
        let score = Scorer::levenshtein_similarity("", "");
        assert_eq!(score, 1.0);
    }

    /// An empty string never matches a non-empty one, in either position.
    #[test]
    fn levenshtein_single_empty_is_zero() {
        for (left, right) in [("smith", ""), ("", "smith")] {
            assert_eq!(Scorer::levenshtein_similarity(left, right), 0.0);
        }
    }

    // ----- Exact match -----

    /// Exact matching is all-or-nothing and case-sensitive.
    #[test]
    fn exact_match_basic() {
        let cases = [
            ("test", "test", 1.0),
            ("test", "Test", 0.0),
            ("test", "other", 0.0),
            ("", "", 1.0),
        ];
        for (left, right, expected) in cases {
            assert_eq!(Scorer::exact_match(left, right), expected);
        }
    }

    // ----- Combined -----

    /// Identical inputs blend to one (within floating-point tolerance).
    #[test]
    fn combined_identical_is_one() {
        let score = Scorer::combined_similarity("smith", "smith");
        assert!((score - 1.0).abs() < 1e-9);
    }

    /// Common spelling variants of a name score above 0.80.
    #[test]
    fn combined_close_typo_is_high() {
        let (left, right) = ("Stephen", "Steven");
        let s = Scorer::combined_similarity(left, right);
        assert!(s > 0.80, "got {s}");
    }

    /// Unrelated names score below one half.
    #[test]
    fn combined_distant_is_low() {
        let score = Scorer::combined_similarity("alice", "zachary");
        assert!(score < 0.5);
    }

    // ----- Optional fields -----

    /// Two missing fields are treated as agreeing.
    #[test]
    fn optional_field_both_none_is_one() {
        let missing: Option<String> = None;
        let score = Scorer::optional_field_score(&missing, &missing, SimilarityAlgorithm::Exact);
        assert_eq!(score, 1.0);
    }

    /// A value on only one side scores zero, whichever side it is on.
    #[test]
    fn optional_field_asymmetric_is_zero() {
        let missing: Option<String> = None;
        let present = Some("x".to_string());
        for (left, right) in [(&present, &missing), (&missing, &present)] {
            let score = Scorer::optional_field_score(left, right, SimilarityAlgorithm::Exact);
            assert_eq!(score, 0.0);
        }
    }

    /// With both values present, the requested algorithm decides the score.
    #[test]
    fn optional_field_some_some_uses_algorithm() {
        let a = Some("smith".to_string());
        let b = Some("smyth".to_string());
        let score_with =
            |algorithm: SimilarityAlgorithm| Scorer::optional_field_score(&a, &b, algorithm);

        assert!(score_with(SimilarityAlgorithm::JaroWinkler) > 0.85);
        assert!(score_with(SimilarityAlgorithm::Levenshtein) >= 0.79);
        assert_eq!(score_with(SimilarityAlgorithm::Exact), 0.0);
        assert!(score_with(SimilarityAlgorithm::Combined) > 0.8);
    }
}