use strsim::{jaro_winkler, levenshtein};
use crate::normalizer::Normalizer;
/// Field-less namespace type for the scoring functions in its `impl` block.
///
/// Every function on `Scorer` is an associated function (none takes `self`),
/// so the type is never instantiated by the code in this file.
pub struct Scorer;
impl Scorer {
    /// Jaro-Winkler similarity of two strings.
    ///
    /// Two empty strings score `1.0` and exactly one empty string scores
    /// `0.0`; every other pair is scored by `strsim::jaro_winkler`.
    #[must_use]
    pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 {
        if s1.is_empty() && s2.is_empty() {
            return 1.0;
        }
        if s1.is_empty() || s2.is_empty() {
            return 0.0;
        }
        jaro_winkler(s1, s2)
    }

    /// Levenshtein similarity, `1.0 - distance / max_len`.
    ///
    /// `max_len` is the larger of the two strings' lengths measured in
    /// `char`s. Two empty strings score `1.0` and exactly one empty string
    /// scores `0.0`.
    #[must_use]
    // Converting a string length to `f64` only loses precision above 2^53.
    #[allow(clippy::cast_precision_loss)]
    pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
        if s1.is_empty() && s2.is_empty() {
            return 1.0;
        }
        if s1.is_empty() || s2.is_empty() {
            return 0.0;
        }
        let distance = levenshtein(s1, s2);
        // `strsim::levenshtein` counts edits between `char`s, so the divisor
        // must be a `char` count as well. Dividing by the UTF-8 byte length
        // would inflate the score of any string with multi-byte characters.
        let max_len = s1.chars().count().max(s2.chars().count());
        1.0 - (distance as f64 / max_len as f64)
    }

    /// Returns `1.0` when the strings are equal (case-sensitive `==`),
    /// otherwise `0.0`.
    #[must_use]
    pub fn exact_match(s1: &str, s2: &str) -> f64 {
        if s1 == s2 {
            1.0
        } else {
            0.0
        }
    }

    /// Weighted blend of the two fuzzy scores:
    /// `0.7 * jaro_winkler_similarity + 0.3 * levenshtein_similarity`.
    #[must_use]
    pub fn combined_similarity(s1: &str, s2: &str) -> f64 {
        let jw = Self::jaro_winkler_similarity(s1, s2);
        let lev = Self::levenshtein_similarity(s1, s2);
        0.7 * jw + 0.3 * lev
    }

    /// Great-circle distance in metres between two points, computed with the
    /// haversine formula on a sphere of radius 6 371 000 m.
    ///
    /// All four arguments are angles in degrees (they are converted with
    /// `f64::to_radians`): `lat1`/`lon1` for the first point and
    /// `lat2`/`lon2` for the second.
    #[must_use]
    pub fn haversine_metres(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> f64 {
        const EARTH_RADIUS_M: f64 = 6_371_000.0;
        let phi1 = lat1.to_radians();
        let phi2 = lat2.to_radians();
        let dphi = (lat2 - lat1).to_radians();
        let dlambda = (lon2 - lon1).to_radians();
        let a =
            (dphi / 2.0).sin().powi(2) + phi1.cos() * phi2.cos() * (dlambda / 2.0).sin().powi(2);
        // Clamp before `asin`: rounding can push `sqrt(a)` just above 1.0,
        // where `asin` would return NaN.
        let c = 2.0 * a.sqrt().clamp(0.0, 1.0).asin();
        EARTH_RADIUS_M * c
    }

    /// Maps a distance to a score in `[0, 1]` via `gaussian_decay`:
    /// `exp(-(distance_metres / scale_metres)^2)`.
    ///
    /// Returns `0.0` when either argument is non-finite, the distance is
    /// negative, or the scale is not strictly positive.
    #[must_use]
    pub fn coordinates_score(distance_metres: f64, scale_metres: f64) -> f64 {
        gaussian_decay(distance_metres, scale_metres)
    }

    /// Absolute difference between two timestamps, each parsed with
    /// `Normalizer::parse_iso8601_unix_seconds`.
    ///
    /// Returns `None` when either input fails to parse, or when the
    /// difference cannot be represented as an `i64`.
    #[must_use]
    pub fn seconds_between(t1: &str, t2: &str) -> Option<i64> {
        let s1 = Normalizer::parse_iso8601_unix_seconds(t1)?;
        let s2 = Normalizer::parse_iso8601_unix_seconds(t2)?;
        // Checked arithmetic: extreme parsed values yield `None` instead of
        // panicking (debug builds) or wrapping (release builds).
        s1.checked_sub(s2)?.checked_abs()
    }

    /// Maps a time difference to a score in `[0, 1]` via `gaussian_decay`:
    /// `exp(-(difference_seconds / scale_seconds)^2)`.
    ///
    /// Returns `0.0` when either argument is non-finite, the difference is
    /// negative, or the scale is not strictly positive.
    #[must_use]
    pub fn start_date_score(difference_seconds: f64, scale_seconds: f64) -> f64 {
        gaussian_decay(difference_seconds, scale_seconds)
    }

    /// Scores a pair of optional string fields.
    ///
    /// * both `None` -> `1.0`
    /// * exactly one `None` -> `0.0`
    /// * both `Some` -> the similarity function selected by `algorithm`
    #[must_use]
    pub fn optional_field_score(
        field1: &Option<String>,
        field2: &Option<String>,
        algorithm: SimilarityAlgorithm,
    ) -> f64 {
        match (field1, field2) {
            (None, None) => 1.0,
            (None, Some(_)) | (Some(_), None) => 0.0,
            (Some(s1), Some(s2)) => match algorithm {
                SimilarityAlgorithm::JaroWinkler => Self::jaro_winkler_similarity(s1, s2),
                SimilarityAlgorithm::Levenshtein => Self::levenshtein_similarity(s1, s2),
                SimilarityAlgorithm::Exact => Self::exact_match(s1, s2),
                SimilarityAlgorithm::Combined => Self::combined_similarity(s1, s2),
            },
        }
    }
}
/// Gaussian-shaped decay `exp(-(distance / scale)^2)`, limited to `[0, 1]`.
///
/// Yields `1.0` at zero distance and `1/e` when `distance == scale`.
/// Inputs that make no sense for a decay (a non-finite value, a negative
/// distance, or a scale that is not strictly positive) yield `0.0`.
fn gaussian_decay(distance: f64, scale: f64) -> f64 {
    let inputs_usable =
        distance.is_finite() && scale.is_finite() && scale > 0.0 && distance >= 0.0;
    if !inputs_usable {
        return 0.0;
    }

    let normalised = distance / scale;
    (-(normalised * normalised)).exp().clamp(0.0, 1.0)
}
/// Selects the string-similarity function that
/// `Scorer::optional_field_score` applies when both fields are present.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum SimilarityAlgorithm {
/// Score with `Scorer::jaro_winkler_similarity`.
JaroWinkler,
/// Score with `Scorer::levenshtein_similarity`.
Levenshtein,
/// Score with `Scorer::exact_match` (`1.0` if equal, else `0.0`).
Exact,
/// Score with `Scorer::combined_similarity` (a weighted blend of the
/// Jaro-Winkler and Levenshtein scores).
Combined,
}
#[cfg(test)]
#[allow(clippy::float_cmp)]
mod tests {
    use super::*;

    // Signature shared by the two decay-based scores.
    type DecayScore = fn(f64, f64) -> f64;

    // Shape of the decay curve: 1 at zero, 1/e at one scale, near 0 at three.
    fn assert_decay_contract(score: DecayScore, scale: f64) {
        assert!((score(0.0, scale) - 1.0).abs() < 1e-12);
        let one_e = score(scale, scale);
        assert!((one_e - (1.0_f64 / std::f64::consts::E)).abs() < 1e-12);
        let far = score(3.0 * scale, scale);
        assert!(far < 1e-3);
    }

    // NaN input, zero scale and negative input must all score exactly zero.
    fn assert_rejects_pathological(score: DecayScore, scale: f64) {
        assert_eq!(score(f64::NAN, scale), 0.0);
        assert_eq!(score(10.0, 0.0), 0.0);
        assert_eq!(score(-1.0, scale), 0.0);
    }

    // --- Jaro-Winkler ---

    #[test]
    fn jaro_winkler_identical() {
        let score = Scorer::jaro_winkler_similarity("smith", "smith");
        assert!(score > 0.99);
    }

    #[test]
    fn jaro_winkler_close_typo() {
        let score = Scorer::jaro_winkler_similarity("smith", "smyth");
        assert!(score > 0.85);
    }

    #[test]
    fn jaro_winkler_distant() {
        let score = Scorer::jaro_winkler_similarity("jones", "james");
        assert!(score < 0.8);
    }

    #[test]
    fn jaro_winkler_empty_pair_is_one() {
        assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
    }

    #[test]
    fn jaro_winkler_single_empty_is_zero() {
        for (left, right) in [("smith", ""), ("", "smith")] {
            assert_eq!(Scorer::jaro_winkler_similarity(left, right), 0.0);
        }
    }

    // --- Levenshtein ---

    #[test]
    fn levenshtein_identical() {
        assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
    }

    #[test]
    fn levenshtein_one_edit() {
        // One substitution across five characters.
        let s = Scorer::levenshtein_similarity("smith", "smyth");
        assert!((s - 0.8).abs() < 1e-9, "got {s}");
    }

    #[test]
    fn levenshtein_empty_pair_is_one() {
        assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
    }

    // --- Exact and combined ---

    #[test]
    fn exact_match_basic() {
        let cases = [
            ("test", "test", 1.0),
            ("test", "Test", 0.0),
            ("test", "other", 0.0),
            ("", "", 1.0),
        ];
        for (left, right, expected) in cases {
            assert_eq!(Scorer::exact_match(left, right), expected);
        }
    }

    #[test]
    fn combined_identical_is_one() {
        let score = Scorer::combined_similarity("smith", "smith");
        assert!((score - 1.0).abs() < 1e-9);
    }

    #[test]
    fn combined_close_typo_is_high() {
        let s = Scorer::combined_similarity("Stephen", "Steven");
        assert!(s > 0.80, "got {s}");
    }

    // --- Haversine distance and coordinate score ---

    #[test]
    fn haversine_identical_is_zero() {
        let metres = Scorer::haversine_metres(51.5, -0.12, 51.5, -0.12);
        assert!(metres.abs() < 1e-6);
    }

    #[test]
    fn haversine_london_paris_about_343km() {
        let d = Scorer::haversine_metres(51.507_22, -0.127_5, 48.853_0, 2.349_2) / 1000.0;
        assert!(d > 330.0 && d < 355.0, "got {d}");
    }

    #[test]
    fn coordinates_score_contract() {
        assert_decay_contract(Scorer::coordinates_score, 50.0);
    }

    #[test]
    fn coordinates_score_rejects_pathological_inputs() {
        assert_rejects_pathological(Scorer::coordinates_score, 50.0);
    }

    // --- Timestamp difference and start-date score ---

    #[test]
    fn seconds_between_identical_is_zero() {
        let stamp = "2024-06-26T09:00:00Z";
        assert_eq!(Scorer::seconds_between(stamp, stamp), Some(0));
    }

    #[test]
    fn seconds_between_one_hour() {
        let gap = Scorer::seconds_between("2024-06-26T09:00:00Z", "2024-06-26T10:00:00Z");
        assert_eq!(gap, Some(3600));
    }

    #[test]
    fn seconds_between_is_symmetric_absolute() {
        let (earlier, later) = ("2024-06-26T09:00:00Z", "2024-06-26T10:00:00Z");
        let forward = Scorer::seconds_between(earlier, later);
        let backward = Scorer::seconds_between(later, earlier);
        assert_eq!(forward, backward);
    }

    #[test]
    fn seconds_between_one_day() {
        let gap = Scorer::seconds_between("2024-06-26", "2024-06-27");
        assert_eq!(gap, Some(86_400));
    }

    #[test]
    fn seconds_between_rejects_garbage() {
        let bad_pairs = [
            ("not-a-date", "2024-06-26"),
            ("2024-06-26", "also-not-a-date"),
        ];
        for (first, second) in bad_pairs {
            assert!(Scorer::seconds_between(first, second).is_none());
        }
    }

    #[test]
    fn start_date_score_contract() {
        assert_decay_contract(Scorer::start_date_score, 3600.0);
    }

    #[test]
    fn start_date_score_rejects_pathological_inputs() {
        assert_rejects_pathological(Scorer::start_date_score, 3600.0);
    }

    // --- Optional fields ---

    #[test]
    fn optional_field_both_none_is_one() {
        let absent: Option<String> = None;
        let score = Scorer::optional_field_score(&absent, &absent, SimilarityAlgorithm::Exact);
        assert_eq!(score, 1.0);
    }

    #[test]
    fn optional_field_asymmetric_is_zero() {
        let absent: Option<String> = None;
        let present = Some("x".to_string());
        let score = Scorer::optional_field_score(&present, &absent, SimilarityAlgorithm::Exact);
        assert_eq!(score, 0.0);
    }
}