Skip to main content

worker_matcher/
scorer.rs

1//! Scoring algorithms for string similarity and field comparison.
2//!
3//! This module exposes a small, focused set of similarity primitives that
4//! the matching engine composes together. All scores are normalised to the
5//! closed interval `[0.0, 1.0]`, where `1.0` means "identical" and `0.0`
6//! means "no observable similarity".
7//!
8//! ## Algorithm choice
9//!
10//! | Algorithm | Strength | Weakness |
11//! |---|---|---|
12//! | [`SimilarityAlgorithm::JaroWinkler`] | Good for short strings; rewards a common prefix. | Saturates quickly on long strings. |
13//! | [`SimilarityAlgorithm::Levenshtein`] | Cheap to reason about; tracks edit distance. | Sensitive to length differences. |
14//! | [`SimilarityAlgorithm::Exact`]       | Fast; defensible to non-technical reviewers. | No tolerance for typos. |
15//! | [`SimilarityAlgorithm::Combined`]    | Default; balances JW and Levenshtein. | Bespoke weighting (0.7 JW + 0.3 Lev). |
16//!
17//! ## Example
18//!
19//! ```
20//! use worker_matcher::Scorer;
21//!
22//! let same  = Scorer::jaro_winkler_similarity("smith", "smith");
23//! let close = Scorer::jaro_winkler_similarity("smith", "smyth");
24//! let far   = Scorer::jaro_winkler_similarity("smith", "jones");
25//!
26//! assert!(same  > 0.99);
27//! assert!(close > 0.85);
28//! assert!(far   < same);
29//! ```
30
31use strsim::{jaro_winkler, levenshtein};
32
33/// Stateless namespace for string-similarity scorers.
34///
35/// Like [`crate::Normalizer`], `Scorer` is a unit type with no fields;
36/// every method is associated.
37///
38/// ```
39/// use worker_matcher::Scorer;
40/// assert_eq!(Scorer::exact_match("a", "a"), 1.0);
41/// ```
42pub struct Scorer;
43
44impl Scorer {
45    /// Jaro-Winkler similarity, normalised to `[0.0, 1.0]`.
46    ///
47    /// Higher values indicate greater similarity. Strings sharing a common
48    /// prefix score noticeably higher than strings that diverge at the start.
49    ///
50    /// # Edge cases
51    ///
52    /// - Two empty strings → `1.0` (identical).
53    /// - One empty, one not → `0.0`.
54    ///
55    /// # Examples
56    ///
57    /// ```
58    /// use worker_matcher::Scorer;
59    /// assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
60    /// assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
61    /// assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
62    /// assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
63    /// ```
64    pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 {
65        if s1.is_empty() && s2.is_empty() {
66            return 1.0;
67        }
68        if s1.is_empty() || s2.is_empty() {
69            return 0.0;
70        }
71        jaro_winkler(s1, s2)
72    }
73
74    /// Normalised Levenshtein similarity, in `[0.0, 1.0]`.
75    ///
76    /// Computed as `1 - (edit_distance / max_len)`. Higher values indicate
77    /// greater similarity.
78    ///
79    /// # Edge cases
80    ///
81    /// - Two empty strings → `1.0`.
82    /// - One empty, one not → `0.0`.
83    ///
84    /// # Examples
85    ///
86    /// ```
87    /// use worker_matcher::Scorer;
88    /// assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
89    /// assert!(Scorer::levenshtein_similarity("smith", "smyth") >= 0.79);
90    /// assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
91    /// assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
92    /// ```
93    pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
94        if s1.is_empty() && s2.is_empty() {
95            return 1.0;
96        }
97        if s1.is_empty() || s2.is_empty() {
98            return 0.0;
99        }
100
101        let distance = levenshtein(s1, s2);
102        let max_len = s1.len().max(s2.len());
103        1.0 - (distance as f64 / max_len as f64)
104    }
105
106    /// Binary exact-match score: `1.0` if `s1 == s2`, else `0.0`.
107    ///
108    /// Case-sensitive and whitespace-sensitive. Pair with
109    /// [`crate::Normalizer`] when comparing user-entered text.
110    ///
111    /// # Examples
112    ///
113    /// ```
114    /// use worker_matcher::Scorer;
115    /// assert_eq!(Scorer::exact_match("test", "test"), 1.0);
116    /// assert_eq!(Scorer::exact_match("Test", "test"), 0.0);  // case-sensitive
117    /// assert_eq!(Scorer::exact_match("a", "b"),       0.0);
118    /// ```
119    pub fn exact_match(s1: &str, s2: &str) -> f64 {
120        if s1 == s2 { 1.0 } else { 0.0 }
121    }
122
123    /// Weighted combination of Jaro-Winkler (0.7) and Levenshtein (0.3).
124    ///
125    /// Defaults are tuned for workeral names. Jaro-Winkler dominates because
126    /// it handles short-string prefix matches better; Levenshtein contributes
127    /// stability for longer or rearranged inputs.
128    ///
129    /// # Example
130    ///
131    /// ```
132    /// use worker_matcher::Scorer;
133    ///
134    /// let s = Scorer::combined_similarity("Stephen", "Steven");
135    /// assert!(s > 0.80, "combined score for Stephen/Steven was {s}");
136    /// ```
137    pub fn combined_similarity(s1: &str, s2: &str) -> f64 {
138        let jw = Self::jaro_winkler_similarity(s1, s2);
139        let lev = Self::levenshtein_similarity(s1, s2);
140        0.7 * jw + 0.3 * lev
141    }
142
143    /// Score two `Option<String>` fields using the chosen algorithm.
144    ///
145    /// Returns:
146    ///
147    /// - `1.0` if both are `None` (both absent → trivially "match").
148    /// - `0.0` if exactly one is `None` (asymmetric data → "differ").
149    /// - The chosen algorithm's similarity if both are `Some`.
150    ///
151    /// Note: the matching engine intentionally does **not** use this
152    /// helper — it skips fields where either side is absent so that they
153    /// neither contribute nor penalise. This helper is kept for callers
154    /// who want a different policy.
155    ///
156    /// # Examples
157    ///
158    /// ```
159    /// use worker_matcher::{Scorer, SimilarityAlgorithm};
160    ///
161    /// let none: Option<String> = None;
162    /// let a = Some("hello".to_string());
163    /// let b = Some("hello".to_string());
164    ///
165    /// assert_eq!(Scorer::optional_field_score(&none, &none, SimilarityAlgorithm::Exact), 1.0);
166    /// assert_eq!(Scorer::optional_field_score(&a,    &none, SimilarityAlgorithm::Exact), 0.0);
167    /// assert_eq!(Scorer::optional_field_score(&a,    &b,    SimilarityAlgorithm::Exact), 1.0);
168    /// ```
169    pub fn optional_field_score(
170        field1: &Option<String>,
171        field2: &Option<String>,
172        algorithm: SimilarityAlgorithm,
173    ) -> f64 {
174        match (field1, field2) {
175            (None, None) => 1.0,
176            (None, Some(_)) | (Some(_), None) => 0.0,
177            (Some(s1), Some(s2)) => match algorithm {
178                SimilarityAlgorithm::JaroWinkler => Self::jaro_winkler_similarity(s1, s2),
179                SimilarityAlgorithm::Levenshtein => Self::levenshtein_similarity(s1, s2),
180                SimilarityAlgorithm::Exact => Self::exact_match(s1, s2),
181                SimilarityAlgorithm::Combined => Self::combined_similarity(s1, s2),
182            },
183        }
184    }
185}
186
187/// Algorithm selector for name comparison in [`crate::MatchConfig`].
188///
189/// The enum is `Copy`, so it is cheap to embed in a config struct or to
190/// pass through scoring helpers.
191///
192/// ```
193/// use worker_matcher::SimilarityAlgorithm;
194/// let alg = SimilarityAlgorithm::Combined;
195/// let same = alg;          // Copy
196/// assert!(matches!(same, SimilarityAlgorithm::Combined));
197/// ```
198#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
199pub enum SimilarityAlgorithm {
200    /// Jaro-Winkler similarity — favours common prefixes; good for names.
201    JaroWinkler,
202    /// Normalised Levenshtein similarity — tracks edit distance.
203    Levenshtein,
204    /// Exact equality — binary `1.0` / `0.0`.
205    Exact,
206    /// Weighted blend of Jaro-Winkler (0.7) and Levenshtein (0.3). The default.
207    Combined,
208}
209
210#[cfg(test)]
211mod tests {
212    use super::*;
213
214    // ---------- jaro_winkler ----------
215
216    #[test]
217    fn jaro_winkler_identical() {
218        assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
219    }
220
221    #[test]
222    fn jaro_winkler_close_typo() {
223        assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
224    }
225
226    #[test]
227    fn jaro_winkler_distant() {
228        assert!(Scorer::jaro_winkler_similarity("jones", "james") < 0.8);
229    }
230
231    #[test]
232    fn jaro_winkler_empty_pair_is_one() {
233        assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
234    }
235
236    #[test]
237    fn jaro_winkler_single_empty_is_zero() {
238        assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
239        assert_eq!(Scorer::jaro_winkler_similarity("", "smith"), 0.0);
240    }
241
242    #[test]
243    fn jaro_winkler_in_unit_interval() {
244        for (a, b) in [("a", "b"), ("smith", "smyth"), ("abc", "xyz")] {
245            let s = Scorer::jaro_winkler_similarity(a, b);
246            assert!((0.0..=1.0).contains(&s), "out of range: {s}");
247        }
248    }
249
250    // ---------- levenshtein ----------
251
252    #[test]
253    fn levenshtein_identical() {
254        assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
255    }
256
257    #[test]
258    fn levenshtein_one_edit() {
259        // 1 substitution over 5 chars: 1 - 1/5 = 0.8
260        let s = Scorer::levenshtein_similarity("smith", "smyth");
261        assert!((s - 0.8).abs() < 1e-9, "got {s}");
262    }
263
264    #[test]
265    fn levenshtein_completely_different() {
266        assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
267    }
268
269    #[test]
270    fn levenshtein_empty_pair_is_one() {
271        assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
272    }
273
274    #[test]
275    fn levenshtein_single_empty_is_zero() {
276        assert_eq!(Scorer::levenshtein_similarity("smith", ""), 0.0);
277        assert_eq!(Scorer::levenshtein_similarity("", "smith"), 0.0);
278    }
279
280    // ---------- exact ----------
281
282    #[test]
283    fn exact_match_basic() {
284        assert_eq!(Scorer::exact_match("test", "test"), 1.0);
285        assert_eq!(Scorer::exact_match("test", "Test"), 0.0);
286        assert_eq!(Scorer::exact_match("test", "other"), 0.0);
287        assert_eq!(Scorer::exact_match("", ""), 1.0);
288    }
289
290    // ---------- combined ----------
291
292    #[test]
293    fn combined_identical_is_one() {
294        assert!((Scorer::combined_similarity("smith", "smith") - 1.0).abs() < 1e-9);
295    }
296
297    #[test]
298    fn combined_close_typo_is_high() {
299        let s = Scorer::combined_similarity("Stephen", "Steven");
300        assert!(s > 0.80, "got {s}");
301    }
302
303    #[test]
304    fn combined_distant_is_low() {
305        assert!(Scorer::combined_similarity("alice", "zachary") < 0.5);
306    }
307
308    // ---------- optional_field_score ----------
309
310    #[test]
311    fn optional_field_both_none_is_one() {
312        let n: Option<String> = None;
313        assert_eq!(
314            Scorer::optional_field_score(&n, &n, SimilarityAlgorithm::Exact),
315            1.0
316        );
317    }
318
319    #[test]
320    fn optional_field_asymmetric_is_zero() {
321        let n: Option<String> = None;
322        let s = Some("x".to_string());
323        assert_eq!(
324            Scorer::optional_field_score(&s, &n, SimilarityAlgorithm::Exact),
325            0.0
326        );
327        assert_eq!(
328            Scorer::optional_field_score(&n, &s, SimilarityAlgorithm::Exact),
329            0.0
330        );
331    }
332
333    #[test]
334    fn optional_field_some_some_uses_algorithm() {
335        let a = Some("smith".to_string());
336        let b = Some("smyth".to_string());
337        let jw = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::JaroWinkler);
338        let lv = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Levenshtein);
339        let ex = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Exact);
340        let cb = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Combined);
341        assert!(jw > 0.85);
342        assert!(lv >= 0.79);
343        assert_eq!(ex, 0.0);
344        assert!(cb > 0.8);
345    }
346}