worker_matcher/scorer.rs
1//! Scoring algorithms for string similarity and field comparison.
2//!
3//! This module exposes a small, focused set of similarity primitives that
4//! the matching engine composes together. All scores are normalised to the
5//! closed interval `[0.0, 1.0]`, where `1.0` means "identical" and `0.0`
6//! means "no observable similarity".
7//!
8//! ## Algorithm choice
9//!
10//! | Algorithm | Strength | Weakness |
11//! |---|---|---|
12//! | [`SimilarityAlgorithm::JaroWinkler`] | Good for short strings; rewards a common prefix. | Saturates quickly on long strings. |
13//! | [`SimilarityAlgorithm::Levenshtein`] | Cheap to reason about; tracks edit distance. | Sensitive to length differences. |
14//! | [`SimilarityAlgorithm::Exact`] | Fast; defensible to non-technical reviewers. | No tolerance for typos. |
15//! | [`SimilarityAlgorithm::Combined`] | Default; balances JW and Levenshtein. | Bespoke weighting (0.7 JW + 0.3 Lev). |
16//!
17//! ## Example
18//!
19//! ```
20//! use worker_matcher::Scorer;
21//!
22//! let same = Scorer::jaro_winkler_similarity("smith", "smith");
23//! let close = Scorer::jaro_winkler_similarity("smith", "smyth");
24//! let far = Scorer::jaro_winkler_similarity("smith", "jones");
25//!
26//! assert!(same > 0.99);
27//! assert!(close > 0.85);
28//! assert!(far < same);
29//! ```
30
31use strsim::{jaro_winkler, levenshtein};
32
33/// Stateless namespace for string-similarity scorers.
34///
35/// Like [`crate::Normalizer`], `Scorer` is a unit type with no fields;
36/// every method is associated.
37///
38/// ```
39/// use worker_matcher::Scorer;
40/// assert_eq!(Scorer::exact_match("a", "a"), 1.0);
41/// ```
42pub struct Scorer;
43
44impl Scorer {
45 /// Jaro-Winkler similarity, normalised to `[0.0, 1.0]`.
46 ///
47 /// Higher values indicate greater similarity. Strings sharing a common
48 /// prefix score noticeably higher than strings that diverge at the start.
49 ///
50 /// # Edge cases
51 ///
52 /// - Two empty strings → `1.0` (identical).
53 /// - One empty, one not → `0.0`.
54 ///
55 /// # Examples
56 ///
57 /// ```
58 /// use worker_matcher::Scorer;
59 /// assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
60 /// assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
61 /// assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
62 /// assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
63 /// ```
64 pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 {
65 if s1.is_empty() && s2.is_empty() {
66 return 1.0;
67 }
68 if s1.is_empty() || s2.is_empty() {
69 return 0.0;
70 }
71 jaro_winkler(s1, s2)
72 }
73
74 /// Normalised Levenshtein similarity, in `[0.0, 1.0]`.
75 ///
76 /// Computed as `1 - (edit_distance / max_len)`. Higher values indicate
77 /// greater similarity.
78 ///
79 /// # Edge cases
80 ///
81 /// - Two empty strings → `1.0`.
82 /// - One empty, one not → `0.0`.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use worker_matcher::Scorer;
88 /// assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
89 /// assert!(Scorer::levenshtein_similarity("smith", "smyth") >= 0.79);
90 /// assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
91 /// assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
92 /// ```
93 pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
94 if s1.is_empty() && s2.is_empty() {
95 return 1.0;
96 }
97 if s1.is_empty() || s2.is_empty() {
98 return 0.0;
99 }
100
101 let distance = levenshtein(s1, s2);
102 let max_len = s1.len().max(s2.len());
103 1.0 - (distance as f64 / max_len as f64)
104 }
105
106 /// Binary exact-match score: `1.0` if `s1 == s2`, else `0.0`.
107 ///
108 /// Case-sensitive and whitespace-sensitive. Pair with
109 /// [`crate::Normalizer`] when comparing user-entered text.
110 ///
111 /// # Examples
112 ///
113 /// ```
114 /// use worker_matcher::Scorer;
115 /// assert_eq!(Scorer::exact_match("test", "test"), 1.0);
116 /// assert_eq!(Scorer::exact_match("Test", "test"), 0.0); // case-sensitive
117 /// assert_eq!(Scorer::exact_match("a", "b"), 0.0);
118 /// ```
119 pub fn exact_match(s1: &str, s2: &str) -> f64 {
120 if s1 == s2 { 1.0 } else { 0.0 }
121 }
122
123 /// Weighted combination of Jaro-Winkler (0.7) and Levenshtein (0.3).
124 ///
125 /// Defaults are tuned for workeral names. Jaro-Winkler dominates because
126 /// it handles short-string prefix matches better; Levenshtein contributes
127 /// stability for longer or rearranged inputs.
128 ///
129 /// # Example
130 ///
131 /// ```
132 /// use worker_matcher::Scorer;
133 ///
134 /// let s = Scorer::combined_similarity("Stephen", "Steven");
135 /// assert!(s > 0.80, "combined score for Stephen/Steven was {s}");
136 /// ```
137 pub fn combined_similarity(s1: &str, s2: &str) -> f64 {
138 let jw = Self::jaro_winkler_similarity(s1, s2);
139 let lev = Self::levenshtein_similarity(s1, s2);
140 0.7 * jw + 0.3 * lev
141 }
142
143 /// Score two `Option<String>` fields using the chosen algorithm.
144 ///
145 /// Returns:
146 ///
147 /// - `1.0` if both are `None` (both absent → trivially "match").
148 /// - `0.0` if exactly one is `None` (asymmetric data → "differ").
149 /// - The chosen algorithm's similarity if both are `Some`.
150 ///
151 /// Note: the matching engine intentionally does **not** use this
152 /// helper — it skips fields where either side is absent so that they
153 /// neither contribute nor penalise. This helper is kept for callers
154 /// who want a different policy.
155 ///
156 /// # Examples
157 ///
158 /// ```
159 /// use worker_matcher::{Scorer, SimilarityAlgorithm};
160 ///
161 /// let none: Option<String> = None;
162 /// let a = Some("hello".to_string());
163 /// let b = Some("hello".to_string());
164 ///
165 /// assert_eq!(Scorer::optional_field_score(&none, &none, SimilarityAlgorithm::Exact), 1.0);
166 /// assert_eq!(Scorer::optional_field_score(&a, &none, SimilarityAlgorithm::Exact), 0.0);
167 /// assert_eq!(Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Exact), 1.0);
168 /// ```
169 pub fn optional_field_score(
170 field1: &Option<String>,
171 field2: &Option<String>,
172 algorithm: SimilarityAlgorithm,
173 ) -> f64 {
174 match (field1, field2) {
175 (None, None) => 1.0,
176 (None, Some(_)) | (Some(_), None) => 0.0,
177 (Some(s1), Some(s2)) => match algorithm {
178 SimilarityAlgorithm::JaroWinkler => Self::jaro_winkler_similarity(s1, s2),
179 SimilarityAlgorithm::Levenshtein => Self::levenshtein_similarity(s1, s2),
180 SimilarityAlgorithm::Exact => Self::exact_match(s1, s2),
181 SimilarityAlgorithm::Combined => Self::combined_similarity(s1, s2),
182 },
183 }
184 }
185}
186
187/// Algorithm selector for name comparison in [`crate::MatchConfig`].
188///
189/// The enum is `Copy`, so it is cheap to embed in a config struct or to
190/// pass through scoring helpers.
191///
192/// ```
193/// use worker_matcher::SimilarityAlgorithm;
194/// let alg = SimilarityAlgorithm::Combined;
195/// let same = alg; // Copy
196/// assert!(matches!(same, SimilarityAlgorithm::Combined));
197/// ```
198#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
199pub enum SimilarityAlgorithm {
200 /// Jaro-Winkler similarity — favours common prefixes; good for names.
201 JaroWinkler,
202 /// Normalised Levenshtein similarity — tracks edit distance.
203 Levenshtein,
204 /// Exact equality — binary `1.0` / `0.0`.
205 Exact,
206 /// Weighted blend of Jaro-Winkler (0.7) and Levenshtein (0.3). The default.
207 Combined,
208}
209
210#[cfg(test)]
211mod tests {
212 use super::*;
213
214 // ---------- jaro_winkler ----------
215
216 #[test]
217 fn jaro_winkler_identical() {
218 assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
219 }
220
221 #[test]
222 fn jaro_winkler_close_typo() {
223 assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
224 }
225
226 #[test]
227 fn jaro_winkler_distant() {
228 assert!(Scorer::jaro_winkler_similarity("jones", "james") < 0.8);
229 }
230
231 #[test]
232 fn jaro_winkler_empty_pair_is_one() {
233 assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
234 }
235
236 #[test]
237 fn jaro_winkler_single_empty_is_zero() {
238 assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
239 assert_eq!(Scorer::jaro_winkler_similarity("", "smith"), 0.0);
240 }
241
242 #[test]
243 fn jaro_winkler_in_unit_interval() {
244 for (a, b) in [("a", "b"), ("smith", "smyth"), ("abc", "xyz")] {
245 let s = Scorer::jaro_winkler_similarity(a, b);
246 assert!((0.0..=1.0).contains(&s), "out of range: {s}");
247 }
248 }
249
250 // ---------- levenshtein ----------
251
252 #[test]
253 fn levenshtein_identical() {
254 assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
255 }
256
257 #[test]
258 fn levenshtein_one_edit() {
259 // 1 substitution over 5 chars: 1 - 1/5 = 0.8
260 let s = Scorer::levenshtein_similarity("smith", "smyth");
261 assert!((s - 0.8).abs() < 1e-9, "got {s}");
262 }
263
264 #[test]
265 fn levenshtein_completely_different() {
266 assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
267 }
268
269 #[test]
270 fn levenshtein_empty_pair_is_one() {
271 assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
272 }
273
274 #[test]
275 fn levenshtein_single_empty_is_zero() {
276 assert_eq!(Scorer::levenshtein_similarity("smith", ""), 0.0);
277 assert_eq!(Scorer::levenshtein_similarity("", "smith"), 0.0);
278 }
279
280 // ---------- exact ----------
281
282 #[test]
283 fn exact_match_basic() {
284 assert_eq!(Scorer::exact_match("test", "test"), 1.0);
285 assert_eq!(Scorer::exact_match("test", "Test"), 0.0);
286 assert_eq!(Scorer::exact_match("test", "other"), 0.0);
287 assert_eq!(Scorer::exact_match("", ""), 1.0);
288 }
289
290 // ---------- combined ----------
291
292 #[test]
293 fn combined_identical_is_one() {
294 assert!((Scorer::combined_similarity("smith", "smith") - 1.0).abs() < 1e-9);
295 }
296
297 #[test]
298 fn combined_close_typo_is_high() {
299 let s = Scorer::combined_similarity("Stephen", "Steven");
300 assert!(s > 0.80, "got {s}");
301 }
302
303 #[test]
304 fn combined_distant_is_low() {
305 assert!(Scorer::combined_similarity("alice", "zachary") < 0.5);
306 }
307
308 // ---------- optional_field_score ----------
309
310 #[test]
311 fn optional_field_both_none_is_one() {
312 let n: Option<String> = None;
313 assert_eq!(
314 Scorer::optional_field_score(&n, &n, SimilarityAlgorithm::Exact),
315 1.0
316 );
317 }
318
319 #[test]
320 fn optional_field_asymmetric_is_zero() {
321 let n: Option<String> = None;
322 let s = Some("x".to_string());
323 assert_eq!(
324 Scorer::optional_field_score(&s, &n, SimilarityAlgorithm::Exact),
325 0.0
326 );
327 assert_eq!(
328 Scorer::optional_field_score(&n, &s, SimilarityAlgorithm::Exact),
329 0.0
330 );
331 }
332
333 #[test]
334 fn optional_field_some_some_uses_algorithm() {
335 let a = Some("smith".to_string());
336 let b = Some("smyth".to_string());
337 let jw = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::JaroWinkler);
338 let lv = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Levenshtein);
339 let ex = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Exact);
340 let cb = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Combined);
341 assert!(jw > 0.85);
342 assert!(lv >= 0.79);
343 assert_eq!(ex, 0.0);
344 assert!(cb > 0.8);
345 }
346}