thing_matcher/scorer.rs
1//! Scoring algorithms for string similarity and field comparison.
2//!
3//! This module exposes a small, focused set of similarity primitives that
4//! the matching engine composes together. All scores are normalised to the
5//! closed interval `[0.0, 1.0]`, where `1.0` means "identical" and `0.0`
6//! means "no observable similarity".
7//!
8//! ## Algorithm choice
9//!
10//! | Algorithm | Strength | Weakness |
11//! |---|---|---|
12//! | [`SimilarityAlgorithm::JaroWinkler`] | Good for short strings; rewards a common prefix. | Saturates quickly on long strings. |
13//! | [`SimilarityAlgorithm::Levenshtein`] | Cheap to reason about; tracks edit distance. | Sensitive to length differences. |
14//! | [`SimilarityAlgorithm::Exact`] | Fast; defensible to non-technical reviewers. | No tolerance for typos. |
15//! | [`SimilarityAlgorithm::Combined`] | Default; balances JW and Levenshtein. | Bespoke weighting (0.7 JW + 0.3 Lev). |
16//!
17//! ## Example
18//!
19//! ```
20//! use thing_matcher::Scorer;
21//!
22//! let same = Scorer::jaro_winkler_similarity("smith", "smith");
23//! let close = Scorer::jaro_winkler_similarity("smith", "smyth");
24//! let far = Scorer::jaro_winkler_similarity("smith", "jones");
25//!
26//! assert!(same > 0.99);
27//! assert!(close > 0.85);
28//! assert!(far < same);
29//! ```
30
31use strsim::{jaro_winkler, levenshtein};
32
33/// Stateless namespace for string-similarity scorers.
34///
35/// Like [`crate::Normalizer`], `Scorer` is a unit type with no fields;
36/// every method is associated.
37///
38/// ```
39/// use thing_matcher::Scorer;
40/// assert_eq!(Scorer::exact_match("a", "a"), 1.0);
41/// ```
42pub struct Scorer;
43
44impl Scorer {
45 /// Jaro-Winkler similarity, normalised to `[0.0, 1.0]`.
46 ///
47 /// Higher values indicate greater similarity. Strings sharing a common
48 /// prefix score noticeably higher than strings that diverge at the start.
49 ///
50 /// # Edge cases
51 ///
52 /// - Two empty strings → `1.0` (identical).
53 /// - One empty, one not → `0.0`.
54 ///
55 /// # Examples
56 ///
57 /// ```
58 /// use thing_matcher::Scorer;
59 /// assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
60 /// assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
61 /// assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
62 /// assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
63 /// ```
64 pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 {
65 if s1.is_empty() && s2.is_empty() {
66 return 1.0;
67 }
68 if s1.is_empty() || s2.is_empty() {
69 return 0.0;
70 }
71 jaro_winkler(s1, s2)
72 }
73
74 /// Normalised Levenshtein similarity, in `[0.0, 1.0]`.
75 ///
76 /// Computed as `1 - (edit_distance / max_len)`. Higher values indicate
77 /// greater similarity.
78 ///
79 /// # Edge cases
80 ///
81 /// - Two empty strings → `1.0`.
82 /// - One empty, one not → `0.0`.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use thing_matcher::Scorer;
88 /// assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
89 /// assert!(Scorer::levenshtein_similarity("smith", "smyth") >= 0.79);
90 /// assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
91 /// assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
92 /// ```
93 pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
94 if s1.is_empty() && s2.is_empty() {
95 return 1.0;
96 }
97 if s1.is_empty() || s2.is_empty() {
98 return 0.0;
99 }
100
101 let distance = levenshtein(s1, s2);
102 let max_len = s1.len().max(s2.len());
103 1.0 - (distance as f64 / max_len as f64)
104 }
105
106 /// Binary exact-match score: `1.0` if `s1 == s2`, else `0.0`.
107 ///
108 /// Case-sensitive and whitespace-sensitive. Pair with
109 /// [`crate::Normalizer`] when comparing user-entered text.
110 ///
111 /// # Examples
112 ///
113 /// ```
114 /// use thing_matcher::Scorer;
115 /// assert_eq!(Scorer::exact_match("test", "test"), 1.0);
116 /// assert_eq!(Scorer::exact_match("Test", "test"), 0.0); // case-sensitive
117 /// assert_eq!(Scorer::exact_match("a", "b"), 0.0);
118 /// ```
119 pub fn exact_match(s1: &str, s2: &str) -> f64 {
120 if s1 == s2 { 1.0 } else { 0.0 }
121 }
122
123 /// Weighted combination of Jaro-Winkler (0.7) and Levenshtein (0.3).
124 ///
125 /// Defaults are tuned for short names. Jaro-Winkler dominates because
126 /// it handles short-string prefix matches better; Levenshtein contributes
127 /// stability for longer or rearranged inputs.
128 ///
129 /// # Example
130 ///
131 /// ```
132 /// use thing_matcher::Scorer;
133 ///
134 /// let s = Scorer::combined_similarity("Stephen", "Steven");
135 /// assert!(s > 0.80, "combined score for Stephen/Steven was {s}");
136 /// ```
137 pub fn combined_similarity(s1: &str, s2: &str) -> f64 {
138 let jw = Self::jaro_winkler_similarity(s1, s2);
139 let lev = Self::levenshtein_similarity(s1, s2);
140 0.7 * jw + 0.3 * lev
141 }
142
143 /// Jaccard similarity for two collections of strings, computed as
144 /// `|intersection| / |union|`. Returns `1.0` if both collections are
145 /// empty, `0.0` if exactly one is empty.
146 ///
147 /// Useful for `sameAs`, `additionalType`, and `subjectOf` set
148 /// comparison. The caller is responsible for pre-normalising each
149 /// element (e.g. via [`crate::Normalizer::normalize_url`]).
150 ///
151 /// # Examples
152 ///
153 /// ```
154 /// use thing_matcher::Scorer;
155 /// let a = ["x", "y", "z"];
156 /// let b = ["y", "z", "w"];
157 /// // intersection = {y, z}, union = {x, y, z, w}
158 /// let s = Scorer::jaccard_set_similarity(&a, &b);
159 /// assert!((s - 0.5).abs() < 1e-9);
160 /// ```
161 pub fn jaccard_set_similarity<S, T>(a: &[S], b: &[T]) -> f64
162 where
163 S: AsRef<str>,
164 T: AsRef<str>,
165 {
166 if a.is_empty() && b.is_empty() {
167 return 1.0;
168 }
169 if a.is_empty() || b.is_empty() {
170 return 0.0;
171 }
172 let a_set: std::collections::BTreeSet<&str> = a.iter().map(|s| s.as_ref()).collect();
173 let b_set: std::collections::BTreeSet<&str> = b.iter().map(|s| s.as_ref()).collect();
174 let intersection = a_set.intersection(&b_set).count();
175 let union = a_set.union(&b_set).count();
176 if union == 0 {
177 return 0.0;
178 }
179 intersection as f64 / union as f64
180 }
181
182 /// Score two `Option<String>` fields using the chosen algorithm.
183 ///
184 /// Returns:
185 ///
186 /// - `1.0` if both are `None` (both absent → trivially "match").
187 /// - `0.0` if exactly one is `None` (asymmetric data → "differ").
188 /// - The chosen algorithm's similarity if both are `Some`.
189 ///
190 /// Note: the matching engine intentionally does **not** use this
191 /// helper — it skips fields where either side is absent so that they
192 /// neither contribute nor penalise. This helper is kept for callers
193 /// who want a different policy.
194 ///
195 /// # Examples
196 ///
197 /// ```
198 /// use thing_matcher::{Scorer, SimilarityAlgorithm};
199 ///
200 /// let none: Option<String> = None;
201 /// let a = Some("hello".to_string());
202 /// let b = Some("hello".to_string());
203 ///
204 /// assert_eq!(Scorer::optional_field_score(&none, &none, SimilarityAlgorithm::Exact), 1.0);
205 /// assert_eq!(Scorer::optional_field_score(&a, &none, SimilarityAlgorithm::Exact), 0.0);
206 /// assert_eq!(Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Exact), 1.0);
207 /// ```
208 pub fn optional_field_score(
209 field1: &Option<String>,
210 field2: &Option<String>,
211 algorithm: SimilarityAlgorithm,
212 ) -> f64 {
213 match (field1, field2) {
214 (None, None) => 1.0,
215 (None, Some(_)) | (Some(_), None) => 0.0,
216 (Some(s1), Some(s2)) => match algorithm {
217 SimilarityAlgorithm::JaroWinkler => Self::jaro_winkler_similarity(s1, s2),
218 SimilarityAlgorithm::Levenshtein => Self::levenshtein_similarity(s1, s2),
219 SimilarityAlgorithm::Exact => Self::exact_match(s1, s2),
220 SimilarityAlgorithm::Combined => Self::combined_similarity(s1, s2),
221 },
222 }
223 }
224}
225
226/// Algorithm selector for name comparison in [`crate::MatchConfig`].
227///
228/// The enum is `Copy`, so it is cheap to embed in a config struct or to
229/// pass through scoring helpers.
230///
231/// ```
232/// use thing_matcher::SimilarityAlgorithm;
233/// let alg = SimilarityAlgorithm::Combined;
234/// let same = alg; // Copy
235/// assert!(matches!(same, SimilarityAlgorithm::Combined));
236/// ```
237#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
238pub enum SimilarityAlgorithm {
239 /// Jaro-Winkler similarity — favours common prefixes; good for names.
240 JaroWinkler,
241 /// Normalised Levenshtein similarity — tracks edit distance.
242 Levenshtein,
243 /// Exact equality — binary `1.0` / `0.0`.
244 Exact,
245 /// Weighted blend of Jaro-Winkler (0.7) and Levenshtein (0.3). The default.
246 Combined,
247}
248
249#[cfg(test)]
250mod tests {
251 use super::*;
252
253 // ---------- jaro_winkler ----------
254
255 #[test]
256 fn jaro_winkler_identical() {
257 assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
258 }
259
260 #[test]
261 fn jaro_winkler_close_typo() {
262 assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
263 }
264
265 #[test]
266 fn jaro_winkler_distant() {
267 assert!(Scorer::jaro_winkler_similarity("jones", "james") < 0.8);
268 }
269
270 #[test]
271 fn jaro_winkler_empty_pair_is_one() {
272 assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
273 }
274
275 #[test]
276 fn jaro_winkler_single_empty_is_zero() {
277 assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
278 assert_eq!(Scorer::jaro_winkler_similarity("", "smith"), 0.0);
279 }
280
281 #[test]
282 fn jaro_winkler_in_unit_interval() {
283 for (a, b) in [("a", "b"), ("smith", "smyth"), ("abc", "xyz")] {
284 let s = Scorer::jaro_winkler_similarity(a, b);
285 assert!((0.0..=1.0).contains(&s), "out of range: {s}");
286 }
287 }
288
289 // ---------- levenshtein ----------
290
291 #[test]
292 fn levenshtein_identical() {
293 assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
294 }
295
296 #[test]
297 fn levenshtein_one_edit() {
298 // 1 substitution over 5 chars: 1 - 1/5 = 0.8
299 let s = Scorer::levenshtein_similarity("smith", "smyth");
300 assert!((s - 0.8).abs() < 1e-9, "got {s}");
301 }
302
303 #[test]
304 fn levenshtein_completely_different() {
305 assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
306 }
307
308 #[test]
309 fn levenshtein_empty_pair_is_one() {
310 assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
311 }
312
313 #[test]
314 fn levenshtein_single_empty_is_zero() {
315 assert_eq!(Scorer::levenshtein_similarity("smith", ""), 0.0);
316 assert_eq!(Scorer::levenshtein_similarity("", "smith"), 0.0);
317 }
318
319 // ---------- exact ----------
320
321 #[test]
322 fn exact_match_basic() {
323 assert_eq!(Scorer::exact_match("test", "test"), 1.0);
324 assert_eq!(Scorer::exact_match("test", "Test"), 0.0);
325 assert_eq!(Scorer::exact_match("test", "other"), 0.0);
326 assert_eq!(Scorer::exact_match("", ""), 1.0);
327 }
328
329 // ---------- combined ----------
330
331 #[test]
332 fn combined_identical_is_one() {
333 assert!((Scorer::combined_similarity("smith", "smith") - 1.0).abs() < 1e-9);
334 }
335
336 #[test]
337 fn combined_close_typo_is_high() {
338 let s = Scorer::combined_similarity("Stephen", "Steven");
339 assert!(s > 0.80, "got {s}");
340 }
341
342 #[test]
343 fn combined_distant_is_low() {
344 assert!(Scorer::combined_similarity("alice", "zachary") < 0.5);
345 }
346
347 // ---------- jaccard ----------
348
349 #[test]
350 fn jaccard_identical_sets_is_one() {
351 let a = ["x", "y"];
352 let b = ["y", "x"];
353 assert!((Scorer::jaccard_set_similarity(&a, &b) - 1.0).abs() < 1e-9);
354 }
355
356 #[test]
357 fn jaccard_disjoint_sets_is_zero() {
358 let a = ["x", "y"];
359 let b = ["a", "b"];
360 assert!(Scorer::jaccard_set_similarity(&a, &b) < 1e-9);
361 }
362
363 #[test]
364 fn jaccard_both_empty_is_one() {
365 let a: [&str; 0] = [];
366 let b: [&str; 0] = [];
367 assert_eq!(Scorer::jaccard_set_similarity(&a, &b), 1.0);
368 }
369
370 #[test]
371 fn jaccard_one_empty_is_zero() {
372 let a = ["x"];
373 let b: [&str; 0] = [];
374 assert_eq!(Scorer::jaccard_set_similarity(&a, &b), 0.0);
375 assert_eq!(Scorer::jaccard_set_similarity(&b, &a), 0.0);
376 }
377
378 #[test]
379 fn jaccard_partial_overlap_is_intersection_over_union() {
380 let a = ["x", "y", "z"];
381 let b = ["y", "z", "w"];
382 let s = Scorer::jaccard_set_similarity(&a, &b);
383 assert!((s - 0.5).abs() < 1e-9, "got {s}");
384 }
385
386 // ---------- optional_field_score ----------
387
388 #[test]
389 fn optional_field_both_none_is_one() {
390 let n: Option<String> = None;
391 assert_eq!(
392 Scorer::optional_field_score(&n, &n, SimilarityAlgorithm::Exact),
393 1.0
394 );
395 }
396
397 #[test]
398 fn optional_field_asymmetric_is_zero() {
399 let n: Option<String> = None;
400 let s = Some("x".to_string());
401 assert_eq!(
402 Scorer::optional_field_score(&s, &n, SimilarityAlgorithm::Exact),
403 0.0
404 );
405 assert_eq!(
406 Scorer::optional_field_score(&n, &s, SimilarityAlgorithm::Exact),
407 0.0
408 );
409 }
410
411 #[test]
412 fn optional_field_some_some_uses_algorithm() {
413 let a = Some("smith".to_string());
414 let b = Some("smyth".to_string());
415 let jw = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::JaroWinkler);
416 let lv = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Levenshtein);
417 let ex = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Exact);
418 let cb = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Combined);
419 assert!(jw > 0.85);
420 assert!(lv >= 0.79);
421 assert_eq!(ex, 0.0);
422 assert!(cb > 0.8);
423 }
424}