Skip to main content

thing_matcher/
scorer.rs

1//! Scoring algorithms for string similarity and field comparison.
2//!
3//! This module exposes a small, focused set of similarity primitives that
4//! the matching engine composes together. All scores are normalised to the
5//! closed interval `[0.0, 1.0]`, where `1.0` means "identical" and `0.0`
6//! means "no observable similarity".
7//!
8//! ## Algorithm choice
9//!
10//! | Algorithm | Strength | Weakness |
11//! |---|---|---|
12//! | [`SimilarityAlgorithm::JaroWinkler`] | Good for short strings; rewards a common prefix. | Saturates quickly on long strings. |
13//! | [`SimilarityAlgorithm::Levenshtein`] | Cheap to reason about; tracks edit distance. | Sensitive to length differences. |
14//! | [`SimilarityAlgorithm::Exact`]       | Fast; defensible to non-technical reviewers. | No tolerance for typos. |
15//! | [`SimilarityAlgorithm::Combined`]    | Default; balances JW and Levenshtein. | Bespoke weighting (0.7 JW + 0.3 Lev). |
16//!
17//! ## Example
18//!
19//! ```
20//! use thing_matcher::Scorer;
21//!
22//! let same  = Scorer::jaro_winkler_similarity("smith", "smith");
23//! let close = Scorer::jaro_winkler_similarity("smith", "smyth");
24//! let far   = Scorer::jaro_winkler_similarity("smith", "jones");
25//!
26//! assert!(same  > 0.99);
27//! assert!(close > 0.85);
28//! assert!(far   < same);
29//! ```
30
31use strsim::{jaro_winkler, levenshtein};
32
33/// Stateless namespace for string-similarity scorers.
34///
35/// Like [`crate::Normalizer`], `Scorer` is a unit type with no fields;
36/// every method is associated.
37///
38/// ```
39/// use thing_matcher::Scorer;
40/// assert_eq!(Scorer::exact_match("a", "a"), 1.0);
41/// ```
42pub struct Scorer;
43
44impl Scorer {
45    /// Jaro-Winkler similarity, normalised to `[0.0, 1.0]`.
46    ///
47    /// Higher values indicate greater similarity. Strings sharing a common
48    /// prefix score noticeably higher than strings that diverge at the start.
49    ///
50    /// # Edge cases
51    ///
52    /// - Two empty strings → `1.0` (identical).
53    /// - One empty, one not → `0.0`.
54    ///
55    /// # Examples
56    ///
57    /// ```
58    /// use thing_matcher::Scorer;
59    /// assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
60    /// assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
61    /// assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
62    /// assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
63    /// ```
64    pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 {
65        if s1.is_empty() && s2.is_empty() {
66            return 1.0;
67        }
68        if s1.is_empty() || s2.is_empty() {
69            return 0.0;
70        }
71        jaro_winkler(s1, s2)
72    }
73
74    /// Normalised Levenshtein similarity, in `[0.0, 1.0]`.
75    ///
76    /// Computed as `1 - (edit_distance / max_len)`. Higher values indicate
77    /// greater similarity.
78    ///
79    /// # Edge cases
80    ///
81    /// - Two empty strings → `1.0`.
82    /// - One empty, one not → `0.0`.
83    ///
84    /// # Examples
85    ///
86    /// ```
87    /// use thing_matcher::Scorer;
88    /// assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
89    /// assert!(Scorer::levenshtein_similarity("smith", "smyth") >= 0.79);
90    /// assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
91    /// assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
92    /// ```
93    pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
94        if s1.is_empty() && s2.is_empty() {
95            return 1.0;
96        }
97        if s1.is_empty() || s2.is_empty() {
98            return 0.0;
99        }
100
101        let distance = levenshtein(s1, s2);
102        let max_len = s1.len().max(s2.len());
103        1.0 - (distance as f64 / max_len as f64)
104    }
105
106    /// Binary exact-match score: `1.0` if `s1 == s2`, else `0.0`.
107    ///
108    /// Case-sensitive and whitespace-sensitive. Pair with
109    /// [`crate::Normalizer`] when comparing user-entered text.
110    ///
111    /// # Examples
112    ///
113    /// ```
114    /// use thing_matcher::Scorer;
115    /// assert_eq!(Scorer::exact_match("test", "test"), 1.0);
116    /// assert_eq!(Scorer::exact_match("Test", "test"), 0.0);  // case-sensitive
117    /// assert_eq!(Scorer::exact_match("a", "b"),       0.0);
118    /// ```
119    pub fn exact_match(s1: &str, s2: &str) -> f64 {
120        if s1 == s2 { 1.0 } else { 0.0 }
121    }
122
123    /// Weighted combination of Jaro-Winkler (0.7) and Levenshtein (0.3).
124    ///
125    /// Defaults are tuned for short names. Jaro-Winkler dominates because
126    /// it handles short-string prefix matches better; Levenshtein contributes
127    /// stability for longer or rearranged inputs.
128    ///
129    /// # Example
130    ///
131    /// ```
132    /// use thing_matcher::Scorer;
133    ///
134    /// let s = Scorer::combined_similarity("Stephen", "Steven");
135    /// assert!(s > 0.80, "combined score for Stephen/Steven was {s}");
136    /// ```
137    pub fn combined_similarity(s1: &str, s2: &str) -> f64 {
138        let jw = Self::jaro_winkler_similarity(s1, s2);
139        let lev = Self::levenshtein_similarity(s1, s2);
140        0.7 * jw + 0.3 * lev
141    }
142
143    /// Jaccard similarity for two collections of strings, computed as
144    /// `|intersection| / |union|`. Returns `1.0` if both collections are
145    /// empty, `0.0` if exactly one is empty.
146    ///
147    /// Useful for `sameAs`, `additionalType`, and `subjectOf` set
148    /// comparison. The caller is responsible for pre-normalising each
149    /// element (e.g. via [`crate::Normalizer::normalize_url`]).
150    ///
151    /// # Examples
152    ///
153    /// ```
154    /// use thing_matcher::Scorer;
155    /// let a = ["x", "y", "z"];
156    /// let b = ["y", "z", "w"];
157    /// // intersection = {y, z}, union = {x, y, z, w}
158    /// let s = Scorer::jaccard_set_similarity(&a, &b);
159    /// assert!((s - 0.5).abs() < 1e-9);
160    /// ```
161    pub fn jaccard_set_similarity<S, T>(a: &[S], b: &[T]) -> f64
162    where
163        S: AsRef<str>,
164        T: AsRef<str>,
165    {
166        if a.is_empty() && b.is_empty() {
167            return 1.0;
168        }
169        if a.is_empty() || b.is_empty() {
170            return 0.0;
171        }
172        let a_set: std::collections::BTreeSet<&str> = a.iter().map(|s| s.as_ref()).collect();
173        let b_set: std::collections::BTreeSet<&str> = b.iter().map(|s| s.as_ref()).collect();
174        let intersection = a_set.intersection(&b_set).count();
175        let union = a_set.union(&b_set).count();
176        if union == 0 {
177            return 0.0;
178        }
179        intersection as f64 / union as f64
180    }
181
182    /// Score two `Option<String>` fields using the chosen algorithm.
183    ///
184    /// Returns:
185    ///
186    /// - `1.0` if both are `None` (both absent → trivially "match").
187    /// - `0.0` if exactly one is `None` (asymmetric data → "differ").
188    /// - The chosen algorithm's similarity if both are `Some`.
189    ///
190    /// Note: the matching engine intentionally does **not** use this
191    /// helper — it skips fields where either side is absent so that they
192    /// neither contribute nor penalise. This helper is kept for callers
193    /// who want a different policy.
194    ///
195    /// # Examples
196    ///
197    /// ```
198    /// use thing_matcher::{Scorer, SimilarityAlgorithm};
199    ///
200    /// let none: Option<String> = None;
201    /// let a = Some("hello".to_string());
202    /// let b = Some("hello".to_string());
203    ///
204    /// assert_eq!(Scorer::optional_field_score(&none, &none, SimilarityAlgorithm::Exact), 1.0);
205    /// assert_eq!(Scorer::optional_field_score(&a,    &none, SimilarityAlgorithm::Exact), 0.0);
206    /// assert_eq!(Scorer::optional_field_score(&a,    &b,    SimilarityAlgorithm::Exact), 1.0);
207    /// ```
208    pub fn optional_field_score(
209        field1: &Option<String>,
210        field2: &Option<String>,
211        algorithm: SimilarityAlgorithm,
212    ) -> f64 {
213        match (field1, field2) {
214            (None, None) => 1.0,
215            (None, Some(_)) | (Some(_), None) => 0.0,
216            (Some(s1), Some(s2)) => match algorithm {
217                SimilarityAlgorithm::JaroWinkler => Self::jaro_winkler_similarity(s1, s2),
218                SimilarityAlgorithm::Levenshtein => Self::levenshtein_similarity(s1, s2),
219                SimilarityAlgorithm::Exact => Self::exact_match(s1, s2),
220                SimilarityAlgorithm::Combined => Self::combined_similarity(s1, s2),
221            },
222        }
223    }
224}
225
226/// Algorithm selector for name comparison in [`crate::MatchConfig`].
227///
228/// The enum is `Copy`, so it is cheap to embed in a config struct or to
229/// pass through scoring helpers.
230///
231/// ```
232/// use thing_matcher::SimilarityAlgorithm;
233/// let alg = SimilarityAlgorithm::Combined;
234/// let same = alg;          // Copy
235/// assert!(matches!(same, SimilarityAlgorithm::Combined));
236/// ```
237#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
238pub enum SimilarityAlgorithm {
239    /// Jaro-Winkler similarity — favours common prefixes; good for names.
240    JaroWinkler,
241    /// Normalised Levenshtein similarity — tracks edit distance.
242    Levenshtein,
243    /// Exact equality — binary `1.0` / `0.0`.
244    Exact,
245    /// Weighted blend of Jaro-Winkler (0.7) and Levenshtein (0.3). The default.
246    Combined,
247}
248
249#[cfg(test)]
250mod tests {
251    use super::*;
252
253    // ---------- jaro_winkler ----------
254
255    #[test]
256    fn jaro_winkler_identical() {
257        assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
258    }
259
260    #[test]
261    fn jaro_winkler_close_typo() {
262        assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
263    }
264
265    #[test]
266    fn jaro_winkler_distant() {
267        assert!(Scorer::jaro_winkler_similarity("jones", "james") < 0.8);
268    }
269
270    #[test]
271    fn jaro_winkler_empty_pair_is_one() {
272        assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
273    }
274
275    #[test]
276    fn jaro_winkler_single_empty_is_zero() {
277        assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
278        assert_eq!(Scorer::jaro_winkler_similarity("", "smith"), 0.0);
279    }
280
281    #[test]
282    fn jaro_winkler_in_unit_interval() {
283        for (a, b) in [("a", "b"), ("smith", "smyth"), ("abc", "xyz")] {
284            let s = Scorer::jaro_winkler_similarity(a, b);
285            assert!((0.0..=1.0).contains(&s), "out of range: {s}");
286        }
287    }
288
289    // ---------- levenshtein ----------
290
291    #[test]
292    fn levenshtein_identical() {
293        assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
294    }
295
296    #[test]
297    fn levenshtein_one_edit() {
298        // 1 substitution over 5 chars: 1 - 1/5 = 0.8
299        let s = Scorer::levenshtein_similarity("smith", "smyth");
300        assert!((s - 0.8).abs() < 1e-9, "got {s}");
301    }
302
303    #[test]
304    fn levenshtein_completely_different() {
305        assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
306    }
307
308    #[test]
309    fn levenshtein_empty_pair_is_one() {
310        assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
311    }
312
313    #[test]
314    fn levenshtein_single_empty_is_zero() {
315        assert_eq!(Scorer::levenshtein_similarity("smith", ""), 0.0);
316        assert_eq!(Scorer::levenshtein_similarity("", "smith"), 0.0);
317    }
318
319    // ---------- exact ----------
320
321    #[test]
322    fn exact_match_basic() {
323        assert_eq!(Scorer::exact_match("test", "test"), 1.0);
324        assert_eq!(Scorer::exact_match("test", "Test"), 0.0);
325        assert_eq!(Scorer::exact_match("test", "other"), 0.0);
326        assert_eq!(Scorer::exact_match("", ""), 1.0);
327    }
328
329    // ---------- combined ----------
330
331    #[test]
332    fn combined_identical_is_one() {
333        assert!((Scorer::combined_similarity("smith", "smith") - 1.0).abs() < 1e-9);
334    }
335
336    #[test]
337    fn combined_close_typo_is_high() {
338        let s = Scorer::combined_similarity("Stephen", "Steven");
339        assert!(s > 0.80, "got {s}");
340    }
341
342    #[test]
343    fn combined_distant_is_low() {
344        assert!(Scorer::combined_similarity("alice", "zachary") < 0.5);
345    }
346
347    // ---------- jaccard ----------
348
349    #[test]
350    fn jaccard_identical_sets_is_one() {
351        let a = ["x", "y"];
352        let b = ["y", "x"];
353        assert!((Scorer::jaccard_set_similarity(&a, &b) - 1.0).abs() < 1e-9);
354    }
355
356    #[test]
357    fn jaccard_disjoint_sets_is_zero() {
358        let a = ["x", "y"];
359        let b = ["a", "b"];
360        assert!(Scorer::jaccard_set_similarity(&a, &b) < 1e-9);
361    }
362
363    #[test]
364    fn jaccard_both_empty_is_one() {
365        let a: [&str; 0] = [];
366        let b: [&str; 0] = [];
367        assert_eq!(Scorer::jaccard_set_similarity(&a, &b), 1.0);
368    }
369
370    #[test]
371    fn jaccard_one_empty_is_zero() {
372        let a = ["x"];
373        let b: [&str; 0] = [];
374        assert_eq!(Scorer::jaccard_set_similarity(&a, &b), 0.0);
375        assert_eq!(Scorer::jaccard_set_similarity(&b, &a), 0.0);
376    }
377
378    #[test]
379    fn jaccard_partial_overlap_is_intersection_over_union() {
380        let a = ["x", "y", "z"];
381        let b = ["y", "z", "w"];
382        let s = Scorer::jaccard_set_similarity(&a, &b);
383        assert!((s - 0.5).abs() < 1e-9, "got {s}");
384    }
385
386    // ---------- optional_field_score ----------
387
388    #[test]
389    fn optional_field_both_none_is_one() {
390        let n: Option<String> = None;
391        assert_eq!(
392            Scorer::optional_field_score(&n, &n, SimilarityAlgorithm::Exact),
393            1.0
394        );
395    }
396
397    #[test]
398    fn optional_field_asymmetric_is_zero() {
399        let n: Option<String> = None;
400        let s = Some("x".to_string());
401        assert_eq!(
402            Scorer::optional_field_score(&s, &n, SimilarityAlgorithm::Exact),
403            0.0
404        );
405        assert_eq!(
406            Scorer::optional_field_score(&n, &s, SimilarityAlgorithm::Exact),
407            0.0
408        );
409    }
410
411    #[test]
412    fn optional_field_some_some_uses_algorithm() {
413        let a = Some("smith".to_string());
414        let b = Some("smyth".to_string());
415        let jw = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::JaroWinkler);
416        let lv = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Levenshtein);
417        let ex = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Exact);
418        let cb = Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Combined);
419        assert!(jw > 0.85);
420        assert!(lv >= 0.79);
421        assert_eq!(ex, 0.0);
422        assert!(cb > 0.8);
423    }
424}