Skip to main content

event_matcher/
scorer.rs

1//! Scoring algorithms for string similarity, geographic distance, and
2//! temporal proximity.
3//!
4//! This module exposes a small, focused set of similarity primitives that
5//! the matching engine composes together. All scores are normalised to the
6//! closed interval `[0.0, 1.0]`, where `1.0` means "identical" and `0.0`
7//! means "no observable similarity".
8//!
9//! ## Algorithm choice
10//!
11//! | Algorithm | Strength | Weakness |
12//! |---|---|---|
13//! | [`SimilarityAlgorithm::JaroWinkler`] | Good for short strings; rewards a common prefix. | Saturates quickly on long strings. |
14//! | [`SimilarityAlgorithm::Levenshtein`] | Cheap to reason about; tracks edit distance. | Sensitive to length differences. |
15//! | [`SimilarityAlgorithm::Exact`]       | Fast; defensible to non-technical reviewers. | No tolerance for typos. |
16//! | [`SimilarityAlgorithm::Combined`]    | Default; balances JW and Levenshtein. | Bespoke weighting (0.7 JW + 0.3 Lev). |
17//!
18//! ## Geographic and temporal primitives
19//!
20//! Two Gaussian-decay scorers exist for domain-specific distances:
21//!
22//! - [`Scorer::coordinates_score`] turns a [`Scorer::haversine_metres`]
23//!   distance into a similarity, parameterised by a metre scale.
24//! - [`Scorer::start_date_score`] turns a [`Scorer::seconds_between`]
25//!   difference into a similarity, parameterised by a seconds scale.
26//!
27//! ## Example
28//!
29//! ```
30//! use event_matcher::Scorer;
31//!
32//! let same  = Scorer::jaro_winkler_similarity("smith", "smith");
33//! let close = Scorer::jaro_winkler_similarity("smith", "smyth");
34//! let far   = Scorer::jaro_winkler_similarity("smith", "jones");
35//!
36//! assert!(same  > 0.99);
37//! assert!(close > 0.85);
38//! assert!(far   < same);
39//! ```
40
41use strsim::{jaro_winkler, levenshtein};
42
43use crate::normalizer::Normalizer;
44
45/// Stateless namespace for similarity scorers.
46///
47/// Like [`crate::Normalizer`], `Scorer` is a unit type with no fields;
48/// every method is associated.
49///
50/// ```
51/// use event_matcher::Scorer;
52/// assert_eq!(Scorer::exact_match("a", "a"), 1.0);
53/// ```
54pub struct Scorer;
55
56impl Scorer {
57    /// Jaro-Winkler similarity, normalised to `[0.0, 1.0]`.
58    ///
59    /// Higher values indicate greater similarity. Strings sharing a common
60    /// prefix score noticeably higher than strings that diverge at the start.
61    ///
62    /// # Edge cases
63    ///
64    /// - Two empty strings → `1.0` (identical).
65    /// - One empty, one not → `0.0`.
66    ///
67    /// # Examples
68    ///
69    /// ```
70    /// use event_matcher::Scorer;
71    /// assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
72    /// assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
73    /// assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
74    /// assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
75    /// ```
76    #[must_use]
77    pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 {
78        if s1.is_empty() && s2.is_empty() {
79            return 1.0;
80        }
81        if s1.is_empty() || s2.is_empty() {
82            return 0.0;
83        }
84        jaro_winkler(s1, s2)
85    }
86
87    /// Normalised Levenshtein similarity, in `[0.0, 1.0]`.
88    ///
89    /// Computed as `1 - (edit_distance / max_len)`. Higher values indicate
90    /// greater similarity.
91    ///
92    /// # Edge cases
93    ///
94    /// - Two empty strings → `1.0`.
95    /// - One empty, one not → `0.0`.
96    ///
97    /// # Examples
98    ///
99    /// ```
100    /// use event_matcher::Scorer;
101    /// assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
102    /// assert!(Scorer::levenshtein_similarity("smith", "smyth") >= 0.79);
103    /// assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
104    /// assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
105    /// ```
106    #[must_use]
107    // `distance` and `max_len` are string-length counts, far below f64's
108    // 52-bit mantissa limit, so the usize->f64 casts are effectively exact.
109    #[allow(clippy::cast_precision_loss)]
110    pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
111        if s1.is_empty() && s2.is_empty() {
112            return 1.0;
113        }
114        if s1.is_empty() || s2.is_empty() {
115            return 0.0;
116        }
117
118        let distance = levenshtein(s1, s2);
119        let max_len = s1.len().max(s2.len());
120        1.0 - (distance as f64 / max_len as f64)
121    }
122
123    /// Binary exact-match score: `1.0` if `s1 == s2`, else `0.0`.
124    ///
125    /// Case-sensitive and whitespace-sensitive. Pair with
126    /// [`crate::Normalizer`] when comparing user-entered text.
127    ///
128    /// # Examples
129    ///
130    /// ```
131    /// use event_matcher::Scorer;
132    /// assert_eq!(Scorer::exact_match("test", "test"), 1.0);
133    /// assert_eq!(Scorer::exact_match("Test", "test"), 0.0);  // case-sensitive
134    /// assert_eq!(Scorer::exact_match("a", "b"),       0.0);
135    /// ```
136    #[must_use]
137    pub fn exact_match(s1: &str, s2: &str) -> f64 {
138        if s1 == s2 { 1.0 } else { 0.0 }
139    }
140
141    /// Weighted combination of Jaro-Winkler (0.7) and Levenshtein (0.3).
142    ///
143    /// Defaults are tuned for short names. Jaro-Winkler dominates because
144    /// it handles short-string prefix matches better; Levenshtein contributes
145    /// stability for longer or rearranged inputs.
146    ///
147    /// # Example
148    ///
149    /// ```
150    /// use event_matcher::Scorer;
151    ///
152    /// let s = Scorer::combined_similarity("Stephen", "Steven");
153    /// assert!(s > 0.80, "combined score for Stephen/Steven was {s}");
154    /// ```
155    #[must_use]
156    pub fn combined_similarity(s1: &str, s2: &str) -> f64 {
157        let jw = Self::jaro_winkler_similarity(s1, s2);
158        let lev = Self::levenshtein_similarity(s1, s2);
159        0.7 * jw + 0.3 * lev
160    }
161
162    /// Great-circle distance in metres between two geographic points,
163    /// computed via the Haversine formula on a sphere of mean Earth
164    /// radius `6_371_000` m.
165    ///
166    /// All inputs are decimal degrees. The function is total over `f64`:
167    /// non-finite inputs produce `f64::NAN`, but no panic.
168    ///
169    /// # Examples
170    ///
171    /// Identical points are zero distance apart:
172    ///
173    /// ```
174    /// use event_matcher::Scorer;
175    /// let d = Scorer::haversine_metres(51.5, -0.12, 51.5, -0.12);
176    /// assert!(d.abs() < 1e-6, "got {d}");
177    /// ```
178    ///
179    /// London to Paris is about 343 km:
180    ///
181    /// ```
182    /// use event_matcher::Scorer;
183    /// let d = Scorer::haversine_metres(51.507_22, -0.127_5, 48.853_0, 2.349_2);
184    /// let km = d / 1000.0;
185    /// assert!(km > 330.0 && km < 355.0, "London-Paris km was {km}");
186    /// ```
187    #[must_use]
188    pub fn haversine_metres(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> f64 {
189        const EARTH_RADIUS_M: f64 = 6_371_000.0;
190        let to_rad = |d: f64| d.to_radians();
191        let phi1 = to_rad(lat1);
192        let phi2 = to_rad(lat2);
193        let dphi = to_rad(lat2 - lat1);
194        let dlambda = to_rad(lon2 - lon1);
195        let a =
196            (dphi / 2.0).sin().powi(2) + phi1.cos() * phi2.cos() * (dlambda / 2.0).sin().powi(2);
197        let c = 2.0 * a.sqrt().clamp(0.0, 1.0).asin();
198        EARTH_RADIUS_M * c
199    }
200
201    /// Gaussian-decay similarity for a geographic distance.
202    ///
203    /// Returns `exp(-(d/s)^2)` clamped to `[0.0, 1.0]`.
204    ///
205    /// Contract:
206    ///
207    /// - `d == 0` → `1.0`.
208    /// - `d == scale` → `1/e` (approximately `0.368`).
209    /// - `d == 3 * scale` → approximately `0.0001` (close to zero).
210    /// - Negative or non-finite inputs are treated as missing and return `0.0`.
211    ///
212    /// # Examples
213    ///
214    /// ```
215    /// use event_matcher::Scorer;
216    ///
217    /// let scale = 50.0;
218    /// assert!((Scorer::coordinates_score(0.0, scale) - 1.0).abs() < 1e-12);
219    ///
220    /// let one_e = Scorer::coordinates_score(scale, scale);
221    /// assert!((one_e - (1.0_f64 / std::f64::consts::E)).abs() < 1e-12);
222    ///
223    /// let far = Scorer::coordinates_score(3.0 * scale, scale);
224    /// assert!(far < 1e-3, "got {far}");
225    /// ```
226    #[must_use]
227    pub fn coordinates_score(distance_metres: f64, scale_metres: f64) -> f64 {
228        gaussian_decay(distance_metres, scale_metres)
229    }
230
231    /// Absolute difference in seconds between two ISO-8601 timestamps.
232    ///
233    /// Both inputs are passed through [`Normalizer::parse_iso8601_unix_seconds`].
234    /// Returns `None` if either string fails to parse.
235    ///
236    /// Dates without a time component are anchored to `00:00:00 UTC`.
237    /// Local-time-only strings (no timezone) are interpreted as UTC for
238    /// the sole purpose of computing a difference — callers who care about
239    /// absolute wall-clock equivalence should pre-normalise to a common
240    /// timezone before scoring.
241    ///
242    /// # Examples
243    ///
244    /// ```
245    /// use event_matcher::Scorer;
246    ///
247    /// let same = Scorer::seconds_between("2024-06-26T09:00:00Z", "2024-06-26T09:00:00Z");
248    /// assert_eq!(same, Some(0));
249    ///
250    /// let one_hour = Scorer::seconds_between(
251    ///     "2024-06-26T09:00:00Z",
252    ///     "2024-06-26T10:00:00Z",
253    /// );
254    /// assert_eq!(one_hour, Some(3600));
255    ///
256    /// // Antisymmetric in absolute value.
257    /// let other_way = Scorer::seconds_between(
258    ///     "2024-06-26T10:00:00Z",
259    ///     "2024-06-26T09:00:00Z",
260    /// );
261    /// assert_eq!(other_way, Some(3600));
262    ///
263    /// assert!(Scorer::seconds_between("not-a-date", "2024-06-26").is_none());
264    /// ```
265    #[must_use]
266    pub fn seconds_between(t1: &str, t2: &str) -> Option<i64> {
267        let s1 = Normalizer::parse_iso8601_unix_seconds(t1)?;
268        let s2 = Normalizer::parse_iso8601_unix_seconds(t2)?;
269        Some((s1 - s2).abs())
270    }
271
272    /// Gaussian-decay similarity for a temporal difference.
273    ///
274    /// Returns `exp(-(d/s)^2)` clamped to `[0.0, 1.0]`, where `d` is the
275    /// absolute difference in seconds and `s` is the configured scale in
276    /// seconds.
277    ///
278    /// Contract:
279    ///
280    /// - `d == 0` → `1.0`.
281    /// - `d == scale` → `1/e` (approximately `0.368`).
282    /// - `d == 3 * scale` → approximately `0.0001`.
283    /// - Negative or non-finite inputs are treated as missing and return `0.0`.
284    ///
285    /// # Examples
286    ///
287    /// ```
288    /// use event_matcher::Scorer;
289    ///
290    /// let scale_seconds = 3600.0; // one hour
291    /// assert!((Scorer::start_date_score(0.0, scale_seconds) - 1.0).abs() < 1e-12);
292    ///
293    /// let one_e = Scorer::start_date_score(scale_seconds, scale_seconds);
294    /// assert!((one_e - (1.0_f64 / std::f64::consts::E)).abs() < 1e-12);
295    ///
296    /// let far = Scorer::start_date_score(3.0 * scale_seconds, scale_seconds);
297    /// assert!(far < 1e-3, "got {far}");
298    /// ```
299    #[must_use]
300    pub fn start_date_score(difference_seconds: f64, scale_seconds: f64) -> f64 {
301        gaussian_decay(difference_seconds, scale_seconds)
302    }
303
304    /// Score two `Option<String>` fields using the chosen algorithm.
305    ///
306    /// Returns:
307    ///
308    /// - `1.0` if both are `None` (both absent → trivially "match").
309    /// - `0.0` if exactly one is `None` (asymmetric data → "differ").
310    /// - The chosen algorithm's similarity if both are `Some`.
311    ///
312    /// Note: the matching engine intentionally does **not** use this
313    /// helper — it skips fields where either side is absent so that they
314    /// neither contribute nor penalise. This helper is kept for callers
315    /// who want a different policy.
316    ///
317    /// # Examples
318    ///
319    /// ```
320    /// use event_matcher::{Scorer, SimilarityAlgorithm};
321    ///
322    /// let none: Option<String> = None;
323    /// let a = Some("hello".to_string());
324    /// let b = Some("hello".to_string());
325    ///
326    /// assert_eq!(Scorer::optional_field_score(&none, &none, SimilarityAlgorithm::Exact), 1.0);
327    /// assert_eq!(Scorer::optional_field_score(&a,    &none, SimilarityAlgorithm::Exact), 0.0);
328    /// assert_eq!(Scorer::optional_field_score(&a,    &b,    SimilarityAlgorithm::Exact), 1.0);
329    /// ```
330    #[must_use]
331    pub fn optional_field_score(
332        field1: &Option<String>,
333        field2: &Option<String>,
334        algorithm: SimilarityAlgorithm,
335    ) -> f64 {
336        match (field1, field2) {
337            (None, None) => 1.0,
338            (None, Some(_)) | (Some(_), None) => 0.0,
339            (Some(s1), Some(s2)) => match algorithm {
340                SimilarityAlgorithm::JaroWinkler => Self::jaro_winkler_similarity(s1, s2),
341                SimilarityAlgorithm::Levenshtein => Self::levenshtein_similarity(s1, s2),
342                SimilarityAlgorithm::Exact => Self::exact_match(s1, s2),
343                SimilarityAlgorithm::Combined => Self::combined_similarity(s1, s2),
344            },
345        }
346    }
347}
348
349fn gaussian_decay(distance: f64, scale: f64) -> f64 {
350    if !distance.is_finite() || !scale.is_finite() || scale <= 0.0 || distance < 0.0 {
351        return 0.0;
352    }
353    let ratio = distance / scale;
354    let s = (-(ratio * ratio)).exp();
355    s.clamp(0.0, 1.0)
356}
357
358/// Algorithm selector for name comparison in [`crate::MatchConfig`].
359///
360/// The enum is `Copy`, so it is cheap to embed in a config struct or to
361/// pass through scoring helpers.
362///
363/// ```
364/// use event_matcher::SimilarityAlgorithm;
365/// let alg = SimilarityAlgorithm::Combined;
366/// let same = alg;          // Copy
367/// assert!(matches!(same, SimilarityAlgorithm::Combined));
368/// ```
369#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
370pub enum SimilarityAlgorithm {
371    /// Jaro-Winkler similarity — favours common prefixes; good for names.
372    JaroWinkler,
373    /// Normalised Levenshtein similarity — tracks edit distance.
374    Levenshtein,
375    /// Exact equality — binary `1.0` / `0.0`.
376    Exact,
377    /// Weighted blend of Jaro-Winkler (0.7) and Levenshtein (0.3). The default.
378    Combined,
379}
380
381#[cfg(test)]
382// Scorers return exact `0.0` / `1.0` sentinels at their boundaries; these
383// tests assert those literal values, where exact comparison is correct.
384#[allow(clippy::float_cmp)]
385mod tests {
386    use super::*;
387
388    #[test]
389    fn jaro_winkler_identical() {
390        assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
391    }
392
393    #[test]
394    fn jaro_winkler_close_typo() {
395        assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
396    }
397
398    #[test]
399    fn jaro_winkler_distant() {
400        assert!(Scorer::jaro_winkler_similarity("jones", "james") < 0.8);
401    }
402
403    #[test]
404    fn jaro_winkler_empty_pair_is_one() {
405        assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
406    }
407
408    #[test]
409    fn jaro_winkler_single_empty_is_zero() {
410        assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
411        assert_eq!(Scorer::jaro_winkler_similarity("", "smith"), 0.0);
412    }
413
414    #[test]
415    fn levenshtein_identical() {
416        assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
417    }
418
419    #[test]
420    fn levenshtein_one_edit() {
421        let s = Scorer::levenshtein_similarity("smith", "smyth");
422        assert!((s - 0.8).abs() < 1e-9, "got {s}");
423    }
424
425    #[test]
426    fn levenshtein_empty_pair_is_one() {
427        assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
428    }
429
430    #[test]
431    fn exact_match_basic() {
432        assert_eq!(Scorer::exact_match("test", "test"), 1.0);
433        assert_eq!(Scorer::exact_match("test", "Test"), 0.0);
434        assert_eq!(Scorer::exact_match("test", "other"), 0.0);
435        assert_eq!(Scorer::exact_match("", ""), 1.0);
436    }
437
438    #[test]
439    fn combined_identical_is_one() {
440        assert!((Scorer::combined_similarity("smith", "smith") - 1.0).abs() < 1e-9);
441    }
442
443    #[test]
444    fn combined_close_typo_is_high() {
445        let s = Scorer::combined_similarity("Stephen", "Steven");
446        assert!(s > 0.80, "got {s}");
447    }
448
449    #[test]
450    fn haversine_identical_is_zero() {
451        let d = Scorer::haversine_metres(51.5, -0.12, 51.5, -0.12);
452        assert!(d.abs() < 1e-6);
453    }
454
455    #[test]
456    fn haversine_london_paris_about_343km() {
457        let d = Scorer::haversine_metres(51.507_22, -0.127_5, 48.853_0, 2.349_2) / 1000.0;
458        assert!(d > 330.0 && d < 355.0, "got {d}");
459    }
460
461    #[test]
462    fn coordinates_score_contract() {
463        let scale = 50.0;
464        assert!((Scorer::coordinates_score(0.0, scale) - 1.0).abs() < 1e-12);
465        let one_e = Scorer::coordinates_score(scale, scale);
466        assert!((one_e - (1.0_f64 / std::f64::consts::E)).abs() < 1e-12);
467        let far = Scorer::coordinates_score(3.0 * scale, scale);
468        assert!(far < 1e-3);
469    }
470
471    #[test]
472    fn coordinates_score_rejects_pathological_inputs() {
473        assert_eq!(Scorer::coordinates_score(f64::NAN, 50.0), 0.0);
474        assert_eq!(Scorer::coordinates_score(10.0, 0.0), 0.0);
475        assert_eq!(Scorer::coordinates_score(-1.0, 50.0), 0.0);
476    }
477
478    #[test]
479    fn seconds_between_identical_is_zero() {
480        let d = Scorer::seconds_between("2024-06-26T09:00:00Z", "2024-06-26T09:00:00Z");
481        assert_eq!(d, Some(0));
482    }
483
484    #[test]
485    fn seconds_between_one_hour() {
486        let d = Scorer::seconds_between("2024-06-26T09:00:00Z", "2024-06-26T10:00:00Z");
487        assert_eq!(d, Some(3600));
488    }
489
490    #[test]
491    fn seconds_between_is_symmetric_absolute() {
492        let a = Scorer::seconds_between("2024-06-26T09:00:00Z", "2024-06-26T10:00:00Z");
493        let b = Scorer::seconds_between("2024-06-26T10:00:00Z", "2024-06-26T09:00:00Z");
494        assert_eq!(a, b);
495    }
496
497    #[test]
498    fn seconds_between_one_day() {
499        let d = Scorer::seconds_between("2024-06-26", "2024-06-27");
500        assert_eq!(d, Some(86_400));
501    }
502
503    #[test]
504    fn seconds_between_rejects_garbage() {
505        assert!(Scorer::seconds_between("not-a-date", "2024-06-26").is_none());
506        assert!(Scorer::seconds_between("2024-06-26", "also-not-a-date").is_none());
507    }
508
509    #[test]
510    fn start_date_score_contract() {
511        let scale = 3600.0;
512        assert!((Scorer::start_date_score(0.0, scale) - 1.0).abs() < 1e-12);
513        let one_e = Scorer::start_date_score(scale, scale);
514        assert!((one_e - (1.0_f64 / std::f64::consts::E)).abs() < 1e-12);
515        let far = Scorer::start_date_score(3.0 * scale, scale);
516        assert!(far < 1e-3);
517    }
518
519    #[test]
520    fn start_date_score_rejects_pathological_inputs() {
521        assert_eq!(Scorer::start_date_score(f64::NAN, 3600.0), 0.0);
522        assert_eq!(Scorer::start_date_score(10.0, 0.0), 0.0);
523        assert_eq!(Scorer::start_date_score(-1.0, 3600.0), 0.0);
524    }
525
526    #[test]
527    fn optional_field_both_none_is_one() {
528        let n: Option<String> = None;
529        assert_eq!(
530            Scorer::optional_field_score(&n, &n, SimilarityAlgorithm::Exact),
531            1.0
532        );
533    }
534
535    #[test]
536    fn optional_field_asymmetric_is_zero() {
537        let n: Option<String> = None;
538        let s = Some("x".to_string());
539        assert_eq!(
540            Scorer::optional_field_score(&s, &n, SimilarityAlgorithm::Exact),
541            0.0
542        );
543    }
544}