event_matcher/scorer.rs
1//! Scoring algorithms for string similarity, geographic distance, and
2//! temporal proximity.
3//!
4//! This module exposes a small, focused set of similarity primitives that
5//! the matching engine composes together. All scores are normalised to the
6//! closed interval `[0.0, 1.0]`, where `1.0` means "identical" and `0.0`
7//! means "no observable similarity".
8//!
9//! ## Algorithm choice
10//!
11//! | Algorithm | Strength | Weakness |
12//! |---|---|---|
13//! | [`SimilarityAlgorithm::JaroWinkler`] | Good for short strings; rewards a common prefix. | Saturates quickly on long strings. |
14//! | [`SimilarityAlgorithm::Levenshtein`] | Cheap to reason about; tracks edit distance. | Sensitive to length differences. |
15//! | [`SimilarityAlgorithm::Exact`] | Fast; defensible to non-technical reviewers. | No tolerance for typos. |
16//! | [`SimilarityAlgorithm::Combined`] | Default; balances JW and Levenshtein. | Bespoke weighting (0.7 JW + 0.3 Lev). |
17//!
18//! ## Geographic and temporal primitives
19//!
20//! Two Gaussian-decay scorers exist for domain-specific distances:
21//!
22//! - [`Scorer::coordinates_score`] turns a [`Scorer::haversine_metres`]
23//! distance into a similarity, parameterised by a metre scale.
24//! - [`Scorer::start_date_score`] turns a [`Scorer::seconds_between`]
25//! difference into a similarity, parameterised by a seconds scale.
26//!
27//! ## Example
28//!
29//! ```
30//! use event_matcher::Scorer;
31//!
32//! let same = Scorer::jaro_winkler_similarity("smith", "smith");
33//! let close = Scorer::jaro_winkler_similarity("smith", "smyth");
34//! let far = Scorer::jaro_winkler_similarity("smith", "jones");
35//!
36//! assert!(same > 0.99);
37//! assert!(close > 0.85);
38//! assert!(far < same);
39//! ```
40
41use strsim::{jaro_winkler, levenshtein};
42
43use crate::normalizer::Normalizer;
44
45/// Stateless namespace for similarity scorers.
46///
47/// Like [`crate::Normalizer`], `Scorer` is a unit type with no fields;
48/// every method is associated.
49///
50/// ```
51/// use event_matcher::Scorer;
52/// assert_eq!(Scorer::exact_match("a", "a"), 1.0);
53/// ```
54pub struct Scorer;
55
56impl Scorer {
57 /// Jaro-Winkler similarity, normalised to `[0.0, 1.0]`.
58 ///
59 /// Higher values indicate greater similarity. Strings sharing a common
60 /// prefix score noticeably higher than strings that diverge at the start.
61 ///
62 /// # Edge cases
63 ///
64 /// - Two empty strings → `1.0` (identical).
65 /// - One empty, one not → `0.0`.
66 ///
67 /// # Examples
68 ///
69 /// ```
70 /// use event_matcher::Scorer;
71 /// assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
72 /// assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
73 /// assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
74 /// assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
75 /// ```
76 #[must_use]
77 pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 {
78 if s1.is_empty() && s2.is_empty() {
79 return 1.0;
80 }
81 if s1.is_empty() || s2.is_empty() {
82 return 0.0;
83 }
84 jaro_winkler(s1, s2)
85 }
86
87 /// Normalised Levenshtein similarity, in `[0.0, 1.0]`.
88 ///
89 /// Computed as `1 - (edit_distance / max_len)`. Higher values indicate
90 /// greater similarity.
91 ///
92 /// # Edge cases
93 ///
94 /// - Two empty strings → `1.0`.
95 /// - One empty, one not → `0.0`.
96 ///
97 /// # Examples
98 ///
99 /// ```
100 /// use event_matcher::Scorer;
101 /// assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
102 /// assert!(Scorer::levenshtein_similarity("smith", "smyth") >= 0.79);
103 /// assert!(Scorer::levenshtein_similarity("abc", "xyz") < 0.5);
104 /// assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
105 /// ```
106 #[must_use]
107 // `distance` and `max_len` are string-length counts, far below f64's
108 // 52-bit mantissa limit, so the usize->f64 casts are effectively exact.
109 #[allow(clippy::cast_precision_loss)]
110 pub fn levenshtein_similarity(s1: &str, s2: &str) -> f64 {
111 if s1.is_empty() && s2.is_empty() {
112 return 1.0;
113 }
114 if s1.is_empty() || s2.is_empty() {
115 return 0.0;
116 }
117
118 let distance = levenshtein(s1, s2);
119 let max_len = s1.len().max(s2.len());
120 1.0 - (distance as f64 / max_len as f64)
121 }
122
123 /// Binary exact-match score: `1.0` if `s1 == s2`, else `0.0`.
124 ///
125 /// Case-sensitive and whitespace-sensitive. Pair with
126 /// [`crate::Normalizer`] when comparing user-entered text.
127 ///
128 /// # Examples
129 ///
130 /// ```
131 /// use event_matcher::Scorer;
132 /// assert_eq!(Scorer::exact_match("test", "test"), 1.0);
133 /// assert_eq!(Scorer::exact_match("Test", "test"), 0.0); // case-sensitive
134 /// assert_eq!(Scorer::exact_match("a", "b"), 0.0);
135 /// ```
136 #[must_use]
137 pub fn exact_match(s1: &str, s2: &str) -> f64 {
138 if s1 == s2 { 1.0 } else { 0.0 }
139 }
140
141 /// Weighted combination of Jaro-Winkler (0.7) and Levenshtein (0.3).
142 ///
143 /// Defaults are tuned for short names. Jaro-Winkler dominates because
144 /// it handles short-string prefix matches better; Levenshtein contributes
145 /// stability for longer or rearranged inputs.
146 ///
147 /// # Example
148 ///
149 /// ```
150 /// use event_matcher::Scorer;
151 ///
152 /// let s = Scorer::combined_similarity("Stephen", "Steven");
153 /// assert!(s > 0.80, "combined score for Stephen/Steven was {s}");
154 /// ```
155 #[must_use]
156 pub fn combined_similarity(s1: &str, s2: &str) -> f64 {
157 let jw = Self::jaro_winkler_similarity(s1, s2);
158 let lev = Self::levenshtein_similarity(s1, s2);
159 0.7 * jw + 0.3 * lev
160 }
161
162 /// Great-circle distance in metres between two geographic points,
163 /// computed via the Haversine formula on a sphere of mean Earth
164 /// radius `6_371_000` m.
165 ///
166 /// All inputs are decimal degrees. The function is total over `f64`:
167 /// non-finite inputs produce `f64::NAN`, but no panic.
168 ///
169 /// # Examples
170 ///
171 /// Identical points are zero distance apart:
172 ///
173 /// ```
174 /// use event_matcher::Scorer;
175 /// let d = Scorer::haversine_metres(51.5, -0.12, 51.5, -0.12);
176 /// assert!(d.abs() < 1e-6, "got {d}");
177 /// ```
178 ///
179 /// London to Paris is about 343 km:
180 ///
181 /// ```
182 /// use event_matcher::Scorer;
183 /// let d = Scorer::haversine_metres(51.507_22, -0.127_5, 48.853_0, 2.349_2);
184 /// let km = d / 1000.0;
185 /// assert!(km > 330.0 && km < 355.0, "London-Paris km was {km}");
186 /// ```
187 #[must_use]
188 pub fn haversine_metres(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> f64 {
189 const EARTH_RADIUS_M: f64 = 6_371_000.0;
190 let to_rad = |d: f64| d.to_radians();
191 let phi1 = to_rad(lat1);
192 let phi2 = to_rad(lat2);
193 let dphi = to_rad(lat2 - lat1);
194 let dlambda = to_rad(lon2 - lon1);
195 let a =
196 (dphi / 2.0).sin().powi(2) + phi1.cos() * phi2.cos() * (dlambda / 2.0).sin().powi(2);
197 let c = 2.0 * a.sqrt().clamp(0.0, 1.0).asin();
198 EARTH_RADIUS_M * c
199 }
200
201 /// Gaussian-decay similarity for a geographic distance.
202 ///
203 /// Returns `exp(-(d/s)^2)` clamped to `[0.0, 1.0]`.
204 ///
205 /// Contract:
206 ///
207 /// - `d == 0` → `1.0`.
208 /// - `d == scale` → `1/e` (approximately `0.368`).
209 /// - `d == 3 * scale` → approximately `0.0001` (close to zero).
210 /// - Negative or non-finite inputs are treated as missing and return `0.0`.
211 ///
212 /// # Examples
213 ///
214 /// ```
215 /// use event_matcher::Scorer;
216 ///
217 /// let scale = 50.0;
218 /// assert!((Scorer::coordinates_score(0.0, scale) - 1.0).abs() < 1e-12);
219 ///
220 /// let one_e = Scorer::coordinates_score(scale, scale);
221 /// assert!((one_e - (1.0_f64 / std::f64::consts::E)).abs() < 1e-12);
222 ///
223 /// let far = Scorer::coordinates_score(3.0 * scale, scale);
224 /// assert!(far < 1e-3, "got {far}");
225 /// ```
226 #[must_use]
227 pub fn coordinates_score(distance_metres: f64, scale_metres: f64) -> f64 {
228 gaussian_decay(distance_metres, scale_metres)
229 }
230
231 /// Absolute difference in seconds between two ISO-8601 timestamps.
232 ///
233 /// Both inputs are passed through [`Normalizer::parse_iso8601_unix_seconds`].
234 /// Returns `None` if either string fails to parse.
235 ///
236 /// Dates without a time component are anchored to `00:00:00 UTC`.
237 /// Local-time-only strings (no timezone) are interpreted as UTC for
238 /// the sole purpose of computing a difference — callers who care about
239 /// absolute wall-clock equivalence should pre-normalise to a common
240 /// timezone before scoring.
241 ///
242 /// # Examples
243 ///
244 /// ```
245 /// use event_matcher::Scorer;
246 ///
247 /// let same = Scorer::seconds_between("2024-06-26T09:00:00Z", "2024-06-26T09:00:00Z");
248 /// assert_eq!(same, Some(0));
249 ///
250 /// let one_hour = Scorer::seconds_between(
251 /// "2024-06-26T09:00:00Z",
252 /// "2024-06-26T10:00:00Z",
253 /// );
254 /// assert_eq!(one_hour, Some(3600));
255 ///
256 /// // Antisymmetric in absolute value.
257 /// let other_way = Scorer::seconds_between(
258 /// "2024-06-26T10:00:00Z",
259 /// "2024-06-26T09:00:00Z",
260 /// );
261 /// assert_eq!(other_way, Some(3600));
262 ///
263 /// assert!(Scorer::seconds_between("not-a-date", "2024-06-26").is_none());
264 /// ```
265 #[must_use]
266 pub fn seconds_between(t1: &str, t2: &str) -> Option<i64> {
267 let s1 = Normalizer::parse_iso8601_unix_seconds(t1)?;
268 let s2 = Normalizer::parse_iso8601_unix_seconds(t2)?;
269 Some((s1 - s2).abs())
270 }
271
272 /// Gaussian-decay similarity for a temporal difference.
273 ///
274 /// Returns `exp(-(d/s)^2)` clamped to `[0.0, 1.0]`, where `d` is the
275 /// absolute difference in seconds and `s` is the configured scale in
276 /// seconds.
277 ///
278 /// Contract:
279 ///
280 /// - `d == 0` → `1.0`.
281 /// - `d == scale` → `1/e` (approximately `0.368`).
282 /// - `d == 3 * scale` → approximately `0.0001`.
283 /// - Negative or non-finite inputs are treated as missing and return `0.0`.
284 ///
285 /// # Examples
286 ///
287 /// ```
288 /// use event_matcher::Scorer;
289 ///
290 /// let scale_seconds = 3600.0; // one hour
291 /// assert!((Scorer::start_date_score(0.0, scale_seconds) - 1.0).abs() < 1e-12);
292 ///
293 /// let one_e = Scorer::start_date_score(scale_seconds, scale_seconds);
294 /// assert!((one_e - (1.0_f64 / std::f64::consts::E)).abs() < 1e-12);
295 ///
296 /// let far = Scorer::start_date_score(3.0 * scale_seconds, scale_seconds);
297 /// assert!(far < 1e-3, "got {far}");
298 /// ```
299 #[must_use]
300 pub fn start_date_score(difference_seconds: f64, scale_seconds: f64) -> f64 {
301 gaussian_decay(difference_seconds, scale_seconds)
302 }
303
304 /// Score two `Option<String>` fields using the chosen algorithm.
305 ///
306 /// Returns:
307 ///
308 /// - `1.0` if both are `None` (both absent → trivially "match").
309 /// - `0.0` if exactly one is `None` (asymmetric data → "differ").
310 /// - The chosen algorithm's similarity if both are `Some`.
311 ///
312 /// Note: the matching engine intentionally does **not** use this
313 /// helper — it skips fields where either side is absent so that they
314 /// neither contribute nor penalise. This helper is kept for callers
315 /// who want a different policy.
316 ///
317 /// # Examples
318 ///
319 /// ```
320 /// use event_matcher::{Scorer, SimilarityAlgorithm};
321 ///
322 /// let none: Option<String> = None;
323 /// let a = Some("hello".to_string());
324 /// let b = Some("hello".to_string());
325 ///
326 /// assert_eq!(Scorer::optional_field_score(&none, &none, SimilarityAlgorithm::Exact), 1.0);
327 /// assert_eq!(Scorer::optional_field_score(&a, &none, SimilarityAlgorithm::Exact), 0.0);
328 /// assert_eq!(Scorer::optional_field_score(&a, &b, SimilarityAlgorithm::Exact), 1.0);
329 /// ```
330 #[must_use]
331 pub fn optional_field_score(
332 field1: &Option<String>,
333 field2: &Option<String>,
334 algorithm: SimilarityAlgorithm,
335 ) -> f64 {
336 match (field1, field2) {
337 (None, None) => 1.0,
338 (None, Some(_)) | (Some(_), None) => 0.0,
339 (Some(s1), Some(s2)) => match algorithm {
340 SimilarityAlgorithm::JaroWinkler => Self::jaro_winkler_similarity(s1, s2),
341 SimilarityAlgorithm::Levenshtein => Self::levenshtein_similarity(s1, s2),
342 SimilarityAlgorithm::Exact => Self::exact_match(s1, s2),
343 SimilarityAlgorithm::Combined => Self::combined_similarity(s1, s2),
344 },
345 }
346 }
347}
348
349fn gaussian_decay(distance: f64, scale: f64) -> f64 {
350 if !distance.is_finite() || !scale.is_finite() || scale <= 0.0 || distance < 0.0 {
351 return 0.0;
352 }
353 let ratio = distance / scale;
354 let s = (-(ratio * ratio)).exp();
355 s.clamp(0.0, 1.0)
356}
357
358/// Algorithm selector for name comparison in [`crate::MatchConfig`].
359///
360/// The enum is `Copy`, so it is cheap to embed in a config struct or to
361/// pass through scoring helpers.
362///
363/// ```
364/// use event_matcher::SimilarityAlgorithm;
365/// let alg = SimilarityAlgorithm::Combined;
366/// let same = alg; // Copy
367/// assert!(matches!(same, SimilarityAlgorithm::Combined));
368/// ```
369#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
370pub enum SimilarityAlgorithm {
371 /// Jaro-Winkler similarity — favours common prefixes; good for names.
372 JaroWinkler,
373 /// Normalised Levenshtein similarity — tracks edit distance.
374 Levenshtein,
375 /// Exact equality — binary `1.0` / `0.0`.
376 Exact,
377 /// Weighted blend of Jaro-Winkler (0.7) and Levenshtein (0.3). The default.
378 Combined,
379}
380
381#[cfg(test)]
382// Scorers return exact `0.0` / `1.0` sentinels at their boundaries; these
383// tests assert those literal values, where exact comparison is correct.
384#[allow(clippy::float_cmp)]
385mod tests {
386 use super::*;
387
388 #[test]
389 fn jaro_winkler_identical() {
390 assert!(Scorer::jaro_winkler_similarity("smith", "smith") > 0.99);
391 }
392
393 #[test]
394 fn jaro_winkler_close_typo() {
395 assert!(Scorer::jaro_winkler_similarity("smith", "smyth") > 0.85);
396 }
397
398 #[test]
399 fn jaro_winkler_distant() {
400 assert!(Scorer::jaro_winkler_similarity("jones", "james") < 0.8);
401 }
402
403 #[test]
404 fn jaro_winkler_empty_pair_is_one() {
405 assert_eq!(Scorer::jaro_winkler_similarity("", ""), 1.0);
406 }
407
408 #[test]
409 fn jaro_winkler_single_empty_is_zero() {
410 assert_eq!(Scorer::jaro_winkler_similarity("smith", ""), 0.0);
411 assert_eq!(Scorer::jaro_winkler_similarity("", "smith"), 0.0);
412 }
413
414 #[test]
415 fn levenshtein_identical() {
416 assert_eq!(Scorer::levenshtein_similarity("smith", "smith"), 1.0);
417 }
418
419 #[test]
420 fn levenshtein_one_edit() {
421 let s = Scorer::levenshtein_similarity("smith", "smyth");
422 assert!((s - 0.8).abs() < 1e-9, "got {s}");
423 }
424
425 #[test]
426 fn levenshtein_empty_pair_is_one() {
427 assert_eq!(Scorer::levenshtein_similarity("", ""), 1.0);
428 }
429
430 #[test]
431 fn exact_match_basic() {
432 assert_eq!(Scorer::exact_match("test", "test"), 1.0);
433 assert_eq!(Scorer::exact_match("test", "Test"), 0.0);
434 assert_eq!(Scorer::exact_match("test", "other"), 0.0);
435 assert_eq!(Scorer::exact_match("", ""), 1.0);
436 }
437
438 #[test]
439 fn combined_identical_is_one() {
440 assert!((Scorer::combined_similarity("smith", "smith") - 1.0).abs() < 1e-9);
441 }
442
443 #[test]
444 fn combined_close_typo_is_high() {
445 let s = Scorer::combined_similarity("Stephen", "Steven");
446 assert!(s > 0.80, "got {s}");
447 }
448
449 #[test]
450 fn haversine_identical_is_zero() {
451 let d = Scorer::haversine_metres(51.5, -0.12, 51.5, -0.12);
452 assert!(d.abs() < 1e-6);
453 }
454
455 #[test]
456 fn haversine_london_paris_about_343km() {
457 let d = Scorer::haversine_metres(51.507_22, -0.127_5, 48.853_0, 2.349_2) / 1000.0;
458 assert!(d > 330.0 && d < 355.0, "got {d}");
459 }
460
461 #[test]
462 fn coordinates_score_contract() {
463 let scale = 50.0;
464 assert!((Scorer::coordinates_score(0.0, scale) - 1.0).abs() < 1e-12);
465 let one_e = Scorer::coordinates_score(scale, scale);
466 assert!((one_e - (1.0_f64 / std::f64::consts::E)).abs() < 1e-12);
467 let far = Scorer::coordinates_score(3.0 * scale, scale);
468 assert!(far < 1e-3);
469 }
470
471 #[test]
472 fn coordinates_score_rejects_pathological_inputs() {
473 assert_eq!(Scorer::coordinates_score(f64::NAN, 50.0), 0.0);
474 assert_eq!(Scorer::coordinates_score(10.0, 0.0), 0.0);
475 assert_eq!(Scorer::coordinates_score(-1.0, 50.0), 0.0);
476 }
477
478 #[test]
479 fn seconds_between_identical_is_zero() {
480 let d = Scorer::seconds_between("2024-06-26T09:00:00Z", "2024-06-26T09:00:00Z");
481 assert_eq!(d, Some(0));
482 }
483
484 #[test]
485 fn seconds_between_one_hour() {
486 let d = Scorer::seconds_between("2024-06-26T09:00:00Z", "2024-06-26T10:00:00Z");
487 assert_eq!(d, Some(3600));
488 }
489
490 #[test]
491 fn seconds_between_is_symmetric_absolute() {
492 let a = Scorer::seconds_between("2024-06-26T09:00:00Z", "2024-06-26T10:00:00Z");
493 let b = Scorer::seconds_between("2024-06-26T10:00:00Z", "2024-06-26T09:00:00Z");
494 assert_eq!(a, b);
495 }
496
497 #[test]
498 fn seconds_between_one_day() {
499 let d = Scorer::seconds_between("2024-06-26", "2024-06-27");
500 assert_eq!(d, Some(86_400));
501 }
502
503 #[test]
504 fn seconds_between_rejects_garbage() {
505 assert!(Scorer::seconds_between("not-a-date", "2024-06-26").is_none());
506 assert!(Scorer::seconds_between("2024-06-26", "also-not-a-date").is_none());
507 }
508
509 #[test]
510 fn start_date_score_contract() {
511 let scale = 3600.0;
512 assert!((Scorer::start_date_score(0.0, scale) - 1.0).abs() < 1e-12);
513 let one_e = Scorer::start_date_score(scale, scale);
514 assert!((one_e - (1.0_f64 / std::f64::consts::E)).abs() < 1e-12);
515 let far = Scorer::start_date_score(3.0 * scale, scale);
516 assert!(far < 1e-3);
517 }
518
519 #[test]
520 fn start_date_score_rejects_pathological_inputs() {
521 assert_eq!(Scorer::start_date_score(f64::NAN, 3600.0), 0.0);
522 assert_eq!(Scorer::start_date_score(10.0, 0.0), 0.0);
523 assert_eq!(Scorer::start_date_score(-1.0, 3600.0), 0.0);
524 }
525
526 #[test]
527 fn optional_field_both_none_is_one() {
528 let n: Option<String> = None;
529 assert_eq!(
530 Scorer::optional_field_score(&n, &n, SimilarityAlgorithm::Exact),
531 1.0
532 );
533 }
534
535 #[test]
536 fn optional_field_asymmetric_is_zero() {
537 let n: Option<String> = None;
538 let s = Some("x".to_string());
539 assert_eq!(
540 Scorer::optional_field_score(&s, &n, SimilarityAlgorithm::Exact),
541 0.0
542 );
543 }
544}