Skip to main content

event_matcher/
matcher.rs

1//! Event matcher engine: deterministic and probabilistic algorithms.
2//!
3//! This is the orchestration layer of the crate. It pulls together the data
4//! types from [`crate::models`], the text transformations from
5//! [`crate::normalizer`], and the similarity primitives from
6//! [`crate::scorer`] to produce a single answer about whether two event
7//! records refer to the same event.
8//!
9//! ## Two strategies, one engine
10//!
11//! - [`MatchingEngine::deterministic_match`] — fast, binary. Returns `true`
12//!   iff the two events share any external event-ID pair, or share an
13//!   exact normalised primary name plus the same normalised start date.
14//! - [`MatchingEngine::match_events`] — weighted probabilistic scoring,
15//!   returning a [`MatchResult`] with per-field [`MatchBreakdown`].
16//!
17//! ## Example
18//!
19//! ```
20//! use event_matcher::{MatchingEngine, Event};
21//!
22//! let a = Event::builder()
23//!     .name("Glastonbury Festival 2024")
24//!     .start_date("2024-06-26T09:00:00Z")
25//!     .build();
26//!
27//! let b = Event::builder()
28//!     .name("Glasto 2024")
29//!     .add_alternate_name("Glastonbury Festival 2024")
30//!     .start_date("2024-06-26T09:15:00Z")
31//!     .build();
32//!
33//! let engine = MatchingEngine::default_config();
34//! let result = engine.match_events(&a, &b);
35//! assert!(result.is_match);
36//! ```
37
38use crate::models::{Address, Event, Location};
39use crate::normalizer::Normalizer;
40use crate::scorer::{Scorer, SimilarityAlgorithm};
41use serde::{Deserialize, Serialize};
42
/// Tunable configuration for the matching engine.
///
/// All weights are dimensionless and contribute to a renormalised weighted
/// sum — they do not need to add to `1.0`. The matching pipeline divides
/// the weighted sum by the sum of *participating* weights so that missing
/// fields neither contribute nor penalise. The score is then compared
/// against [`MatchConfig::match_threshold`] to produce the `is_match`
/// boolean; when [`MatchConfig::strict_mode`] is set, a deterministic match
/// is required in addition to clearing the threshold.
///
/// Two presets cover most needs:
///
/// - [`MatchConfig::strict`]  — `match_threshold = 0.95`, `strict_mode = true`.
/// - [`MatchConfig::lenient`] — `match_threshold = 0.65`, phonetic on.
///
/// The struct is `#[serde(default)]`, so a partial serialized config is
/// completed from [`MatchConfig::default`].
///
/// # Example
///
/// ```
/// use event_matcher::{MatchConfig, SimilarityAlgorithm};
///
/// let custom = MatchConfig {
///     match_threshold: 0.80,
///     name_weight: 0.20,
///     start_date_weight: 0.25,
///     start_date_scale_seconds: 3600.0,
///     end_date_weight: 0.05,
///     location_weight: 0.15,
///     coordinates_scale_metres: 100.0,
///     category_weight: 0.08,
///     country_code_weight: 0.04,
///     event_ids_weight: 0.15,
///     organizer_weight: 0.04,
///     performers_weight: 0.02,
///     url_weight: 0.02,
///     use_phonetic_matching: true,
///     name_algorithm: SimilarityAlgorithm::Combined,
///     strict_mode: false,
/// };
/// assert_eq!(custom.match_threshold, 0.80);
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct MatchConfig {
    /// Threshold score for considering two events a match (`0.0..=1.0`).
    /// A score equal to the threshold counts as clearing it.
    pub match_threshold: f64,

    /// Weight for name similarity (best-of cartesian product across the
    /// primary `name` and `alternate_names` on both sides).
    pub name_weight: f64,

    /// Weight for `start_date` similarity (Gaussian decay over absolute
    /// seconds difference).
    pub start_date_weight: f64,

    /// Time scale, in seconds, controlling the Gaussian decay of the
    /// `start_date` score. At a separation equal to `scale` the score is
    /// `1/e ~= 0.368`. Defaults to one hour (`3600.0`).
    ///
    /// The engine applies this same scale when scoring `end_date`; there is
    /// no separate end-date scale.
    pub start_date_scale_seconds: f64,

    /// Weight for `end_date` similarity (same Gaussian-decay shape as
    /// `start_date`, using `start_date_scale_seconds` as its scale).
    pub end_date_weight: f64,

    /// Weight for location similarity (weighted blend of coordinates,
    /// address, venue name, and virtual URL).
    pub location_weight: f64,

    /// Distance scale, in metres, controlling the Gaussian decay of the
    /// coordinates sub-score inside `location`. Defaults to `100.0`.
    pub coordinates_scale_metres: f64,

    /// Weight for [`EventCategory`](crate::models::EventCategory) equality
    /// (1.0 / 0.0 when both sides set).
    pub category_weight: f64,

    /// Weight for case-insensitive equality of
    /// `country_code_as_iso_3166_1_alpha_2`.
    pub country_code_weight: f64,

    /// Weight for "shared external event ID" (1.0 if any `(scheme, value)`
    /// pair is shared, 0.0 otherwise).
    pub event_ids_weight: f64,

    /// Weight for organiser-name similarity (Combined string similarity
    /// after name normalisation).
    pub organizer_weight: f64,

    /// Weight for performer-list similarity (best-of cartesian product
    /// after name normalisation).
    pub performers_weight: f64,

    /// Weight for canonical-URL exact match after trimming whitespace.
    pub url_weight: f64,

    /// Whether to add a phonetic-name bonus when both names sound alike.
    ///
    /// The bonus is not configurable: when the phonetic score exceeds `0.9`
    /// it joins the weighted sum with a fixed weight of `0.05`.
    pub use_phonetic_matching: bool,

    /// Similarity algorithm to use when comparing names.
    pub name_algorithm: SimilarityAlgorithm,

    /// Stricter deterministic enforcement. When `true`, `is_match` requires
    /// both a probabilistic score at or above the threshold *and* a
    /// deterministic match (see `MatchingEngine::deterministic_match`).
    pub strict_mode: bool,
}
147
148impl Default for MatchConfig {
149    /// Production-ready defaults.
150    ///
151    /// ```
152    /// use event_matcher::{MatchConfig, SimilarityAlgorithm};
153    /// let c = MatchConfig::default();
154    /// assert!((c.match_threshold - 0.80).abs() < 1e-9);
155    /// assert!(matches!(c.name_algorithm, SimilarityAlgorithm::Combined));
156    /// ```
157    fn default() -> Self {
158        Self {
159            match_threshold: 0.80,
160            name_weight: 0.20,
161            start_date_weight: 0.25,
162            start_date_scale_seconds: 3600.0,
163            end_date_weight: 0.05,
164            location_weight: 0.15,
165            coordinates_scale_metres: 100.0,
166            category_weight: 0.08,
167            country_code_weight: 0.04,
168            event_ids_weight: 0.15,
169            organizer_weight: 0.04,
170            performers_weight: 0.02,
171            url_weight: 0.02,
172            use_phonetic_matching: false,
173            name_algorithm: SimilarityAlgorithm::Combined,
174            strict_mode: false,
175        }
176    }
177}
178
179impl MatchConfig {
180    /// A stricter preset: `match_threshold = 0.95`, `strict_mode = true`.
181    ///
182    /// Use when callers must rely on the answer and false positives are
183    /// more dangerous than false negatives.
184    ///
185    /// ```
186    /// use event_matcher::MatchConfig;
187    /// let c = MatchConfig::strict();
188    /// assert!((c.match_threshold - 0.95).abs() < 1e-9);
189    /// assert!(c.strict_mode);
190    /// ```
191    #[must_use]
192    pub fn strict() -> Self {
193        Self {
194            match_threshold: 0.95,
195            strict_mode: true,
196            ..Default::default()
197        }
198    }
199
200    /// A more forgiving preset: `match_threshold = 0.65`, phonetic matching on.
201    ///
202    /// Use when triaging large candidate sets where false negatives are
203    /// worse than false positives.
204    ///
205    /// ```
206    /// use event_matcher::MatchConfig;
207    /// let c = MatchConfig::lenient();
208    /// assert!((c.match_threshold - 0.65).abs() < 1e-9);
209    /// assert!(c.use_phonetic_matching);
210    /// ```
211    #[must_use]
212    pub fn lenient() -> Self {
213        Self {
214            match_threshold: 0.65,
215            use_phonetic_matching: true,
216            ..Default::default()
217        }
218    }
219}
220
/// Qualitative confidence band derived from the probabilistic
/// [`MatchResult::score`].
///
/// The bands are fixed across all `MatchConfig` presets — they do **not**
/// follow `match_threshold`. They are intended for triage UIs and audit
/// logs where a coarse High/Medium/Low summary is more useful than the
/// raw float. As a consequence a result can be `is_match == true` with a
/// `Low` band (e.g. a lenient threshold of `0.65`), or the reverse.
///
/// Boundaries (lower bounds are inclusive):
///
/// | Score range | Band |
/// |---|---|
/// | `score >= 0.90` | `High` |
/// | `0.75 <= score < 0.90` | `Medium` |
/// | `score < 0.75` (and `NaN`) | `Low` |
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Confidence {
    /// Score is at or above `0.90`. Strong match.
    High,
    /// Score is in `0.75..0.90`. Medium-confidence match.
    Medium,
    /// Score is below `0.75`, or is not a number. Candidate at best.
    Low,
}
245
246impl Confidence {
247    /// Bucket a probabilistic score into one of the three bands.
248    ///
249    /// NaN inputs and negatives degrade to `Low`; scores above `1.0` are
250    /// treated as `High`.
251    ///
252    /// ```
253    /// use event_matcher::Confidence;
254    ///
255    /// assert_eq!(Confidence::from_score(f64::NAN), Confidence::Low);
256    /// assert_eq!(Confidence::from_score(-0.5),     Confidence::Low);
257    /// assert_eq!(Confidence::from_score(2.0),      Confidence::High);
258    /// ```
259    #[must_use]
260    pub fn from_score(score: f64) -> Self {
261        if score >= 0.90 {
262            Confidence::High
263        } else if score >= 0.75 {
264            Confidence::Medium
265        } else {
266            Confidence::Low
267        }
268    }
269}
270
/// Outcome of a probabilistic event match.
///
/// Contains the overall renormalised `score`, the threshold-derived
/// `is_match` boolean, a coarse [`Confidence`] band, and a per-field
/// [`MatchBreakdown`] for audit.
///
/// `MatchResult` implements `Serialize + Deserialize` so it can be persisted
/// or returned over an API.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchResult {
    /// Overall match score: the weighted sum of the participating fields
    /// divided by the sum of their weights. Intended range `[0.0, 1.0]`;
    /// `0.0` when no field could be compared.
    pub score: f64,

    /// `true` if `score >= MatchConfig::match_threshold`. When
    /// `MatchConfig::strict_mode` is enabled the two events must also pass
    /// the deterministic match for this to be `true`.
    pub is_match: bool,

    /// Coarse confidence band derived from `score`. Defaults to
    /// [`Confidence::Low`] on legacy JSON payloads that pre-date the field.
    #[serde(default = "default_confidence")]
    pub confidence: Confidence,

    /// Per-field score contributions for explainability.
    pub breakdown: MatchBreakdown,
}
295
/// Serde fallback for [`MatchResult::confidence`]: payloads that lack the
/// field deserialize with the most conservative band.
fn default_confidence() -> Confidence {
    Confidence::Low
}
299
/// Per-field score breakdown returned with every [`MatchResult`].
///
/// Each field is `Option<f64>`:
///
/// - `Some(score)` — the field was scored; the value is in `[0.0, 1.0]`.
/// - `None` — the field was missing on at least one side and so did not
///   participate in the weighted sum.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchBreakdown {
    /// Best-of-cartesian-product similarity across primary name +
    /// alternate names on both sides, using the configured algorithm.
    /// Blank (whitespace-only) names are ignored; `None` if either side
    /// has no usable name.
    pub name_score: Option<f64>,
    /// `1.0` if any name pair across the two events yields the same
    /// non-empty phonetic code, `0.0` otherwise. `None` when
    /// `use_phonetic_matching` is false or either side has no names.
    pub name_phonetic_score: Option<f64>,
    /// Gaussian-decay score over the absolute seconds difference between
    /// the two `start_date` values. `None` if either is missing or fails
    /// to parse as ISO 8601.
    pub start_date_score: Option<f64>,
    /// Gaussian-decay score over the absolute seconds difference between
    /// the two `end_date` values, using the same time scale as
    /// `start_date`. `None` if either is missing or fails to parse as
    /// ISO 8601.
    pub end_date_score: Option<f64>,
    /// Weighted blend of coordinates, address, venue-name, and virtual-URL
    /// similarity. `None` if either side has no location. If both sides
    /// have a location but share no comparable sub-component, the score is
    /// a neutral `0.5` (which does participate in the weighted sum).
    pub location_score: Option<f64>,
    /// `1.0` if both categories set and structurally equal; `0.0` if both
    /// set but differ; `None` if either is `None`.
    pub category_score: Option<f64>,
    /// `1.0` if both country codes set and equal after trim + ASCII
    /// lowercase; `0.0` otherwise; `None` if either is `None`.
    pub country_code_score: Option<f64>,
    /// `1.0` if both `event_ids` non-empty and they share any
    /// `(scheme, value)` pair; `0.0` if both non-empty but none shared;
    /// `None` if either side is empty.
    pub event_ids_score: Option<f64>,
    /// Combined string similarity for the organiser, after name
    /// normalisation. `None` if either side is absent.
    pub organizer_score: Option<f64>,
    /// Best-of cartesian product across performer lists, after name
    /// normalisation. `None` if either side has no performers.
    pub performers_score: Option<f64>,
    /// `1.0` if both URLs set and equal after trimming, else `0.0`. `None`
    /// if either is absent.
    pub url_score: Option<f64>,
}
346
347/// Event matcher engine.
348///
349/// The engine is **immutable after construction** and cheap to clone (it
350/// owns only a [`MatchConfig`]). Construct one and call its methods from any
351/// thread.
352///
353/// ```
354/// use event_matcher::{MatchConfig, MatchingEngine};
355///
356/// let engine_a = MatchingEngine::default_config();
357/// let engine_b = MatchingEngine::new(MatchConfig::strict());
358/// # let _ = (engine_a, engine_b);
359/// ```
360pub struct MatchingEngine {
361    config: MatchConfig,
362}
363
364impl MatchingEngine {
365    /// Construct an engine with the given configuration.
366    #[must_use]
367    pub fn new(config: MatchConfig) -> Self {
368        Self { config }
369    }
370
371    /// Construct an engine with [`MatchConfig::default`].
372    #[must_use]
373    pub fn default_config() -> Self {
374        Self::new(MatchConfig::default())
375    }
376
377    /// Compare two events probabilistically and return a [`MatchResult`].
378    ///
379    /// The score is the weight-renormalised sum of every component that
380    /// scored on both records. Missing fields are skipped, not penalised.
381    ///
382    /// ```
383    /// use event_matcher::{MatchingEngine, Event};
384    ///
385    /// let e = Event::builder()
386    ///     .name("RustConf 2024")
387    ///     .start_date("2024-09-10T09:00:00Z")
388    ///     .build();
389    ///
390    /// let result = MatchingEngine::default_config().match_events(&e, &e);
391    /// assert!(result.is_match);
392    /// assert!(result.score > 0.99);
393    /// ```
394    #[must_use]
395    pub fn match_events(&self, event1: &Event, event2: &Event) -> MatchResult {
396        let breakdown = self.calculate_breakdown(event1, event2);
397        let score = self.calculate_weighted_score(&breakdown);
398        let above_threshold = score >= self.config.match_threshold;
399        let is_match = if self.config.strict_mode {
400            above_threshold && self.deterministic_match(event1, event2)
401        } else {
402            above_threshold
403        };
404        let confidence = Confidence::from_score(score);
405
406        MatchResult {
407            score,
408            is_match,
409            confidence,
410            breakdown,
411        }
412    }
413
414    /// Score a single query against many candidates. Returns one
415    /// [`MatchResult`] per candidate, in the same order as the input slice.
416    ///
417    /// ```
418    /// use event_matcher::{MatchingEngine, Event};
419    ///
420    /// let query = Event::builder().name("RustConf 2024").build();
421    /// let candidates = vec![
422    ///     Event::builder().name("RustConf 2024").build(),
423    ///     Event::builder().name("GoConf 2024").build(),
424    /// ];
425    ///
426    /// let results = MatchingEngine::default_config().match_one_to_many(&query, &candidates);
427    /// assert_eq!(results.len(), 2);
428    /// assert!(results[0].is_match);
429    /// assert!(!results[1].is_match);
430    /// ```
431    #[must_use]
432    pub fn match_one_to_many(&self, query: &Event, candidates: &[Event]) -> Vec<MatchResult> {
433        candidates
434            .iter()
435            .map(|c| self.match_events(query, c))
436            .collect()
437    }
438
439    /// Score and rank: return `(original_index, MatchResult)` tuples
440    /// sorted by descending score. Ties are broken by ascending original
441    /// index, so the result is deterministic.
442    #[must_use]
443    pub fn rank_one_to_many(
444        &self,
445        query: &Event,
446        candidates: &[Event],
447    ) -> Vec<(usize, MatchResult)> {
448        let mut indexed: Vec<(usize, MatchResult)> = self
449            .match_one_to_many(query, candidates)
450            .into_iter()
451            .enumerate()
452            .collect();
453        indexed.sort_by(|a, b| {
454            b.1.score
455                .partial_cmp(&a.1.score)
456                .unwrap_or(std::cmp::Ordering::Equal)
457                .then_with(|| a.0.cmp(&b.0))
458        });
459        indexed
460    }
461
462    /// Compare two events deterministically and return a single boolean.
463    ///
464    /// Returns `true` iff either:
465    ///
466    /// - the events share any `(scheme, value)` pair in their `event_ids`
467    ///   lists, OR
468    /// - both have a primary `name` that normalises to the same value AND
469    ///   both have a `start_date` that parses to the same instant.
470    ///
471    /// ```
472    /// use event_matcher::{MatchingEngine, Event, EventId, EventIdScheme};
473    ///
474    /// let id = EventId::new(EventIdScheme::Eventbrite, "123456789").unwrap();
475    /// let a = Event::builder().name("RustConf 2024").add_event_id(id.clone()).build();
476    /// let b = Event::builder().name("RC '24").add_event_id(id).build();
477    /// assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
478    /// ```
479    #[must_use]
480    pub fn deterministic_match(&self, event1: &Event, event2: &Event) -> bool {
481        if shares_event_id(event1, event2) {
482            return true;
483        }
484        name_and_start_date_match(event1, event2)
485    }
486
487    fn calculate_breakdown(&self, event1: &Event, event2: &Event) -> MatchBreakdown {
488        MatchBreakdown {
489            name_score: self.score_name(event1, event2),
490            name_phonetic_score: if self.config.use_phonetic_matching {
491                Self::score_phonetic_names(event1, event2)
492            } else {
493                None
494            },
495            start_date_score: self.score_start_date(event1, event2),
496            end_date_score: self.score_end_date(event1, event2),
497            location_score: self.score_location(event1, event2),
498            category_score: score_category(event1, event2),
499            country_code_score: score_country_code(event1, event2),
500            event_ids_score: score_event_ids(event1, event2),
501            organizer_score: Self::score_organizer(event1, event2),
502            performers_score: Self::score_performers(event1, event2),
503            url_score: score_url(event1, event2),
504        }
505    }
506
507    fn calculate_weighted_score(&self, breakdown: &MatchBreakdown) -> f64 {
508        let mut total_weight = 0.0;
509        let mut weighted_sum = 0.0;
510
511        let mut accumulate = |opt: Option<f64>, weight: f64| {
512            if let Some(score) = opt {
513                weighted_sum += score * weight;
514                total_weight += weight;
515            }
516        };
517
518        accumulate(breakdown.name_score, self.config.name_weight);
519        accumulate(breakdown.start_date_score, self.config.start_date_weight);
520        accumulate(breakdown.end_date_score, self.config.end_date_weight);
521        accumulate(breakdown.location_score, self.config.location_weight);
522        accumulate(breakdown.category_score, self.config.category_weight);
523        accumulate(
524            breakdown.country_code_score,
525            self.config.country_code_weight,
526        );
527        accumulate(breakdown.event_ids_score, self.config.event_ids_weight);
528        accumulate(breakdown.organizer_score, self.config.organizer_weight);
529        accumulate(breakdown.performers_score, self.config.performers_weight);
530        accumulate(breakdown.url_score, self.config.url_weight);
531
532        // Phonetic match is a bonus only — never lowers the score.
533        if let Some(score) = breakdown.name_phonetic_score
534            && score > 0.9
535        {
536            weighted_sum += score * 0.05;
537            total_weight += 0.05;
538        }
539
540        if total_weight > 0.0 {
541            weighted_sum / total_weight
542        } else {
543            0.0
544        }
545    }
546
547    fn score_name(&self, e1: &Event, e2: &Event) -> Option<f64> {
548        let names1 = collect_names(e1);
549        let names2 = collect_names(e2);
550        if names1.is_empty() || names2.is_empty() {
551            return None;
552        }
553        let mut best = f64::NEG_INFINITY;
554        for n1 in &names1 {
555            for n2 in &names2 {
556                let s = self.score_name_pair(n1, n2);
557                if s > best {
558                    best = s;
559                }
560            }
561        }
562        Some(best)
563    }
564
565    fn score_name_pair(&self, name1: &str, name2: &str) -> f64 {
566        let norm1 = Normalizer::normalize_name(name1);
567        let norm2 = Normalizer::normalize_name(name2);
568        match self.config.name_algorithm {
569            SimilarityAlgorithm::JaroWinkler => Scorer::jaro_winkler_similarity(&norm1, &norm2),
570            SimilarityAlgorithm::Levenshtein => Scorer::levenshtein_similarity(&norm1, &norm2),
571            SimilarityAlgorithm::Exact => Scorer::exact_match(&norm1, &norm2),
572            SimilarityAlgorithm::Combined => Scorer::combined_similarity(&norm1, &norm2),
573        }
574    }
575
576    fn score_phonetic_names(e1: &Event, e2: &Event) -> Option<f64> {
577        let names1 = collect_names(e1);
578        let names2 = collect_names(e2);
579        if names1.is_empty() || names2.is_empty() {
580            return None;
581        }
582        let codes1: Vec<String> = names1
583            .iter()
584            .map(|n| Normalizer::phonetic_code(n))
585            .collect();
586        let codes2: Vec<String> = names2
587            .iter()
588            .map(|n| Normalizer::phonetic_code(n))
589            .collect();
590        let mut best = 0.0_f64;
591        for c1 in &codes1 {
592            for c2 in &codes2 {
593                if !c1.is_empty() && c1 == c2 {
594                    best = 1.0;
595                }
596            }
597        }
598        Some(best)
599    }
600
601    // A signed second-delta between two event timestamps. The magnitude is
602    // tiny relative to f64's 52-bit mantissa, so the lossy i64->f64 cast is
603    // harmless here.
604    #[allow(clippy::cast_precision_loss)]
605    fn score_start_date(&self, e1: &Event, e2: &Event) -> Option<f64> {
606        let d = Scorer::seconds_between(e1.start_date.as_deref()?, e2.start_date.as_deref()?)?;
607        Some(Scorer::start_date_score(
608            d as f64,
609            self.config.start_date_scale_seconds,
610        ))
611    }
612
613    #[allow(clippy::cast_precision_loss)]
614    fn score_end_date(&self, e1: &Event, e2: &Event) -> Option<f64> {
615        let d = Scorer::seconds_between(e1.end_date.as_deref()?, e2.end_date.as_deref()?)?;
616        Some(Scorer::start_date_score(
617            d as f64,
618            self.config.start_date_scale_seconds,
619        ))
620    }
621
622    fn score_location(&self, e1: &Event, e2: &Event) -> Option<f64> {
623        match (e1.location.as_ref(), e2.location.as_ref()) {
624            (Some(l1), Some(l2)) => Some(self.compare_locations(l1, l2)),
625            _ => None,
626        }
627    }
628
629    fn compare_locations(&self, l1: &Location, l2: &Location) -> f64 {
630        // Each sub-component contributes a raw score in `[0.0, 1.0]` and a
631        // weight. Final score is the weight-renormalised average across
632        // sub-components that fired. Coordinates dominate (`0.5`), then
633        // address (`0.3`), then venue name (`0.15`), then virtual URL
634        // (`0.05`).
635        let mut weighted_sum = 0.0_f64;
636        let mut total_weight = 0.0_f64;
637
638        if let (Some(lat1), Some(lon1), Some(lat2), Some(lon2)) =
639            (l1.latitude, l1.longitude, l2.latitude, l2.longitude)
640            && let (Some((la1, lo1)), Some((la2, lo2))) = (
641                valid_coords(Some(lat1), Some(lon1)),
642                valid_coords(Some(lat2), Some(lon2)),
643            )
644        {
645            let d = Scorer::haversine_metres(la1, lo1, la2, lo2);
646            weighted_sum +=
647                Scorer::coordinates_score(d, self.config.coordinates_scale_metres) * 0.5;
648            total_weight += 0.5;
649        }
650
651        if let (Some(a1), Some(a2)) = (l1.address.as_ref(), l2.address.as_ref()) {
652            weighted_sum += compare_addresses(a1, a2) * 0.3;
653            total_weight += 0.3;
654        }
655
656        if let (Some(v1), Some(v2)) = (l1.venue_name.as_deref(), l2.venue_name.as_deref()) {
657            let n1 = Normalizer::normalize_name(v1);
658            let n2 = Normalizer::normalize_name(v2);
659            weighted_sum += Scorer::combined_similarity(&n1, &n2) * 0.15;
660            total_weight += 0.15;
661        }
662
663        if let (Some(u1), Some(u2)) = (l1.virtual_url.as_deref(), l2.virtual_url.as_deref()) {
664            weighted_sum += f64::from(u1.trim() == u2.trim()) * 0.05;
665            total_weight += 0.05;
666        }
667
668        if total_weight == 0.0 {
669            0.5
670        } else {
671            weighted_sum / total_weight
672        }
673    }
674
675    fn score_organizer(e1: &Event, e2: &Event) -> Option<f64> {
676        let o1 = e1.organizer.as_deref()?;
677        let o2 = e2.organizer.as_deref()?;
678        let n1 = Normalizer::normalize_name(o1);
679        let n2 = Normalizer::normalize_name(o2);
680        Some(Scorer::combined_similarity(&n1, &n2))
681    }
682
683    fn score_performers(e1: &Event, e2: &Event) -> Option<f64> {
684        if e1.performers.is_empty() || e2.performers.is_empty() {
685            return None;
686        }
687        let mut best = 0.0_f64;
688        for a in &e1.performers {
689            for b in &e2.performers {
690                let na = Normalizer::normalize_name(a);
691                let nb = Normalizer::normalize_name(b);
692                let s = Scorer::combined_similarity(&na, &nb);
693                if s > best {
694                    best = s;
695                }
696            }
697        }
698        Some(best)
699    }
700}
701
702// ---- Free helpers ------------------------------------------------------
703
704/// Collect an event's primary name plus alternate names into a single vec
705/// of references. Empty / whitespace-only strings are skipped.
706fn collect_names(event: &Event) -> Vec<&String> {
707    event
708        .name
709        .iter()
710        .chain(event.alternate_names.iter())
711        .filter(|s| !s.trim().is_empty())
712        .collect()
713}
714
/// Return the `(lat, lon)` pair only when both values are present, finite,
/// and within `-90..=90` / `-180..=180` respectively; otherwise `None`.
fn valid_coords(lat: Option<f64>, lon: Option<f64>) -> Option<(f64, f64)> {
    lat.zip(lon).filter(|&(lat, lon)| {
        lat.is_finite()
            && lon.is_finite()
            && (-90.0..=90.0).contains(&lat)
            && (-180.0..=180.0).contains(&lon)
    })
}
727
728fn score_category(e1: &Event, e2: &Event) -> Option<f64> {
729    match (&e1.category, &e2.category) {
730        (Some(a), Some(b)) => Some(if a == b { 1.0 } else { 0.0 }),
731        _ => None,
732    }
733}
734
735fn score_country_code(e1: &Event, e2: &Event) -> Option<f64> {
736    let a = e1.country_code_as_iso_3166_1_alpha_2.as_ref()?;
737    let b = e2.country_code_as_iso_3166_1_alpha_2.as_ref()?;
738    let na = a.trim().to_ascii_lowercase();
739    let nb = b.trim().to_ascii_lowercase();
740    Some(if na == nb { 1.0 } else { 0.0 })
741}
742
743fn shares_event_id(e1: &Event, e2: &Event) -> bool {
744    if e1.event_ids.is_empty() || e2.event_ids.is_empty() {
745        return false;
746    }
747    for id1 in &e1.event_ids {
748        for id2 in &e2.event_ids {
749            if id1 == id2 {
750                return true;
751            }
752        }
753    }
754    false
755}
756
757fn score_event_ids(e1: &Event, e2: &Event) -> Option<f64> {
758    if e1.event_ids.is_empty() || e2.event_ids.is_empty() {
759        return None;
760    }
761    Some(if shares_event_id(e1, e2) { 1.0 } else { 0.0 })
762}
763
764fn score_url(e1: &Event, e2: &Event) -> Option<f64> {
765    let u1 = e1.url.as_deref()?;
766    let u2 = e2.url.as_deref()?;
767    Some(f64::from(u1.trim() == u2.trim()))
768}
769
770fn name_and_start_date_match(e1: &Event, e2: &Event) -> bool {
771    let (Some(n1), Some(n2)) = (&e1.name, &e2.name) else {
772        return false;
773    };
774    if Normalizer::normalize_name(n1) != Normalizer::normalize_name(n2) {
775        return false;
776    }
777    let (Some(sd1), Some(sd2)) = (&e1.start_date, &e2.start_date) else {
778        return false;
779    };
780    match (
781        Normalizer::parse_iso8601_unix_seconds(sd1),
782        Normalizer::parse_iso8601_unix_seconds(sd2),
783    ) {
784        (Some(a), Some(b)) => a == b,
785        _ => false,
786    }
787}
788
789/// Compare two postal addresses; same blend rule as the previous
790/// place-matcher implementation: postcode dominates (0.5), then city
791/// (0.3), then line 1 (0.2).
792fn compare_addresses(addr1: &Address, addr2: &Address) -> f64 {
793    let mut weighted_sum = 0.0_f64;
794    let mut total_weight = 0.0_f64;
795
796    if let (Some(pc1), Some(pc2)) = (&addr1.postcode, &addr2.postcode) {
797        let norm1 = Normalizer::normalize_postcode(pc1);
798        let norm2 = Normalizer::normalize_postcode(pc2);
799        weighted_sum += f64::from(norm1 == norm2) * 0.5;
800        total_weight += 0.5;
801    }
802
803    if let (Some(city1), Some(city2)) = (&addr1.city, &addr2.city) {
804        let norm1 = Normalizer::normalize_name(city1);
805        let norm2 = Normalizer::normalize_name(city2);
806        weighted_sum += Scorer::jaro_winkler_similarity(&norm1, &norm2) * 0.3;
807        total_weight += 0.3;
808    }
809
810    if let (Some(line1), Some(line2)) = (&addr1.line1, &addr2.line1) {
811        let parsed1 = Normalizer::parse_address_line(line1);
812        let parsed2 = Normalizer::parse_address_line(line2);
813        let street_sim = Scorer::jaro_winkler_similarity(&parsed1.street, &parsed2.street);
814        let house_score = match (&parsed1.house_number, &parsed2.house_number) {
815            (Some(a), Some(b)) => Some(f64::from(a == b)),
816            _ => None,
817        };
818        let line1_score = match house_score {
819            Some(h) => 0.6 * street_sim + 0.4 * h,
820            None => street_sim,
821        };
822        weighted_sum += line1_score * 0.2;
823        total_weight += 0.2;
824    }
825
826    if total_weight == 0.0 {
827        0.5
828    } else {
829        weighted_sum / total_weight
830    }
831}
832
/// Unit tests for the matching engine: configuration presets and serde
/// behaviour, per-field breakdown scores, the deterministic matcher,
/// strict-mode enforcement, the batch APIs, and confidence banding.
#[cfg(test)]
// Some assertions check exact sentinel scores (`0.0` / `1.0`), where exact
// float comparison is the intended behaviour.
#[allow(clippy::float_cmp)]
mod tests {
    use super::*;
    use crate::models::{EventCategory, EventId, EventIdScheme};

    // ---------- MatchConfig presets ----------

    #[test]
    fn config_default_values() {
        let c = MatchConfig::default();
        assert!((c.match_threshold - 0.80).abs() < 1e-9);
        assert!(!c.strict_mode);
    }

    #[test]
    fn config_strict_raises_threshold_and_sets_flag() {
        let c = MatchConfig::strict();
        assert!((c.match_threshold - 0.95).abs() < 1e-9);
        assert!(c.strict_mode);
    }

    #[test]
    fn config_lenient_lowers_threshold() {
        let c = MatchConfig::lenient();
        assert!((c.match_threshold - 0.65).abs() < 1e-9);
        assert!(c.use_phonetic_matching);
    }

    // ---------- MatchConfig serde ----------

    #[test]
    fn config_default_round_trips_through_json() {
        let cfg = MatchConfig::default();
        let json = serde_json::to_string(&cfg).expect("serialise");
        let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
        assert!((cfg.match_threshold - back.match_threshold).abs() < 1e-12);
        assert!((cfg.name_weight - back.name_weight).abs() < 1e-12);
        assert!((cfg.start_date_weight - back.start_date_weight).abs() < 1e-12);
        assert!(matches!(back.name_algorithm, SimilarityAlgorithm::Combined));
        assert_eq!(cfg.strict_mode, back.strict_mode);
    }

    #[test]
    fn config_partial_json_fills_missing_fields_from_default() {
        let defaults = MatchConfig::default();
        // The supplied threshold must differ from the default; otherwise the
        // test could not tell "value was parsed" from "value was ignored and
        // the default was used".
        let partial = r#"{"match_threshold": 0.70}"#;
        assert!(
            (defaults.match_threshold - 0.70).abs() > 1e-9,
            "test value must differ from the default threshold"
        );

        let cfg: MatchConfig = serde_json::from_str(partial).expect("partial json");

        // The explicitly supplied field is honoured...
        assert!((cfg.match_threshold - 0.70).abs() < 1e-12);
        // ...and every omitted field falls back to its default.
        assert!(matches!(cfg.name_algorithm, SimilarityAlgorithm::Combined));
        assert!((cfg.name_weight - defaults.name_weight).abs() < 1e-12);
        assert_eq!(cfg.strict_mode, defaults.strict_mode);
    }

    // ---------- probabilistic match ----------

    #[test]
    fn exact_clone_is_a_match() {
        let e = Event::builder()
            .name("RustConf 2024")
            .start_date("2024-09-10T09:00:00Z")
            .build();
        let result = MatchingEngine::default_config().match_events(&e, &e.clone());
        assert!(result.is_match);
        assert!(result.score > 0.95);
    }

    #[test]
    fn name_match_takes_best_of_cartesian_product() {
        // `b`'s primary name differs, but one of its alternates is identical
        // to `a`'s primary name; the best pairing should win.
        let a = Event::builder().name("RustConf 2024").build();
        let b = Event::builder()
            .name("Rust Conference 2024")
            .add_alternate_name("RustConf 2024")
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        let s = r.breakdown.name_score.expect("scored");
        assert!(s > 0.99, "got {s}");
    }

    #[test]
    fn unrelated_events_do_not_match() {
        let a = Event::builder()
            .name("RustConf 2024")
            .start_date("2024-09-10T09:00:00Z")
            .build();
        let b = Event::builder()
            .name("Sydney Opera Concert")
            .start_date("2025-03-15T20:00:00Z")
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!(!r.is_match);
        assert!(r.score < 0.5);
    }

    #[test]
    fn no_overlapping_fields_returns_zero_score() {
        let a = Event::builder().url("https://example.org/a").build();
        let b = Event::builder().url("https://example.org/b").build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert_eq!(r.score, 0.0);
    }

    // ---------- start_date ----------

    #[test]
    fn start_date_score_one_when_identical() {
        let a = Event::builder()
            .name("X")
            .start_date("2024-06-26T09:00:00Z")
            .build();
        let b = a.clone();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!((r.breakdown.start_date_score.unwrap() - 1.0).abs() < 1e-9);
    }

    #[test]
    fn start_date_score_decays_with_time_gap() {
        // One month apart: the score should be effectively zero.
        let a = Event::builder()
            .name("X")
            .start_date("2024-06-26T09:00:00Z")
            .build();
        let b = Event::builder()
            .name("X")
            .start_date("2024-07-26T09:00:00Z")
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!(r.breakdown.start_date_score.unwrap() < 1e-3);
    }

    #[test]
    fn start_date_score_none_when_one_side_missing() {
        let a = Event::builder().name("X").start_date("2024-06-26").build();
        let b = Event::builder().name("X").build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!(r.breakdown.start_date_score.is_none());
    }

    #[test]
    fn start_date_score_none_when_garbage() {
        let a = Event::builder().name("X").start_date("not-a-date").build();
        let b = Event::builder().name("X").start_date("2024-06-26").build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!(r.breakdown.start_date_score.is_none());
    }

    // ---------- category ----------

    #[test]
    fn category_equality_scores_one_else_zero() {
        let a = Event::builder()
            .name("X")
            .category(EventCategory::MusicEvent)
            .build();
        let b = Event::builder()
            .name("X")
            .category(EventCategory::MusicEvent)
            .build();
        let c = Event::builder()
            .name("X")
            .category(EventCategory::ComedyEvent)
            .build();
        let engine = MatchingEngine::default_config();
        assert_eq!(
            engine.match_events(&a, &b).breakdown.category_score,
            Some(1.0)
        );
        assert_eq!(
            engine.match_events(&a, &c).breakdown.category_score,
            Some(0.0)
        );
    }

    #[test]
    fn category_score_none_when_either_missing() {
        let a = Event::builder()
            .name("X")
            .category(EventCategory::MusicEvent)
            .build();
        let b = Event::builder().name("X").build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!(r.breakdown.category_score.is_none());
    }

    // ---------- country code ----------

    #[test]
    fn country_code_case_insensitive_equality() {
        let a = Event::builder()
            .name("X")
            .country_code_as_iso_3166_1_alpha_2("gb")
            .build();
        let b = Event::builder()
            .name("X")
            .country_code_as_iso_3166_1_alpha_2("GB")
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert_eq!(r.breakdown.country_code_score, Some(1.0));
    }

    #[test]
    fn country_code_mismatch_scores_zero() {
        let a = Event::builder()
            .name("X")
            .country_code_as_iso_3166_1_alpha_2("GB")
            .build();
        let b = Event::builder()
            .name("X")
            .country_code_as_iso_3166_1_alpha_2("FR")
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert_eq!(r.breakdown.country_code_score, Some(0.0));
    }

    // ---------- event_ids ----------

    #[test]
    fn event_ids_shared_scores_one() {
        let id = EventId::new(EventIdScheme::Eventbrite, "12345").unwrap();
        let a = Event::builder().name("X").add_event_id(id.clone()).build();
        let b = Event::builder().name("X").add_event_id(id).build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert_eq!(r.breakdown.event_ids_score, Some(1.0));
    }

    #[test]
    fn event_ids_scheme_scoped_no_cross_match() {
        // Same identifier value under different schemes must not match.
        let a = Event::builder()
            .name("X")
            .add_event_id(EventId::new(EventIdScheme::Eventbrite, "X").unwrap())
            .build();
        let b = Event::builder()
            .name("X")
            .add_event_id(EventId::new(EventIdScheme::Meetup, "X").unwrap())
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert_eq!(r.breakdown.event_ids_score, Some(0.0));
    }

    #[test]
    fn event_ids_none_when_either_side_empty() {
        let a = Event::builder().name("X").build();
        let b = Event::builder()
            .name("X")
            .add_event_id(EventId::new(EventIdScheme::Eventbrite, "Q1").unwrap())
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!(r.breakdown.event_ids_score.is_none());
    }

    // ---------- deterministic match ----------

    #[test]
    fn deterministic_via_shared_event_id() {
        // A shared ID is sufficient even when the names are unrelated.
        let id = EventId::new(EventIdScheme::Eventbrite, "12345").unwrap();
        let a = Event::builder()
            .name("RustConf 2024")
            .add_event_id(id.clone())
            .build();
        let b = Event::builder()
            .name("Wholly Different")
            .add_event_id(id)
            .build();
        assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
    }

    #[test]
    fn deterministic_via_name_and_start_date() {
        let a = Event::builder()
            .name("RustConf 2024")
            .start_date("2024-09-10T09:00:00Z")
            .build();
        let b = Event::builder()
            .name("RustConf 2024")
            .start_date("2024-09-10T09:00:00Z")
            .build();
        assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
    }

    #[test]
    fn deterministic_via_name_and_start_date_accepts_equivalent_offsets() {
        // 09:00Z and 11:00+02:00 denote the same instant.
        let a = Event::builder()
            .name("RustConf 2024")
            .start_date("2024-09-10T09:00:00Z")
            .build();
        let b = Event::builder()
            .name("RustConf 2024")
            .start_date("2024-09-10T11:00:00+02:00")
            .build();
        assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
    }

    #[test]
    fn deterministic_rejects_when_name_differs_and_no_shared_id() {
        let a = Event::builder()
            .name("X")
            .start_date("2024-09-10T09:00:00Z")
            .build();
        let b = Event::builder()
            .name("Y")
            .start_date("2024-09-10T09:00:00Z")
            .build();
        assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
    }

    #[test]
    fn deterministic_rejects_when_start_date_missing_and_no_shared_id() {
        let a = Event::builder().name("X").build();
        let b = Event::builder().name("X").build();
        assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
    }

    // ---------- strict_mode enforcement ----------

    #[test]
    fn strict_mode_requires_deterministic_for_is_match() {
        // A deliberately low threshold so that the probabilistic score alone
        // would pass; strict mode must still veto the match.
        let cfg = MatchConfig {
            match_threshold: 0.50,
            strict_mode: true,
            ..MatchConfig::default()
        };
        let e1 = Event::builder()
            .name("Cafe Centrale Concert")
            .start_date("2024-09-10T09:00:00Z")
            .build();
        let e2 = Event::builder()
            .name("Cafe Central Concert") // close but not equal under normalisation
            .start_date("2024-09-10T09:00:00Z")
            .build();
        let engine = MatchingEngine::new(cfg);
        let r = engine.match_events(&e1, &e2);
        assert!(r.score >= 0.50);
        assert!(!engine.deterministic_match(&e1, &e2));
        assert!(!r.is_match);
    }

    // ---------- batch APIs ----------

    #[test]
    fn match_one_to_many_empty_candidates_yields_empty_vec() {
        let engine = MatchingEngine::default_config();
        let q = Event::builder().name("Solo").build();
        assert!(engine.match_one_to_many(&q, &[]).is_empty());
    }

    #[test]
    fn rank_one_to_many_sorts_by_score_descending() {
        let engine = MatchingEngine::default_config();
        let q = Event::builder().name("RustConf 2024").build();
        let candidates = vec![
            Event::builder().name("PyConf 2024").build(),
            q.clone(),
            Event::builder().name("GoConf 2024").build(),
        ];
        let ranked = engine.rank_one_to_many(&q, &candidates);
        // The exact clone sits at candidate index 1 and must rank first.
        assert_eq!(ranked[0].0, 1);
        for w in ranked.windows(2) {
            assert!(w[0].1.score >= w[1].1.score);
        }
    }

    // ---------- Confidence ----------

    #[test]
    fn confidence_band_boundaries_are_inclusive_on_the_low_side() {
        assert_eq!(Confidence::from_score(0.90), Confidence::High);
        assert_eq!(Confidence::from_score(0.89), Confidence::Medium);
        assert_eq!(Confidence::from_score(0.75), Confidence::Medium);
        assert_eq!(Confidence::from_score(0.74), Confidence::Low);
    }

    // ---------- location ----------

    #[test]
    fn location_postcode_match_dominates() {
        let l1 = Location::new().with_address(Address::new().with_postcode("BA4 4BY"));
        let l2 = Location::new().with_address(Address::new().with_postcode("BA4 4BY"));
        let s = MatchingEngine::default_config().compare_locations(&l1, &l2);
        assert!((s - 1.0).abs() < 1e-9, "got {s}");
    }

    #[test]
    fn location_score_none_when_either_side_absent() {
        let a = Event::builder()
            .name("X")
            .location(Location::new().with_venue_name("Worthy Farm"))
            .build();
        let b = Event::builder().name("X").build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!(r.breakdown.location_score.is_none());
    }

    // ---------- organizer / performers / url ----------

    #[test]
    fn organizer_match_after_normalisation() {
        let a = Event::builder()
            .name("X")
            .organizer("Rust Foundation")
            .build();
        let b = Event::builder()
            .name("X")
            .organizer("rust foundation")
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!(r.breakdown.organizer_score.unwrap() > 0.99);
    }

    #[test]
    fn performers_match_takes_best_of_cartesian_product() {
        // Only one performer is shared, and it sits at different positions.
        let a = Event::builder()
            .name("X")
            .add_performer("Niko Matsakis")
            .add_performer("Tyler Mandry")
            .build();
        let b = Event::builder()
            .name("X")
            .add_performer("Carol Nichols")
            .add_performer("Niko Matsakis")
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert!(r.breakdown.performers_score.unwrap() > 0.99);
    }

    #[test]
    fn url_match_is_exact_after_trim() {
        let a = Event::builder()
            .name("X")
            .url("https://rustconf.com")
            .build();
        let b = Event::builder()
            .name("X")
            .url("  https://rustconf.com  ")
            .build();
        let r = MatchingEngine::default_config().match_events(&a, &b);
        assert_eq!(r.breakdown.url_score, Some(1.0));
    }

    // ---------- phonetic ----------

    #[test]
    fn phonetic_score_none_when_off() {
        let p = Event::builder().name("Stephen Concert").build();
        let q = Event::builder().name("Steven Concert").build();
        let r = MatchingEngine::new(MatchConfig {
            use_phonetic_matching: false,
            ..MatchConfig::default()
        })
        .match_events(&p, &q);
        assert!(r.breakdown.name_phonetic_score.is_none());
    }

    #[test]
    fn phonetic_score_some_when_on() {
        let p = Event::builder().name("Stephen").build();
        let q = Event::builder().name("Steven").build();
        let r = MatchingEngine::new(MatchConfig {
            use_phonetic_matching: true,
            ..MatchConfig::default()
        })
        .match_events(&p, &q);
        assert!(r.breakdown.name_phonetic_score.is_some());
    }
}