Skip to main content

thing_matcher/
matcher.rs

1//! Thing matcher engine: deterministic and probabilistic algorithms.
2//!
3//! This is the orchestration layer of the crate. It pulls together the data
4//! types from [`crate::models`], the text transformations from
5//! [`crate::normalizer`], and the similarity primitives from
6//! [`crate::scorer`] to produce a single answer about whether two `Thing`
7//! records refer to the same item.
8//!
9//! ## Two strategies, one engine
10//!
11//! - [`MatchingEngine::deterministic_match`] — fast, binary. Returns `true`
12//!   iff the two things share any `(property_id, value)` identifier, any
13//!   `sameAs` URL, or the same canonical `url`.
14//! - [`MatchingEngine::match_things`] — weighted probabilistic scoring,
15//!   returning a [`MatchResult`] with per-field [`MatchBreakdown`].
16//!
17//! ## Example
18//!
19//! ```
20//! use thing_matcher::{MatchingEngine, Thing};
21//!
22//! let a = Thing::builder()
23//!     .name("Eiffel Tower")
24//!     .url("https://www.toureiffel.paris/")
25//!     .build();
26//!
27//! let b = Thing::builder()
28//!     .name("La Tour Eiffel")
29//!     .add_alternate_name("Eiffel Tower")
30//!     .url("https://www.toureiffel.paris/")
31//!     .build();
32//!
33//! let engine = MatchingEngine::default_config();
34//! let result = engine.match_things(&a, &b);
35//! assert!(result.is_match);
36//! ```
37
38use crate::models::Thing;
39use crate::normalizer::Normalizer;
40use crate::scorer::{Scorer, SimilarityAlgorithm};
41use serde::{Deserialize, Serialize};
42
43/// Tunable configuration for the matching engine.
44///
45/// All weights are dimensionless and contribute to a renormalised weighted
46/// sum — they do not need to add to `1.0`. The matching pipeline divides
47/// the weighted sum by the sum of *participating* weights so that missing
48/// fields neither contribute nor penalise. The score is then compared
49/// against [`MatchConfig::match_threshold`] to produce the `is_match`
50/// boolean.
51///
52/// Two presets cover most needs:
53///
54/// - [`MatchConfig::strict`]  — `match_threshold = 0.95`, `strict_mode = true`.
55/// - [`MatchConfig::lenient`] — `match_threshold = 0.65`, phonetic on.
56///
57/// # Example
58///
59/// ```
60/// use thing_matcher::{MatchConfig, SimilarityAlgorithm};
61///
62/// let custom = MatchConfig {
63///     match_threshold: 0.80,
64///     name_weight: 0.30,
65///     description_weight: 0.10,
66///     disambiguating_description_weight: 0.05,
67///     identifiers_weight: 0.25,
68///     url_weight: 0.05,
69///     same_as_weight: 0.15,
70///     image_weight: 0.03,
71///     main_entity_of_page_weight: 0.02,
72///     additional_types_weight: 0.05,
73///     use_phonetic_matching: true,
74///     name_algorithm: SimilarityAlgorithm::Combined,
75///     strict_mode: false,
76/// };
77/// assert_eq!(custom.match_threshold, 0.80);
78/// ```
79#[derive(Debug, Clone, Serialize, Deserialize)]
80#[serde(default)]
81pub struct MatchConfig {
82    /// Threshold score for considering two things a match (`0.0..=1.0`).
83    pub match_threshold: f64,
84
85    /// Weight for name similarity (best-of cartesian product across the
86    /// primary `name` and `alternate_names` on both sides).
87    pub name_weight: f64,
88
89    /// Weight for free-form `description` similarity.
90    pub description_weight: f64,
91
92    /// Weight for `disambiguatingDescription` similarity.
93    pub disambiguating_description_weight: f64,
94
95    /// Weight for "shared identifier" (1.0 if any `(property_id, value)`
96    /// pair is shared, 0.0 otherwise).
97    pub identifiers_weight: f64,
98
99    /// Weight for canonical `url` exact match (after URL normalisation).
100    pub url_weight: f64,
101
102    /// Weight for `sameAs` URL set similarity (Jaccard).
103    pub same_as_weight: f64,
104
105    /// Weight for `image` URL exact match (after URL normalisation).
106    pub image_weight: f64,
107
108    /// Weight for `mainEntityOfPage` URL exact match (after URL
109    /// normalisation).
110    pub main_entity_of_page_weight: f64,
111
112    /// Weight for `additionalType` URI set similarity (Jaccard).
113    pub additional_types_weight: f64,
114
115    /// Whether to add a phonetic-name bonus when both names sound alike.
116    pub use_phonetic_matching: bool,
117
118    /// Similarity algorithm to use when comparing names.
119    pub name_algorithm: SimilarityAlgorithm,
120
121    /// Reserved flag for stricter deterministic enforcement.
122    pub strict_mode: bool,
123}
124
125impl Default for MatchConfig {
126    /// Production-ready defaults.
127    ///
128    /// ```
129    /// use thing_matcher::{MatchConfig, SimilarityAlgorithm};
130    /// let c = MatchConfig::default();
131    /// assert!((c.match_threshold - 0.80).abs() < 1e-9);
132    /// assert!(matches!(c.name_algorithm, SimilarityAlgorithm::Combined));
133    /// ```
134    fn default() -> Self {
135        Self {
136            match_threshold: 0.80,
137            name_weight: 0.30,
138            description_weight: 0.10,
139            disambiguating_description_weight: 0.05,
140            identifiers_weight: 0.25,
141            url_weight: 0.05,
142            same_as_weight: 0.15,
143            image_weight: 0.03,
144            main_entity_of_page_weight: 0.02,
145            additional_types_weight: 0.05,
146            use_phonetic_matching: false,
147            name_algorithm: SimilarityAlgorithm::Combined,
148            strict_mode: false,
149        }
150    }
151}
152
153impl MatchConfig {
154    /// A stricter preset: `match_threshold = 0.95`, `strict_mode = true`.
155    ///
156    /// Use when callers must rely on the answer and false positives are
157    /// more dangerous than false negatives.
158    ///
159    /// ```
160    /// use thing_matcher::MatchConfig;
161    /// let c = MatchConfig::strict();
162    /// assert!((c.match_threshold - 0.95).abs() < 1e-9);
163    /// assert!(c.strict_mode);
164    /// ```
165    pub fn strict() -> Self {
166        Self {
167            match_threshold: 0.95,
168            strict_mode: true,
169            ..Default::default()
170        }
171    }
172
173    /// A more forgiving preset: `match_threshold = 0.65`, phonetic matching on.
174    ///
175    /// Use when triaging large candidate sets where false negatives are
176    /// worse than false positives.
177    ///
178    /// ```
179    /// use thing_matcher::MatchConfig;
180    /// let c = MatchConfig::lenient();
181    /// assert!((c.match_threshold - 0.65).abs() < 1e-9);
182    /// assert!(c.use_phonetic_matching);
183    /// ```
184    pub fn lenient() -> Self {
185        Self {
186            match_threshold: 0.65,
187            use_phonetic_matching: true,
188            ..Default::default()
189        }
190    }
191}
192
193/// Qualitative confidence band derived from the probabilistic
194/// [`MatchResult::score`].
195///
196/// The bands are fixed across all `MatchConfig` presets — they do **not**
197/// follow `match_threshold`. They are intended for triage UIs and audit
198/// logs where a coarse High/Medium/Low summary is more useful than the
199/// raw float. The `is_match` boolean remains the authoritative go/no-go
200/// signal because it incorporates the configured threshold.
201///
202/// Boundaries:
203///
204/// | Score range | Band |
205/// |---|---|
206/// | `score >= 0.90` | `High` |
207/// | `0.75 <= score < 0.90` | `Medium` |
208/// | `score < 0.75` | `Low` |
209///
210/// # Examples
211///
212/// ```
213/// use thing_matcher::Confidence;
214///
215/// assert_eq!(Confidence::from_score(0.99), Confidence::High);
216/// assert_eq!(Confidence::from_score(0.90), Confidence::High);   // inclusive
217/// assert_eq!(Confidence::from_score(0.85), Confidence::Medium);
218/// assert_eq!(Confidence::from_score(0.75), Confidence::Medium); // inclusive
219/// assert_eq!(Confidence::from_score(0.50), Confidence::Low);
220/// assert_eq!(Confidence::from_score(0.00), Confidence::Low);
221/// ```
222#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
223pub enum Confidence {
224    /// Score is at or above `0.90`. Strong match; safe to act on with
225    /// minimal review.
226    High,
227    /// Score is in `0.75..0.90`. Medium-confidence match; the per-field
228    /// [`MatchBreakdown`] should be inspected before downstream use.
229    Medium,
230    /// Score is below `0.75`. Treat as a candidate at best; require
231    /// additional evidence before treating as the same thing.
232    Low,
233}
234
235impl Confidence {
236    /// Bucket a probabilistic score into one of the three bands.
237    ///
238    /// The function is total over `f64`: NaN inputs degrade to `Low`,
239    /// negative scores degrade to `Low`, scores above `1.0` are treated
240    /// as `High`. In practice the matcher only ever produces values in
241    /// `[0.0, 1.0]`, so callers shouldn't encounter the degenerate
242    /// inputs.
243    ///
244    /// ```
245    /// use thing_matcher::Confidence;
246    ///
247    /// assert_eq!(Confidence::from_score(f64::NAN), Confidence::Low);
248    /// assert_eq!(Confidence::from_score(-0.5),     Confidence::Low);
249    /// assert_eq!(Confidence::from_score(2.0),      Confidence::High);
250    /// ```
251    pub fn from_score(score: f64) -> Self {
252        if score >= 0.90 {
253            Confidence::High
254        } else if score >= 0.75 {
255            Confidence::Medium
256        } else {
257            Confidence::Low
258        }
259    }
260}
261
262/// Outcome of a probabilistic thing match.
263///
264/// Contains the overall renormalised `score`, the threshold-derived
265/// `is_match` boolean, a coarse [`Confidence`] band, and a per-field
266/// [`MatchBreakdown`] for audit.
267///
268/// `MatchResult` implements `Serialize + Deserialize` so it can be persisted
269/// or returned over an API.
270///
271/// ```
272/// use thing_matcher::{Confidence, MatchingEngine, Thing};
273///
274/// let t = Thing::builder().name("Eiffel Tower").build();
275/// let q = t.clone();
276/// let result = MatchingEngine::default_config().match_things(&t, &q);
277/// assert_eq!(result.confidence, Confidence::High);
278///
279/// // Round-trip through JSON.
280/// let json = serde_json::to_string(&result).unwrap();
281/// let back: thing_matcher::MatchResult = serde_json::from_str(&json).unwrap();
282/// assert!((result.score - back.score).abs() < 1e-12);
283/// assert_eq!(result.is_match, back.is_match);
284/// assert_eq!(result.confidence, back.confidence);
285/// ```
286#[derive(Debug, Clone, Serialize, Deserialize)]
287pub struct MatchResult {
288    /// Overall match score in `[0.0, 1.0]`.
289    pub score: f64,
290
291    /// `true` if `score >= MatchConfig::match_threshold`.
292    pub is_match: bool,
293
294    /// Coarse confidence band derived from `score`. Defaults to
295    /// [`Confidence::Low`] on legacy JSON payloads that pre-date the field.
296    #[serde(default = "default_confidence")]
297    pub confidence: Confidence,
298
299    /// Per-field score contributions for explainability.
300    pub breakdown: MatchBreakdown,
301}
302
303/// Backstop for legacy `MatchResult` JSON payloads that lack the
304/// `confidence` field. Returns `Confidence::Low` so a deserialised
305/// payload that pre-dates the field is unambiguously flagged as
306/// "needs re-scoring".
307fn default_confidence() -> Confidence {
308    Confidence::Low
309}
310
311/// Per-field score breakdown returned with every [`MatchResult`].
312///
313/// Each field is `Option<f64>`:
314///
315/// - `Some(score)` — the field was scored; the value is in `[0.0, 1.0]`.
316/// - `None` — the field was missing on at least one side and so did not
317///   participate in the weighted sum.
318///
319/// The breakdown exists so an auditor can see *why* a match was
320/// flagged. Do not throw it away in downstream services.
321#[derive(Debug, Clone, Serialize, Deserialize)]
322pub struct MatchBreakdown {
323    /// Best-of-cartesian-product similarity across (primary name +
324    /// alternate names) on both sides, using the configured algorithm.
325    pub name_score: Option<f64>,
326    /// Maximum Soundex match across the same name pairs. `None` when
327    /// `use_phonetic_matching` is false or either side has no names.
328    pub name_phonetic_score: Option<f64>,
329    /// `Combined` similarity over `description`, after `normalize_text`.
330    /// `None` if either side is absent.
331    pub description_score: Option<f64>,
332    /// `Combined` similarity over `disambiguatingDescription`, after
333    /// `normalize_text`. `None` if either side is absent.
334    pub disambiguating_description_score: Option<f64>,
335    /// `1.0` if both `identifiers` non-empty and they share any
336    /// `(property_id, value)` pair; `0.0` if both non-empty but none
337    /// shared; `None` if either side is empty.
338    pub identifiers_score: Option<f64>,
339    /// `1.0` if both `url`s normalise to the same string; `0.0`
340    /// otherwise; `None` if either side is absent.
341    pub url_score: Option<f64>,
342    /// Jaccard set similarity over the union of `sameAs` URLs after
343    /// `normalize_url`. `None` if both sides are empty.
344    pub same_as_score: Option<f64>,
345    /// `1.0` if both `image`s normalise to the same string; `0.0`
346    /// otherwise; `None` if either side is absent.
347    pub image_score: Option<f64>,
348    /// `1.0` if both `mainEntityOfPage`s normalise to the same string;
349    /// `0.0` otherwise; `None` if either side is absent.
350    pub main_entity_of_page_score: Option<f64>,
351    /// Jaccard set similarity over the union of `additionalType` URIs
352    /// after `normalize_url`. `None` if both sides are empty.
353    pub additional_types_score: Option<f64>,
354}
355
356/// Thing matcher engine.
357///
358/// The engine is **immutable after construction** and cheap to clone (it
359/// owns only a [`MatchConfig`]). Construct one and call its methods from
360/// any thread.
361///
362/// ```
363/// use thing_matcher::{MatchConfig, MatchingEngine};
364///
365/// let engine_a = MatchingEngine::default_config();
366/// let engine_b = MatchingEngine::new(MatchConfig::strict());
367/// # let _ = (engine_a, engine_b);
368/// ```
369pub struct MatchingEngine {
370    config: MatchConfig,
371}
372
373impl MatchingEngine {
374    /// Construct an engine with the given configuration.
375    ///
376    /// ```
377    /// use thing_matcher::{MatchConfig, MatchingEngine};
378    /// let engine = MatchingEngine::new(MatchConfig::lenient());
379    /// # let _ = engine;
380    /// ```
381    pub fn new(config: MatchConfig) -> Self {
382        Self { config }
383    }
384
385    /// Construct an engine with [`MatchConfig::default`].
386    ///
387    /// ```
388    /// use thing_matcher::MatchingEngine;
389    /// let engine = MatchingEngine::default_config();
390    /// # let _ = engine;
391    /// ```
392    pub fn default_config() -> Self {
393        Self::new(MatchConfig::default())
394    }
395
396    /// Compare two things probabilistically and return a [`MatchResult`].
397    ///
398    /// The score is the weight-renormalised sum of every component that
399    /// scored on both records. Missing fields are skipped, not penalised.
400    ///
401    /// ```
402    /// use thing_matcher::{MatchingEngine, Thing};
403    ///
404    /// let t = Thing::builder()
405    ///     .name("Eiffel Tower")
406    ///     .url("https://www.toureiffel.paris/")
407    ///     .build();
408    ///
409    /// let result = MatchingEngine::default_config().match_things(&t, &t);
410    /// assert!(result.is_match);
411    /// assert!(result.score > 0.99);
412    /// ```
413    pub fn match_things(&self, thing1: &Thing, thing2: &Thing) -> MatchResult {
414        let breakdown = self.calculate_breakdown(thing1, thing2);
415        let score = self.calculate_weighted_score(&breakdown);
416        let above_threshold = score >= self.config.match_threshold;
417        // Under strict mode, `is_match` ALSO requires a deterministic match.
418        let is_match = if self.config.strict_mode {
419            above_threshold && self.deterministic_match(thing1, thing2)
420        } else {
421            above_threshold
422        };
423        let confidence = Confidence::from_score(score);
424
425        MatchResult {
426            score,
427            is_match,
428            confidence,
429            breakdown,
430        }
431    }
432
433    /// Score a single query against many candidates. Returns one
434    /// [`MatchResult`] per candidate, in the same order as the input slice.
435    ///
436    /// The engine is immutable and `Send + Sync`, so call-sites that want
437    /// parallel evaluation can wrap the call in `rayon::par_iter` or similar
438    /// without further changes to this crate.
439    ///
440    /// # Examples
441    ///
442    /// ```
443    /// use thing_matcher::{MatchingEngine, Thing};
444    ///
445    /// let query = Thing::builder().name("Eiffel Tower").build();
446    /// let candidates = vec![
447    ///     Thing::builder().name("Eiffel Tower").build(),
448    ///     Thing::builder().name("Big Ben").build(),
449    /// ];
450    ///
451    /// let engine = MatchingEngine::default_config();
452    /// let results = engine.match_one_to_many(&query, &candidates);
453    /// assert_eq!(results.len(), 2);
454    /// assert!(results[0].is_match);
455    /// assert!(!results[1].is_match);
456    /// ```
457    ///
458    /// Empty candidates yield an empty result:
459    ///
460    /// ```
461    /// # use thing_matcher::{MatchingEngine, Thing};
462    /// let q = Thing::builder().name("Solo").build();
463    /// let r = MatchingEngine::default_config().match_one_to_many(&q, &[]);
464    /// assert!(r.is_empty());
465    /// ```
466    pub fn match_one_to_many(&self, query: &Thing, candidates: &[Thing]) -> Vec<MatchResult> {
467        candidates
468            .iter()
469            .map(|c| self.match_things(query, c))
470            .collect()
471    }
472
473    /// Score and rank: return `(original_index, MatchResult)` tuples
474    /// sorted by descending score. Ties are broken by ascending original
475    /// index, so the result is deterministic.
476    ///
477    /// # Examples
478    ///
479    /// ```
480    /// use thing_matcher::{MatchingEngine, Thing};
481    ///
482    /// let query = Thing::builder().name("Eiffel Tower").build();
483    /// let candidates = vec![
484    ///     Thing::builder().name("Big Ben").build(),                 // index 0
485    ///     Thing::builder().name("Eiffel Tower").build(),            // index 1 — best match
486    ///     Thing::builder().name("Statue of Liberty").build(),       // index 2
487    /// ];
488    ///
489    /// let ranked = MatchingEngine::default_config().rank_one_to_many(&query, &candidates);
490    /// assert_eq!(ranked.len(), 3);
491    /// assert_eq!(ranked[0].0, 1);
492    /// assert!(ranked[0].1.score >= ranked[1].1.score);
493    /// assert!(ranked[1].1.score >= ranked[2].1.score);
494    /// ```
495    pub fn rank_one_to_many(
496        &self,
497        query: &Thing,
498        candidates: &[Thing],
499    ) -> Vec<(usize, MatchResult)> {
500        let mut indexed: Vec<(usize, MatchResult)> = self
501            .match_one_to_many(query, candidates)
502            .into_iter()
503            .enumerate()
504            .collect();
505        indexed.sort_by(|a, b| {
506            b.1.score
507                .partial_cmp(&a.1.score)
508                .unwrap_or(std::cmp::Ordering::Equal)
509                .then_with(|| a.0.cmp(&b.0))
510        });
511        indexed
512    }
513
514    /// Compare two things deterministically and return a single boolean.
515    ///
516    /// Returns `true` iff any of the following hold:
517    ///
518    /// - the things share any `(property_id, value)` pair in their
519    ///   `identifiers` lists;
520    /// - the things share any `sameAs` URL after URL normalisation;
521    /// - both have a `url` that normalises to the same string.
522    ///
523    /// ```
524    /// use thing_matcher::{Identifier, MatchingEngine, Thing};
525    ///
526    /// let id = Identifier::new("wikidata", "Q243").unwrap();
527    /// let a = Thing::builder().name("Eiffel Tower").add_identifier(id.clone()).build();
528    /// let b = Thing::builder().name("Tour Eiffel").add_identifier(id).build();
529    /// assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
530    /// ```
531    pub fn deterministic_match(&self, thing1: &Thing, thing2: &Thing) -> bool {
532        if shares_identifier(thing1, thing2) {
533            return true;
534        }
535        if shares_same_as(thing1, thing2) {
536            return true;
537        }
538        same_canonical_url(thing1, thing2)
539    }
540
541    fn calculate_breakdown(&self, thing1: &Thing, thing2: &Thing) -> MatchBreakdown {
542        MatchBreakdown {
543            name_score: self.score_name(thing1, thing2),
544            name_phonetic_score: if self.config.use_phonetic_matching {
545                self.score_phonetic_names(thing1, thing2)
546            } else {
547                None
548            },
549            description_score: score_text(&thing1.description, &thing2.description),
550            disambiguating_description_score: score_text(
551                &thing1.disambiguating_description,
552                &thing2.disambiguating_description,
553            ),
554            identifiers_score: score_identifiers(thing1, thing2),
555            url_score: score_url(&thing1.url, &thing2.url),
556            same_as_score: score_url_set(&thing1.same_as, &thing2.same_as),
557            image_score: score_url(&thing1.image, &thing2.image),
558            main_entity_of_page_score: score_url(
559                &thing1.main_entity_of_page,
560                &thing2.main_entity_of_page,
561            ),
562            additional_types_score: score_url_set(
563                &thing1.additional_types,
564                &thing2.additional_types,
565            ),
566        }
567    }
568
569    fn calculate_weighted_score(&self, breakdown: &MatchBreakdown) -> f64 {
570        let mut total_weight = 0.0;
571        let mut weighted_sum = 0.0;
572
573        let mut add = |score: Option<f64>, weight: f64| {
574            if let Some(s) = score {
575                weighted_sum += s * weight;
576                total_weight += weight;
577            }
578        };
579
580        add(breakdown.name_score, self.config.name_weight);
581        add(breakdown.description_score, self.config.description_weight);
582        add(
583            breakdown.disambiguating_description_score,
584            self.config.disambiguating_description_weight,
585        );
586        add(breakdown.identifiers_score, self.config.identifiers_weight);
587        add(breakdown.url_score, self.config.url_weight);
588        add(breakdown.same_as_score, self.config.same_as_weight);
589        add(breakdown.image_score, self.config.image_weight);
590        add(
591            breakdown.main_entity_of_page_score,
592            self.config.main_entity_of_page_weight,
593        );
594        add(
595            breakdown.additional_types_score,
596            self.config.additional_types_weight,
597        );
598
599        // Phonetic match is a bonus only — never lowers the score.
600        if let Some(score) = breakdown.name_phonetic_score
601            && score > 0.9
602        {
603            weighted_sum += score * 0.05;
604            total_weight += 0.05;
605        }
606
607        if total_weight > 0.0 {
608            weighted_sum / total_weight
609        } else {
610            0.0
611        }
612    }
613
614    fn score_name(&self, thing1: &Thing, thing2: &Thing) -> Option<f64> {
615        let names1 = collect_names(thing1);
616        let names2 = collect_names(thing2);
617        if names1.is_empty() || names2.is_empty() {
618            return None;
619        }
620        let mut best = f64::NEG_INFINITY;
621        for n1 in &names1 {
622            for n2 in &names2 {
623                let s = self.score_name_pair(n1, n2);
624                if s > best {
625                    best = s;
626                }
627            }
628        }
629        Some(best)
630    }
631
632    fn score_name_pair(&self, name1: &str, name2: &str) -> f64 {
633        let norm1 = Normalizer::normalize_name(name1);
634        let norm2 = Normalizer::normalize_name(name2);
635        match self.config.name_algorithm {
636            SimilarityAlgorithm::JaroWinkler => Scorer::jaro_winkler_similarity(&norm1, &norm2),
637            SimilarityAlgorithm::Levenshtein => Scorer::levenshtein_similarity(&norm1, &norm2),
638            SimilarityAlgorithm::Exact => Scorer::exact_match(&norm1, &norm2),
639            SimilarityAlgorithm::Combined => Scorer::combined_similarity(&norm1, &norm2),
640        }
641    }
642
643    fn score_phonetic_names(&self, thing1: &Thing, thing2: &Thing) -> Option<f64> {
644        let names1 = collect_names(thing1);
645        let names2 = collect_names(thing2);
646        if names1.is_empty() || names2.is_empty() {
647            return None;
648        }
649        let codes1: Vec<String> = names1
650            .iter()
651            .map(|n| Normalizer::phonetic_code(n))
652            .collect();
653        let codes2: Vec<String> = names2
654            .iter()
655            .map(|n| Normalizer::phonetic_code(n))
656            .collect();
657        let mut best = 0.0_f64;
658        for c1 in &codes1 {
659            for c2 in &codes2 {
660                if !c1.is_empty() && c1 == c2 {
661                    best = 1.0;
662                }
663            }
664        }
665        Some(best)
666    }
667}
668
669// ---- Free helpers ------------------------------------------------------
670
671/// Collect a thing's primary name plus alternate names into a single vec
672/// of references. Empty / whitespace-only strings are skipped.
673fn collect_names(thing: &Thing) -> Vec<&String> {
674    thing
675        .name
676        .iter()
677        .chain(thing.alternate_names.iter())
678        .filter(|s| !s.trim().is_empty())
679        .collect()
680}
681
682/// `Combined` similarity over a pair of optional free-form text fields.
683/// Returns `None` if either side is absent.
684fn score_text(a: &Option<String>, b: &Option<String>) -> Option<f64> {
685    let a = a.as_ref()?;
686    let b = b.as_ref()?;
687    let na = Normalizer::normalize_text(a);
688    let nb = Normalizer::normalize_text(b);
689    Some(Scorer::combined_similarity(&na, &nb))
690}
691
692/// Exact match over a pair of optional URL fields, compared after URL
693/// normalisation. Returns `None` if either side is absent.
694fn score_url(a: &Option<String>, b: &Option<String>) -> Option<f64> {
695    let a = a.as_ref()?;
696    let b = b.as_ref()?;
697    let na = Normalizer::normalize_url(a);
698    let nb = Normalizer::normalize_url(b);
699    Some(Scorer::exact_match(&na, &nb))
700}
701
702/// Jaccard set similarity over two URL lists. Returns `None` only if both
703/// sides are empty; an empty-against-non-empty pair scores `0.0`.
704fn score_url_set(a: &[String], b: &[String]) -> Option<f64> {
705    if a.is_empty() && b.is_empty() {
706        return None;
707    }
708    let na: Vec<String> = a.iter().map(|s| Normalizer::normalize_url(s)).collect();
709    let nb: Vec<String> = b.iter().map(|s| Normalizer::normalize_url(s)).collect();
710    Some(Scorer::jaccard_set_similarity(&na, &nb))
711}
712
713/// `Some(1.0)` if any `(property_id, value)` pair is shared, `Some(0.0)`
714/// if both lists are non-empty but no pair is shared, `None` if either
715/// list is empty.
716fn score_identifiers(thing1: &Thing, thing2: &Thing) -> Option<f64> {
717    if thing1.identifiers.is_empty() || thing2.identifiers.is_empty() {
718        return None;
719    }
720    Some(if shares_identifier(thing1, thing2) {
721        1.0
722    } else {
723        0.0
724    })
725}
726
727fn shares_identifier(thing1: &Thing, thing2: &Thing) -> bool {
728    if thing1.identifiers.is_empty() || thing2.identifiers.is_empty() {
729        return false;
730    }
731    for id1 in &thing1.identifiers {
732        for id2 in &thing2.identifiers {
733            if id1 == id2 {
734                return true;
735            }
736        }
737    }
738    false
739}
740
741fn shares_same_as(thing1: &Thing, thing2: &Thing) -> bool {
742    if thing1.same_as.is_empty() || thing2.same_as.is_empty() {
743        return false;
744    }
745    let set1: std::collections::BTreeSet<String> = thing1
746        .same_as
747        .iter()
748        .map(|s| Normalizer::normalize_url(s))
749        .collect();
750    for s in &thing2.same_as {
751        if set1.contains(&Normalizer::normalize_url(s)) {
752            return true;
753        }
754    }
755    false
756}
757
758fn same_canonical_url(thing1: &Thing, thing2: &Thing) -> bool {
759    let (Some(u1), Some(u2)) = (thing1.url.as_ref(), thing2.url.as_ref()) else {
760        return false;
761    };
762    Normalizer::normalize_url(u1) == Normalizer::normalize_url(u2)
763}
764
765#[cfg(test)]
766mod tests {
767    use super::*;
768    use crate::models::Identifier;
769
770    // ---------- MatchConfig presets ----------
771
772    #[test]
773    fn config_default_values() {
774        let c = MatchConfig::default();
775        assert!((c.match_threshold - 0.80).abs() < 1e-9);
776        assert!(!c.strict_mode);
777    }
778
779    #[test]
780    fn config_strict_raises_threshold_and_sets_flag() {
781        let c = MatchConfig::strict();
782        assert!((c.match_threshold - 0.95).abs() < 1e-9);
783        assert!(c.strict_mode);
784    }
785
786    #[test]
787    fn config_lenient_lowers_threshold() {
788        let c = MatchConfig::lenient();
789        assert!((c.match_threshold - 0.65).abs() < 1e-9);
790        assert!(c.use_phonetic_matching);
791    }
792
793    // ---------- MatchConfig serde ----------
794
795    #[test]
796    fn config_default_round_trips_through_json() {
797        let cfg = MatchConfig::default();
798        let json = serde_json::to_string(&cfg).expect("serialise");
799        let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
800        assert!((cfg.match_threshold - back.match_threshold).abs() < 1e-12);
801        assert!((cfg.name_weight - back.name_weight).abs() < 1e-12);
802        assert!((cfg.identifiers_weight - back.identifiers_weight).abs() < 1e-12);
803        assert!(matches!(back.name_algorithm, SimilarityAlgorithm::Combined));
804        assert_eq!(cfg.strict_mode, back.strict_mode);
805    }
806
807    #[test]
808    fn config_partial_json_fills_missing_fields_from_default() {
809        let partial = r#"{"match_threshold": 0.80, "name_weight": 0.5}"#;
810        let cfg: MatchConfig = serde_json::from_str(partial).expect("partial json");
811        assert!((cfg.match_threshold - 0.80).abs() < 1e-12);
812        assert!((cfg.name_weight - 0.5).abs() < 1e-12);
813        assert!(matches!(cfg.name_algorithm, SimilarityAlgorithm::Combined));
814    }
815
816    // ---------- probabilistic match ----------
817
818    #[test]
819    fn exact_clone_is_a_match() {
820        let t = Thing::builder()
821            .name("Eiffel Tower")
822            .url("https://www.toureiffel.paris/")
823            .build();
824        let result = MatchingEngine::default_config().match_things(&t, &t.clone());
825        assert!(result.is_match);
826        assert!(result.score > 0.95);
827    }
828
/// When the second thing carries the first thing's exact name as an
/// alternate name, the name score must exceed 0.99 even though the primary
/// names differ.
#[test]
fn name_match_takes_best_of_cartesian_product() {
    let query = Thing::builder().name("Eiffel Tower").build();
    let candidate = Thing::builder()
        .name("La Tour Eiffel")
        .add_alternate_name("Eiffel Tower")
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&query, &candidate);

    // `s` keeps its short name because the assertion message captures it inline.
    let s = outcome.breakdown.name_score.expect("scored");
    assert!(
        s > 0.99,
        "best-of cartesian product should pick exact match: {s}"
    );
}
843
/// Two things with unrelated names must not be flagged as a match and must
/// score below 0.5.
#[test]
fn unrelated_things_do_not_match() {
    let tower = Thing::builder().name("Eiffel Tower").build();
    let opera_house = Thing::builder().name("Sydney Opera House").build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&tower, &opera_house);

    assert!(!outcome.is_match);
    assert!(outcome.score < 0.5);
}
852
/// One thing has only a description and the other only a `sameAs` link, so
/// no field is populated on both sides; the overall score must be exactly
/// zero.
#[test]
fn no_overlapping_fields_returns_zero_score() {
    let description_only = Thing::builder().description("foo").build();
    let same_as_only = Thing::builder()
        .add_same_as("https://example.org/x")
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&description_only, &same_as_only);

    assert_eq!(outcome.score, 0.0);
}
862
863    // ---------- description / disambiguating_description ----------
864
/// Identical descriptions on both things must produce a description score
/// above 0.99.
#[test]
fn description_identical_scores_one() {
    let left = Thing::builder()
        .name("X")
        .description("Iron tower in Paris.")
        .build();
    let right = Thing::builder()
        .name("X")
        .description("Iron tower in Paris.")
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&left, &right);

    let description_score = outcome.breakdown.description_score.unwrap();
    assert!(description_score > 0.99);
}
878
/// If only one of the two things has a description, the breakdown must
/// report no description score at all.
#[test]
fn description_score_none_when_either_missing() {
    let described = Thing::builder()
        .name("X")
        .description("Iron tower in Paris.")
        .build();
    let undescribed = Thing::builder().name("X").build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&described, &undescribed);

    assert!(outcome.breakdown.description_score.is_none());
}
889
890    // ---------- identifiers ----------
891
/// Two things carrying the same `(property, value)` identifier must get an
/// identifiers score of exactly `Some(1.0)`.
#[test]
fn identifiers_shared_scores_one() {
    let wikidata_id = Identifier::new("wikidata", "Q243").unwrap();
    let left = Thing::builder()
        .name("X")
        .add_identifier(wikidata_id.clone())
        .build();
    let right = Thing::builder()
        .name("X")
        .add_identifier(wikidata_id)
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&left, &right);

    assert_eq!(outcome.breakdown.identifiers_score, Some(1.0));
}
903
/// Identifiers with the same value but different property ids must not
/// count as shared: the identifiers score must be exactly `Some(0.0)`.
#[test]
fn identifiers_property_scoped_no_cross_match() {
    let google_side = Thing::builder()
        .name("X")
        .add_identifier(Identifier::new("google", "X").unwrap())
        .build();
    let wikidata_side = Thing::builder()
        .name("X")
        .add_identifier(Identifier::new("wikidata", "X").unwrap())
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&google_side, &wikidata_side);

    assert_eq!(outcome.breakdown.identifiers_score, Some(0.0));
}
917
/// If only one thing has identifiers, the breakdown must report no
/// identifiers score.
#[test]
fn identifiers_none_when_either_side_empty() {
    let without_ids = Thing::builder().name("X").build();
    let with_id = Thing::builder()
        .name("X")
        .add_identifier(Identifier::new("wikidata", "Q1").unwrap())
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&without_ids, &with_id);

    assert!(outcome.breakdown.identifiers_score.is_none());
}
928
929    // ---------- url ----------
930
/// URLs that differ only in letter case and a trailing slash must yield a
/// URL score of exactly `Some(1.0)`.
#[test]
fn url_normalised_equality_scores_one() {
    let shouty = Thing::builder()
        .name("X")
        .url("HTTPS://Example.ORG/")
        .build();
    let plain = Thing::builder()
        .name("X")
        .url("https://example.org")
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&shouty, &plain);

    assert_eq!(outcome.breakdown.url_score, Some(1.0));
}
944
/// Two things with different URLs must yield a URL score of exactly
/// `Some(0.0)`.
#[test]
fn url_mismatch_scores_zero() {
    let site_a = Thing::builder().name("X").url("https://a.org").build();
    let site_b = Thing::builder().name("X").url("https://b.org").build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&site_a, &site_b);

    assert_eq!(outcome.breakdown.url_score, Some(0.0));
}
952
/// If only one thing has a URL, the breakdown must report no URL score.
#[test]
fn url_none_when_either_side_missing() {
    let with_url = Thing::builder().name("X").url("https://a.org").build();
    let without_url = Thing::builder().name("X").build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&with_url, &without_url);

    assert!(outcome.breakdown.url_score.is_none());
}
960
961    // ---------- sameAs / additional_types ----------
962
/// `sameAs` sets `{a, b}` and `{b, c}` share one of three distinct links, so
/// the expected `sameAs` score is 1/3 (within 1e-9).
#[test]
fn same_as_jaccard_partial_overlap() {
    let left = Thing::builder()
        .name("X")
        .add_same_as("https://example.org/a")
        .add_same_as("https://example.org/b")
        .build();
    let right = Thing::builder()
        .name("X")
        .add_same_as("https://example.org/b")
        .add_same_as("https://example.org/c")
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&left, &right);

    // `s` keeps its short name because the assertion message captures it inline.
    let s = outcome.breakdown.same_as_score.expect("scored");
    // |{b}| / |{a, b, c}| = 1/3
    let expected = 1.0_f64 / 3.0;
    assert!((s - expected).abs() < 1e-9, "got {s}");
}
980
/// If neither thing has any `sameAs` link, the breakdown must report no
/// `sameAs` score.
#[test]
fn same_as_none_when_both_empty() {
    let left = Thing::builder().name("X").build();
    let right = Thing::builder().name("X").build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&left, &right);

    assert!(outcome.breakdown.same_as_score.is_none());
}
988
/// Two things with the same single additional type must yield an
/// additional-types score of exactly `Some(1.0)`.
#[test]
fn additional_types_jaccard_full_overlap() {
    let left = Thing::builder()
        .name("X")
        .add_additional_type("https://schema.org/Landmark")
        .build();
    let right = Thing::builder()
        .name("X")
        .add_additional_type("https://schema.org/Landmark")
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&left, &right);

    assert_eq!(outcome.breakdown.additional_types_score, Some(1.0));
}
1002
1003    // ---------- deterministic match ----------
1004
/// A shared identifier must make the deterministic match succeed even when
/// the names are completely different.
#[test]
fn deterministic_via_shared_identifier() {
    let shared_id = Identifier::new("wikidata", "Q243").unwrap();
    let left = Thing::builder()
        .name("Eiffel Tower")
        .add_identifier(shared_id.clone())
        .build();
    let right = Thing::builder()
        .name("Wholly Different")
        .add_identifier(shared_id)
        .build();

    let engine = MatchingEngine::default_config();
    let linked = engine.deterministic_match(&left, &right);

    assert!(linked);
}
1018
/// A shared `sameAs` URL must make the deterministic match succeed.
#[test]
fn deterministic_via_shared_same_as() {
    let left = Thing::builder()
        .name("Eiffel Tower")
        .add_same_as("https://www.wikidata.org/wiki/Q243")
        .build();
    let right = Thing::builder()
        .name("Tour Eiffel")
        .add_same_as("https://www.wikidata.org/wiki/Q243")
        .build();

    let engine = MatchingEngine::default_config();
    let linked = engine.deterministic_match(&left, &right);

    assert!(linked);
}
1031
/// Two things whose URLs differ only by a trailing slash must match
/// deterministically, even though their names differ.
#[test]
fn deterministic_via_shared_url() {
    let with_slash = Thing::builder()
        .name("X")
        .url("https://example.org/")
        .build();
    let without_slash = Thing::builder()
        .name("Y")
        .url("https://example.org")
        .build();

    let engine = MatchingEngine::default_config();
    let linked = engine.deterministic_match(&with_slash, &without_slash);

    assert!(linked);
}
1044
/// Identical names alone are not enough: with no identifier, `sameAs` link
/// or URL on either side, the deterministic match must fail.
#[test]
fn deterministic_rejects_when_no_shared_identity_signal() {
    let left = Thing::builder().name("X").build();
    let right = Thing::builder().name("X").build();

    let engine = MatchingEngine::default_config();
    let linked = engine.deterministic_match(&left, &right);

    assert!(!linked);
}
1051
1052    // ---------- strict_mode enforcement ----------
1053
/// With `strict_mode` on and the threshold lowered to 0.40, two
/// near-identical names are expected to clear the threshold, yet `is_match`
/// must stay `false` because the pair is not a deterministic match.
#[test]
fn strict_mode_requires_deterministic_for_is_match() {
    let left = Thing::builder().name("Cafe Centrale").build();
    let right = Thing::builder().name("Cafe Central").build();

    let strict_cfg = MatchConfig {
        match_threshold: 0.40,
        strict_mode: true,
        ..MatchConfig::default()
    };
    let engine = MatchingEngine::new(strict_cfg);
    let outcome = engine.match_things(&left, &right);

    assert!(outcome.score >= 0.40, "should clear threshold");
    // Neither thing carries an identifier, sameAs link or url, so the pair
    // is not a deterministic match.
    let linked = engine.deterministic_match(&left, &right);
    assert!(!linked);
    assert!(!outcome.is_match);
}
1070
1071    // ---------- batch APIs ----------
1072
/// Matching a query against an empty candidate slice must return an empty
/// result collection.
#[test]
fn match_one_to_many_empty_candidates_yields_empty_vec() {
    let lone_query = Thing::builder().name("Solo").build();
    let engine = MatchingEngine::default_config();

    let results = engine.match_one_to_many(&lone_query, &[]);

    assert!(results.is_empty());
}
1079
/// Ranking must put the query's own clone (candidate index 1) first and
/// order every adjacent pair of results by non-increasing score.
#[test]
fn rank_one_to_many_sorts_by_score_descending() {
    let engine = MatchingEngine::default_config();
    let query = Thing::builder().name("Eiffel Tower").build();
    // The exact clone sits at index 1, between the two other candidates.
    let pool = vec![
        Thing::builder().name("Big Ben").build(),
        query.clone(),
        Thing::builder().name("Statue of Liberty").build(),
    ];

    let ranking = engine.rank_one_to_many(&query, &pool);

    assert_eq!(ranking[0].0, 1);
    let is_descending = ranking
        .windows(2)
        .all(|pair| pair[0].1.score >= pair[1].1.score);
    assert!(is_descending);
}
1095
1096    // ---------- Confidence ----------
1097
/// Checks the confidence band edges: 0.90 maps to `High`, 0.89 and 0.75 to
/// `Medium`, and 0.74 to `Low`.
#[test]
fn confidence_band_boundaries_are_inclusive_on_the_low_side() {
    // (score, expected band), checked in the listed order.
    let expectations = [
        (0.90, Confidence::High),
        (0.89, Confidence::Medium),
        (0.75, Confidence::Medium),
        (0.74, Confidence::Low),
    ];
    for (score, band) in expectations {
        assert_eq!(Confidence::from_score(score), band);
    }
}
1105
1106    // ---------- phonetic ----------
1107
/// With `use_phonetic_matching` disabled, the breakdown must not contain a
/// phonetic name score.
#[test]
fn phonetic_score_none_when_off() {
    let stephen = Thing::builder().name("Stephen").build();
    let steven = Thing::builder().name("Steven").build();

    let phonetic_off = MatchConfig {
        use_phonetic_matching: false,
        ..MatchConfig::default()
    };
    let engine = MatchingEngine::new(phonetic_off);
    let outcome = engine.match_things(&stephen, &steven);

    assert!(outcome.breakdown.name_phonetic_score.is_none());
}
1119
/// With `use_phonetic_matching` enabled, the breakdown must contain a
/// phonetic name score.
#[test]
fn phonetic_score_some_when_on() {
    let stephen = Thing::builder().name("Stephen").build();
    let steven = Thing::builder().name("Steven").build();

    let phonetic_on = MatchConfig {
        use_phonetic_matching: true,
        ..MatchConfig::default()
    };
    let engine = MatchingEngine::new(phonetic_on);
    let outcome = engine.match_things(&stephen, &steven);

    assert!(outcome.breakdown.name_phonetic_score.is_some());
}
1131}