thing_matcher/matcher.rs
1//! Thing matcher engine: deterministic and probabilistic algorithms.
2//!
3//! This is the orchestration layer of the crate. It pulls together the data
4//! types from [`crate::models`], the text transformations from
5//! [`crate::normalizer`], and the similarity primitives from
6//! [`crate::scorer`] to produce a single answer about whether two `Thing`
7//! records refer to the same item.
8//!
9//! ## Two strategies, one engine
10//!
11//! - [`MatchingEngine::deterministic_match`] — fast, binary. Returns `true`
12//! iff the two things share any `(property_id, value)` identifier, any
13//! `sameAs` URL, or the same canonical `url`.
14//! - [`MatchingEngine::match_things`] — weighted probabilistic scoring,
15//! returning a [`MatchResult`] with per-field [`MatchBreakdown`].
16//!
17//! ## Example
18//!
19//! ```
20//! use thing_matcher::{MatchingEngine, Thing};
21//!
22//! let a = Thing::builder()
23//! .name("Eiffel Tower")
24//! .url("https://www.toureiffel.paris/")
25//! .build();
26//!
27//! let b = Thing::builder()
28//! .name("La Tour Eiffel")
29//! .add_alternate_name("Eiffel Tower")
30//! .url("https://www.toureiffel.paris/")
31//! .build();
32//!
33//! let engine = MatchingEngine::default_config();
34//! let result = engine.match_things(&a, &b);
35//! assert!(result.is_match);
36//! ```
37
38use crate::models::Thing;
39use crate::normalizer::Normalizer;
40use crate::scorer::{Scorer, SimilarityAlgorithm};
41use serde::{Deserialize, Serialize};
42
/// Tunable configuration for the matching engine.
///
/// All weights are dimensionless and contribute to a renormalised weighted
/// sum — they do not need to add to `1.0`. The matching pipeline divides
/// the weighted sum by the sum of *participating* weights so that missing
/// fields neither contribute nor penalise. The score is then compared
/// against [`MatchConfig::match_threshold`] to produce the `is_match`
/// boolean; when [`MatchConfig::strict_mode`] is set, `is_match`
/// additionally requires [`MatchingEngine::deterministic_match`] to hold.
///
/// The struct is annotated with `#[serde(default)]`, so keys that are absent
/// from a serialised configuration are filled in from
/// [`MatchConfig::default`].
///
/// Two presets cover most needs:
///
/// - [`MatchConfig::strict`] — `match_threshold = 0.95`, `strict_mode = true`.
/// - [`MatchConfig::lenient`] — `match_threshold = 0.65`, phonetic on.
///
/// # Example
///
/// ```
/// use thing_matcher::{MatchConfig, SimilarityAlgorithm};
///
/// let custom = MatchConfig {
///     match_threshold: 0.80,
///     name_weight: 0.30,
///     description_weight: 0.10,
///     disambiguating_description_weight: 0.05,
///     identifiers_weight: 0.25,
///     url_weight: 0.05,
///     same_as_weight: 0.15,
///     image_weight: 0.03,
///     main_entity_of_page_weight: 0.02,
///     additional_types_weight: 0.05,
///     use_phonetic_matching: true,
///     name_algorithm: SimilarityAlgorithm::Combined,
///     strict_mode: false,
/// };
/// assert_eq!(custom.match_threshold, 0.80);
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct MatchConfig {
    /// Threshold score for considering two things a match (`0.0..=1.0`).
    pub match_threshold: f64,

    /// Weight for name similarity (best-of cartesian product across the
    /// primary `name` and `alternate_names` on both sides).
    pub name_weight: f64,

    /// Weight for free-form `description` similarity.
    pub description_weight: f64,

    /// Weight for `disambiguatingDescription` similarity.
    pub disambiguating_description_weight: f64,

    /// Weight for "shared identifier" (1.0 if any `(property_id, value)`
    /// pair is shared, 0.0 otherwise).
    pub identifiers_weight: f64,

    /// Weight for canonical `url` exact match (after URL normalisation).
    pub url_weight: f64,

    /// Weight for `sameAs` URL set similarity (Jaccard). This component
    /// participates as soon as *either* side lists a `sameAs` URL; a pair
    /// where only one side has any scores `0.0`.
    pub same_as_weight: f64,

    /// Weight for `image` URL exact match (after URL normalisation).
    pub image_weight: f64,

    /// Weight for `mainEntityOfPage` URL exact match (after URL
    /// normalisation).
    pub main_entity_of_page_weight: f64,

    /// Weight for `additionalType` URI set similarity (Jaccard). This
    /// component participates as soon as *either* side lists an additional
    /// type; a pair where only one side has any scores `0.0`.
    pub additional_types_weight: f64,

    /// Whether to add a phonetic-name bonus when both names sound alike.
    /// The bonus enters the weighted sum with a fixed weight of `0.05`, and
    /// only when the phonetic score exceeds `0.9`.
    pub use_phonetic_matching: bool,

    /// Similarity algorithm to use when comparing names.
    pub name_algorithm: SimilarityAlgorithm,

    /// Stricter enforcement: when `true`, [`MatchingEngine::match_things`]
    /// sets `is_match` only if the score reaches `match_threshold` **and**
    /// [`MatchingEngine::deterministic_match`] returns `true` for the pair.
    /// The numeric score itself is unaffected.
    pub strict_mode: bool,
}
124
125impl Default for MatchConfig {
126 /// Production-ready defaults.
127 ///
128 /// ```
129 /// use thing_matcher::{MatchConfig, SimilarityAlgorithm};
130 /// let c = MatchConfig::default();
131 /// assert!((c.match_threshold - 0.80).abs() < 1e-9);
132 /// assert!(matches!(c.name_algorithm, SimilarityAlgorithm::Combined));
133 /// ```
134 fn default() -> Self {
135 Self {
136 match_threshold: 0.80,
137 name_weight: 0.30,
138 description_weight: 0.10,
139 disambiguating_description_weight: 0.05,
140 identifiers_weight: 0.25,
141 url_weight: 0.05,
142 same_as_weight: 0.15,
143 image_weight: 0.03,
144 main_entity_of_page_weight: 0.02,
145 additional_types_weight: 0.05,
146 use_phonetic_matching: false,
147 name_algorithm: SimilarityAlgorithm::Combined,
148 strict_mode: false,
149 }
150 }
151}
152
153impl MatchConfig {
154 /// A stricter preset: `match_threshold = 0.95`, `strict_mode = true`.
155 ///
156 /// Use when callers must rely on the answer and false positives are
157 /// more dangerous than false negatives.
158 ///
159 /// ```
160 /// use thing_matcher::MatchConfig;
161 /// let c = MatchConfig::strict();
162 /// assert!((c.match_threshold - 0.95).abs() < 1e-9);
163 /// assert!(c.strict_mode);
164 /// ```
165 pub fn strict() -> Self {
166 Self {
167 match_threshold: 0.95,
168 strict_mode: true,
169 ..Default::default()
170 }
171 }
172
173 /// A more forgiving preset: `match_threshold = 0.65`, phonetic matching on.
174 ///
175 /// Use when triaging large candidate sets where false negatives are
176 /// worse than false positives.
177 ///
178 /// ```
179 /// use thing_matcher::MatchConfig;
180 /// let c = MatchConfig::lenient();
181 /// assert!((c.match_threshold - 0.65).abs() < 1e-9);
182 /// assert!(c.use_phonetic_matching);
183 /// ```
184 pub fn lenient() -> Self {
185 Self {
186 match_threshold: 0.65,
187 use_phonetic_matching: true,
188 ..Default::default()
189 }
190 }
191}
192
/// Qualitative confidence band derived from the probabilistic
/// [`MatchResult::score`].
///
/// The bands are fixed across all `MatchConfig` presets — they do **not**
/// follow `match_threshold`. They are intended for triage UIs and audit
/// logs where a coarse High/Medium/Low summary is more useful than the
/// raw float. The `is_match` boolean remains the authoritative go/no-go
/// signal because it incorporates the configured threshold.
///
/// Boundaries:
///
/// | Score range | Band |
/// |---|---|
/// | `score >= 0.90` | `High` |
/// | `0.75 <= score < 0.90` | `Medium` |
/// | `score < 0.75` | `Low` |
///
/// A `NaN` score satisfies neither lower bound and is therefore bucketed as
/// `Low` by [`Confidence::from_score`].
///
/// # Examples
///
/// ```
/// use thing_matcher::Confidence;
///
/// assert_eq!(Confidence::from_score(0.99), Confidence::High);
/// assert_eq!(Confidence::from_score(0.90), Confidence::High); // inclusive
/// assert_eq!(Confidence::from_score(0.85), Confidence::Medium);
/// assert_eq!(Confidence::from_score(0.75), Confidence::Medium); // inclusive
/// assert_eq!(Confidence::from_score(0.50), Confidence::Low);
/// assert_eq!(Confidence::from_score(0.00), Confidence::Low);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Confidence {
    /// Score is at or above `0.90`. Strong match; safe to act on with
    /// minimal review.
    High,
    /// Score is in `0.75..0.90`. Medium-confidence match; the per-field
    /// [`MatchBreakdown`] should be inspected before downstream use.
    Medium,
    /// Score is below `0.75`. Treat as a candidate at best; require
    /// additional evidence before treating as the same thing.
    Low,
}
234
235impl Confidence {
236 /// Bucket a probabilistic score into one of the three bands.
237 ///
238 /// The function is total over `f64`: NaN inputs degrade to `Low`,
239 /// negative scores degrade to `Low`, scores above `1.0` are treated
240 /// as `High`. In practice the matcher only ever produces values in
241 /// `[0.0, 1.0]`, so callers shouldn't encounter the degenerate
242 /// inputs.
243 ///
244 /// ```
245 /// use thing_matcher::Confidence;
246 ///
247 /// assert_eq!(Confidence::from_score(f64::NAN), Confidence::Low);
248 /// assert_eq!(Confidence::from_score(-0.5), Confidence::Low);
249 /// assert_eq!(Confidence::from_score(2.0), Confidence::High);
250 /// ```
251 pub fn from_score(score: f64) -> Self {
252 if score >= 0.90 {
253 Confidence::High
254 } else if score >= 0.75 {
255 Confidence::Medium
256 } else {
257 Confidence::Low
258 }
259 }
260}
261
/// Outcome of a probabilistic thing match.
///
/// Contains the overall renormalised `score`, the threshold-derived
/// `is_match` boolean, a coarse [`Confidence`] band, and a per-field
/// [`MatchBreakdown`] for audit.
///
/// `MatchResult` implements `Serialize + Deserialize` so it can be persisted
/// or returned over an API.
///
/// ```
/// use thing_matcher::{Confidence, MatchingEngine, Thing};
///
/// let t = Thing::builder().name("Eiffel Tower").build();
/// let q = t.clone();
/// let result = MatchingEngine::default_config().match_things(&t, &q);
/// assert_eq!(result.confidence, Confidence::High);
///
/// // Round-trip through JSON.
/// let json = serde_json::to_string(&result).unwrap();
/// let back: thing_matcher::MatchResult = serde_json::from_str(&json).unwrap();
/// assert!((result.score - back.score).abs() < 1e-12);
/// assert_eq!(result.is_match, back.is_match);
/// assert_eq!(result.confidence, back.confidence);
/// ```
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchResult {
    /// Overall match score in `[0.0, 1.0]`.
    pub score: f64,

    /// `true` if `score >= MatchConfig::match_threshold`. When
    /// `MatchConfig::strict_mode` is enabled the pair must additionally
    /// pass [`MatchingEngine::deterministic_match`], so a high score alone
    /// is not sufficient in that mode.
    pub is_match: bool,

    /// Coarse confidence band derived from `score`. Defaults to
    /// [`Confidence::Low`] on legacy JSON payloads that pre-date the field.
    #[serde(default = "default_confidence")]
    pub confidence: Confidence,

    /// Per-field score contributions for explainability.
    pub breakdown: MatchBreakdown,
}
302
/// Serde fallback for the `confidence` field of [`MatchResult`] (wired in
/// through `#[serde(default = "default_confidence")]`).
///
/// Legacy JSON payloads that lack the field deserialise with
/// `Confidence::Low`, so a payload that pre-dates the field is
/// unambiguously flagged as "needs re-scoring".
fn default_confidence() -> Confidence {
    Confidence::Low
}
310
/// Per-field score breakdown returned with every [`MatchResult`].
///
/// Each field is `Option<f64>`:
///
/// - `Some(score)` — the field was scored; the value is in `[0.0, 1.0]`.
/// - `None` — the field did not participate in the weighted sum. For the
///   single-valued and identifier fields that means it was missing on at
///   least one side; for the set-valued fields (`same_as_score`,
///   `additional_types_score`) it means *both* sides were empty — a
///   one-sided set is scored and yields `Some(0.0)`.
///
/// The breakdown exists so an auditor can see *why* a match was
/// flagged. Do not throw it away in downstream services.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchBreakdown {
    /// Best-of-cartesian-product similarity across (primary name +
    /// alternate names) on both sides, using the configured algorithm.
    /// `None` if either side has no non-blank name.
    pub name_score: Option<f64>,
    /// Phonetic comparison across the same name pairs: `1.0` if any pair
    /// shares a non-empty phonetic code, `0.0` otherwise. `None` when
    /// `use_phonetic_matching` is false or either side has no names.
    pub name_phonetic_score: Option<f64>,
    /// `Combined` similarity over `description`, after `normalize_text`.
    /// `None` if either side is absent.
    pub description_score: Option<f64>,
    /// `Combined` similarity over `disambiguatingDescription`, after
    /// `normalize_text`. `None` if either side is absent.
    pub disambiguating_description_score: Option<f64>,
    /// `1.0` if both `identifiers` non-empty and they share any
    /// `(property_id, value)` pair; `0.0` if both non-empty but none
    /// shared; `None` if either side is empty.
    pub identifiers_score: Option<f64>,
    /// `1.0` if both `url`s normalise to the same string; `0.0`
    /// otherwise; `None` if either side is absent.
    pub url_score: Option<f64>,
    /// Jaccard set similarity over the union of `sameAs` URLs after
    /// `normalize_url`. `None` if both sides are empty.
    pub same_as_score: Option<f64>,
    /// `1.0` if both `image`s normalise to the same string; `0.0`
    /// otherwise; `None` if either side is absent.
    pub image_score: Option<f64>,
    /// `1.0` if both `mainEntityOfPage`s normalise to the same string;
    /// `0.0` otherwise; `None` if either side is absent.
    pub main_entity_of_page_score: Option<f64>,
    /// Jaccard set similarity over the union of `additionalType` URIs
    /// after `normalize_url`. `None` if both sides are empty.
    pub additional_types_score: Option<f64>,
}
355
356/// Thing matcher engine.
357///
358/// The engine is **immutable after construction** and cheap to clone (it
359/// owns only a [`MatchConfig`]). Construct one and call its methods from
360/// any thread.
361///
362/// ```
363/// use thing_matcher::{MatchConfig, MatchingEngine};
364///
365/// let engine_a = MatchingEngine::default_config();
366/// let engine_b = MatchingEngine::new(MatchConfig::strict());
367/// # let _ = (engine_a, engine_b);
368/// ```
369pub struct MatchingEngine {
370 config: MatchConfig,
371}
372
373impl MatchingEngine {
374 /// Construct an engine with the given configuration.
375 ///
376 /// ```
377 /// use thing_matcher::{MatchConfig, MatchingEngine};
378 /// let engine = MatchingEngine::new(MatchConfig::lenient());
379 /// # let _ = engine;
380 /// ```
381 pub fn new(config: MatchConfig) -> Self {
382 Self { config }
383 }
384
385 /// Construct an engine with [`MatchConfig::default`].
386 ///
387 /// ```
388 /// use thing_matcher::MatchingEngine;
389 /// let engine = MatchingEngine::default_config();
390 /// # let _ = engine;
391 /// ```
392 pub fn default_config() -> Self {
393 Self::new(MatchConfig::default())
394 }
395
396 /// Compare two things probabilistically and return a [`MatchResult`].
397 ///
398 /// The score is the weight-renormalised sum of every component that
399 /// scored on both records. Missing fields are skipped, not penalised.
400 ///
401 /// ```
402 /// use thing_matcher::{MatchingEngine, Thing};
403 ///
404 /// let t = Thing::builder()
405 /// .name("Eiffel Tower")
406 /// .url("https://www.toureiffel.paris/")
407 /// .build();
408 ///
409 /// let result = MatchingEngine::default_config().match_things(&t, &t);
410 /// assert!(result.is_match);
411 /// assert!(result.score > 0.99);
412 /// ```
413 pub fn match_things(&self, thing1: &Thing, thing2: &Thing) -> MatchResult {
414 let breakdown = self.calculate_breakdown(thing1, thing2);
415 let score = self.calculate_weighted_score(&breakdown);
416 let above_threshold = score >= self.config.match_threshold;
417 // Under strict mode, `is_match` ALSO requires a deterministic match.
418 let is_match = if self.config.strict_mode {
419 above_threshold && self.deterministic_match(thing1, thing2)
420 } else {
421 above_threshold
422 };
423 let confidence = Confidence::from_score(score);
424
425 MatchResult {
426 score,
427 is_match,
428 confidence,
429 breakdown,
430 }
431 }
432
433 /// Score a single query against many candidates. Returns one
434 /// [`MatchResult`] per candidate, in the same order as the input slice.
435 ///
436 /// The engine is immutable and `Send + Sync`, so call-sites that want
437 /// parallel evaluation can wrap the call in `rayon::par_iter` or similar
438 /// without further changes to this crate.
439 ///
440 /// # Examples
441 ///
442 /// ```
443 /// use thing_matcher::{MatchingEngine, Thing};
444 ///
445 /// let query = Thing::builder().name("Eiffel Tower").build();
446 /// let candidates = vec![
447 /// Thing::builder().name("Eiffel Tower").build(),
448 /// Thing::builder().name("Big Ben").build(),
449 /// ];
450 ///
451 /// let engine = MatchingEngine::default_config();
452 /// let results = engine.match_one_to_many(&query, &candidates);
453 /// assert_eq!(results.len(), 2);
454 /// assert!(results[0].is_match);
455 /// assert!(!results[1].is_match);
456 /// ```
457 ///
458 /// Empty candidates yield an empty result:
459 ///
460 /// ```
461 /// # use thing_matcher::{MatchingEngine, Thing};
462 /// let q = Thing::builder().name("Solo").build();
463 /// let r = MatchingEngine::default_config().match_one_to_many(&q, &[]);
464 /// assert!(r.is_empty());
465 /// ```
466 pub fn match_one_to_many(&self, query: &Thing, candidates: &[Thing]) -> Vec<MatchResult> {
467 candidates
468 .iter()
469 .map(|c| self.match_things(query, c))
470 .collect()
471 }
472
473 /// Score and rank: return `(original_index, MatchResult)` tuples
474 /// sorted by descending score. Ties are broken by ascending original
475 /// index, so the result is deterministic.
476 ///
477 /// # Examples
478 ///
479 /// ```
480 /// use thing_matcher::{MatchingEngine, Thing};
481 ///
482 /// let query = Thing::builder().name("Eiffel Tower").build();
483 /// let candidates = vec![
484 /// Thing::builder().name("Big Ben").build(), // index 0
485 /// Thing::builder().name("Eiffel Tower").build(), // index 1 — best match
486 /// Thing::builder().name("Statue of Liberty").build(), // index 2
487 /// ];
488 ///
489 /// let ranked = MatchingEngine::default_config().rank_one_to_many(&query, &candidates);
490 /// assert_eq!(ranked.len(), 3);
491 /// assert_eq!(ranked[0].0, 1);
492 /// assert!(ranked[0].1.score >= ranked[1].1.score);
493 /// assert!(ranked[1].1.score >= ranked[2].1.score);
494 /// ```
495 pub fn rank_one_to_many(
496 &self,
497 query: &Thing,
498 candidates: &[Thing],
499 ) -> Vec<(usize, MatchResult)> {
500 let mut indexed: Vec<(usize, MatchResult)> = self
501 .match_one_to_many(query, candidates)
502 .into_iter()
503 .enumerate()
504 .collect();
505 indexed.sort_by(|a, b| {
506 b.1.score
507 .partial_cmp(&a.1.score)
508 .unwrap_or(std::cmp::Ordering::Equal)
509 .then_with(|| a.0.cmp(&b.0))
510 });
511 indexed
512 }
513
514 /// Compare two things deterministically and return a single boolean.
515 ///
516 /// Returns `true` iff any of the following hold:
517 ///
518 /// - the things share any `(property_id, value)` pair in their
519 /// `identifiers` lists;
520 /// - the things share any `sameAs` URL after URL normalisation;
521 /// - both have a `url` that normalises to the same string.
522 ///
523 /// ```
524 /// use thing_matcher::{Identifier, MatchingEngine, Thing};
525 ///
526 /// let id = Identifier::new("wikidata", "Q243").unwrap();
527 /// let a = Thing::builder().name("Eiffel Tower").add_identifier(id.clone()).build();
528 /// let b = Thing::builder().name("Tour Eiffel").add_identifier(id).build();
529 /// assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
530 /// ```
531 pub fn deterministic_match(&self, thing1: &Thing, thing2: &Thing) -> bool {
532 if shares_identifier(thing1, thing2) {
533 return true;
534 }
535 if shares_same_as(thing1, thing2) {
536 return true;
537 }
538 same_canonical_url(thing1, thing2)
539 }
540
541 fn calculate_breakdown(&self, thing1: &Thing, thing2: &Thing) -> MatchBreakdown {
542 MatchBreakdown {
543 name_score: self.score_name(thing1, thing2),
544 name_phonetic_score: if self.config.use_phonetic_matching {
545 self.score_phonetic_names(thing1, thing2)
546 } else {
547 None
548 },
549 description_score: score_text(&thing1.description, &thing2.description),
550 disambiguating_description_score: score_text(
551 &thing1.disambiguating_description,
552 &thing2.disambiguating_description,
553 ),
554 identifiers_score: score_identifiers(thing1, thing2),
555 url_score: score_url(&thing1.url, &thing2.url),
556 same_as_score: score_url_set(&thing1.same_as, &thing2.same_as),
557 image_score: score_url(&thing1.image, &thing2.image),
558 main_entity_of_page_score: score_url(
559 &thing1.main_entity_of_page,
560 &thing2.main_entity_of_page,
561 ),
562 additional_types_score: score_url_set(
563 &thing1.additional_types,
564 &thing2.additional_types,
565 ),
566 }
567 }
568
569 fn calculate_weighted_score(&self, breakdown: &MatchBreakdown) -> f64 {
570 let mut total_weight = 0.0;
571 let mut weighted_sum = 0.0;
572
573 let mut add = |score: Option<f64>, weight: f64| {
574 if let Some(s) = score {
575 weighted_sum += s * weight;
576 total_weight += weight;
577 }
578 };
579
580 add(breakdown.name_score, self.config.name_weight);
581 add(breakdown.description_score, self.config.description_weight);
582 add(
583 breakdown.disambiguating_description_score,
584 self.config.disambiguating_description_weight,
585 );
586 add(breakdown.identifiers_score, self.config.identifiers_weight);
587 add(breakdown.url_score, self.config.url_weight);
588 add(breakdown.same_as_score, self.config.same_as_weight);
589 add(breakdown.image_score, self.config.image_weight);
590 add(
591 breakdown.main_entity_of_page_score,
592 self.config.main_entity_of_page_weight,
593 );
594 add(
595 breakdown.additional_types_score,
596 self.config.additional_types_weight,
597 );
598
599 // Phonetic match is a bonus only — never lowers the score.
600 if let Some(score) = breakdown.name_phonetic_score
601 && score > 0.9
602 {
603 weighted_sum += score * 0.05;
604 total_weight += 0.05;
605 }
606
607 if total_weight > 0.0 {
608 weighted_sum / total_weight
609 } else {
610 0.0
611 }
612 }
613
614 fn score_name(&self, thing1: &Thing, thing2: &Thing) -> Option<f64> {
615 let names1 = collect_names(thing1);
616 let names2 = collect_names(thing2);
617 if names1.is_empty() || names2.is_empty() {
618 return None;
619 }
620 let mut best = f64::NEG_INFINITY;
621 for n1 in &names1 {
622 for n2 in &names2 {
623 let s = self.score_name_pair(n1, n2);
624 if s > best {
625 best = s;
626 }
627 }
628 }
629 Some(best)
630 }
631
632 fn score_name_pair(&self, name1: &str, name2: &str) -> f64 {
633 let norm1 = Normalizer::normalize_name(name1);
634 let norm2 = Normalizer::normalize_name(name2);
635 match self.config.name_algorithm {
636 SimilarityAlgorithm::JaroWinkler => Scorer::jaro_winkler_similarity(&norm1, &norm2),
637 SimilarityAlgorithm::Levenshtein => Scorer::levenshtein_similarity(&norm1, &norm2),
638 SimilarityAlgorithm::Exact => Scorer::exact_match(&norm1, &norm2),
639 SimilarityAlgorithm::Combined => Scorer::combined_similarity(&norm1, &norm2),
640 }
641 }
642
643 fn score_phonetic_names(&self, thing1: &Thing, thing2: &Thing) -> Option<f64> {
644 let names1 = collect_names(thing1);
645 let names2 = collect_names(thing2);
646 if names1.is_empty() || names2.is_empty() {
647 return None;
648 }
649 let codes1: Vec<String> = names1
650 .iter()
651 .map(|n| Normalizer::phonetic_code(n))
652 .collect();
653 let codes2: Vec<String> = names2
654 .iter()
655 .map(|n| Normalizer::phonetic_code(n))
656 .collect();
657 let mut best = 0.0_f64;
658 for c1 in &codes1 {
659 for c2 in &codes2 {
660 if !c1.is_empty() && c1 == c2 {
661 best = 1.0;
662 }
663 }
664 }
665 Some(best)
666 }
667}
668
669// ---- Free helpers ------------------------------------------------------
670
671/// Collect a thing's primary name plus alternate names into a single vec
672/// of references. Empty / whitespace-only strings are skipped.
673fn collect_names(thing: &Thing) -> Vec<&String> {
674 thing
675 .name
676 .iter()
677 .chain(thing.alternate_names.iter())
678 .filter(|s| !s.trim().is_empty())
679 .collect()
680}
681
682/// `Combined` similarity over a pair of optional free-form text fields.
683/// Returns `None` if either side is absent.
684fn score_text(a: &Option<String>, b: &Option<String>) -> Option<f64> {
685 let a = a.as_ref()?;
686 let b = b.as_ref()?;
687 let na = Normalizer::normalize_text(a);
688 let nb = Normalizer::normalize_text(b);
689 Some(Scorer::combined_similarity(&na, &nb))
690}
691
692/// Exact match over a pair of optional URL fields, compared after URL
693/// normalisation. Returns `None` if either side is absent.
694fn score_url(a: &Option<String>, b: &Option<String>) -> Option<f64> {
695 let a = a.as_ref()?;
696 let b = b.as_ref()?;
697 let na = Normalizer::normalize_url(a);
698 let nb = Normalizer::normalize_url(b);
699 Some(Scorer::exact_match(&na, &nb))
700}
701
702/// Jaccard set similarity over two URL lists. Returns `None` only if both
703/// sides are empty; an empty-against-non-empty pair scores `0.0`.
704fn score_url_set(a: &[String], b: &[String]) -> Option<f64> {
705 if a.is_empty() && b.is_empty() {
706 return None;
707 }
708 let na: Vec<String> = a.iter().map(|s| Normalizer::normalize_url(s)).collect();
709 let nb: Vec<String> = b.iter().map(|s| Normalizer::normalize_url(s)).collect();
710 Some(Scorer::jaccard_set_similarity(&na, &nb))
711}
712
713/// `Some(1.0)` if any `(property_id, value)` pair is shared, `Some(0.0)`
714/// if both lists are non-empty but no pair is shared, `None` if either
715/// list is empty.
716fn score_identifiers(thing1: &Thing, thing2: &Thing) -> Option<f64> {
717 if thing1.identifiers.is_empty() || thing2.identifiers.is_empty() {
718 return None;
719 }
720 Some(if shares_identifier(thing1, thing2) {
721 1.0
722 } else {
723 0.0
724 })
725}
726
727fn shares_identifier(thing1: &Thing, thing2: &Thing) -> bool {
728 if thing1.identifiers.is_empty() || thing2.identifiers.is_empty() {
729 return false;
730 }
731 for id1 in &thing1.identifiers {
732 for id2 in &thing2.identifiers {
733 if id1 == id2 {
734 return true;
735 }
736 }
737 }
738 false
739}
740
741fn shares_same_as(thing1: &Thing, thing2: &Thing) -> bool {
742 if thing1.same_as.is_empty() || thing2.same_as.is_empty() {
743 return false;
744 }
745 let set1: std::collections::BTreeSet<String> = thing1
746 .same_as
747 .iter()
748 .map(|s| Normalizer::normalize_url(s))
749 .collect();
750 for s in &thing2.same_as {
751 if set1.contains(&Normalizer::normalize_url(s)) {
752 return true;
753 }
754 }
755 false
756}
757
758fn same_canonical_url(thing1: &Thing, thing2: &Thing) -> bool {
759 let (Some(u1), Some(u2)) = (thing1.url.as_ref(), thing2.url.as_ref()) else {
760 return false;
761 };
762 Normalizer::normalize_url(u1) == Normalizer::normalize_url(u2)
763}
764
765#[cfg(test)]
766mod tests {
767 use super::*;
768 use crate::models::Identifier;
769
770 // ---------- MatchConfig presets ----------
771
772 #[test]
773 fn config_default_values() {
774 let c = MatchConfig::default();
775 assert!((c.match_threshold - 0.80).abs() < 1e-9);
776 assert!(!c.strict_mode);
777 }
778
779 #[test]
780 fn config_strict_raises_threshold_and_sets_flag() {
781 let c = MatchConfig::strict();
782 assert!((c.match_threshold - 0.95).abs() < 1e-9);
783 assert!(c.strict_mode);
784 }
785
786 #[test]
787 fn config_lenient_lowers_threshold() {
788 let c = MatchConfig::lenient();
789 assert!((c.match_threshold - 0.65).abs() < 1e-9);
790 assert!(c.use_phonetic_matching);
791 }
792
793 // ---------- MatchConfig serde ----------
794
795 #[test]
796 fn config_default_round_trips_through_json() {
797 let cfg = MatchConfig::default();
798 let json = serde_json::to_string(&cfg).expect("serialise");
799 let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
800 assert!((cfg.match_threshold - back.match_threshold).abs() < 1e-12);
801 assert!((cfg.name_weight - back.name_weight).abs() < 1e-12);
802 assert!((cfg.identifiers_weight - back.identifiers_weight).abs() < 1e-12);
803 assert!(matches!(back.name_algorithm, SimilarityAlgorithm::Combined));
804 assert_eq!(cfg.strict_mode, back.strict_mode);
805 }
806
807 #[test]
808 fn config_partial_json_fills_missing_fields_from_default() {
809 let partial = r#"{"match_threshold": 0.80, "name_weight": 0.5}"#;
810 let cfg: MatchConfig = serde_json::from_str(partial).expect("partial json");
811 assert!((cfg.match_threshold - 0.80).abs() < 1e-12);
812 assert!((cfg.name_weight - 0.5).abs() < 1e-12);
813 assert!(matches!(cfg.name_algorithm, SimilarityAlgorithm::Combined));
814 }
815
816 // ---------- probabilistic match ----------
817
818 #[test]
819 fn exact_clone_is_a_match() {
820 let t = Thing::builder()
821 .name("Eiffel Tower")
822 .url("https://www.toureiffel.paris/")
823 .build();
824 let result = MatchingEngine::default_config().match_things(&t, &t.clone());
825 assert!(result.is_match);
826 assert!(result.score > 0.95);
827 }
828
829 #[test]
830 fn name_match_takes_best_of_cartesian_product() {
831 let t1 = Thing::builder().name("Eiffel Tower").build();
832 let t2 = Thing::builder()
833 .name("La Tour Eiffel")
834 .add_alternate_name("Eiffel Tower")
835 .build();
836 let r = MatchingEngine::default_config().match_things(&t1, &t2);
837 let s = r.breakdown.name_score.expect("scored");
838 assert!(
839 s > 0.99,
840 "best-of cartesian product should pick exact match: {s}"
841 );
842 }
843
844 #[test]
845 fn unrelated_things_do_not_match() {
846 let a = Thing::builder().name("Eiffel Tower").build();
847 let b = Thing::builder().name("Sydney Opera House").build();
848 let r = MatchingEngine::default_config().match_things(&a, &b);
849 assert!(!r.is_match);
850 assert!(r.score < 0.5);
851 }
852
853 #[test]
854 fn no_overlapping_fields_returns_zero_score() {
855 let a = Thing::builder().description("foo").build();
856 let b = Thing::builder()
857 .add_same_as("https://example.org/x")
858 .build();
859 let r = MatchingEngine::default_config().match_things(&a, &b);
860 assert_eq!(r.score, 0.0);
861 }
862
863 // ---------- description / disambiguating_description ----------
864
865 #[test]
866 fn description_identical_scores_one() {
867 let t1 = Thing::builder()
868 .name("X")
869 .description("Iron tower in Paris.")
870 .build();
871 let t2 = Thing::builder()
872 .name("X")
873 .description("Iron tower in Paris.")
874 .build();
875 let r = MatchingEngine::default_config().match_things(&t1, &t2);
876 assert!(r.breakdown.description_score.unwrap() > 0.99);
877 }
878
879 #[test]
880 fn description_score_none_when_either_missing() {
881 let t1 = Thing::builder()
882 .name("X")
883 .description("Iron tower in Paris.")
884 .build();
885 let t2 = Thing::builder().name("X").build();
886 let r = MatchingEngine::default_config().match_things(&t1, &t2);
887 assert!(r.breakdown.description_score.is_none());
888 }
889
890 // ---------- identifiers ----------
891
892 #[test]
893 fn identifiers_shared_scores_one() {
894 let id = Identifier::new("wikidata", "Q243").unwrap();
895 let a = Thing::builder()
896 .name("X")
897 .add_identifier(id.clone())
898 .build();
899 let b = Thing::builder().name("X").add_identifier(id).build();
900 let r = MatchingEngine::default_config().match_things(&a, &b);
901 assert_eq!(r.breakdown.identifiers_score, Some(1.0));
902 }
903
904 #[test]
905 fn identifiers_property_scoped_no_cross_match() {
906 let a = Thing::builder()
907 .name("X")
908 .add_identifier(Identifier::new("google", "X").unwrap())
909 .build();
910 let b = Thing::builder()
911 .name("X")
912 .add_identifier(Identifier::new("wikidata", "X").unwrap())
913 .build();
914 let r = MatchingEngine::default_config().match_things(&a, &b);
915 assert_eq!(r.breakdown.identifiers_score, Some(0.0));
916 }
917
918 #[test]
919 fn identifiers_none_when_either_side_empty() {
920 let a = Thing::builder().name("X").build();
921 let b = Thing::builder()
922 .name("X")
923 .add_identifier(Identifier::new("wikidata", "Q1").unwrap())
924 .build();
925 let r = MatchingEngine::default_config().match_things(&a, &b);
926 assert!(r.breakdown.identifiers_score.is_none());
927 }
928
929 // ---------- url ----------
930
931 #[test]
932 fn url_normalised_equality_scores_one() {
933 let a = Thing::builder()
934 .name("X")
935 .url("HTTPS://Example.ORG/")
936 .build();
937 let b = Thing::builder()
938 .name("X")
939 .url("https://example.org")
940 .build();
941 let r = MatchingEngine::default_config().match_things(&a, &b);
942 assert_eq!(r.breakdown.url_score, Some(1.0));
943 }
944
/// Two things with different URLs get a URL score of `0.0`.
#[test]
fn url_mismatch_scores_zero() {
    let at_a = Thing::builder()
        .name("X")
        .url("https://a.org")
        .build();
    let at_b = Thing::builder()
        .name("X")
        .url("https://b.org")
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&at_a, &at_b);
    assert_eq!(outcome.breakdown.url_score, Some(0.0));
}
952
/// When only one side has a URL, the breakdown carries no URL score.
#[test]
fn url_none_when_either_side_missing() {
    let with_url = Thing::builder()
        .name("X")
        .url("https://a.org")
        .build();
    let without_url = Thing::builder().name("X").build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&with_url, &without_url);

    let url_score = &outcome.breakdown.url_score;
    assert!(url_score.is_none());
}
960
961 // ---------- sameAs / additional_types ----------
962
/// Two `sameAs` sets sharing one of three distinct URLs score `1/3`.
#[test]
fn same_as_jaccard_partial_overlap() {
    let left = Thing::builder()
        .name("X")
        .add_same_as("https://example.org/a")
        .add_same_as("https://example.org/b")
        .build();
    let right = Thing::builder()
        .name("X")
        .add_same_as("https://example.org/b")
        .add_same_as("https://example.org/c")
        .build();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&left, &right);
    let s = outcome.breakdown.same_as_score.expect("scored");

    // intersection {b}, union {a,b,c} => 1/3
    let expected = 1.0_f64 / 3.0;
    let deviation = (s - expected).abs();
    assert!(deviation < 1e-9, "got {s}");
}
980
/// With no `sameAs` links on either side, the breakdown carries no `sameAs` score.
#[test]
fn same_as_none_when_both_empty() {
    // Both records are built identically: a name and nothing else.
    let name_only = || Thing::builder().name("X").build();
    let left = name_only();
    let right = name_only();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&left, &right);

    let same_as_score = &outcome.breakdown.same_as_score;
    assert!(same_as_score.is_none());
}
988
/// Identical single-element additional-type sets score `1.0`.
#[test]
fn additional_types_jaccard_full_overlap() {
    // Both records declare exactly the same additional type.
    let landmark = || {
        Thing::builder()
            .name("X")
            .add_additional_type("https://schema.org/Landmark")
            .build()
    };
    let left = landmark();
    let right = landmark();

    let engine = MatchingEngine::default_config();
    let outcome = engine.match_things(&left, &right);
    assert_eq!(outcome.breakdown.additional_types_score, Some(1.0));
}
1002
1003 // ---------- deterministic match ----------
1004
/// A shared identifier makes the deterministic match succeed even though the
/// two names differ.
#[test]
fn deterministic_via_shared_identifier() {
    let shared = Identifier::new("wikidata", "Q243").unwrap();

    let tower = Thing::builder()
        .name("Eiffel Tower")
        .add_identifier(shared.clone())
        .build();
    let other_name = Thing::builder()
        .name("Wholly Different")
        .add_identifier(shared)
        .build();

    let engine = MatchingEngine::default_config();
    let matched = engine.deterministic_match(&tower, &other_name);
    assert!(matched);
}
1018
/// A shared `sameAs` URL makes the deterministic match succeed.
#[test]
fn deterministic_via_shared_same_as() {
    let english = Thing::builder()
        .name("Eiffel Tower")
        .add_same_as("https://www.wikidata.org/wiki/Q243")
        .build();
    let french = Thing::builder()
        .name("Tour Eiffel")
        .add_same_as("https://www.wikidata.org/wiki/Q243")
        .build();

    let engine = MatchingEngine::default_config();
    let matched = engine.deterministic_match(&english, &french);
    assert!(matched);
}
1031
/// URLs differing only by a trailing slash make the deterministic match
/// succeed, even with different names.
#[test]
fn deterministic_via_shared_url() {
    let with_slash = Thing::builder()
        .name("X")
        .url("https://example.org/")
        .build();
    let without_slash = Thing::builder()
        .name("Y")
        .url("https://example.org")
        .build();

    let engine = MatchingEngine::default_config();
    let matched = engine.deterministic_match(&with_slash, &without_slash);
    assert!(matched);
}
1044
/// Equal names alone are not enough: with no identifier, `sameAs` or URL on
/// either side, the deterministic match is rejected.
#[test]
fn deterministic_rejects_when_no_shared_identity_signal() {
    let name_only = || Thing::builder().name("X").build();
    let left = name_only();
    let right = name_only();

    let engine = MatchingEngine::default_config();
    let matched = engine.deterministic_match(&left, &right);
    assert!(!matched);
}
1051
1052 // ---------- strict_mode enforcement ----------
1053
/// In strict mode a pair that clears the score threshold is still reported as
/// a non-match when the deterministic check fails.
#[test]
fn strict_mode_requires_deterministic_for_is_match() {
    let engine = MatchingEngine::new(MatchConfig {
        match_threshold: 0.40,
        strict_mode: true,
        ..MatchConfig::default()
    });

    let centrale = Thing::builder().name("Cafe Centrale").build();
    let central = Thing::builder().name("Cafe Central").build();

    let outcome = engine.match_things(&centrale, &central);
    assert!(outcome.score >= 0.40, "should clear threshold");

    // No shared identifier / sameAs / url → not deterministic
    let deterministic = engine.deterministic_match(&centrale, &central);
    assert!(!deterministic);
    assert!(!outcome.is_match);
}
1070
1071 // ---------- batch APIs ----------
1072
/// Matching a query against an empty candidate slice yields an empty result.
#[test]
fn match_one_to_many_empty_candidates_yields_empty_vec() {
    let query = Thing::builder().name("Solo").build();
    let engine = MatchingEngine::default_config();

    let results = engine.match_one_to_many(&query, &[]);
    assert!(results.is_empty());
}
1079
/// Ranking puts the exact copy of the query (candidate index 1) first and
/// keeps scores in non-increasing order.
#[test]
fn rank_one_to_many_sorts_by_score_descending() {
    let query = Thing::builder().name("Eiffel Tower").build();
    let candidates = vec![
        Thing::builder().name("Big Ben").build(),
        query.clone(),
        Thing::builder().name("Statue of Liberty").build(),
    ];

    let engine = MatchingEngine::default_config();
    let ranked = engine.rank_one_to_many(&query, &candidates);

    // The clone of the query sits at index 1 of `candidates`.
    assert_eq!(ranked[0].0, 1);

    // Compare every entry with its immediate successor.
    for (earlier, later) in ranked.iter().zip(ranked.iter().skip(1)) {
        assert!(earlier.1.score >= later.1.score);
    }
}
1095
1096 // ---------- Confidence ----------
1097
/// Checks the band each boundary score falls into: `0.90` and `0.75` belong
/// to the higher band, while `0.89` and `0.74` fall into the band below.
#[test]
fn confidence_band_boundaries_are_inclusive_on_the_low_side() {
    let cases = [
        (0.90, Confidence::High),
        (0.89, Confidence::Medium),
        (0.75, Confidence::Medium),
        (0.74, Confidence::Low),
    ];

    for (score, expected) in cases {
        assert_eq!(Confidence::from_score(score), expected);
    }
}
1105
1106 // ---------- phonetic ----------
1107
/// With phonetic matching disabled, the breakdown carries no phonetic name score.
#[test]
fn phonetic_score_none_when_off() {
    let cfg = MatchConfig {
        use_phonetic_matching: false,
        ..MatchConfig::default()
    };
    let engine = MatchingEngine::new(cfg);

    let stephen = Thing::builder().name("Stephen").build();
    let steven = Thing::builder().name("Steven").build();

    let outcome = engine.match_things(&stephen, &steven);
    let phonetic_score = &outcome.breakdown.name_phonetic_score;
    assert!(phonetic_score.is_none());
}
1119
/// With phonetic matching enabled, the breakdown carries a phonetic name score.
#[test]
fn phonetic_score_some_when_on() {
    let cfg = MatchConfig {
        use_phonetic_matching: true,
        ..MatchConfig::default()
    };
    let engine = MatchingEngine::new(cfg);

    let stephen = Thing::builder().name("Stephen").build();
    let steven = Thing::builder().name("Steven").build();

    let outcome = engine.match_things(&stephen, &steven);
    let phonetic_score = &outcome.breakdown.name_phonetic_score;
    assert!(phonetic_score.is_some());
}
1131}