use crate::models::Thing;
use crate::normalizer::Normalizer;
use crate::scorer::{Scorer, SimilarityAlgorithm};
use serde::{Deserialize, Serialize};
/// Tunable parameters for [`MatchingEngine`]: the match threshold, the
/// per-field weights of the weighted average, and the matching-mode switches.
///
/// The container-level `#[serde(default)]` allows partial input: any field
/// absent from the serialized form is taken from [`MatchConfig::default`].
///
/// Weights do not have to sum to 1: the engine divides the weighted sum by the
/// total weight of the fields that could actually be compared.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct MatchConfig {
/// Minimum weighted score (inclusive) for a pair to be reported as a match.
pub match_threshold: f64,
/// Weight of the name similarity score.
pub name_weight: f64,
/// Weight of the description similarity score.
pub description_weight: f64,
/// Weight of the disambiguating-description similarity score.
pub disambiguating_description_weight: f64,
/// Weight of the shared-identifier score.
pub identifiers_weight: f64,
/// Weight of the `url` comparison score.
pub url_weight: f64,
/// Weight of the `same_as` URL-set similarity score.
pub same_as_weight: f64,
/// Weight of the `image` URL comparison score.
pub image_weight: f64,
/// Weight of the `main_entity_of_page` URL comparison score.
pub main_entity_of_page_weight: f64,
/// Weight of the `additional_types` URL-set similarity score.
pub additional_types_weight: f64,
/// When `true`, the engine also computes a phonetic name score.
pub use_phonetic_matching: bool,
/// Similarity algorithm applied to each pair of normalized names.
pub name_algorithm: SimilarityAlgorithm,
/// When `true`, a match additionally requires a deterministic identity
/// signal (see [`MatchingEngine::deterministic_match`]).
pub strict_mode: bool,
}
impl Default for MatchConfig {
fn default() -> Self {
Self {
match_threshold: 0.80,
name_weight: 0.30,
description_weight: 0.10,
disambiguating_description_weight: 0.05,
identifiers_weight: 0.25,
url_weight: 0.05,
same_as_weight: 0.15,
image_weight: 0.03,
main_entity_of_page_weight: 0.02,
additional_types_weight: 0.05,
use_phonetic_matching: false,
name_algorithm: SimilarityAlgorithm::Combined,
strict_mode: false,
}
}
}
impl MatchConfig {
    /// Preset that favours precision: the threshold is raised to 0.95 and
    /// strict mode is switched on. Every other field keeps its default value.
    pub fn strict() -> Self {
        let base = Self::default();
        Self {
            strict_mode: true,
            match_threshold: 0.95,
            ..base
        }
    }

    /// Preset that favours recall: the threshold is lowered to 0.65 and
    /// phonetic name matching is switched on. Every other field keeps its
    /// default value.
    pub fn lenient() -> Self {
        let base = Self::default();
        Self {
            use_phonetic_matching: true,
            match_threshold: 0.65,
            ..base
        }
    }
}
/// Coarse confidence band derived from a match score
/// (see [`Confidence::from_score`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Confidence {
/// Score of at least 0.90.
High,
/// Score of at least 0.75 but below 0.90.
Medium,
/// Any other score.
Low,
}
impl Confidence {
pub fn from_score(score: f64) -> Self {
if score >= 0.90 {
Confidence::High
} else if score >= 0.75 {
Confidence::Medium
} else {
Confidence::Low
}
}
}
/// Outcome of comparing two things with [`MatchingEngine::match_things`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchResult {
/// Weighted average of the per-field scores that could be computed.
pub score: f64,
/// Whether the pair is considered a match under the engine's configuration.
pub is_match: bool,
/// Confidence band for `score`; falls back to `Confidence::Low` when the
/// field is absent from deserialized input.
#[serde(default = "default_confidence")]
pub confidence: Confidence,
/// Per-field scores that went into `score`.
pub breakdown: MatchBreakdown,
}
/// Serde fallback for `MatchResult::confidence` when the field is missing
/// from the input: the lowest band.
fn default_confidence() -> Confidence {
Confidence::Low
}
/// Per-field similarity scores behind a [`MatchResult`].
///
/// A `None` entry means the field could not be compared; such fields are left
/// out of the weighted average entirely rather than counted as zero.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MatchBreakdown {
/// Best similarity over all pairs of names (primary and alternate);
/// `None` when either side has no non-blank name.
pub name_score: Option<f64>,
/// `1.0` when any pair of names shares a non-empty phonetic code, else `0.0`;
/// `None` when phonetic matching is disabled or either side has no
/// non-blank name.
pub name_phonetic_score: Option<f64>,
/// Similarity of the normalized descriptions; `None` when either is missing.
pub description_score: Option<f64>,
/// Similarity of the normalized disambiguating descriptions; `None` when
/// either is missing.
pub disambiguating_description_score: Option<f64>,
/// `1.0` when the two sides share an identifier, else `0.0`; `None` when
/// either side has no identifiers.
pub identifiers_score: Option<f64>,
/// Exact-match score of the normalized `url` values; `None` when either is
/// missing.
pub url_score: Option<f64>,
/// Jaccard similarity of the normalized `same_as` URL sets; `None` only
/// when both sides are empty.
pub same_as_score: Option<f64>,
/// Exact-match score of the normalized `image` URLs; `None` when either is
/// missing.
pub image_score: Option<f64>,
/// Exact-match score of the normalized `main_entity_of_page` URLs; `None`
/// when either is missing.
pub main_entity_of_page_score: Option<f64>,
/// Jaccard similarity of the normalized `additional_types` URL sets; `None`
/// only when both sides are empty.
pub additional_types_score: Option<f64>,
}
/// Compares [`Thing`] values field by field and combines the per-field scores
/// into a single weighted match score, as directed by a [`MatchConfig`].
///
/// The engine holds nothing but its configuration, so it is cheap to clone
/// and can be printed with `{:?}` for diagnostics.
#[derive(Debug, Clone)]
pub struct MatchingEngine {
    /// Threshold, weights and mode switches used for every comparison.
    config: MatchConfig,
}
impl MatchingEngine {
    /// The phonetic name score only joins the weighted average when it is
    /// strictly greater than this value.
    const PHONETIC_BONUS_MIN_SCORE: f64 = 0.9;

    /// Weight given to the phonetic name score when it qualifies.
    const PHONETIC_BONUS_WEIGHT: f64 = 0.05;

    /// Creates an engine that uses `config` for every comparison.
    pub fn new(config: MatchConfig) -> Self {
        Self { config }
    }

    /// Creates an engine with [`MatchConfig::default`].
    pub fn default_config() -> Self {
        Self::new(MatchConfig::default())
    }

    /// Compares two things and returns the weighted score, the match decision,
    /// the confidence band and the per-field breakdown.
    ///
    /// A pair is a match when its score reaches `match_threshold`. In strict
    /// mode the pair must additionally pass [`Self::deterministic_match`].
    pub fn match_things(&self, thing1: &Thing, thing2: &Thing) -> MatchResult {
        let breakdown = self.calculate_breakdown(thing1, thing2);
        let score = self.calculate_weighted_score(&breakdown);

        let above_threshold = score >= self.config.match_threshold;
        // The deterministic check only runs when it can change the outcome:
        // the score already cleared the threshold and strict mode is enabled.
        let is_match = above_threshold
            && (!self.config.strict_mode || self.deterministic_match(thing1, thing2));

        MatchResult {
            score,
            is_match,
            confidence: Confidence::from_score(score),
            breakdown,
        }
    }

    /// Compares `query` against every candidate, returning one result per
    /// candidate in the same order as `candidates`.
    pub fn match_one_to_many(&self, query: &Thing, candidates: &[Thing]) -> Vec<MatchResult> {
        candidates
            .iter()
            .map(|candidate| self.match_things(query, candidate))
            .collect()
    }

    /// Compares `query` against every candidate and returns
    /// `(candidate_index, result)` pairs sorted by score, best first.
    ///
    /// Equal scores keep ascending candidate-index order. NaN scores are
    /// placed after every real score (and ordered by index among themselves),
    /// so the comparator is a total order and the ranking is deterministic.
    pub fn rank_one_to_many(
        &self,
        query: &Thing,
        candidates: &[Thing],
    ) -> Vec<(usize, MatchResult)> {
        let mut ranked: Vec<(usize, MatchResult)> = self
            .match_one_to_many(query, candidates)
            .into_iter()
            .enumerate()
            .collect();

        ranked.sort_by(|(idx_a, res_a), (idx_b, res_b)| {
            res_b
                .score
                .partial_cmp(&res_a.score)
                // `partial_cmp` is `None` only when a NaN is involved. Treating
                // that as `Equal` would break transitivity once the index
                // tie-break is applied, so NaN is ranked last explicitly.
                .unwrap_or_else(|| res_a.score.is_nan().cmp(&res_b.score.is_nan()))
                .then_with(|| idx_a.cmp(idx_b))
        });

        ranked
    }

    /// Returns `true` when the two things share a hard identity signal: a
    /// common identifier, a common normalized `same_as` URL, or the same
    /// normalized `url`. The checks run in that order and stop at the first
    /// hit.
    pub fn deterministic_match(&self, thing1: &Thing, thing2: &Thing) -> bool {
        shares_identifier(thing1, thing2)
            || shares_same_as(thing1, thing2)
            || same_canonical_url(thing1, thing2)
    }

    /// Scores every comparable field of the two things.
    ///
    /// The phonetic name score is only computed when
    /// `use_phonetic_matching` is enabled; otherwise it is `None`.
    fn calculate_breakdown(&self, thing1: &Thing, thing2: &Thing) -> MatchBreakdown {
        MatchBreakdown {
            name_score: self.score_name(thing1, thing2),
            name_phonetic_score: if self.config.use_phonetic_matching {
                self.score_phonetic_names(thing1, thing2)
            } else {
                None
            },
            description_score: score_text(&thing1.description, &thing2.description),
            disambiguating_description_score: score_text(
                &thing1.disambiguating_description,
                &thing2.disambiguating_description,
            ),
            identifiers_score: score_identifiers(thing1, thing2),
            url_score: score_url(&thing1.url, &thing2.url),
            same_as_score: score_url_set(&thing1.same_as, &thing2.same_as),
            image_score: score_url(&thing1.image, &thing2.image),
            main_entity_of_page_score: score_url(
                &thing1.main_entity_of_page,
                &thing2.main_entity_of_page,
            ),
            additional_types_score: score_url_set(
                &thing1.additional_types,
                &thing2.additional_types,
            ),
        }
    }

    /// Collapses a breakdown into one score: the weighted average of the
    /// fields that were actually scored.
    ///
    /// Fields whose score is `None` contribute neither to the numerator nor to
    /// the denominator. The phonetic score takes part, with a fixed weight,
    /// only when it exceeds [`Self::PHONETIC_BONUS_MIN_SCORE`]. Returns `0.0`
    /// when the accumulated weight is not positive.
    fn calculate_weighted_score(&self, breakdown: &MatchBreakdown) -> f64 {
        let cfg = &self.config;
        let phonetic_bonus = breakdown
            .name_phonetic_score
            .filter(|&s| s > Self::PHONETIC_BONUS_MIN_SCORE);

        // Kept in a fixed order so the floating-point accumulation is stable.
        let contributions = [
            (breakdown.name_score, cfg.name_weight),
            (breakdown.description_score, cfg.description_weight),
            (
                breakdown.disambiguating_description_score,
                cfg.disambiguating_description_weight,
            ),
            (breakdown.identifiers_score, cfg.identifiers_weight),
            (breakdown.url_score, cfg.url_weight),
            (breakdown.same_as_score, cfg.same_as_weight),
            (breakdown.image_score, cfg.image_weight),
            (
                breakdown.main_entity_of_page_score,
                cfg.main_entity_of_page_weight,
            ),
            (
                breakdown.additional_types_score,
                cfg.additional_types_weight,
            ),
            (phonetic_bonus, Self::PHONETIC_BONUS_WEIGHT),
        ];

        let mut weighted_sum = 0.0;
        let mut total_weight = 0.0;
        for (score, weight) in contributions {
            if let Some(s) = score {
                weighted_sum += s * weight;
                total_weight += weight;
            }
        }

        if total_weight > 0.0 {
            weighted_sum / total_weight
        } else {
            0.0
        }
    }

    /// Returns the best similarity over every pairing of a name from `thing1`
    /// with a name from `thing2` (primary and alternate names alike), or
    /// `None` when either side has no non-blank name.
    ///
    /// A pair score only replaces the running best when it compares strictly
    /// greater, so NaN pair scores never win.
    fn score_name(&self, thing1: &Thing, thing2: &Thing) -> Option<f64> {
        let names1 = collect_names(thing1);
        let names2 = collect_names(thing2);
        if names1.is_empty() || names2.is_empty() {
            return None;
        }

        let best = names1
            .iter()
            .flat_map(|n1| names2.iter().map(move |n2| self.score_name_pair(n1, n2)))
            .fold(f64::NEG_INFINITY, |best, s| if s > best { s } else { best });
        Some(best)
    }

    /// Normalizes both names and scores them with the configured
    /// `name_algorithm`.
    fn score_name_pair(&self, name1: &str, name2: &str) -> f64 {
        let norm1 = Normalizer::normalize_name(name1);
        let norm2 = Normalizer::normalize_name(name2);
        match self.config.name_algorithm {
            SimilarityAlgorithm::JaroWinkler => Scorer::jaro_winkler_similarity(&norm1, &norm2),
            SimilarityAlgorithm::Levenshtein => Scorer::levenshtein_similarity(&norm1, &norm2),
            SimilarityAlgorithm::Exact => Scorer::exact_match(&norm1, &norm2),
            SimilarityAlgorithm::Combined => Scorer::combined_similarity(&norm1, &norm2),
        }
    }

    /// Returns `Some(1.0)` when any name of `thing1` has the same non-empty
    /// phonetic code as any name of `thing2`, `Some(0.0)` when none do, and
    /// `None` when either side has no non-blank name.
    fn score_phonetic_names(&self, thing1: &Thing, thing2: &Thing) -> Option<f64> {
        let names1 = collect_names(thing1);
        let names2 = collect_names(thing2);
        if names1.is_empty() || names2.is_empty() {
            return None;
        }

        let codes1: Vec<String> = names1
            .iter()
            .map(|n| Normalizer::phonetic_code(n))
            .collect();
        let codes2: Vec<String> = names2
            .iter()
            .map(|n| Normalizer::phonetic_code(n))
            .collect();

        // An empty code carries no phonetic information, so it never matches.
        let any_shared_code = codes1
            .iter()
            .filter(|code| !code.is_empty())
            .any(|code| codes2.contains(code));
        Some(if any_shared_code { 1.0 } else { 0.0 })
    }
}
/// Gathers the primary name followed by the alternate names of `thing`,
/// dropping any entry that is empty or whitespace-only.
fn collect_names(thing: &Thing) -> Vec<&String> {
    let mut names = Vec::new();
    for candidate in thing.name.iter().chain(thing.alternate_names.iter()) {
        if candidate.trim().is_empty() {
            continue;
        }
        names.push(candidate);
    }
    names
}
/// Scores two optional free-text fields: both are normalized and compared
/// with the combined similarity. Returns `None` when either one is absent.
fn score_text(a: &Option<String>, b: &Option<String>) -> Option<f64> {
    a.as_ref().zip(b.as_ref()).map(|(left, right)| {
        let left = Normalizer::normalize_text(left);
        let right = Normalizer::normalize_text(right);
        Scorer::combined_similarity(&left, &right)
    })
}
/// Scores two optional URLs by exact match after URL normalization.
/// Returns `None` when either one is absent.
fn score_url(a: &Option<String>, b: &Option<String>) -> Option<f64> {
    let (Some(left), Some(right)) = (a, b) else {
        return None;
    };
    let left = Normalizer::normalize_url(left);
    let right = Normalizer::normalize_url(right);
    Some(Scorer::exact_match(&left, &right))
}
/// Scores two URL lists by the Jaccard similarity of their normalized
/// entries. Returns `None` only when both lists are empty; a list compared
/// against an empty one is still scored.
fn score_url_set(a: &[String], b: &[String]) -> Option<f64> {
    let nothing_to_compare = a.is_empty() && b.is_empty();
    if nothing_to_compare {
        None
    } else {
        let normalize_all = |urls: &[String]| -> Vec<String> {
            urls.iter().map(|u| Normalizer::normalize_url(u)).collect()
        };
        let left = normalize_all(a);
        let right = normalize_all(b);
        Some(Scorer::jaccard_set_similarity(&left, &right))
    }
}
/// Scores identifier overlap: `1.0` when the two things share at least one
/// identifier, `0.0` when they share none. Returns `None` when either side
/// has no identifiers at all.
fn score_identifiers(thing1: &Thing, thing2: &Thing) -> Option<f64> {
    let comparable = !(thing1.identifiers.is_empty() || thing2.identifiers.is_empty());
    if !comparable {
        return None;
    }
    let score = if shares_identifier(thing1, thing2) {
        1.0
    } else {
        0.0
    };
    Some(score)
}
/// Returns `true` when some identifier of `thing1` equals some identifier of
/// `thing2`. An empty identifier list on either side yields `false`.
fn shares_identifier(thing1: &Thing, thing2: &Thing) -> bool {
    // `any` over an empty collection is `false`, which covers the case where
    // either side has no identifiers.
    thing1
        .identifiers
        .iter()
        .any(|id1| thing2.identifiers.iter().any(|id2| id1 == id2))
}
/// Returns `true` when the two things have at least one `same_as` URL in
/// common after URL normalization. An empty list on either side yields
/// `false`.
fn shares_same_as(thing1: &Thing, thing2: &Thing) -> bool {
    let (left, right) = (&thing1.same_as, &thing2.same_as);
    if left.is_empty() || right.is_empty() {
        return false;
    }

    let normalized_left: std::collections::BTreeSet<String> = left
        .iter()
        .map(|url| Normalizer::normalize_url(url))
        .collect();

    right
        .iter()
        .any(|url| normalized_left.contains(&Normalizer::normalize_url(url)))
}
/// Returns `true` when both things have a `url` and the two URLs are equal
/// after normalization. A missing URL on either side yields `false`.
fn same_canonical_url(thing1: &Thing, thing2: &Thing) -> bool {
    match (thing1.url.as_ref(), thing2.url.as_ref()) {
        (Some(u1), Some(u2)) => Normalizer::normalize_url(u1) == Normalizer::normalize_url(u2),
        _ => false,
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for the match configuration, the per-field scorers, the
    //! deterministic identity check and one-to-many ranking.

    use super::*;
    use crate::models::Identifier;

    // ---- configuration -------------------------------------------------

    /// The default configuration uses a 0.80 threshold and is not strict.
    #[test]
    fn config_default_values() {
        let c = MatchConfig::default();
        assert!((c.match_threshold - 0.80).abs() < 1e-9);
        assert!(!c.strict_mode);
    }

    /// The strict preset raises the threshold and enables strict mode.
    #[test]
    fn config_strict_raises_threshold_and_sets_flag() {
        let c = MatchConfig::strict();
        assert!((c.match_threshold - 0.95).abs() < 1e-9);
        assert!(c.strict_mode);
    }

    /// The lenient preset lowers the threshold and enables phonetic matching.
    #[test]
    fn config_lenient_lowers_threshold() {
        let c = MatchConfig::lenient();
        assert!((c.match_threshold - 0.65).abs() < 1e-9);
        assert!(c.use_phonetic_matching);
    }

    /// Serializing and deserializing the default configuration preserves it.
    #[test]
    fn config_default_round_trips_through_json() {
        let cfg = MatchConfig::default();
        let json = serde_json::to_string(&cfg).expect("serialise");
        let back: MatchConfig = serde_json::from_str(&json).expect("deserialise");
        assert!((cfg.match_threshold - back.match_threshold).abs() < 1e-12);
        assert!((cfg.name_weight - back.name_weight).abs() < 1e-12);
        assert!((cfg.identifiers_weight - back.identifiers_weight).abs() < 1e-12);
        assert!(matches!(back.name_algorithm, SimilarityAlgorithm::Combined));
        assert_eq!(cfg.strict_mode, back.strict_mode);
    }

    /// Fields present in the JSON override the defaults; absent fields fall
    /// back to them.
    #[test]
    fn config_partial_json_fills_missing_fields_from_default() {
        // Both supplied values differ from the defaults, so the assertions
        // below can only pass if the values were really read from the JSON.
        let partial = r#"{"match_threshold": 0.70, "name_weight": 0.5}"#;
        let cfg: MatchConfig = serde_json::from_str(partial).expect("partial json");
        let defaults = MatchConfig::default();

        // Supplied fields.
        assert!((cfg.match_threshold - 0.70).abs() < 1e-12);
        assert!((cfg.name_weight - 0.5).abs() < 1e-12);

        // Omitted fields.
        assert!((cfg.identifiers_weight - defaults.identifiers_weight).abs() < 1e-12);
        assert!((cfg.same_as_weight - defaults.same_as_weight).abs() < 1e-12);
        assert_eq!(cfg.use_phonetic_matching, defaults.use_phonetic_matching);
        assert_eq!(cfg.strict_mode, defaults.strict_mode);
        assert!(matches!(cfg.name_algorithm, SimilarityAlgorithm::Combined));
    }

    // ---- overall matching ----------------------------------------------

    /// A thing compared with its own clone is a near-perfect match.
    #[test]
    fn exact_clone_is_a_match() {
        let t = Thing::builder()
            .name("Eiffel Tower")
            .url("https://www.toureiffel.paris/")
            .build();
        let result = MatchingEngine::default_config().match_things(&t, &t.clone());
        assert!(result.is_match);
        assert!(result.score > 0.95);
    }

    /// The name score is the best pairing across primary and alternate names.
    #[test]
    fn name_match_takes_best_of_cartesian_product() {
        let t1 = Thing::builder().name("Eiffel Tower").build();
        let t2 = Thing::builder()
            .name("La Tour Eiffel")
            .add_alternate_name("Eiffel Tower")
            .build();
        let r = MatchingEngine::default_config().match_things(&t1, &t2);
        let s = r.breakdown.name_score.expect("scored");
        assert!(
            s > 0.99,
            "best-of cartesian product should pick exact match: {s}"
        );
    }

    /// Two things with unrelated names neither match nor score highly.
    #[test]
    fn unrelated_things_do_not_match() {
        let a = Thing::builder().name("Eiffel Tower").build();
        let b = Thing::builder().name("Sydney Opera House").build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert!(!r.is_match);
        assert!(r.score < 0.5);
    }

    /// When the two things populate disjoint fields the score is zero.
    #[test]
    fn no_overlapping_fields_returns_zero_score() {
        let a = Thing::builder().description("foo").build();
        let b = Thing::builder()
            .add_same_as("https://example.org/x")
            .build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert_eq!(r.score, 0.0);
    }

    // ---- description ---------------------------------------------------

    /// Identical descriptions score (almost exactly) one.
    #[test]
    fn description_identical_scores_one() {
        let t1 = Thing::builder()
            .name("X")
            .description("Iron tower in Paris.")
            .build();
        let t2 = Thing::builder()
            .name("X")
            .description("Iron tower in Paris.")
            .build();
        let r = MatchingEngine::default_config().match_things(&t1, &t2);
        assert!(r.breakdown.description_score.unwrap() > 0.99);
    }

    /// A description missing on one side leaves the field unscored.
    #[test]
    fn description_score_none_when_either_missing() {
        let t1 = Thing::builder()
            .name("X")
            .description("Iron tower in Paris.")
            .build();
        let t2 = Thing::builder().name("X").build();
        let r = MatchingEngine::default_config().match_things(&t1, &t2);
        assert!(r.breakdown.description_score.is_none());
    }

    // ---- identifiers ---------------------------------------------------

    /// A shared identifier scores one.
    #[test]
    fn identifiers_shared_scores_one() {
        let id = Identifier::new("wikidata", "Q243").unwrap();
        let a = Thing::builder()
            .name("X")
            .add_identifier(id.clone())
            .build();
        let b = Thing::builder().name("X").add_identifier(id).build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert_eq!(r.breakdown.identifiers_score, Some(1.0));
    }

    /// The same value under different identifier properties is not a match.
    #[test]
    fn identifiers_property_scoped_no_cross_match() {
        let a = Thing::builder()
            .name("X")
            .add_identifier(Identifier::new("google", "X").unwrap())
            .build();
        let b = Thing::builder()
            .name("X")
            .add_identifier(Identifier::new("wikidata", "X").unwrap())
            .build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert_eq!(r.breakdown.identifiers_score, Some(0.0));
    }

    /// Identifiers missing on one side leave the field unscored.
    #[test]
    fn identifiers_none_when_either_side_empty() {
        let a = Thing::builder().name("X").build();
        let b = Thing::builder()
            .name("X")
            .add_identifier(Identifier::new("wikidata", "Q1").unwrap())
            .build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert!(r.breakdown.identifiers_score.is_none());
    }

    // ---- url -----------------------------------------------------------

    /// URLs that differ only in case and trailing slash score one.
    #[test]
    fn url_normalised_equality_scores_one() {
        let a = Thing::builder()
            .name("X")
            .url("HTTPS://Example.ORG/")
            .build();
        let b = Thing::builder()
            .name("X")
            .url("https://example.org")
            .build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert_eq!(r.breakdown.url_score, Some(1.0));
    }

    /// Different URLs score zero.
    #[test]
    fn url_mismatch_scores_zero() {
        let a = Thing::builder().name("X").url("https://a.org").build();
        let b = Thing::builder().name("X").url("https://b.org").build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert_eq!(r.breakdown.url_score, Some(0.0));
    }

    /// A URL missing on one side leaves the field unscored.
    #[test]
    fn url_none_when_either_side_missing() {
        let a = Thing::builder().name("X").url("https://a.org").build();
        let b = Thing::builder().name("X").build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert!(r.breakdown.url_score.is_none());
    }

    // ---- same_as / additional types ------------------------------------

    /// One shared URL out of three distinct ones gives a Jaccard score of 1/3.
    #[test]
    fn same_as_jaccard_partial_overlap() {
        let a = Thing::builder()
            .name("X")
            .add_same_as("https://example.org/a")
            .add_same_as("https://example.org/b")
            .build();
        let b = Thing::builder()
            .name("X")
            .add_same_as("https://example.org/b")
            .add_same_as("https://example.org/c")
            .build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        let s = r.breakdown.same_as_score.expect("scored");
        assert!((s - 1.0_f64 / 3.0).abs() < 1e-9, "got {s}");
    }

    /// With no `same_as` URLs on either side the field is unscored.
    #[test]
    fn same_as_none_when_both_empty() {
        let a = Thing::builder().name("X").build();
        let b = Thing::builder().name("X").build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert!(r.breakdown.same_as_score.is_none());
    }

    /// Identical additional-type sets score one.
    #[test]
    fn additional_types_jaccard_full_overlap() {
        let a = Thing::builder()
            .name("X")
            .add_additional_type("https://schema.org/Landmark")
            .build();
        let b = Thing::builder()
            .name("X")
            .add_additional_type("https://schema.org/Landmark")
            .build();
        let r = MatchingEngine::default_config().match_things(&a, &b);
        assert_eq!(r.breakdown.additional_types_score, Some(1.0));
    }

    // ---- deterministic identity ----------------------------------------

    /// A shared identifier is a deterministic match even if the names differ.
    #[test]
    fn deterministic_via_shared_identifier() {
        let id = Identifier::new("wikidata", "Q243").unwrap();
        let a = Thing::builder()
            .name("Eiffel Tower")
            .add_identifier(id.clone())
            .build();
        let b = Thing::builder()
            .name("Wholly Different")
            .add_identifier(id)
            .build();
        assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
    }

    /// A shared `same_as` URL is a deterministic match.
    #[test]
    fn deterministic_via_shared_same_as() {
        let a = Thing::builder()
            .name("Eiffel Tower")
            .add_same_as("https://www.wikidata.org/wiki/Q243")
            .build();
        let b = Thing::builder()
            .name("Tour Eiffel")
            .add_same_as("https://www.wikidata.org/wiki/Q243")
            .build();
        assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
    }

    /// The same URL (after normalization) is a deterministic match.
    #[test]
    fn deterministic_via_shared_url() {
        let a = Thing::builder()
            .name("X")
            .url("https://example.org/")
            .build();
        let b = Thing::builder()
            .name("Y")
            .url("https://example.org")
            .build();
        assert!(MatchingEngine::default_config().deterministic_match(&a, &b));
    }

    /// Equal names alone are not a deterministic identity signal.
    #[test]
    fn deterministic_rejects_when_no_shared_identity_signal() {
        let a = Thing::builder().name("X").build();
        let b = Thing::builder().name("X").build();
        assert!(!MatchingEngine::default_config().deterministic_match(&a, &b));
    }

    /// In strict mode a score above the threshold is not enough on its own.
    #[test]
    fn strict_mode_requires_deterministic_for_is_match() {
        let cfg = MatchConfig {
            match_threshold: 0.40,
            strict_mode: true,
            ..MatchConfig::default()
        };
        let t1 = Thing::builder().name("Cafe Centrale").build();
        let t2 = Thing::builder().name("Cafe Central").build();
        let engine = MatchingEngine::new(cfg);
        let r = engine.match_things(&t1, &t2);
        assert!(r.score >= 0.40, "should clear threshold");
        assert!(!engine.deterministic_match(&t1, &t2));
        assert!(!r.is_match);
    }

    // ---- one-to-many ---------------------------------------------------

    /// No candidates means no results.
    #[test]
    fn match_one_to_many_empty_candidates_yields_empty_vec() {
        let engine = MatchingEngine::default_config();
        let q = Thing::builder().name("Solo").build();
        assert!(engine.match_one_to_many(&q, &[]).is_empty());
    }

    /// Ranking returns every candidate, best score first, with equal scores
    /// kept in ascending candidate-index order.
    #[test]
    fn rank_one_to_many_sorts_by_score_descending() {
        let engine = MatchingEngine::default_config();
        let q = Thing::builder().name("Eiffel Tower").build();
        let candidates = vec![
            Thing::builder().name("Big Ben").build(),
            q.clone(),
            Thing::builder().name("Statue of Liberty").build(),
        ];
        let ranked = engine.rank_one_to_many(&q, &candidates);
        assert_eq!(ranked.len(), candidates.len());
        assert_eq!(ranked[0].0, 1);
        for w in ranked.windows(2) {
            assert!(w[0].1.score >= w[1].1.score);
            if w[0].1.score == w[1].1.score {
                assert!(w[0].0 < w[1].0, "ties must keep candidate order");
            }
        }
    }

    // ---- confidence ----------------------------------------------------

    /// The lower bound of each confidence band belongs to that band.
    #[test]
    fn confidence_band_boundaries_are_inclusive_on_the_low_side() {
        assert_eq!(Confidence::from_score(0.90), Confidence::High);
        assert_eq!(Confidence::from_score(0.89), Confidence::Medium);
        assert_eq!(Confidence::from_score(0.75), Confidence::Medium);
        assert_eq!(Confidence::from_score(0.74), Confidence::Low);
    }

    // ---- phonetic ------------------------------------------------------

    /// With phonetic matching off the phonetic score is not computed.
    #[test]
    fn phonetic_score_none_when_off() {
        let t = Thing::builder().name("Stephen").build();
        let q = Thing::builder().name("Steven").build();
        let r = MatchingEngine::new(MatchConfig {
            use_phonetic_matching: false,
            ..MatchConfig::default()
        })
        .match_things(&t, &q);
        assert!(r.breakdown.name_phonetic_score.is_none());
    }

    /// With phonetic matching on the phonetic score is computed.
    #[test]
    fn phonetic_score_some_when_on() {
        let t = Thing::builder().name("Stephen").build();
        let q = Thing::builder().name("Steven").build();
        let r = MatchingEngine::new(MatchConfig {
            use_phonetic_matching: true,
            ..MatchConfig::default()
        })
        .match_things(&t, &q);
        assert!(r.breakdown.name_phonetic_score.is_some());
    }
}