use std::collections::HashSet;
use rphonetic::{DoubleMetaphone, Encoder};
use unicode_normalization::UnicodeNormalization;
use zer_core::{record::FieldValue, schema::FieldKind};
use crate::similarity::SimilarityFn;
fn to_ascii_upper(s: &str) -> String {
s.nfkd()
.filter(|c| c.is_ascii())
.collect::<String>()
.to_ascii_uppercase()
}
pub struct JaroWinklerSimilarity;
impl SimilarityFn for JaroWinklerSimilarity {
fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
match (a, b) {
(FieldValue::Text(a), FieldValue::Text(b)) => {
strsim::jaro_winkler(a, b) as f32
}
_ => 0.0,
}
}
fn similarity_str(&self, a: &str, b: &str) -> f32 {
strsim::jaro_winkler(a, b) as f32
}
fn field_kind(&self) -> FieldKind { FieldKind::Name }
}
pub struct PhoneticEqualitySimilarity;
impl SimilarityFn for PhoneticEqualitySimilarity {
fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
match (a, b) {
(FieldValue::Text(a), FieldValue::Text(b)) => {
let dm = DoubleMetaphone::default();
let norm_a = to_ascii_upper(a);
let norm_b = to_ascii_upper(b);
if norm_a.is_empty() || norm_b.is_empty() { return 0.0; }
let code_a = dm.encode(&norm_a);
let code_b = dm.encode(&norm_b);
if code_a.is_empty() || code_b.is_empty() { return 0.0; }
if code_a == code_b { 1.0 } else { 0.0 }
}
_ => 0.0,
}
}
fn similarity_str(&self, a: &str, b: &str) -> f32 {
let dm = DoubleMetaphone::default();
let norm_a = to_ascii_upper(a);
let norm_b = to_ascii_upper(b);
if norm_a.is_empty() || norm_b.is_empty() { return 0.0; }
let code_a = dm.encode(&norm_a);
let code_b = dm.encode(&norm_b);
if code_a.is_empty() || code_b.is_empty() { return 0.0; }
if code_a == code_b { 1.0 } else { 0.0 }
}
fn field_kind(&self) -> FieldKind { FieldKind::Name }
}
pub struct TokenOverlapSimilarity;
impl SimilarityFn for TokenOverlapSimilarity {
fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
match (a, b) {
(FieldValue::Text(a), FieldValue::Text(b)) => jaccard_tokens(a, b),
_ => 0.0,
}
}
fn similarity_str(&self, a: &str, b: &str) -> f32 { jaccard_tokens(a, b) }
fn field_kind(&self) -> FieldKind { FieldKind::Name }
}
pub(crate) fn jaccard_tokens(a: &str, b: &str) -> f32 {
let set_a: HashSet<&str> = a.split_whitespace().collect();
let set_b: HashSet<&str> = b.split_whitespace().collect();
if set_a.is_empty() && set_b.is_empty() { return 1.0; }
let intersection = set_a.intersection(&set_b).count();
let union = set_a.len() + set_b.len() - intersection;
if union == 0 { return 0.0; }
intersection as f32 / union as f32
}
pub struct LevenshteinSimilarity {
pub max_distance: usize,
}
impl SimilarityFn for LevenshteinSimilarity {
fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
let (sa, sb) = match (a, b) {
(FieldValue::Text(a), FieldValue::Text(b)) => (a.as_str(), b.as_str()),
_ => return 0.0,
};
let dist = edit_distance::edit_distance(sa, sb);
if dist > self.max_distance {
0.0
} else {
1.0 - (dist as f32 / self.max_distance.max(1) as f32)
}
}
fn similarity_str(&self, a: &str, b: &str) -> f32 {
let dist = edit_distance::edit_distance(a, b);
if dist > self.max_distance { 0.0 }
else { 1.0 - (dist as f32 / self.max_distance.max(1) as f32) }
}
fn field_kind(&self) -> zer_core::schema::FieldKind { zer_core::schema::FieldKind::Name }
}
pub struct AliasTokenOverlapSimilarity;
impl SimilarityFn for AliasTokenOverlapSimilarity {
fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
match (a, b) {
(FieldValue::Text(a), FieldValue::Text(b)) => {
let aliases_a: Vec<&str> = a.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
let aliases_b: Vec<&str> = b.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
if aliases_a.is_empty() || aliases_b.is_empty() { return 0.0; }
aliases_a.iter()
.flat_map(|aa| aliases_b.iter().map(move |ab| jaccard_tokens(aa, ab)))
.fold(0.0_f32, f32::max)
}
_ => 0.0,
}
}
fn similarity_str(&self, a: &str, b: &str) -> f32 {
let aliases_a: Vec<&str> = a.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
let aliases_b: Vec<&str> = b.split('|').map(str::trim).filter(|s| !s.is_empty()).collect();
if aliases_a.is_empty() || aliases_b.is_empty() { return 0.0; }
aliases_a.iter()
.flat_map(|aa| aliases_b.iter().map(move |ab| jaccard_tokens(aa, ab)))
.fold(0.0_f32, f32::max)
}
fn field_kind(&self) -> FieldKind { FieldKind::Alias }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn jaro_winkler_similar_names() {
let sim = JaroWinklerSimilarity;
let a = FieldValue::Text("JOHN SMITH".into());
let b = FieldValue::Text("JON SMYTH".into());
let s = sim.similarity(&a, &b);
assert!(s > 0.8, "similar names should score > 0.8, got {s}");
}
#[test]
fn jaro_winkler_different_names() {
let sim = JaroWinklerSimilarity;
let a = FieldValue::Text("JOHN SMITH".into());
let b = FieldValue::Text("JANE DOE".into());
let s = sim.similarity(&a, &b);
assert!(s < 0.6, "very different names should score < 0.6, got {s}");
}
#[test]
fn phonetic_equality_sound_alikes() {
let sim = PhoneticEqualitySimilarity;
let a = FieldValue::Text("Smith".into());
let b = FieldValue::Text("Smyth".into());
assert_eq!(sim.similarity(&a, &b), 1.0, "Smith and Smyth should be phonetically equal");
}
#[test]
fn phonetic_equality_different() {
let sim = PhoneticEqualitySimilarity;
let a = FieldValue::Text("Jansen".into());
let b = FieldValue::Text("Bakker".into());
assert_eq!(sim.similarity(&a, &b), 0.0, "Jansen and Bakker should not be phonetically equal");
}
#[test]
fn token_overlap_swapped_name() {
let sim = TokenOverlapSimilarity;
let a = FieldValue::Text("John Smith".into());
let b = FieldValue::Text("Smith John".into());
assert_eq!(sim.similarity(&a, &b), 1.0, "token overlap should be 1.0 for swapped tokens");
}
#[test]
fn token_overlap_partial() {
let sim = TokenOverlapSimilarity;
let a = FieldValue::Text("Alice van Berg".into());
let b = FieldValue::Text("Alice Berg".into());
let s = sim.similarity(&a, &b);
assert!(s > 0.3, "partial name overlap should produce > 0.3, got {s}");
}
#[test]
fn alias_overlap_cross_product() {
let sim = AliasTokenOverlapSimilarity;
let a = FieldValue::Text("Benabdallah Fatima|F. Benabdallah".into());
let b = FieldValue::Text("Fatima Benabdallah".into());
let s = sim.similarity(&a, &b);
assert!(s > 0.5, "alias cross-product should find overlap, got {s}");
}
#[test]
fn alias_overlap_empty_field() {
let sim = AliasTokenOverlapSimilarity;
let a = FieldValue::Text("".into());
let b = FieldValue::Text("Jansen".into());
assert_eq!(sim.similarity(&a, &b), 0.0);
}
#[test]
fn similarity_null_fields_return_zero() {
let sim = JaroWinklerSimilarity;
assert_eq!(sim.similarity(&FieldValue::Null, &FieldValue::Text("test".into())), 0.0);
assert_eq!(sim.similarity(&FieldValue::Text("test".into()), &FieldValue::Null), 0.0);
}
#[test]
fn levenshtein_exact_match() {
let sim = LevenshteinSimilarity { max_distance: 3 };
let a = FieldValue::Text("Jansen".into());
let b = FieldValue::Text("Jansen".into());
assert_eq!(sim.similarity(&a, &b), 1.0, "edit distance 0 must yield 1.0");
}
#[test]
fn levenshtein_over_max() {
let sim = LevenshteinSimilarity { max_distance: 2 };
let a = FieldValue::Text("hello".into());
let b = FieldValue::Text("world".into()); assert_eq!(sim.similarity(&a, &b), 0.0, "edit distance > max_distance must yield 0.0");
}
#[test]
fn levenshtein_partial() {
let sim = LevenshteinSimilarity { max_distance: 3 };
let a = FieldValue::Text("Jansen".into());
let b = FieldValue::Text("Jansem".into()); let s = sim.similarity(&a, &b);
assert!(s > 0.0 && s < 1.0, "partial distance must yield value in (0.0, 1.0), got {s}");
}
#[test]
fn levenshtein_null_fields_return_zero() {
let sim = LevenshteinSimilarity { max_distance: 3 };
assert_eq!(sim.similarity(&FieldValue::Null, &FieldValue::Text("test".into())), 0.0);
assert_eq!(sim.similarity(&FieldValue::Text("test".into()), &FieldValue::Null), 0.0);
}
}