zer_compare/similarity/
mod.rs1use zer_core::{record::FieldValue, schema::FieldKind};
2
3pub mod address;
4pub mod date;
5pub mod id;
6pub mod name;
7pub mod numeric;
8
9pub trait SimilarityFn: Send + Sync {
12 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32;
13 fn field_kind(&self) -> FieldKind;
14
15 #[inline]
20 fn similarity_str(&self, a: &str, b: &str) -> f32 {
21 let va = FieldValue::Text(a.to_owned());
22 let vb = FieldValue::Text(b.to_owned());
23 self.similarity(&va, &vb)
24 }
25}
26
27pub struct NullSimilarity;
29
30impl SimilarityFn for NullSimilarity {
31 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
32 match (a, b) {
33 (FieldValue::Null, _) | (_, FieldValue::Null) => 0.0,
34 _ => 1.0,
35 }
36 }
37 fn similarity_str(&self, _a: &str, _b: &str) -> f32 { 1.0 }
40 fn field_kind(&self) -> FieldKind { FieldKind::Name }
41}
42
#[cfg(test)]
mod null_tests {
    use super::*;

    /// Any pair containing at least one `Null` must score exactly `0.0`.
    #[test]
    fn null_similarity_either_null() {
        // Left-null, right-null, and both-null combinations.
        let null_pairs = [
            (FieldValue::Null, FieldValue::Text("x".into())),
            (FieldValue::Text("x".into()), FieldValue::Null),
            (FieldValue::Null, FieldValue::Null),
        ];

        for (lhs, rhs) in &null_pairs {
            assert_eq!(NullSimilarity.similarity(lhs, rhs), 0.0);
        }
    }

    /// Two non-null values score `1.0` even when their contents differ.
    #[test]
    fn null_similarity_both_present_returns_one() {
        let score = NullSimilarity.similarity(
            &FieldValue::Text("Alice".into()),
            &FieldValue::Text("Bob".into()),
        );
        assert_eq!(score, 1.0, "non-null values pass through as 1.0");
    }
}
63
64pub fn default_fns_for(kind: FieldKind) -> Vec<Box<dyn SimilarityFn>> {
71 use address::AddressTokenOverlap;
72 use date::DateSimilarity;
73 use id::{ExactIdSimilarity, HammingSimilarity};
74 use name::{AliasTokenOverlapSimilarity, JaroWinklerSimilarity, TokenOverlapSimilarity};
75 use numeric::NumericBucketedSimilarity;
76
77 match kind {
78 FieldKind::Name => vec![
79 Box::new(JaroWinklerSimilarity),
80 Box::new(TokenOverlapSimilarity),
81 ],
82 FieldKind::Date | FieldKind::Timestamp => vec![
83 Box::new(DateSimilarity),
84 ],
85 FieldKind::Address => vec![
86 Box::new(AddressTokenOverlap),
87 ],
88 FieldKind::Id => vec![
89 Box::new(ExactIdSimilarity),
90 Box::new(HammingSimilarity { max_distance: 1 }),
91 ],
92 FieldKind::Phone => vec![
93 Box::new(ExactIdSimilarity),
94 Box::new(HammingSimilarity { max_distance: 1 }),
95 ],
96 FieldKind::LicensePlate => vec![
97 Box::new(ExactIdSimilarity),
98 Box::new(HammingSimilarity { max_distance: 1 }),
99 ],
100 FieldKind::Numeric | FieldKind::GpsCoordinate => vec![
101 Box::new(NumericBucketedSimilarity),
102 ],
103 FieldKind::Categorical => vec![
104 Box::new(ExactIdSimilarity),
105 ],
106 FieldKind::FreeText => vec![
107 Box::new(TokenOverlapSimilarity),
108 ],
109 FieldKind::Alias => vec![
110 Box::new(AliasTokenOverlapSimilarity),
111 ],
112 }
113}