Skip to main content

zer_compare/similarity/
mod.rs

1use zer_core::{record::FieldValue, schema::FieldKind};
2
3pub mod address;
4pub mod date;
5pub mod id;
6pub mod name;
7pub mod numeric;
8
9/// Returns a similarity in [0.0, 1.0].
10/// 0.0 = completely different, 1.0 = identical.
11pub trait SimilarityFn: Send + Sync {
12    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32;
13    fn field_kind(&self) -> FieldKind;
14
15    /// Zero-alloc hot path for pool-native comparison.
16    ///
17    /// Called by `compare_pool_field` to avoid wrapping `&str` in `FieldValue::Text`
18    /// on every comparison. Override in concrete types to eliminate the allocation.
19    #[inline]
20    fn similarity_str(&self, a: &str, b: &str) -> f32 {
21        let va = FieldValue::Text(a.to_owned());
22        let vb = FieldValue::Text(b.to_owned());
23        self.similarity(&va, &vb)
24    }
25}
26
27/// Returns 0.0 when either field is `FieldValue::Null`, and 1.0 otherwise.
28pub struct NullSimilarity;
29
30impl SimilarityFn for NullSimilarity {
31    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
32        match (a, b) {
33            (FieldValue::Null, _) | (_, FieldValue::Null) => 0.0,
34            _ => 1.0,
35        }
36    }
37    // compare_pool_field already guards empty strings → ComparisonLevel::None;
38    // any non-empty strings reaching here mean neither side is null.
39    fn similarity_str(&self, _a: &str, _b: &str) -> f32 { 1.0 }
40    fn field_kind(&self) -> FieldKind { FieldKind::Name }
41}
42
#[cfg(test)]
mod null_tests {
    use super::*;

    /// Shorthand for building a non-null text value.
    fn text(s: &str) -> FieldValue {
        FieldValue::Text(s.into())
    }

    /// Any pairing that includes `FieldValue::Null` must score 0.0.
    #[test]
    fn null_similarity_either_null() {
        let pairs = [
            (FieldValue::Null, text("x")),
            (text("x"), FieldValue::Null),
            (FieldValue::Null, FieldValue::Null),
        ];
        for (left, right) in &pairs {
            assert_eq!(NullSimilarity.similarity(left, right), 0.0);
        }
    }

    /// Two non-null values score 1.0 even when their contents differ.
    #[test]
    fn null_similarity_both_present_returns_one() {
        let score = NullSimilarity.similarity(&text("Alice"), &text("Bob"));
        assert_eq!(score, 1.0, "non-null values pass through as 1.0");
    }
}
63
64/// Look up the default similarity function(s) for a `FieldKind`.
65///
66/// Multiple functions per field kind allow complementary signals, e.g.
67/// Jaro-Winkler for gradual string proximity AND phonetic equality for
68/// sound-alike variants. The `FieldComparator` takes the maximum across all
69/// functions for a given field.
70pub fn default_fns_for(kind: FieldKind) -> Vec<Box<dyn SimilarityFn>> {
71    use address::AddressTokenOverlap;
72    use date::DateSimilarity;
73    use id::{ExactIdSimilarity, HammingSimilarity};
74    use name::{AliasTokenOverlapSimilarity, JaroWinklerSimilarity, TokenOverlapSimilarity};
75    use numeric::NumericBucketedSimilarity;
76
77    match kind {
78        FieldKind::Name => vec![
79            Box::new(JaroWinklerSimilarity),
80            Box::new(TokenOverlapSimilarity),
81        ],
82        FieldKind::Date | FieldKind::Timestamp => vec![
83            Box::new(DateSimilarity),
84        ],
85        FieldKind::Address => vec![
86            Box::new(AddressTokenOverlap),
87        ],
88        FieldKind::Id => vec![
89            Box::new(ExactIdSimilarity),
90            Box::new(HammingSimilarity { max_distance: 1 }),
91        ],
92        FieldKind::Phone => vec![
93            Box::new(ExactIdSimilarity),
94            Box::new(HammingSimilarity { max_distance: 1 }),
95        ],
96        FieldKind::LicensePlate => vec![
97            Box::new(ExactIdSimilarity),
98            Box::new(HammingSimilarity { max_distance: 1 }),
99        ],
100        FieldKind::Numeric | FieldKind::GpsCoordinate => vec![
101            Box::new(NumericBucketedSimilarity),
102        ],
103        FieldKind::Categorical => vec![
104            Box::new(ExactIdSimilarity),
105        ],
106        FieldKind::FreeText => vec![
107            Box::new(TokenOverlapSimilarity),
108        ],
109        FieldKind::Alias => vec![
110            Box::new(AliasTokenOverlapSimilarity),
111        ],
112    }
113}