Skip to main content

zer_compare/similarity/
id.rs

1use zer_core::{record::FieldValue, schema::FieldKind};
2
3use crate::similarity::SimilarityFn;
4
5// ── ExactIdSimilarity ─────────────────────────────────────────────────────────
6
7/// Returns 1.0 if the string representations are identical, 0.0 otherwise.
8/// Works on Text, Int, Float, and Bool field values by comparing their
9/// string/numeric representations.
10pub struct ExactIdSimilarity;
11
12impl SimilarityFn for ExactIdSimilarity {
13    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
14        match (a, b) {
15            (FieldValue::Text(a),  FieldValue::Text(b))  => if a == b { 1.0 } else { 0.0 },
16            (FieldValue::Int(a),   FieldValue::Int(b))   => if a == b { 1.0 } else { 0.0 },
17            (FieldValue::Float(a), FieldValue::Float(b)) => if (a - b).abs() < f64::EPSILON { 1.0 } else { 0.0 },
18            (FieldValue::Bool(a),  FieldValue::Bool(b))  => if a == b { 1.0 } else { 0.0 },
19            _ => 0.0,
20        }
21    }
22    fn similarity_str(&self, a: &str, b: &str) -> f32 { if a == b { 1.0 } else { 0.0 } }
23    fn field_kind(&self) -> FieldKind { FieldKind::Id }
24}
25
26// ── HammingSimilarity ─────────────────────────────────────────────────────────
27
28/// Hamming-distance similarity for equal-length strings.
29///
30/// Only applies when both values have the same character length. Returns:
31///   distance 0              : 1.0
32///   distance <= max_distance: 0.8
33///   otherwise               : 0.0
34pub struct HammingSimilarity {
35    pub max_distance: usize,
36}
37
38impl SimilarityFn for HammingSimilarity {
39    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
40        match (a, b) {
41            (FieldValue::Text(a), FieldValue::Text(b)) if a.len() == b.len() => {
42                match strsim::hamming(a, b) {
43                    Ok(0)    => 1.0,
44                    Ok(dist) if dist <= self.max_distance => 0.8,
45                    _ => 0.0,
46                }
47            }
48            _ => 0.0,
49        }
50    }
51    fn similarity_str(&self, a: &str, b: &str) -> f32 {
52        if a.len() != b.len() { return 0.0; }
53        match strsim::hamming(a, b) {
54            Ok(0)    => 1.0,
55            Ok(dist) if dist <= self.max_distance => 0.8,
56            _ => 0.0,
57        }
58    }
59    fn field_kind(&self) -> FieldKind { FieldKind::Id }
60}
61
62// ── SuffixMatchSimilarity ─────────────────────────────────────────────────────
63
64/// Returns 1.0 when the last `n` characters of both values are identical.
65///
66/// Useful for partial ID matching (e.g. BSN last 4 digits, phone suffix).
67/// Returns 0.0 if either string is shorter than `n`.
68pub struct SuffixMatchSimilarity {
69    pub n: usize,
70}
71
72impl SimilarityFn for SuffixMatchSimilarity {
73    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
74        match (a, b) {
75            (FieldValue::Text(a), FieldValue::Text(b)) => {
76                if a.len() < self.n || b.len() < self.n { return 0.0; }
77                let sa = &a[a.len() - self.n..];
78                let sb = &b[b.len() - self.n..];
79                if sa == sb { 1.0 } else { 0.0 }
80            }
81            _ => 0.0,
82        }
83    }
84    fn similarity_str(&self, a: &str, b: &str) -> f32 {
85        if a.len() < self.n || b.len() < self.n { return 0.0; }
86        if &a[a.len() - self.n..] == &b[b.len() - self.n..] { 1.0 } else { 0.0 }
87    }
88    fn field_kind(&self) -> FieldKind { FieldKind::Id }
89}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// Reference identifier shared by the exact-match and Hamming cases.
    const BASE_ID: &str = "IR7406812";

    /// Wraps a string slice in a `FieldValue::Text`.
    fn text(raw: &str) -> FieldValue {
        FieldValue::Text(raw.into())
    }

    /// Hamming comparator that tolerates a single differing position.
    fn hamming_one() -> HammingSimilarity {
        HammingSimilarity { max_distance: 1 }
    }

    #[test]
    fn exact_id_match() {
        let score = ExactIdSimilarity.similarity(&text(BASE_ID), &text("IR7406812"));
        assert_eq!(score, 1.0);
    }

    #[test]
    fn exact_id_mismatch() {
        let score = ExactIdSimilarity.similarity(&text(BASE_ID), &text("IR7406813"));
        assert_eq!(score, 0.0);
    }

    #[test]
    fn exact_id_integer() {
        let exact = ExactIdSimilarity;
        let reference = FieldValue::Int(12345);
        assert_eq!(exact.similarity(&reference, &FieldValue::Int(12345)), 1.0);
        assert_eq!(exact.similarity(&reference, &FieldValue::Int(99999)), 0.0);
    }

    #[test]
    fn hamming_exact() {
        let score = hamming_one().similarity(&text(BASE_ID), &text("IR7406812"));
        assert_eq!(score, 1.0);
    }

    #[test]
    fn hamming_within_distance() {
        // Exactly one position differs, which is within the allowed distance.
        let score = hamming_one().similarity(&text(BASE_ID), &text("IR7406813"));
        assert_eq!(score, 0.8);
    }

    #[test]
    fn hamming_exceeds_distance() {
        // Two positions differ, which is beyond the allowed distance.
        let score = hamming_one().similarity(&text(BASE_ID), &text("IR7406899"));
        assert_eq!(score, 0.0);
    }

    #[test]
    fn hamming_different_lengths() {
        // Hamming distance is undefined for unequal lengths, so the score is 0.0.
        let score = hamming_one().similarity(&text(BASE_ID), &text("IR740681"));
        assert_eq!(score, 0.0);
    }

    #[test]
    fn suffix_match_exact_suffix() {
        let last_four = SuffixMatchSimilarity { n: 4 };
        let reference = text("123456789"); // last four: "6789"
        let cases = [
            ("987650001", 0.0), // "0001" differs from "6789"
            ("111111234", 0.0), // "1234" differs from "6789"
            ("111116789", 1.0), // "6789" matches
        ];
        for &(candidate, expected) in &cases {
            assert_eq!(last_four.similarity(&reference, &text(candidate)), expected);
        }
    }

    #[test]
    fn suffix_match_too_short() {
        // Both values are shorter than the requested suffix length.
        let last_six = SuffixMatchSimilarity { n: 6 };
        assert_eq!(last_six.similarity(&text("123"), &text("123")), 0.0);
    }
}