zer_compare/similarity/
id.rs1use zer_core::{record::FieldValue, schema::FieldKind};
2
3use crate::similarity::SimilarityFn;
4
/// Similarity function for identifier fields that only recognises exact matches.
///
/// Its [`SimilarityFn`] implementation yields `1.0` for equal values and `0.0`
/// for everything else; the type itself carries no configuration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ExactIdSimilarity;
11
12impl SimilarityFn for ExactIdSimilarity {
13 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
14 match (a, b) {
15 (FieldValue::Text(a), FieldValue::Text(b)) => if a == b { 1.0 } else { 0.0 },
16 (FieldValue::Int(a), FieldValue::Int(b)) => if a == b { 1.0 } else { 0.0 },
17 (FieldValue::Float(a), FieldValue::Float(b)) => if (a - b).abs() < f64::EPSILON { 1.0 } else { 0.0 },
18 (FieldValue::Bool(a), FieldValue::Bool(b)) => if a == b { 1.0 } else { 0.0 },
19 _ => 0.0,
20 }
21 }
22 fn similarity_str(&self, a: &str, b: &str) -> f32 { if a == b { 1.0 } else { 0.0 } }
23 fn field_kind(&self) -> FieldKind { FieldKind::Id }
24}
25
/// Similarity function for identifier fields that tolerates a bounded number
/// of substituted characters, measured by Hamming distance.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HammingSimilarity {
    /// Largest non-zero Hamming distance that is still scored as a near match.
    pub max_distance: usize,
}
37
38impl SimilarityFn for HammingSimilarity {
39 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
40 match (a, b) {
41 (FieldValue::Text(a), FieldValue::Text(b)) if a.len() == b.len() => {
42 match strsim::hamming(a, b) {
43 Ok(0) => 1.0,
44 Ok(dist) if dist <= self.max_distance => 0.8,
45 _ => 0.0,
46 }
47 }
48 _ => 0.0,
49 }
50 }
51 fn similarity_str(&self, a: &str, b: &str) -> f32 {
52 if a.len() != b.len() { return 0.0; }
53 match strsim::hamming(a, b) {
54 Ok(0) => 1.0,
55 Ok(dist) if dist <= self.max_distance => 0.8,
56 _ => 0.0,
57 }
58 }
59 fn field_kind(&self) -> FieldKind { FieldKind::Id }
60}
61
/// Similarity function for identifier fields that compares only the trailing
/// part of two values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SuffixMatchSimilarity {
    /// Length of the trailing suffix that must be identical in both values,
    /// measured in the same unit as `str::len` (bytes). Values shorter than
    /// this never match.
    pub n: usize,
}
71
72impl SimilarityFn for SuffixMatchSimilarity {
73 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
74 match (a, b) {
75 (FieldValue::Text(a), FieldValue::Text(b)) => {
76 if a.len() < self.n || b.len() < self.n { return 0.0; }
77 let sa = &a[a.len() - self.n..];
78 let sb = &b[b.len() - self.n..];
79 if sa == sb { 1.0 } else { 0.0 }
80 }
81 _ => 0.0,
82 }
83 }
84 fn similarity_str(&self, a: &str, b: &str) -> f32 {
85 if a.len() < self.n || b.len() < self.n { return 0.0; }
86 if &a[a.len() - self.n..] == &b[b.len() - self.n..] { 1.0 } else { 0.0 }
87 }
88 fn field_kind(&self) -> FieldKind { FieldKind::Id }
89}
90
#[cfg(test)]
mod tests {
    use super::*;

    /// Identifier used as the left-hand side of the exact and Hamming tests.
    const BASE_ID: &str = "IR7406812";

    /// Wraps a string slice in a `FieldValue::Text`.
    fn text(raw: &str) -> FieldValue {
        FieldValue::Text(raw.into())
    }

    /// Hamming scorer that allows a single differing position.
    fn hamming_one() -> HammingSimilarity {
        HammingSimilarity { max_distance: 1 }
    }

    #[test]
    fn exact_id_match() {
        let scorer = ExactIdSimilarity;
        let score = scorer.similarity(&text(BASE_ID), &text("IR7406812"));
        assert_eq!(score, 1.0);
    }

    #[test]
    fn exact_id_mismatch() {
        let scorer = ExactIdSimilarity;
        let score = scorer.similarity(&text(BASE_ID), &text("IR7406813"));
        assert_eq!(score, 0.0);
    }

    #[test]
    fn exact_id_integer() {
        let scorer = ExactIdSimilarity;
        let reference = FieldValue::Int(12345);
        assert_eq!(scorer.similarity(&reference, &FieldValue::Int(12345)), 1.0);
        assert_eq!(scorer.similarity(&reference, &FieldValue::Int(99999)), 0.0);
    }

    #[test]
    fn hamming_exact() {
        let score = hamming_one().similarity(&text(BASE_ID), &text("IR7406812"));
        assert_eq!(score, 1.0);
    }

    #[test]
    fn hamming_within_distance() {
        // Only the final character differs, so the distance is 1.
        let score = hamming_one().similarity(&text(BASE_ID), &text("IR7406813"));
        assert_eq!(score, 0.8);
    }

    #[test]
    fn hamming_exceeds_distance() {
        // The last two characters differ, so the distance is 2.
        let score = hamming_one().similarity(&text(BASE_ID), &text("IR7406899"));
        assert_eq!(score, 0.0);
    }

    #[test]
    fn hamming_different_lengths() {
        // The right-hand value is one character shorter.
        let score = hamming_one().similarity(&text(BASE_ID), &text("IR740681"));
        assert_eq!(score, 0.0);
    }

    #[test]
    fn suffix_match_exact_suffix() {
        let scorer = SuffixMatchSimilarity { n: 4 };
        let reference = text("123456789"); // ends in "6789"

        // (candidate, expected score)
        let cases = [
            ("987650001", 0.0), // ends in "0001"
            ("111111234", 0.0), // ends in "1234"
            ("111116789", 1.0), // ends in "6789"
        ];

        for (candidate, expected) in cases.iter() {
            assert_eq!(scorer.similarity(&reference, &text(candidate)), *expected);
        }
    }

    #[test]
    fn suffix_match_too_short() {
        // Both values are shorter than the required suffix length.
        let scorer = SuffixMatchSimilarity { n: 6 };
        let short = text("123");
        assert_eq!(scorer.similarity(&short, &text("123")), 0.0);
    }
}