zer_compare/similarity/
address.rs1use zer_core::{record::FieldValue, schema::FieldKind};
2
3use crate::similarity::{name::jaccard_tokens, SimilarityFn};
4
/// Street-type abbreviations paired with the full word they expand to.
///
/// Each entry is `(abbreviation, expansion)`. `normalize_address` compares
/// every lower-cased token against the first element and substitutes the
/// second on an exact match.
///
/// NOTE(review): `normalize_address` replaces `.` with a space before the
/// lookup, so the dotted spellings below cannot match through that path;
/// they are kept so the table stays unchanged.
const DUTCH_ABBREV: &[(&str, &str)] = &[
    // straat
    ("str.", "straat"),
    ("str", "straat"),
    // laan
    ("ln.", "laan"),
    ("ln", "laan"),
    // avenue
    ("ave.", "avenue"),
    ("ave", "avenue"),
    // street
    ("st.", "street"),
    ("st", "street"),
    // boulevard
    ("blvd.", "boulevard"),
    ("blvd", "boulevard"),
    // dreef
    ("dr.", "dreef"),
    ("dr", "dreef"),
    // singel
    ("sg.", "singel"),
    ("sg", "singel"),
    // kade
    ("kade.", "kade"),
];
24
25fn normalize_address(s: &str) -> String {
26 let lower = s.to_lowercase();
27 let stripped: String = lower
29 .chars()
30 .map(|c| if c.is_ascii_alphanumeric() || c == '-' || c == ' ' { c } else { ' ' })
31 .collect();
32 let tokens: Vec<&str> = stripped.split_whitespace().collect();
34 tokens.iter().map(|t| {
35 DUTCH_ABBREV.iter()
36 .find(|(abbr, _)| *abbr == *t)
37 .map(|(_, full)| *full)
38 .unwrap_or(t)
39 }).collect::<Vec<_>>().join(" ")
40}
41
/// Returns the run of ASCII digits at the start of `s`, ignoring surrounding
/// whitespace, or `None` when the trimmed input does not begin with a digit.
fn extract_leading_number(s: &str) -> Option<&str> {
    let trimmed = s.trim();
    // ASCII digits are single bytes, so the count of leading digit bytes is
    // also a valid byte offset to slice at.
    let digit_len = trimmed
        .bytes()
        .take_while(|byte| byte.is_ascii_digit())
        .count();
    match digit_len {
        0 => None,
        len => Some(&trimmed[..len]),
    }
}
47
/// Address similarity based on token overlap.
///
/// Both inputs are passed through `normalize_address` and the normalised
/// strings are then scored with `jaccard_tokens`.
pub struct AddressTokenOverlap;
55
56impl SimilarityFn for AddressTokenOverlap {
57 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
58 match (a, b) {
59 (FieldValue::Text(a), FieldValue::Text(b)) => {
60 let na = normalize_address(a);
61 let nb = normalize_address(b);
62 jaccard_tokens(&na, &nb)
63 }
64 _ => 0.0,
65 }
66 }
67 fn similarity_str(&self, a: &str, b: &str) -> f32 {
68 let na = normalize_address(a);
69 let nb = normalize_address(b);
70 jaccard_tokens(&na, &nb)
71 }
72 fn field_kind(&self) -> FieldKind { FieldKind::Address }
73}
74
/// Address similarity based on the leading number of each input.
///
/// The leading ASCII-digit run of both inputs is taken with
/// `extract_leading_number` and compared by Levenshtein distance:
/// distance 0 scores `1.0`, distance 1 scores `0.8`, anything else — or a
/// missing leading number on either side — scores `0.0`.
pub struct StreetNumberEditDistance;
85
86impl SimilarityFn for StreetNumberEditDistance {
87 fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
88 match (a, b) {
89 (FieldValue::Text(a), FieldValue::Text(b)) => {
90 let na = extract_leading_number(a);
91 let nb = extract_leading_number(b);
92 match (na, nb) {
93 (Some(na), Some(nb)) => {
94 let dist = strsim::levenshtein(na, nb);
95 if dist == 0 { 1.0 }
96 else if dist == 1 { 0.8 }
97 else { 0.0 }
98 }
99 _ => 0.0,
100 }
101 }
102 _ => 0.0,
103 }
104 }
105 fn similarity_str(&self, a: &str, b: &str) -> f32 {
106 match (extract_leading_number(a), extract_leading_number(b)) {
107 (Some(na), Some(nb)) => {
108 let dist = strsim::levenshtein(na, nb);
109 if dist == 0 { 1.0 } else if dist == 1 { 0.8 } else { 0.0 }
110 }
111 _ => 0.0,
112 }
113 }
114 fn field_kind(&self) -> FieldKind { FieldKind::Address }
115}
116
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a `FieldValue::Text` from a string slice.
    fn tv(s: &str) -> FieldValue {
        FieldValue::Text(s.into())
    }

    #[test]
    fn address_token_overlap_exact() {
        let score = AddressTokenOverlap.similarity(&tv("Coolsingel 93"), &tv("Coolsingel 93"));
        assert_eq!(score, 1.0);
    }

    #[test]
    fn address_token_overlap_abbreviation() {
        // "Str." expands to "straat", so the two sides share at least the number token.
        let s = AddressTokenOverlap.similarity(&tv("Blaak Str. 10"), &tv("Blaakstraat 10"));
        assert!(s > 0.0, "abbreviation expansion should yield some overlap, got {s}");
    }

    #[test]
    fn address_token_overlap_different() {
        let s = AddressTokenOverlap.similarity(&tv("Coolsingel 93"), &tv("Beatrixlaan 241"));
        assert!(s < 0.3, "completely different addresses should be low, got {s}");
    }

    #[test]
    fn street_number_exact() {
        let score =
            StreetNumberEditDistance.similarity(&tv("239 bis Amsterdamseweg"), &tv("239 bis"));
        assert_eq!(score, 1.0);
    }

    #[test]
    fn street_number_off_by_one() {
        let score =
            StreetNumberEditDistance.similarity(&tv("10 Coolsingel"), &tv("11 Coolsingel"));
        assert_eq!(score, 0.8);
    }

    #[test]
    fn street_number_no_leading_digit() {
        let score = StreetNumberEditDistance.similarity(&tv("Coolsingel"), &tv("Coolsingel"));
        assert_eq!(score, 0.0);
    }

    #[test]
    fn address_null_field() {
        let score = AddressTokenOverlap.similarity(&FieldValue::Null, &tv("Coolsingel 93"));
        assert_eq!(score, 0.0);
    }
}