// zer_compare/similarity/address.rs

1use zer_core::{record::FieldValue, schema::FieldKind};
2
3use crate::similarity::{name::jaccard_tokens, SimilarityFn};
4
5// ── Normalization helpers ─────────────────────────────────────────────────────
6
/// Street-type abbreviations and the full word each one expands to.
///
/// Every pair is `(abbreviation, expansion)`. Most abbreviations are listed
/// twice, with and without a trailing dot. `normalize_address` replaces `.`
/// with a space before it looks tokens up in this table, so only the
/// undotted keys can match on that path.
const DUTCH_ABBREV: &[(&str, &str)] = &[
    // straat
    ("str.", "straat"),
    ("str", "straat"),
    // laan
    ("ln.", "laan"),
    ("ln", "laan"),
    // avenue
    ("ave.", "avenue"),
    ("ave", "avenue"),
    // street
    ("st.", "street"),
    ("st", "street"),
    // boulevard
    ("blvd.", "boulevard"),
    ("blvd", "boulevard"),
    // dreef
    ("dr.", "dreef"),
    ("dr", "dreef"),
    // singel
    ("sg.", "singel"),
    ("sg", "singel"),
    // kade (dotted form only)
    ("kade.", "kade"),
];
24
25fn normalize_address(s: &str) -> String {
26    let lower = s.to_lowercase();
27    // Strip punctuation except hyphens (preserve "1011-AB" style)
28    let stripped: String = lower
29        .chars()
30        .map(|c| if c.is_ascii_alphanumeric() || c == '-' || c == ' ' { c } else { ' ' })
31        .collect();
32    // Expand abbreviations (whole-word match via split/rejoin)
33    let tokens: Vec<&str> = stripped.split_whitespace().collect();
34    tokens.iter().map(|t| {
35        DUTCH_ABBREV.iter()
36            .find(|(abbr, _)| *abbr == *t)
37            .map(|(_, full)| *full)
38            .unwrap_or(t)
39    }).collect::<Vec<_>>().join(" ")
40}
41
/// Returns the run of ASCII digits at the start of `s`, if there is one.
///
/// Surrounding whitespace is trimmed first. The result is `None` when the
/// trimmed string is empty or does not begin with an ASCII digit.
fn extract_leading_number(s: &str) -> Option<&str> {
    let trimmed = s.trim();
    // ASCII digits are single bytes, so the byte count is a valid slice index.
    let digit_len = trimmed.bytes().take_while(u8::is_ascii_digit).count();
    (digit_len > 0).then(|| &trimmed[..digit_len])
}
47
48// ── AddressTokenOverlap ───────────────────────────────────────────────────────
49
50/// Jaccard similarity on normalized token sets.
51///
52/// Normalizes both addresses (lowercase, strip punctuation, expand Dutch
53/// abbreviations) then computes Jaccard on the resulting token sets.
54pub struct AddressTokenOverlap;
55
56impl SimilarityFn for AddressTokenOverlap {
57    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
58        match (a, b) {
59            (FieldValue::Text(a), FieldValue::Text(b)) => {
60                let na = normalize_address(a);
61                let nb = normalize_address(b);
62                jaccard_tokens(&na, &nb)
63            }
64            _ => 0.0,
65        }
66    }
67    fn similarity_str(&self, a: &str, b: &str) -> f32 {
68        let na = normalize_address(a);
69        let nb = normalize_address(b);
70        jaccard_tokens(&na, &nb)
71    }
72    fn field_kind(&self) -> FieldKind { FieldKind::Address }
73}
74
75// ── StreetNumberEditDistance ──────────────────────────────────────────────────
76
77/// Levenshtein edit distance on the leading street number.
78///
79/// Extracts the leading numeric sequence from each address and computes
80/// edit distance:
81///   distance 0 : 1.0
82///   distance 1 : 0.8
83///   otherwise  : 0.0
84pub struct StreetNumberEditDistance;
85
86impl SimilarityFn for StreetNumberEditDistance {
87    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
88        match (a, b) {
89            (FieldValue::Text(a), FieldValue::Text(b)) => {
90                let na = extract_leading_number(a);
91                let nb = extract_leading_number(b);
92                match (na, nb) {
93                    (Some(na), Some(nb)) => {
94                        let dist = strsim::levenshtein(na, nb);
95                        if dist == 0 { 1.0 }
96                        else if dist == 1 { 0.8 }
97                        else { 0.0 }
98                    }
99                    _ => 0.0,
100                }
101            }
102            _ => 0.0,
103        }
104    }
105    fn similarity_str(&self, a: &str, b: &str) -> f32 {
106        match (extract_leading_number(a), extract_leading_number(b)) {
107            (Some(na), Some(nb)) => {
108                let dist = strsim::levenshtein(na, nb);
109                if dist == 0 { 1.0 } else if dist == 1 { 0.8 } else { 0.0 }
110            }
111            _ => 0.0,
112        }
113    }
114    fn field_kind(&self) -> FieldKind { FieldKind::Address }
115}
116
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps a string slice in a `FieldValue::Text`.
    fn tv(s: &str) -> FieldValue {
        FieldValue::Text(s.into())
    }

    // ── AddressTokenOverlap ──────────────────────────────────────────────

    #[test]
    fn address_token_overlap_exact() {
        let left = tv("Coolsingel 93");
        let right = tv("Coolsingel 93");
        assert_eq!(AddressTokenOverlap.similarity(&left, &right), 1.0);
    }

    #[test]
    fn address_token_overlap_abbreviation() {
        // "Blaak Str. 10" normalizes to "blaak straat 10", while
        // "Blaakstraat 10" stays "blaakstraat 10"; the two share at least "10".
        let abbreviated = tv("Blaak Str. 10");
        let spelled_out = tv("Blaakstraat 10");
        let s = AddressTokenOverlap.similarity(&abbreviated, &spelled_out);
        assert!(s > 0.0, "abbreviation expansion should yield some overlap, got {s}");
    }

    #[test]
    fn address_token_overlap_different() {
        let first = tv("Coolsingel 93");
        let second = tv("Beatrixlaan 241");
        let s = AddressTokenOverlap.similarity(&first, &second);
        assert!(s < 0.3, "completely different addresses should be low, got {s}");
    }

    #[test]
    fn address_null_field() {
        let score = AddressTokenOverlap.similarity(&FieldValue::Null, &tv("Coolsingel 93"));
        assert_eq!(score, 0.0);
    }

    // ── StreetNumberEditDistance ─────────────────────────────────────────

    #[test]
    fn street_number_exact() {
        let long_form = tv("239 bis Amsterdamseweg");
        let short_form = tv("239 bis");
        assert_eq!(StreetNumberEditDistance.similarity(&long_form, &short_form), 1.0);
    }

    #[test]
    fn street_number_off_by_one() {
        // "10" vs "11" differ by a single digit substitution: distance 1.
        let ten = tv("10 Coolsingel");
        let eleven = tv("11 Coolsingel");
        assert_eq!(StreetNumberEditDistance.similarity(&ten, &eleven), 0.8);
    }

    #[test]
    fn street_number_no_leading_digit() {
        // Neither address starts with a number, so the score is 0.0 even
        // though the strings are identical.
        let street_only = tv("Coolsingel");
        assert_eq!(StreetNumberEditDistance.similarity(&street_only, &street_only), 0.0);
    }
}