use zer_core::{record::FieldValue, schema::FieldKind};
use crate::similarity::{name::jaccard_tokens, SimilarityFn};
const DUTCH_ABBREV: &[(&str, &str)] = &[
("str.", "straat"),
("str", "straat"),
("ln.", "laan"),
("ln", "laan"),
("ave.", "avenue"),
("ave", "avenue"),
("st.", "street"),
("st", "street"),
("blvd.", "boulevard"),
("blvd", "boulevard"),
("dr.", "dreef"),
("dr", "dreef"),
("sg.", "singel"),
("sg", "singel"),
("kade.", "kade"),
];
fn normalize_address(s: &str) -> String {
let lower = s.to_lowercase();
let stripped: String = lower
.chars()
.map(|c| if c.is_ascii_alphanumeric() || c == '-' || c == ' ' { c } else { ' ' })
.collect();
let tokens: Vec<&str> = stripped.split_whitespace().collect();
tokens.iter().map(|t| {
DUTCH_ABBREV.iter()
.find(|(abbr, _)| *abbr == *t)
.map(|(_, full)| *full)
.unwrap_or(t)
}).collect::<Vec<_>>().join(" ")
}
fn extract_leading_number(s: &str) -> Option<&str> {
let s = s.trim();
let end = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len());
if end == 0 { None } else { Some(&s[..end]) }
}
pub struct AddressTokenOverlap;
impl SimilarityFn for AddressTokenOverlap {
fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
match (a, b) {
(FieldValue::Text(a), FieldValue::Text(b)) => {
let na = normalize_address(a);
let nb = normalize_address(b);
jaccard_tokens(&na, &nb)
}
_ => 0.0,
}
}
fn similarity_str(&self, a: &str, b: &str) -> f32 {
let na = normalize_address(a);
let nb = normalize_address(b);
jaccard_tokens(&na, &nb)
}
fn field_kind(&self) -> FieldKind { FieldKind::Address }
}
pub struct StreetNumberEditDistance;
impl SimilarityFn for StreetNumberEditDistance {
fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
match (a, b) {
(FieldValue::Text(a), FieldValue::Text(b)) => {
let na = extract_leading_number(a);
let nb = extract_leading_number(b);
match (na, nb) {
(Some(na), Some(nb)) => {
let dist = strsim::levenshtein(na, nb);
if dist == 0 { 1.0 }
else if dist == 1 { 0.8 }
else { 0.0 }
}
_ => 0.0,
}
}
_ => 0.0,
}
}
fn similarity_str(&self, a: &str, b: &str) -> f32 {
match (extract_leading_number(a), extract_leading_number(b)) {
(Some(na), Some(nb)) => {
let dist = strsim::levenshtein(na, nb);
if dist == 0 { 1.0 } else if dist == 1 { 0.8 } else { 0.0 }
}
_ => 0.0,
}
}
fn field_kind(&self) -> FieldKind { FieldKind::Address }
}
#[cfg(test)]
mod tests {
use super::*;
fn tv(s: &str) -> FieldValue { FieldValue::Text(s.into()) }
#[test]
fn address_token_overlap_exact() {
let sim = AddressTokenOverlap;
assert_eq!(sim.similarity(&tv("Coolsingel 93"), &tv("Coolsingel 93")), 1.0);
}
#[test]
fn address_token_overlap_abbreviation() {
let sim = AddressTokenOverlap;
let s = sim.similarity(&tv("Blaak Str. 10"), &tv("Blaakstraat 10"));
assert!(s > 0.0, "abbreviation expansion should yield some overlap, got {s}");
}
#[test]
fn address_token_overlap_different() {
let sim = AddressTokenOverlap;
let s = sim.similarity(&tv("Coolsingel 93"), &tv("Beatrixlaan 241"));
assert!(s < 0.3, "completely different addresses should be low, got {s}");
}
#[test]
fn street_number_exact() {
let sim = StreetNumberEditDistance;
assert_eq!(sim.similarity(&tv("239 bis Amsterdamseweg"), &tv("239 bis")), 1.0);
}
#[test]
fn street_number_off_by_one() {
let sim = StreetNumberEditDistance;
assert_eq!(sim.similarity(&tv("10 Coolsingel"), &tv("11 Coolsingel")), 0.8);
}
#[test]
fn street_number_no_leading_digit() {
let sim = StreetNumberEditDistance;
assert_eq!(sim.similarity(&tv("Coolsingel"), &tv("Coolsingel")), 0.0);
}
#[test]
fn address_null_field() {
let sim = AddressTokenOverlap;
assert_eq!(sim.similarity(&FieldValue::Null, &tv("Coolsingel 93")), 0.0);
}
}