use zer_core::{record::FieldValue, schema::FieldKind};
use crate::similarity::SimilarityFn;
/// Similarity strategy for identifier fields that only recognises exact equality.
///
/// The accompanying `SimilarityFn` implementation scores a pair `1.0` when the
/// two values are equal and `0.0` otherwise.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ExactIdSimilarity;
impl SimilarityFn for ExactIdSimilarity {
    /// Scores two field values as `1.0` when they are equal and `0.0` otherwise.
    ///
    /// Only pairs of the same variant can match: `Text`, `Int` and `Bool` use
    /// `==`, while `Float` values match when their absolute difference is below
    /// `f64::EPSILON`. Any other combination of variants scores `0.0`.
    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
        let identical = match (a, b) {
            (FieldValue::Text(lhs), FieldValue::Text(rhs)) => lhs == rhs,
            (FieldValue::Int(lhs), FieldValue::Int(rhs)) => lhs == rhs,
            (FieldValue::Float(lhs), FieldValue::Float(rhs)) => (lhs - rhs).abs() < f64::EPSILON,
            (FieldValue::Bool(lhs), FieldValue::Bool(rhs)) => lhs == rhs,
            _ => false,
        };
        if identical {
            1.0
        } else {
            0.0
        }
    }

    /// String-only variant: `1.0` when `a` and `b` are equal, `0.0` otherwise.
    fn similarity_str(&self, a: &str, b: &str) -> f32 {
        match a == b {
            true => 1.0,
            false => 0.0,
        }
    }

    /// Reports that this similarity applies to `FieldKind::Id` fields.
    fn field_kind(&self) -> FieldKind {
        FieldKind::Id
    }
}
/// Similarity strategy for identifier fields that tolerates a bounded number
/// of substituted positions between two equal-length strings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct HammingSimilarity {
    /// Largest Hamming distance (number of differing positions) that is still
    /// scored as a near match (`0.8`) rather than a mismatch (`0.0`).
    pub max_distance: usize,
}
impl SimilarityFn for HammingSimilarity {
    /// Scores two field values by the Hamming distance between their text.
    ///
    /// Only `Text`/`Text` pairs are compared (see [`Self::similarity_str`] for
    /// the scoring rules); every other combination of variants scores `0.0`.
    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
        match (a, b) {
            (FieldValue::Text(a), FieldValue::Text(b)) => self.similarity_str(a, b),
            _ => 0.0,
        }
    }

    /// Scores two strings by their Hamming distance.
    ///
    /// Returns `1.0` for a distance of zero, `0.8` when the distance is
    /// non-zero but at most `max_distance`, and `0.0` when the distance is
    /// larger or the strings have different lengths.
    fn similarity_str(&self, a: &str, b: &str) -> f32 {
        // `strsim::hamming` walks both strings character by character and
        // returns `Err` when their character counts differ, so no separate
        // length guard is needed. A byte-length (`str::len`) guard would
        // wrongly reject strings with the same number of characters but
        // different UTF-8 widths.
        match strsim::hamming(a, b) {
            Ok(0) => 1.0,
            Ok(dist) if dist <= self.max_distance => 0.8,
            _ => 0.0,
        }
    }

    /// Reports that this similarity applies to `FieldKind::Id` fields.
    fn field_kind(&self) -> FieldKind {
        FieldKind::Id
    }
}
/// Similarity strategy for identifier fields that compares only the trailing
/// portion of two strings.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SuffixMatchSimilarity {
    /// Length, in bytes, of the suffix that must be identical for two values
    /// to match. Values shorter than this never match.
    pub n: usize,
}
impl SimilarityFn for SuffixMatchSimilarity {
    /// Scores two field values by whether their text shares the same suffix.
    ///
    /// Only `Text`/`Text` pairs are compared (see [`Self::similarity_str`] for
    /// the scoring rules); every other combination of variants scores `0.0`.
    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
        match (a, b) {
            (FieldValue::Text(a), FieldValue::Text(b)) => self.similarity_str(a, b),
            _ => 0.0,
        }
    }

    /// Returns `1.0` when the last `n` bytes of `a` and `b` are identical and
    /// `0.0` otherwise, including when either string is shorter than `n` bytes.
    ///
    /// With `n == 0` the compared suffixes are both empty, so any two strings
    /// score `1.0`.
    fn similarity_str(&self, a: &str, b: &str) -> f32 {
        // Compare raw bytes rather than slicing the `str`: a `str` slice
        // panics when the start offset falls inside a multi-byte UTF-8
        // character, which non-ASCII input can trigger. Byte comparison gives
        // the same answer wherever the `str` slice would have succeeded.
        // `checked_sub` doubles as the "too short" check.
        match (a.len().checked_sub(self.n), b.len().checked_sub(self.n)) {
            (Some(start_a), Some(start_b))
                if a.as_bytes()[start_a..] == b.as_bytes()[start_b..] =>
            {
                1.0
            }
            _ => 0.0,
        }
    }

    /// Reports that this similarity applies to `FieldKind::Id` fields.
    fn field_kind(&self) -> FieldKind {
        FieldKind::Id
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps a string slice in a `FieldValue::Text` for use as test input.
    fn text(raw: &str) -> FieldValue {
        FieldValue::Text(raw.into())
    }

    /// Scores two text values with a `HammingSimilarity` that tolerates a
    /// single differing position.
    fn hamming_score(left: &str, right: &str) -> f32 {
        let sim = HammingSimilarity { max_distance: 1 };
        sim.similarity(&text(left), &text(right))
    }

    // --- ExactIdSimilarity ---

    #[test]
    fn exact_id_match() {
        let score = ExactIdSimilarity.similarity(&text("IR7406812"), &text("IR7406812"));
        assert_eq!(score, 1.0);
    }

    #[test]
    fn exact_id_mismatch() {
        let score = ExactIdSimilarity.similarity(&text("IR7406812"), &text("IR7406813"));
        assert_eq!(score, 0.0);
    }

    #[test]
    fn exact_id_integer() {
        let reference = FieldValue::Int(12345);
        let same = FieldValue::Int(12345);
        let other = FieldValue::Int(99999);
        assert_eq!(ExactIdSimilarity.similarity(&reference, &same), 1.0);
        assert_eq!(ExactIdSimilarity.similarity(&reference, &other), 0.0);
    }

    // --- HammingSimilarity ---

    #[test]
    fn hamming_exact() {
        assert_eq!(hamming_score("IR7406812", "IR7406812"), 1.0);
    }

    #[test]
    fn hamming_within_distance() {
        // Exactly one position differs (the final digit).
        assert_eq!(hamming_score("IR7406812", "IR7406813"), 0.8);
    }

    #[test]
    fn hamming_exceeds_distance() {
        // Two positions differ, which is above the allowed distance of 1.
        assert_eq!(hamming_score("IR7406812", "IR7406899"), 0.0);
    }

    #[test]
    fn hamming_different_lengths() {
        assert_eq!(hamming_score("IR7406812", "IR740681"), 0.0);
    }

    // --- SuffixMatchSimilarity ---

    #[test]
    fn suffix_match_exact_suffix() {
        let sim = SuffixMatchSimilarity { n: 4 };
        let reference = text("123456789");
        // Each candidate is paired with the score expected against the
        // reference, whose last four characters are "6789".
        let cases = [
            ("987650001", 0.0_f32),
            ("111111234", 0.0_f32),
            ("111116789", 1.0_f32),
        ];
        for &(candidate, expected) in &cases {
            assert_eq!(sim.similarity(&reference, &text(candidate)), expected);
        }
    }

    #[test]
    fn suffix_match_too_short() {
        // Both inputs are shorter than the required suffix length.
        let sim = SuffixMatchSimilarity { n: 6 };
        assert_eq!(sim.similarity(&text("123"), &text("123")), 0.0);
    }
}