Skip to main content

// zer_compare/similarity/numeric.rs

1use zer_core::{record::FieldValue, schema::FieldKind};
2
3use crate::similarity::SimilarityFn;
4
// ── NumericBucketedSimilarity ─────────────────────────────────────────────────
6
/// Similarity for numeric fields: the relative difference between two values,
/// mapped onto five fixed score levels.
///
/// Both values are read as `f64` from `Int`, `Float`, or `Text` (trimmed, then
/// parsed) field values, and `relative_diff = |a - b| / max(|a|, |b|, 1.0)` is
/// scored as:
///
/// | `relative_diff` | score  | meaning            |
/// |-----------------|--------|--------------------|
/// | `== 0.0`        | `1.0`  | exact              |
/// | `<= 0.05`       | `0.85` | <= 5% difference   |
/// | `<= 0.20`       | `0.6`  | <= 20% difference  |
/// | `<= 0.50`       | `0.3`  | <= 50% difference  |
/// | otherwise       | `0.0`  | too far apart      |
///
/// The `1.0` floor in the denominator means that two values whose magnitudes
/// are both below one are effectively compared by absolute difference.
///
/// If either side cannot be read as a number the score is `0.0`.
pub struct NumericBucketedSimilarity;
18
19fn extract_numeric(v: &FieldValue) -> Option<f64> {
20    match v {
21        FieldValue::Float(f) => Some(*f),
22        FieldValue::Int(i)   => Some(*i as f64),
23        FieldValue::Text(s)  => s.trim().parse::<f64>().ok(),
24        _                    => None,
25    }
26}
27
/// Maps the relative difference between `va` and `vb` onto a fixed score.
///
/// The relative difference is `|va - vb| / max(|va|, |vb|, 1.0)`. Returns
/// `1.0` when the two values are identical, `0.85` / `0.6` / `0.3` when the
/// relative difference is at most 5% / 20% / 50%, and `0.0` beyond that.
///
/// Non-finite inputs (NaN or ±infinity, which `str::parse::<f64>` accepts
/// from text such as `"NaN"` or `"inf"`) always score `0.0`, including when
/// both sides are the same infinity.
fn numeric_score(va: f64, vb: f64) -> f32 {
    // (inclusive upper bound on the relative difference, score), ascending.
    const BANDS: [(f64, f32); 3] = [(0.05, 0.85), (0.20, 0.6), (0.50, 0.3)];

    // Stated explicitly instead of relying on NaN comparisons all being false.
    if !(va.is_finite() && vb.is_finite()) {
        return 0.0;
    }

    let diff = (va - vb).abs();
    if diff == 0.0 {
        return 1.0;
    }

    // The 1.0 floor keeps the ratio meaningful when both values are near zero.
    let rel_diff = diff / va.abs().max(vb.abs()).max(1.0);

    BANDS
        .iter()
        .find(|&&(limit, _)| rel_diff <= limit)
        .map_or(0.0, |&(_, score)| score)
}
38
39impl SimilarityFn for NumericBucketedSimilarity {
40    fn similarity(&self, a: &FieldValue, b: &FieldValue) -> f32 {
41        match (extract_numeric(a), extract_numeric(b)) {
42            (Some(va), Some(vb)) => numeric_score(va, vb),
43            _ => 0.0,
44        }
45    }
46    fn similarity_str(&self, a: &str, b: &str) -> f32 {
47        match (a.trim().parse::<f64>(), b.trim().parse::<f64>()) {
48            (Ok(va), Ok(vb)) => numeric_score(va, vb),
49            _ => 0.0,
50        }
51    }
52    fn field_kind(&self) -> FieldKind { FieldKind::Numeric }
53}
54
#[cfg(test)]
mod tests {
    use super::*;

    // Short constructors for the three field-value variants under test.
    fn ti(n: i64) -> FieldValue {
        FieldValue::Int(n)
    }
    fn tf(f: f64) -> FieldValue {
        FieldValue::Float(f)
    }
    fn tv(s: &str) -> FieldValue {
        FieldValue::Text(s.into())
    }

    #[test]
    fn exact_int_match() {
        let sim = NumericBucketedSimilarity;
        assert_eq!(sim.similarity(&ti(180), &ti(180)), 1.0);
    }

    #[test]
    fn close_within_5_percent() {
        let sim = NumericBucketedSimilarity;
        // 180 vs 183 → diff 3, denom 183 → 1.6% → bucket 0.85
        assert_eq!(sim.similarity(&ti(180), &ti(183)), 0.85);
    }

    #[test]
    fn medium_within_20_percent() {
        let sim = NumericBucketedSimilarity;
        // 100 vs 115 → diff 15, denom 115 → 13% → bucket 0.6
        assert_eq!(sim.similarity(&ti(100), &ti(115)), 0.6);
    }

    #[test]
    fn large_within_50_percent() {
        let sim = NumericBucketedSimilarity;
        // 100 vs 140 → diff 40, denom 140 → 28.6% → bucket 0.3
        assert_eq!(sim.similarity(&ti(100), &ti(140)), 0.3);
    }

    #[test]
    fn very_different() {
        let sim = NumericBucketedSimilarity;
        // 100 vs 300 → diff 200, denom 300 → 66.7% → bucket 0.0
        assert_eq!(sim.similarity(&ti(100), &ti(300)), 0.0);
    }

    #[test]
    fn float_parsing_from_text() {
        let sim = NumericBucketedSimilarity;
        // GPS-style: "52.345" vs "52.346", nearly identical but not equal
        assert_eq!(sim.similarity(&tv("52.345"), &tv("52.346")), 0.85);
    }

    #[test]
    fn mixed_int_float() {
        let sim = NumericBucketedSimilarity;
        assert_eq!(sim.similarity(&ti(100), &tf(100.0)), 1.0);
    }

    #[test]
    fn null_returns_zero() {
        let sim = NumericBucketedSimilarity;
        assert_eq!(sim.similarity(&FieldValue::Null, &ti(100)), 0.0);
    }

    #[test]
    fn non_numeric_text_returns_zero() {
        let sim = NumericBucketedSimilarity;
        assert_eq!(sim.similarity(&tv("abc"), &ti(100)), 0.0);
    }

    #[test]
    fn similarity_str_trims_and_parses() {
        let sim = NumericBucketedSimilarity;
        assert_eq!(sim.similarity_str(" 100 ", "100.0"), 1.0);
        // 100 vs 115 → 13% → bucket 0.6
        assert_eq!(sim.similarity_str("100", "115"), 0.6);
    }

    #[test]
    fn similarity_str_unparsable_returns_zero() {
        let sim = NumericBucketedSimilarity;
        assert_eq!(sim.similarity_str("abc", "100"), 0.0);
        assert_eq!(sim.similarity_str("", ""), 0.0);
    }
}