// zer_compare/discretize.rs

use zer_core::{comparison::ComparisonLevel, schema::FieldKind};

/// Per-field cut-offs that turn a floating-point similarity score into a `ComparisonLevel`.
///
/// The defaults are tuned per `FieldKind` based on the expected noise distribution
/// for that field type in Dutch law enforcement data.
///
/// The three cut-offs are expected to satisfy `exact >= close >= partial`. The fields are
/// public and nothing in this type enforces that ordering; bands are tested from `exact`
/// downwards, so an out-of-order set makes the lower bands unreachable.
#[derive(Debug, Clone, PartialEq)]
pub struct LevelThresholds {
    /// Similarity >= this gives `ComparisonLevel::Exact`.
    pub exact: f32,
    /// Similarity >= this (and < `exact`) gives `ComparisonLevel::Close`.
    pub close: f32,
    /// Similarity >= this (and < `close`) gives `ComparisonLevel::Partial`.
    pub partial: f32,
    // Similarity < `partial` gives `ComparisonLevel::None`.
}

18impl LevelThresholds {
19    /// Default thresholds tuned per `FieldKind`.
20    pub fn for_kind(kind: FieldKind) -> Self {
21        match kind {
22            FieldKind::Name        => Self { exact: 0.92, close: 0.75, partial: 0.50 },
23            FieldKind::Date
24            | FieldKind::Timestamp => Self { exact: 0.99, close: 0.85, partial: 0.60 },
25            FieldKind::Phone       => Self { exact: 0.98, close: 0.90, partial: 0.70 },
26            FieldKind::Address     => Self { exact: 0.90, close: 0.70, partial: 0.40 },
27            FieldKind::Id          => Self { exact: 0.99, close: 0.90, partial: 0.75 },
28            FieldKind::LicensePlate => Self { exact: 0.99, close: 0.75, partial: 0.50 },
29            FieldKind::Numeric
30            | FieldKind::GpsCoordinate => Self { exact: 0.95, close: 0.80, partial: 0.50 },
31            FieldKind::Categorical => Self { exact: 1.00, close: 0.95, partial: 0.70 },
32            FieldKind::FreeText    => Self { exact: 0.90, close: 0.65, partial: 0.35 },
33            FieldKind::Alias       => Self { exact: 0.90, close: 0.65, partial: 0.35 },
34        }
35    }
36
37    /// Map a raw similarity score to a `ComparisonLevel`.
38    pub fn apply(&self, sim: f32) -> ComparisonLevel {
39        if sim >= self.exact        { ComparisonLevel::Exact   }
40        else if sim >= self.close   { ComparisonLevel::Close   }
41        else if sim >= self.partial { ComparisonLevel::Partial }
42        else                        { ComparisonLevel::None    }
43    }
44}
45
#[cfg(test)]
mod tests {
    use super::*;

    /// One representative score inside each of the four `Name` bands.
    #[test]
    fn name_thresholds_produce_correct_levels() {
        let t = LevelThresholds::for_kind(FieldKind::Name);
        assert_eq!(t.apply(0.95), ComparisonLevel::Exact);
        assert_eq!(t.apply(0.80), ComparisonLevel::Close);
        assert_eq!(t.apply(0.60), ComparisonLevel::Partial);
        assert_eq!(t.apply(0.30), ComparisonLevel::None);
    }

    /// `Categorical` only reports `Exact` for a perfect score; anything just below
    /// it drops to `Close`.
    #[test]
    fn categorical_exact_requires_perfect_score() {
        let t = LevelThresholds::for_kind(FieldKind::Categorical);
        assert_eq!(t.apply(1.00), ComparisonLevel::Exact);
        assert_eq!(t.apply(0.99), ComparisonLevel::Close);
        assert_eq!(t.apply(0.00), ComparisonLevel::None);
    }

    #[test]
    fn date_thresholds_tight_bands() {
        let t = LevelThresholds::for_kind(FieldKind::Date);
        // 1.0 → Exact (same day)
        assert_eq!(t.apply(1.0), ComparisonLevel::Exact);
        // 0.9 → Close (off by 1 day)
        assert_eq!(t.apply(0.9), ComparisonLevel::Close);
        // 0.75 → Partial (same month)
        assert_eq!(t.apply(0.75), ComparisonLevel::Partial);
        // 0.3 → None (age-compatible only)
        assert_eq!(t.apply(0.3), ComparisonLevel::None);
    }

    /// A score exactly equal to a cut-off belongs to that cut-off's band.
    #[test]
    fn boundary_values_are_inclusive_on_lower_bound() {
        let t = LevelThresholds::for_kind(FieldKind::Name);
        // Exactly at each threshold → that threshold's level.
        assert_eq!(t.apply(t.exact), ComparisonLevel::Exact);
        assert_eq!(t.apply(t.close), ComparisonLevel::Close);
        assert_eq!(t.apply(t.partial), ComparisonLevel::Partial);
        // 0.01 below each threshold → the next level down.
        assert_eq!(t.apply(t.exact - 0.01), ComparisonLevel::Close);
        assert_eq!(t.apply(t.close - 0.01), ComparisonLevel::Partial);
        assert_eq!(t.apply(t.partial - 0.01), ComparisonLevel::None);
    }

    /// NaN compares false against every threshold, so it falls through to `None`.
    #[test]
    fn nan_score_maps_to_none() {
        let t = LevelThresholds::for_kind(FieldKind::Name);
        assert_eq!(t.apply(f32::NAN), ComparisonLevel::None);
    }

    /// Every default set must be ordered, otherwise a lower band becomes unreachable.
    #[test]
    fn all_kinds_have_descending_thresholds() {
        let kinds = [
            FieldKind::Name,
            FieldKind::Date,
            FieldKind::Timestamp,
            FieldKind::Phone,
            FieldKind::Address,
            FieldKind::Id,
            FieldKind::LicensePlate,
            FieldKind::Numeric,
            FieldKind::GpsCoordinate,
            FieldKind::Categorical,
            FieldKind::FreeText,
            FieldKind::Alias,
        ];
        for kind in kinds {
            let t = LevelThresholds::for_kind(kind);
            assert!(
                t.exact >= t.close && t.close >= t.partial,
                "thresholds out of order: {:?}",
                t
            );
        }
    }
}