Skip to main content

zer_compare/
discretize.rs

1use zer_core::{comparison::ComparisonLevel, schema::FieldKind};
2
/// Configurable per-field thresholds for mapping a float similarity score to a `ComparisonLevel`.
///
/// The defaults are tuned per `FieldKind` based on the expected noise distribution
/// for that field type in Dutch law enforcement data.
///
/// The three cut-offs partition the score range into four bands, checked from the
/// top down:
///
/// | condition                      | level                      |
/// |--------------------------------|----------------------------|
/// | `sim >= exact`                 | `ComparisonLevel::Exact`   |
/// | `close <= sim < exact`         | `ComparisonLevel::Close`   |
/// | `partial <= sim < close`       | `ComparisonLevel::Partial` |
/// | `sim < partial`                | `ComparisonLevel::None`    |
///
/// The type is a plain bundle of three `f32` values, so it is `Copy` and can be
/// compared with `==` (useful in tests and when checking for customised thresholds).
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct LevelThresholds {
    /// Similarity >= this gives `ComparisonLevel::Exact`.
    pub exact: f32,
    /// Similarity >= this (and < exact) gives `ComparisonLevel::Close`.
    pub close: f32,
    /// Similarity >= this (and < close) gives `ComparisonLevel::Partial`.
    /// Anything below this gives `ComparisonLevel::None`.
    pub partial: f32,
}
17
18impl LevelThresholds {
19    /// Default thresholds tuned per `FieldKind`.
20    pub fn for_kind(kind: FieldKind) -> Self {
21        match kind {
22            FieldKind::Name => Self {
23                exact: 0.92,
24                close: 0.75,
25                partial: 0.50,
26            },
27            FieldKind::Date | FieldKind::Timestamp => Self {
28                exact: 0.99,
29                close: 0.85,
30                partial: 0.60,
31            },
32            FieldKind::Phone => Self {
33                exact: 0.98,
34                close: 0.90,
35                partial: 0.70,
36            },
37            FieldKind::Address => Self {
38                exact: 0.90,
39                close: 0.70,
40                partial: 0.40,
41            },
42            FieldKind::Id => Self {
43                exact: 0.99,
44                close: 0.90,
45                partial: 0.75,
46            },
47            FieldKind::LicensePlate => Self {
48                exact: 0.99,
49                close: 0.75,
50                partial: 0.50,
51            },
52            FieldKind::Numeric | FieldKind::GpsCoordinate => Self {
53                exact: 0.95,
54                close: 0.80,
55                partial: 0.50,
56            },
57            FieldKind::Categorical => Self {
58                exact: 1.00,
59                close: 0.95,
60                partial: 0.70,
61            },
62            FieldKind::FreeText => Self {
63                exact: 0.90,
64                close: 0.65,
65                partial: 0.35,
66            },
67            FieldKind::Alias => Self {
68                exact: 0.90,
69                close: 0.65,
70                partial: 0.35,
71            },
72        }
73    }
74
75    /// Map a raw similarity score to a `ComparisonLevel`.
76    pub fn apply(&self, sim: f32) -> ComparisonLevel {
77        if sim >= self.exact {
78            ComparisonLevel::Exact
79        } else if sim >= self.close {
80            ComparisonLevel::Close
81        } else if sim >= self.partial {
82            ComparisonLevel::Partial
83        } else {
84            ComparisonLevel::None
85        }
86    }
87}
88
#[cfg(test)]
mod tests {
    use super::*;

    /// One representative score inside each band of the `Name` defaults.
    #[test]
    fn name_thresholds_produce_correct_levels() {
        let t = LevelThresholds::for_kind(FieldKind::Name);
        assert_eq!(t.apply(0.95), ComparisonLevel::Exact);
        assert_eq!(t.apply(0.80), ComparisonLevel::Close);
        assert_eq!(t.apply(0.60), ComparisonLevel::Partial);
        assert_eq!(t.apply(0.30), ComparisonLevel::None);
    }

    /// Categorical fields only reach `Exact` on a perfect score; anything lower
    /// degrades through the remaining levels like every other kind.
    #[test]
    fn categorical_requires_perfect_score_for_exact() {
        let t = LevelThresholds::for_kind(FieldKind::Categorical);
        assert_eq!(t.apply(1.00), ComparisonLevel::Exact);
        assert_eq!(t.apply(0.99), ComparisonLevel::Close);
        assert_eq!(t.apply(0.80), ComparisonLevel::Partial);
        assert_eq!(t.apply(0.00), ComparisonLevel::None);
    }

    #[test]
    fn date_thresholds_tight_bands() {
        let t = LevelThresholds::for_kind(FieldKind::Date);
        // 1.0 → Exact (same day)
        assert_eq!(t.apply(1.0), ComparisonLevel::Exact);
        // 0.9 → Close (off by 1 day)
        assert_eq!(t.apply(0.9), ComparisonLevel::Close);
        // 0.75 → Partial (same month)
        assert_eq!(t.apply(0.75), ComparisonLevel::Partial);
        // 0.3 → None (age-compatible only)
        assert_eq!(t.apply(0.3), ComparisonLevel::None);
    }

    /// Each threshold is an inclusive lower bound (`>=`) for its level.
    #[test]
    fn threshold_boundaries_are_inclusive() {
        let t = LevelThresholds::for_kind(FieldKind::Name);
        // A score exactly at a threshold belongs to that threshold's level.
        assert_eq!(t.apply(t.exact), ComparisonLevel::Exact);
        assert_eq!(t.apply(t.close), ComparisonLevel::Close);
        assert_eq!(t.apply(t.partial), ComparisonLevel::Partial);
        // A small step (0.01, not a machine epsilon) below a threshold drops one level.
        assert_eq!(t.apply(t.exact - 0.01), ComparisonLevel::Close);
        assert_eq!(t.apply(t.partial - 0.01), ComparisonLevel::None);
    }

    /// NaN fails every `>=` comparison, so it must fall through to `None`.
    #[test]
    fn nan_similarity_maps_to_none() {
        let t = LevelThresholds::for_kind(FieldKind::Name);
        assert_eq!(t.apply(f32::NAN), ComparisonLevel::None);
    }
}