Skip to main content

zer_blocking/keys/
vehicle.rs

1use zer_core::{record::Record, schema::Schema};
2
3use crate::normalize::normalize_plate;
4use super::BlockingKey;
5
6// ── LicensePlateNormKey ───────────────────────────────────────────────────────
7
/// Blocking key that emits the normalized license plate as a single exact key.
///
/// The plate is normalized (hyphens/spaces stripped, uppercased) so that
/// differently formatted spellings of the same plate produce the same key.
///
/// Use together with `PlateOCRFuzzyKey` for full OCR-resilient plate matching.
#[derive(Debug, Clone)]
pub struct LicensePlateNormKey {
    /// Name of the record field holding the raw plate text.
    plate_field: String,
}

impl LicensePlateNormKey {
    /// Creates a key extractor that reads the plate from `plate_field`.
    pub fn new(plate_field: &str) -> Self {
        Self { plate_field: plate_field.to_owned() }
    }
}
21
22impl BlockingKey for LicensePlateNormKey {
23    fn name(&self) -> &str { "plate_norm" }
24
25    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
26        let cow = record.field_as_str(&self.plate_field);
27        let plate = match cow.as_deref() {
28            Some(s) => s,
29            None    => return vec![],
30        };
31        let norm = normalize_plate(plate);
32        if norm.is_empty() { return vec![]; }
33        vec![norm]
34    }
35}
36
37// ── PlateOCRFuzzyKey ──────────────────────────────────────────────────────────
38
/// Emits the normalized plate **plus** a deletion-neighbourhood key for each
/// character position.
///
/// The deletion-neighbourhood approach handles any single-character OCR
/// confusion (0/O, 1/I, M/W, G/C, etc.) without an explicit confusion table:
/// two plates that differ by exactly one character at position `i` will both
/// produce the same key when character `i` is removed, so they land in the
/// same candidate bucket.
///
/// Example: "CX180W" vs "CXI80W" (1/I confusion at position 2) both become
/// "CX80W" after deleting position 2, so they share a bucket key.
#[derive(Debug, Clone)]
pub struct PlateOCRFuzzyKey {
    /// Name of the record field holding the raw plate text.
    plate_field: String,
}

impl PlateOCRFuzzyKey {
    /// Creates a key extractor that reads the plate from `plate_field`.
    pub fn new(plate_field: &str) -> Self {
        Self { plate_field: plate_field.to_owned() }
    }
}
59
60impl BlockingKey for PlateOCRFuzzyKey {
61    fn name(&self) -> &str { "plate_ocr" }
62
63    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
64        let cow = record.field_as_str(&self.plate_field);
65        let plate = match cow.as_deref() {
66            Some(s) => s,
67            None    => return vec![],
68        };
69        let norm = normalize_plate(plate);
70        if norm.is_empty() { return vec![]; }
71
72        let chars: Vec<char> = norm.chars().collect();
73        let n = chars.len();
74        let mut keys = Vec::with_capacity(n + 1);
75        keys.push(norm.clone());
76
77        // For each position, emit the plate with that character removed.
78        // Two plates differing by one substitution at position i share the
79        // deletion key produced by removing position i from both.
80        for i in 0..n {
81            let variant: String = chars.iter().enumerate()
82                .filter(|&(j, _)| j != i)
83                .map(|(_, &c)| c)
84                .collect();
85            keys.push(variant);
86        }
87
88        keys.sort();
89        keys.dedup();
90        keys
91    }
92}
93
94// ── CameraTimeWindowKey ───────────────────────────────────────────────────────
95
/// Groups passages by camera identifier and a fixed-width time window.
///
/// Key format: `"{camera_id}:{date}:{slot}"` where `slot = (hour*60 + min) / window_mins`.
/// Useful for detecting duplicate sensor reads of the same vehicle at the same
/// camera location within a short interval.
#[derive(Debug, Clone)]
pub struct CameraTimeWindowKey {
    /// Name of the record field holding the camera identifier.
    camera_field: String,
    /// Name of the record field holding the timestamp text.
    time_field:   String,
    /// Window width in minutes; used as the divisor when computing the slot.
    window_mins:  u32,
}

impl CameraTimeWindowKey {
    /// Creates a key extractor for the given camera and timestamp fields.
    ///
    /// `window_mins` should be non-zero, because the slot is obtained by
    /// dividing the minute-of-day by it.
    pub fn new(camera_field: &str, time_field: &str, window_mins: u32) -> Self {
        Self {
            camera_field: camera_field.to_owned(),
            time_field:   time_field.to_owned(),
            window_mins,
        }
    }
}
116
/// Maps the time of day in `datetime` to the index of its `window`-minute slot.
///
/// The text after the first `'T'` must begin with `hour:minute`; anything
/// after a second `':'` (seconds, etc.) is ignored. The slot is
/// `(hour * 60 + minute) / window`.
///
/// Returns `None` when there is no `'T'`, when the hour or minute is missing
/// or not an unsigned integer, when `window` is zero, or when the
/// minute-of-day computation would overflow `u32`. These cases never panic.
fn time_to_slot(datetime: &str, window: u32) -> Option<u32> {
    let (_, time_part) = datetime.split_once('T')?;
    let mut fields = time_part.splitn(3, ':');
    let hour: u32 = fields.next()?.parse().ok()?;
    let minute: u32 = fields.next()?.parse().ok()?;
    // Checked arithmetic: a malformed hour such as "99999999" or a zero-width
    // window yields `None` rather than an overflow / division-by-zero panic.
    hour.checked_mul(60)?
        .checked_add(minute)?
        .checked_div(window)
}
125
126impl BlockingKey for CameraTimeWindowKey {
127    fn name(&self) -> &str { "cam_time_window" }
128
129    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
130        let cam_cow = record.field_as_str(&self.camera_field);
131        let cam = match cam_cow.as_deref() {
132            Some(s) => s,
133            None    => return vec![],
134        };
135        let ts_cow = record.field_as_str(&self.time_field);
136        let ts = match ts_cow.as_deref() {
137            Some(s) => s,
138            None    => return vec![],
139        };
140        let date = ts.get(..10).unwrap_or("");
141        let slot = match time_to_slot(ts, self.window_mins) {
142            Some(s) => s,
143            None    => return vec![],
144        };
145        vec![format!("{}:{}:{}", cam, date, slot)]
146    }
147}
148
149// ── GeoGridKey ────────────────────────────────────────────────────────────────
150
/// Groups records by rounding geographic coordinates to a fixed grid cell.
///
/// `grid_size = 0.01` (degrees) ≈ 1 km. Useful for clustering passages near
/// the same highway camera position or street intersection.
#[derive(Debug, Clone)]
pub struct GeoGridKey {
    /// Name of the record field holding the latitude.
    lat_field: String,
    /// Name of the record field holding the longitude.
    lon_field: String,
    /// Edge length of one grid cell; coordinates are divided by this value.
    grid_size: f64,
}

impl GeoGridKey {
    /// Creates a key extractor for the given coordinate fields and cell size.
    pub fn new(lat_field: &str, lon_field: &str, grid_size: f64) -> Self {
        Self {
            lat_field: lat_field.to_owned(),
            lon_field: lon_field.to_owned(),
            grid_size,
        }
    }
}
170
171impl BlockingKey for GeoGridKey {
172    fn name(&self) -> &str { "geo_grid" }
173
174    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
175        let lat = match record.field_as::<f64>(&self.lat_field) {
176            Some(v) => v,
177            None    => return vec![],
178        };
179        let lon = match record.field_as::<f64>(&self.lon_field) {
180            Some(v) => v,
181            None    => return vec![],
182        };
183        let lat_cell = (lat / self.grid_size).floor() as i64;
184        let lon_cell = (lon / self.grid_size).floor() as i64;
185        vec![format!("{}:{}", lat_cell, lon_cell)]
186    }
187}
188
189// ── Tests ─────────────────────────────────────────────────────────────────────
190
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;
    use zer_core::{record::FieldValue, schema::{FieldKind, SchemaBuilder}};

    /// Schema declaring every field the vehicle blocking keys read.
    fn test_schema() -> Schema {
        SchemaBuilder::new()
            .field("kenteken",  FieldKind::LicensePlate)
            .field("camera_id", FieldKind::Categorical)
            .field("tijdstip",  FieldKind::Timestamp)
            .field("lat",       FieldKind::GpsCoordinate)
            .field("lon",       FieldKind::GpsCoordinate)
            .build()
            .unwrap()
    }

    /// Full passage record; every value is stored as text.
    fn passage(id: u64, plate: &str, camera: &str, ts: &str, lat: &str, lon: &str) -> Record {
        Record::new(id)
            .insert("kenteken",  FieldValue::Text(plate.into()))
            .insert("camera_id", FieldValue::Text(camera.into()))
            .insert("tijdstip",  FieldValue::Text(ts.into()))
            .insert("lat",       FieldValue::Text(lat.into()))
            .insert("lon",       FieldValue::Text(lon.into()))
    }

    /// Passage where only the plate matters (fixed camera, time and position).
    fn plate_passage(id: u64, plate: &str) -> Record {
        passage(id, plate, "CAM-A1-001", "2025-01-01T10:00:00", "52.3", "4.9")
    }

    /// Passage at camera "CAM-A1-001" where only the timestamp matters.
    fn timed_passage(id: u64, plate: &str, ts: &str) -> Record {
        passage(id, plate, "CAM-A1-001", ts, "52.0", "4.0")
    }

    /// Passage where only the coordinates matter.
    fn located_passage(id: u64, plate: &str, lat: &str, lon: &str) -> Record {
        passage(id, plate, "CAM", "2025-01-01T10:00:00", lat, lon)
    }

    // ── LicensePlateNormKey

    #[test]
    fn plate_norm_strips_hyphens() {
        let extractor = LicensePlateNormKey::new("kenteken");
        let keys = extractor.extract(&plate_passage(1, "25-XKL-9"), &test_schema());
        assert_eq!(keys, ["25XKL9"]);
    }

    #[test]
    fn plate_norm_empty_field_returns_empty() {
        let extractor = LicensePlateNormKey::new("kenteken");
        // A record without any fields has no plate to normalize.
        let blank = Record::new(1);
        assert!(extractor.extract(&blank, &test_schema()).is_empty());
    }

    // ── PlateOCRFuzzyKey

    #[test]
    fn ocr_fuzzy_original_and_confused_share_key() {
        let schema    = test_schema();
        let extractor = PlateOCRFuzzyKey::new("kenteken");

        // True plate "CX-180-W" → normalized "CX180W".
        let true_keys: HashSet<String> = extractor
            .extract(&plate_passage(1, "CX-180-W"), &schema)
            .into_iter()
            .collect();
        // OCR plate "CX-I80-W" (1→I confusion) → normalized "CXI80W".
        let ocr_keys: HashSet<String> = extractor
            .extract(&plate_passage(2, "CX-I80-W"), &schema)
            .into_iter()
            .collect();

        assert!(
            !true_keys.is_disjoint(&ocr_keys),
            "true plate and OCR plate must share at least one fuzzy key; true={true_keys:?}, ocr={ocr_keys:?}"
        );
    }

    #[test]
    fn ocr_fuzzy_emits_multiple_variants() {
        let extractor = PlateOCRFuzzyKey::new("kenteken");
        // "L01A4" has 5 chars → original + 5 deletion keys = 6 distinct keys.
        let record = passage(1, "L01A4", "CAM", "2025-01-01T08:00:00", "52.0", "4.0");
        let keys = extractor.extract(&record, &test_schema());
        let has = |wanted: &str| keys.iter().any(|k| k == wanted);

        assert!(keys.len() >= 4, "should emit original + deletion variants; got {keys:?}");
        assert!(has("L01A4"), "original key must be present");
        // Deletion-neighbourhood keys: each char removed once.
        assert!(has("01A4"),  "deletion at pos 0 (L) expected");
        assert!(has("L0A4"),  "deletion at pos 2 (1) expected");
    }

    // ── CameraTimeWindowKey

    #[test]
    fn camera_time_window_same_slot() {
        let schema    = test_schema();
        let extractor = CameraTimeWindowKey::new("camera_id", "tijdstip", 10);

        // 14:02 → slot 84 (84*10 = 840 min = 14h00m); 14:08 → slot 84 as well.
        let early = extractor.extract(&timed_passage(1, "X", "2025-06-01T14:02:00"), &schema);
        let late  = extractor.extract(&timed_passage(2, "Y", "2025-06-01T14:08:00"), &schema);
        assert_eq!(early, late);
    }

    #[test]
    fn camera_time_window_different_slot() {
        let schema    = test_schema();
        let extractor = CameraTimeWindowKey::new("camera_id", "tijdstip", 10);

        // 14:02 → slot 84; 14:12 → slot 85.
        let early = extractor.extract(&timed_passage(1, "X", "2025-06-01T14:02:00"), &schema);
        let late  = extractor.extract(&timed_passage(2, "Y", "2025-06-01T14:12:00"), &schema);
        assert_ne!(early, late);
    }

    // ── GeoGridKey

    #[test]
    fn geo_grid_nearby_records_share_key() {
        let schema    = test_schema();
        let extractor = GeoGridKey::new("lat", "lon", 0.01);

        // Both land in the same 0.01° cell (lat 5234, lon 490).
        let here   = extractor.extract(&located_passage(1, "X", "52.345", "4.901"), &schema);
        let nearby = extractor.extract(&located_passage(2, "Y", "52.349", "4.907"), &schema);
        assert_eq!(here, nearby);
    }

    #[test]
    fn geo_grid_distant_records_differ() {
        let schema    = test_schema();
        let extractor = GeoGridKey::new("lat", "lon", 0.01);

        let here    = extractor.extract(&located_passage(1, "X", "52.345", "4.901"), &schema);
        let faraway = extractor.extract(&located_passage(2, "Y", "51.922", "4.479"), &schema);
        assert_ne!(here, faraway);
    }
}