Skip to main content

zer_blocking/keys/
vehicle.rs

1use zer_core::{record::Record, schema::Schema};
2
3use super::BlockingKey;
4use crate::normalize::normalize_plate;
5
6// ── LicensePlateNormKey ───────────────────────────────────────────────────────
7
/// Exact-match blocking key built from a normalized license plate.
///
/// The plate text is normalized (hyphens/spaces stripped, uppercased) and the
/// result is emitted as a single exact blocking key.
///
/// Use together with `PlateOCRFuzzyKey` for full OCR-resilient plate matching.
#[derive(Debug, Clone)]
pub struct LicensePlateNormKey {
    /// Name of the record field that holds the raw plate text.
    plate_field: String,
}

impl LicensePlateNormKey {
    /// Creates a key that reads the plate from the record field named
    /// `plate_field`.
    pub fn new(plate_field: &str) -> Self {
        Self {
            plate_field: plate_field.to_owned(),
        }
    }
}
23
24impl BlockingKey for LicensePlateNormKey {
25    fn name(&self) -> &str {
26        "plate_norm"
27    }
28
29    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
30        let cow = record.field_as_str(&self.plate_field);
31        let plate = match cow.as_deref() {
32            Some(s) => s,
33            None => return vec![],
34        };
35        let norm = normalize_plate(plate);
36        if norm.is_empty() {
37            return vec![];
38        }
39        vec![norm]
40    }
41}
42
43// ── PlateOCRFuzzyKey ──────────────────────────────────────────────────────────
44
/// Emits the normalized plate **plus** a deletion-neighbourhood key for each
/// character position.
///
/// The deletion-neighbourhood approach handles any single-character OCR
/// confusion (0/O, 1/I, M/W, G/C, etc.) without an explicit confusion table:
/// two plates that differ by exactly one character at position `i` will both
/// produce the same key when character `i` is removed, so they land in the
/// same candidate bucket.
///
/// Example: "CX180W" vs "CXI80W" (1/I confusion at position 2) both become
/// "CX80W" after deleting position 2, so they share a bucket key.
#[derive(Debug, Clone)]
pub struct PlateOCRFuzzyKey {
    /// Name of the record field that holds the raw plate text.
    plate_field: String,
}

impl PlateOCRFuzzyKey {
    /// Creates a key that reads the plate from the record field named
    /// `plate_field`.
    pub fn new(plate_field: &str) -> Self {
        Self {
            plate_field: plate_field.to_owned(),
        }
    }
}
67
68impl BlockingKey for PlateOCRFuzzyKey {
69    fn name(&self) -> &str {
70        "plate_ocr"
71    }
72
73    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
74        let cow = record.field_as_str(&self.plate_field);
75        let plate = match cow.as_deref() {
76            Some(s) => s,
77            None => return vec![],
78        };
79        let norm = normalize_plate(plate);
80        if norm.is_empty() {
81            return vec![];
82        }
83
84        let chars: Vec<char> = norm.chars().collect();
85        let n = chars.len();
86        let mut keys = Vec::with_capacity(n + 1);
87        keys.push(norm.clone());
88
89        // For each position, emit the plate with that character removed.
90        // Two plates differing by one substitution at position i share the
91        // deletion key produced by removing position i from both.
92        for i in 0..n {
93            let variant: String = chars
94                .iter()
95                .enumerate()
96                .filter(|&(j, _)| j != i)
97                .map(|(_, &c)| c)
98                .collect();
99            keys.push(variant);
100        }
101
102        keys.sort();
103        keys.dedup();
104        keys
105    }
106}
107
108// ── CameraTimeWindowKey ───────────────────────────────────────────────────────
109
/// Groups passages by camera identifier and a fixed-width time window.
///
/// Key format: `"{camera_id}:{date}:{slot}"` where
/// `slot = (hour*60 + min) / window_mins`.
/// Useful for detecting duplicate sensor reads of the same vehicle at the same
/// camera location within a short interval.
///
/// `window_mins` is used as a divisor when computing the slot, so it must be
/// non-zero.
#[derive(Debug, Clone)]
pub struct CameraTimeWindowKey {
    /// Name of the record field that holds the camera identifier.
    camera_field: String,
    /// Name of the record field that holds the passage timestamp.
    time_field: String,
    /// Width of one time window, in minutes.
    window_mins: u32,
}

impl CameraTimeWindowKey {
    /// Creates a key that reads the camera from `camera_field` and the
    /// timestamp from `time_field`, bucketing time into windows of
    /// `window_mins` minutes.
    pub fn new(camera_field: &str, time_field: &str, window_mins: u32) -> Self {
        Self {
            camera_field: camera_field.to_owned(),
            time_field: time_field.to_owned(),
            window_mins,
        }
    }
}
130
/// Converts the time-of-day part of a `...THH:MM...` timestamp into a window
/// slot index.
///
/// The time is read from the text following the first `'T'` in `datetime` and
/// must begin with `HH:MM`; anything after the minutes is ignored. The slot is
/// `(hour * 60 + minute) / window`.
///
/// Returns `None` when there is no `'T'` separator, when the hour or minute is
/// not an unsigned integer, when `window` is zero, or when the minute count
/// does not fit in a `u32`.
fn time_to_slot(datetime: &str, window: u32) -> Option<u32> {
    let (_, time_part) = datetime.split_once('T')?;
    let mut parts = time_part.splitn(3, ':');
    let hour: u32 = parts.next()?.parse().ok()?;
    let minute: u32 = parts.next()?.parse().ok()?;
    // Checked arithmetic: a zero window or an absurdly large hour must mean
    // "no slot", never a division-by-zero or overflow panic.
    hour.checked_mul(60)?
        .checked_add(minute)?
        .checked_div(window)
}
139
140impl BlockingKey for CameraTimeWindowKey {
141    fn name(&self) -> &str {
142        "cam_time_window"
143    }
144
145    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
146        let cam_cow = record.field_as_str(&self.camera_field);
147        let cam = match cam_cow.as_deref() {
148            Some(s) => s,
149            None => return vec![],
150        };
151        let ts_cow = record.field_as_str(&self.time_field);
152        let ts = match ts_cow.as_deref() {
153            Some(s) => s,
154            None => return vec![],
155        };
156        let date = ts.get(..10).unwrap_or("");
157        let slot = match time_to_slot(ts, self.window_mins) {
158            Some(s) => s,
159            None => return vec![],
160        };
161        vec![format!("{}:{}:{}", cam, date, slot)]
162    }
163}
164
165// ── GeoGridKey ────────────────────────────────────────────────────────────────
166
/// Groups records by snapping geographic coordinates to a fixed grid cell.
///
/// Each coordinate is divided by `grid_size` and floored, so a cell covers
/// the half-open range `[k * grid_size, (k + 1) * grid_size)` on each axis.
///
/// `grid_size = 0.01` (degrees) ≈ 1 km. Useful for clustering passages near
/// the same highway camera position or street intersection.
///
/// `grid_size` is used as a divisor, so it should be finite and non-zero.
#[derive(Debug, Clone)]
pub struct GeoGridKey {
    /// Name of the record field that holds the latitude.
    lat_field: String,
    /// Name of the record field that holds the longitude.
    lon_field: String,
    /// Edge length of one grid cell, in the same unit as the coordinates.
    grid_size: f64,
}

impl GeoGridKey {
    /// Creates a key that reads coordinates from `lat_field` and `lon_field`
    /// and buckets them into cells of `grid_size`.
    pub fn new(lat_field: &str, lon_field: &str, grid_size: f64) -> Self {
        Self {
            lat_field: lat_field.to_owned(),
            lon_field: lon_field.to_owned(),
            grid_size,
        }
    }
}
186
187impl BlockingKey for GeoGridKey {
188    fn name(&self) -> &str {
189        "geo_grid"
190    }
191
192    fn extract(&self, record: &Record, _schema: &Schema) -> Vec<String> {
193        let lat = match record.field_as::<f64>(&self.lat_field) {
194            Some(v) => v,
195            None => return vec![],
196        };
197        let lon = match record.field_as::<f64>(&self.lon_field) {
198            Some(v) => v,
199            None => return vec![],
200        };
201        let lat_cell = (lat / self.grid_size).floor() as i64;
202        let lon_cell = (lon / self.grid_size).floor() as i64;
203        vec![format!("{}:{}", lat_cell, lon_cell)]
204    }
205}
206
207// ── Tests ─────────────────────────────────────────────────────────────────────
208
#[cfg(test)]
mod tests {
    use super::*;
    use zer_core::{
        record::FieldValue,
        schema::{FieldKind, SchemaBuilder},
    };

    /// Schema declaring every field the vehicle blocking keys read.
    fn schema() -> Schema {
        SchemaBuilder::new()
            .field("kenteken", FieldKind::LicensePlate)
            .field("camera_id", FieldKind::Categorical)
            .field("tijdstip", FieldKind::Timestamp)
            .field("lat", FieldKind::GpsCoordinate)
            .field("lon", FieldKind::GpsCoordinate)
            .build()
            .unwrap()
    }

    /// Builds a passage record with all five fields stored as text values.
    fn rec(id: u64, kenteken: &str, camera: &str, ts: &str, lat: &str, lon: &str) -> Record {
        Record::new(id)
            .insert("kenteken", FieldValue::Text(kenteken.into()))
            .insert("camera_id", FieldValue::Text(camera.into()))
            .insert("tijdstip", FieldValue::Text(ts.into()))
            .insert("lat", FieldValue::Text(lat.into()))
            .insert("lon", FieldValue::Text(lon.into()))
    }

    // ── LicensePlateNormKey

    #[test]
    fn plate_norm_strips_hyphens() {
        let schema = schema();
        let key = LicensePlateNormKey::new("kenteken");
        let r = rec(
            1,
            "25-XKL-9",
            "CAM-A1-001",
            "2025-01-01T10:00:00",
            "52.3",
            "4.9",
        );
        let keys = key.extract(&r, &schema);
        assert_eq!(keys, vec!["25XKL9"]);
    }

    #[test]
    fn plate_norm_empty_field_returns_empty() {
        let schema = schema();
        let key = LicensePlateNormKey::new("kenteken");
        // The record has no "kenteken" field at all.
        let r = Record::new(1);
        assert!(key.extract(&r, &schema).is_empty());
    }

    // ── PlateOCRFuzzyKey

    #[test]
    fn ocr_fuzzy_original_and_confused_share_key() {
        let schema = schema();
        let key = PlateOCRFuzzyKey::new("kenteken");

        // True plate "CX-180-W" → normalized "CX180W"
        let true_r = rec(
            1,
            "CX-180-W",
            "CAM-A1-001",
            "2025-01-01T10:00:00",
            "52.3",
            "4.9",
        );
        // OCR plate  "CX-I80-W" (1→I confusion) → normalized "CXI80W"
        let ocr_r = rec(
            2,
            "CX-I80-W",
            "CAM-A1-001",
            "2025-01-01T10:00:00",
            "52.3",
            "4.9",
        );

        let true_keys: std::collections::HashSet<String> =
            key.extract(&true_r, &schema).into_iter().collect();
        let ocr_keys: std::collections::HashSet<String> =
            key.extract(&ocr_r, &schema).into_iter().collect();

        let shared: Vec<_> = true_keys.intersection(&ocr_keys).collect();
        assert!(
            !shared.is_empty(),
            "true plate and OCR plate must share at least one fuzzy key; true={true_keys:?}, ocr={ocr_keys:?}"
        );
    }

    #[test]
    fn ocr_fuzzy_emits_multiple_variants() {
        let schema = schema();
        let key = PlateOCRFuzzyKey::new("kenteken");
        // "L01A4" has 5 distinct chars → original + 5 deletion keys = 6 distinct keys
        let r = rec(1, "L01A4", "CAM", "2025-01-01T08:00:00", "52.0", "4.0");
        let keys = key.extract(&r, &schema);
        assert_eq!(
            keys.len(),
            6,
            "should emit original + one deletion variant per char; got {keys:?}"
        );
        assert!(
            keys.contains(&"L01A4".to_string()),
            "original key must be present"
        );
        // Deletion-neighbourhood keys: each char removed once
        assert!(
            keys.contains(&"01A4".to_string()),
            "deletion at pos 0 (L) expected"
        );
        assert!(
            keys.contains(&"L0A4".to_string()),
            "deletion at pos 2 (1) expected"
        );
    }

    // ── CameraTimeWindowKey

    #[test]
    fn camera_time_window_same_slot() {
        let schema = schema();
        let key = CameraTimeWindowKey::new("camera_id", "tijdstip", 10);

        let r1 = rec(1, "X", "CAM-A1-001", "2025-06-01T14:02:00", "52.0", "4.0");
        let r2 = rec(2, "Y", "CAM-A1-001", "2025-06-01T14:08:00", "52.0", "4.0");
        // 14:02 → 842 min → slot 84; 14:08 → 848 min → slot 84 (same)
        assert_eq!(key.extract(&r1, &schema), key.extract(&r2, &schema));
    }

    #[test]
    fn camera_time_window_different_slot() {
        let schema = schema();
        let key = CameraTimeWindowKey::new("camera_id", "tijdstip", 10);

        let r1 = rec(1, "X", "CAM-A1-001", "2025-06-01T14:02:00", "52.0", "4.0");
        let r2 = rec(2, "Y", "CAM-A1-001", "2025-06-01T14:12:00", "52.0", "4.0");
        // 14:02 → slot 84; 14:12 → slot 85
        assert_ne!(key.extract(&r1, &schema), key.extract(&r2, &schema));
    }

    // ── GeoGridKey

    #[test]
    fn geo_grid_nearby_records_share_key() {
        let schema = schema();
        let key = GeoGridKey::new("lat", "lon", 0.01);

        let r1 = rec(1, "X", "CAM", "2025-01-01T10:00:00", "52.345", "4.901");
        let r2 = rec(2, "Y", "CAM", "2025-01-01T10:00:00", "52.349", "4.907");
        // Both land in the same 0.01° cell (lat 5234, lon 490)
        assert_eq!(key.extract(&r1, &schema), key.extract(&r2, &schema));
    }

    #[test]
    fn geo_grid_distant_records_differ() {
        let schema = schema();
        let key = GeoGridKey::new("lat", "lon", 0.01);

        let r1 = rec(1, "X", "CAM", "2025-01-01T10:00:00", "52.345", "4.901");
        let r2 = rec(2, "Y", "CAM", "2025-01-01T10:00:00", "51.922", "4.479");
        assert_ne!(key.extract(&r1, &schema), key.extract(&r2, &schema));
    }
}
373}