1use zer_core::schema::{FieldKind, Schema};
2
3use crate::{
4 blocker::CompositeBlocker,
5 keys::{
6 AddressInitialKey, AliasPhoneticKey, CameraTimeWindowKey, DateFragmentKey,
7 DateGranularity, DocumentSuffixKey, ExactFieldKey, FuzzyYearKey, GeoGridKey,
8 LicensePlateNormKey, PhoneticNameDobInitialKey, PhoneticNameDobKey, PlateOCRFuzzyKey,
9 SuffixKey, TransliteratedPhoneticKey, BlockingKey,
10 },
11};
12
/// Predefined dataset categories accepted by [`BlockerFactory::from_schema_category`].
///
/// Each variant selects a fixed combination of blocking keys. The type is a plain
/// fieldless enum, so it is cheap to copy and can be used as a hash-map or hash-set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SchemaCategory {
    /// Uses exactly the key selection of [`BlockerFactory::from_schema`].
    PersonRegistry,
    /// Surname/date keys (phonetic, transliterated and fuzzy-year variants), alias/date
    /// keys and document-suffix keys over the `Id` fields.
    WantedPersons,
    /// License-plate keys, a camera/time-window key and a geo-grid key.
    ANPRPassages,
    /// Phone-suffix, id-suffix and exact categorical keys (same selection as
    /// [`SchemaCategory::SIMSubscribers`]).
    CallDetailRecords,
    /// Phone-suffix, id-suffix and exact categorical keys (same selection as
    /// [`SchemaCategory::CallDetailRecords`]).
    SIMSubscribers,
    /// Id-suffix keys, a year-month date fragment key and exact categorical keys.
    FinancialIntelligence,
}
33
34enum CategoryRule {
40 PhoneSuffix(usize),
42 IdSuffix(usize),
44 DocumentSuffix(usize),
46 ExactCategorical,
48 DateFragment(DateGranularity),
50 PhoneticNameDob,
52 PhoneticNameDobInitial,
55 AddressInitial,
57 Custom(Box<dyn BlockingKey>),
59}
60
61pub struct CustomSchemaCategory {
79 rules: Vec<CategoryRule>,
80}
81
82impl CustomSchemaCategory {
83 pub fn new() -> Self {
84 Self { rules: vec![] }
85 }
86
87 pub fn with_phone_suffix(mut self, n: usize) -> Self {
89 self.rules.push(CategoryRule::PhoneSuffix(n));
90 self
91 }
92
93 pub fn with_id_suffix(mut self, n: usize) -> Self {
95 self.rules.push(CategoryRule::IdSuffix(n));
96 self
97 }
98
99 pub fn with_document_suffix(mut self, n: usize) -> Self {
101 self.rules.push(CategoryRule::DocumentSuffix(n));
102 self
103 }
104
105 pub fn with_exact_categorical(mut self) -> Self {
107 self.rules.push(CategoryRule::ExactCategorical);
108 self
109 }
110
111 pub fn with_date_fragment(mut self, granularity: DateGranularity) -> Self {
113 self.rules.push(CategoryRule::DateFragment(granularity));
114 self
115 }
116
117 pub fn with_phonetic_name_dob(mut self) -> Self {
119 self.rules.push(CategoryRule::PhoneticNameDob);
120 self
121 }
122
123 pub fn with_phonetic_name_dob_initial(mut self) -> Self {
126 self.rules.push(CategoryRule::PhoneticNameDobInitial);
127 self
128 }
129
130 pub fn with_address_initial(mut self) -> Self {
132 self.rules.push(CategoryRule::AddressInitial);
133 self
134 }
135
136 pub fn with_key(mut self, key: impl BlockingKey + 'static) -> Self {
138 self.rules.push(CategoryRule::Custom(Box::new(key)));
139 self
140 }
141}
142
143impl Default for CustomSchemaCategory {
144 fn default() -> Self {
145 Self::new()
146 }
147}
148
/// Stateless namespace for the functions that assemble a `CompositeBlocker` from a schema.
///
/// The type carries no data; all constructors are associated functions.
pub struct BlockerFactory;
150
151impl BlockerFactory {
152 pub fn from_schema(schema: &Schema) -> CompositeBlocker {
164 let mut blocker = CompositeBlocker::new();
165
166 let name_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
167 let date_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
168 let addr_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Address).collect();
169 let phone_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Phone).collect();
170 let id_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Id).collect();
171 let cat_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Categorical).collect();
172
173 if let (Some(&surname), Some(&dob)) = (name_fields.last(), date_fields.first()) {
174 if name_fields.len() >= 2 {
175 let first_name = name_fields[0];
176 blocker = blocker.add(PhoneticNameDobInitialKey::new(surname, first_name, dob));
177 } else {
178 blocker = blocker.add(PhoneticNameDobKey::new(surname, dob));
179 }
180 blocker = blocker.add(DateFragmentKey::new(dob, DateGranularity::YearMonth));
182 }
183
184 if let (Some(&first_name), Some(&addr)) = (name_fields.first(), addr_fields.first()) {
185 blocker = blocker.add(AddressInitialKey::new(addr, first_name));
186 }
187
188 if let Some(&phone) = phone_fields.first() {
189 blocker = blocker.add(SuffixKey::new(phone, 7));
190 }
191
192 for &id in &id_fields {
193 blocker = blocker.add(SuffixKey::new(id, 4));
194 }
195
196 if name_fields.is_empty() {
197 if let Some(&dob) = date_fields.first() {
198 blocker = blocker.add(DateFragmentKey::new(dob, DateGranularity::YearMonth));
199 }
200 }
201
202 for &cat in &cat_fields {
203 blocker = blocker.add(ExactFieldKey::new(cat));
204 }
205
206 blocker
207 }
208
209 pub fn from_custom_category(schema: &Schema, cat: CustomSchemaCategory) -> CompositeBlocker {
215 let mut blocker = CompositeBlocker::new();
216
217 for rule in cat.rules {
218 match rule {
219 CategoryRule::PhoneSuffix(n) => {
220 for field in schema.fields_of_kind(FieldKind::Phone) {
221 blocker = blocker.add(SuffixKey::new(field, n));
222 }
223 }
224 CategoryRule::IdSuffix(n) => {
225 for field in schema.fields_of_kind(FieldKind::Id) {
226 blocker = blocker.add(SuffixKey::new(field, n));
227 }
228 }
229 CategoryRule::DocumentSuffix(n) => {
230 for field in schema.fields_of_kind(FieldKind::Id) {
231 blocker = blocker.add(DocumentSuffixKey::new(field, n));
232 }
233 }
234 CategoryRule::ExactCategorical => {
235 for field in schema.fields_of_kind(FieldKind::Categorical) {
236 blocker = blocker.add(ExactFieldKey::new(field));
237 }
238 }
239 CategoryRule::DateFragment(granularity) => {
240 if let Some(field) = schema.fields_of_kind(FieldKind::Date).next() {
241 blocker = blocker.add(DateFragmentKey::new(field, granularity));
242 }
243 }
244 CategoryRule::PhoneticNameDob => {
245 let names: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
246 let dates: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
247 if let (Some(&surname), Some(&dob)) = (names.last(), dates.first()) {
248 blocker = blocker.add(PhoneticNameDobKey::new(surname, dob));
249 }
250 }
251 CategoryRule::PhoneticNameDobInitial => {
252 let names: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
253 let dates: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
254 if let Some(&dob) = dates.first() {
255 if names.len() >= 2 {
256 let first_name = names[0];
257 let surname = names[names.len() - 1];
258 blocker = blocker.add(PhoneticNameDobInitialKey::new(surname, first_name, dob));
259 } else if let Some(&surname) = names.last() {
260 blocker = blocker.add(PhoneticNameDobKey::new(surname, dob));
261 }
262 }
263 }
264 CategoryRule::AddressInitial => {
265 let names: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
266 let addrs: Vec<&str> = schema.fields_of_kind(FieldKind::Address).collect();
267 if let (Some(&first_name), Some(&addr)) = (names.first(), addrs.first()) {
268 blocker = blocker.add(AddressInitialKey::new(addr, first_name));
269 }
270 }
271 CategoryRule::Custom(key) => {
272 blocker = blocker.add_boxed(key);
273 }
274 }
275 }
276
277 blocker
278 }
279
280 fn telecom_blocker(schema: &Schema) -> CompositeBlocker {
281 let mut blocker = CompositeBlocker::new();
282 for f in schema.fields_of_kind(FieldKind::Phone) { blocker = blocker.add(SuffixKey::new(f, 7)); }
283 for f in schema.fields_of_kind(FieldKind::Id) { blocker = blocker.add(SuffixKey::new(f, 6)); }
284 for f in schema.fields_of_kind(FieldKind::Categorical) { blocker = blocker.add(ExactFieldKey::new(f)); }
285 blocker
286 }
287
288 pub fn from_schema_category(schema: &Schema, category: SchemaCategory) -> CompositeBlocker {
293 match category {
294 SchemaCategory::PersonRegistry => Self::from_schema(schema),
295
296 SchemaCategory::WantedPersons => {
297 let mut blocker = CompositeBlocker::new();
298
299 let name_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
300 let date_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
301 let alias_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Alias).collect();
302 let id_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Id).collect();
303
304 if let (Some(&surname), Some(&dob)) = (name_fields.last(), date_fields.first()) {
305 blocker = blocker.add(PhoneticNameDobKey::new(surname, dob));
306 blocker = blocker.add(TransliteratedPhoneticKey::new(surname, dob));
307 blocker = blocker.add(FuzzyYearKey::new(surname, dob, 1));
308 }
309
310 if let Some(&dob) = date_fields.first() {
311 for &alias in &alias_fields {
312 blocker = blocker.add(AliasPhoneticKey::new(alias, dob));
313 }
314 }
315
316 for &id in &id_fields {
317 blocker = blocker.add(DocumentSuffixKey::new(id, 6));
318 }
319
320 blocker
321 }
322
323 SchemaCategory::ANPRPassages => {
324 let mut blocker = CompositeBlocker::new();
325
326 let plate_fields: Vec<&str> = schema.fields_of_kind(FieldKind::LicensePlate).collect();
327 let ts_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Timestamp).collect();
328 let cat_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Categorical).collect();
329 let lat_fields: Vec<&str> = schema.fields_of_kind(FieldKind::GpsCoordinate).collect();
330
331 for &plate in &plate_fields {
332 blocker = blocker.add(LicensePlateNormKey::new(plate));
333 blocker = blocker.add(PlateOCRFuzzyKey::new(plate));
334 }
335
336 if let (Some(&cam), Some(&ts)) = (cat_fields.first(), ts_fields.first()) {
338 blocker = blocker.add(CameraTimeWindowKey::new(cam, ts, 10));
339 }
340
341 if lat_fields.len() >= 2 {
343 blocker = blocker.add(GeoGridKey::new(lat_fields[0], lat_fields[1], 0.01));
344 }
345
346 blocker
347 }
348
349 SchemaCategory::CallDetailRecords |
350 SchemaCategory::SIMSubscribers => Self::telecom_blocker(schema),
351
352 SchemaCategory::FinancialIntelligence => {
353 let mut blocker = CompositeBlocker::new();
354
355 let id_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Id).collect();
356 let date_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
357 let cat_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Categorical).collect();
358
359 for &id in &id_fields {
360 blocker = blocker.add(SuffixKey::new(id, 6));
361 }
362
363 if let Some(&dob) = date_fields.first() {
364 blocker = blocker.add(DateFragmentKey::new(dob, DateGranularity::YearMonth));
365 }
366
367 for &cat in &cat_fields {
368 blocker = blocker.add(ExactFieldKey::new(cat));
369 }
370
371 blocker
372 }
373 }
374 }
375}
376
#[cfg(test)]
mod tests {
    use super::*;
    use crate::index::InvertedIndex;
    use zer_core::{
        record::{FieldValue, Record},
        schema::{FieldKind, SchemaBuilder},
        traits::Blocker,
    };

    /// Expands to `SchemaBuilder::new().field(..)….build().unwrap()` for the listed
    /// `name => Kind` pairs, in the order given.
    macro_rules! schema_of {
        ($($name:expr => $kind:ident),+ $(,)?) => {
            SchemaBuilder::new()
                $(.field($name, FieldKind::$kind))+
                .build()
                .unwrap()
        };
    }

    /// Expands to `Record::new(id).insert(..)…` where every listed field holds a
    /// `FieldValue::Text` value.
    macro_rules! text_record {
        ($id:expr $(, $field:expr => $value:expr)* $(,)?) => {
            Record::new($id)
                $(.insert($field, FieldValue::Text($value.into())))*
        };
    }

    /// Schema with two name fields, a date, an address and an id field.
    fn person_schema() -> Schema {
        schema_of! {
            "voornamen" => Name,
            "achternaam" => Name,
            "geboortedatum" => Date,
            "woonplaats" => Address,
            "postcode" => Id,
        }
    }

    #[test]
    fn factory_name_date_schema_adds_secondary_year_month_key() {
        let schema = schema_of! {
            "voornamen" => Name,
            "achternaam" => Name,
            "geboortedatum" => Date,
        };
        let blocker = BlockerFactory::from_schema(&schema);

        let jansen = text_record!(1, "achternaam" => "Jansen", "geboortedatum" => "1985-06-15");
        let pietersen =
            text_record!(2, "achternaam" => "Pietersen", "geboortedatum" => "1985-06-22");

        let mut index = InvertedIndex::new();
        for record in [&jansen, &pietersen] {
            blocker.index_record(record, &schema, &mut index);
        }

        let candidates = blocker.candidates(&jansen, &schema, &index);
        assert!(
            candidates.contains(&2),
            "secondary YearMonth key must surface r2 (same birth year-month, different surname)"
        );
    }

    #[test]
    fn factory_produces_non_empty_blocker() {
        let schema = person_schema();
        let blocker = BlockerFactory::from_schema(&schema);
        let record = text_record!(1, "achternaam" => "Jansen", "geboortedatum" => "1980-01-15");

        let produced = blocker.blocking_keys(&record, &schema);
        assert!(!produced.is_empty(), "BlockerFactory should produce at least one key");
    }

    #[test]
    fn factory_date_only_schema_uses_date_fragment() {
        let schema = schema_of!("dob" => Date);
        let blocker = BlockerFactory::from_schema(&schema);
        let record = text_record!(1, "dob" => "1990-06-01");

        let mut index = InvertedIndex::new();
        blocker.index_record(&record, &schema, &mut index);
        assert!(!index.is_empty());
    }

    #[test]
    fn category_wanted_persons_produces_keys() {
        let schema = schema_of! {
            "voornamen" => Name,
            "achternaam" => Name,
            "alias_namen" => Alias,
            "geboortedatum" => Date,
            "document_nummer" => Id,
        };
        let blocker = BlockerFactory::from_schema_category(&schema, SchemaCategory::WantedPersons);
        let record =
            text_record!(1, "achternaam" => "Benabdallah", "geboortedatum" => "1999-06-14");

        let produced = blocker.blocking_keys(&record, &schema);
        assert!(!produced.is_empty());
    }

    #[test]
    fn category_anpr_produces_plate_keys() {
        let schema = schema_of! {
            "kenteken" => LicensePlate,
            "camera_id" => Categorical,
            "tijdstip" => Timestamp,
        };
        let blocker = BlockerFactory::from_schema_category(&schema, SchemaCategory::ANPRPassages);
        let passage = text_record!(
            1,
            "kenteken" => "25-XKL-9",
            "camera_id" => "CAM-A12-001",
            "tijdstip" => "2025-06-01T10:00:00",
        );

        let produced = blocker.blocking_keys(&passage, &schema);
        assert!(!produced.is_empty());
        assert!(produced.iter().any(|k| k.contains("25XKL9")));
    }

    #[test]
    fn custom_phone_suffix_extracts_key() {
        let schema = schema_of!("telefoon" => Phone);
        let category = CustomSchemaCategory::new().with_phone_suffix(7);
        let blocker = BlockerFactory::from_custom_category(&schema, category);
        let record = text_record!(1, "telefoon" => "0612345678");

        let produced = blocker.blocking_keys(&record, &schema);
        assert!(!produced.is_empty(), "phone suffix rule must produce a key");
        assert!(
            produced.iter().any(|k| k.ends_with("2345678")),
            "key must end with last 7 digits"
        );
    }

    #[test]
    fn custom_id_suffix_correct_length() {
        let schema = schema_of!("postcode" => Id);
        let category = CustomSchemaCategory::new().with_id_suffix(4);
        let blocker = BlockerFactory::from_custom_category(&schema, category);
        let record = text_record!(1, "postcode" => "1011AB");

        let keys = blocker.blocking_keys(&record, &schema);
        assert!(
            keys.iter().any(|k| k.ends_with("1011")),
            "id suffix must be 4 digits: {keys:?}"
        );
    }

    #[test]
    fn custom_exact_categorical_matches_only_same_value() {
        let schema = schema_of!("tussenvoegsel" => Categorical);
        let category = CustomSchemaCategory::new().with_exact_categorical();
        let blocker = BlockerFactory::from_custom_category(&schema, category);

        let probe = text_record!(1, "tussenvoegsel" => "van");
        let same_value = text_record!(2, "tussenvoegsel" => "van");
        let other_value = text_record!(3, "tussenvoegsel" => "de");

        let mut index = InvertedIndex::new();
        for record in [&probe, &same_value, &other_value] {
            blocker.index_record(record, &schema, &mut index);
        }

        let candidates = blocker.candidates(&probe, &schema, &index);
        assert!(candidates.contains(&2), "r2 (same tussenvoegsel) must be a candidate");
        assert!(
            !candidates.contains(&3),
            "r3 (different tussenvoegsel) must NOT be a candidate"
        );
    }

    #[test]
    fn custom_date_fragment_produces_year_month_key() {
        let schema = schema_of!("geboortedatum" => Date);
        let category = CustomSchemaCategory::new().with_date_fragment(DateGranularity::YearMonth);
        let blocker = BlockerFactory::from_custom_category(&schema, category);
        let record = text_record!(1, "geboortedatum" => "1990-06-15");

        let keys = blocker.blocking_keys(&record, &schema);
        assert!(
            keys.iter().any(|k| k.contains("1990-06")),
            "key must contain YYYY-MM: {keys:?}"
        );
    }

    #[test]
    fn custom_phonetic_name_dob_links_same_person() {
        let schema = schema_of! {
            "voornamen" => Name,
            "achternaam" => Name,
            "geboortedatum" => Date,
        };
        let category = CustomSchemaCategory::new().with_phonetic_name_dob();
        let blocker = BlockerFactory::from_custom_category(&schema, category);

        let probe = text_record!(1, "achternaam" => "Jansen", "geboortedatum" => "1978-03-15");
        let duplicate =
            text_record!(2, "achternaam" => "Jansen", "geboortedatum" => "1978-03-15");
        let stranger = text_record!(3, "achternaam" => "de Wit", "geboortedatum" => "1990-07-22");

        let mut index = InvertedIndex::new();
        for record in [&probe, &duplicate, &stranger] {
            blocker.index_record(record, &schema, &mut index);
        }

        let candidates = blocker.candidates(&probe, &schema, &index);
        assert!(candidates.contains(&2), "same surname+DOB must be a candidate");
        assert!(
            !candidates.contains(&3),
            "different surname+DOB must NOT be a candidate"
        );
    }

    #[test]
    fn custom_missing_field_kind_produces_no_panic() {
        let schema = schema_of!("achternaam" => Name);
        let category = CustomSchemaCategory::new().with_phone_suffix(7);
        let blocker = BlockerFactory::from_custom_category(&schema, category);
        let record = text_record!(1, "achternaam" => "Jansen");

        let produced = blocker.blocking_keys(&record, &schema);
        assert!(produced.is_empty(), "no Phone fields → no keys, no panic");
    }

    #[test]
    fn custom_escape_hatch_with_key_works() {
        let schema = schema_of!("postcode" => Id);
        let category = CustomSchemaCategory::new().with_key(SuffixKey::new("postcode", 4));
        let blocker = BlockerFactory::from_custom_category(&schema, category);
        let record = text_record!(1, "postcode" => "1011AB");

        let produced = blocker.blocking_keys(&record, &schema);
        assert!(
            !produced.is_empty(),
            "escape-hatch key must produce at least one key"
        );
    }

    #[test]
    fn custom_combined_rules_produce_multiple_key_types() {
        let schema = schema_of! {
            "voornamen" => Name,
            "achternaam" => Name,
            "geboortedatum" => Date,
            "postcode" => Id,
            "tussenvoegsel" => Categorical,
        };
        let category = CustomSchemaCategory::new()
            .with_phonetic_name_dob()
            .with_id_suffix(4)
            .with_exact_categorical();
        let blocker = BlockerFactory::from_custom_category(&schema, category);
        let record = text_record!(
            1,
            "achternaam" => "van den Berg",
            "geboortedatum" => "1978-03-15",
            "postcode" => "1011AB",
            "tussenvoegsel" => "van den",
        );

        let keys = blocker.blocking_keys(&record, &schema);
        assert!(
            keys.len() >= 3,
            "combined rules must produce at least 3 keys: {keys:?}"
        );
    }
}