1use zer_core::schema::{FieldKind, Schema};
2
3use crate::{
4 blocker::CompositeBlocker,
5 keys::{
6 AddressInitialKey, AliasPhoneticKey, BlockingKey, CameraTimeWindowKey, DateFragmentKey,
7 DateGranularity, DocumentSuffixKey, ExactFieldKey, FuzzyYearKey, GeoGridKey,
8 LicensePlateNormKey, PhoneticNameDobInitialKey, PhoneticNameDobKey, PlateOCRFuzzyKey,
9 SuffixKey, TransliteratedPhoneticKey,
10 },
11};
12
/// Predefined dataset categories accepted by
/// `BlockerFactory::from_schema_category`.
///
/// The enum is only a selector: the blocking keys attached to each category
/// are chosen inside the factory, not here.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SchemaCategory {
    /// Person registry data; handled by the generic `BlockerFactory::from_schema` recipe.
    PersonRegistry,
    /// Wanted-person records (name/date-of-birth, alias and document keys).
    WantedPersons,
    /// ANPR passage records (licence-plate, camera/timestamp and GPS keys).
    ANPRPassages,
    /// Call detail records; shares one recipe with [`SchemaCategory::SIMSubscribers`].
    CallDetailRecords,
    /// SIM subscriber records; shares one recipe with [`SchemaCategory::CallDetailRecords`].
    SIMSubscribers,
    /// Financial-intelligence records (id, date and categorical keys).
    FinancialIntelligence,
}
33
34enum CategoryRule {
40 PhoneSuffix(usize),
42 IdSuffix(usize),
44 DocumentSuffix(usize),
46 ExactCategorical,
48 DateFragment(DateGranularity),
50 PhoneticNameDob,
52 PhoneticNameDobInitial,
55 AddressInitial,
57 Custom(Box<dyn BlockingKey>),
59}
60
61pub struct CustomSchemaCategory {
79 rules: Vec<CategoryRule>,
80}
81
82impl CustomSchemaCategory {
83 pub fn new() -> Self {
84 Self { rules: vec![] }
85 }
86
87 pub fn with_phone_suffix(mut self, n: usize) -> Self {
89 self.rules.push(CategoryRule::PhoneSuffix(n));
90 self
91 }
92
93 pub fn with_id_suffix(mut self, n: usize) -> Self {
95 self.rules.push(CategoryRule::IdSuffix(n));
96 self
97 }
98
99 pub fn with_document_suffix(mut self, n: usize) -> Self {
101 self.rules.push(CategoryRule::DocumentSuffix(n));
102 self
103 }
104
105 pub fn with_exact_categorical(mut self) -> Self {
107 self.rules.push(CategoryRule::ExactCategorical);
108 self
109 }
110
111 pub fn with_date_fragment(mut self, granularity: DateGranularity) -> Self {
113 self.rules.push(CategoryRule::DateFragment(granularity));
114 self
115 }
116
117 pub fn with_phonetic_name_dob(mut self) -> Self {
119 self.rules.push(CategoryRule::PhoneticNameDob);
120 self
121 }
122
123 pub fn with_phonetic_name_dob_initial(mut self) -> Self {
126 self.rules.push(CategoryRule::PhoneticNameDobInitial);
127 self
128 }
129
130 pub fn with_address_initial(mut self) -> Self {
132 self.rules.push(CategoryRule::AddressInitial);
133 self
134 }
135
136 pub fn with_key(mut self, key: impl BlockingKey + 'static) -> Self {
138 self.rules.push(CategoryRule::Custom(Box::new(key)));
139 self
140 }
141}
142
143impl Default for CustomSchemaCategory {
144 fn default() -> Self {
145 Self::new()
146 }
147}
148
/// Stateless namespace for the functions that assemble a `CompositeBlocker`
/// from a schema. It carries no data; all constructors are associated
/// functions.
pub struct BlockerFactory;
150
151impl BlockerFactory {
152 pub fn from_schema(schema: &Schema) -> CompositeBlocker {
164 let mut blocker = CompositeBlocker::new();
165
166 let name_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
167 let date_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
168 let addr_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Address).collect();
169 let phone_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Phone).collect();
170 let id_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Id).collect();
171 let cat_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Categorical).collect();
172
173 if let (Some(&surname), Some(&dob)) = (name_fields.last(), date_fields.first()) {
174 if name_fields.len() >= 2 {
175 let first_name = name_fields[0];
176 blocker = blocker.add(PhoneticNameDobInitialKey::new(surname, first_name, dob));
177 } else {
178 blocker = blocker.add(PhoneticNameDobKey::new(surname, dob));
179 }
180 blocker = blocker.add(DateFragmentKey::new(dob, DateGranularity::YearMonth));
182 }
183
184 if let (Some(&first_name), Some(&addr)) = (name_fields.first(), addr_fields.first()) {
185 blocker = blocker.add(AddressInitialKey::new(addr, first_name));
186 }
187
188 if let Some(&phone) = phone_fields.first() {
189 blocker = blocker.add(SuffixKey::new(phone, 7));
190 }
191
192 for &id in &id_fields {
193 blocker = blocker.add(SuffixKey::new(id, 4));
194 }
195
196 if name_fields.is_empty() {
197 if let Some(&dob) = date_fields.first() {
198 blocker = blocker.add(DateFragmentKey::new(dob, DateGranularity::YearMonth));
199 }
200 }
201
202 for &cat in &cat_fields {
203 blocker = blocker.add(ExactFieldKey::new(cat));
204 }
205
206 blocker
207 }
208
209 pub fn from_custom_category(schema: &Schema, cat: CustomSchemaCategory) -> CompositeBlocker {
215 let mut blocker = CompositeBlocker::new();
216
217 for rule in cat.rules {
218 match rule {
219 CategoryRule::PhoneSuffix(n) => {
220 for field in schema.fields_of_kind(FieldKind::Phone) {
221 blocker = blocker.add(SuffixKey::new(field, n));
222 }
223 }
224 CategoryRule::IdSuffix(n) => {
225 for field in schema.fields_of_kind(FieldKind::Id) {
226 blocker = blocker.add(SuffixKey::new(field, n));
227 }
228 }
229 CategoryRule::DocumentSuffix(n) => {
230 for field in schema.fields_of_kind(FieldKind::Id) {
231 blocker = blocker.add(DocumentSuffixKey::new(field, n));
232 }
233 }
234 CategoryRule::ExactCategorical => {
235 for field in schema.fields_of_kind(FieldKind::Categorical) {
236 blocker = blocker.add(ExactFieldKey::new(field));
237 }
238 }
239 CategoryRule::DateFragment(granularity) => {
240 if let Some(field) = schema.fields_of_kind(FieldKind::Date).next() {
241 blocker = blocker.add(DateFragmentKey::new(field, granularity));
242 }
243 }
244 CategoryRule::PhoneticNameDob => {
245 let names: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
246 let dates: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
247 if let (Some(&surname), Some(&dob)) = (names.last(), dates.first()) {
248 blocker = blocker.add(PhoneticNameDobKey::new(surname, dob));
249 }
250 }
251 CategoryRule::PhoneticNameDobInitial => {
252 let names: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
253 let dates: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
254 if let Some(&dob) = dates.first() {
255 if names.len() >= 2 {
256 let first_name = names[0];
257 let surname = names[names.len() - 1];
258 blocker = blocker
259 .add(PhoneticNameDobInitialKey::new(surname, first_name, dob));
260 } else if let Some(&surname) = names.last() {
261 blocker = blocker.add(PhoneticNameDobKey::new(surname, dob));
262 }
263 }
264 }
265 CategoryRule::AddressInitial => {
266 let names: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
267 let addrs: Vec<&str> = schema.fields_of_kind(FieldKind::Address).collect();
268 if let (Some(&first_name), Some(&addr)) = (names.first(), addrs.first()) {
269 blocker = blocker.add(AddressInitialKey::new(addr, first_name));
270 }
271 }
272 CategoryRule::Custom(key) => {
273 blocker = blocker.add_boxed(key);
274 }
275 }
276 }
277
278 blocker
279 }
280
281 fn telecom_blocker(schema: &Schema) -> CompositeBlocker {
282 let mut blocker = CompositeBlocker::new();
283 for f in schema.fields_of_kind(FieldKind::Phone) {
284 blocker = blocker.add(SuffixKey::new(f, 7));
285 }
286 for f in schema.fields_of_kind(FieldKind::Id) {
287 blocker = blocker.add(SuffixKey::new(f, 6));
288 }
289 for f in schema.fields_of_kind(FieldKind::Categorical) {
290 blocker = blocker.add(ExactFieldKey::new(f));
291 }
292 blocker
293 }
294
295 pub fn from_schema_category(schema: &Schema, category: SchemaCategory) -> CompositeBlocker {
300 match category {
301 SchemaCategory::PersonRegistry => Self::from_schema(schema),
302
303 SchemaCategory::WantedPersons => {
304 let mut blocker = CompositeBlocker::new();
305
306 let name_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
307 let date_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
308 let alias_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Alias).collect();
309 let id_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Id).collect();
310
311 if let (Some(&surname), Some(&dob)) = (name_fields.last(), date_fields.first()) {
312 blocker = blocker.add(PhoneticNameDobKey::new(surname, dob));
313 blocker = blocker.add(TransliteratedPhoneticKey::new(surname, dob));
314 blocker = blocker.add(FuzzyYearKey::new(surname, dob, 1));
315 }
316
317 if let Some(&dob) = date_fields.first() {
318 for &alias in &alias_fields {
319 blocker = blocker.add(AliasPhoneticKey::new(alias, dob));
320 }
321 }
322
323 for &id in &id_fields {
324 blocker = blocker.add(DocumentSuffixKey::new(id, 6));
325 }
326
327 blocker
328 }
329
330 SchemaCategory::ANPRPassages => {
331 let mut blocker = CompositeBlocker::new();
332
333 let plate_fields: Vec<&str> =
334 schema.fields_of_kind(FieldKind::LicensePlate).collect();
335 let ts_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Timestamp).collect();
336 let cat_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Categorical).collect();
337 let lat_fields: Vec<&str> =
338 schema.fields_of_kind(FieldKind::GpsCoordinate).collect();
339
340 for &plate in &plate_fields {
341 blocker = blocker.add(LicensePlateNormKey::new(plate));
342 blocker = blocker.add(PlateOCRFuzzyKey::new(plate));
343 }
344
345 if let (Some(&cam), Some(&ts)) = (cat_fields.first(), ts_fields.first()) {
347 blocker = blocker.add(CameraTimeWindowKey::new(cam, ts, 10));
348 }
349
350 if lat_fields.len() >= 2 {
352 blocker = blocker.add(GeoGridKey::new(lat_fields[0], lat_fields[1], 0.01));
353 }
354
355 blocker
356 }
357
358 SchemaCategory::CallDetailRecords | SchemaCategory::SIMSubscribers => {
359 Self::telecom_blocker(schema)
360 }
361
362 SchemaCategory::FinancialIntelligence => {
363 let mut blocker = CompositeBlocker::new();
364
365 let id_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Id).collect();
366 let date_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();
367 let cat_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Categorical).collect();
368
369 for &id in &id_fields {
370 blocker = blocker.add(SuffixKey::new(id, 6));
371 }
372
373 if let Some(&dob) = date_fields.first() {
374 blocker = blocker.add(DateFragmentKey::new(dob, DateGranularity::YearMonth));
375 }
376
377 for &cat in &cat_fields {
378 blocker = blocker.add(ExactFieldKey::new(cat));
379 }
380
381 blocker
382 }
383 }
384 }
385}
386
#[cfg(test)]
mod tests {
    use super::*;
    use crate::index::InvertedIndex;
    use zer_core::{
        record::{FieldValue, Record},
        schema::{FieldKind, SchemaBuilder},
        traits::Blocker,
    };

    /// Wraps a string literal in the text variant of `FieldValue`.
    fn text(value: &'static str) -> FieldValue {
        FieldValue::Text(value.into())
    }

    /// Schema with two name fields, a date, an address and an id field.
    fn person_schema() -> Schema {
        SchemaBuilder::new()
            .field("voornamen", FieldKind::Name)
            .field("achternaam", FieldKind::Name)
            .field("geboortedatum", FieldKind::Date)
            .field("woonplaats", FieldKind::Address)
            .field("postcode", FieldKind::Id)
            .build()
            .unwrap()
    }

    // ---- BlockerFactory::from_schema -----------------------------------

    #[test]
    fn factory_name_date_schema_adds_secondary_year_month_key() {
        let schema = SchemaBuilder::new()
            .field("voornamen", FieldKind::Name)
            .field("achternaam", FieldKind::Name)
            .field("geboortedatum", FieldKind::Date)
            .build()
            .unwrap();
        let blocker = BlockerFactory::from_schema(&schema);

        // Different surnames, same birth year and month.
        let jansen = Record::new(1)
            .insert("achternaam", text("Jansen"))
            .insert("geboortedatum", text("1985-06-15"));
        let pietersen = Record::new(2)
            .insert("achternaam", text("Pietersen"))
            .insert("geboortedatum", text("1985-06-22"));

        let mut idx = InvertedIndex::new();
        for record in [&jansen, &pietersen] {
            blocker.index_record(record, &schema, &mut idx);
        }

        let cands = blocker.candidates(&jansen, &schema, &idx);
        assert!(
            cands.contains(&2),
            "secondary YearMonth key must surface r2 (same birth year-month, different surname)"
        );
    }

    #[test]
    fn factory_produces_non_empty_blocker() {
        let schema = person_schema();
        let record = Record::new(1)
            .insert("achternaam", text("Jansen"))
            .insert("geboortedatum", text("1980-01-15"));

        let keys = BlockerFactory::from_schema(&schema).blocking_keys(&record, &schema);
        assert!(
            !keys.is_empty(),
            "BlockerFactory should produce at least one key"
        );
    }

    #[test]
    fn factory_date_only_schema_uses_date_fragment() {
        let schema = SchemaBuilder::new()
            .field("dob", FieldKind::Date)
            .build()
            .unwrap();
        let blocker = BlockerFactory::from_schema(&schema);
        let record = Record::new(1).insert("dob", text("1990-06-01"));

        let mut idx = InvertedIndex::new();
        blocker.index_record(&record, &schema, &mut idx);
        assert!(!idx.is_empty());
    }

    // ---- BlockerFactory::from_schema_category --------------------------

    #[test]
    fn category_wanted_persons_produces_keys() {
        let schema = SchemaBuilder::new()
            .field("voornamen", FieldKind::Name)
            .field("achternaam", FieldKind::Name)
            .field("alias_namen", FieldKind::Alias)
            .field("geboortedatum", FieldKind::Date)
            .field("document_nummer", FieldKind::Id)
            .build()
            .unwrap();
        let blocker = BlockerFactory::from_schema_category(&schema, SchemaCategory::WantedPersons);
        let record = Record::new(1)
            .insert("achternaam", text("Benabdallah"))
            .insert("geboortedatum", text("1999-06-14"));

        let keys = blocker.blocking_keys(&record, &schema);
        assert!(!keys.is_empty());
    }

    #[test]
    fn category_anpr_produces_plate_keys() {
        let schema = SchemaBuilder::new()
            .field("kenteken", FieldKind::LicensePlate)
            .field("camera_id", FieldKind::Categorical)
            .field("tijdstip", FieldKind::Timestamp)
            .build()
            .unwrap();
        let blocker = BlockerFactory::from_schema_category(&schema, SchemaCategory::ANPRPassages);
        let passage = Record::new(1)
            .insert("kenteken", text("25-XKL-9"))
            .insert("camera_id", text("CAM-A12-001"))
            .insert("tijdstip", text("2025-06-01T10:00:00"));

        let keys = blocker.blocking_keys(&passage, &schema);
        assert!(!keys.is_empty());
        // The plate is expected to show up with its dashes removed.
        assert!(keys.iter().any(|k| k.contains("25XKL9")));
    }

    // ---- BlockerFactory::from_custom_category --------------------------

    #[test]
    fn custom_phone_suffix_extracts_key() {
        let schema = SchemaBuilder::new()
            .field("telefoon", FieldKind::Phone)
            .build()
            .unwrap();
        let recipe = CustomSchemaCategory::new().with_phone_suffix(7);
        let blocker = BlockerFactory::from_custom_category(&schema, recipe);
        let record = Record::new(1).insert("telefoon", text("0612345678"));

        let keys = blocker.blocking_keys(&record, &schema);
        assert!(!keys.is_empty(), "phone suffix rule must produce a key");
        assert!(
            keys.iter().any(|k| k.ends_with("2345678")),
            "key must end with last 7 digits"
        );
    }

    #[test]
    fn custom_id_suffix_correct_length() {
        let schema = SchemaBuilder::new()
            .field("postcode", FieldKind::Id)
            .build()
            .unwrap();
        let recipe = CustomSchemaCategory::new().with_id_suffix(4);
        let blocker = BlockerFactory::from_custom_category(&schema, recipe);
        let record = Record::new(1).insert("postcode", text("1011AB"));

        let keys = blocker.blocking_keys(&record, &schema);
        assert!(
            keys.iter().any(|k| k.ends_with("1011")),
            "id suffix must be 4 digits: {keys:?}"
        );
    }

    #[test]
    fn custom_exact_categorical_matches_only_same_value() {
        let schema = SchemaBuilder::new()
            .field("tussenvoegsel", FieldKind::Categorical)
            .build()
            .unwrap();
        let recipe = CustomSchemaCategory::new().with_exact_categorical();
        let blocker = BlockerFactory::from_custom_category(&schema, recipe);

        let van_a = Record::new(1).insert("tussenvoegsel", text("van"));
        let van_b = Record::new(2).insert("tussenvoegsel", text("van"));
        let de = Record::new(3).insert("tussenvoegsel", text("de"));

        let mut idx = InvertedIndex::new();
        for record in [&van_a, &van_b, &de] {
            blocker.index_record(record, &schema, &mut idx);
        }

        let cands = blocker.candidates(&van_a, &schema, &idx);
        assert!(
            cands.contains(&2),
            "r2 (same tussenvoegsel) must be a candidate"
        );
        assert!(
            !cands.contains(&3),
            "r3 (different tussenvoegsel) must NOT be a candidate"
        );
    }

    #[test]
    fn custom_date_fragment_produces_year_month_key() {
        let schema = SchemaBuilder::new()
            .field("geboortedatum", FieldKind::Date)
            .build()
            .unwrap();
        let recipe = CustomSchemaCategory::new().with_date_fragment(DateGranularity::YearMonth);
        let blocker = BlockerFactory::from_custom_category(&schema, recipe);
        let record = Record::new(1).insert("geboortedatum", text("1990-06-15"));

        let keys = blocker.blocking_keys(&record, &schema);
        assert!(
            keys.iter().any(|k| k.contains("1990-06")),
            "key must contain YYYY-MM: {keys:?}"
        );
    }

    #[test]
    fn custom_phonetic_name_dob_links_same_person() {
        let schema = SchemaBuilder::new()
            .field("voornamen", FieldKind::Name)
            .field("achternaam", FieldKind::Name)
            .field("geboortedatum", FieldKind::Date)
            .build()
            .unwrap();
        let recipe = CustomSchemaCategory::new().with_phonetic_name_dob();
        let blocker = BlockerFactory::from_custom_category(&schema, recipe);

        // Records 1 and 2 agree on surname and date of birth; record 3 differs in both.
        let first = Record::new(1)
            .insert("achternaam", text("Jansen"))
            .insert("geboortedatum", text("1978-03-15"));
        let duplicate = Record::new(2)
            .insert("achternaam", text("Jansen"))
            .insert("geboortedatum", text("1978-03-15"));
        let unrelated = Record::new(3)
            .insert("achternaam", text("de Wit"))
            .insert("geboortedatum", text("1990-07-22"));

        let mut idx = InvertedIndex::new();
        for record in [&first, &duplicate, &unrelated] {
            blocker.index_record(record, &schema, &mut idx);
        }

        let cands = blocker.candidates(&first, &schema, &idx);
        assert!(cands.contains(&2), "same surname+DOB must be a candidate");
        assert!(
            !cands.contains(&3),
            "different surname+DOB must NOT be a candidate"
        );
    }

    #[test]
    fn custom_missing_field_kind_produces_no_panic() {
        // The recipe asks for phone fields, but the schema has none.
        let schema = SchemaBuilder::new()
            .field("achternaam", FieldKind::Name)
            .build()
            .unwrap();
        let recipe = CustomSchemaCategory::new().with_phone_suffix(7);
        let blocker = BlockerFactory::from_custom_category(&schema, recipe);
        let record = Record::new(1).insert("achternaam", text("Jansen"));

        let keys = blocker.blocking_keys(&record, &schema);
        assert!(keys.is_empty(), "no Phone fields → no keys, no panic");
    }

    #[test]
    fn custom_escape_hatch_with_key_works() {
        let schema = SchemaBuilder::new()
            .field("postcode", FieldKind::Id)
            .build()
            .unwrap();
        let recipe = CustomSchemaCategory::new().with_key(SuffixKey::new("postcode", 4));
        let blocker = BlockerFactory::from_custom_category(&schema, recipe);
        let record = Record::new(1).insert("postcode", text("1011AB"));

        let keys = blocker.blocking_keys(&record, &schema);
        assert!(
            !keys.is_empty(),
            "escape-hatch key must produce at least one key"
        );
    }

    #[test]
    fn custom_combined_rules_produce_multiple_key_types() {
        let schema = SchemaBuilder::new()
            .field("voornamen", FieldKind::Name)
            .field("achternaam", FieldKind::Name)
            .field("geboortedatum", FieldKind::Date)
            .field("postcode", FieldKind::Id)
            .field("tussenvoegsel", FieldKind::Categorical)
            .build()
            .unwrap();
        let recipe = CustomSchemaCategory::new()
            .with_phonetic_name_dob()
            .with_id_suffix(4)
            .with_exact_categorical();
        let blocker = BlockerFactory::from_custom_category(&schema, recipe);
        let record = Record::new(1)
            .insert("achternaam", text("van den Berg"))
            .insert("geboortedatum", text("1978-03-15"))
            .insert("postcode", text("1011AB"))
            .insert("tussenvoegsel", text("van den"));

        // One key per rule is the minimum.
        let keys = blocker.blocking_keys(&record, &schema);
        assert!(
            keys.len() >= 3,
            "combined rules must produce at least 3 keys: {keys:?}"
        );
    }
}