use std::collections::HashMap;
use csv;
use zer_blocking::{BlockerFactory, CustomSchemaCategory, InvertedIndex};
use zer_core::{
record::Record,
schema::{FieldKind, Schema, SchemaBuilder},
traits::Blocker,
};
const KVK_CSV: &str = concat!(
env!("CARGO_MANIFEST_DIR"),
"/../../data/tests/kvk/kvk_director_flat.csv"
);
const GT_CSV: &str = concat!(
env!("CARGO_MANIFEST_DIR"),
"/../../data/tests/kvk/ground_truth_pairs.csv"
);
fn kvk_schema() -> Schema {
SchemaBuilder::new()
.field("voornamen", FieldKind::Name)
.field("achternaam", FieldKind::Name)
.field("tussenvoegsel", FieldKind::Categorical)
.field("geboortedatum", FieldKind::Date)
.field("woonplaats", FieldKind::Address)
.field("straatnaam", FieldKind::Address)
.field("postcode", FieldKind::Id)
.field("kvkNummer", FieldKind::Id)
.build()
.unwrap()
}
fn load_kvk_records() -> HashMap<u64, Record> {
let mut rdr = csv::Reader::from_path(KVK_CSV).expect("KvK CSV not found, run data generator first");
let headers_row = rdr.headers().unwrap().clone();
let col = |name: &str| headers_row.iter().position(|h| h == name);
let c_kvk = col("kvkNummer").unwrap();
let c_voor = col("voornamen").unwrap();
let c_tuss = col("tussenvoegsel").unwrap();
let c_ach = col("achternaam").unwrap();
let c_dob = col("geboortedatum").unwrap();
let c_woon = col("woonplaats").unwrap();
let c_str = col("straatnaam").unwrap();
let c_post = col("postcode").unwrap();
let mut records = HashMap::new();
for result in rdr.records() {
let row = result.unwrap();
let kvk: u64 = row[c_kvk].parse().unwrap();
let opt = |idx: usize| -> Option<&str> {
let v = row[idx].trim();
if v.is_empty() { None } else { Some(v) }
};
let r = Record::new(kvk).with_source("kvk")
.insert("voornamen", opt(c_voor))
.insert("tussenvoegsel", opt(c_tuss))
.insert("achternaam", opt(c_ach))
.insert("geboortedatum", opt(c_dob))
.insert("woonplaats", opt(c_woon))
.insert("straatnaam", opt(c_str))
.insert("postcode", opt(c_post))
.insert("kvkNummer", row[c_kvk].trim());
records.insert(kvk, r);
}
records
}
fn load_true_pairs() -> Vec<(u64, u64)> {
let mut rdr = csv::Reader::from_path(GT_CSV).expect("Ground truth CSV not found");
let mut pairs = vec![];
for result in rdr.records() {
let row = result.unwrap();
if row.get(2).unwrap_or("") == "True" {
let a: u64 = row[0].parse().unwrap();
let b: u64 = row[1].parse().unwrap();
pairs.push((a, b));
}
}
pairs
}
#[test]
fn blocking_recall_kvk_above_threshold() {
let schema = kvk_schema();
let records = load_kvk_records();
let true_pairs = load_true_pairs();
assert!(!true_pairs.is_empty(), "Ground truth must not be empty");
let blocker = BlockerFactory::from_schema(&schema);
let mut idx = InvertedIndex::new();
for record in records.values() {
blocker.index_record(record, &schema, &mut idx);
}
let mut found = 0usize;
for (a_id, b_id) in &true_pairs {
if let Some(rec_a) = records.get(a_id) {
let candidates = blocker.candidates(rec_a, &schema, &idx);
if candidates.contains(b_id) {
found += 1;
}
}
}
let recall = found as f64 / true_pairs.len() as f64;
println!(
"KvK blocking recall: {:.4} ({}/{} pairs found)",
recall, found, true_pairs.len()
);
assert!(
recall >= 0.97,
"Blocking recall {:.4} is below the 0.97 target ({}/{} pairs found)",
recall, found, true_pairs.len()
);
}
#[test]
fn blocking_reduction_ratio_kvk() {
let schema = kvk_schema();
let records = load_kvk_records();
let n = records.len() as f64;
let blocker = BlockerFactory::from_schema(&schema);
let mut idx = InvertedIndex::new();
for record in records.values() {
blocker.index_record(record, &schema, &mut idx);
}
let total_candidates: usize = records
.values()
.map(|r| blocker.candidates(r, &schema, &idx).len())
.sum::<usize>()
/ 2;
let all_possible = (n * (n - 1.0) / 2.0) as usize;
let reduction = 1.0 - total_candidates as f64 / all_possible as f64;
println!(
"KvK reduction ratio: {:.4} ({} candidate pairs from {} possible)",
reduction, total_candidates, all_possible
);
assert!(
reduction >= 0.97,
"Reduction ratio {:.4} is below the 0.97 threshold on synthetic data",
reduction
);
}
#[test]
fn blocking_recall_custom_category_kvk_above_threshold() {
let schema = kvk_schema();
let records = load_kvk_records();
let true_pairs = load_true_pairs();
assert!(!true_pairs.is_empty(), "Ground truth must not be empty");
let category = CustomSchemaCategory::new()
.with_phonetic_name_dob()
.with_address_initial()
.with_id_suffix(4)
.with_exact_categorical();
let blocker = BlockerFactory::from_custom_category(&schema, category);
let mut idx = InvertedIndex::new();
for record in records.values() {
blocker.index_record(record, &schema, &mut idx);
}
let mut found = 0usize;
for (a_id, b_id) in &true_pairs {
if let Some(rec_a) = records.get(a_id) {
let candidates = blocker.candidates(rec_a, &schema, &idx);
if candidates.contains(b_id) {
found += 1;
}
}
}
let recall = found as f64 / true_pairs.len() as f64;
println!(
"KvK custom-category blocking recall: {:.4} ({}/{} pairs found)",
recall, found, true_pairs.len()
);
assert!(
recall >= 0.97,
"Custom-category recall {:.4} is below the 0.97 target ({}/{} pairs found)",
recall, found, true_pairs.len()
);
}
#[test]
fn no_self_candidates_custom_category_kvk() {
let schema = kvk_schema();
let records = load_kvk_records();
let category = CustomSchemaCategory::new().with_id_suffix(4);
let blocker = BlockerFactory::from_custom_category(&schema, category);
let mut idx = InvertedIndex::new();
for record in records.values() {
blocker.index_record(record, &schema, &mut idx);
}
for record in records.values() {
let cands = blocker.candidates(record, &schema, &idx);
assert!(
!cands.contains(&record.id),
"Record {} appears in its own candidate list (custom category)",
record.id
);
}
}
#[test]
fn no_self_candidates_kvk() {
let schema = kvk_schema();
let records = load_kvk_records();
let blocker = BlockerFactory::from_schema(&schema);
let mut idx = InvertedIndex::new();
for record in records.values() {
blocker.index_record(record, &schema, &mut idx);
}
for record in records.values() {
let cands = blocker.candidates(record, &schema, &idx);
assert!(
!cands.contains(&record.id),
"Record {} appears in its own candidate list",
record.id
);
}
}