use zer_core::{
record::{FieldValue, Record},
schema::{FieldKind, SchemaBuilder},
};
use zer_schema::{
fingerprint::SchemaFingerprint,
similarity::{fingerprint_distance, WARM_START_THRESHOLD},
};
const BRP_Q1_CSV: &str = concat!(
env!("CARGO_MANIFEST_DIR"),
"/../../data/examples/brp_q1/brp_persons.csv"
);
const BRP_Q2_CSV: &str = concat!(
env!("CARGO_MANIFEST_DIR"),
"/../../data/examples/brp_q2/brp_persons.csv"
);
const SIM_SNAP1_CSV: &str = concat!(
env!("CARGO_MANIFEST_DIR"),
"/../../data/examples/sim/sim_subscribers.csv"
);
fn load_records(path: &str) -> Vec<Record> {
let mut rdr = csv::Reader::from_path(path)
.unwrap_or_else(|_| panic!("CSV not found at {path}"));
let headers = rdr.headers().unwrap().clone();
let mut records = Vec::new();
let mut id: u64 = 1;
for result in rdr.records() {
let row = result.unwrap();
let mut r = Record::new(id);
for (i, header) in headers.iter().enumerate() {
let v = row.get(i).unwrap_or("").trim();
r = r.insert(
header,
if v.is_empty() {
FieldValue::Null
} else {
FieldValue::Text(v.into())
},
);
}
records.push(r);
id += 1;
}
records
}
fn brp_schema() -> zer_core::schema::Schema {
SchemaBuilder::new()
.field("bsn", FieldKind::Id)
.field("voornamen", FieldKind::Name)
.field("tussenvoegsel", FieldKind::Categorical)
.field("achternaam", FieldKind::Name)
.field("geboortedatum", FieldKind::Date)
.field("geboorteplaats", FieldKind::Categorical)
.field("geboorteland", FieldKind::Categorical)
.field("nationaliteit", FieldKind::Categorical)
.field("geslacht", FieldKind::Categorical)
.field("straatnaam", FieldKind::Address)
.field("huisnummer", FieldKind::Address)
.field("postcode", FieldKind::Id)
.field("woonplaats", FieldKind::Address)
.build()
.unwrap()
}
fn sim_schema() -> zer_core::schema::Schema {
SchemaBuilder::new()
.field("sim_id", FieldKind::Id)
.field("msisdn", FieldKind::Phone)
.field("imsi", FieldKind::Id)
.field("iccid", FieldKind::Id)
.field("carrier", FieldKind::Categorical)
.field("contract_type", FieldKind::Categorical)
.field("activatiedatum", FieldKind::Date)
.field("voornamen", FieldKind::Name)
.field("achternaam", FieldKind::Name)
.field("geboortedatum", FieldKind::Date)
.field("nationaliteit", FieldKind::Categorical)
.field("document_type", FieldKind::Categorical)
.field("document_nummer", FieldKind::Id)
.field("bsn", FieldKind::Id)
.build()
.unwrap()
}
#[test]
fn brp_q1_q2_same_schema_hash() {
let q1 = load_records(BRP_Q1_CSV);
let q2 = load_records(BRP_Q2_CSV);
let schema = brp_schema();
let fp1 = SchemaFingerprint::from_sample(&schema, &q1);
let fp2 = SchemaFingerprint::from_sample(&schema, &q2);
assert_eq!(
fp1.schema_hash, fp2.schema_hash,
"Q1 and Q2 share the same schema, hashes must be identical"
);
assert_eq!(fingerprint_distance(&fp1, &fp2), 0.0);
}
#[test]
fn brp_vs_sim_distance_exceeds_warm_start_threshold() {
let brp_records = load_records(BRP_Q1_CSV);
let sim_records = load_records(SIM_SNAP1_CSV);
let fp_brp = SchemaFingerprint::from_sample(&brp_schema(), &brp_records);
let fp_sim = SchemaFingerprint::from_sample(&sim_schema(), &sim_records);
let dist = fingerprint_distance(&fp_brp, &fp_sim);
assert!(
dist > WARM_START_THRESHOLD,
"BRP vs SIM distance {dist:.4} should exceed warm-start threshold {WARM_START_THRESHOLD}"
);
}
#[test]
fn from_sample_populates_stats_for_brp() {
let records = load_records(BRP_Q1_CSV);
let schema = brp_schema();
let fp = SchemaFingerprint::from_sample(&schema, &records);
assert_eq!(fp.field_stats.len(), schema.len());
assert!(fp.record_count > 1_000, "record_count should reflect CSV size");
let dob = fp.field_stats.iter().find(|f| f.name == "geboortedatum").unwrap();
assert!(
dob.null_rate < 0.1,
"geboortedatum null_rate should be near 0, got {}",
dob.null_rate
);
assert!(
dob.cardinality > 100,
"geboortedatum should have many distinct values"
);
}
#[test]
fn from_sample_top_k_is_ordered_by_frequency() {
let records = load_records(BRP_Q1_CSV);
let schema = brp_schema();
let fp = SchemaFingerprint::from_sample(&schema, &records);
let gender = fp.field_stats.iter().find(|f| f.name == "geslacht").unwrap();
assert!(!gender.top_k.is_empty(), "geslacht top_k must not be empty");
assert!(
gender.cardinality <= 5,
"geslacht should have at most 5 distinct values"
);
}
#[test]
fn brp_extended_schema_warm_start_distance() {
let records = load_records(BRP_Q1_CSV);
let base = brp_schema();
let extended = SchemaBuilder::new()
.field("bsn", FieldKind::Id)
.field("voornamen", FieldKind::Name)
.field("tussenvoegsel", FieldKind::Categorical)
.field("achternaam", FieldKind::Name)
.field("geboortedatum", FieldKind::Date)
.field("geboorteplaats", FieldKind::Categorical)
.field("geboorteland", FieldKind::Categorical)
.field("nationaliteit", FieldKind::Categorical)
.field("geslacht", FieldKind::Categorical)
.field("straatnaam", FieldKind::Address)
.field("huisnummer", FieldKind::Address)
.field("postcode", FieldKind::Id)
.field("woonplaats", FieldKind::Address)
.field("verblijfstitel", FieldKind::Categorical) .build()
.unwrap();
let fp_base = SchemaFingerprint::from_sample(&base, &records);
let fp_ext = SchemaFingerprint::from_schema(&extended);
let dist = fingerprint_distance(&fp_base, &fp_ext);
assert!(
dist > 0.0,
"schemas differ by one field, distance must be > 0"
);
assert!(
dist <= WARM_START_THRESHOLD,
"one extra field should stay within warm-start threshold, got {dist:.4}"
);
}