use zer_core::{
record::{FieldValue, Record},
schema::{FieldKind, Schema},
};
/// Configuration for generating synthetic near-duplicate record pairs.
///
/// Each generated pair consists of a copy of a source record followed by a
/// lightly perturbed variant of that same record, so the output holds
/// `2 * pair_count` records.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NearDuplicateGenerator {
    /// Number of (copy, variant) pairs to generate.
    pub pair_count: usize,
    /// ID given to the first generated record; later records count upwards
    /// from it, one ID per record.
    pub id_offset: u64,
}
impl NearDuplicateGenerator {
    /// Produces `pair_count` pairs of synthetic records derived from `source`.
    ///
    /// Source records are consumed round-robin, so `source` may be shorter
    /// than `pair_count`. For pair `i` the output contains, in this order:
    ///
    /// 1. a copy with ID `id_offset + 2 * i` holding every schema field for
    ///    which the source record has a value;
    /// 2. a variant with ID `id_offset + 2 * i + 1` in which `Name` fields are
    ///    passed through `perturb_name`, `Date` fields through `perturb_date`,
    ///    all other fields are copied, and fields missing from the source are
    ///    stored as `FieldValue::Null`.
    ///
    /// Returns an empty vector when `source` is empty or `pair_count` is zero.
    pub fn generate(&self, source: &[Record], schema: &Schema) -> Vec<Record> {
        if self.pair_count == 0 || source.is_empty() {
            return Vec::new();
        }

        let name_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Name).collect();
        let date_fields: Vec<&str> = schema.fields_of_kind(FieldKind::Date).collect();

        let mut records = Vec::with_capacity(self.pair_count * 2);
        // `cycle` wraps around the source slice; it is non-empty at this point.
        let templates = source.iter().cycle().take(self.pair_count);
        for (pair_index, template) in templates.enumerate() {
            let copy_id = self.id_offset + (2 * pair_index) as u64;
            records.push(Self::verbatim_copy(copy_id, template, schema));

            let variant_id = self.id_offset + (2 * pair_index + 1) as u64;
            records.push(Self::perturbed_copy(
                variant_id,
                template,
                schema,
                &name_fields,
                &date_fields,
                pair_index,
            ));
        }
        records
    }

    /// Copies every schema field that `template` has a value for into a new
    /// record with the given `id`; fields without a value are left out.
    fn verbatim_copy(id: u64, template: &Record, schema: &Schema) -> Record {
        let mut record = Record::new(id);
        for field in &schema.fields {
            if let Some(value) = template.get(&field.name) {
                record = record.insert(&field.name, value.clone());
            }
        }
        record
    }

    /// Builds the perturbed twin of `template`: name and date fields are
    /// altered, other fields are cloned, and missing fields become `Null`.
    fn perturbed_copy(
        id: u64,
        template: &Record,
        schema: &Schema,
        name_fields: &[&str],
        date_fields: &[&str],
        pair_index: usize,
    ) -> Record {
        let mut record = Record::new(id);
        for field in &schema.fields {
            let key = field.name.as_str();
            let value = match template.get(&field.name) {
                None => FieldValue::Null,
                // Name takes precedence over Date if a field is listed as both.
                Some(original) if name_fields.contains(&key) => perturb_name(original),
                Some(original) if date_fields.contains(&key) => {
                    perturb_date(original, pair_index)
                }
                Some(original) => original.clone(),
            };
            record = record.insert(&field.name, value);
        }
        record
    }
}
/// Returns a near-duplicate of a name value by dropping its final character.
///
/// Only `FieldValue::Text` values containing at least two characters are
/// changed; shorter text and every other variant are returned as a clone.
fn perturb_name(value: &FieldValue) -> FieldValue {
    match value {
        FieldValue::Text(s) => match s.char_indices().next_back() {
            // The last character starts past offset 0, so at least one other
            // character precedes it and survives the cut.
            Some((cut, _)) if cut > 0 => FieldValue::Text(s[..cut].to_owned()),
            // Empty or single-character text: nothing sensible to remove.
            _ => value.clone(),
        },
        other => other.clone(),
    }
}
/// Returns a near-duplicate of a date value that keeps the leading four
/// bytes (the year in a `YYYY-MM-DD` string) and replaces the rest with a
/// month and day derived from `pair_index`.
///
/// The month is `pair_index % 11 + 1` (1..=11) and the day is
/// `pair_index % 25 + 1` (1..=25). If that happens to reproduce the input
/// exactly, the day is moved forward by one so the result always differs from
/// the original; the day then stays at most 26, which exists in every month.
///
/// Values that are not `FieldValue::Text`, text shorter than four bytes, and
/// text whose fourth byte offset falls inside a multi-byte character are
/// returned unchanged (as a clone) instead of panicking.
fn perturb_date(value: &FieldValue, pair_index: usize) -> FieldValue {
    match value {
        // `get` yields `None` both for short strings and for a cut that would
        // split a UTF-8 character, where plain slicing would panic.
        FieldValue::Text(s) => match s.get(..4) {
            Some(year) => {
                let alt_month = pair_index % 11 + 1;
                let mut alt_day = pair_index % 25 + 1;
                let mut perturbed = format!("{year}-{alt_month:02}-{alt_day:02}");
                if perturbed.as_str() == &s[..] {
                    // The derived date collided with the original one.
                    alt_day += 1;
                    perturbed = format!("{year}-{alt_month:02}-{alt_day:02}");
                }
                FieldValue::Text(perturbed)
            }
            None => value.clone(),
        },
        other => other.clone(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use zer_core::schema::SchemaBuilder;

    /// Schema with two name fields and one date field.
    fn person_schema() -> Schema {
        SchemaBuilder::new()
            .field("voornamen", FieldKind::Name)
            .field("achternaam", FieldKind::Name)
            .field("geboortedatum", FieldKind::Date)
            .build()
            .unwrap()
    }

    /// Builds a record that fills all three fields of `person_schema`.
    fn make_record(id: u64, first: &str, last: &str, dob: &str) -> Record {
        Record::new(id)
            .insert("voornamen", FieldValue::Text(first.into()))
            .insert("achternaam", FieldValue::Text(last.into()))
            .insert("geboortedatum", FieldValue::Text(dob.into()))
    }

    /// Three distinct source records.
    fn source_records() -> Vec<Record> {
        vec![
            make_record(1, "Maria", "Jansen", "1985-03-15"),
            make_record(2, "Pieter", "de Vries", "1990-07-22"),
            make_record(3, "Annelies", "Bakker", "1978-11-05"),
        ]
    }

    /// A source consisting of one record only.
    fn single_source() -> Vec<Record> {
        vec![make_record(1, "Maria", "Jansen", "1985-03-15")]
    }

    /// Runs the generator over `source` using `person_schema`.
    ///
    /// The local is deliberately not called `gen`, which is a reserved
    /// keyword from the 2024 edition onwards.
    fn run_generator(source: &[Record], pair_count: usize, id_offset: u64) -> Vec<Record> {
        let schema = person_schema();
        let generator = NearDuplicateGenerator {
            pair_count,
            id_offset,
        };
        generator.generate(source, &schema)
    }

    #[test]
    fn generate_correct_count() {
        let source = source_records();
        let result = run_generator(&source, 3, 9_000_000);
        assert_eq!(result.len(), 6, "pair_count=3 → 6 synthetic records");
    }

    #[test]
    fn generate_ids_start_at_offset() {
        let source = source_records();
        let result = run_generator(&source, 2, 5_000);
        assert!(
            result.iter().all(|r| r.id >= 5_000),
            "all IDs must be >= id_offset"
        );
        assert_eq!(result[0].id, 5_000);
        assert_eq!(result[1].id, 5_001);
    }

    #[test]
    fn generate_ids_are_unique() {
        let source = source_records();
        let result = run_generator(&source, 5, 1_000);
        let ids: std::collections::HashSet<u64> = result.iter().map(|r| r.id).collect();
        assert_eq!(
            ids.len(),
            result.len(),
            "all generated record IDs must be unique"
        );
    }

    #[test]
    fn generate_name_fields_are_perturbed() {
        let source = single_source();
        let result = run_generator(&source, 1, 9_000);
        let orig_first = source[0].get("voornamen");
        let pert_first = result[1].get("voornamen");
        assert_ne!(
            orig_first, pert_first,
            "first name must differ between original and perturbed"
        );
    }

    #[test]
    fn generate_surname_is_also_perturbed() {
        let source = single_source();
        let result = run_generator(&source, 1, 9_000);
        let orig = source[0].get("achternaam");
        let pert = result[1].get("achternaam");
        assert_ne!(orig, pert, "surname must be perturbed");
        assert_eq!(pert, Some(&FieldValue::Text("Janse".into())));
    }

    #[test]
    fn generate_date_year_preserved() {
        let source = single_source();
        let result = run_generator(&source, 1, 9_000);
        let dob_val = result[1].get("geboortedatum");
        if let Some(FieldValue::Text(s)) = dob_val {
            assert!(s.starts_with("1985-"), "year must be preserved: {s}");
        } else {
            panic!("expected Text value for geboortedatum");
        }
    }

    #[test]
    fn generate_date_day_month_differ() {
        let source = single_source();
        let result = run_generator(&source, 1, 9_000);
        let orig_dob = source[0].get("geboortedatum");
        let pert_dob = result[1].get("geboortedatum");
        assert_ne!(
            orig_dob, pert_dob,
            "perturbed DOB must differ from original"
        );
    }

    #[test]
    fn generate_verbatim_copy_equals_source() {
        let source = single_source();
        let result = run_generator(&source, 1, 9_000);
        assert_eq!(result[0].get("voornamen"), source[0].get("voornamen"));
        assert_eq!(result[0].get("achternaam"), source[0].get("achternaam"));
        assert_eq!(
            result[0].get("geboortedatum"),
            source[0].get("geboortedatum")
        );
    }

    #[test]
    fn generate_cycles_when_fewer_sources() {
        // One source record, three pairs requested: the source is reused.
        let source = single_source();
        let result = run_generator(&source, 3, 9_000);
        assert_eq!(
            result.len(),
            6,
            "should still generate 2 times pair_count records when cycling"
        );
    }

    #[test]
    fn generate_empty_source_returns_empty() {
        let result = run_generator(&[], 5, 9_000);
        assert!(result.is_empty());
    }

    #[test]
    fn generate_zero_pairs_returns_empty() {
        let source = source_records();
        let result = run_generator(&source, 0, 9_000);
        assert!(result.is_empty());
    }

    #[test]
    fn perturb_name_strips_last_char() {
        assert_eq!(
            perturb_name(&FieldValue::Text("Pieter".into())),
            FieldValue::Text("Piete".into()),
        );
        assert_eq!(
            perturb_name(&FieldValue::Text("Maria".into())),
            FieldValue::Text("Mari".into()),
        );
    }

    #[test]
    fn perturb_name_single_char_unchanged() {
        assert_eq!(
            perturb_name(&FieldValue::Text("A".into())),
            FieldValue::Text("A".into()),
        );
    }

    #[test]
    fn perturb_date_preserves_year() {
        let v = FieldValue::Text("1990-06-15".into());
        let p = perturb_date(&v, 0);
        if let FieldValue::Text(s) = p {
            assert!(s.starts_with("1990-"), "year must be preserved");
        } else {
            panic!("expected Text");
        }
    }

    #[test]
    fn perturb_date_changes_with_index() {
        let v = FieldValue::Text("1990-06-15".into());
        let p0 = perturb_date(&v, 0);
        let p1 = perturb_date(&v, 1);
        assert_ne!(
            p0, p1,
            "different pair indices must produce different dates"
        );
    }

    #[test]
    fn perturb_date_null_passthrough() {
        assert_eq!(perturb_date(&FieldValue::Null, 0), FieldValue::Null);
    }
}