use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Pass/fail thresholds applied by [`LinkageAttack::evaluate`].
///
/// Both bounds are inclusive: an evaluation passes only when its
/// re-identification rate is at or below `max_reidentification_rate` and its
/// achieved k-anonymity is at or above `min_k_anonymity`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LinkageConfig {
/// Highest re-identification rate at which an evaluation still passes
/// (`0.05` in the `Default` implementation).
pub max_reidentification_rate: f64,
/// Lowest achieved k-anonymity at which an evaluation still passes
/// (`5` in the `Default` implementation).
pub min_k_anonymity: usize,
}
impl Default for LinkageConfig {
    /// Returns the stock thresholds: a re-identification rate of at most
    /// `0.05` and a k-anonymity of at least `5`.
    fn default() -> Self {
        let max_reidentification_rate = 0.05;
        let min_k_anonymity = 5;
        Self {
            max_reidentification_rate,
            min_k_anonymity,
        }
    }
}
/// Statistics and verdict produced by [`LinkageAttack::evaluate`].
///
/// The field descriptions below state what `evaluate` computes when both the
/// original and the synthetic dataset are non-empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LinkageResults {
/// `uniquely_linked` divided by `total_synthetic`.
pub re_identification_rate: f64,
/// Smallest number of original records sharing any one quasi-identifier
/// combination.
pub k_anonymity_achieved: usize,
/// Number of distinct quasi-identifier combinations among the original records.
pub unique_qi_combos_original: usize,
/// Number of distinct quasi-identifier combinations among the synthetic records.
pub unique_qi_combos_synthetic: usize,
/// Number of distinct synthetic combinations that also occur in the original data.
pub overlapping_combos: usize,
/// Number of synthetic records whose combination occurs exactly once in the
/// original data.
pub uniquely_linked: usize,
/// Number of synthetic records that were evaluated.
pub total_synthetic: usize,
/// `true` when `re_identification_rate` does not exceed the configured maximum
/// and `k_anonymity_achieved` is at least the configured minimum.
pub passes: bool,
}
/// Linkage-attack evaluator that compares quasi-identifier combinations of a
/// synthetic dataset against those of the original dataset.
///
/// Derives `Debug` and `Clone` so the evaluator can be logged and duplicated
/// like the configuration it wraps.
#[derive(Debug, Clone)]
pub struct LinkageAttack {
    /// Thresholds used to decide whether an evaluation passes.
    config: LinkageConfig,
}
impl LinkageAttack {
    /// Creates an evaluator that judges results against `config`.
    pub fn new(config: LinkageConfig) -> Self {
        Self { config }
    }

    /// Creates an evaluator that uses `LinkageConfig::default()`.
    pub fn with_defaults() -> Self {
        Self::new(LinkageConfig::default())
    }

    /// Measures how many synthetic records can be linked to a unique original
    /// record through their quasi-identifier (QI) combination.
    ///
    /// Each element of `original_qis` / `synthetic_qis` is one record's QI
    /// values; two records share a combination when their vectors are equal.
    ///
    /// A synthetic record counts as uniquely linked when its combination occurs
    /// exactly once in `original_qis`. The re-identification rate is the share
    /// of synthetic records that are uniquely linked, and the achieved
    /// k-anonymity is the size of the smallest group of original records
    /// sharing a combination. The result passes when the rate is at most
    /// `max_reidentification_rate` and the achieved k is at least
    /// `min_k_anonymity`.
    ///
    /// If either dataset is empty no linkage can be attempted: the result
    /// passes with a rate of `0.0`, no links, and `usize::MAX` as the achieved
    /// k-anonymity. The distinct-combination counts still describe the data
    /// that was actually supplied.
    pub fn evaluate(
        &self,
        original_qis: &[Vec<String>],
        synthetic_qis: &[Vec<String>],
    ) -> LinkageResults {
        let original_freq = Self::count_combos(original_qis);
        let synthetic_freq = Self::count_combos(synthetic_qis);
        let total_synthetic = synthetic_qis.len();

        if original_qis.is_empty() || synthetic_qis.is_empty() {
            return LinkageResults {
                re_identification_rate: 0.0,
                k_anonymity_achieved: usize::MAX,
                // Report the real counts so they stay consistent with
                // `total_synthetic` when only one side is empty.
                unique_qi_combos_original: original_freq.len(),
                unique_qi_combos_synthetic: synthetic_freq.len(),
                overlapping_combos: 0,
                uniquely_linked: 0,
                total_synthetic,
                passes: true,
            };
        }

        let mut overlapping_combos = 0usize;
        let mut uniquely_linked = 0usize;
        for (&combo, &synthetic_count) in &synthetic_freq {
            if let Some(&original_count) = original_freq.get(combo) {
                overlapping_combos += 1;
                if original_count == 1 {
                    // Every synthetic record carrying this combination points
                    // at the same single original record.
                    uniquely_linked += synthetic_count;
                }
            }
        }

        // `total_synthetic` is non-zero here thanks to the guard above.
        let re_identification_rate = uniquely_linked as f64 / total_synthetic as f64;
        let k_anonymity_achieved = original_freq.values().copied().min().unwrap_or(0);
        let passes = re_identification_rate <= self.config.max_reidentification_rate
            && k_anonymity_achieved >= self.config.min_k_anonymity;

        LinkageResults {
            re_identification_rate,
            k_anonymity_achieved,
            unique_qi_combos_original: original_freq.len(),
            unique_qi_combos_synthetic: synthetic_freq.len(),
            overlapping_combos,
            uniquely_linked,
            total_synthetic,
            passes,
        }
    }

    /// Counts how often each QI combination occurs, borrowing the records as
    /// keys instead of cloning them.
    fn count_combos(records: &[Vec<String>]) -> HashMap<&[String], usize> {
        let mut freq: HashMap<&[String], usize> = HashMap::new();
        for record in records {
            *freq.entry(record.as_slice()).or_insert(0) += 1;
        }
        freq
    }
}
#[cfg(test)]
// Tests unwrap serde results directly; a failed round-trip should panic the test.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Builds one quasi-identifier record from string literals.
    fn make_qi(fields: &[&str]) -> Vec<String> {
        fields.iter().copied().map(String::from).collect()
    }

    /// Runs a default-configured attack over the two datasets.
    fn run_default(original: &[Vec<String>], synthetic: &[Vec<String>]) -> LinkageResults {
        LinkageAttack::with_defaults().evaluate(original, synthetic)
    }

    /// Three combinations, each shared by five original records, must pass.
    #[test]
    fn test_k_anonymized_data_low_reidentification() {
        let groups: [&[&str]; 3] = [
            &["30-39", "100", "M"],
            &["40-49", "200", "F"],
            &["50-59", "300", "M"],
        ];
        let synthetic: Vec<Vec<String>> = groups.iter().map(|g| make_qi(g)).collect();
        // Five rounds of the same three combinations, in the same order.
        let original: Vec<Vec<String>> = (0..5).flat_map(|_| synthetic.clone()).collect();

        let results = run_default(&original, &synthetic);

        assert_eq!(results.re_identification_rate, 0.0);
        assert_eq!(results.k_anonymity_achieved, 5);
        assert!(results.passes);
    }

    /// Every original record is unique, so every matching synthetic record links.
    #[test]
    fn test_unique_records_high_reidentification() {
        let original: Vec<Vec<String>> = [
            ["25", "10001", "M"],
            ["32", "10002", "F"],
            ["45", "10003", "M"],
            ["58", "10004", "F"],
        ]
        .iter()
        .map(|row| make_qi(row))
        .collect();
        // The synthetic data reproduces the first two original records verbatim.
        let synthetic = original[..2].to_vec();

        let results = run_default(&original, &synthetic);

        assert!((results.re_identification_rate - 1.0).abs() < 1e-10);
        assert_eq!(results.k_anonymity_achieved, 1);
        assert!(!results.passes);
    }

    /// Disjoint datasets produce no overlap and no links.
    #[test]
    fn test_no_overlap() {
        let original = vec![make_qi(&["A", "1"]), make_qi(&["B", "2"])];
        let synthetic = vec![make_qi(&["C", "3"]), make_qi(&["D", "4"])];

        let results = run_default(&original, &synthetic);

        assert_eq!(results.re_identification_rate, 0.0);
        assert_eq!(results.overlapping_combos, 0);
        assert_eq!(results.uniquely_linked, 0);
    }

    /// Two empty datasets pass with a zero re-identification rate.
    #[test]
    fn test_empty_datasets() {
        let results = run_default(&[], &[]);

        assert!(results.passes);
        assert_eq!(results.re_identification_rate, 0.0);
    }

    /// The default configuration survives a JSON round-trip.
    #[test]
    fn test_linkage_config_serde() {
        let encoded = serde_json::to_string(&LinkageConfig::default()).unwrap();
        let decoded: LinkageConfig = serde_json::from_str(&encoded).unwrap();

        assert!((decoded.max_reidentification_rate - 0.05).abs() < 1e-10);
        assert_eq!(decoded.min_k_anonymity, 5);
    }

    /// A hand-built result survives a JSON round-trip.
    #[test]
    fn test_linkage_results_serde() {
        let sample = LinkageResults {
            re_identification_rate: 0.02,
            k_anonymity_achieved: 10,
            unique_qi_combos_original: 50,
            unique_qi_combos_synthetic: 45,
            overlapping_combos: 30,
            uniquely_linked: 1,
            total_synthetic: 100,
            passes: true,
        };

        let encoded = serde_json::to_string(&sample).unwrap();
        let decoded: LinkageResults = serde_json::from_str(&encoded).unwrap();

        assert!((decoded.re_identification_rate - 0.02).abs() < 1e-10);
        assert_eq!(decoded.k_anonymity_achieved, 10);
    }

    /// Only the combination that is unique in the original data links.
    #[test]
    fn test_partial_overlap() {
        let original = vec![
            make_qi(&["A", "1"]),
            make_qi(&["B", "2"]),
            make_qi(&["B", "2"]),
            make_qi(&["C", "3"]),
            make_qi(&["C", "3"]),
            make_qi(&["C", "3"]),
        ];
        let synthetic = vec![
            make_qi(&["A", "1"]),
            make_qi(&["B", "2"]),
            make_qi(&["C", "3"]),
        ];

        let results = run_default(&original, &synthetic);

        assert_eq!(results.uniquely_linked, 1);
        assert!((results.re_identification_rate - 1.0 / 3.0).abs() < 1e-10);
        assert_eq!(results.k_anonymity_achieved, 1);
    }
}