Skip to main content

datasynth_eval/privacy/
linkage.rs

1//! Linkage Attack Assessment.
2//!
3//! Evaluates the risk of re-identification by checking if synthetic records
4//! can be uniquely linked back to original records using quasi-identifiers (QIs).
5//!
6//! A linkage attack selects a subset of fields as quasi-identifiers and checks
7//! how many synthetic records uniquely match a single original record on those QIs.
8//! Low re-identification rates and high k-anonymity indicate good privacy.
9
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12
13/// Configuration for linkage attack evaluation.
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct LinkageConfig {
16    /// Maximum acceptable re-identification rate (0.0 - 1.0).
17    /// Default: 0.05 (5%).
18    pub max_reidentification_rate: f64,
19    /// Minimum k-anonymity level to achieve.
20    /// Default: 5 (each record matches at least 5 others).
21    pub min_k_anonymity: usize,
22}
23
24impl Default for LinkageConfig {
25    fn default() -> Self {
26        Self {
27            max_reidentification_rate: 0.05,
28            min_k_anonymity: 5,
29        }
30    }
31}
32
33/// Results from a linkage attack evaluation.
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct LinkageResults {
36    /// Fraction of synthetic records that uniquely match a single original record.
37    pub re_identification_rate: f64,
38    /// The effective k-anonymity achieved (minimum group size across all QI combinations).
39    pub k_anonymity_achieved: usize,
40    /// Number of unique QI combinations in the original data.
41    pub unique_qi_combos_original: usize,
42    /// Number of unique QI combinations in the synthetic data.
43    pub unique_qi_combos_synthetic: usize,
44    /// Number of QI combinations that appear in both datasets.
45    pub overlapping_combos: usize,
46    /// Number of synthetic records that could be uniquely linked.
47    pub uniquely_linked: usize,
48    /// Total synthetic records evaluated.
49    pub total_synthetic: usize,
50    /// Whether privacy passes the configured thresholds.
51    pub passes: bool,
52}
53
54/// Linkage attack evaluator.
55///
56/// Checks if synthetic records can be uniquely re-identified in the original
57/// dataset based on quasi-identifier fields.
58pub struct LinkageAttack {
59    config: LinkageConfig,
60}
61
62impl LinkageAttack {
63    /// Create a new linkage attack evaluator.
64    pub fn new(config: LinkageConfig) -> Self {
65        Self { config }
66    }
67
68    /// Create with default configuration.
69    pub fn with_defaults() -> Self {
70        Self::new(LinkageConfig::default())
71    }
72
73    /// Run the linkage attack evaluation.
74    ///
75    /// # Arguments
76    /// * `original_qis` - Quasi-identifier tuples from the original dataset.
77    ///   Each entry is a record represented as a Vec of string-valued QI fields.
78    /// * `synthetic_qis` - Quasi-identifier tuples from the synthetic dataset.
79    ///
80    /// # Example
81    /// ```ignore
82    /// // Each record has QI fields: [age_bucket, zip_prefix, gender]
83    /// let original = vec![
84    ///     vec!["30-39".into(), "100".into(), "M".into()],
85    ///     vec!["40-49".into(), "200".into(), "F".into()],
86    /// ];
87    /// let synthetic = vec![
88    ///     vec!["30-39".into(), "100".into(), "M".into()],
89    /// ];
90    /// let results = attack.evaluate(&original, &synthetic);
91    /// ```
92    pub fn evaluate(
93        &self,
94        original_qis: &[Vec<String>],
95        synthetic_qis: &[Vec<String>],
96    ) -> LinkageResults {
97        if original_qis.is_empty() || synthetic_qis.is_empty() {
98            return LinkageResults {
99                re_identification_rate: 0.0,
100                k_anonymity_achieved: usize::MAX,
101                unique_qi_combos_original: 0,
102                unique_qi_combos_synthetic: 0,
103                overlapping_combos: 0,
104                uniquely_linked: 0,
105                total_synthetic: synthetic_qis.len(),
106                passes: true,
107            };
108        }
109
110        // Build frequency map for original data QI combinations
111        let mut original_freq: HashMap<Vec<String>, usize> = HashMap::new();
112        for qi in original_qis {
113            *original_freq.entry(qi.clone()).or_insert(0) += 1;
114        }
115
116        // Build frequency map for synthetic data QI combinations
117        let mut synthetic_freq: HashMap<Vec<String>, usize> = HashMap::new();
118        for qi in synthetic_qis {
119            *synthetic_freq.entry(qi.clone()).or_insert(0) += 1;
120        }
121
122        // Count overlapping QI combinations
123        let overlapping_combos = synthetic_freq
124            .keys()
125            .filter(|qi| original_freq.contains_key(*qi))
126            .count();
127
128        // Count uniquely linked records:
129        // A synthetic record is "uniquely linked" if its QI combination maps to
130        // exactly 1 record in the original dataset
131        let mut uniquely_linked = 0usize;
132        for qi in synthetic_qis {
133            if let Some(&orig_count) = original_freq.get(qi) {
134                if orig_count == 1 {
135                    uniquely_linked += 1;
136                }
137            }
138        }
139
140        let re_identification_rate = if synthetic_qis.is_empty() {
141            0.0
142        } else {
143            uniquely_linked as f64 / synthetic_qis.len() as f64
144        };
145
146        // k-anonymity: minimum group size across all QI combos present in original data
147        let k_anonymity_achieved = original_freq.values().copied().min().unwrap_or(0);
148
149        let passes = re_identification_rate <= self.config.max_reidentification_rate
150            && k_anonymity_achieved >= self.config.min_k_anonymity;
151
152        LinkageResults {
153            re_identification_rate,
154            k_anonymity_achieved,
155            unique_qi_combos_original: original_freq.len(),
156            unique_qi_combos_synthetic: synthetic_freq.len(),
157            overlapping_combos,
158            uniquely_linked,
159            total_synthetic: synthetic_qis.len(),
160            passes,
161        }
162    }
163}
164
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a single quasi-identifier record from string literals.
    fn make_qi(fields: &[&str]) -> Vec<String> {
        fields.iter().copied().map(String::from).collect()
    }

    /// Build a whole dataset of quasi-identifier records.
    fn make_records(rows: &[&[&str]]) -> Vec<Vec<String>> {
        rows.iter().map(|row| make_qi(row)).collect()
    }

    /// Original data in which every QI combination occurs five times yields
    /// no unique links and satisfies the default thresholds.
    #[test]
    fn test_k_anonymized_data_low_reidentification() {
        let synthetic = make_records(&[
            &["30-39", "100", "M"],
            &["40-49", "200", "F"],
            &["50-59", "300", "M"],
        ]);

        // Five copies of each combination, in the same interleaved order.
        let original: Vec<Vec<String>> = (0..5).flat_map(|_| synthetic.clone()).collect();

        let results = LinkageAttack::with_defaults().evaluate(&original, &synthetic);

        assert_eq!(results.re_identification_rate, 0.0);
        assert_eq!(results.k_anonymity_achieved, 5);
        assert!(results.passes);
    }

    /// When every original record is unique, matching synthetic records are
    /// all re-identified and the evaluation fails.
    #[test]
    fn test_unique_records_high_reidentification() {
        let original = make_records(&[
            &["25", "10001", "M"],
            &["32", "10002", "F"],
            &["45", "10003", "M"],
            &["58", "10004", "F"],
        ]);

        // The synthetic data repeats the first two original combinations.
        let synthetic = original[..2].to_vec();

        let results = LinkageAttack::with_defaults().evaluate(&original, &synthetic);

        // Every synthetic record links to exactly one original record.
        assert!((results.re_identification_rate - 1.0).abs() < 1e-10);
        assert_eq!(results.k_anonymity_achieved, 1);
        assert!(!results.passes);
    }

    /// Disjoint datasets produce no overlap and no unique links.
    #[test]
    fn test_no_overlap() {
        let original = make_records(&[&["A", "1"], &["B", "2"]]);
        let synthetic = make_records(&[&["C", "3"], &["D", "4"]]);

        let results = LinkageAttack::with_defaults().evaluate(&original, &synthetic);

        assert_eq!(results.re_identification_rate, 0.0);
        assert_eq!(results.overlapping_combos, 0);
        assert_eq!(results.uniquely_linked, 0);
    }

    /// Empty inputs pass with a zero re-identification rate.
    #[test]
    fn test_empty_datasets() {
        let results = LinkageAttack::with_defaults().evaluate(&[], &[]);
        assert!(results.passes);
        assert_eq!(results.re_identification_rate, 0.0);
    }

    /// The configuration survives a JSON round trip.
    #[test]
    fn test_linkage_config_serde() {
        let json = serde_json::to_string(&LinkageConfig::default()).unwrap();
        let parsed: LinkageConfig = serde_json::from_str(&json).unwrap();
        assert!((parsed.max_reidentification_rate - 0.05).abs() < 1e-10);
        assert_eq!(parsed.min_k_anonymity, 5);
    }

    /// The results survive a JSON round trip.
    #[test]
    fn test_linkage_results_serde() {
        let results = LinkageResults {
            re_identification_rate: 0.02,
            k_anonymity_achieved: 10,
            unique_qi_combos_original: 50,
            unique_qi_combos_synthetic: 45,
            overlapping_combos: 30,
            uniquely_linked: 1,
            total_synthetic: 100,
            passes: true,
        };
        let json = serde_json::to_string(&results).unwrap();
        let parsed: LinkageResults = serde_json::from_str(&json).unwrap();
        assert!((parsed.re_identification_rate - 0.02).abs() < 1e-10);
        assert_eq!(parsed.k_anonymity_achieved, 10);
    }

    /// Only synthetic records whose combination is unique in the original
    /// data count as uniquely linked.
    #[test]
    fn test_partial_overlap() {
        let original = make_records(&[
            &["A", "1"], // occurs once
            &["B", "2"], // occurs twice
            &["B", "2"],
            &["C", "3"], // occurs three times
            &["C", "3"],
            &["C", "3"],
        ]);

        // One synthetic record per original combination.
        let synthetic = make_records(&[
            &["A", "1"], // uniquely linked (original count = 1)
            &["B", "2"], // not uniquely linked (original count = 2)
            &["C", "3"], // not uniquely linked (original count = 3)
        ]);

        let results = LinkageAttack::with_defaults().evaluate(&original, &synthetic);

        assert_eq!(results.uniquely_linked, 1);
        assert!((results.re_identification_rate - 1.0 / 3.0).abs() < 1e-10);
        assert_eq!(results.k_anonymity_achieved, 1); // smallest group size is 1
    }
}