Skip to main content

datasynth_eval/privacy/
linkage.rs

1//! Linkage Attack Assessment.
2//!
3//! Evaluates the risk of re-identification by checking if synthetic records
4//! can be uniquely linked back to original records using quasi-identifiers (QIs).
5//!
6//! A linkage attack selects a subset of fields as quasi-identifiers and checks
7//! how many synthetic records uniquely match a single original record on those QIs.
8//! Low re-identification rates and high k-anonymity indicate good privacy.
9
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12
13/// Configuration for linkage attack evaluation.
14#[derive(Debug, Clone, Serialize, Deserialize)]
15pub struct LinkageConfig {
16    /// Maximum acceptable re-identification rate (0.0 - 1.0).
17    /// Default: 0.05 (5%).
18    pub max_reidentification_rate: f64,
19    /// Minimum k-anonymity level to achieve.
20    /// Default: 5 (each record matches at least 5 others).
21    pub min_k_anonymity: usize,
22}
23
24impl Default for LinkageConfig {
25    fn default() -> Self {
26        Self {
27            max_reidentification_rate: 0.05,
28            min_k_anonymity: 5,
29        }
30    }
31}
32
33/// Results from a linkage attack evaluation.
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct LinkageResults {
36    /// Fraction of synthetic records that uniquely match a single original record.
37    pub re_identification_rate: f64,
38    /// The effective k-anonymity achieved (minimum group size across all QI combinations).
39    pub k_anonymity_achieved: usize,
40    /// Number of unique QI combinations in the original data.
41    pub unique_qi_combos_original: usize,
42    /// Number of unique QI combinations in the synthetic data.
43    pub unique_qi_combos_synthetic: usize,
44    /// Number of QI combinations that appear in both datasets.
45    pub overlapping_combos: usize,
46    /// Number of synthetic records that could be uniquely linked.
47    pub uniquely_linked: usize,
48    /// Total synthetic records evaluated.
49    pub total_synthetic: usize,
50    /// Whether privacy passes the configured thresholds.
51    pub passes: bool,
52}
53
54/// Linkage attack evaluator.
55///
56/// Checks if synthetic records can be uniquely re-identified in the original
57/// dataset based on quasi-identifier fields.
58pub struct LinkageAttack {
59    config: LinkageConfig,
60}
61
62impl LinkageAttack {
63    /// Create a new linkage attack evaluator.
64    pub fn new(config: LinkageConfig) -> Self {
65        Self { config }
66    }
67
68    /// Create with default configuration.
69    pub fn with_defaults() -> Self {
70        Self::new(LinkageConfig::default())
71    }
72
73    /// Run the linkage attack evaluation.
74    ///
75    /// # Arguments
76    /// * `original_qis` - Quasi-identifier tuples from the original dataset.
77    ///   Each entry is a record represented as a Vec of string-valued QI fields.
78    /// * `synthetic_qis` - Quasi-identifier tuples from the synthetic dataset.
79    ///
80    /// # Example
81    /// ```ignore
82    /// // Each record has QI fields: [age_bucket, zip_prefix, gender]
83    /// let original = vec![
84    ///     vec!["30-39".into(), "100".into(), "M".into()],
85    ///     vec!["40-49".into(), "200".into(), "F".into()],
86    /// ];
87    /// let synthetic = vec![
88    ///     vec!["30-39".into(), "100".into(), "M".into()],
89    /// ];
90    /// let results = attack.evaluate(&original, &synthetic);
91    /// ```
92    pub fn evaluate(
93        &self,
94        original_qis: &[Vec<String>],
95        synthetic_qis: &[Vec<String>],
96    ) -> LinkageResults {
97        if original_qis.is_empty() || synthetic_qis.is_empty() {
98            return LinkageResults {
99                re_identification_rate: 0.0,
100                k_anonymity_achieved: usize::MAX,
101                unique_qi_combos_original: 0,
102                unique_qi_combos_synthetic: 0,
103                overlapping_combos: 0,
104                uniquely_linked: 0,
105                total_synthetic: synthetic_qis.len(),
106                passes: true,
107            };
108        }
109
110        // Build frequency map for original data QI combinations
111        let mut original_freq: HashMap<Vec<String>, usize> = HashMap::new();
112        for qi in original_qis {
113            *original_freq.entry(qi.clone()).or_insert(0) += 1;
114        }
115
116        // Build frequency map for synthetic data QI combinations
117        let mut synthetic_freq: HashMap<Vec<String>, usize> = HashMap::new();
118        for qi in synthetic_qis {
119            *synthetic_freq.entry(qi.clone()).or_insert(0) += 1;
120        }
121
122        // Count overlapping QI combinations
123        let overlapping_combos = synthetic_freq
124            .keys()
125            .filter(|qi| original_freq.contains_key(*qi))
126            .count();
127
128        // Count uniquely linked records:
129        // A synthetic record is "uniquely linked" if its QI combination maps to
130        // exactly 1 record in the original dataset
131        let mut uniquely_linked = 0usize;
132        for qi in synthetic_qis {
133            if let Some(&orig_count) = original_freq.get(qi) {
134                if orig_count == 1 {
135                    uniquely_linked += 1;
136                }
137            }
138        }
139
140        let re_identification_rate = if synthetic_qis.is_empty() {
141            0.0
142        } else {
143            uniquely_linked as f64 / synthetic_qis.len() as f64
144        };
145
146        // k-anonymity: minimum group size across all QI combos present in original data
147        let k_anonymity_achieved = original_freq.values().copied().min().unwrap_or(0);
148
149        let passes = re_identification_rate <= self.config.max_reidentification_rate
150            && k_anonymity_achieved >= self.config.min_k_anonymity;
151
152        LinkageResults {
153            re_identification_rate,
154            k_anonymity_achieved,
155            unique_qi_combos_original: original_freq.len(),
156            unique_qi_combos_synthetic: synthetic_freq.len(),
157            overlapping_combos,
158            uniquely_linked,
159            total_synthetic: synthetic_qis.len(),
160            passes,
161        }
162    }
163}
164
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Build one QI record from string literals.
    fn make_qi(fields: &[&str]) -> Vec<String> {
        fields.iter().copied().map(String::from).collect()
    }

    /// Evaluate the two datasets with a default-configured attack.
    fn run_default(original: &[Vec<String>], synthetic: &[Vec<String>]) -> LinkageResults {
        LinkageAttack::with_defaults().evaluate(original, synthetic)
    }

    #[test]
    fn test_k_anonymized_data_low_reidentification() {
        let combos = [
            ["30-39", "100", "M"],
            ["40-49", "200", "F"],
            ["50-59", "300", "M"],
        ];

        // Every QI combination occurs five times in the original data.
        let original: Vec<Vec<String>> = (0..5)
            .flat_map(|_| combos.iter().map(|c| make_qi(c)))
            .collect();
        // The synthetic data holds each combination once.
        let synthetic: Vec<Vec<String>> = combos.iter().map(|c| make_qi(c)).collect();

        let outcome = run_default(&original, &synthetic);

        assert_eq!(outcome.re_identification_rate, 0.0);
        assert_eq!(outcome.k_anonymity_achieved, 5);
        assert!(outcome.passes);
    }

    #[test]
    fn test_unique_records_high_reidentification() {
        // No two original records share a QI combination.
        let original: Vec<Vec<String>> = [
            ["25", "10001", "M"],
            ["32", "10002", "F"],
            ["45", "10003", "M"],
            ["58", "10004", "F"],
        ]
        .iter()
        .map(|c| make_qi(c))
        .collect();

        // The synthetic data repeats the first two original combinations.
        let synthetic: Vec<Vec<String>> = original.iter().take(2).cloned().collect();

        let outcome = run_default(&original, &synthetic);

        // Each synthetic record points at exactly one original record.
        assert!((outcome.re_identification_rate - 1.0).abs() < 1e-10);
        assert_eq!(outcome.k_anonymity_achieved, 1);
        assert!(!outcome.passes);
    }

    #[test]
    fn test_no_overlap() {
        let original = [make_qi(&["A", "1"]), make_qi(&["B", "2"])];
        let synthetic = [make_qi(&["C", "3"]), make_qi(&["D", "4"])];

        let outcome = run_default(&original, &synthetic);

        assert_eq!(outcome.re_identification_rate, 0.0);
        assert_eq!(outcome.overlapping_combos, 0);
        assert_eq!(outcome.uniquely_linked, 0);
    }

    #[test]
    fn test_empty_datasets() {
        let outcome = run_default(&[], &[]);
        assert!(outcome.passes);
        assert_eq!(outcome.re_identification_rate, 0.0);
    }

    #[test]
    fn test_linkage_config_serde() {
        // Round-trip the default configuration through JSON.
        let encoded = serde_json::to_string(&LinkageConfig::default()).unwrap();
        let decoded: LinkageConfig = serde_json::from_str(&encoded).unwrap();
        assert!((decoded.max_reidentification_rate - 0.05).abs() < 1e-10);
        assert_eq!(decoded.min_k_anonymity, 5);
    }

    #[test]
    fn test_linkage_results_serde() {
        let sample = LinkageResults {
            re_identification_rate: 0.02,
            k_anonymity_achieved: 10,
            unique_qi_combos_original: 50,
            unique_qi_combos_synthetic: 45,
            overlapping_combos: 30,
            uniquely_linked: 1,
            total_synthetic: 100,
            passes: true,
        };
        // Round-trip a hand-built result through JSON.
        let encoded = serde_json::to_string(&sample).unwrap();
        let decoded: LinkageResults = serde_json::from_str(&encoded).unwrap();
        assert!((decoded.re_identification_rate - 0.02).abs() < 1e-10);
        assert_eq!(decoded.k_anonymity_achieved, 10);
    }

    #[test]
    fn test_partial_overlap() {
        // Group sizes in the original data: A -> 1, B -> 2, C -> 3.
        let groups = [(["A", "1"], 1usize), (["B", "2"], 2), (["C", "3"], 3)];
        let original: Vec<Vec<String>> = groups
            .iter()
            .flat_map(|(fields, copies)| (0..*copies).map(move |_| make_qi(fields)))
            .collect();

        // One synthetic record per combination; only "A" has an original count of 1.
        let synthetic: Vec<Vec<String>> =
            groups.iter().map(|(fields, _)| make_qi(fields)).collect();

        let outcome = run_default(&original, &synthetic);

        assert_eq!(outcome.uniquely_linked, 1);
        assert!((outcome.re_identification_rate - 1.0 / 3.0).abs() < 1e-10);
        assert_eq!(outcome.k_anonymity_achieved, 1); // smallest group has one record
    }
}