diff_priv/data_manipulation/
anonymizable.rs

1use crate::data_manipulation::aggregation::truncate_to_domain;
2use num::abs;
3use rand::distributions::{Distribution, Uniform};
4use rand::thread_rng;
5use rand_distr::Normal;
6use serde::Serialize;
7use std::time::SystemTime;
8use uuid::Uuid;
9
/// Sensitive (non quasi-identifying) attribute value carried by a record.
///
/// `Hash`/`Eq` allow values to be used as hash-map keys or set members;
/// `Debug`/`Clone` allow them to be logged, duplicated and used in
/// `assert_eq!`-style checks.
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub enum SensitiveAttribute {
    /// Textual sensitive value.
    String(String),
    /// Integer sensitive value.
    Integer(i32),
}
15
/// Interval (numeric) quasi identifier, stored as the tuple
/// `(value, min_value, max_value, weight)`.
///
/// `min_value`/`max_value` bound the attribute's domain and `weight` scales the
/// attribute's contribution in distance calculations. The three
/// [`QuasiIdentifierType`] entries must share one variant (all `Float` or all
/// `Integer`): the interval helpers in this file panic on mixed variants.
pub type IntervalType = (
    QuasiIdentifierType,
    QuasiIdentifierType,
    QuasiIdentifierType,
    usize,
);
23
/// Ordinal (ranked) quasi identifier, stored as the tuple
/// `(rank, max_rank, weight)`.
///
/// NOTE(review): `Anonymizable::calculate_ordinal_distance` normalises a rank
/// as `(rank - 1) / (max_rank - 1)`, i.e. it treats ranks as starting at 1 —
/// confirm that every producer of this tuple follows that convention.
pub type OrdinalType = (i32, i32, usize);
26
/// Nominal (categorical) quasi identifier, stored as the tuple
/// `(value, max_value, weight)`.
///
/// `QuasiIdentifierTypes::randomize` draws replacement values from the range
/// `0..=max_value`, so category codes are presumably zero-based — verify
/// against the data structs that build this tuple.
pub type NominalType = (i32, i32, usize);
29
/// Raw numeric value of a quasi identifier.
///
/// `PartialEq` (not `Eq`, because of the `f64` payload) lets values be compared
/// directly and used in `assert_eq!`; a `Float` never equals an `Integer`, and
/// `Float(NaN)` is not equal to itself, mirroring `f64` semantics.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum QuasiIdentifierType {
    /// Floating point value.
    Float(f64),
    /// Integer value.
    Integer(i32),
}
35
36/// Possible quasi identifier data category types
37#[derive(Debug)]
38pub enum QuasiIdentifierTypes {
39    /// value, min_value, max_value, weight of attribute
40    Interval(IntervalType),
41    /// rank, max_rank, weight of attribute
42    Ordinal(OrdinalType),
43    /// value, weight of attribute
44    Nominal(NominalType),
45}
46
47impl QuasiIdentifierTypes {
48    /// consume itself and extract the value of the quasi identifier
49    pub fn extract_value(self) -> QuasiIdentifierType {
50        match self {
51            QuasiIdentifierTypes::Interval((value, _, _, _)) => value,
52            QuasiIdentifierTypes::Ordinal((value, _, _)) => QuasiIdentifierType::Integer(value),
53            QuasiIdentifierTypes::Nominal((value, _, _)) => QuasiIdentifierType::Integer(value),
54        }
55    }
56
57    pub fn randomize(self) -> QuasiIdentifierTypes {
58        let mut rng = thread_rng();
59        match self {
60            QuasiIdentifierTypes::Interval((value, min, max, weight)) => match (value, min, max) {
61                (
62                    QuasiIdentifierType::Float(val_fl),
63                    QuasiIdentifierType::Float(min_val),
64                    QuasiIdentifierType::Float(max_val),
65                ) => {
66                    let normal: Normal<f64> = Normal::new(val_fl, 1.0).unwrap();
67                    let e = normal.sample(&mut rng);
68                    QuasiIdentifierTypes::Interval((
69                        QuasiIdentifierType::Float(truncate_to_domain(e, min_val, max_val)),
70                        QuasiIdentifierType::Float(min_val),
71                        QuasiIdentifierType::Float(max_val),
72                        weight,
73                    ))
74                }
75                (
76                    QuasiIdentifierType::Integer(val_int),
77                    QuasiIdentifierType::Integer(min_val),
78                    QuasiIdentifierType::Integer(max_val),
79                ) => {
80                    let normal: Normal<f64> = Normal::new(val_int as f64, 1.0).unwrap();
81                    let e = normal.sample(&mut rng);
82                    QuasiIdentifierTypes::Interval((
83                        QuasiIdentifierType::Integer(truncate_to_domain(
84                            e as i32, min_val, max_val,
85                        )),
86                        QuasiIdentifierType::Integer(min_val),
87                        QuasiIdentifierType::Integer(max_val),
88                        weight,
89                    ))
90                }
91                _ => panic!("Wrong combination of type found in randomization of interval"),
92            },
93            QuasiIdentifierTypes::Ordinal((_, max_rank, weight)) => {
94                let between = Uniform::<i32>::from(0..max_rank + 1);
95                let random_ordinal_qi = between.sample(&mut rng);
96                QuasiIdentifierTypes::Ordinal((random_ordinal_qi as i32, max_rank, weight))
97            }
98            QuasiIdentifierTypes::Nominal((_, max_value, weight)) => {
99                let between = Uniform::<i32>::from(0..max_value + 1);
100                let random_nominal_qi = between.sample(&mut rng);
101                QuasiIdentifierTypes::Nominal((random_nominal_qi, max_value, weight))
102            }
103        }
104    }
105}
106
107/// The role of this trait is to create a generic way of making sure that the struct can be anonymized
108/// using the Anonymizer
109pub trait Anonymizable: Default + Clone + Serialize + Sync {
110    /// compare 2 data points and return the euclidean difference between them
111    fn calculate_difference(&self, other: &Self) -> f64 {
112        let mut sum_weight: usize = 0;
113        let diff: f64 = self
114            .quasi_identifiers()
115            .into_iter()
116            .zip(other.quasi_identifiers().into_iter())
117            .map(|(x, y)| match (x, y) {
118                (
119                    QuasiIdentifierTypes::Interval(interval_x),
120                    QuasiIdentifierTypes::Interval(interval_y),
121                ) => {
122                    let (_, _, _, weight) = interval_x;
123                    sum_weight += weight;
124                    Self::calculate_interval_distance(interval_x, interval_y)
125                }
126                (
127                    QuasiIdentifierTypes::Ordinal(ordinal_x),
128                    QuasiIdentifierTypes::Ordinal(ordinal_y),
129                ) => {
130                    let (_, _, weight) = ordinal_x;
131                    sum_weight += weight;
132                    Self::calculate_ordinal_distance(ordinal_x, ordinal_y)
133                }
134                (
135                    QuasiIdentifierTypes::Nominal(nominal_x),
136                    QuasiIdentifierTypes::Nominal(nominal_y),
137                ) => {
138                    let (_, _, weight) = nominal_x;
139                    sum_weight += weight;
140                    Self::calculate_nominal_distance(nominal_x, nominal_y)
141                }
142                _ => {
143                    panic!("wrong types provided")
144                }
145            })
146            .sum();
147
148        diff / sum_weight as f64
149    }
150
151    /// calculate the info loss between 2 different Anonymizable
152    /// structs
153    fn calculate_info_loss(&self, other: &Self) -> f64 {
154        let mut distance = 0.0;
155        let self_qi = self.quasi_identifiers();
156        let other_qi = other.quasi_identifiers();
157
158        self_qi
159            .into_iter()
160            .zip(other_qi.into_iter())
161            .for_each(|(x, y)| match (x.extract_value(), y.extract_value()) {
162                (QuasiIdentifierType::Integer(value1), QuasiIdentifierType::Integer(value2)) => {
163                    distance += (value1 as f64 - value2 as f64).powi(2)
164                }
165                (QuasiIdentifierType::Float(value1), QuasiIdentifierType::Float(value2)) => {
166                    distance += (value1 - value2).powi(2)
167                }
168                _ => {
169                    panic!("Incompatible values have been found")
170                }
171            });
172
173        distance.sqrt()
174    }
175
176    /// return the values of the quasi identifiers in the data struct
177    fn quasi_identifiers(&self) -> Vec<QuasiIdentifierTypes>;
178
179    /// return a copy of the Anonymizable struct and replace its
180    /// quasi identifier attributes with given QI's
181    /// we return a copy because we want to keep the original intact for new aggregation
182    fn update_quasi_identifiers(&self, qi: Vec<QuasiIdentifierTypes>) -> Self;
183
184    /// return a copy of the sensitive attribute of the struct
185    fn sensitive_value(&self) -> SensitiveAttribute;
186
187    /// extract all the values in string format to be used for creating CSV
188    fn extract_string_values(&self, uuid: Uuid, dr: f64) -> Vec<String>;
189
190    // get the timestamp that the tuple has entered the algorithm
191    fn get_timestamp(&self) -> SystemTime;
192
193    /// suppress the qi's based on a buffer of Anonymizables
194    fn suppress(&self) -> Self {
195        let suppressed_qi = self
196            .quasi_identifiers()
197            .into_iter()
198            .map(|x| x.randomize())
199            .collect();
200
201        self.update_quasi_identifiers(suppressed_qi)
202    }
203
204    /// calculate the euclidean distance between 2 ordinal data category types
205    /// TODO: clarify that ranking starts at 1
206    fn calculate_ordinal_distance(ordinal_x: OrdinalType, ordinal_y: OrdinalType) -> f64 {
207        let (rank1, max_rank, weight) = ordinal_x;
208        let (rank2, _, _) = ordinal_y;
209
210        let x = (rank1 as f64 - 1.0) / (max_rank as f64 - 1.0);
211        let y = (rank2 as f64 - 1.0) / (max_rank as f64 - 1.0);
212
213        (weight as f64)
214            * Self::calculate_interval_distance(
215                (
216                    QuasiIdentifierType::Float(x),
217                    QuasiIdentifierType::Float(1.0),
218                    QuasiIdentifierType::Float(max_rank as f64),
219                    weight,
220                ),
221                (
222                    QuasiIdentifierType::Float(y),
223                    QuasiIdentifierType::Float(1.0),
224                    QuasiIdentifierType::Float(max_rank as f64),
225                    weight,
226                ),
227            )
228    }
229
230    /// calculate the euclidean distance between 2 interval data types
231    fn calculate_interval_distance(interval_x: IntervalType, interval_y: IntervalType) -> f64 {
232        let (num1, min, max, weight) = interval_x;
233        let (num2, _, _, _) = interval_y;
234
235        match (num1, min, max, num2) {
236            (
237                QuasiIdentifierType::Float(x),
238                QuasiIdentifierType::Float(min),
239                QuasiIdentifierType::Float(max),
240                QuasiIdentifierType::Float(y),
241            ) => weight as f64 * abs(x - y) / (max - min),
242            (
243                QuasiIdentifierType::Integer(x),
244                QuasiIdentifierType::Integer(min),
245                QuasiIdentifierType::Integer(max),
246                QuasiIdentifierType::Integer(y),
247            ) => weight as f64 * abs(x as f64 - y as f64) / (max as f64 - min as f64),
248            _ => {
249                panic!("wrong type conversion")
250            }
251        }
252    }
253
254    /// calculate the euclidean distance between 2 nominal data types
255    fn calculate_nominal_distance(nominal_x: NominalType, nominal_y: NominalType) -> f64 {
256        let (x, _, weight) = nominal_x;
257        let (y, _, _) = nominal_y;
258
259        match x == y {
260            true => 0.0,
261            false => weight as f64,
262        }
263    }
264}
265
#[cfg(test)]
mod tests {
    use crate::data_manipulation::aggregation::AggregateType;
    use crate::data_manipulation::anonymizable::Anonymizable;
    use crate::data_manipulation::anonymizable::QuasiIdentifierType::{Float, Integer};
    use crate::data_manipulation::anonymizable::QuasiIdentifierTypes::{
        Interval, Nominal, Ordinal,
    };
    use crate::data_manipulation::mueller::MuellerStream;

    /// The QIs of a `MuellerStream` are exposed as age (interval) then gender (nominal).
    #[test]
    fn get_quasi_identifiers() {
        let mueller = MuellerStream {
            age: Some(32),
            gender: Some("male".to_string()),
            ..MuellerStream::default()
        };

        let mut quasi_identifiers = mueller.quasi_identifiers();

        match quasi_identifiers.remove(0) {
            Interval((Integer(32), Integer(33), Integer(85), 1)) => {}
            _ => {
                panic!()
            }
        }

        match quasi_identifiers.remove(0) {
            Nominal((0, 1, 1)) => {}
            _ => {
                panic!()
            }
        }
    }

    /// Updating the QIs copies the centroid's QI values onto the record.
    #[test]
    fn update_quasi_identifiers() {
        let mueller = MuellerStream {
            age: Some(32),
            gender: Some("male".to_string()),
            ..MuellerStream::default()
        };

        let centroid = MuellerStream {
            age: Some(50),
            gender: Some("female".to_string()),
            ..MuellerStream::default()
        };

        let anonymized = mueller.update_quasi_identifiers(centroid.quasi_identifiers());

        assert_eq!(anonymized.age, Some(50));
        assert_eq!(anonymized.gender, Some("female".to_string()))
    }

    #[test]
    fn calculate_difference() {
        let mueller = MuellerStream {
            age: Some(37),
            gender: Some("male".to_string()),
            ..MuellerStream::default()
        };

        let centroid = MuellerStream {
            age: Some(50),
            gender: Some("female".to_string()),
            ..MuellerStream::default()
        };

        let difference = mueller.calculate_difference(&centroid);

        assert_eq!(difference, 0.625)
    }

    #[test]
    fn calculate_difference_zero() {
        let mueller = MuellerStream {
            age: Some(37),
            gender: Some("male".to_string()),
            ..MuellerStream::default()
        };

        let centroid = MuellerStream {
            age: Some(37),
            gender: Some("male".to_string()),
            ..MuellerStream::default()
        };

        let difference = mueller.calculate_difference(&centroid);

        assert_eq!(difference, 0.0)
    }

    #[test]
    fn calculate_difference_one() {
        let mueller = MuellerStream {
            age: Some(33),
            gender: Some("male".to_string()),
            ..MuellerStream::default()
        };

        let centroid = MuellerStream {
            age: Some(85),
            gender: Some("female".to_string()),
            ..MuellerStream::default()
        };

        let difference = mueller.calculate_difference(&centroid);

        assert_eq!(difference, 1.0)
    }

    #[test]
    fn calculate_info_loss() {
        let mueller = MuellerStream {
            age: Some(33),
            gender: Some("male".to_string()),
            ..MuellerStream::default()
        };

        let centroid = MuellerStream {
            age: Some(50),
            gender: Some("female".to_string()),
            ..MuellerStream::default()
        };

        let info_loss = mueller.calculate_info_loss(&centroid);

        // Ages differ by 17 and the gender codes by 1, so the euclidean
        // distance is sqrt(17^2 + 1^2) = sqrt(290) ~= 17.029. Compare with a
        // two-sided tolerance so that values that are too small fail as well.
        let expected = 290.0_f64.sqrt();
        assert!(
            (info_loss - expected).abs() < 1e-9,
            "expected info loss {expected}, got {info_loss}"
        )
    }

    #[test]
    fn aggregation_interval_integer() {
        let agg1 = Interval((Integer(1), Integer(0), Integer(10), 1));
        let agg2 = Interval((Integer(4), Integer(0), Integer(10), 1));
        let agg3 = Interval((Integer(6), Integer(0), Integer(10), 1));
        let agg4 = Interval((Integer(10), Integer(0), Integer(10), 1));

        let aggregation = AggregateType::Mean(vec![agg1, agg2, agg3, agg4]).aggregate();

        if let Integer(value) = aggregation.extract_value() {
            assert_eq!(value, 5)
        } else {
            panic!()
        }
    }

    #[test]
    fn aggregation_interval_float() {
        let agg1 = Interval((Float(1.0), Float(0.0), Float(10.0), 1));
        let agg2 = Interval((Float(4.0), Float(0.0), Float(10.0), 1));
        let agg3 = Interval((Float(6.0), Float(0.0), Float(10.0), 1));
        let agg4 = Interval((Float(10.0), Float(0.0), Float(10.0), 1));

        let aggregation = AggregateType::Mean(vec![agg1, agg2, agg3, agg4]).aggregate();

        if let Float(value) = aggregation.extract_value() {
            assert_eq!(value, 5.25)
        } else {
            panic!()
        }
    }

    #[test]
    fn aggregation_ordinal() {
        let agg1 = Ordinal((1, 10, 1));
        let agg2 = Ordinal((1, 10, 1));
        let agg3 = Ordinal((2, 10, 1));
        let agg4 = Ordinal((4, 10, 1));

        let aggregation = AggregateType::Mode(vec![agg1, agg2, agg3, agg4]).aggregate();

        if let Integer(value) = aggregation.extract_value() {
            assert_eq!(value, 1)
        } else {
            panic!()
        }
    }

    #[test]
    fn aggregation_nominal() {
        let agg1 = Nominal((1, 4, 10));
        let agg2 = Nominal((1, 4, 10));
        let agg3 = Nominal((2, 4, 10));
        let agg4 = Nominal((4, 4, 10));

        let aggregation = AggregateType::Mode(vec![agg1, agg2, agg3, agg4]).aggregate();

        if let Integer(value) = aggregation.extract_value() {
            assert_eq!(value, 1)
        } else {
            panic!()
        }
    }
}
459}