1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
//! # DiffPriv
//! DiffPriv is a differential privacy framework for real-time data streaming, written in Rust. It supports k-anonymity,
//! (c,l)-diversity and ε-differential privacy. The framework is based on the paper [Preserving Differential Privacy and Utility of Non-stationary Data Streams](https://ieeexplore.ieee.org/document/8637412), with various improvements implemented.
//!
//! This library is the result of my master's thesis: Differential privacy in large scale data streaming.
//! It was developed during an internship at [STRM Privacy](https://strmprivacy.io/).
//!
//! # Using the anonymizer
//! An example of using the anonymizer can be seen below
//! ```
//! use csv::Reader;
//! use diff_priv::anonymization::microagg_anonymizer::MicroaggAnonymizer;
//! use diff_priv::noise::laplace::laplace_noiser::LaplaceNoiser;
//! use diff_priv::test::adult::Adult;
//! use diff_priv::test::dummy_publisher::DummyPublisher;
//!
//! // we initialize our noiser that implements the `Noiser` trait
//! let noiser = LaplaceNoiser::new(0.1, 3, 0.1);
//! // we initialize a publisher that implements the `Publisher` trait
//! let publisher = DummyPublisher::default();
//! // we create the anonymizer with the desired parameters
//! // k: 2 | k_max: 10 | c: 2 | l: 7 | diff_thres: 0.1 | delta: 10 | buff_size: 5
//! let mut anonymizer: MicroaggAnonymizer<LaplaceNoiser, Adult, DummyPublisher> =
//! MicroaggAnonymizer::new(2, 10, 2, 7, 0.1, 10, 5, publisher, noiser);
//!
//! // load CSV file representing an Adult
//! let mut file = Reader::from_path("datasets/Adult_1_numeric_only_class_50K.csv").unwrap();
//! for line in file.deserialize() {
//! let row_result = line;
//! // when we call `anonymize()` the anonymizer will
//! // automatically publish to the given backend when the given
//! // privacy parameter conditions are met
//! match row_result {
//! Ok(row) => anonymizer.anonymize(row),
//! Err(e) => panic!("{}", e)
//! }
//! }
//!
//! // publish remaining data tuples to the given publisher
//! // in this case a `DummyPublisher`
//! anonymizer
//! .cluster_set
//! .into_iter()
//! .for_each(|(_, mut cluster)| {
//! cluster.publish_all(&mut anonymizer.publisher, &mut anonymizer.analysers)
//! });
//! ```
//! ## Implementing `Anonymizable` trait to anonymize new data
//! By implementing the `Anonymizable` trait on any type of data structure, DiffPriv will know how to anonymize it.
//! The following QI types are implemented:
//! ```rust
//! # use diff_priv::data_manipulation::anonymizable::QuasiIdentifierType;
//! /// value, min_value, max_value, weight of attribute
//! pub type IntervalType = (
//! QuasiIdentifierType,
//! QuasiIdentifierType,
//! QuasiIdentifierType,
//! usize,
//! );
//!
//! /// rank, max_rank, weight of attribute
//! pub type OrdinalType = (i32, i32, usize);
//!
//! /// value, max value, weight of attribute
//! pub type NominalType = (i32, i32, usize);
//! ```
//! An example implementation of the `Anonymizable` trait can be seen below
//! ```
//! use std::time::{SystemTime, UNIX_EPOCH};
//! use serde::{Serialize, Deserialize};
//! use bimap::BiMap;
//! use lazy_static::lazy_static;
//! use uuid::Uuid;
//!
//! use diff_priv::data_manipulation::anonymizable::{
//! Anonymizable, QuasiIdentifierType, QuasiIdentifierTypes, SensitiveAttribute,
//! };
//!
//! lazy_static! {
//! static ref CLASS_BIMAP: BiMap<&'static str, i32> =
//! BiMap::from_iter(vec![("<=50K", 0), (">50K", 1),]);
//! }
//!
//! // This is the data structure that we are going to anonymize
//! #[derive(Debug, Serialize, Clone, Deserialize)]
//! pub struct Adult {
//! timestamp: i32,
//! age: i32,
//! capital_gain: i32,
//! capital_loss: i32,
//! class: String,
//! #[serde(skip_deserializing, default = "default_time")]
//! time_generated: SystemTime,
//! }
//!
//! fn default_time() -> SystemTime {
//! SystemTime::now()
//! }
//!
//! impl Default for Adult {
//! fn default() -> Self {
//! Self {
//! timestamp: 0,
//! age: 0,
//! capital_gain: 0,
//! capital_loss: 0,
//! class: "".to_string(),
//! time_generated: SystemTime::now(),
//! }
//! }
//! }
//!
//! impl Adult {
//! // here we extract an Interval QI from the `age` attribute
//! fn get_age_qi(&self) -> QuasiIdentifierTypes {
//! QuasiIdentifierTypes::Interval((
//! QuasiIdentifierType::Integer(self.age),
//! QuasiIdentifierType::Integer(1),
//! QuasiIdentifierType::Integer(100),
//! 1,
//! ))
//! }
//!
//! // here we extract an Interval QI from the `capital_gain` attribute
//! fn get_capital_gain_qi(&self) -> QuasiIdentifierTypes {
//! QuasiIdentifierTypes::Interval((
//! QuasiIdentifierType::Integer(self.capital_gain),
//! QuasiIdentifierType::Integer(0),
//! QuasiIdentifierType::Integer(100000),
//! 1,
//! ))
//! }
//!
//! // here we extract an Interval QI from the `capital_loss` attribute
//! fn get_capital_loss_qi(&self) -> QuasiIdentifierTypes {
//! QuasiIdentifierTypes::Interval((
//! QuasiIdentifierType::Integer(self.capital_loss),
//! QuasiIdentifierType::Integer(0),
//! QuasiIdentifierType::Integer(5000),
//! 1,
//! ))
//! }
//!
//! }
//!
//! // Here we implement the `Anonymizable` trait
//! impl Anonymizable for Adult {
//! // We extract the QIs from the data structure and return a `vec` of QIs
//! fn quasi_identifiers(&self) -> Vec<QuasiIdentifierTypes> {
//! let age = self.get_age_qi();
//! let capital_gain = self.get_capital_gain_qi();
//! let capital_loss = self.get_capital_loss_qi();
//!
//! vec![
//! age,
//! capital_gain,
//! capital_loss,
//! ]
//! }
//!
//! // We update the data structure's QIs with a `vec` of QIs. Because `pop` takes values
//! // from the end, they come back in the reverse of the order in which the
//! // `quasi_identifiers` function returns them (here: capital_loss, capital_gain, age)
//! fn update_quasi_identifiers(&self, mut qi: Vec<QuasiIdentifierTypes>) -> Self {
//! if let (
//! QuasiIdentifierType::Integer(capital_loss),
//! QuasiIdentifierType::Integer(capital_gain),
//! QuasiIdentifierType::Integer(age),
//! ) = (
//! qi.pop().unwrap().extract_value(),
//! qi.pop().unwrap().extract_value(),
//! qi.pop().unwrap().extract_value(),
//! ) {
//! Self {
//! timestamp: self.timestamp,
//! age,
//! capital_gain,
//! capital_loss,
//! class: self.class.to_owned(),
//! time_generated: self.time_generated,
//! }
//! } else {
//! panic!("Couldn't Adult with QI's")
//! }
//! }
//!
//! // We extract the sensitive attribute from the data structure
//! fn sensitive_value(&self) -> SensitiveAttribute {
//! SensitiveAttribute::String(self.class.to_owned())
//! }
//!
//! // We return a vector of strings containing the String version of the QIs
//! // Used for printing to CSVs
//! fn extract_string_values(&self, uuid: Uuid, dr: f64) -> Vec<String> {
//! vec![
//! uuid.to_string(),
//! dr.to_string(),
//! self.timestamp.to_string(),
//! self.age.to_string(),
//! self.capital_gain.to_string(),
//! self.capital_loss.to_string(),
//! self.class.to_owned(),
//! ]
//! }
//!
//! fn get_timestamp(&self) -> SystemTime {
//! self.time_generated
//! }
//! }
//! ```
//!
//! # The `Publisher` trait
//! To publish an anonymized struct to a desired backend we use the `Publisher` trait.
//! DiffPriv also supports exporting to an [Apache Kafka topic](publishing::kafka_publisher::KafkaPublisher). This can be seen in the `publishing` directory.
//! An example publisher for CSVs can be seen here: [CsvPublisher](publishing::csv_publisher::CsvPublisher).
//! To implement a custom publishing backend one can use the [Publisher](publishing::publisher::Publisher) trait.
//!
//! # The `Noiser` trait
//! DiffPriv supports [Laplace noise](noise::laplace::laplace_noiser::LaplaceNoiser) for ε-differential privacy.
//! The noiser supports two different kinds of noise: one for [numerical values](noise::laplace::numerical_noiser::NumericalNoiser) and one for [categorical values](noise::laplace::categorical_noiser::CategoricalNoiser).
//! To implement a custom implementation of ε-differential privacy noise, one can use the [Noiser](noise::noiser::Noiser) trait.
extern crate serde;
extern crate log;
extern crate lazy_static;
extern crate core;
extern crate pretty_env_logger;