diff_priv/
lib.rs

1//! # DiffPriv
2//! DiffPriv is a differential privacy framework for real-time data streaming written in Rust, supporting k-anonymity,
3//! (c,l)-diversity and ε-differential privacy. The framework is based on the [Preserving Differential Privacy and Utility of Non-stationary Data Streams](https://ieeexplore.ieee.org/document/8637412) paper, with various improvements implemented.
4//!
5//! This library is the result of my master's thesis: Differential privacy in large scale data streaming.
6//! It was developed during an internship at [STRM Privacy](https://strmprivacy.io/).
7//!
8//! # Using the anonymizer
9//! An example of using the anonymizer can be seen below
10//! ```
11//! use csv::Reader;
12//! use diff_priv::anonymization::microagg_anonymizer::MicroaggAnonymizer;
13//! use diff_priv::noise::laplace::laplace_noiser::LaplaceNoiser;
14//! use diff_priv::test::adult::Adult;
15//! use diff_priv::test::dummy_publisher::DummyPublisher;
16//!
17//! // we initialize our noiser that implements the `Noiser` trait
18//! let noiser = LaplaceNoiser::new(0.1, 3, 0.1);
19//! // we initialize a publisher that implements the `Publisher` trait
20//! let publisher = DummyPublisher::default();
21//! // we create the anonymizer with the desired parameters
22//! // k: 2 | k_max: 10 | c: 2 | l: 7 | diff_thres: 0.1 | delta: 10 | buff_size: 5
23//! let mut anonymizer: MicroaggAnonymizer<LaplaceNoiser, Adult, DummyPublisher> =
24//!     MicroaggAnonymizer::new(2, 10, 2, 7, 0.1, 10, 5, publisher, noiser);
25//!
26//! // load CSV file representing an Adult
27//! let mut file = Reader::from_path("datasets/Adult_1_numeric_only_class_50K.csv").unwrap();
28//! for line in file.deserialize() {
29//!     let row_result = line;
30//!     // when we call `anonymize()` the anonymizer will
31//!     // automatically publish to the given backend when the given
32//!     // privacy parameter conditions are met
33//!     match row_result {
34//!         Ok(row) => anonymizer.anonymize(row),
35//!         Err(e) => panic!("{}", e)
36//!     }
37//!  }
38//!
39//! // publish remaining data tuples to the given publisher
40//! // in this case a `DummyPublisher`
41//! anonymizer
42//!     .cluster_set
43//!     .into_iter()
44//!     .for_each(|(_, mut cluster)| {
45//!         cluster.publish_all(&mut anonymizer.publisher, &mut anonymizer.analysers)
46//! });
47//! ```
48//! ## Implementing `Anonymizable` trait to anonymize new data
49//! By implementing the `Anonymizable` trait on any type of data structure, DiffPriv will know how to anonymize it.
50//! The following QI (quasi-identifier) types are implemented:
51//! ```rust
52//! # use diff_priv::data_manipulation::anonymizable::QuasiIdentifierType;
53//!  /// value, min_value, max_value, weight of attribute
54//! pub type IntervalType = (
55//!     QuasiIdentifierType,
56//!     QuasiIdentifierType,
57//!     QuasiIdentifierType,
58//!     usize,
59//! );
60//!
61//! /// rank, max_rank, weight of attribute
62//! pub type OrdinalType = (i32, i32, usize);
63//!
64//! /// value, max value, weight of attribute
65//! pub type NominalType = (i32, i32, usize);
66//! ```
67//! An example implementation of the `Anonymizable` trait can be seen below
68//! ```
69//! use std::time::{SystemTime, UNIX_EPOCH};
70//! use serde::{Serialize, Deserialize};
71//! use bimap::BiMap;
72//! use lazy_static::lazy_static;
73//! use uuid::Uuid;
74//!
75//! use diff_priv::data_manipulation::anonymizable::{
76//!     Anonymizable, QuasiIdentifierType, QuasiIdentifierTypes, SensitiveAttribute,
77//! };
78//!
79//! lazy_static! {
80//!     static ref CLASS_BIMAP: BiMap<&'static str, i32> =
81//!         BiMap::from_iter(vec![("<=50K", 0), (">50K", 1),]);
82//! }
83//!
84//! // This is the datastructure that we are going to anonymize
85//! #[derive(Debug, Serialize, Clone, Deserialize)]
86//! pub struct Adult {
87//!     timestamp: i32,
88//!     age: i32,
89//!     capital_gain: i32,
90//!     capital_loss: i32,
91//!     class: String,
92//!     #[serde(skip_deserializing, default = "default_time")]
93//!     time_generated: SystemTime,
94//! }
95//!
96//! fn default_time() -> SystemTime {
97//!     SystemTime::now()
98//! }
99//!
100//! impl Default for Adult {
101//!     fn default() -> Self {
102//!         Self {
103//!             timestamp: 0,
104//!             age: 0,
105//!             capital_gain: 0,
106//!             capital_loss: 0,
107//!             class: "".to_string(),
108//!             time_generated: SystemTime::now(),
109//!         }
110//!     }
111//! }
112//!
113//! impl Adult {
114//!     // here we extract an Interval QI from the `age` attribute
115//!     fn get_age_qi(&self) -> QuasiIdentifierTypes {
116//!         QuasiIdentifierTypes::Interval((
117//!             QuasiIdentifierType::Integer(self.age),
118//!             QuasiIdentifierType::Integer(1),
119//!             QuasiIdentifierType::Integer(100),
120//!             1,
121//!         ))
122//!     }
123//!
124//!     // here we extract an Interval QI from the `capital_gain` attribute
125//!     fn get_capital_gain_qi(&self) -> QuasiIdentifierTypes {
126//!         QuasiIdentifierTypes::Interval((
127//!             QuasiIdentifierType::Integer(self.capital_gain),
128//!             QuasiIdentifierType::Integer(0),
129//!             QuasiIdentifierType::Integer(100000),
130//!             1,
131//!         ))
132//!     }
133//!
134//!     // here we extract an Interval QI from the `capital_loss` attribute
135//!     fn get_capital_loss_qi(&self) -> QuasiIdentifierTypes {
136//!         QuasiIdentifierTypes::Interval((
137//!             QuasiIdentifierType::Integer(self.capital_loss),
138//!             QuasiIdentifierType::Integer(0),
139//!             QuasiIdentifierType::Integer(5000),
140//!             1,
141//!         ))
142//!     }
143//!
144//! }
145//!
146//! // Here we implement the `Anonymizable` trait
147//! impl Anonymizable for Adult {
148//!     // We extract the QIs from the datastructure and return a `vec` of QIs
149//!     fn quasi_identifiers(&self) -> Vec<QuasiIdentifierTypes> {
150//!         let age = self.get_age_qi();
151//!         let capital_gain = self.get_capital_gain_qi();
152//!         let capital_loss = self.get_capital_loss_qi();
153//!
154//!         vec![
155//!             age,
156//!             capital_gain,
157//!             capital_loss,
158//!         ]
159//!     }
160//!     
161//!     // We update the data structure's QIs with a `vec` of QIs. The `vec` holds the
162//!     // QIs in the same order as returned by `quasi_identifiers`, so `pop()` yields
163//!     // them in reverse (last QI first)
164//!     fn update_quasi_identifiers(&self, mut qi: Vec<QuasiIdentifierTypes>) -> Self {
165//!         if let (
166//!             QuasiIdentifierType::Integer(capital_loss),
167//!             QuasiIdentifierType::Integer(capital_gain),
168//!             QuasiIdentifierType::Integer(age),
169//!         ) = (
170//!             qi.pop().unwrap().extract_value(),
171//!             qi.pop().unwrap().extract_value(),
172//!             qi.pop().unwrap().extract_value(),
173//!         ) {
174//!             Self {
175//!                 timestamp: self.timestamp,
176//!                 age,
177//!                 capital_gain,
178//!                 capital_loss,
179//!                 class: self.class.to_owned(),
180//!                 time_generated: self.time_generated,
181//!             }
182//!         } else {
183//!             panic!("Couldn't Adult with QI's")
184//!         }
185//!     }
186//!     
187//!     // We extract the sensitive attribute from the data structure
188//!     fn sensitive_value(&self) -> SensitiveAttribute {
189//!         SensitiveAttribute::String(self.class.to_owned())
190//!     }
191//!
192//!     // We return a vector of strings containing the String version of the QIs
193//!     // Used for printing to CSVs
194//!     fn extract_string_values(&self, uuid: Uuid, dr: f64) -> Vec<String> {
195//!         vec![
196//!             uuid.to_string(),
197//!             dr.to_string(),
198//!             self.timestamp.to_string(),
199//!             self.age.to_string(),
200//!             self.capital_gain.to_string(),
201//!             self.capital_loss.to_string(),
202//!             self.class.to_owned(),
203//!         ]
204//!     }
205//!
206//!     fn get_timestamp(&self) -> SystemTime {
207//!         self.time_generated
208//!     }
209//! }
210//! ```
211//!
212//! # The `Publisher` trait
213//! To publish an anonymized struct to a desired backend we use the `Publisher` trait.
214//! DiffPriv also supports exporting to an [Apache Kafka topic](publishing::kafka_publisher::KafkaPublisher). This can be seen in the `publishing` directory.
215//! An example publisher for CSVs can be seen here: [CsvPublisher](publishing::csv_publisher::CsvPublisher).
216//! To implement a custom publishing backend one can use the [Publisher](publishing::publisher::Publisher) trait.
217//!
218//! # The `Noiser` trait
219//! DiffPriv supports [Laplace noise](noise::laplace::laplace_noiser::LaplaceNoiser) for ε-differential privacy.
220//! The noiser supports two different kinds of noise: one for [numerical values](noise::laplace::numerical_noiser::NumericalNoiser) and one for [categorical values](noise::laplace::categorical_noiser::CategoricalNoiser).
221//! To implement a custom implementation of ε-differential privacy noise, one can use the [Noiser](noise::noiser::Noiser) trait.
222
223#[macro_use]
224extern crate serde;
225#[macro_use]
226extern crate log;
227#[macro_use]
228extern crate lazy_static;
229
230extern crate core;
231extern crate pretty_env_logger;
232
233pub mod analysis;
234pub mod anonymization;
235pub mod config;
236pub mod data_manipulation;
237pub mod kafka;
238pub mod noise;
239pub mod publishing;
240pub mod test;
241pub mod vec_set;