// File: pacha/data/datasheet.rs

//! Datasheet for standardized dataset documentation.
//!
//! Based on "Datasheets for Datasets" (Gebru et al., 2021).
4
5use chrono::{DateTime, Utc};
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9/// Datasheet with standardized dataset documentation.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct Datasheet {
12    // Motivation
13    /// Purpose of the dataset.
14    pub purpose: String,
15    /// Creators of the dataset.
16    #[serde(default)]
17    pub creators: Vec<String>,
18    /// Funding source.
19    #[serde(skip_serializing_if = "Option::is_none")]
20    pub funding: Option<String>,
21
22    // Composition
23    /// Number of instances.
24    #[serde(skip_serializing_if = "Option::is_none")]
25    pub instance_count: Option<u64>,
26    /// Feature descriptions.
27    #[serde(default)]
28    pub features: HashMap<String, FeatureInfo>,
29    /// Sensitive features that require special handling.
30    #[serde(default)]
31    pub sensitive_features: Vec<String>,
32
33    // Collection process
34    /// How the data was collected.
35    #[serde(skip_serializing_if = "Option::is_none")]
36    pub collection_method: Option<String>,
37    /// When the data collection started.
38    #[serde(skip_serializing_if = "Option::is_none")]
39    pub collection_start: Option<DateTime<Utc>>,
40    /// When the data collection ended.
41    #[serde(skip_serializing_if = "Option::is_none")]
42    pub collection_end: Option<DateTime<Utc>>,
43    /// Preprocessing steps applied.
44    #[serde(default)]
45    pub preprocessing: Vec<PreprocessingStep>,
46
47    // Distribution
48    /// License for the dataset.
49    #[serde(skip_serializing_if = "Option::is_none")]
50    pub license: Option<String>,
51    /// Access restrictions.
52    #[serde(default)]
53    pub access_restrictions: Vec<String>,
54
55    // Maintenance
56    /// Who maintains the dataset.
57    #[serde(skip_serializing_if = "Option::is_none")]
58    pub maintainer: Option<String>,
59    /// How often the dataset is updated.
60    #[serde(skip_serializing_if = "Option::is_none")]
61    pub update_frequency: Option<String>,
62    /// Deprecation policy.
63    #[serde(skip_serializing_if = "Option::is_none")]
64    pub deprecation_policy: Option<String>,
65
66    /// Additional metadata.
67    #[serde(default)]
68    pub extra: HashMap<String, serde_json::Value>,
69}
70
71/// Information about a feature in the dataset.
72#[derive(Debug, Clone, Serialize, Deserialize)]
73pub struct FeatureInfo {
74    /// Data type of the feature.
75    pub dtype: String,
76    /// Description of the feature.
77    #[serde(skip_serializing_if = "Option::is_none")]
78    pub description: Option<String>,
79    /// Whether the feature can be null.
80    #[serde(default)]
81    pub nullable: bool,
82    /// Statistics about the feature.
83    #[serde(default)]
84    pub statistics: HashMap<String, f64>,
85}
86
87impl FeatureInfo {
88    /// Create a new feature info.
89    #[must_use]
90    pub fn new(dtype: impl Into<String>) -> Self {
91        Self { dtype: dtype.into(), description: None, nullable: false, statistics: HashMap::new() }
92    }
93
94    /// Set description.
95    #[must_use]
96    pub fn with_description(mut self, description: impl Into<String>) -> Self {
97        self.description = Some(description.into());
98        self
99    }
100
101    /// Set nullable.
102    #[must_use]
103    pub fn with_nullable(mut self, nullable: bool) -> Self {
104        self.nullable = nullable;
105        self
106    }
107}
108
109/// A preprocessing step applied to the data.
110#[derive(Debug, Clone, Serialize, Deserialize)]
111pub struct PreprocessingStep {
112    /// Name of the step.
113    pub name: String,
114    /// Description of what the step does.
115    #[serde(skip_serializing_if = "Option::is_none")]
116    pub description: Option<String>,
117    /// Parameters used.
118    #[serde(default)]
119    pub parameters: HashMap<String, serde_json::Value>,
120}
121
122impl PreprocessingStep {
123    /// Create a new preprocessing step.
124    #[must_use]
125    pub fn new(name: impl Into<String>) -> Self {
126        Self { name: name.into(), description: None, parameters: HashMap::new() }
127    }
128
129    /// Set description.
130    #[must_use]
131    pub fn with_description(mut self, description: impl Into<String>) -> Self {
132        self.description = Some(description.into());
133        self
134    }
135}
136
137impl Datasheet {
138    /// Create a new datasheet builder.
139    #[must_use]
140    pub fn builder() -> DatasheetBuilder {
141        DatasheetBuilder::new()
142    }
143
144    /// Create a minimal datasheet with just a purpose.
145    #[must_use]
146    pub fn new(purpose: impl Into<String>) -> Self {
147        Self {
148            purpose: purpose.into(),
149            creators: Vec::new(),
150            funding: None,
151            instance_count: None,
152            features: HashMap::new(),
153            sensitive_features: Vec::new(),
154            collection_method: None,
155            collection_start: None,
156            collection_end: None,
157            preprocessing: Vec::new(),
158            license: None,
159            access_restrictions: Vec::new(),
160            maintainer: None,
161            update_frequency: None,
162            deprecation_policy: None,
163            extra: HashMap::new(),
164        }
165    }
166
167    /// Add a feature.
168    pub fn add_feature(&mut self, name: impl Into<String>, info: FeatureInfo) {
169        self.features.insert(name.into(), info);
170    }
171
172    /// Add a preprocessing step.
173    pub fn add_preprocessing(&mut self, step: PreprocessingStep) {
174        self.preprocessing.push(step);
175    }
176}
177
178impl Default for Datasheet {
179    fn default() -> Self {
180        Self::new("")
181    }
182}
183
184/// Builder for creating datasheets.
185#[derive(Debug, Default)]
186pub struct DatasheetBuilder {
187    sheet: Datasheet,
188}
189
190impl DatasheetBuilder {
191    /// Create a new builder.
192    #[must_use]
193    pub fn new() -> Self {
194        Self { sheet: Datasheet::default() }
195    }
196
197    /// Set the purpose.
198    #[must_use]
199    pub fn purpose(mut self, purpose: impl Into<String>) -> Self {
200        self.sheet.purpose = purpose.into();
201        self
202    }
203
204    /// Set creators.
205    #[must_use]
206    pub fn creators<I, S>(mut self, creators: I) -> Self
207    where
208        I: IntoIterator<Item = S>,
209        S: Into<String>,
210    {
211        self.sheet.creators = creators.into_iter().map(Into::into).collect();
212        self
213    }
214
215    /// Set funding.
216    #[must_use]
217    pub fn funding(mut self, funding: impl Into<String>) -> Self {
218        self.sheet.funding = Some(funding.into());
219        self
220    }
221
222    /// Set instance count.
223    #[must_use]
224    pub fn instance_count(mut self, count: u64) -> Self {
225        self.sheet.instance_count = Some(count);
226        self
227    }
228
229    /// Add a feature.
230    #[must_use]
231    pub fn feature(mut self, name: impl Into<String>, info: FeatureInfo) -> Self {
232        self.sheet.features.insert(name.into(), info);
233        self
234    }
235
236    /// Set sensitive features.
237    #[must_use]
238    pub fn sensitive_features<I, S>(mut self, features: I) -> Self
239    where
240        I: IntoIterator<Item = S>,
241        S: Into<String>,
242    {
243        self.sheet.sensitive_features = features.into_iter().map(Into::into).collect();
244        self
245    }
246
247    /// Set collection method.
248    #[must_use]
249    pub fn collection_method(mut self, method: impl Into<String>) -> Self {
250        self.sheet.collection_method = Some(method.into());
251        self
252    }
253
254    /// Set license.
255    #[must_use]
256    pub fn license(mut self, license: impl Into<String>) -> Self {
257        self.sheet.license = Some(license.into());
258        self
259    }
260
261    /// Set maintainer.
262    #[must_use]
263    pub fn maintainer(mut self, maintainer: impl Into<String>) -> Self {
264        self.sheet.maintainer = Some(maintainer.into());
265        self
266    }
267
268    /// Build the datasheet.
269    #[must_use]
270    pub fn build(self) -> Datasheet {
271        self.sheet
272    }
273}
274
#[cfg(test)]
mod tests {
    use super::*;

    /// `Datasheet::new` stores the purpose and starts with no features.
    #[test]
    fn test_datasheet_new() {
        let purpose = "Customer transactions for fraud detection";
        let sheet = Datasheet::new(purpose);

        assert_eq!(sheet.purpose, purpose);
        assert!(sheet.features.is_empty());
    }

    /// Values passed through the builder end up on the built datasheet.
    #[test]
    fn test_datasheet_builder() {
        let amount = FeatureInfo::new("float64").with_description("Transaction amount in USD");
        let timestamp = FeatureInfo::new("datetime").with_nullable(true);

        let sheet = Datasheet::builder()
            .purpose("Training data for fraud detection")
            .creators(["Alice", "Bob"])
            .instance_count(1_000_000)
            .feature("amount", amount)
            .feature("timestamp", timestamp)
            .sensitive_features(["customer_id", "card_number"])
            .license("MIT")
            .maintainer("data-team@company.com")
            .build();

        assert_eq!(sheet.purpose, "Training data for fraud detection");
        assert_eq!(sheet.creators, ["Alice", "Bob"]);
        assert_eq!(sheet.instance_count, Some(1_000_000));
        assert_eq!(sheet.features.len(), 2);
        assert!(sheet.features.contains_key("amount"));
        assert_eq!(sheet.sensitive_features.len(), 2);
        assert_eq!(sheet.license.as_deref(), Some("MIT"));
    }

    /// The `FeatureInfo` fluent setters store what they are given.
    #[test]
    fn test_feature_info() {
        let info = FeatureInfo::new("int64")
            .with_description("User ID")
            .with_nullable(false);

        assert_eq!(info.dtype, "int64");
        assert_eq!(info.description.as_deref(), Some("User ID"));
        assert!(!info.nullable);
    }

    /// `PreprocessingStep` keeps its name and description.
    #[test]
    fn test_preprocessing_step() {
        let step = PreprocessingStep::new("normalize").with_description("Min-max normalization");

        assert_eq!(step.name, "normalize");
        assert_eq!(step.description.as_deref(), Some("Min-max normalization"));
    }

    /// The in-place `add_*` methods grow the corresponding collections.
    #[test]
    fn test_datasheet_add_methods() {
        let mut sheet = Datasheet::new("Test dataset");

        sheet.add_feature("col1", FeatureInfo::new("string"));
        sheet.add_preprocessing(PreprocessingStep::new("clean"));

        assert_eq!(sheet.features.len(), 1);
        assert_eq!(sheet.preprocessing.len(), 1);
    }

    /// A datasheet survives a JSON round trip with purpose and count intact.
    #[test]
    fn test_datasheet_serialization() {
        let original = Datasheet::builder().purpose("Test").instance_count(100).build();

        let json = serde_json::to_string(&original).expect("datasheet should serialize to JSON");
        let restored: Datasheet =
            serde_json::from_str(&json).expect("serialized datasheet should deserialize");

        assert_eq!(original.purpose, restored.purpose);
        assert_eq!(original.instance_count, restored.instance_count);
    }
}