1use chrono::{DateTime, Utc};
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
/// Structured metadata record describing a dataset.
///
/// The type derives `Serialize`/`Deserialize`, so the field names, their order
/// and the serde attributes below define the serialized shape. `purpose` is
/// the only field that is neither an `Option` nor marked `#[serde(default)]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Datasheet {
    /// Free-form statement of what the dataset is for.
    pub purpose: String,
    /// Names of the dataset's creators; an empty list is used when the key is
    /// missing from deserialized input.
    #[serde(default)]
    pub creators: Vec<String>,
    /// Optional funding information; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub funding: Option<String>,

    /// Optional instance count; left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub instance_count: Option<u64>,
    /// Per-feature metadata keyed by feature name; an empty map is used when
    /// the key is missing from deserialized input.
    #[serde(default)]
    pub features: HashMap<String, FeatureInfo>,
    /// Names of features flagged as sensitive; defaults to an empty list.
    ///
    /// Nothing in this file checks that these names also appear in `features`.
    #[serde(default)]
    pub sensitive_features: Vec<String>,

    /// Optional free-form description of the collection method; left out of
    /// serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub collection_method: Option<String>,
    /// Optional UTC timestamp, presumably the start of the collection period;
    /// left out of serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub collection_start: Option<DateTime<Utc>>,
    /// Optional UTC timestamp, presumably the end of the collection period;
    /// left out of serialized output when `None`.
    ///
    /// No code in this file enforces any ordering relative to `collection_start`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub collection_end: Option<DateTime<Utc>>,
    /// Preprocessing steps in insertion order; defaults to an empty list.
    #[serde(default)]
    pub preprocessing: Vec<PreprocessingStep>,

    /// Optional free-form license string; left out of serialized output when
    /// `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub license: Option<String>,
    /// Free-form access restrictions; defaults to an empty list.
    #[serde(default)]
    pub access_restrictions: Vec<String>,

    /// Optional free-form maintainer string; left out of serialized output
    /// when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub maintainer: Option<String>,
    /// Optional free-form update frequency; left out of serialized output when
    /// `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub update_frequency: Option<String>,
    /// Optional free-form deprecation policy; left out of serialized output
    /// when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub deprecation_policy: Option<String>,

    /// Arbitrary additional JSON values keyed by string; defaults to an empty
    /// map. The field is not flattened, so it serializes as a nested `extra`
    /// object.
    #[serde(default)]
    pub extra: HashMap<String, serde_json::Value>,
}
70
/// Metadata describing a single feature of a dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureInfo {
    /// Data type name as a free-form string (for example `"float64"`).
    pub dtype: String,
    /// Optional human-readable description; left out of serialized output when
    /// `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// Nullability flag; `false` is used when the key is missing from
    /// deserialized input.
    #[serde(default)]
    pub nullable: bool,
    /// Named numeric statistics; defaults to an empty map.
    ///
    /// None of the constructor or `with_*` methods in this file populate it.
    #[serde(default)]
    pub statistics: HashMap<String, f64>,
}
86
87impl FeatureInfo {
88 #[must_use]
90 pub fn new(dtype: impl Into<String>) -> Self {
91 Self { dtype: dtype.into(), description: None, nullable: false, statistics: HashMap::new() }
92 }
93
94 #[must_use]
96 pub fn with_description(mut self, description: impl Into<String>) -> Self {
97 self.description = Some(description.into());
98 self
99 }
100
101 #[must_use]
103 pub fn with_nullable(mut self, nullable: bool) -> Self {
104 self.nullable = nullable;
105 self
106 }
107}
108
/// A single named preprocessing step recorded on a [`Datasheet`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PreprocessingStep {
    /// Name of the step.
    pub name: String,
    /// Optional human-readable description; left out of serialized output when
    /// `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub description: Option<String>,
    /// Step parameters as arbitrary JSON values keyed by string; defaults to
    /// an empty map.
    ///
    /// None of the constructor or `with_*` methods in this file populate it.
    #[serde(default)]
    pub parameters: HashMap<String, serde_json::Value>,
}
121
122impl PreprocessingStep {
123 #[must_use]
125 pub fn new(name: impl Into<String>) -> Self {
126 Self { name: name.into(), description: None, parameters: HashMap::new() }
127 }
128
129 #[must_use]
131 pub fn with_description(mut self, description: impl Into<String>) -> Self {
132 self.description = Some(description.into());
133 self
134 }
135}
136
137impl Datasheet {
138 #[must_use]
140 pub fn builder() -> DatasheetBuilder {
141 DatasheetBuilder::new()
142 }
143
144 #[must_use]
146 pub fn new(purpose: impl Into<String>) -> Self {
147 Self {
148 purpose: purpose.into(),
149 creators: Vec::new(),
150 funding: None,
151 instance_count: None,
152 features: HashMap::new(),
153 sensitive_features: Vec::new(),
154 collection_method: None,
155 collection_start: None,
156 collection_end: None,
157 preprocessing: Vec::new(),
158 license: None,
159 access_restrictions: Vec::new(),
160 maintainer: None,
161 update_frequency: None,
162 deprecation_policy: None,
163 extra: HashMap::new(),
164 }
165 }
166
167 pub fn add_feature(&mut self, name: impl Into<String>, info: FeatureInfo) {
169 self.features.insert(name.into(), info);
170 }
171
172 pub fn add_preprocessing(&mut self, step: PreprocessingStep) {
174 self.preprocessing.push(step);
175 }
176}
177
178impl Default for Datasheet {
179 fn default() -> Self {
180 Self::new("")
181 }
182}
183
/// Builder that assembles a [`Datasheet`] through chained, by-value setter
/// calls.
#[derive(Debug, Default)]
pub struct DatasheetBuilder {
    /// The datasheet under construction; handed back unchanged by `build`.
    sheet: Datasheet,
}
189
190impl DatasheetBuilder {
191 #[must_use]
193 pub fn new() -> Self {
194 Self { sheet: Datasheet::default() }
195 }
196
197 #[must_use]
199 pub fn purpose(mut self, purpose: impl Into<String>) -> Self {
200 self.sheet.purpose = purpose.into();
201 self
202 }
203
204 #[must_use]
206 pub fn creators<I, S>(mut self, creators: I) -> Self
207 where
208 I: IntoIterator<Item = S>,
209 S: Into<String>,
210 {
211 self.sheet.creators = creators.into_iter().map(Into::into).collect();
212 self
213 }
214
215 #[must_use]
217 pub fn funding(mut self, funding: impl Into<String>) -> Self {
218 self.sheet.funding = Some(funding.into());
219 self
220 }
221
222 #[must_use]
224 pub fn instance_count(mut self, count: u64) -> Self {
225 self.sheet.instance_count = Some(count);
226 self
227 }
228
229 #[must_use]
231 pub fn feature(mut self, name: impl Into<String>, info: FeatureInfo) -> Self {
232 self.sheet.features.insert(name.into(), info);
233 self
234 }
235
236 #[must_use]
238 pub fn sensitive_features<I, S>(mut self, features: I) -> Self
239 where
240 I: IntoIterator<Item = S>,
241 S: Into<String>,
242 {
243 self.sheet.sensitive_features = features.into_iter().map(Into::into).collect();
244 self
245 }
246
247 #[must_use]
249 pub fn collection_method(mut self, method: impl Into<String>) -> Self {
250 self.sheet.collection_method = Some(method.into());
251 self
252 }
253
254 #[must_use]
256 pub fn license(mut self, license: impl Into<String>) -> Self {
257 self.sheet.license = Some(license.into());
258 self
259 }
260
261 #[must_use]
263 pub fn maintainer(mut self, maintainer: impl Into<String>) -> Self {
264 self.sheet.maintainer = Some(maintainer.into());
265 self
266 }
267
268 #[must_use]
270 pub fn build(self) -> Datasheet {
271 self.sheet
272 }
273}
274
#[cfg(test)]
mod tests {
    use super::*;

    /// `Datasheet::new` keeps the purpose and starts without features.
    #[test]
    fn test_datasheet_new() {
        let purpose = "Customer transactions for fraud detection";
        let datasheet = Datasheet::new(purpose);

        assert_eq!(datasheet.purpose, purpose);
        assert!(datasheet.features.is_empty());
    }

    /// A chained builder run populates every field it was asked to set.
    #[test]
    fn test_datasheet_builder() {
        let amount = FeatureInfo::new("float64").with_description("Transaction amount in USD");
        let timestamp = FeatureInfo::new("datetime").with_nullable(true);

        let built = Datasheet::builder()
            .purpose("Training data for fraud detection")
            .creators(["Alice", "Bob"])
            .instance_count(1_000_000)
            .feature("amount", amount)
            .feature("timestamp", timestamp)
            .sensitive_features(["customer_id", "card_number"])
            .license("MIT")
            .maintainer("data-team@company.com")
            .build();

        assert_eq!(built.purpose, "Training data for fraud detection");
        assert_eq!(built.creators, ["Alice", "Bob"]);
        assert_eq!(built.instance_count, Some(1_000_000));
        assert_eq!(built.features.len(), 2);
        assert!(built.features.contains_key("amount"));
        assert_eq!(built.sensitive_features.len(), 2);
        assert_eq!(built.license.as_deref(), Some("MIT"));
    }

    /// The `FeatureInfo` constructor and `with_*` methods set the expected fields.
    #[test]
    fn test_feature_info() {
        let feature = FeatureInfo::new("int64")
            .with_description("User ID")
            .with_nullable(false);

        assert_eq!(feature.dtype, "int64");
        assert_eq!(feature.description.as_deref(), Some("User ID"));
        assert!(!feature.nullable);
    }

    /// `PreprocessingStep` keeps its name and optional description.
    #[test]
    fn test_preprocessing_step() {
        let normalize = PreprocessingStep::new("normalize").with_description("Min-max normalization");

        assert_eq!(normalize.name, "normalize");
        assert_eq!(normalize.description.as_deref(), Some("Min-max normalization"));
    }

    /// The in-place `add_*` methods grow the matching collections.
    #[test]
    fn test_datasheet_add_methods() {
        let mut datasheet = Datasheet::new("Test dataset");
        datasheet.add_feature("col1", FeatureInfo::new("string"));
        datasheet.add_preprocessing(PreprocessingStep::new("clean"));

        assert_eq!(datasheet.features.len(), 1);
        assert_eq!(datasheet.preprocessing.len(), 1);
    }

    /// A JSON round trip preserves the purpose and the instance count.
    #[test]
    fn test_datasheet_serialization() {
        let original = Datasheet::builder().purpose("Test").instance_count(100).build();

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: Datasheet = serde_json::from_str(&encoded).unwrap();

        assert_eq!(original.purpose, decoded.purpose);
        assert_eq!(original.instance_count, decoded.instance_count);
    }
}