use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Descriptive metadata record for a dataset.
///
/// Serialization notes, as determined by the serde attributes below:
/// - `purpose` has no `#[serde(default)]` and is not an `Option`, so it is
///   required in deserialized input.
/// - `Option` fields are omitted from serialized output when they are `None`.
/// - `Vec` / `HashMap` fields fall back to an empty collection when the key is
///   absent from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Datasheet {
/// Free-text statement of what the dataset is for.
pub purpose: String,
/// Names of the dataset's creators; empty when absent from the input.
#[serde(default)]
pub creators: Vec<String>,
/// Free-text funding information; skipped on output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub funding: Option<String>,
/// Number of instances in the dataset; skipped on output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub instance_count: Option<u64>,
/// Per-feature metadata keyed by feature name.
#[serde(default)]
pub features: HashMap<String, FeatureInfo>,
/// Names flagged as sensitive.
// NOTE(review): presumably these refer to keys of `features`, but nothing in
// this file enforces that relationship — confirm with callers.
#[serde(default)]
pub sensitive_features: Vec<String>,
/// Free-text description of the collection method; skipped when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub collection_method: Option<String>,
/// UTC timestamp for the start of collection; skipped when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub collection_start: Option<DateTime<Utc>>,
/// UTC timestamp for the end of collection; skipped when `None`.
// NOTE(review): no check here that this is not earlier than `collection_start`.
#[serde(skip_serializing_if = "Option::is_none")]
pub collection_end: Option<DateTime<Utc>>,
/// Preprocessing steps, stored in insertion order.
#[serde(default)]
pub preprocessing: Vec<PreprocessingStep>,
/// Free-text license identifier (the tests use `"MIT"`); skipped when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub license: Option<String>,
/// Free-text access restrictions; empty when absent from the input.
#[serde(default)]
pub access_restrictions: Vec<String>,
/// Free-text maintainer contact; skipped on output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub maintainer: Option<String>,
/// Free-text update frequency; skipped on output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub update_frequency: Option<String>,
/// Free-text deprecation policy; skipped on output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub deprecation_policy: Option<String>,
/// Additional key/value entries held as arbitrary JSON values.
#[serde(default)]
pub extra: HashMap<String, serde_json::Value>,
}
/// Metadata describing a single feature of a dataset.
///
/// Only `dtype` is required when deserializing; the remaining fields either
/// are optional or carry `#[serde(default)]`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FeatureInfo {
/// Data-type label stored as free text (the tests use e.g. `"float64"`).
pub dtype: String,
/// Optional free-text description; skipped on output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// Nullability flag; defaults to `false` when absent from the input.
#[serde(default)]
pub nullable: bool,
/// Named numeric statistics; empty when absent from the input.
// NOTE(review): the set of expected statistic names is not defined in this
// file — confirm with producers/consumers.
#[serde(default)]
pub statistics: HashMap<String, f64>,
}
/// Constructor and fluent `with_*` setters for [`FeatureInfo`].
///
/// Each `with_*` method takes `self` by value and returns it, so calls chain.
impl FeatureInfo {
    /// Creates a feature description with the given `dtype` label.
    ///
    /// The description starts as `None`, `nullable` as `false`, and the
    /// statistics map empty.
    #[must_use]
    pub fn new(dtype: impl Into<String>) -> Self {
        Self {
            dtype: dtype.into(),
            description: None,
            nullable: false,
            statistics: HashMap::new(),
        }
    }

    /// Sets the description, replacing any previous one.
    #[must_use]
    pub fn with_description(mut self, description: impl Into<String>) -> Self {
        self.description = Some(description.into());
        self
    }

    /// Sets the `nullable` flag.
    #[must_use]
    pub fn with_nullable(mut self, nullable: bool) -> Self {
        self.nullable = nullable;
        self
    }

    /// Records one named statistic.
    ///
    /// A value already stored under `name` is overwritten. This gives the
    /// `statistics` map the same fluent access as the other fields, which
    /// previously could only be filled by mutating the field directly.
    #[must_use]
    pub fn with_statistic(mut self, name: impl Into<String>, value: f64) -> Self {
        self.statistics.insert(name.into(), value);
        self
    }
}
/// One named preprocessing step, with an optional description and parameters.
///
/// Only `name` is required when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PreprocessingStep {
/// Name of the step (the tests use e.g. `"normalize"`).
pub name: String,
/// Optional free-text description; skipped on output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub description: Option<String>,
/// Step parameters held as arbitrary JSON values; empty when absent from the input.
#[serde(default)]
pub parameters: HashMap<String, serde_json::Value>,
}
impl PreprocessingStep {
#[must_use]
pub fn new(name: impl Into<String>) -> Self {
Self {
name: name.into(),
description: None,
parameters: HashMap::new(),
}
}
#[must_use]
pub fn with_description(mut self, description: impl Into<String>) -> Self {
self.description = Some(description.into());
self
}
}
/// Constructors and in-place mutators for [`Datasheet`].
impl Datasheet {
    /// Starts a fluent [`DatasheetBuilder`].
    #[must_use]
    pub fn builder() -> DatasheetBuilder {
        DatasheetBuilder::new()
    }

    /// Creates a datasheet that carries only a purpose.
    ///
    /// Every optional field is `None` and every collection is empty.
    #[must_use]
    pub fn new(purpose: impl Into<String>) -> Self {
        let purpose: String = purpose.into();
        Self {
            purpose,
            // Optional fields start unset.
            funding: None,
            instance_count: None,
            collection_method: None,
            collection_start: None,
            collection_end: None,
            license: None,
            maintainer: None,
            update_frequency: None,
            deprecation_policy: None,
            // List-valued fields start empty.
            creators: Vec::new(),
            sensitive_features: Vec::new(),
            preprocessing: Vec::new(),
            access_restrictions: Vec::new(),
            // Map-valued fields start empty.
            features: HashMap::new(),
            extra: HashMap::new(),
        }
    }

    /// Inserts `info` under `name`, replacing any entry already stored there.
    pub fn add_feature(&mut self, name: impl Into<String>, info: FeatureInfo) {
        let key: String = name.into();
        // The displaced entry, if there was one, is deliberately discarded.
        let _ = self.features.insert(key, info);
    }

    /// Appends `step` to the end of the preprocessing list.
    pub fn add_preprocessing(&mut self, step: PreprocessingStep) {
        let steps = &mut self.preprocessing;
        steps.push(step);
    }
}
impl Default for Datasheet {
    /// Produces a datasheet with an empty purpose string; every other field
    /// takes the value `Datasheet::new` gives it.
    fn default() -> Self {
        let empty_purpose = String::new();
        Self::new(empty_purpose)
    }
}
/// Fluent builder that accumulates values into a [`Datasheet`].
///
/// The derived `Default` starts from `Datasheet::default()`.
#[derive(Debug, Default)]
pub struct DatasheetBuilder {
// The datasheet under construction; setters write into it and `build` returns it.
sheet: Datasheet,
}
impl DatasheetBuilder {
#[must_use]
pub fn new() -> Self {
Self {
sheet: Datasheet::default(),
}
}
#[must_use]
pub fn purpose(mut self, purpose: impl Into<String>) -> Self {
self.sheet.purpose = purpose.into();
self
}
#[must_use]
pub fn creators<I, S>(mut self, creators: I) -> Self
where
I: IntoIterator<Item = S>,
S: Into<String>,
{
self.sheet.creators = creators.into_iter().map(Into::into).collect();
self
}
#[must_use]
pub fn funding(mut self, funding: impl Into<String>) -> Self {
self.sheet.funding = Some(funding.into());
self
}
#[must_use]
pub fn instance_count(mut self, count: u64) -> Self {
self.sheet.instance_count = Some(count);
self
}
#[must_use]
pub fn feature(mut self, name: impl Into<String>, info: FeatureInfo) -> Self {
self.sheet.features.insert(name.into(), info);
self
}
#[must_use]
pub fn sensitive_features<I, S>(mut self, features: I) -> Self
where
I: IntoIterator<Item = S>,
S: Into<String>,
{
self.sheet.sensitive_features = features.into_iter().map(Into::into).collect();
self
}
#[must_use]
pub fn collection_method(mut self, method: impl Into<String>) -> Self {
self.sheet.collection_method = Some(method.into());
self
}
#[must_use]
pub fn license(mut self, license: impl Into<String>) -> Self {
self.sheet.license = Some(license.into());
self
}
#[must_use]
pub fn maintainer(mut self, maintainer: impl Into<String>) -> Self {
self.sheet.maintainer = Some(maintainer.into());
self
}
#[must_use]
pub fn build(self) -> Datasheet {
self.sheet
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `Datasheet::new` keeps the purpose and starts without features.
    #[test]
    fn test_datasheet_new() {
        let purpose = "Customer transactions for fraud detection";
        let sheet = Datasheet::new(purpose);

        assert_eq!(sheet.purpose, purpose);
        assert!(sheet.features.is_empty());
    }

    /// A full builder chain lands every value in the matching field.
    #[test]
    fn test_datasheet_builder() {
        let amount = FeatureInfo::new("float64").with_description("Transaction amount in USD");
        let timestamp = FeatureInfo::new("datetime").with_nullable(true);

        let sheet = Datasheet::builder()
            .purpose("Training data for fraud detection")
            .creators(["Alice", "Bob"])
            .instance_count(1_000_000)
            .feature("amount", amount)
            .feature("timestamp", timestamp)
            .sensitive_features(["customer_id", "card_number"])
            .license("MIT")
            .maintainer("data-team@company.com")
            .build();

        assert_eq!(sheet.purpose, "Training data for fraud detection");
        assert_eq!(sheet.creators, ["Alice", "Bob"]);
        assert_eq!(sheet.instance_count, Some(1_000_000));
        assert_eq!(sheet.features.len(), 2);
        assert!(sheet.features.contains_key("amount"));
        assert_eq!(sheet.sensitive_features.len(), 2);
        assert_eq!(sheet.license.as_deref(), Some("MIT"));
    }

    /// The `FeatureInfo` setters store exactly what they are given.
    #[test]
    fn test_feature_info() {
        let base = FeatureInfo::new("int64");
        let info = base.with_description("User ID").with_nullable(false);

        assert_eq!(info.dtype, "int64");
        assert_eq!(info.description.as_deref(), Some("User ID"));
        assert!(!info.nullable);
    }

    /// `PreprocessingStep` keeps its name and description.
    #[test]
    fn test_preprocessing_step() {
        let step = PreprocessingStep::new("normalize");
        let step = step.with_description("Min-max normalization");

        assert_eq!(step.name, "normalize");
        assert_eq!(step.description.as_deref(), Some("Min-max normalization"));
    }

    /// The in-place `add_*` methods each grow their collection by one.
    #[test]
    fn test_datasheet_add_methods() {
        let mut sheet = Datasheet::new("Test dataset");

        let feature = FeatureInfo::new("string");
        sheet.add_feature("col1", feature);
        let step = PreprocessingStep::new("clean");
        sheet.add_preprocessing(step);

        assert_eq!(sheet.features.len(), 1);
        assert_eq!(sheet.preprocessing.len(), 1);
    }

    /// A JSON round trip preserves the purpose and the instance count.
    #[test]
    fn test_datasheet_serialization() {
        let original = Datasheet::builder()
            .purpose("Test")
            .instance_count(100)
            .build();

        let encoded = serde_json::to_string(&original).unwrap();
        let decoded = serde_json::from_str::<Datasheet>(&encoded).unwrap();

        assert_eq!(original.purpose, decoded.purpose);
        assert_eq!(original.instance_count, decoded.instance_count);
    }
}