use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
/// Provenance record attached to generated output to mark it as synthetic.
///
/// The struct derives `Serialize`/`Deserialize`, so it can round-trip through
/// serde formats; fields are emitted in declaration order.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentCredential {
/// Name of the generating tool; `ContentCredential::new` sets this to `"DataSynth"`.
pub generator: String,
/// Version of the generator; `ContentCredential::new` fills it from `CARGO_PKG_VERSION`.
pub generator_version: String,
/// UTC instant at which the credential was created (`Utc::now()` in `new`).
pub generation_timestamp: DateTime<Utc>,
/// Caller-supplied hash identifying the configuration used for the run
/// (e.g. the hex SHA-256 string returned by `SyntheticContentMarker::hash_config`).
pub config_hash: String,
/// Caller-supplied seed value.
// NOTE(review): presumably the RNG seed of the generation run; confirm with callers.
pub seed: u64,
/// Kind of content described; `new` sets this to `"synthetic_tabular_data"`.
pub content_type: String,
/// Regulation article the marking refers to; `new` sets this to `"Article 50"`.
pub ai_act_article: String,
/// Version of the credential layout; `new` sets this to `"1.0"`.
pub credential_version: String,
}
impl ContentCredential {
    /// Creates a credential for the current crate version, stamped with the
    /// current UTC time.
    ///
    /// `config_hash` and `seed` are stored as given; every other field is set
    /// to a fixed value (`"DataSynth"`, `"synthetic_tabular_data"`,
    /// `"Article 50"`, credential version `"1.0"`).
    pub fn new(config_hash: String, seed: u64) -> Self {
        Self {
            generator: "DataSynth".to_string(),
            generator_version: env!("CARGO_PKG_VERSION").to_string(),
            generation_timestamp: Utc::now(),
            config_hash,
            seed,
            content_type: "synthetic_tabular_data".to_string(),
            ai_act_article: "Article 50".to_string(),
            credential_version: "1.0".to_string(),
        }
    }

    /// Flattens a field value onto a single line for use in the CSV header.
    ///
    /// Every header line is a `# ` comment; a raw CR or LF inside a value
    /// would let the remainder escape the comment block and be read as data,
    /// so each line-break character is replaced with a space.
    fn header_value(value: &str) -> String {
        value.replace(|c: char| matches!(c, '\r' | '\n'), " ")
    }

    /// Renders the credential as a block of `#`-prefixed comment lines
    /// intended to precede CSV content.
    ///
    /// The returned string has seven lines and no trailing newline. The
    /// article shown on the second line is taken from `ai_act_article`, so the
    /// header agrees with the JSON and serde representations of the same
    /// credential. The timestamp is formatted as RFC 3339.
    pub fn to_csv_header(&self) -> String {
        format!(
            "# SYNTHETIC DATA - Generated by {} v{}\n\
             # EU AI Act {} Content Credential\n\
             # Content-Type: {}\n\
             # Generation-Timestamp: {}\n\
             # Config-Hash: {}\n\
             # Seed: {}\n\
             # Credential-Version: {}",
            Self::header_value(&self.generator),
            Self::header_value(&self.generator_version),
            Self::header_value(&self.ai_act_article),
            Self::header_value(&self.content_type),
            self.generation_timestamp.to_rfc3339(),
            Self::header_value(&self.config_hash),
            self.seed,
            Self::header_value(&self.credential_version),
        )
    }

    /// Builds a JSON object with one key per field.
    ///
    /// `generation_timestamp` is written as an RFC 3339 string via
    /// `to_rfc3339()` rather than through the type's serde implementation.
    pub fn to_json_value(&self) -> serde_json::Value {
        serde_json::json!({
            "generator": self.generator,
            "generator_version": self.generator_version,
            "generation_timestamp": self.generation_timestamp.to_rfc3339(),
            "config_hash": self.config_hash,
            "seed": self.seed,
            "content_type": self.content_type,
            "ai_act_article": self.ai_act_article,
            "credential_version": self.credential_version,
        })
    }

    /// Serializes the credential to a compact JSON string for use as a
    /// metadata value.
    ///
    /// If serialization fails, an empty string is returned instead of an
    /// error.
    pub fn to_parquet_metadata(&self) -> String {
        serde_json::to_string(self).unwrap_or_default()
    }
}
/// Selects how the synthetic-content marking is attached to output.
///
/// Variants are (de)serialized in snake_case (`embedded`, `sidecar`, `both`);
/// `Embedded` is the default.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum MarkingFormat {
// NOTE(review): presumably the credential is written inside the output file itself; confirm with the writers.
#[default]
Embedded,
// NOTE(review): presumably the credential is written to a separate companion file; confirm with the writers.
Sidecar,
// NOTE(review): presumably both of the above are produced; confirm with the writers.
Both,
}
/// Settings controlling synthetic-content marking.
///
/// Both fields are optional when deserializing: a missing `enabled` becomes
/// `true` and a missing `format` becomes `MarkingFormat::default()`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkingConfig {
/// Whether marking is switched on; defaults to `true` when absent from the input.
#[serde(default = "default_true")]
pub enabled: bool,
/// How the marking is attached; defaults to `MarkingFormat::default()` when absent.
#[serde(default)]
pub format: MarkingFormat,
}
/// Serde default provider that always yields `true`.
///
/// Referenced by name from `#[serde(default = "default_true")]`, which needs a
/// function path rather than a literal.
fn default_true() -> bool {
    const ENABLED_WHEN_ABSENT: bool = true;
    ENABLED_WHEN_ABSENT
}
/// Default configuration: marking enabled, using the default `MarkingFormat`.
impl Default for MarkingConfig {
    fn default() -> Self {
        let format = MarkingFormat::default();
        Self {
            enabled: true,
            format,
        }
    }
}
/// Stateless unit type that groups the marking helpers as associated
/// functions: credential creation, configuration hashing and sidecar rendering.
pub struct SyntheticContentMarker;
impl SyntheticContentMarker {
    /// Creates a [`ContentCredential`] from a configuration hash and a seed.
    ///
    /// Thin wrapper over `ContentCredential::new`.
    pub fn create_credential(config_hash: String, seed: u64) -> ContentCredential {
        ContentCredential::new(config_hash, seed)
    }

    /// Returns the SHA-256 digest of `config_yaml`'s UTF-8 bytes as a
    /// hex-encoded string.
    pub fn hash_config(config_yaml: &str) -> String {
        // One-shot digest over the whole input; no incremental hashing is needed.
        let digest = Sha256::digest(config_yaml.as_bytes());
        hex::encode(digest)
    }

    /// Renders the credential as pretty-printed JSON for a sidecar file.
    ///
    /// If serialization fails, an empty string is returned instead of an
    /// error.
    pub fn create_sidecar(credential: &ContentCredential) -> String {
        match serde_json::to_string_pretty(credential) {
            Ok(pretty_json) => pretty_json,
            Err(_) => String::new(),
        }
    }
}
#[cfg(test)]
// Panicking on failure is the desired behavior inside tests, so the lint is relaxed here.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Builds a credential with the given hash and the fixed seed 42.
    fn credential_with_hash(hash: &str) -> ContentCredential {
        ContentCredential::new(hash.to_string(), 42)
    }

    /// `new` stores the caller's values and fills in the fixed fields.
    #[test]
    fn test_content_credential_creation() {
        let credential = credential_with_hash("abc123");

        assert_eq!(credential.generator, "DataSynth");
        assert_eq!(credential.seed, 42);
        assert_eq!(credential.config_hash, "abc123");
        assert_eq!(credential.ai_act_article, "Article 50");
        assert_eq!(credential.content_type, "synthetic_tabular_data");
    }

    /// The CSV header carries the synthetic-data marker lines.
    #[test]
    fn test_csv_header_contains_marker() {
        let csv_header = credential_with_hash("hash").to_csv_header();

        for expected in ["SYNTHETIC DATA", "EU AI Act Article 50", "DataSynth"] {
            assert!(csv_header.contains(expected));
        }
    }

    /// The JSON value exposes the fields under their own names.
    #[test]
    fn test_json_value_structure() {
        let value = credential_with_hash("hash").to_json_value();

        assert_eq!(value["generator"], "DataSynth");
        assert_eq!(value["seed"], 42);
        assert_eq!(value["ai_act_article"], "Article 50");
    }

    /// The Parquet metadata string parses back as JSON.
    #[test]
    fn test_parquet_metadata_is_valid_json() {
        let raw = credential_with_hash("hash").to_parquet_metadata();
        let reparsed: serde_json::Value = serde_json::from_str(&raw).expect("valid JSON");

        assert_eq!(reparsed["generator"], "DataSynth");
    }

    /// The default configuration is enabled and uses the embedded format.
    #[test]
    fn test_marking_config_default() {
        let defaults = MarkingConfig::default();

        assert!(defaults.enabled);
        assert_eq!(defaults.format, MarkingFormat::Embedded);
    }

    /// A configuration survives a JSON round trip unchanged.
    #[test]
    fn test_marking_config_serialization() {
        let original = MarkingConfig {
            enabled: true,
            format: MarkingFormat::Both,
        };

        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded: MarkingConfig = serde_json::from_str(&encoded).expect("deserialize");

        assert!(decoded.enabled);
        assert_eq!(decoded.format, MarkingFormat::Both);
    }

    /// A hex-encoded SHA-256 digest is 64 characters long.
    #[test]
    fn test_hash_config() {
        let digest_hex = SyntheticContentMarker::hash_config("test config");

        assert_eq!(digest_hex.len(), 64);
    }

    /// Marking can be switched off explicitly.
    #[test]
    fn test_disabled_marking() {
        let disabled = MarkingConfig {
            enabled: false,
            format: MarkingFormat::Embedded,
        };

        assert!(!disabled.enabled);
    }

    /// The sidecar text contains the generator name and the article.
    #[test]
    fn test_sidecar_creation() {
        let credential = credential_with_hash("hash");
        let sidecar_text = SyntheticContentMarker::create_sidecar(&credential);

        for expected in ["DataSynth", "Article 50"] {
            assert!(sidecar_text.contains(expected));
        }
    }
}