Skip to main content

datasynth_core/compliance/
content_marking.rs

//! EU AI Act Article 50 — Synthetic content marking.
//!
//! Generates machine-readable content credentials for all synthetic output,
//! indicating artificial generation. Inspired by C2PA content credentials.
5
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use sha2::{Digest, Sha256};
9
10/// Content credential for synthetic data, per EU AI Act Article 50.
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct ContentCredential {
13    /// Generator name.
14    pub generator: String,
15    /// Generator version.
16    pub generator_version: String,
17    /// Timestamp of generation.
18    pub generation_timestamp: DateTime<Utc>,
19    /// SHA-256 hash of the configuration used.
20    pub config_hash: String,
21    /// RNG seed used for generation.
22    pub seed: u64,
23    /// Content type descriptor.
24    pub content_type: String,
25    /// Regulatory article reference.
26    pub ai_act_article: String,
27    /// Credential specification version.
28    pub credential_version: String,
29}
30
31impl ContentCredential {
32    /// Create a new content credential.
33    pub fn new(config_hash: String, seed: u64) -> Self {
34        Self {
35            generator: "DataSynth".to_string(),
36            generator_version: env!("CARGO_PKG_VERSION").to_string(),
37            generation_timestamp: Utc::now(),
38            config_hash,
39            seed,
40            content_type: "synthetic_tabular_data".to_string(),
41            ai_act_article: "Article 50".to_string(),
42            credential_version: "1.0".to_string(),
43        }
44    }
45
46    /// Render as a CSV comment header.
47    pub fn to_csv_header(&self) -> String {
48        format!(
49            "# SYNTHETIC DATA - Generated by {} v{}\n\
50             # EU AI Act Article 50 Content Credential\n\
51             # Content-Type: {}\n\
52             # Generation-Timestamp: {}\n\
53             # Config-Hash: {}\n\
54             # Seed: {}\n\
55             # Credential-Version: {}",
56            self.generator,
57            self.generator_version,
58            self.content_type,
59            self.generation_timestamp.to_rfc3339(),
60            self.config_hash,
61            self.seed,
62            self.credential_version,
63        )
64    }
65
66    /// Render as a JSON value for embedding in JSON output.
67    pub fn to_json_value(&self) -> serde_json::Value {
68        serde_json::json!({
69            "generator": self.generator,
70            "generator_version": self.generator_version,
71            "generation_timestamp": self.generation_timestamp.to_rfc3339(),
72            "config_hash": self.config_hash,
73            "seed": self.seed,
74            "content_type": self.content_type,
75            "ai_act_article": self.ai_act_article,
76            "credential_version": self.credential_version,
77        })
78    }
79
80    /// Render as a Parquet metadata key-value string.
81    pub fn to_parquet_metadata(&self) -> String {
82        serde_json::to_string(self).unwrap_or_default()
83    }
84}
85
86/// Format for embedding synthetic content markers.
87#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
88#[serde(rename_all = "snake_case")]
89pub enum MarkingFormat {
90    /// Embed markers directly in output files.
91    #[default]
92    Embedded,
93    /// Write a separate sidecar credential file.
94    Sidecar,
95    /// Both embedded and sidecar.
96    Both,
97}
98
99/// Configuration for synthetic content marking.
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct MarkingConfig {
102    /// Whether content marking is enabled (default: true).
103    #[serde(default = "default_true")]
104    pub enabled: bool,
105    /// Marking format.
106    #[serde(default)]
107    pub format: MarkingFormat,
108}
109
/// Serde default provider for boolean fields that should be `true` when absent.
fn default_true() -> bool {
    true
}
113
114impl Default for MarkingConfig {
115    fn default() -> Self {
116        Self {
117            enabled: true,
118            format: MarkingFormat::default(),
119        }
120    }
121}
122
/// Generates synthetic content markers for output files.
///
/// A stateless unit type that groups the marker helpers as associated
/// functions. The common traits are derived so the type can be printed,
/// copied and default-constructed like any other public value type.
#[derive(Debug, Clone, Copy, Default)]
pub struct SyntheticContentMarker;
125
126impl SyntheticContentMarker {
127    /// Create a content credential from config hash and seed.
128    pub fn create_credential(config_hash: String, seed: u64) -> ContentCredential {
129        ContentCredential::new(config_hash, seed)
130    }
131
132    /// Compute SHA-256 hash of configuration YAML.
133    pub fn hash_config(config_yaml: &str) -> String {
134        let mut hasher = Sha256::new();
135        hasher.update(config_yaml.as_bytes());
136        hex::encode(hasher.finalize())
137    }
138
139    /// Create a sidecar credential file content (JSON).
140    pub fn create_sidecar(credential: &ContentCredential) -> String {
141        serde_json::to_string_pretty(credential).unwrap_or_default()
142    }
143}
144
#[cfg(test)]
// Carried over from the original module; the tests themselves use `expect`.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// `new` stores its arguments and fills in the fixed literals.
    #[test]
    fn test_content_credential_creation() {
        let cred = ContentCredential::new("abc123".to_string(), 42);
        assert_eq!(cred.generator, "DataSynth");
        assert_eq!(cred.generator_version, env!("CARGO_PKG_VERSION"));
        assert_eq!(cred.seed, 42);
        assert_eq!(cred.config_hash, "abc123");
        assert_eq!(cred.ai_act_article, "Article 50");
        assert_eq!(cred.content_type, "synthetic_tabular_data");
        assert_eq!(cred.credential_version, "1.0");
    }

    /// The CSV header names the generator, the regulation and the inputs.
    #[test]
    fn test_csv_header_contains_marker() {
        let cred = ContentCredential::new("hash".to_string(), 42);
        let header = cred.to_csv_header();
        assert!(header.contains("SYNTHETIC DATA"));
        assert!(header.contains("EU AI Act Article 50"));
        assert!(header.contains("DataSynth"));
        assert!(header.contains("# Config-Hash: hash"));
        assert!(header.contains("# Seed: 42"));
    }

    /// Every header line must be a `#` comment so CSV readers can skip it.
    #[test]
    fn test_csv_header_lines_are_comments() {
        let header = ContentCredential::new("hash".to_string(), 42).to_csv_header();
        assert_eq!(header.lines().count(), 7);
        assert!(header.lines().all(|line| line.starts_with("# ")));
        assert!(!header.ends_with('\n'));
    }

    /// The JSON value exposes the credential fields under their own names.
    #[test]
    fn test_json_value_structure() {
        let cred = ContentCredential::new("hash".to_string(), 42);
        let json = cred.to_json_value();
        assert_eq!(json["generator"], "DataSynth");
        assert_eq!(json["seed"], 42);
        assert_eq!(json["ai_act_article"], "Article 50");
        assert_eq!(json["config_hash"], "hash");
        assert!(json["generation_timestamp"].is_string());
    }

    /// Parquet metadata is JSON that deserializes back into a credential.
    #[test]
    fn test_parquet_metadata_is_valid_json() {
        let cred = ContentCredential::new("hash".to_string(), 42);
        let metadata = cred.to_parquet_metadata();
        let parsed: serde_json::Value = serde_json::from_str(&metadata).expect("valid JSON");
        assert_eq!(parsed["generator"], "DataSynth");

        let restored: ContentCredential = serde_json::from_str(&metadata).expect("round trip");
        assert_eq!(restored.seed, 42);
        assert_eq!(restored.config_hash, "hash");
    }

    /// `Default` enables marking in embedded form.
    #[test]
    fn test_marking_config_default() {
        let config = MarkingConfig::default();
        assert!(config.enabled);
        assert_eq!(config.format, MarkingFormat::Embedded);
    }

    /// A config survives a JSON round trip.
    #[test]
    fn test_marking_config_serialization() {
        let config = MarkingConfig {
            enabled: true,
            format: MarkingFormat::Both,
        };
        let json = serde_json::to_string(&config).expect("serialize");
        let deser: MarkingConfig = serde_json::from_str(&json).expect("deserialize");
        assert!(deser.enabled);
        assert_eq!(deser.format, MarkingFormat::Both);
    }

    /// Fields missing from the input fall back to the serde defaults.
    #[test]
    fn test_marking_config_missing_fields_use_defaults() {
        let config: MarkingConfig = serde_json::from_str("{}").expect("deserialize");
        assert!(config.enabled);
        assert_eq!(config.format, MarkingFormat::Embedded);
    }

    /// Variant names are written and read in snake_case.
    #[test]
    fn test_marking_format_snake_case_names() {
        let json = serde_json::to_string(&MarkingFormat::Sidecar).expect("serialize");
        assert_eq!(json, "\"sidecar\"");

        let config: MarkingConfig =
            serde_json::from_str(r#"{"format": "both"}"#).expect("deserialize");
        assert_eq!(config.format, MarkingFormat::Both);
    }

    /// Known SHA-256 vectors pin the hashing and hex encoding.
    #[test]
    fn test_hash_config() {
        let hash = SyntheticContentMarker::hash_config("test config");
        assert_eq!(hash.len(), 64); // SHA-256 hex
        assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
        assert_eq!(hash, SyntheticContentMarker::hash_config("test config"));

        assert_eq!(
            SyntheticContentMarker::hash_config(""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
        assert_eq!(
            SyntheticContentMarker::hash_config("abc"),
            "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
        );
    }

    /// An explicit `enabled: false` in the input overrides the `true` default.
    #[test]
    fn test_disabled_marking() {
        let config: MarkingConfig =
            serde_json::from_str(r#"{"enabled": false}"#).expect("deserialize");
        assert!(!config.enabled);
        assert_eq!(config.format, MarkingFormat::Embedded);
    }

    /// `create_credential` forwards its arguments to the credential.
    #[test]
    fn test_create_credential() {
        let cred = SyntheticContentMarker::create_credential("hash".to_string(), 7);
        assert_eq!(cred.config_hash, "hash");
        assert_eq!(cred.seed, 7);
    }

    /// The sidecar text is valid JSON carrying the credential fields.
    #[test]
    fn test_sidecar_creation() {
        let cred = ContentCredential::new("hash".to_string(), 42);
        let sidecar = SyntheticContentMarker::create_sidecar(&cred);
        assert!(sidecar.contains("DataSynth"));
        assert!(sidecar.contains("Article 50"));

        let parsed: serde_json::Value = serde_json::from_str(&sidecar).expect("valid JSON");
        assert_eq!(parsed["seed"], 42);
    }
}