Skip to main content

datasynth_core/compliance/
content_marking.rs

1//! EU AI Act Article 50 — Synthetic content marking.
2//!
3//! Generates machine-readable content credentials for all synthetic output,
4//! indicating artificial generation. Inspired by C2PA content credentials.
5
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use sha2::{Digest, Sha256};
9
10/// Content credential for synthetic data, per EU AI Act Article 50.
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct ContentCredential {
13    /// Generator name.
14    pub generator: String,
15    /// Generator version.
16    pub generator_version: String,
17    /// Timestamp of generation.
18    pub generation_timestamp: DateTime<Utc>,
19    /// SHA-256 hash of the configuration used.
20    pub config_hash: String,
21    /// RNG seed used for generation.
22    pub seed: u64,
23    /// Content type descriptor.
24    pub content_type: String,
25    /// Regulatory article reference.
26    pub ai_act_article: String,
27    /// Credential specification version.
28    pub credential_version: String,
29}
30
31impl ContentCredential {
32    /// Create a new content credential.
33    pub fn new(config_hash: String, seed: u64) -> Self {
34        Self {
35            generator: "DataSynth".to_string(),
36            generator_version: env!("CARGO_PKG_VERSION").to_string(),
37            generation_timestamp: Utc::now(),
38            config_hash,
39            seed,
40            content_type: "synthetic_tabular_data".to_string(),
41            ai_act_article: "Article 50".to_string(),
42            credential_version: "1.0".to_string(),
43        }
44    }
45
46    /// Render as a CSV comment header.
47    pub fn to_csv_header(&self) -> String {
48        format!(
49            "# SYNTHETIC DATA - Generated by {} v{}\n\
50             # EU AI Act Article 50 Content Credential\n\
51             # Content-Type: {}\n\
52             # Generation-Timestamp: {}\n\
53             # Config-Hash: {}\n\
54             # Seed: {}\n\
55             # Credential-Version: {}",
56            self.generator,
57            self.generator_version,
58            self.content_type,
59            self.generation_timestamp.to_rfc3339(),
60            self.config_hash,
61            self.seed,
62            self.credential_version,
63        )
64    }
65
66    /// Render as a JSON value for embedding in JSON output.
67    pub fn to_json_value(&self) -> serde_json::Value {
68        serde_json::json!({
69            "generator": self.generator,
70            "generator_version": self.generator_version,
71            "generation_timestamp": self.generation_timestamp.to_rfc3339(),
72            "config_hash": self.config_hash,
73            "seed": self.seed,
74            "content_type": self.content_type,
75            "ai_act_article": self.ai_act_article,
76            "credential_version": self.credential_version,
77        })
78    }
79
80    /// Render as a Parquet metadata key-value string.
81    pub fn to_parquet_metadata(&self) -> String {
82        serde_json::to_string(self).unwrap_or_default()
83    }
84}
85
86/// Format for embedding synthetic content markers.
87#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
88#[serde(rename_all = "snake_case")]
89pub enum MarkingFormat {
90    /// Embed markers directly in output files.
91    #[default]
92    Embedded,
93    /// Write a separate sidecar credential file.
94    Sidecar,
95    /// Both embedded and sidecar.
96    Both,
97}
98
99/// Configuration for synthetic content marking.
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct MarkingConfig {
102    /// Whether content marking is enabled (default: true).
103    #[serde(default = "default_true")]
104    pub enabled: bool,
105    /// Marking format.
106    #[serde(default)]
107    pub format: MarkingFormat,
108}
109
/// Serde default for `MarkingConfig::enabled`: marking stays on unless the
/// configuration explicitly turns it off.
fn default_true() -> bool {
    true
}
113
114impl Default for MarkingConfig {
115    fn default() -> Self {
116        Self {
117            enabled: true,
118            format: MarkingFormat::default(),
119        }
120    }
121}
122
/// Generates synthetic content markers for output files.
///
/// A stateless unit type: all functionality is exposed through associated
/// functions. The common traits are derived so the marker can be logged,
/// copied, and default-constructed like any other public type.
#[derive(Debug, Clone, Copy, Default)]
pub struct SyntheticContentMarker;
125
126impl SyntheticContentMarker {
127    /// Create a content credential from config hash and seed.
128    pub fn create_credential(config_hash: String, seed: u64) -> ContentCredential {
129        ContentCredential::new(config_hash, seed)
130    }
131
132    /// Compute SHA-256 hash of configuration YAML.
133    pub fn hash_config(config_yaml: &str) -> String {
134        let mut hasher = Sha256::new();
135        hasher.update(config_yaml.as_bytes());
136        hex::encode(hasher.finalize())
137    }
138
139    /// Create a sidecar credential file content (JSON).
140    pub fn create_sidecar(credential: &ContentCredential) -> String {
141        serde_json::to_string_pretty(credential).unwrap_or_default()
142    }
143}
144
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture: a credential with a recognizable hash and seed.
    fn sample_credential() -> ContentCredential {
        ContentCredential::new("hash".to_string(), 42)
    }

    #[test]
    fn test_content_credential_creation() {
        let cred = ContentCredential::new("abc123".to_string(), 42);
        assert_eq!(cred.generator, "DataSynth");
        assert_eq!(cred.generator_version, env!("CARGO_PKG_VERSION"));
        assert_eq!(cred.seed, 42);
        assert_eq!(cred.config_hash, "abc123");
        assert_eq!(cred.ai_act_article, "Article 50");
        assert_eq!(cred.content_type, "synthetic_tabular_data");
        assert_eq!(cred.credential_version, "1.0");
    }

    #[test]
    fn test_csv_header_contains_marker() {
        let header = sample_credential().to_csv_header();
        assert!(header.contains("SYNTHETIC DATA"));
        assert!(header.contains("EU AI Act Article 50"));
        assert!(header.contains("DataSynth"));
    }

    #[test]
    fn test_csv_header_lines_are_comments() {
        let header = sample_credential().to_csv_header();
        // Every line must be a comment, otherwise CSV readers treat it as data.
        assert_eq!(header.lines().count(), 7);
        assert!(header.lines().all(|line| line.starts_with("# ")));
        assert!(header.contains("# Config-Hash: hash"));
        assert!(header.contains("# Seed: 42"));
        assert!(!header.ends_with('\n'));
    }

    #[test]
    fn test_json_value_structure() {
        let cred = sample_credential();
        let json = cred.to_json_value();
        assert_eq!(json["generator"], "DataSynth");
        assert_eq!(json["seed"], 42);
        assert_eq!(json["config_hash"], "hash");
        assert_eq!(json["ai_act_article"], "Article 50");
        assert_eq!(
            json["generation_timestamp"],
            cred.generation_timestamp.to_rfc3339()
        );
    }

    #[test]
    fn test_parquet_metadata_is_valid_json() {
        let metadata = sample_credential().to_parquet_metadata();
        let parsed: serde_json::Value = serde_json::from_str(&metadata).expect("valid JSON");
        assert_eq!(parsed["generator"], "DataSynth");
    }

    #[test]
    fn test_parquet_metadata_round_trips() {
        let cred = sample_credential();
        let metadata = cred.to_parquet_metadata();
        let parsed: ContentCredential = serde_json::from_str(&metadata).expect("deserialize");
        assert_eq!(parsed.generator, cred.generator);
        assert_eq!(parsed.generator_version, cred.generator_version);
        assert_eq!(parsed.generation_timestamp, cred.generation_timestamp);
        assert_eq!(parsed.config_hash, cred.config_hash);
        assert_eq!(parsed.seed, cred.seed);
        assert_eq!(parsed.content_type, cred.content_type);
        assert_eq!(parsed.ai_act_article, cred.ai_act_article);
        assert_eq!(parsed.credential_version, cred.credential_version);
    }

    #[test]
    fn test_marking_config_default() {
        let config = MarkingConfig::default();
        assert!(config.enabled);
        assert_eq!(config.format, MarkingFormat::Embedded);
    }

    #[test]
    fn test_marking_config_serialization() {
        let config = MarkingConfig {
            enabled: true,
            format: MarkingFormat::Both,
        };
        let json = serde_json::to_string(&config).expect("serialize");
        let deser: MarkingConfig = serde_json::from_str(&json).expect("deserialize");
        assert!(deser.enabled);
        assert_eq!(deser.format, MarkingFormat::Both);
    }

    #[test]
    fn test_marking_config_serde_defaults() {
        // An empty document must yield the same values as `Default`.
        let config: MarkingConfig = serde_json::from_str("{}").expect("deserialize");
        assert!(config.enabled);
        assert_eq!(config.format, MarkingFormat::Embedded);
    }

    #[test]
    fn test_marking_format_uses_snake_case() {
        let encoded = serde_json::to_string(&MarkingFormat::Sidecar).expect("serialize");
        assert_eq!(encoded, "\"sidecar\"");
        let decoded: MarkingFormat = serde_json::from_str("\"both\"").expect("deserialize");
        assert_eq!(decoded, MarkingFormat::Both);
    }

    #[test]
    fn test_hash_config() {
        let hash = SyntheticContentMarker::hash_config("test config");
        assert_eq!(hash.len(), 64); // SHA-256 hex
        assert!(hash.chars().all(|c| c.is_ascii_hexdigit()));
        assert_eq!(hash, SyntheticContentMarker::hash_config("test config"));
        assert_ne!(hash, SyntheticContentMarker::hash_config("other config"));

        // Published SHA-256 test vectors.
        assert_eq!(
            SyntheticContentMarker::hash_config(""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
        assert_eq!(
            SyntheticContentMarker::hash_config("abc"),
            "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
        );
    }

    #[test]
    fn test_disabled_marking() {
        // Disabling marking must not disturb the default for the omitted field.
        let config: MarkingConfig =
            serde_json::from_str(r#"{"enabled": false}"#).expect("deserialize");
        assert!(!config.enabled);
        assert_eq!(config.format, MarkingFormat::Embedded);
    }

    #[test]
    fn test_sidecar_creation() {
        let cred = sample_credential();
        let sidecar = SyntheticContentMarker::create_sidecar(&cred);
        assert!(sidecar.contains("DataSynth"));
        assert!(sidecar.contains("Article 50"));

        let parsed: serde_json::Value = serde_json::from_str(&sidecar).expect("valid JSON");
        assert_eq!(parsed["generator"], "DataSynth");
        assert_eq!(parsed["config_hash"], "hash");
        assert_eq!(parsed["seed"], 42);
        assert_eq!(parsed["ai_act_article"], "Article 50");
    }

    #[test]
    fn test_create_credential_matches_inputs() {
        let cred = SyntheticContentMarker::create_credential("abc123".to_string(), 7);
        assert_eq!(cred.config_hash, "abc123");
        assert_eq!(cred.seed, 7);
        assert_eq!(cred.generator, "DataSynth");
    }
}