datasynth_core/compliance/
content_marking.rs1use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use sha2::{Digest, Sha256};
9
/// Provenance record describing how a piece of generated output was produced,
/// used to label that output as synthetic.
///
/// Derives serde's `Serialize`/`Deserialize`, so it can be written to and read
/// back from JSON (see `to_parquet_metadata` and
/// `SyntheticContentMarker::create_sidecar`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentCredential {
    /// Name of the generating tool; `new` sets this to `"DataSynth"`.
    pub generator: String,
    /// Version of the generating tool; `new` fills it from `CARGO_PKG_VERSION`
    /// at compile time.
    pub generator_version: String,
    /// UTC instant recorded for the credential; `new` uses `Utc::now()`.
    pub generation_timestamp: DateTime<Utc>,
    /// Caller-supplied hash of the generation configuration.
    /// Presumably the hex digest from `SyntheticContentMarker::hash_config` —
    /// confirm against callers.
    pub config_hash: String,
    /// Caller-supplied seed value (presumably the RNG seed of the run — TODO confirm).
    pub seed: u64,
    /// Kind of content being marked; `new` sets `"synthetic_tabular_data"`.
    pub content_type: String,
    /// Regulatory article reference; `new` sets `"Article 50"`.
    pub ai_act_article: String,
    /// Version tag of this credential layout; `new` sets `"1.0"`.
    pub credential_version: String,
}
30
31impl ContentCredential {
32 pub fn new(config_hash: String, seed: u64) -> Self {
34 Self {
35 generator: "DataSynth".to_string(),
36 generator_version: env!("CARGO_PKG_VERSION").to_string(),
37 generation_timestamp: Utc::now(),
38 config_hash,
39 seed,
40 content_type: "synthetic_tabular_data".to_string(),
41 ai_act_article: "Article 50".to_string(),
42 credential_version: "1.0".to_string(),
43 }
44 }
45
46 pub fn to_csv_header(&self) -> String {
48 format!(
49 "# SYNTHETIC DATA - Generated by {} v{}\n\
50 # EU AI Act Article 50 Content Credential\n\
51 # Content-Type: {}\n\
52 # Generation-Timestamp: {}\n\
53 # Config-Hash: {}\n\
54 # Seed: {}\n\
55 # Credential-Version: {}",
56 self.generator,
57 self.generator_version,
58 self.content_type,
59 self.generation_timestamp.to_rfc3339(),
60 self.config_hash,
61 self.seed,
62 self.credential_version,
63 )
64 }
65
66 pub fn to_json_value(&self) -> serde_json::Value {
68 serde_json::json!({
69 "generator": self.generator,
70 "generator_version": self.generator_version,
71 "generation_timestamp": self.generation_timestamp.to_rfc3339(),
72 "config_hash": self.config_hash,
73 "seed": self.seed,
74 "content_type": self.content_type,
75 "ai_act_article": self.ai_act_article,
76 "credential_version": self.credential_version,
77 })
78 }
79
80 pub fn to_parquet_metadata(&self) -> String {
82 serde_json::to_string(self).unwrap_or_default()
83 }
84}
85
/// Marking format option carried by `MarkingConfig`.
///
/// Because of `rename_all = "snake_case"` the variants (de)serialize as
/// `"embedded"`, `"sidecar"` and `"both"`. `Embedded` is the `Default`.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum MarkingFormat {
    /// Default variant. Presumably the credential is written inside the
    /// output file itself — confirm against the output writers.
    #[default]
    Embedded,
    /// Presumably the credential is written to a separate sidecar file (see
    /// `SyntheticContentMarker::create_sidecar`) — confirm against the writers.
    Sidecar,
    /// Presumably both of the above are produced — confirm against the writers.
    Both,
}
98
/// Configuration for synthetic-content marking.
///
/// Both fields are optional when deserializing: a missing `enabled` becomes
/// `true` and a missing `format` becomes `MarkingFormat::default()`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkingConfig {
    /// Whether marking is turned on; defaults to `true` via `default_true`
    /// when the key is absent from the input.
    #[serde(default = "default_true")]
    pub enabled: bool,
    /// Selected marking format; falls back to `MarkingFormat::default()` when
    /// the key is absent from the input.
    #[serde(default)]
    pub format: MarkingFormat,
}
109
/// Serde default provider that always yields `true`; referenced by name from
/// the `#[serde(default = "default_true")]` attribute on `MarkingConfig::enabled`.
fn default_true() -> bool { true }
113
114impl Default for MarkingConfig {
115 fn default() -> Self {
116 Self {
117 enabled: true,
118 format: MarkingFormat::default(),
119 }
120 }
121}
122
/// Stateless unit type that groups the marking helper functions
/// (`create_credential`, `hash_config`, `create_sidecar`) as associated functions.
pub struct SyntheticContentMarker;
125
126impl SyntheticContentMarker {
127 pub fn create_credential(config_hash: String, seed: u64) -> ContentCredential {
129 ContentCredential::new(config_hash, seed)
130 }
131
132 pub fn hash_config(config_yaml: &str) -> String {
134 let mut hasher = Sha256::new();
135 hasher.update(config_yaml.as_bytes());
136 hex::encode(hasher.finalize())
137 }
138
139 pub fn create_sidecar(credential: &ContentCredential) -> String {
141 serde_json::to_string_pretty(credential).unwrap_or_default()
142 }
143}
144
#[cfg(test)]
// Test code may unwrap freely: a panic is simply a failed test.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Credential fixture shared by the tests that do not care about the hash value.
    fn sample_credential() -> ContentCredential {
        ContentCredential::new(String::from("hash"), 42)
    }

    /// `new` stores the caller's arguments and fills in the fixed defaults.
    #[test]
    fn test_content_credential_creation() {
        let credential = ContentCredential::new(String::from("abc123"), 42);

        assert_eq!(credential.generator, "DataSynth");
        assert_eq!(credential.seed, 42);
        assert_eq!(credential.config_hash, "abc123");
        assert_eq!(credential.ai_act_article, "Article 50");
        assert_eq!(credential.content_type, "synthetic_tabular_data");
    }

    /// The CSV header carries the synthetic marker, the article reference and the generator name.
    #[test]
    fn test_csv_header_contains_marker() {
        let csv_header = sample_credential().to_csv_header();

        assert!(csv_header.contains("SYNTHETIC DATA"));
        assert!(csv_header.contains("EU AI Act Article 50"));
        assert!(csv_header.contains("DataSynth"));
    }

    /// The JSON value exposes the fields under their snake_case keys.
    #[test]
    fn test_json_value_structure() {
        let value = sample_credential().to_json_value();

        assert_eq!(value["generator"], "DataSynth");
        assert_eq!(value["seed"], 42);
        assert_eq!(value["ai_act_article"], "Article 50");
    }

    /// The Parquet metadata string parses back as JSON.
    #[test]
    fn test_parquet_metadata_is_valid_json() {
        let raw = sample_credential().to_parquet_metadata();
        let parsed: serde_json::Value = serde_json::from_str(&raw).expect("valid JSON");

        assert_eq!(parsed["generator"], "DataSynth");
    }

    /// The default config is enabled and uses the embedded format.
    #[test]
    fn test_marking_config_default() {
        let defaults = MarkingConfig::default();

        assert!(defaults.enabled);
        assert_eq!(defaults.format, MarkingFormat::Embedded);
    }

    /// A config survives a JSON round trip unchanged.
    #[test]
    fn test_marking_config_serialization() {
        let original = MarkingConfig {
            enabled: true,
            format: MarkingFormat::Both,
        };

        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded: MarkingConfig = serde_json::from_str(&encoded).expect("deserialize");

        assert!(decoded.enabled);
        assert_eq!(decoded.format, MarkingFormat::Both);
    }

    /// A SHA-256 digest is 32 bytes, i.e. 64 hex characters.
    #[test]
    fn test_hash_config() {
        let digest_hex = SyntheticContentMarker::hash_config("test config");

        assert_eq!(digest_hex.len(), 64);
    }

    /// Marking can be switched off explicitly.
    #[test]
    fn test_disabled_marking() {
        let disabled = MarkingConfig {
            enabled: false,
            format: MarkingFormat::Embedded,
        };

        assert!(!disabled.enabled);
    }

    /// The sidecar text contains the generator name and the article reference.
    #[test]
    fn test_sidecar_creation() {
        let credential = sample_credential();
        let sidecar_text = SyntheticContentMarker::create_sidecar(&credential);

        assert!(sidecar_text.contains("DataSynth"));
        assert!(sidecar_text.contains("Article 50"));
    }
}