// datasynth-core 2.4.0
//
// Core domain models, traits, and distributions for synthetic enterprise data generation.
// Documentation
//! EU AI Act Article 50 — Synthetic content marking.
//!
//! Generates machine-readable content credentials for all synthetic output,
//! indicating artificial generation. Inspired by C2PA content credentials.

use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};

/// Content credential for synthetic data, per EU AI Act Article 50.
///
/// A plain data record describing how a piece of synthetic output was
/// produced. It derives `Serialize`/`Deserialize`, and every field is public,
/// so values can also be constructed or modified outside of
/// [`ContentCredential::new`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentCredential {
    /// Generator name (`"DataSynth"` when built via [`ContentCredential::new`]).
    pub generator: String,
    /// Generator version (the crate's `CARGO_PKG_VERSION` when built via `new`).
    pub generator_version: String,
    /// Timestamp of generation, in UTC.
    pub generation_timestamp: DateTime<Utc>,
    /// SHA-256 hash of the configuration used.
    ///
    /// Stored exactly as supplied by the caller; this type does not validate it.
    pub config_hash: String,
    /// RNG seed used for generation.
    pub seed: u64,
    /// Content type descriptor (`"synthetic_tabular_data"` when built via `new`).
    pub content_type: String,
    /// Regulatory article reference (`"Article 50"` when built via `new`).
    pub ai_act_article: String,
    /// Credential specification version (`"1.0"` when built via `new`).
    pub credential_version: String,
}

impl ContentCredential {
    /// Create a new content credential.
    ///
    /// `config_hash` and `seed` are stored as given. All remaining fields are
    /// filled with fixed defaults: generator `"DataSynth"`, the crate version
    /// from `CARGO_PKG_VERSION`, content type `"synthetic_tabular_data"`,
    /// article `"Article 50"`, credential version `"1.0"`, and the current UTC
    /// time as the generation timestamp.
    pub fn new(config_hash: String, seed: u64) -> Self {
        Self {
            generator: "DataSynth".to_string(),
            generator_version: env!("CARGO_PKG_VERSION").to_string(),
            generation_timestamp: Utc::now(),
            config_hash,
            seed,
            content_type: "synthetic_tabular_data".to_string(),
            ai_act_article: "Article 50".to_string(),
            credential_version: "1.0".to_string(),
        }
    }

    /// Render as a CSV comment header.
    ///
    /// Returns seven `#`-prefixed lines separated by `\n`, with no trailing
    /// newline. The article reference is taken from `self.ai_act_article`, so
    /// the header agrees with the JSON and Parquet renderings even when the
    /// credential was deserialized or modified after construction. For a
    /// credential built with [`ContentCredential::new`] that line reads
    /// `# EU AI Act Article 50 Content Credential`.
    pub fn to_csv_header(&self) -> String {
        // Each `\n\` ends a header line; the trailing backslash makes the
        // literal skip the source newline and the indentation that follows.
        format!(
            "# SYNTHETIC DATA - Generated by {} v{}\n\
             # EU AI Act {} Content Credential\n\
             # Content-Type: {}\n\
             # Generation-Timestamp: {}\n\
             # Config-Hash: {}\n\
             # Seed: {}\n\
             # Credential-Version: {}",
            self.generator,
            self.generator_version,
            self.ai_act_article,
            self.content_type,
            self.generation_timestamp.to_rfc3339(),
            self.config_hash,
            self.seed,
            self.credential_version,
        )
    }

    /// Render as a JSON value for embedding in JSON output.
    ///
    /// Produces an object with one key per field; the timestamp is emitted as
    /// an RFC 3339 string via `to_rfc3339()`.
    pub fn to_json_value(&self) -> serde_json::Value {
        serde_json::json!({
            "generator": self.generator,
            "generator_version": self.generator_version,
            "generation_timestamp": self.generation_timestamp.to_rfc3339(),
            "config_hash": self.config_hash,
            "seed": self.seed,
            "content_type": self.content_type,
            "ai_act_article": self.ai_act_article,
            "credential_version": self.credential_version,
        })
    }

    /// Render as a Parquet metadata key-value string.
    ///
    /// The value is the compact JSON serialization of the credential. If
    /// serialization fails, an empty string is returned instead of an error.
    pub fn to_parquet_metadata(&self) -> String {
        serde_json::to_string(self).unwrap_or_default()
    }
}

/// Format for embedding synthetic content markers.
///
/// Variant names are serialized in `snake_case` (`embedded`, `sidecar`,
/// `both`). The default is [`MarkingFormat::Embedded`].
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum MarkingFormat {
    /// Embed markers directly in output files.
    #[default]
    Embedded,
    /// Write a separate sidecar credential file.
    Sidecar,
    /// Both embedded and sidecar.
    Both,
}

/// Configuration for synthetic content marking.
///
/// Both fields are optional when deserializing: a missing `enabled` falls
/// back to `true` and a missing `format` falls back to
/// `MarkingFormat::default()`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MarkingConfig {
    /// Whether content marking is enabled (default: true).
    #[serde(default = "default_true")]
    pub enabled: bool,
    /// Marking format.
    #[serde(default)]
    pub format: MarkingFormat,
}

/// Serde default provider that always yields `true`.
fn default_true() -> bool {
    const ENABLED_BY_DEFAULT: bool = true;
    ENABLED_BY_DEFAULT
}

impl Default for MarkingConfig {
    /// Marking switched on, using the default [`MarkingFormat`].
    fn default() -> Self {
        let format = MarkingFormat::default();
        Self {
            enabled: true,
            format,
        }
    }
}

/// Generates synthetic content markers for output files.
///
/// A field-less unit struct; its functionality is provided through
/// associated functions rather than instance state.
pub struct SyntheticContentMarker;

impl SyntheticContentMarker {
    /// Build a [`ContentCredential`] for the given configuration hash and RNG seed.
    ///
    /// Thin wrapper around [`ContentCredential::new`].
    pub fn create_credential(config_hash: String, seed: u64) -> ContentCredential {
        ContentCredential::new(config_hash, seed)
    }

    /// Hash configuration YAML with SHA-256.
    ///
    /// The digest is computed over the raw bytes of `config_yaml` and returned
    /// hex-encoded.
    pub fn hash_config(config_yaml: &str) -> String {
        let digest = Sha256::digest(config_yaml.as_bytes());
        hex::encode(digest)
    }

    /// Produce the contents of a sidecar credential file as pretty-printed JSON.
    ///
    /// Yields an empty string when the credential cannot be serialized.
    pub fn create_sidecar(credential: &ContentCredential) -> String {
        match serde_json::to_string_pretty(credential) {
            Ok(rendered) => rendered,
            Err(_) => String::new(),
        }
    }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    //! Unit tests for credential construction, the renderers, the marking
    //! configuration (including its serde defaults), and config hashing.

    use super::*;

    #[test]
    fn test_content_credential_creation() {
        let cred = ContentCredential::new("abc123".to_string(), 42);
        assert_eq!(cred.generator, "DataSynth");
        assert_eq!(cred.seed, 42);
        assert_eq!(cred.config_hash, "abc123");
        assert_eq!(cred.ai_act_article, "Article 50");
        assert_eq!(cred.content_type, "synthetic_tabular_data");
    }

    #[test]
    fn test_csv_header_contains_marker() {
        let cred = ContentCredential::new("hash".to_string(), 42);
        let header = cred.to_csv_header();
        assert!(header.contains("SYNTHETIC DATA"));
        assert!(header.contains("EU AI Act Article 50"));
        assert!(header.contains("DataSynth"));
    }

    #[test]
    fn test_csv_header_lines_are_comments() {
        let cred = ContentCredential::new("hash".to_string(), 42);
        let header = cred.to_csv_header();
        // Every line must be a `#` comment so CSV readers can skip the header.
        assert_eq!(header.lines().count(), 7);
        assert!(header.lines().all(|line| line.starts_with("# ")));
        assert!(header.contains("# Config-Hash: hash"));
        assert!(header.contains("# Seed: 42"));
        assert!(!header.ends_with('\n'));
    }

    #[test]
    fn test_json_value_structure() {
        let cred = ContentCredential::new("hash".to_string(), 42);
        let json = cred.to_json_value();
        assert_eq!(json["generator"], "DataSynth");
        assert_eq!(json["seed"], 42);
        assert_eq!(json["ai_act_article"], "Article 50");
    }

    #[test]
    fn test_parquet_metadata_is_valid_json() {
        let cred = ContentCredential::new("hash".to_string(), 42);
        let metadata = cred.to_parquet_metadata();
        let parsed: serde_json::Value = serde_json::from_str(&metadata).expect("valid JSON");
        assert_eq!(parsed["generator"], "DataSynth");
    }

    #[test]
    fn test_marking_config_default() {
        let config = MarkingConfig::default();
        assert!(config.enabled);
        assert_eq!(config.format, MarkingFormat::Embedded);
    }

    #[test]
    fn test_marking_config_serde_defaults() {
        // An empty object must pick up the `#[serde(default ...)]` values.
        let config: MarkingConfig = serde_json::from_str("{}").expect("deserialize");
        assert!(config.enabled);
        assert_eq!(config.format, MarkingFormat::Embedded);
    }

    #[test]
    fn test_marking_config_serialization() {
        let config = MarkingConfig {
            enabled: true,
            format: MarkingFormat::Both,
        };
        let json = serde_json::to_string(&config).expect("serialize");
        let deser: MarkingConfig = serde_json::from_str(&json).expect("deserialize");
        assert!(deser.enabled);
        assert_eq!(deser.format, MarkingFormat::Both);
    }

    #[test]
    fn test_marking_format_snake_case() {
        let json = serde_json::to_string(&MarkingFormat::Sidecar).expect("serialize");
        assert_eq!(json, "\"sidecar\"");
        let parsed: MarkingFormat = serde_json::from_str("\"both\"").expect("deserialize");
        assert_eq!(parsed, MarkingFormat::Both);
    }

    #[test]
    fn test_hash_config() {
        let hash = SyntheticContentMarker::hash_config("test config");
        assert_eq!(hash.len(), 64); // SHA-256 hex
    }

    #[test]
    fn test_hash_config_known_vectors() {
        // Standard SHA-256 test vectors for the empty string and "abc".
        assert_eq!(
            SyntheticContentMarker::hash_config(""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
        assert_eq!(
            SyntheticContentMarker::hash_config("abc"),
            "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
        );
    }

    #[test]
    fn test_disabled_marking() {
        let config = MarkingConfig {
            enabled: false,
            format: MarkingFormat::Embedded,
        };
        assert!(!config.enabled);
    }

    #[test]
    fn test_sidecar_creation() {
        let cred = ContentCredential::new("hash".to_string(), 42);
        let sidecar = SyntheticContentMarker::create_sidecar(&cred);
        assert!(sidecar.contains("DataSynth"));
        assert!(sidecar.contains("Article 50"));
    }

    #[test]
    fn test_sidecar_round_trip() {
        let cred = SyntheticContentMarker::create_credential("hash".to_string(), 7);
        let sidecar = SyntheticContentMarker::create_sidecar(&cred);
        let parsed: ContentCredential = serde_json::from_str(&sidecar).expect("deserialize");
        assert_eq!(parsed.seed, 7);
        assert_eq!(parsed.config_hash, "hash");
        assert_eq!(parsed.generator, cred.generator);
        assert_eq!(parsed.credential_version, cred.credential_version);
    }
}