Skip to main content

datasynth_core/compliance/
article10.rs

1//! EU AI Act Article 10 — Data Governance Report.
2//!
3//! Generates documentation about data sources, processing steps,
4//! quality measures, and bias assessment for synthetic data.
5
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
10/// Data governance report per EU AI Act Article 10.
11///
12/// Documents the provenance, processing, and quality of generated data.
13#[derive(Debug, Clone, Serialize, Deserialize)]
14pub struct DataGovernanceReport {
15    /// Report version.
16    pub report_version: String,
17    /// Generation timestamp.
18    pub generated_at: DateTime<Utc>,
19    /// Generator name and version.
20    pub generator: String,
21    /// Data sources used for generation.
22    pub data_sources: Vec<DataSourceEntry>,
23    /// Processing steps applied during generation.
24    pub processing_steps: Vec<ProcessingStep>,
25    /// Quality measures applied and their results.
26    pub quality_measures: Vec<QualityMeasure>,
27    /// Bias assessment summary.
28    pub bias_assessment: BiasAssessment,
29    /// Configuration hash for traceability.
30    pub config_hash: String,
31    /// Seed used for reproducibility.
32    pub seed: u64,
33    /// Additional metadata.
34    #[serde(default)]
35    pub metadata: HashMap<String, String>,
36}
37
38impl DataGovernanceReport {
39    /// Create a new data governance report.
40    pub fn new(config_hash: String, seed: u64) -> Self {
41        Self {
42            report_version: "1.0".to_string(),
43            generated_at: Utc::now(),
44            generator: format!("DataSynth v{}", env!("CARGO_PKG_VERSION")),
45            data_sources: vec![DataSourceEntry {
46                name: "Synthetic generation (no real data used)".to_string(),
47                description: "All data is algorithmically generated using statistical distributions, domain models, and configurable parameters. No real personal or corporate data is used as input.".to_string(),
48                source_type: "synthetic".to_string(),
49                contains_personal_data: false,
50            }],
51            processing_steps: Vec::new(),
52            quality_measures: Vec::new(),
53            bias_assessment: BiasAssessment::default(),
54            config_hash,
55            seed,
56            metadata: HashMap::new(),
57        }
58    }
59
60    /// Add processing steps based on the phases that were executed.
61    pub fn add_standard_processing_steps(&mut self) {
62        self.processing_steps = vec![
63            ProcessingStep {
64                name: "Chart of Accounts Generation".to_string(),
65                description: "Generate GL account structure based on industry and complexity"
66                    .to_string(),
67                order: 1,
68            },
69            ProcessingStep {
70                name: "Master Data Generation".to_string(),
71                description: "Generate vendors, customers, materials, fixed assets, employees"
72                    .to_string(),
73                order: 2,
74            },
75            ProcessingStep {
76                name: "Document Flow Generation".to_string(),
77                description: "Generate P2P and O2C document chains with three-way matching"
78                    .to_string(),
79                order: 3,
80            },
81            ProcessingStep {
82                name: "Journal Entry Generation".to_string(),
83                description: "Generate balanced journal entries following Benford's Law"
84                    .to_string(),
85                order: 4,
86            },
87            ProcessingStep {
88                name: "Anomaly Injection".to_string(),
89                description:
90                    "Inject configurable fraud and error patterns with ground truth labels"
91                        .to_string(),
92                order: 5,
93            },
94            ProcessingStep {
95                name: "Quality Validation".to_string(),
96                description:
97                    "Validate balance coherence, referential integrity, and statistical properties"
98                        .to_string(),
99                order: 6,
100            },
101        ];
102    }
103
104    /// Add standard quality measures.
105    pub fn add_standard_quality_measures(&mut self) {
106        self.quality_measures = vec![
107            QualityMeasure {
108                name: "Benford's Law Compliance".to_string(),
109                description: "First-digit distribution follows Benford's Law (MAD < 0.015)"
110                    .to_string(),
111                result: "Applied".to_string(),
112            },
113            QualityMeasure {
114                name: "Balance Coherence".to_string(),
115                description: "All journal entries are balanced (debits = credits)".to_string(),
116                result: "Enforced at construction".to_string(),
117            },
118            QualityMeasure {
119                name: "Deterministic Reproducibility".to_string(),
120                description: "Same config + seed produces identical output".to_string(),
121                result: "ChaCha8 RNG with configurable seed".to_string(),
122            },
123            QualityMeasure {
124                name: "Referential Integrity".to_string(),
125                description: "All foreign key references are valid".to_string(),
126                result: "Applied".to_string(),
127            },
128        ];
129    }
130}
131
132/// A data source entry in the governance report.
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct DataSourceEntry {
135    /// Source name.
136    pub name: String,
137    /// Source description.
138    pub description: String,
139    /// Type of source (synthetic, real, derived).
140    pub source_type: String,
141    /// Whether the source contains personal data.
142    pub contains_personal_data: bool,
143}
144
145/// A processing step in the generation pipeline.
146#[derive(Debug, Clone, Serialize, Deserialize)]
147pub struct ProcessingStep {
148    /// Step name.
149    pub name: String,
150    /// Step description.
151    pub description: String,
152    /// Order in the pipeline.
153    pub order: u32,
154}
155
156/// A quality measure applied during generation.
157#[derive(Debug, Clone, Serialize, Deserialize)]
158pub struct QualityMeasure {
159    /// Measure name.
160    pub name: String,
161    /// Measure description.
162    pub description: String,
163    /// Result or status.
164    pub result: String,
165}
166
167/// Bias assessment for the generated data.
168#[derive(Debug, Clone, Serialize, Deserialize)]
169pub struct BiasAssessment {
170    /// Overall assessment.
171    pub assessment: String,
172    /// Known limitations.
173    pub known_limitations: Vec<String>,
174    /// Mitigation measures.
175    pub mitigation_measures: Vec<String>,
176}
177
178impl Default for BiasAssessment {
179    fn default() -> Self {
180        Self {
181            assessment: "Synthetic data generation uses configurable statistical distributions. \
182                         Bias characteristics are determined by configuration parameters (industry profiles, \
183                         distribution parameters, anomaly rates) rather than real-world data."
184                .to_string(),
185            known_limitations: vec![
186                "Generated data reflects configured distribution parameters, not real-world distributions".to_string(),
187                "Industry profiles are approximations based on published research".to_string(),
188                "Temporal patterns use simplified models of business cycles".to_string(),
189            ],
190            mitigation_measures: vec![
191                "All configuration parameters are documented and reproducible".to_string(),
192                "Evaluation framework validates statistical properties of output".to_string(),
193                "AutoTuner can adjust parameters based on evaluation feedback".to_string(),
194                "Users should validate generated data against their specific use case requirements".to_string(),
195            ],
196        }
197    }
198}
199
#[cfg(test)]
// Unwrapping is acceptable in test code: a panic simply fails the test.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// A fresh report carries the constructor arguments and the single
    /// default synthetic source, with every optional collection empty.
    #[test]
    fn test_report_creation() {
        let report = DataGovernanceReport::new("hash123".to_string(), 42);
        assert_eq!(report.report_version, "1.0");
        assert_eq!(report.seed, 42);
        assert_eq!(report.config_hash, "hash123");
        assert!(report.generator.starts_with("DataSynth v"));
        assert_eq!(report.data_sources.len(), 1);
        assert_eq!(report.data_sources[0].source_type, "synthetic");
        assert!(!report.data_sources[0].contains_personal_data);
        assert!(report.processing_steps.is_empty());
        assert!(report.quality_measures.is_empty());
        assert!(report.metadata.is_empty());
    }

    /// Standard steps are numbered consecutively from 1.
    #[test]
    fn test_standard_processing_steps() {
        let mut report = DataGovernanceReport::new("hash".to_string(), 42);
        report.add_standard_processing_steps();
        assert!(report.processing_steps.len() >= 5);
        for (step, expected_order) in report.processing_steps.iter().zip(1u32..) {
            assert_eq!(step.order, expected_order);
            assert!(!step.name.is_empty());
            assert!(!step.description.is_empty());
        }
    }

    /// The method assigns the list, so a second call must not duplicate it.
    #[test]
    fn test_standard_processing_steps_replace_existing() {
        let mut report = DataGovernanceReport::new("hash".to_string(), 42);
        report.add_standard_processing_steps();
        let first_len = report.processing_steps.len();
        report.add_standard_processing_steps();
        assert_eq!(report.processing_steps.len(), first_len);
    }

    /// Standard measures are populated and fully filled in.
    #[test]
    fn test_standard_quality_measures() {
        let mut report = DataGovernanceReport::new("hash".to_string(), 42);
        report.add_standard_quality_measures();
        assert!(report.quality_measures.len() >= 3);
        for measure in &report.quality_measures {
            assert!(!measure.name.is_empty());
            assert!(!measure.description.is_empty());
            assert!(!measure.result.is_empty());
        }
    }

    /// The method assigns the list, so a second call must not duplicate it.
    #[test]
    fn test_standard_quality_measures_replace_existing() {
        let mut report = DataGovernanceReport::new("hash".to_string(), 42);
        report.add_standard_quality_measures();
        let first_len = report.quality_measures.len();
        report.add_standard_quality_measures();
        assert_eq!(report.quality_measures.len(), first_len);
    }

    #[test]
    fn test_bias_assessment_default() {
        let assessment = BiasAssessment::default();
        assert!(!assessment.assessment.is_empty());
        assert!(!assessment.known_limitations.is_empty());
        assert!(!assessment.mitigation_measures.is_empty());
    }

    /// Serialize, check the expected keys are present, and confirm the
    /// deserialized report matches the original field by field.
    #[test]
    fn test_report_serialization() {
        let mut report = DataGovernanceReport::new("hash".to_string(), 42);
        report.add_standard_processing_steps();
        report.add_standard_quality_measures();
        let json = serde_json::to_string_pretty(&report).expect("should serialize");
        assert!(json.contains("DataSynth"));
        assert!(json.contains("data_sources"));

        let deser: DataGovernanceReport = serde_json::from_str(&json).expect("should deserialize");
        assert_eq!(deser.seed, report.seed);
        assert_eq!(deser.config_hash, report.config_hash);
        assert_eq!(deser.report_version, report.report_version);
        assert_eq!(deser.generator, report.generator);
        assert_eq!(deser.data_sources.len(), report.data_sources.len());
        assert_eq!(deser.processing_steps.len(), report.processing_steps.len());
        assert_eq!(deser.quality_measures.len(), report.quality_measures.len());
        assert_eq!(
            deser.bias_assessment.assessment,
            report.bias_assessment.assessment
        );
    }

    /// `metadata` is marked `#[serde(default)]`, so input without that key
    /// must still deserialize and yield an empty map.
    #[test]
    fn test_metadata_defaults_when_missing() {
        let report = DataGovernanceReport::new("hash".to_string(), 42);
        let mut value = serde_json::to_value(&report).expect("should serialize");
        value
            .as_object_mut()
            .expect("report serializes to a JSON object")
            .remove("metadata");
        let deser: DataGovernanceReport =
            serde_json::from_value(value).expect("should deserialize without metadata");
        assert!(deser.metadata.is_empty());
    }
}