Skip to main content

datasynth_eval/ml/
mod.rs

//! ML-readiness evaluation module.
//!
//! Validates that generated data is suitable for machine learning tasks,
//! including feature distributions, label quality, train/test splits, and
//! graph structure.
//!
//! Also provides baseline task definitions for benchmarking synthetic data.
7
8mod baselines;
9mod features;
10mod graph;
11mod labels;
12mod splits;
13
14pub use baselines::{
15    get_accounting_baseline_tasks, BaselineAlgorithm, BaselineConfig, BaselineEvaluation,
16    BaselineResult, BaselineSummary, BaselineTask, ClassificationMetrics, ExpectedMetrics,
17    MLTaskType, PerformanceGrade, RankingMetrics, RegressionMetrics,
18};
19pub use features::{FeatureAnalysis, FeatureAnalyzer, FeatureStats};
20pub use graph::{GraphAnalysis, GraphAnalyzer, GraphMetrics};
21pub use labels::{LabelAnalysis, LabelAnalyzer, LabelDistribution};
22pub use splits::{SplitAnalysis, SplitAnalyzer, SplitMetrics};
23
24use serde::{Deserialize, Serialize};
25
26/// Combined ML-readiness evaluation results.
27#[derive(Debug, Clone, Serialize, Deserialize)]
28pub struct MLReadinessEvaluation {
29    /// Feature distribution analysis.
30    pub features: Option<FeatureAnalysis>,
31    /// Label quality analysis.
32    pub labels: Option<LabelAnalysis>,
33    /// Train/test split analysis.
34    pub splits: Option<SplitAnalysis>,
35    /// Graph structure analysis.
36    pub graph: Option<GraphAnalysis>,
37    /// Overall ML-readiness score (0.0-1.0).
38    pub overall_score: f64,
39    /// Whether data meets ML-readiness criteria.
40    pub passes: bool,
41    /// ML-readiness issues found.
42    pub issues: Vec<String>,
43    /// ML-readiness failures (alias for issues).
44    pub failures: Vec<String>,
45}
46
47impl MLReadinessEvaluation {
48    /// Create a new empty evaluation.
49    pub fn new() -> Self {
50        Self {
51            features: None,
52            labels: None,
53            splits: None,
54            graph: None,
55            overall_score: 1.0,
56            passes: true,
57            issues: Vec::new(),
58            failures: Vec::new(),
59        }
60    }
61
62    /// Check all results against thresholds.
63    pub fn check_thresholds(&mut self, thresholds: &crate::config::EvaluationThresholds) {
64        self.issues.clear();
65        self.failures.clear();
66        let mut scores = Vec::new();
67
68        if let Some(ref labels) = self.labels {
69            // Check anomaly rate is within expected range
70            if labels.anomaly_rate < thresholds.anomaly_rate_min {
71                self.issues.push(format!(
72                    "Anomaly rate {} < {} (min threshold)",
73                    labels.anomaly_rate, thresholds.anomaly_rate_min
74                ));
75            }
76            if labels.anomaly_rate > thresholds.anomaly_rate_max {
77                self.issues.push(format!(
78                    "Anomaly rate {} > {} (max threshold)",
79                    labels.anomaly_rate, thresholds.anomaly_rate_max
80                ));
81            }
82
83            // Check label coverage
84            if labels.label_coverage < thresholds.label_coverage_min {
85                self.issues.push(format!(
86                    "Label coverage {} < {} (threshold)",
87                    labels.label_coverage, thresholds.label_coverage_min
88                ));
89            }
90
91            scores.push(labels.quality_score);
92        }
93
94        if let Some(ref splits) = self.splits {
95            if !splits.is_valid {
96                self.issues
97                    .push("Train/test split validation failed".to_string());
98            }
99            scores.push(if splits.is_valid { 1.0 } else { 0.0 });
100        }
101
102        if let Some(ref graph) = self.graph {
103            if graph.connectivity_score < thresholds.graph_connectivity_min {
104                self.issues.push(format!(
105                    "Graph connectivity {} < {} (threshold)",
106                    graph.connectivity_score, thresholds.graph_connectivity_min
107                ));
108            }
109            scores.push(graph.connectivity_score);
110        }
111
112        if let Some(ref features) = self.features {
113            scores.push(features.quality_score);
114        }
115
116        self.overall_score = if scores.is_empty() {
117            1.0
118        } else {
119            scores.iter().sum::<f64>() / scores.len() as f64
120        };
121
122        // Sync failures with issues
123        self.failures = self.issues.clone();
124        self.passes = self.issues.is_empty();
125    }
126}
127
128impl Default for MLReadinessEvaluation {
129    fn default() -> Self {
130        Self::new()
131    }
132}