datasynth_eval/ml/
mod.rs

1//! ML-readiness evaluation module.
2//!
3//! Validates that generated data is suitable for machine learning tasks
4//! including feature distributions, label quality, and graph structure.
5
6mod features;
7mod graph;
8mod labels;
9mod splits;
10
11pub use features::{FeatureAnalysis, FeatureAnalyzer, FeatureStats};
12pub use graph::{GraphAnalysis, GraphAnalyzer, GraphMetrics};
13pub use labels::{LabelAnalysis, LabelAnalyzer, LabelDistribution};
14pub use splits::{SplitAnalysis, SplitAnalyzer, SplitMetrics};
15
16use serde::{Deserialize, Serialize};
17
18/// Combined ML-readiness evaluation results.
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct MLReadinessEvaluation {
21    /// Feature distribution analysis.
22    pub features: Option<FeatureAnalysis>,
23    /// Label quality analysis.
24    pub labels: Option<LabelAnalysis>,
25    /// Train/test split analysis.
26    pub splits: Option<SplitAnalysis>,
27    /// Graph structure analysis.
28    pub graph: Option<GraphAnalysis>,
29    /// Overall ML-readiness score (0.0-1.0).
30    pub overall_score: f64,
31    /// Whether data meets ML-readiness criteria.
32    pub passes: bool,
33    /// ML-readiness issues found.
34    pub issues: Vec<String>,
35    /// ML-readiness failures (alias for issues).
36    pub failures: Vec<String>,
37}
38
39impl MLReadinessEvaluation {
40    /// Create a new empty evaluation.
41    pub fn new() -> Self {
42        Self {
43            features: None,
44            labels: None,
45            splits: None,
46            graph: None,
47            overall_score: 1.0,
48            passes: true,
49            issues: Vec::new(),
50            failures: Vec::new(),
51        }
52    }
53
54    /// Check all results against thresholds.
55    pub fn check_thresholds(&mut self, thresholds: &crate::config::EvaluationThresholds) {
56        self.issues.clear();
57        self.failures.clear();
58        let mut scores = Vec::new();
59
60        if let Some(ref labels) = self.labels {
61            // Check anomaly rate is within expected range
62            if labels.anomaly_rate < thresholds.anomaly_rate_min {
63                self.issues.push(format!(
64                    "Anomaly rate {} < {} (min threshold)",
65                    labels.anomaly_rate, thresholds.anomaly_rate_min
66                ));
67            }
68            if labels.anomaly_rate > thresholds.anomaly_rate_max {
69                self.issues.push(format!(
70                    "Anomaly rate {} > {} (max threshold)",
71                    labels.anomaly_rate, thresholds.anomaly_rate_max
72                ));
73            }
74
75            // Check label coverage
76            if labels.label_coverage < thresholds.label_coverage_min {
77                self.issues.push(format!(
78                    "Label coverage {} < {} (threshold)",
79                    labels.label_coverage, thresholds.label_coverage_min
80                ));
81            }
82
83            scores.push(labels.quality_score);
84        }
85
86        if let Some(ref splits) = self.splits {
87            if !splits.is_valid {
88                self.issues
89                    .push("Train/test split validation failed".to_string());
90            }
91            scores.push(if splits.is_valid { 1.0 } else { 0.0 });
92        }
93
94        if let Some(ref graph) = self.graph {
95            if graph.connectivity_score < thresholds.graph_connectivity_min {
96                self.issues.push(format!(
97                    "Graph connectivity {} < {} (threshold)",
98                    graph.connectivity_score, thresholds.graph_connectivity_min
99                ));
100            }
101            scores.push(graph.connectivity_score);
102        }
103
104        if let Some(ref features) = self.features {
105            scores.push(features.quality_score);
106        }
107
108        self.overall_score = if scores.is_empty() {
109            1.0
110        } else {
111            scores.iter().sum::<f64>() / scores.len() as f64
112        };
113
114        // Sync failures with issues
115        self.failures = self.issues.clone();
116        self.passes = self.issues.is_empty();
117    }
118}
119
120impl Default for MLReadinessEvaluation {
121    fn default() -> Self {
122        Self::new()
123    }
124}