// datasynth_eval/report/comparison.rs
//! Baseline comparison for evaluation reports.
//!
//! Compares current evaluation results against a baseline to track
//! improvements or regressions over time.

use serde::{Deserialize, Serialize};

8/// Direction of metric change.
9#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
10pub enum ChangeDirection {
11    /// Metric improved.
12    Improved,
13    /// Metric regressed.
14    Regressed,
15    /// No significant change.
16    Unchanged,
17}
18
19/// Significance of the change.
20#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
21pub enum ChangeSeverity {
22    /// Critical change requiring attention.
23    Critical,
24    /// Notable change.
25    Notable,
26    /// Minor change.
27    Minor,
28    /// Negligible change.
29    Negligible,
30}
31
32/// A single metric change between baseline and current.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct MetricChange {
35    /// Metric name.
36    pub metric_name: String,
37    /// Metric category (e.g., "statistical", "coherence").
38    pub category: String,
39    /// Baseline value.
40    pub baseline_value: f64,
41    /// Current value.
42    pub current_value: f64,
43    /// Absolute change (current - baseline).
44    pub absolute_change: f64,
45    /// Percentage change ((current - baseline) / baseline * 100).
46    pub percent_change: f64,
47    /// Direction of change.
48    pub direction: ChangeDirection,
49    /// Severity of change.
50    pub severity: ChangeSeverity,
51    /// Whether higher values are better for this metric.
52    pub higher_is_better: bool,
53}
54
55impl MetricChange {
56    /// Create a new metric change.
57    pub fn new(
58        metric_name: impl Into<String>,
59        category: impl Into<String>,
60        baseline_value: f64,
61        current_value: f64,
62        higher_is_better: bool,
63    ) -> Self {
64        let absolute_change = current_value - baseline_value;
65        let percent_change = if baseline_value.abs() > 1e-10 {
66            (absolute_change / baseline_value) * 100.0
67        } else if current_value.abs() > 1e-10 {
68            100.0 // From zero to non-zero
69        } else {
70            0.0 // Both zero
71        };
72
73        // Determine direction based on whether higher is better
74        let direction = if absolute_change.abs() < 1e-6 {
75            ChangeDirection::Unchanged
76        } else if (absolute_change > 0.0) == higher_is_better {
77            ChangeDirection::Improved
78        } else {
79            ChangeDirection::Regressed
80        };
81
82        // Determine severity based on percent change
83        let severity = match percent_change.abs() {
84            x if x >= 20.0 => ChangeSeverity::Critical,
85            x if x >= 10.0 => ChangeSeverity::Notable,
86            x if x >= 2.0 => ChangeSeverity::Minor,
87            _ => ChangeSeverity::Negligible,
88        };
89
90        Self {
91            metric_name: metric_name.into(),
92            category: category.into(),
93            baseline_value,
94            current_value,
95            absolute_change,
96            percent_change,
97            direction,
98            severity,
99            higher_is_better,
100        }
101    }
102
103    /// Check if this change is a regression.
104    pub fn is_regression(&self) -> bool {
105        self.direction == ChangeDirection::Regressed
106    }
107
108    /// Check if this change is an improvement.
109    pub fn is_improvement(&self) -> bool {
110        self.direction == ChangeDirection::Improved
111    }
112
113    /// Check if this change is significant (notable or critical).
114    pub fn is_significant(&self) -> bool {
115        matches!(
116            self.severity,
117            ChangeSeverity::Critical | ChangeSeverity::Notable
118        )
119    }
120}
121
122/// Result of comparing current evaluation against baseline.
123#[derive(Debug, Clone, Serialize, Deserialize)]
124pub struct ComparisonResult {
125    /// Individual metric changes.
126    pub metric_changes: Vec<MetricChange>,
127    /// Number of improved metrics.
128    pub improvements: usize,
129    /// Number of regressed metrics.
130    pub regressions: usize,
131    /// Number of unchanged metrics.
132    pub unchanged: usize,
133    /// Number of critical regressions.
134    pub critical_regressions: usize,
135    /// Overall comparison summary.
136    pub summary: ComparisonSummary,
137}
138
139/// Summary of comparison results.
140#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
141pub enum ComparisonSummary {
142    /// Overall improvement.
143    Improved,
144    /// Overall regression.
145    Regressed,
146    /// Mixed results.
147    Mixed,
148    /// No significant changes.
149    Stable,
150}
151
152impl ComparisonResult {
153    /// Create a new comparison result from metric changes.
154    pub fn from_changes(metric_changes: Vec<MetricChange>) -> Self {
155        let improvements = metric_changes.iter().filter(|c| c.is_improvement()).count();
156        let regressions = metric_changes.iter().filter(|c| c.is_regression()).count();
157        let unchanged = metric_changes.len() - improvements - regressions;
158        let critical_regressions = metric_changes
159            .iter()
160            .filter(|c| c.is_regression() && c.severity == ChangeSeverity::Critical)
161            .count();
162
163        let summary = if critical_regressions > 0 {
164            ComparisonSummary::Regressed
165        } else if regressions == 0 && improvements > 0 {
166            ComparisonSummary::Improved
167        } else if regressions > 0 && improvements > 0 {
168            ComparisonSummary::Mixed
169        } else {
170            ComparisonSummary::Stable
171        };
172
173        Self {
174            metric_changes,
175            improvements,
176            regressions,
177            unchanged,
178            critical_regressions,
179            summary,
180        }
181    }
182
183    /// Get all regressions.
184    pub fn get_regressions(&self) -> Vec<&MetricChange> {
185        self.metric_changes
186            .iter()
187            .filter(|c| c.is_regression())
188            .collect()
189    }
190
191    /// Get all improvements.
192    pub fn get_improvements(&self) -> Vec<&MetricChange> {
193        self.metric_changes
194            .iter()
195            .filter(|c| c.is_improvement())
196            .collect()
197    }
198
199    /// Get significant changes only.
200    pub fn get_significant_changes(&self) -> Vec<&MetricChange> {
201        self.metric_changes
202            .iter()
203            .filter(|c| c.is_significant())
204            .collect()
205    }
206
207    /// Get changes by category.
208    pub fn get_by_category(&self, category: &str) -> Vec<&MetricChange> {
209        self.metric_changes
210            .iter()
211            .filter(|c| c.category == category)
212            .collect()
213    }
214}
215
216/// Baseline metrics for comparison.
217#[derive(Debug, Clone, Serialize, Deserialize)]
218pub struct BaselineComparison {
219    /// Baseline report metadata.
220    pub baseline_source: String,
221    /// When baseline was recorded.
222    pub baseline_timestamp: String,
223    /// Comparison results.
224    pub comparison: ComparisonResult,
225}
226
227impl BaselineComparison {
228    /// Create a new baseline comparison.
229    pub fn new(
230        baseline_source: impl Into<String>,
231        baseline_timestamp: impl Into<String>,
232        comparison: ComparisonResult,
233    ) -> Self {
234        Self {
235            baseline_source: baseline_source.into(),
236            baseline_timestamp: baseline_timestamp.into(),
237            comparison,
238        }
239    }
240}
241
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    #[test]
    fn test_metric_change_improvement() {
        // Higher is better and the value went up: an improvement.
        let change = MetricChange::new("completeness", "quality", 0.90, 0.95, true);

        assert_eq!(change.direction, ChangeDirection::Improved);
        assert!(change.is_improvement());
        assert!(!change.is_regression());
    }

    #[test]
    fn test_metric_change_regression() {
        // Higher is better but the value went down: a regression.
        let change = MetricChange::new("completeness", "quality", 0.95, 0.90, true);

        assert_eq!(change.direction, ChangeDirection::Regressed);
        assert!(change.is_regression());
        assert!(!change.is_improvement());
    }

    #[test]
    fn test_metric_change_lower_is_better() {
        // Lower is better and the value went down: an improvement.
        let change = MetricChange::new("duplicate_rate", "quality", 0.05, 0.02, false);

        assert_eq!(change.direction, ChangeDirection::Improved);
        assert!(change.is_improvement());
    }

    #[test]
    fn test_comparison_result() {
        // One improvement, one regression, one unchanged -> Mixed.
        let result = ComparisonResult::from_changes(vec![
            MetricChange::new("metric1", "cat1", 0.80, 0.90, true),
            MetricChange::new("metric2", "cat1", 0.90, 0.85, true),
            MetricChange::new("metric3", "cat2", 0.95, 0.95, true),
        ]);

        assert_eq!(result.improvements, 1);
        assert_eq!(result.regressions, 1);
        assert_eq!(result.unchanged, 1);
        assert_eq!(result.summary, ComparisonSummary::Mixed);
    }

    #[test]
    fn test_critical_severity() {
        // 0.50 -> 0.70 is a +40% change, well over the 20% threshold.
        let change = MetricChange::new("metric", "category", 0.50, 0.70, true);

        assert!(change.percent_change >= 20.0);
        assert_eq!(change.severity, ChangeSeverity::Critical);
    }
}