Skip to main content

datasynth_eval/report/
comparison.rs

1//! Baseline comparison for evaluation reports.
2//!
3//! Compares current evaluation results against a baseline to track
4//! improvements or regressions over time.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9/// Direction of metric change.
10#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
11pub enum ChangeDirection {
12    /// Metric improved.
13    Improved,
14    /// Metric regressed.
15    Regressed,
16    /// No significant change.
17    Unchanged,
18}
19
20/// Significance of the change.
21#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
22pub enum ChangeSeverity {
23    /// Critical change requiring attention.
24    Critical,
25    /// Notable change.
26    Notable,
27    /// Minor change.
28    Minor,
29    /// Negligible change.
30    Negligible,
31}
32
33/// A single metric change between baseline and current.
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct MetricChange {
36    /// Metric name.
37    pub metric_name: String,
38    /// Metric category (e.g., "statistical", "coherence").
39    pub category: String,
40    /// Baseline value.
41    pub baseline_value: f64,
42    /// Current value.
43    pub current_value: f64,
44    /// Absolute change (current - baseline).
45    pub absolute_change: f64,
46    /// Percentage change ((current - baseline) / baseline * 100).
47    pub percent_change: f64,
48    /// Direction of change.
49    pub direction: ChangeDirection,
50    /// Severity of change.
51    pub severity: ChangeSeverity,
52    /// Whether higher values are better for this metric.
53    pub higher_is_better: bool,
54}
55
56impl MetricChange {
57    /// Create a new metric change.
58    pub fn new(
59        metric_name: impl Into<String>,
60        category: impl Into<String>,
61        baseline_value: f64,
62        current_value: f64,
63        higher_is_better: bool,
64    ) -> Self {
65        let absolute_change = current_value - baseline_value;
66        let percent_change = if baseline_value.abs() > 1e-10 {
67            (absolute_change / baseline_value) * 100.0
68        } else if current_value.abs() > 1e-10 {
69            100.0 // From zero to non-zero
70        } else {
71            0.0 // Both zero
72        };
73
74        // Determine direction based on whether higher is better
75        let direction = if absolute_change.abs() < 1e-6 {
76            ChangeDirection::Unchanged
77        } else if (absolute_change > 0.0) == higher_is_better {
78            ChangeDirection::Improved
79        } else {
80            ChangeDirection::Regressed
81        };
82
83        // Determine severity based on percent change
84        let severity = match percent_change.abs() {
85            x if x >= 20.0 => ChangeSeverity::Critical,
86            x if x >= 10.0 => ChangeSeverity::Notable,
87            x if x >= 2.0 => ChangeSeverity::Minor,
88            _ => ChangeSeverity::Negligible,
89        };
90
91        Self {
92            metric_name: metric_name.into(),
93            category: category.into(),
94            baseline_value,
95            current_value,
96            absolute_change,
97            percent_change,
98            direction,
99            severity,
100            higher_is_better,
101        }
102    }
103
104    /// Check if this change is a regression.
105    pub fn is_regression(&self) -> bool {
106        self.direction == ChangeDirection::Regressed
107    }
108
109    /// Check if this change is an improvement.
110    pub fn is_improvement(&self) -> bool {
111        self.direction == ChangeDirection::Improved
112    }
113
114    /// Check if this change is significant (notable or critical).
115    pub fn is_significant(&self) -> bool {
116        matches!(
117            self.severity,
118            ChangeSeverity::Critical | ChangeSeverity::Notable
119        )
120    }
121}
122
123/// Result of comparing current evaluation against baseline.
124#[derive(Debug, Clone, Serialize, Deserialize)]
125pub struct ComparisonResult {
126    /// Individual metric changes.
127    pub metric_changes: Vec<MetricChange>,
128    /// Number of improved metrics.
129    pub improvements: usize,
130    /// Number of regressed metrics.
131    pub regressions: usize,
132    /// Number of unchanged metrics.
133    pub unchanged: usize,
134    /// Number of critical regressions.
135    pub critical_regressions: usize,
136    /// Overall comparison summary.
137    pub summary: ComparisonSummary,
138}
139
140/// Summary of comparison results.
141#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
142pub enum ComparisonSummary {
143    /// Overall improvement.
144    Improved,
145    /// Overall regression.
146    Regressed,
147    /// Mixed results.
148    Mixed,
149    /// No significant changes.
150    Stable,
151}
152
153impl ComparisonResult {
154    /// Create a new comparison result from metric changes.
155    pub fn from_changes(metric_changes: Vec<MetricChange>) -> Self {
156        let improvements = metric_changes.iter().filter(|c| c.is_improvement()).count();
157        let regressions = metric_changes.iter().filter(|c| c.is_regression()).count();
158        let unchanged = metric_changes.len() - improvements - regressions;
159        let critical_regressions = metric_changes
160            .iter()
161            .filter(|c| c.is_regression() && c.severity == ChangeSeverity::Critical)
162            .count();
163
164        let summary = if critical_regressions > 0 {
165            ComparisonSummary::Regressed
166        } else if regressions == 0 && improvements > 0 {
167            ComparisonSummary::Improved
168        } else if regressions > 0 && improvements > 0 {
169            ComparisonSummary::Mixed
170        } else {
171            ComparisonSummary::Stable
172        };
173
174        Self {
175            metric_changes,
176            improvements,
177            regressions,
178            unchanged,
179            critical_regressions,
180            summary,
181        }
182    }
183
184    /// Get all regressions.
185    pub fn get_regressions(&self) -> Vec<&MetricChange> {
186        self.metric_changes
187            .iter()
188            .filter(|c| c.is_regression())
189            .collect()
190    }
191
192    /// Get all improvements.
193    pub fn get_improvements(&self) -> Vec<&MetricChange> {
194        self.metric_changes
195            .iter()
196            .filter(|c| c.is_improvement())
197            .collect()
198    }
199
200    /// Get significant changes only.
201    pub fn get_significant_changes(&self) -> Vec<&MetricChange> {
202        self.metric_changes
203            .iter()
204            .filter(|c| c.is_significant())
205            .collect()
206    }
207
208    /// Get changes by category.
209    pub fn get_by_category(&self, category: &str) -> Vec<&MetricChange> {
210        self.metric_changes
211            .iter()
212            .filter(|c| c.category == category)
213            .collect()
214    }
215}
216
217/// Baseline metrics for comparison.
218#[derive(Debug, Clone, Serialize, Deserialize)]
219pub struct BaselineComparison {
220    /// Baseline report metadata.
221    pub baseline_source: String,
222    /// When baseline was recorded.
223    pub baseline_timestamp: String,
224    /// Comparison results.
225    pub comparison: ComparisonResult,
226}
227
228impl BaselineComparison {
229    /// Create a new baseline comparison.
230    pub fn new(
231        baseline_source: impl Into<String>,
232        baseline_timestamp: impl Into<String>,
233        comparison: ComparisonResult,
234    ) -> Self {
235        Self {
236            baseline_source: baseline_source.into(),
237            baseline_timestamp: baseline_timestamp.into(),
238            comparison,
239        }
240    }
241}
242
/// Compares evaluation reports against baselines.
///
/// Holds the per-metric definitions (category and preferred direction) used
/// to interpret a change, plus a significance threshold in percent.
#[allow(dead_code)] // Reserved for baseline comparison feature
#[derive(Debug, Clone)]
pub struct BaselineComparer {
    /// Metric definitions with higher_is_better flags, keyed by metric name.
    metric_definitions: HashMap<String, MetricDefinition>,
    /// Threshold (in percent) for considering a change significant.
    significance_threshold: f64,
}

/// How a single metric should be interpreted when it changes.
#[allow(dead_code)] // Only used through `BaselineComparer`, which is itself reserved
#[derive(Debug, Clone)]
struct MetricDefinition {
    /// Category the metric is reported under.
    category: String,
    /// `true` when larger values of the metric are preferable.
    higher_is_better: bool,
}
258
259#[allow(dead_code)]
260impl BaselineComparer {
261    /// Create a new baseline comparer with default metric definitions.
262    pub fn new() -> Self {
263        let mut definitions = HashMap::new();
264
265        // Statistical metrics (higher p-values are better)
266        definitions.insert(
267            "benford_p_value".to_string(),
268            MetricDefinition {
269                category: "statistical".to_string(),
270                higher_is_better: true,
271            },
272        );
273        definitions.insert(
274            "benford_mad".to_string(),
275            MetricDefinition {
276                category: "statistical".to_string(),
277                higher_is_better: false, // Lower MAD is better
278            },
279        );
280        definitions.insert(
281            "amount_ks_p_value".to_string(),
282            MetricDefinition {
283                category: "statistical".to_string(),
284                higher_is_better: true,
285            },
286        );
287        definitions.insert(
288            "temporal_correlation".to_string(),
289            MetricDefinition {
290                category: "statistical".to_string(),
291                higher_is_better: true,
292            },
293        );
294
295        // Coherence metrics (higher is better)
296        definitions.insert(
297            "balance_sheet_balanced".to_string(),
298            MetricDefinition {
299                category: "coherence".to_string(),
300                higher_is_better: true,
301            },
302        );
303        definitions.insert(
304            "subledger_reconciliation".to_string(),
305            MetricDefinition {
306                category: "coherence".to_string(),
307                higher_is_better: true,
308            },
309        );
310        definitions.insert(
311            "document_chain_completion".to_string(),
312            MetricDefinition {
313                category: "coherence".to_string(),
314                higher_is_better: true,
315            },
316        );
317        definitions.insert(
318            "ic_match_rate".to_string(),
319            MetricDefinition {
320                category: "coherence".to_string(),
321                higher_is_better: true,
322            },
323        );
324
325        // Quality metrics
326        definitions.insert(
327            "duplicate_rate".to_string(),
328            MetricDefinition {
329                category: "quality".to_string(),
330                higher_is_better: false, // Lower is better
331            },
332        );
333        definitions.insert(
334            "completeness".to_string(),
335            MetricDefinition {
336                category: "quality".to_string(),
337                higher_is_better: true,
338            },
339        );
340        definitions.insert(
341            "format_consistency".to_string(),
342            MetricDefinition {
343                category: "quality".to_string(),
344                higher_is_better: true,
345            },
346        );
347
348        // ML metrics
349        definitions.insert(
350            "anomaly_rate".to_string(),
351            MetricDefinition {
352                category: "ml".to_string(),
353                higher_is_better: true, // We want anomalies for training
354            },
355        );
356        definitions.insert(
357            "label_coverage".to_string(),
358            MetricDefinition {
359                category: "ml".to_string(),
360                higher_is_better: true,
361            },
362        );
363        definitions.insert(
364            "graph_connectivity".to_string(),
365            MetricDefinition {
366                category: "ml".to_string(),
367                higher_is_better: true,
368            },
369        );
370
371        Self {
372            metric_definitions: definitions,
373            significance_threshold: 2.0, // 2% change is significant
374        }
375    }
376
377    /// Set significance threshold (in percent).
378    pub fn with_significance_threshold(mut self, threshold: f64) -> Self {
379        self.significance_threshold = threshold;
380        self
381    }
382
383    /// Add a custom metric definition.
384    pub fn add_metric(
385        &mut self,
386        name: impl Into<String>,
387        category: impl Into<String>,
388        higher_is_better: bool,
389    ) {
390        self.metric_definitions.insert(
391            name.into(),
392            MetricDefinition {
393                category: category.into(),
394                higher_is_better,
395            },
396        );
397    }
398
399    /// Compare baseline and current metric values.
400    pub fn compare(
401        &self,
402        baseline: &HashMap<String, f64>,
403        current: &HashMap<String, f64>,
404    ) -> ComparisonResult {
405        let mut changes = Vec::new();
406
407        for (metric_name, &current_value) in current {
408            if let Some(&baseline_value) = baseline.get(metric_name) {
409                let (category, higher_is_better) = self
410                    .metric_definitions
411                    .get(metric_name)
412                    .map(|d| (d.category.clone(), d.higher_is_better))
413                    .unwrap_or(("unknown".to_string(), true));
414
415                changes.push(MetricChange::new(
416                    metric_name.clone(),
417                    category,
418                    baseline_value,
419                    current_value,
420                    higher_is_better,
421                ));
422            }
423        }
424
425        ComparisonResult::from_changes(changes)
426    }
427
428    /// Create a baseline comparison from metric maps.
429    pub fn create_comparison(
430        &self,
431        baseline_source: impl Into<String>,
432        baseline_timestamp: impl Into<String>,
433        baseline_metrics: &HashMap<String, f64>,
434        current_metrics: &HashMap<String, f64>,
435    ) -> BaselineComparison {
436        let comparison = self.compare(baseline_metrics, current_metrics);
437        BaselineComparison::new(baseline_source, baseline_timestamp, comparison)
438    }
439}
440
441impl Default for BaselineComparer {
442    fn default() -> Self {
443        Self::new()
444    }
445}
446
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a metric map from `(name, value)` pairs.
    fn metrics(pairs: &[(&str, f64)]) -> HashMap<String, f64> {
        let mut map = HashMap::new();
        for &(name, value) in pairs {
            map.insert(name.to_string(), value);
        }
        map
    }

    #[test]
    fn test_metric_change_improvement() {
        // Higher is better and the value went up.
        let higher_is_better = true;
        let change = MetricChange::new("completeness", "quality", 0.90, 0.95, higher_is_better);

        assert_eq!(change.direction, ChangeDirection::Improved);
        assert!(change.is_improvement());
        assert!(!change.is_regression());
    }

    #[test]
    fn test_metric_change_regression() {
        // Higher is better and the value went down.
        let higher_is_better = true;
        let change = MetricChange::new("completeness", "quality", 0.95, 0.90, higher_is_better);

        assert_eq!(change.direction, ChangeDirection::Regressed);
        assert!(change.is_regression());
        assert!(!change.is_improvement());
    }

    #[test]
    fn test_metric_change_lower_is_better() {
        // Lower is better and the value went down.
        let higher_is_better = false;
        let change = MetricChange::new("duplicate_rate", "quality", 0.05, 0.02, higher_is_better);

        assert_eq!(change.direction, ChangeDirection::Improved);
        assert!(change.is_improvement());
    }

    #[test]
    fn test_comparison_result() {
        // One improvement, one regression and one unchanged metric.
        let improved = MetricChange::new("metric1", "cat1", 0.80, 0.90, true);
        let regressed = MetricChange::new("metric2", "cat1", 0.90, 0.85, true);
        let steady = MetricChange::new("metric3", "cat2", 0.95, 0.95, true);

        let result = ComparisonResult::from_changes(vec![improved, regressed, steady]);

        assert_eq!(
            (result.improvements, result.regressions, result.unchanged),
            (1, 1, 1)
        );
        assert_eq!(result.summary, ComparisonSummary::Mixed);
    }

    #[test]
    fn test_baseline_comparer() {
        let baseline = metrics(&[("completeness", 0.90), ("duplicate_rate", 0.05)]);
        let current = metrics(&[("completeness", 0.95), ("duplicate_rate", 0.03)]);

        let result = BaselineComparer::new().compare(&baseline, &current);

        assert_eq!(result.improvements, 2);
        assert_eq!(result.regressions, 0);
    }

    #[test]
    fn test_critical_severity() {
        // 0.50 -> 0.70 is a 40% change, well past the critical band.
        let change = MetricChange::new("metric", "category", 0.50, 0.70, true);

        assert!(change.percent_change >= 20.0);
        assert_eq!(change.severity, ChangeSeverity::Critical);
    }
}