Skip to main content

atomr_agents_eval/
regression.rs

1use serde::{Deserialize, Serialize};
2
3use crate::suite::EvalRun;
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
6pub struct RegressionResult {
7    pub baseline_pass_rate: f32,
8    pub current_pass_rate: f32,
9    pub delta: f32,
10    pub blocked: bool,
11    pub reason: String,
12}
13
/// Compares a current `EvalRun` against a baseline and blocks publication
/// when the pass rate regressed by more than `tolerance`.
#[derive(Debug, Clone, PartialEq)]
pub struct RegressionGate {
    /// Largest accepted drop in pass rate (baseline minus current), expressed
    /// in the same units as `EvalRun::pass_rate`. A drop strictly greater than
    /// this value blocks; a drop exactly equal to it is still accepted.
    pub tolerance: f32,
}
19
20impl RegressionGate {
21    pub fn check(&self, baseline: &EvalRun, current: &EvalRun) -> RegressionResult {
22        let delta = current.pass_rate() - baseline.pass_rate();
23        let blocked = delta < -self.tolerance;
24        let reason = if blocked {
25            format!(
26                "pass_rate dropped from {:.2} to {:.2} (tolerance {:.2})",
27                baseline.pass_rate(),
28                current.pass_rate(),
29                self.tolerance
30            )
31        } else {
32            "ok".into()
33        };
34        RegressionResult {
35            baseline_pass_rate: baseline.pass_rate(),
36            current_pass_rate: current.pass_rate(),
37            delta,
38            blocked,
39            reason,
40        }
41    }
42}
43
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an `EvalRun` with the given pass/fail counts, a zero average
    /// score and no per-case results.
    fn run_with(passed: u32, failed: u32) -> EvalRun {
        EvalRun {
            passed,
            failed,
            avg_score: 0.0,
            results: vec![],
        }
    }

    #[test]
    fn regression_blocks_when_below_tolerance() {
        let gate = RegressionGate { tolerance: 0.05 };
        let baseline = run_with(9, 1); // 0.9
        let current = run_with(7, 3); // 0.7
        let r = gate.check(&baseline, &current);
        assert!(r.blocked);
        assert!(r.delta < 0.0, "delta {} should be negative", r.delta);
        assert_ne!(r.reason, "ok");
    }

    #[test]
    fn regression_allows_within_tolerance() {
        let gate = RegressionGate { tolerance: 0.10 };
        let baseline = run_with(10, 0); // 1.0
        let current = run_with(95, 5); // 0.95
        let r = gate.check(&baseline, &current);
        assert!(!r.blocked, "delta {} blocked unexpectedly", r.delta);
        assert_eq!(r.reason, "ok");
    }

    #[test]
    fn improvement_is_not_blocked() {
        let gate = RegressionGate { tolerance: 0.05 };
        let baseline = run_with(7, 3); // 0.7
        let current = run_with(9, 1); // 0.9
        let r = gate.check(&baseline, &current);
        assert!(!r.blocked, "delta {} blocked unexpectedly", r.delta);
        assert!(r.delta > 0.0, "delta {} should be positive", r.delta);
        assert_eq!(r.reason, "ok");
    }

    #[test]
    fn identical_runs_have_zero_delta() {
        let gate = RegressionGate { tolerance: 0.05 };
        let baseline = run_with(8, 2);
        let current = run_with(8, 2);
        let r = gate.check(&baseline, &current);
        assert!(!r.blocked);
        assert_eq!(r.delta, 0.0);
        assert_eq!(r.baseline_pass_rate, r.current_pass_rate);
    }
}