Skip to main content

adk_eval/
baseline.rs

//! Baseline storage for regression detection.
//!
//! Provides persistence of evaluation metric snapshots and comparison
//! against baselines to detect regressions.
//!
//! # Example
//!
//! ```rust,ignore
//! use adk_eval::BaselineStore;
//! use std::collections::HashMap;
//!
//! let store = BaselineStore::new(".eval-baseline.json");
//!
//! // Save current metrics as baseline.
//! // The outer key is the metric name, the inner key is the case id.
//! let mut metrics = HashMap::new();
//! let mut accuracy = HashMap::new();
//! accuracy.insert("case_1".to_string(), 0.95);
//! metrics.insert("accuracy".to_string(), accuracy);
//! store.save("my_eval_set", &metrics).unwrap();
//!
//! // Check for regressions on a later run
//! let regressions = store.check_regressions(&metrics, 0.05).unwrap();
//! assert!(regressions.is_empty());
//! ```
25
26use std::collections::HashMap;
27use std::path::PathBuf;
28
29use serde::{Deserialize, Serialize};
30
31use crate::error::{EvalError, Result};
32
33/// Baseline file content containing metric snapshots.
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct Baseline {
36    /// When the baseline was saved
37    pub timestamp: chrono::DateTime<chrono::Utc>,
38    /// Identifier for the eval set
39    pub eval_set_id: String,
40    /// Per-case, per-metric scores: outer key is metric_name, inner key is case_id
41    pub metrics: HashMap<String, HashMap<String, f64>>,
42}
43
44/// A regression detected between baseline and current run.
45#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct Regression {
47    /// Name of the metric that regressed
48    pub metric_name: String,
49    /// Identifier of the case that regressed
50    pub case_id: String,
51    /// Score from the baseline
52    pub baseline_value: f64,
53    /// Score from the current run
54    pub current_value: f64,
55    /// Difference (baseline - current)
56    pub delta: f64,
57}
58
/// Manages baseline persistence and regression detection.
///
/// The store holds only the location of the baseline file; every
/// operation reads from or writes to that path on demand.
#[derive(Debug, Clone)]
pub struct BaselineStore {
    /// Location of the baseline JSON file.
    path: PathBuf,
}
63
64impl BaselineStore {
65    /// Create a new baseline store at the given path.
66    pub fn new(path: impl Into<PathBuf>) -> Self {
67        Self { path: path.into() }
68    }
69
70    /// Save metrics as a baseline.
71    ///
72    /// Writes the metrics map with a timestamp and eval set identifier
73    /// to the configured path as pretty-printed JSON.
74    pub fn save(
75        &self,
76        eval_set_id: &str,
77        metrics: &HashMap<String, HashMap<String, f64>>,
78    ) -> Result<()> {
79        let baseline = Baseline {
80            timestamp: chrono::Utc::now(),
81            eval_set_id: eval_set_id.to_string(),
82            metrics: metrics.clone(),
83        };
84
85        let json = serde_json::to_string_pretty(&baseline)
86            .map_err(|e| EvalError::BaselineError(format!("failed to serialize baseline: {e}")))?;
87
88        std::fs::write(&self.path, json)
89            .map_err(|e| EvalError::BaselineError(format!("failed to write baseline file: {e}")))?;
90
91        Ok(())
92    }
93
94    /// Load existing baseline.
95    ///
96    /// Returns `Ok(None)` if the baseline file does not exist.
97    /// Returns an error only for actual I/O or parse failures.
98    pub fn load(&self) -> Result<Option<Baseline>> {
99        if !self.path.exists() {
100            return Ok(None);
101        }
102
103        let contents = std::fs::read_to_string(&self.path)
104            .map_err(|e| EvalError::BaselineError(format!("failed to read baseline file: {e}")))?;
105
106        let baseline: Baseline = serde_json::from_str(&contents)
107            .map_err(|e| EvalError::BaselineError(format!("failed to parse baseline file: {e}")))?;
108
109        Ok(Some(baseline))
110    }
111
112    /// Compare current metrics against baseline and detect regressions.
113    ///
114    /// A regression is detected when `baseline_value - current_value > tolerance`.
115    /// If no baseline file exists, returns an empty vector (no regressions).
116    pub fn check_regressions(
117        &self,
118        current: &HashMap<String, HashMap<String, f64>>,
119        tolerance: f64,
120    ) -> Result<Vec<Regression>> {
121        let baseline = match self.load()? {
122            Some(b) => b,
123            None => {
124                tracing::info!(
125                    "no baseline file found at {:?}, skipping regression check",
126                    self.path
127                );
128                return Ok(Vec::new());
129            }
130        };
131
132        let mut regressions = Vec::new();
133
134        for (metric_name, baseline_cases) in &baseline.metrics {
135            if let Some(current_cases) = current.get(metric_name) {
136                for (case_id, &baseline_value) in baseline_cases {
137                    if let Some(&current_value) = current_cases.get(case_id) {
138                        let delta = baseline_value - current_value;
139                        if delta > tolerance {
140                            regressions.push(Regression {
141                                metric_name: metric_name.clone(),
142                                case_id: case_id.clone(),
143                                baseline_value,
144                                current_value,
145                                delta,
146                            });
147                        }
148                    }
149                }
150            }
151        }
152
153        Ok(regressions)
154    }
155}
156
157#[cfg(test)]
158mod tests {
159    use super::*;
160    use tempfile::TempDir;
161
162    fn make_store(dir: &TempDir) -> BaselineStore {
163        let path = dir.path().join(".eval-baseline.json");
164        BaselineStore::new(path)
165    }
166
167    fn sample_metrics() -> HashMap<String, HashMap<String, f64>> {
168        let mut metrics = HashMap::new();
169        let mut accuracy = HashMap::new();
170        accuracy.insert("case_1".to_string(), 0.95);
171        accuracy.insert("case_2".to_string(), 0.88);
172        metrics.insert("accuracy".to_string(), accuracy);
173
174        let mut latency = HashMap::new();
175        latency.insert("case_1".to_string(), 0.7);
176        latency.insert("case_2".to_string(), 0.6);
177        metrics.insert("latency".to_string(), latency);
178
179        metrics
180    }
181
182    #[test]
183    fn test_save_and_load_roundtrip() {
184        let dir = TempDir::new().unwrap();
185        let store = make_store(&dir);
186        let metrics = sample_metrics();
187
188        store.save("test_set", &metrics).unwrap();
189
190        let loaded = store.load().unwrap().expect("baseline should exist");
191        assert_eq!(loaded.eval_set_id, "test_set");
192        assert_eq!(loaded.metrics, metrics);
193    }
194
195    #[test]
196    fn test_load_returns_none_when_no_file() {
197        let dir = TempDir::new().unwrap();
198        let store = make_store(&dir);
199
200        let result = store.load().unwrap();
201        assert!(result.is_none());
202    }
203
204    #[test]
205    fn test_check_regressions_no_baseline() {
206        let dir = TempDir::new().unwrap();
207        let store = make_store(&dir);
208        let current = sample_metrics();
209
210        let regressions = store.check_regressions(&current, 0.05).unwrap();
211        assert!(regressions.is_empty());
212    }
213
214    #[test]
215    fn test_check_regressions_no_regression() {
216        let dir = TempDir::new().unwrap();
217        let store = make_store(&dir);
218        let metrics = sample_metrics();
219
220        store.save("test_set", &metrics).unwrap();
221
222        // Same metrics — no regression
223        let regressions = store.check_regressions(&metrics, 0.05).unwrap();
224        assert!(regressions.is_empty());
225    }
226
227    #[test]
228    fn test_check_regressions_detects_regression() {
229        let dir = TempDir::new().unwrap();
230        let store = make_store(&dir);
231        let metrics = sample_metrics();
232
233        store.save("test_set", &metrics).unwrap();
234
235        // Drop case_1 accuracy from 0.95 to 0.80 (delta = 0.15, exceeds 0.05 tolerance)
236        let mut current = metrics.clone();
237        current.get_mut("accuracy").unwrap().insert("case_1".to_string(), 0.80);
238
239        let regressions = store.check_regressions(&current, 0.05).unwrap();
240        assert_eq!(regressions.len(), 1);
241
242        let reg = &regressions[0];
243        assert_eq!(reg.metric_name, "accuracy");
244        assert_eq!(reg.case_id, "case_1");
245        assert!((reg.baseline_value - 0.95).abs() < f64::EPSILON);
246        assert!((reg.current_value - 0.80).abs() < f64::EPSILON);
247        assert!((reg.delta - 0.15).abs() < 1e-10);
248    }
249
250    #[test]
251    fn test_check_regressions_within_tolerance() {
252        let dir = TempDir::new().unwrap();
253        let store = make_store(&dir);
254        let metrics = sample_metrics();
255
256        store.save("test_set", &metrics).unwrap();
257
258        // Drop case_1 accuracy from 0.95 to 0.91 (delta = 0.04, within 0.05 tolerance)
259        let mut current = metrics.clone();
260        current.get_mut("accuracy").unwrap().insert("case_1".to_string(), 0.91);
261
262        let regressions = store.check_regressions(&current, 0.05).unwrap();
263        assert!(regressions.is_empty());
264    }
265
266    #[test]
267    fn test_check_regressions_improvement_not_flagged() {
268        let dir = TempDir::new().unwrap();
269        let store = make_store(&dir);
270        let metrics = sample_metrics();
271
272        store.save("test_set", &metrics).unwrap();
273
274        // Improve case_1 accuracy from 0.95 to 0.99 (negative delta — improvement)
275        let mut current = metrics.clone();
276        current.get_mut("accuracy").unwrap().insert("case_1".to_string(), 0.99);
277
278        let regressions = store.check_regressions(&current, 0.05).unwrap();
279        assert!(regressions.is_empty());
280    }
281
282    #[test]
283    fn test_save_writes_pretty_json() {
284        let dir = TempDir::new().unwrap();
285        let store = make_store(&dir);
286        let metrics = sample_metrics();
287
288        store.save("test_set", &metrics).unwrap();
289
290        let contents = std::fs::read_to_string(dir.path().join(".eval-baseline.json")).unwrap();
291        // Pretty-printed JSON has newlines and indentation
292        assert!(contents.contains('\n'));
293        assert!(contents.contains("  "));
294        // Verify it's valid JSON
295        let _: serde_json::Value = serde_json::from_str(&contents).unwrap();
296    }
297
298    #[test]
299    fn test_baseline_contains_timestamp() {
300        let dir = TempDir::new().unwrap();
301        let store = make_store(&dir);
302        let metrics = sample_metrics();
303
304        let before = chrono::Utc::now();
305        store.save("test_set", &metrics).unwrap();
306        let after = chrono::Utc::now();
307
308        let loaded = store.load().unwrap().unwrap();
309        assert!(loaded.timestamp >= before);
310        assert!(loaded.timestamp <= after);
311    }
312}