1use std::collections::HashMap;
27use std::path::PathBuf;
28
29use serde::{Deserialize, Serialize};
30
31use crate::error::{EvalError, Result};
32
33#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct Baseline {
36 pub timestamp: chrono::DateTime<chrono::Utc>,
38 pub eval_set_id: String,
40 pub metrics: HashMap<String, HashMap<String, f64>>,
42}
43
44#[derive(Debug, Clone, Serialize, Deserialize)]
46pub struct Regression {
47 pub metric_name: String,
49 pub case_id: String,
51 pub baseline_value: f64,
53 pub current_value: f64,
55 pub delta: f64,
57}
58
/// File-backed storage for a single metrics baseline.
///
/// The store only remembers where the baseline file lives; every operation
/// goes to the file system, so the value is cheap to clone and to print.
#[derive(Debug, Clone)]
pub struct BaselineStore {
    /// Location of the baseline JSON file.
    path: PathBuf,
}
63
64impl BaselineStore {
65 pub fn new(path: impl Into<PathBuf>) -> Self {
67 Self { path: path.into() }
68 }
69
70 pub fn save(
75 &self,
76 eval_set_id: &str,
77 metrics: &HashMap<String, HashMap<String, f64>>,
78 ) -> Result<()> {
79 let baseline = Baseline {
80 timestamp: chrono::Utc::now(),
81 eval_set_id: eval_set_id.to_string(),
82 metrics: metrics.clone(),
83 };
84
85 let json = serde_json::to_string_pretty(&baseline)
86 .map_err(|e| EvalError::BaselineError(format!("failed to serialize baseline: {e}")))?;
87
88 std::fs::write(&self.path, json)
89 .map_err(|e| EvalError::BaselineError(format!("failed to write baseline file: {e}")))?;
90
91 Ok(())
92 }
93
94 pub fn load(&self) -> Result<Option<Baseline>> {
99 if !self.path.exists() {
100 return Ok(None);
101 }
102
103 let contents = std::fs::read_to_string(&self.path)
104 .map_err(|e| EvalError::BaselineError(format!("failed to read baseline file: {e}")))?;
105
106 let baseline: Baseline = serde_json::from_str(&contents)
107 .map_err(|e| EvalError::BaselineError(format!("failed to parse baseline file: {e}")))?;
108
109 Ok(Some(baseline))
110 }
111
112 pub fn check_regressions(
117 &self,
118 current: &HashMap<String, HashMap<String, f64>>,
119 tolerance: f64,
120 ) -> Result<Vec<Regression>> {
121 let baseline = match self.load()? {
122 Some(b) => b,
123 None => {
124 tracing::info!(
125 "no baseline file found at {:?}, skipping regression check",
126 self.path
127 );
128 return Ok(Vec::new());
129 }
130 };
131
132 let mut regressions = Vec::new();
133
134 for (metric_name, baseline_cases) in &baseline.metrics {
135 if let Some(current_cases) = current.get(metric_name) {
136 for (case_id, &baseline_value) in baseline_cases {
137 if let Some(¤t_value) = current_cases.get(case_id) {
138 let delta = baseline_value - current_value;
139 if delta > tolerance {
140 regressions.push(Regression {
141 metric_name: metric_name.clone(),
142 case_id: case_id.clone(),
143 baseline_value,
144 current_value,
145 delta,
146 });
147 }
148 }
149 }
150 }
151 }
152
153 Ok(regressions)
154 }
155}
156
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// File name used for the baseline inside each temporary directory.
    const BASELINE_FILE: &str = ".eval-baseline.json";

    /// Builds a store whose baseline file lives inside `dir`.
    fn make_store(dir: &TempDir) -> BaselineStore {
        let path = dir.path().join(BASELINE_FILE);
        BaselineStore::new(path)
    }

    /// Two metrics with two cases each, used as the reference data set.
    fn sample_metrics() -> HashMap<String, HashMap<String, f64>> {
        let mut metrics = HashMap::new();
        let mut accuracy = HashMap::new();
        accuracy.insert("case_1".to_string(), 0.95);
        accuracy.insert("case_2".to_string(), 0.88);
        metrics.insert("accuracy".to_string(), accuracy);

        let mut latency = HashMap::new();
        latency.insert("case_1".to_string(), 0.7);
        latency.insert("case_2".to_string(), 0.6);
        metrics.insert("latency".to_string(), latency);

        metrics
    }

    #[test]
    fn test_save_and_load_roundtrip() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);
        let metrics = sample_metrics();

        store.save("test_set", &metrics).unwrap();

        let loaded = store.load().unwrap().expect("baseline should exist");
        assert_eq!(loaded.eval_set_id, "test_set");
        assert_eq!(loaded.metrics, metrics);
    }

    #[test]
    fn test_load_returns_none_when_no_file() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);

        let result = store.load().unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_load_fails_on_malformed_file() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);

        std::fs::write(dir.path().join(BASELINE_FILE), "this is not json").unwrap();

        assert!(store.load().is_err());
    }

    #[test]
    fn test_check_regressions_no_baseline() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);
        let current = sample_metrics();

        let regressions = store.check_regressions(&current, 0.05).unwrap();
        assert!(regressions.is_empty());
    }

    #[test]
    fn test_check_regressions_no_regression() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);
        let metrics = sample_metrics();

        store.save("test_set", &metrics).unwrap();

        // Identical metrics can never regress.
        let regressions = store.check_regressions(&metrics, 0.05).unwrap();
        assert!(regressions.is_empty());
    }

    #[test]
    fn test_check_regressions_detects_regression() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);
        let metrics = sample_metrics();

        store.save("test_set", &metrics).unwrap();

        let mut current = metrics.clone();
        // Drop of 0.15 exceeds the 0.05 tolerance.
        current
            .get_mut("accuracy")
            .unwrap()
            .insert("case_1".to_string(), 0.80);

        let regressions = store.check_regressions(&current, 0.05).unwrap();
        assert_eq!(regressions.len(), 1);

        let reg = &regressions[0];
        assert_eq!(reg.metric_name, "accuracy");
        assert_eq!(reg.case_id, "case_1");
        assert!((reg.baseline_value - 0.95).abs() < f64::EPSILON);
        assert!((reg.current_value - 0.80).abs() < f64::EPSILON);
        assert!((reg.delta - 0.15).abs() < 1e-10);
    }

    #[test]
    fn test_check_regressions_within_tolerance() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);
        let metrics = sample_metrics();

        store.save("test_set", &metrics).unwrap();

        let mut current = metrics.clone();
        // Drop of 0.04 stays inside the 0.05 tolerance.
        current
            .get_mut("accuracy")
            .unwrap()
            .insert("case_1".to_string(), 0.91);

        let regressions = store.check_regressions(&current, 0.05).unwrap();
        assert!(regressions.is_empty());
    }

    #[test]
    fn test_check_regressions_improvement_not_flagged() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);
        let metrics = sample_metrics();

        store.save("test_set", &metrics).unwrap();

        let mut current = metrics.clone();
        // A higher score than the baseline is an improvement.
        current
            .get_mut("accuracy")
            .unwrap()
            .insert("case_1".to_string(), 0.99);

        let regressions = store.check_regressions(&current, 0.05).unwrap();
        assert!(regressions.is_empty());
    }

    #[test]
    fn test_check_regressions_skips_entries_missing_from_current() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);
        let metrics = sample_metrics();

        store.save("test_set", &metrics).unwrap();

        let mut current = metrics.clone();
        // Neither a missing metric nor a missing case counts as a regression.
        current.remove("latency");
        current.get_mut("accuracy").unwrap().remove("case_2");

        let regressions = store.check_regressions(&current, 0.05).unwrap();
        assert!(regressions.is_empty());
    }

    #[test]
    fn test_save_writes_pretty_json() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);
        let metrics = sample_metrics();

        store.save("test_set", &metrics).unwrap();

        let contents = std::fs::read_to_string(dir.path().join(BASELINE_FILE)).unwrap();
        // Pretty output spans several lines, and nested lines are indented.
        assert!(contents.contains('\n'));
        assert!(contents.contains("\n  "));
        // The file must still be well-formed JSON.
        let _: serde_json::Value = serde_json::from_str(&contents).unwrap();
    }

    #[test]
    fn test_baseline_contains_timestamp() {
        let dir = TempDir::new().unwrap();
        let store = make_store(&dir);
        let metrics = sample_metrics();

        let before = chrono::Utc::now();
        store.save("test_set", &metrics).unwrap();
        let after = chrono::Utc::now();

        let loaded = store.load().unwrap().unwrap();
        assert!(loaded.timestamp >= before);
        assert!(loaded.timestamp <= after);
    }
}