1use super::regression::RegressionSuite;
18use super::suite::SuiteResult;
19
/// Classification of a problem (or notable observation) for a single eval case.
///
/// All rates are stored as fractions; the report constructors multiply them by
/// 100 when rendering percentages.
///
/// `PartialEq` is derived so values can be compared directly (for example with
/// `assert_eq!`). Comparison of the `f64` payloads is exact.
#[derive(Debug, Clone, PartialEq)]
pub enum FaultKind {
    /// The success rate fell relative to a recorded baseline.
    Regression {
        /// Success rate recorded in the baseline.
        previous_rate: f64,
        /// Success rate observed in the current run.
        current_rate: f64,
        /// Size of the decrease; this value drives [`FaultKind::priority`].
        drop: f64,
    },
    /// A case that has no recorded baseline yet.
    NewCapability {
        /// Human-readable summary of the observation.
        description: String,
    },
    /// The case fails most of the time.
    ConsistentFailure {
        /// Observed success rate.
        success_rate: f64,
    },
    /// The case passes and fails unpredictably.
    Flaky {
        /// Observed mean success rate.
        mean_rate: f64,
        /// Width of the confidence interval around the success rate.
        ci_width: f64,
    },
}
55
56impl FaultKind {
57 pub fn priority(&self) -> u8 {
66 match self {
67 FaultKind::Regression { drop, .. } => {
68 let scaled = (*drop * 100.0).round() as u8;
69 scaled.clamp(1, 10)
70 }
71 FaultKind::ConsistentFailure { .. } => 8,
72 FaultKind::NewCapability { .. } => 5,
73 FaultKind::Flaky { .. } => 4,
74 }
75 }
76
77 pub fn label(&self) -> &'static str {
79 match self {
80 FaultKind::Regression { .. } => "regression",
81 FaultKind::NewCapability { .. } => "new_capability",
82 FaultKind::ConsistentFailure { .. } => "consistent_failure",
83 FaultKind::Flaky { .. } => "flaky",
84 }
85 }
86}
87
88#[derive(Debug, Clone)]
92pub struct FaultReport {
93 pub case_name: String,
95 pub category: String,
97 pub fault_kind: FaultKind,
99 pub sample_errors: Vec<String>,
101 pub n_failures: usize,
103 pub n_trials: usize,
105 pub suggested_task_description: String,
107}
108
109impl FaultReport {
110 pub fn regression(
112 case_name: impl Into<String>,
113 category: impl Into<String>,
114 previous_rate: f64,
115 current_rate: f64,
116 sample_errors: Vec<String>,
117 n_failures: usize,
118 n_trials: usize,
119 ) -> Self {
120 let drop = (previous_rate - current_rate).max(0.0);
121 let cn = case_name.into();
122 let suggested = format!(
123 "Fix regression in eval case '{}': success rate dropped from {:.0}% to {:.0}% \
124 (drop: {:.0}%). Investigate recent changes and restore reliability.",
125 cn,
126 previous_rate * 100.0,
127 current_rate * 100.0,
128 drop * 100.0,
129 );
130 Self {
131 case_name: cn,
132 category: category.into(),
133 fault_kind: FaultKind::Regression {
134 previous_rate,
135 current_rate,
136 drop,
137 },
138 sample_errors,
139 n_failures,
140 n_trials,
141 suggested_task_description: suggested,
142 }
143 }
144
145 pub fn new_capability(
147 case_name: impl Into<String>,
148 category: impl Into<String>,
149 description: impl Into<String>,
150 success_rate: f64,
151 n_failures: usize,
152 n_trials: usize,
153 ) -> Self {
154 let cn = case_name.into();
155 let desc = description.into();
156 let suggested = format!(
157 "Record baseline for newly-observed eval case '{}' ({:.0}% success rate). \
158 Add documentation and verify the capability is tested consistently.",
159 cn,
160 success_rate * 100.0,
161 );
162 Self {
163 case_name: cn,
164 category: category.into(),
165 fault_kind: FaultKind::NewCapability { description: desc },
166 sample_errors: Vec::new(),
167 n_failures,
168 n_trials,
169 suggested_task_description: suggested,
170 }
171 }
172
173 pub fn priority(&self) -> u8 {
175 self.fault_kind.priority()
176 }
177}
178
179pub fn analyze_suite_for_faults(
199 suite_result: &SuiteResult,
200 regression_suite: Option<&RegressionSuite>,
201 consistent_failure_threshold: f64,
202 flaky_ci_threshold: f64,
203) -> Vec<FaultReport> {
204 let mut reports: Vec<FaultReport> = Vec::new();
205
206 for (case_name, stats) in &suite_result.stats {
207 let n_trials = stats.n_trials;
208 let n_failures = n_trials - stats.successes;
209 let success_rate = stats.success_rate;
210 let ci_width = stats.confidence_interval_95.upper - stats.confidence_interval_95.lower;
211
212 let sample_errors: Vec<String> = suite_result
214 .case_results
215 .get(case_name)
216 .map(|trials| {
217 trials
218 .iter()
219 .filter_map(|t| t.error.clone())
220 .take(3)
221 .collect()
222 })
223 .unwrap_or_default();
224
225 let baseline = regression_suite.and_then(|rs| rs.get_baseline(case_name));
227
228 if let Some(b) = baseline {
230 let drop = b.baseline_success_rate - success_rate;
231 if drop > 0.03 {
232 reports.push(FaultReport::regression(
233 case_name,
234 case_name,
235 b.baseline_success_rate,
236 success_rate,
237 sample_errors,
238 n_failures,
239 n_trials,
240 ));
241 continue;
242 }
243 }
244
245 if success_rate < consistent_failure_threshold {
247 let suggested = format!(
248 "Fix consistently failing eval case '{}' (success rate: {:.0}%). \
249 Review the implementation and ensure the evaluated functionality \
250 works correctly.",
251 case_name,
252 success_rate * 100.0,
253 );
254 reports.push(FaultReport {
255 case_name: case_name.clone(),
256 category: case_name.clone(),
257 fault_kind: FaultKind::ConsistentFailure { success_rate },
258 sample_errors,
259 n_failures,
260 n_trials,
261 suggested_task_description: suggested,
262 });
263 continue;
264 }
265
266 if n_failures > 0 && ci_width > flaky_ci_threshold {
271 let suggested = format!(
272 "Stabilize flaky eval case '{}' (mean success: {:.0}%, CI width: {:.2}). \
273 Investigate sources of non-determinism and improve consistency.",
274 case_name,
275 success_rate * 100.0,
276 ci_width,
277 );
278 reports.push(FaultReport {
279 case_name: case_name.clone(),
280 category: case_name.clone(),
281 fault_kind: FaultKind::Flaky {
282 mean_rate: success_rate,
283 ci_width,
284 },
285 sample_errors,
286 n_failures,
287 n_trials,
288 suggested_task_description: suggested,
289 });
290 continue;
291 }
292
293 if baseline.is_none() && regression_suite.is_some() && success_rate >= 0.8 {
295 reports.push(FaultReport::new_capability(
296 case_name,
297 case_name,
298 format!(
299 "New eval case '{}' achieving {:.0}% success — baseline not yet recorded",
300 case_name,
301 success_rate * 100.0,
302 ),
303 success_rate,
304 n_failures,
305 n_trials,
306 ));
307 }
308 }
309
310 reports.sort_by_key(|b| std::cmp::Reverse(b.priority()));
312 reports
313}
314
/// Unit tests for fault classification, report construction and ordering.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::regression::RegressionSuite;
    use crate::suite::SuiteResult;
    use crate::trial::{EvaluationStats, TrialResult};
    use std::collections::HashMap;

    /// Builds a single-case suite result in which the first `successes` of
    /// `total` trials succeed and the remaining ones fail with `error_<index>`.
    fn make_suite_result(case_name: &str, successes: usize, total: usize) -> SuiteResult {
        let trials: Vec<TrialResult> = (0..total)
            .map(|i| {
                if i < successes {
                    TrialResult::success(i, 1)
                } else {
                    TrialResult::failure(i, 1, format!("error_{i}"))
                }
            })
            .collect();
        let stats = EvaluationStats::from_trials(&trials).unwrap();
        SuiteResult {
            case_results: HashMap::from([(case_name.to_string(), trials)]),
            stats: HashMap::from([(case_name.to_string(), stats)]),
        }
    }

    // --- FaultKind::priority -------------------------------------------------

    #[test]
    fn test_priority_regression_scaled_by_drop() {
        let fk = FaultKind::Regression {
            previous_rate: 0.9,
            current_rate: 0.85,
            drop: 0.05,
        };
        assert_eq!(fk.priority(), 5);
    }

    #[test]
    fn test_priority_regression_capped_at_10() {
        let fk = FaultKind::Regression {
            previous_rate: 1.0,
            current_rate: 0.75,
            drop: 0.25,
        };
        assert_eq!(fk.priority(), 10);
    }

    #[test]
    fn test_priority_consistent_failure() {
        assert_eq!(
            FaultKind::ConsistentFailure { success_rate: 0.1 }.priority(),
            8
        );
    }

    #[test]
    fn test_priority_new_capability() {
        assert_eq!(
            FaultKind::NewCapability {
                description: "x".into()
            }
            .priority(),
            5
        );
    }

    #[test]
    fn test_priority_flaky() {
        assert_eq!(
            FaultKind::Flaky {
                mean_rate: 0.5,
                ci_width: 0.3
            }
            .priority(),
            4
        );
    }

    // --- FaultReport constructors --------------------------------------------

    #[test]
    fn test_regression_constructor_sets_fields() {
        let report =
            FaultReport::regression("my_case", "smoke", 0.9, 0.7, vec!["err1".into()], 3, 10);
        assert_eq!(report.case_name, "my_case");
        assert_eq!(report.category, "smoke");
        assert_eq!(report.n_failures, 3);
        assert_eq!(report.n_trials, 10);
        assert!(report.suggested_task_description.contains("my_case"));
        assert!(report.suggested_task_description.contains("regression"));
        match &report.fault_kind {
            FaultKind::Regression {
                drop,
                previous_rate,
                current_rate,
            } => {
                assert!((*drop - 0.2).abs() < 1e-9);
                assert!((*previous_rate - 0.9).abs() < 1e-9);
                assert!((*current_rate - 0.7).abs() < 1e-9);
            }
            _ => panic!("expected Regression variant"),
        }
    }

    #[test]
    fn test_new_capability_constructor() {
        let report = FaultReport::new_capability("new_case", "cat", "desc", 0.85, 1, 10);
        assert_eq!(report.case_name, "new_case");
        assert!(matches!(report.fault_kind, FaultKind::NewCapability { .. }));
        assert!(report.sample_errors.is_empty());
    }

    // --- analyze_suite_for_faults --------------------------------------------

    #[test]
    fn test_consistent_failure_detected() {
        // 1 of 20 trials succeed, well below the 0.2 threshold.
        let result = make_suite_result("bad_case", 1, 20);
        let reports = analyze_suite_for_faults(&result, None, 0.2, 0.25);
        assert_eq!(reports.len(), 1);
        assert!(
            matches!(reports[0].fault_kind, FaultKind::ConsistentFailure { .. }),
            "expected ConsistentFailure"
        );
        assert_eq!(reports[0].case_name, "bad_case");
    }

    #[test]
    fn test_regression_detected_when_drop_exceeds_tolerance() {
        // Current run: 7 of 10 succeed; baseline below: 9 of 10 succeed.
        let result = make_suite_result("my_case", 7, 10);
        let mut reg = RegressionSuite::new();
        let baseline_trials: Vec<TrialResult> = (0..10)
            .map(|i| {
                if i < 9 {
                    TrialResult::success(i, 1)
                } else {
                    TrialResult::failure(i, 1, "e")
                }
            })
            .collect();
        let baseline_stats = EvaluationStats::from_trials(&baseline_trials).unwrap();
        reg.add_baseline("my_case", &baseline_stats);

        let reports = analyze_suite_for_faults(&result, Some(&reg), 0.2, 0.25);
        assert!(
            reports
                .iter()
                .any(|r| matches!(r.fault_kind, FaultKind::Regression { .. })),
            "expected Regression fault"
        );
    }

    #[test]
    fn test_no_fault_when_within_tolerance() {
        // Current run: 88 of 100 succeed; baseline below: 90 of 100 succeed.
        let result = make_suite_result("ok_case", 88, 100);
        let mut reg = RegressionSuite::new();
        let baseline_trials: Vec<TrialResult> = (0..100)
            .map(|i| {
                if i < 90 {
                    TrialResult::success(i, 1)
                } else {
                    TrialResult::failure(i, 1, "e")
                }
            })
            .collect();
        let baseline_stats = EvaluationStats::from_trials(&baseline_trials).unwrap();
        reg.add_baseline("ok_case", &baseline_stats);

        let reports = analyze_suite_for_faults(&result, Some(&reg), 0.2, 0.25);
        assert!(
            reports.is_empty(),
            "2 pp drop within 3 pp tolerance should produce no fault"
        );
    }

    #[test]
    fn test_no_fault_for_passing_case_without_regression_suite() {
        let result = make_suite_result("good_case", 45, 50);
        let reports = analyze_suite_for_faults(&result, None, 0.2, 0.25);
        assert!(reports.is_empty());
    }

    #[test]
    fn test_new_capability_when_regression_suite_provided_but_no_matching_baseline() {
        let result = make_suite_result("new_case", 45, 50);
        // Empty suite: no baseline exists for "new_case".
        let reg = RegressionSuite::new();
        let reports = analyze_suite_for_faults(&result, Some(&reg), 0.2, 0.25);
        assert!(
            reports
                .iter()
                .any(|r| matches!(r.fault_kind, FaultKind::NewCapability { .. })),
            "should report NewCapability for high-success case with no baseline"
        );
    }

    #[test]
    fn test_results_sorted_by_priority_descending() {
        let mut case_results = HashMap::new();
        let mut stats_map = HashMap::new();

        // 1 of 10 succeed.
        let bad: Vec<TrialResult> = (0..10)
            .map(|i| {
                if i < 1 {
                    TrialResult::success(i, 1)
                } else {
                    TrialResult::failure(i, 1, "e")
                }
            })
            .collect();
        stats_map.insert(
            "bad".to_string(),
            EvaluationStats::from_trials(&bad).unwrap(),
        );
        case_results.insert("bad".to_string(), bad);

        // 5 of 10 succeed.
        let flaky: Vec<TrialResult> = (0..10)
            .map(|i| {
                if i < 5 {
                    TrialResult::success(i, 1)
                } else {
                    TrialResult::failure(i, 1, "e")
                }
            })
            .collect();
        stats_map.insert(
            "flaky".to_string(),
            EvaluationStats::from_trials(&flaky).unwrap(),
        );
        case_results.insert("flaky".to_string(), flaky);

        let result = SuiteResult {
            case_results,
            stats: stats_map,
        };
        let reports = analyze_suite_for_faults(&result, None, 0.2, 0.25);

        assert!(reports.len() >= 2);
        for pair in reports.windows(2) {
            assert!(
                pair[0].priority() >= pair[1].priority(),
                "reports should be sorted by priority desc"
            );
        }
    }

    #[test]
    fn test_sample_errors_collected() {
        // Every trial fails, so error messages are available to sample.
        let result = make_suite_result("broken", 0, 5);
        let reports = analyze_suite_for_faults(&result, None, 0.2, 0.25);
        assert!(!reports.is_empty());
        assert!(!reports[0].sample_errors.is_empty());
        assert!(reports[0].sample_errors.len() <= 3);
    }
}