agentcarousel 0.2.3

Evaluate agents and skills with YAML fixtures, run cases (mock or live), and keep run rows in SQLite for reports and evidence export.
Documentation
use agentcarousel_core::{CaseRegression, CaseStatus, RegressionKind, Run, RunDiff};
use serde_json::json;
use std::collections::HashMap;

/// Compares two runs case by case and reports where `run_b` is worse than `run_a`.
///
/// Cases are paired by case id. A case in `run_b` that has no counterpart in
/// `run_a` is skipped; if `run_a` contains the same id more than once, the
/// last occurrence is the one used as the baseline.
///
/// For every pair, up to four regressions are recorded, in this order:
///
/// * [`RegressionKind::StatusChange`] when the status in `run_b` has a
///   strictly higher `status_rank` than the baseline status.
/// * [`RegressionKind::LatencyIncrease`] when `total_latency_ms` exceeds the
///   baseline by more than the *fraction* `regression_threshold`. A baseline
///   latency of zero never triggers this check.
/// * [`RegressionKind::EffectivenessDropped`] when both cases carry eval
///   scores and the effectiveness score fell by more than
///   `regression_threshold` (absolute difference).
/// * [`RegressionKind::ErrorRateIncreased`] when both cases carry an error
///   rate and it rose by more than `regression_threshold` (absolute
///   difference).
///
/// The returned [`RunDiff`] always has an empty `improvements` list, and
/// `has_regressions` is `true` exactly when at least one regression was
/// recorded.
pub fn diff_runs(run_a: &Run, run_b: &Run, regression_threshold: f32) -> RunDiff {
    // Key the baseline by a *borrowed* case id: the index never outlives
    // `run_a`, so there is no need to clone every id just to look it up.
    let baseline: HashMap<_, _> = run_a
        .cases
        .iter()
        .map(|case| (&case.case_id.0, case))
        .collect();

    // Loop-invariant: the relative growth in latency that is still tolerated.
    let latency_factor = 1.0 + f64::from(regression_threshold);

    let mut regressions = Vec::new();
    for case in &run_b.cases {
        // Cases that only exist in `run_b` have nothing to be compared with.
        let Some(before_case) = baseline.get(&case.case_id.0) else {
            continue;
        };

        // Status: a higher rank means a worse outcome.
        if status_rank(case.status.clone()) > status_rank(before_case.status.clone()) {
            regressions.push(CaseRegression {
                case_id: case.case_id.clone(),
                kind: RegressionKind::StatusChange,
                before: json!(before_case.status),
                after: json!(case.status),
            });
        }

        // Latency: relative comparison, skipped when the baseline is zero so
        // that any non-zero latency is not reported as a regression.
        let before_latency = before_case.metrics.total_latency_ms as f64;
        let after_latency = case.metrics.total_latency_ms as f64;
        if before_latency > 0.0 && after_latency > before_latency * latency_factor {
            regressions.push(CaseRegression {
                case_id: case.case_id.clone(),
                kind: RegressionKind::LatencyIncrease,
                before: json!(before_latency),
                after: json!(after_latency),
            });
        }

        // Effectiveness: only comparable when both runs were scored.
        let before_effectiveness = before_case
            .eval_scores
            .as_ref()
            .map(|scores| scores.effectiveness_score);
        let after_effectiveness = case
            .eval_scores
            .as_ref()
            .map(|scores| scores.effectiveness_score);
        if let (Some(before_score), Some(after_score)) = (before_effectiveness, after_effectiveness)
        {
            if after_score < before_score - regression_threshold {
                regressions.push(CaseRegression {
                    case_id: case.case_id.clone(),
                    kind: RegressionKind::EffectivenessDropped,
                    before: json!(before_score),
                    after: json!(after_score),
                });
            }
        }

        // Error rate: only comparable when both runs reported one.
        if let (Some(before_rate), Some(after_rate)) =
            (before_case.metrics.error_rate, case.metrics.error_rate)
        {
            if after_rate > before_rate + regression_threshold {
                regressions.push(CaseRegression {
                    case_id: case.case_id.clone(),
                    kind: RegressionKind::ErrorRateIncreased,
                    before: json!(before_rate),
                    after: json!(after_rate),
                });
            }
        }
    }

    let has_regressions = !regressions.is_empty();
    RunDiff {
        run_a: run_a.id.clone(),
        run_b: run_b.id.clone(),
        regressions,
        improvements: Vec::new(),
        has_regressions,
    }
}

/// Writes a summary of `diff` to standard output.
///
/// When `diff.has_regressions` is `false` a single "no regressions" line is
/// printed. Otherwise a header line is printed, followed by one line per
/// entry in `diff.regressions` showing the case id, the regression kind and
/// the before/after values in their `Debug` representation.
pub fn print_diff(diff: &RunDiff) {
    if diff.has_regressions {
        println!("regressions detected:");
        diff.regressions.iter().for_each(|entry| {
            let (id, kind) = (&entry.case_id.0, &entry.kind);
            println!(
                "- {}: {:?} ({:?} -> {:?})",
                id, kind, entry.before, entry.after
            );
        });
    } else {
        println!("no regressions detected");
    }
}

/// Maps a case status to a severity rank; a larger number is a worse outcome.
///
/// `Passed` and `Skipped` share the lowest rank, so a change between those
/// two compares as equal. The match is intentionally exhaustive (no `_` arm)
/// so that adding a variant to `CaseStatus` forces a decision here.
fn status_rank(status: CaseStatus) -> u8 {
    match status {
        CaseStatus::Passed | CaseStatus::Skipped => 0,
        CaseStatus::Flaky => 1,
        CaseStatus::Failed => 2,
        CaseStatus::TimedOut => 3,
        CaseStatus::Error => 4,
    }
}