roder-evals 0.1.0

Agentic software development tools and SDKs for Roder.
Documentation
use std::collections::BTreeMap;
use std::path::Path;

use serde::{Deserialize, Serialize};

use super::report::{EvalFixtureResult, EvalSuiteReport, list_eval_reports};

/// Reliability baseline that an eval suite report is compared against.
///
/// Serialized and deserialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityBaseline {
    /// Version number of the baseline document. The comparison code visible
    /// in this file does not read it.
    pub version: u32,
    /// Suite-wide `reliability_unknown_errors` total that must be strictly
    /// exceeded for the overall comparison to start out as `Blocked`.
    /// Falls back to `0` when the field is absent from the input.
    #[serde(default)]
    pub unknown_error_blocker_threshold: u64,
    /// Per-scope, per-metric limits; each one yields one comparison row.
    /// Falls back to an empty list when the field is absent from the input.
    #[serde(default)]
    pub expectations: Vec<ReliabilityBaselineExpectation>,
}

/// A single limit on one reliability metric within one scope.
///
/// Serialized and deserialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityBaselineExpectation {
    /// Scope the metric is aggregated under. The aggregation in this file
    /// produces `suite`, `model:<provider>/<model>`, and any run tag that
    /// starts with `tool:`.
    pub scope: String,
    /// Metric name. Only metrics whose name starts with `reliability_` are
    /// aggregated, so any other name always compares as `0`.
    pub metric: String,
    /// Baseline count for the metric. For `reliability_unknown_errors`,
    /// exceeding this value alone marks the row as `Blocked`.
    /// Falls back to `0` when the field is absent from the input.
    #[serde(default)]
    pub max_count: u64,
    /// Extra headroom added (saturating) to `max_count` to form the allowed
    /// value. Falls back to `0` when the field is absent from the input.
    #[serde(default)]
    pub max_increase: u64,
}

/// Outcome of comparing one suite report against a [`ReliabilityBaseline`].
///
/// Serialized and deserialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityBaselineComparison {
    /// Most severe status across the suite-level unknown-error check and
    /// every row.
    pub status: ReliabilityBaselineStatus,
    /// One row per baseline expectation, in the baseline's order.
    pub rows: Vec<ReliabilityBaselineRow>,
    /// Suite-scope total of the `reliability_unknown_errors` metric.
    pub unknown_errors: u64,
}

/// Severity of a comparison row or of the comparison as a whole.
///
/// Serialized in snake_case (`ok`, `attention`, `blocked`); the markdown
/// rendering in this file uses the `Debug` form instead.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ReliabilityBaselineStatus {
    /// Nothing exceeded its limit.
    Ok,
    /// A metric exceeded its allowed value (`max_count` plus `max_increase`).
    Attention,
    /// Unknown errors exceeded the suite blocker threshold, or a
    /// `reliability_unknown_errors` row exceeded its `max_count`.
    Blocked,
}

/// Comparison result for a single baseline expectation.
///
/// Serialized and deserialized with camelCase field names.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct ReliabilityBaselineRow {
    /// Scope copied from the expectation.
    pub scope: String,
    /// Metric name copied from the expectation.
    pub metric: String,
    /// Aggregated value found in the report (`0` when the metric is absent).
    pub current: u64,
    /// `max_count` plus `max_increase`, saturating at `u64::MAX`.
    pub allowed: u64,
    /// Severity computed for this row.
    pub status: ReliabilityBaselineStatus,
}

pub fn compare_eval_report_to_baseline(
    output_dir: &Path,
    report_id: &str,
    baseline_path: &Path,
) -> anyhow::Result<String> {
    let report = read_suite_report(output_dir, report_id)?;
    let baseline: ReliabilityBaseline =
        serde_json::from_str(&std::fs::read_to_string(baseline_path)?)?;
    let comparison = compare_reliability_baseline(&report, &baseline);
    Ok(reliability_baseline_markdown(&comparison))
}

pub fn compare_reliability_baseline(
    report: &EvalSuiteReport,
    baseline: &ReliabilityBaseline,
) -> ReliabilityBaselineComparison {
    let scoped = scoped_reliability_metrics(report);
    let unknown_errors = metric_value(&scoped, "suite", "reliability_unknown_errors");
    let mut rows = Vec::new();
    let mut status = if unknown_errors > baseline.unknown_error_blocker_threshold {
        ReliabilityBaselineStatus::Blocked
    } else {
        ReliabilityBaselineStatus::Ok
    };
    for expectation in &baseline.expectations {
        let current = metric_value(&scoped, &expectation.scope, &expectation.metric);
        let allowed = expectation
            .max_count
            .saturating_add(expectation.max_increase);
        let row_status = if expectation.metric == "reliability_unknown_errors"
            && current > expectation.max_count
        {
            ReliabilityBaselineStatus::Blocked
        } else if current > allowed {
            ReliabilityBaselineStatus::Attention
        } else {
            ReliabilityBaselineStatus::Ok
        };
        status = merge_status(status, &row_status);
        rows.push(ReliabilityBaselineRow {
            scope: expectation.scope.clone(),
            metric: expectation.metric.clone(),
            current,
            allowed,
            status: row_status,
        });
    }
    ReliabilityBaselineComparison {
        status,
        rows,
        unknown_errors,
    }
}

fn read_suite_report(output_dir: &Path, report_id: &str) -> anyhow::Result<EvalSuiteReport> {
    let reports = list_eval_reports(output_dir)?;
    let summary = reports
        .into_iter()
        .find(|report| report.id == report_id)
        .ok_or_else(|| anyhow::anyhow!("eval report not found: {report_id}"))?;
    let path = summary.path.join("eval-run.json");
    Ok(serde_json::from_str(&std::fs::read_to_string(path)?)?)
}

/// Renders a comparison as a markdown section.
///
/// The output is a heading, the overall status and the unknown-error count
/// (both in backticks), then a table with one line per row. Statuses are
/// written in their `Debug` form (`Ok`, `Attention`, `Blocked`).
fn reliability_baseline_markdown(comparison: &ReliabilityBaselineComparison) -> String {
    let heading = format!(
        "\n## Reliability Baseline Comparison\n\nStatus: `{:?}`\n\nUnknown errors: `{}`\n\n| Scope | Metric | Current | Allowed | Status |\n| --- | --- | ---: | ---: | --- |\n",
        comparison.status, comparison.unknown_errors
    );
    // Append one table line per row, in row order.
    comparison.rows.iter().fold(heading, |mut doc, row| {
        let line = format!(
            "| `{}` | `{}` | {} | {} | `{:?}` |\n",
            row.scope, row.metric, row.current, row.allowed, row.status
        );
        doc.push_str(&line);
        doc
    })
}

/// Aggregates reliability metrics from every fixture result, keyed by
/// `(scope, metric name)`.
///
/// Each result contributes to three kinds of scope:
/// * `suite`;
/// * `model:<provider>/<model>`, built from the result's run;
/// * every run tag that starts with `tool:`, used verbatim as the scope.
fn scoped_reliability_metrics(report: &EvalSuiteReport) -> BTreeMap<(String, String), u64> {
    let mut totals = BTreeMap::new();
    for fixture in &report.results {
        let run = &fixture.report.run;

        add_result_metrics(&mut totals, "suite", fixture);

        let model_scope = format!("model:{}/{}", run.provider, run.model);
        add_result_metrics(&mut totals, &model_scope, fixture);

        // Tool tags double as scope names; other tags are ignored.
        run.tags
            .iter()
            .filter(|tag| tag.starts_with("tool:"))
            .for_each(|tool_scope| add_result_metrics(&mut totals, tool_scope, fixture));
    }
    totals
}

/// Adds every `reliability_*` metric of `result` to the running totals in
/// `scoped`, under the key `(scope, metric name)`.
///
/// Metrics whose name does not start with `reliability_` are skipped.
/// Values are clamped at zero and converted to whole counts: the fractional
/// part is dropped, `NaN` counts as `0`, and values too large for `u64`
/// become `u64::MAX`.
///
/// Totals saturate at `u64::MAX` instead of overflowing, so an oversized or
/// infinite metric value cannot panic (debug builds) or wrap around (release
/// builds) when it is accumulated more than once.
fn add_result_metrics(
    scoped: &mut BTreeMap<(String, String), u64>,
    scope: &str,
    result: &EvalFixtureResult,
) {
    let reliability_metrics = result
        .report
        .metrics
        .iter()
        .filter(|metric| metric.name.starts_with("reliability_"));
    for metric in reliability_metrics {
        // Float-to-int `as` casts saturate, so this is well-defined for any
        // input once negatives (and NaN) have been clamped to zero.
        let count = metric.value.max(0.0) as u64;
        let total = scoped
            .entry((scope.to_string(), metric.name.clone()))
            .or_insert(0);
        *total = total.saturating_add(count);
    }
}

/// Returns the aggregated total stored for `(scope, metric)`, or `0` when
/// the pair has no entry.
fn metric_value(scoped: &BTreeMap<(String, String), u64>, scope: &str, metric: &str) -> u64 {
    // The map is keyed by owned strings, so an owned key is built for the lookup.
    let key = (scope.to_owned(), metric.to_owned());
    match scoped.get(&key) {
        Some(&total) => total,
        None => 0,
    }
}

/// Combines two statuses, keeping the more severe one.
///
/// Severity order is `Blocked`, then `Attention`, then `Ok`.
fn merge_status(
    current: ReliabilityBaselineStatus,
    next: &ReliabilityBaselineStatus,
) -> ReliabilityBaselineStatus {
    // Check from most to least severe; the first level found on either side wins.
    if matches!(current, ReliabilityBaselineStatus::Blocked)
        || matches!(next, ReliabilityBaselineStatus::Blocked)
    {
        return ReliabilityBaselineStatus::Blocked;
    }
    if matches!(current, ReliabilityBaselineStatus::Attention)
        || matches!(next, ReliabilityBaselineStatus::Attention)
    {
        return ReliabilityBaselineStatus::Attention;
    }
    ReliabilityBaselineStatus::Ok
}

// Unit tests for the baseline comparison, driven by in-memory suite reports
// built with the fixture helpers at the bottom of this module.
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use roder_api::events::{RoderEvent, ThreadId, TurnId};
    use roder_api::reliability::{
        ReliabilityContext, ReliabilityDetails, ReliabilityErrorClass, ReliabilityFailureRecorded,
    };
    use time::OffsetDateTime;

    use crate::{EvalMetric, EvalMetricKind, EvalOutcome, EvalReport, EvalRun, EvalTrajectory};

    use super::*;

    /// One unknown error against a zero threshold and a zero `max_count`
    /// must block both the overall comparison and the matching row.
    #[test]
    fn reliability_baseline_flags_unknown_errors_as_blockers() {
        let report = suite_report(vec![fixture_result(
            "unknown-panic",
            vec![
                metric("reliability_unknown_errors", 1.0),
                metric("reliability_error_class_unknown", 1.0),
            ],
            EvalOutcome::HarnessError,
        )]);
        let baseline = ReliabilityBaseline {
            version: 1,
            unknown_error_blocker_threshold: 0,
            expectations: vec![ReliabilityBaselineExpectation {
                scope: "suite".to_string(),
                metric: "reliability_unknown_errors".to_string(),
                max_count: 0,
                max_increase: 0,
            }],
        };

        let comparison = compare_reliability_baseline(&report, &baseline);

        assert_eq!(comparison.status, ReliabilityBaselineStatus::Blocked);
        assert_eq!(comparison.unknown_errors, 1);
        assert_eq!(
            comparison.rows[0].status,
            ReliabilityBaselineStatus::Blocked
        );
    }

    /// A model-scoped metric of 3 against an allowed value of 2
    /// (`max_count` 1 + `max_increase` 1) must raise `Attention`.
    #[test]
    fn reliability_baseline_flags_per_model_spikes() {
        let report = suite_report(vec![fixture_result(
            "provider-429",
            vec![metric("reliability_error_class_provider_error", 3.0)],
            EvalOutcome::Pass,
        )]);
        let baseline = ReliabilityBaseline {
            version: 1,
            unknown_error_blocker_threshold: 0,
            expectations: vec![ReliabilityBaselineExpectation {
                // Matches the provider/model pair set by `fixture_result`.
                scope: "model:mock/mock".to_string(),
                metric: "reliability_error_class_provider_error".to_string(),
                max_count: 1,
                max_increase: 1,
            }],
        };

        let comparison = compare_reliability_baseline(&report, &baseline);

        assert_eq!(comparison.status, ReliabilityBaselineStatus::Attention);
        assert_eq!(comparison.rows[0].current, 3);
        assert_eq!(comparison.rows[0].allowed, 2);
    }

    /// Wraps fixture results in a suite report with fixed placeholder metadata.
    fn suite_report(results: Vec<EvalFixtureResult>) -> EvalSuiteReport {
        EvalSuiteReport {
            suite_id: "reliability".to_string(),
            fixture_dir: PathBuf::from("evals/fixtures/reliability"),
            output_dir: PathBuf::from("/tmp/roder-evals"),
            offline: true,
            generated_at: OffsetDateTime::UNIX_EPOCH,
            results,
        }
    }

    /// Builds a fixture result for provider `mock` / model `mock`, tagged
    /// `reliability` and `tool:read_file`, carrying the given metrics and outcome.
    ///
    /// The trajectory always contains a single recorded failure of class
    /// `Unknown`, regardless of the metrics passed in.
    fn fixture_result(
        fixture_id: &str,
        metrics: Vec<EvalMetric>,
        outcome: EvalOutcome,
    ) -> EvalFixtureResult {
        EvalFixtureResult {
            fixture_id: fixture_id.to_string(),
            title: fixture_id.to_string(),
            workspace: PathBuf::from("/tmp/workspace"),
            final_answer: String::new(),
            report: EvalReport {
                run: EvalRun {
                    suite_id: "reliability".to_string(),
                    run_id: "run".to_string(),
                    provider: "mock".to_string(),
                    model: "mock".to_string(),
                    started_at: OffsetDateTime::UNIX_EPOCH,
                    tags: vec!["reliability".to_string(), "tool:read_file".to_string()],
                },
                outcome,
                failure_class: None,
                trajectory: EvalTrajectory::from_events(
                    ThreadId::from("thread"),
                    TurnId::from("turn"),
                    &[RoderEvent::ReliabilityFailureRecorded(
                        ReliabilityFailureRecorded {
                            context: ReliabilityContext {
                                thread_id: "thread".to_string(),
                                turn_id: "turn".to_string(),
                                tool_id: None,
                                tool_name: None,
                                provider: Some("mock".to_string()),
                                model: Some("mock".to_string()),
                            },
                            error_class: ReliabilityErrorClass::Unknown,
                            details: ReliabilityDetails::redacted("test"),
                            timestamp: OffsetDateTime::UNIX_EPOCH,
                        },
                    )],
                ),
                metrics,
            },
            trace_excerpt: Vec::new(),
            failure_message: None,
        }
    }

    /// Builds a unitless count metric with the given name and value.
    fn metric(name: &str, value: f64) -> EvalMetric {
        EvalMetric {
            name: name.to_string(),
            kind: EvalMetricKind::Count,
            value,
            unit: None,
        }
    }
}