roder-evals 0.1.1

Agentic software development tools and SDKs for Roder.
Documentation
use roder_api::events::RoderEvent;
use roder_api::retrieval::{RetrievalMode, RetrievalOutcomeKind};

use crate::{
    EvalFixture, EvalFixtureResult, EvalMetric, EvalMetricKind, EvalOutcome, EvalSuiteReport,
};

pub fn grade_retrieval_router_fixture(
    fixture: &EvalFixture,
    events: &[RoderEvent],
    outcome: &EvalOutcome,
) -> Vec<EvalMetric> {
    if !fixture.tags.iter().any(|tag| tag == "retrieval-router") {
        return Vec::new();
    }

    let mut planned = 0u64;
    let mut accepted = 0u64;
    let mut ignored = 0u64;
    let mut failed = 0u64;
    let mut useful = 0u64;
    let mut irrelevant = 0u64;
    let mut discovery_before_tool_use = 0u64;
    let mut promotion_before_tool_use = 0u64;
    let mut wrong_tool_family = 0u64;
    let mut unknown_tool = 0u64;
    let mut missing_promotion = 0u64;
    let mut latency_ms = 0u64;
    let mut returned_tokens = 0u64;
    let mut first_useful_path = None;
    let mut first_useful_event = 0u64;

    for (index, event) in events.iter().enumerate() {
        match event {
            RoderEvent::RetrievalRoutePlanned(_) => planned += 1,
            RoderEvent::RetrievalRouteAccepted(_) => accepted += 1,
            RoderEvent::RetrievalRouteIgnored(_) => ignored += 1,
            RoderEvent::RetrievalRouteFailed(_) => failed += 1,
            RoderEvent::RetrievalResultUsed(event) => {
                latency_ms += event.outcome.latency_ms;
                returned_tokens += u64::from(event.outcome.estimated_tokens_returned);
                if event.outcome.discovery_before_tool_use {
                    discovery_before_tool_use += 1;
                }
                if event.outcome.promotion_before_tool_use {
                    promotion_before_tool_use += 1;
                }
                match event.outcome.outcome {
                    RetrievalOutcomeKind::Useful => {
                        useful += 1;
                        if first_useful_path.is_none() {
                            first_useful_path = event
                                .outcome
                                .first_useful_path
                                .clone()
                                .or_else(|| Some(event.outcome.mode.clone()));
                            first_useful_event = index as u64 + 1;
                        }
                    }
                    RetrievalOutcomeKind::Irrelevant => irrelevant += 1,
                    RetrievalOutcomeKind::WrongToolFamily => wrong_tool_family += 1,
                    RetrievalOutcomeKind::UnknownTool => unknown_tool += 1,
                    RetrievalOutcomeKind::MissingPromotion => missing_promotion += 1,
                    _ => {}
                }
                wrong_tool_family += event.outcome.wrong_tool_family_attempts;
            }
            _ => {}
        }
    }

    let mut metrics = vec![
        count_metric("retrieval_router_route_planned", planned),
        count_metric("retrieval_router_route_accepted", accepted),
        count_metric("retrieval_router_route_ignored", ignored),
        count_metric("retrieval_router_route_failed", failed),
        count_metric("retrieval_router_useful_results", useful),
        count_metric("retrieval_router_irrelevant_searches", irrelevant),
        count_metric(
            "retrieval_router_discovery_before_tool_use",
            discovery_before_tool_use,
        ),
        count_metric(
            "retrieval_router_promotion_before_tool_use",
            promotion_before_tool_use,
        ),
        count_metric("retrieval_router_wrong_tool_family", wrong_tool_family),
        count_metric("retrieval_router_unknown_tool", unknown_tool),
        count_metric("retrieval_router_missing_promotion", missing_promotion),
        duration_metric("retrieval_router_latency_ms", latency_ms),
        token_metric("retrieval_router_returned_tokens", returned_tokens),
        count_metric("retrieval_router_first_useful_event", first_useful_event),
        outcome_metric(
            "retrieval_router_final_correctness",
            outcome == &EvalOutcome::Pass,
        ),
        outcome_metric(
            "retrieval_router_on_variant",
            fixture.tags.iter().any(|tag| tag == "router:on"),
        ),
        outcome_metric(
            "retrieval_router_off_variant",
            fixture.tags.iter().any(|tag| tag == "router:off"),
        ),
        outcome_metric(
            "retrieval_router_deferred_context_variant",
            fixture.tags.iter().any(|tag| tag == "context:deferred"),
        ),
        outcome_metric(
            "retrieval_router_full_static_context_variant",
            fixture.tags.iter().any(|tag| tag == "context:full-static"),
        ),
    ];
    for mode in [
        RetrievalMode::ExactText,
        RetrievalMode::FileName,
        RetrievalMode::SemanticCode,
        RetrievalMode::Artifact,
        RetrievalMode::History,
        RetrievalMode::Discovery,
        RetrievalMode::Promotion,
        RetrievalMode::Web,
    ] {
        metrics.push(outcome_metric(
            &format!("retrieval_router_first_useful_path_{}", mode_label(&mode)),
            first_useful_path.as_ref() == Some(&mode),
        ));
    }
    metrics
}

pub(crate) fn retrieval_router_markdown(report: &EvalSuiteReport) -> String {
    let rows = report
        .results
        .iter()
        .filter(|result| {
            result
                .report
                .run
                .tags
                .iter()
                .any(|tag| tag == "retrieval-router")
        })
        .collect::<Vec<_>>();
    if rows.is_empty() {
        return String::new();
    }

    let mut text = String::from(
        "\n## Retrieval Router Metrics\n\n| Fixture | Router | Context | Bucket | Planned | Accepted | Ignored | Failed | Useful | Irrelevant | Discovery first | Promotion first | Wrong family | Latency ms | Tokens | Correct |\n| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n",
    );
    for result in &rows {
        text.push_str(&format!(
            "| `{}` | `{}` | `{}` | `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | `{}` |\n",
            result.fixture_id,
            retrieval_tag(&result.report.run.tags, "router:", "unknown"),
            retrieval_tag(&result.report.run.tags, "context:", "unknown"),
            retrieval_tag(&result.report.run.tags, "bucket:", "none"),
            metric_value(result, "retrieval_router_route_planned"),
            metric_value(result, "retrieval_router_route_accepted"),
            metric_value(result, "retrieval_router_route_ignored"),
            metric_value(result, "retrieval_router_route_failed"),
            metric_value(result, "retrieval_router_useful_results"),
            metric_value(result, "retrieval_router_irrelevant_searches"),
            metric_value(result, "retrieval_router_discovery_before_tool_use"),
            metric_value(result, "retrieval_router_promotion_before_tool_use"),
            metric_value(result, "retrieval_router_wrong_tool_family"),
            metric_value(result, "retrieval_router_latency_ms"),
            metric_value(result, "retrieval_router_returned_tokens"),
            if metric_value(result, "retrieval_router_final_correctness") >= 1.0 {
                "pass"
            } else {
                "fail"
            }
        ));
    }

    text.push_str(
        "\n## Retrieval Router Comparisons\n\n| Slice | Runs | Pass rate | Avg useful results | Avg ignored routes | Avg latency ms | Avg returned tokens |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: |\n",
    );
    for (label, prefix, value) in [
        ("router:on", "router:", "on"),
        ("router:off", "router:", "off"),
        ("context:deferred", "context:", "deferred"),
        ("context:full-static", "context:", "full-static"),
        ("bucket:20-50", "bucket:", "20-50"),
        ("bucket:50-100", "bucket:", "50-100"),
        ("bucket:100-plus", "bucket:", "100-plus"),
    ] {
        let slice = rows
            .iter()
            .copied()
            .filter(|result| retrieval_tag(&result.report.run.tags, prefix, "") == value)
            .collect::<Vec<_>>();
        if slice.is_empty() {
            continue;
        }
        text.push_str(&format!(
            "| `{label}` | {} | {:.1}% | {:.1} | {:.1} | {:.1} | {:.1} |\n",
            slice.len(),
            average_metric(&slice, "retrieval_router_final_correctness") * 100.0,
            average_metric(&slice, "retrieval_router_useful_results"),
            average_metric(&slice, "retrieval_router_route_ignored"),
            average_metric(&slice, "retrieval_router_latency_ms"),
            average_metric(&slice, "retrieval_router_returned_tokens"),
        ));
    }
    text.push_str(
        "\nFailure diagnosis: routing failures usually surface as ignored or failed routes; stale or missing indexes surface through retrieval outcome metrics; discovery failures surface as missing promotion, unknown tool, or wrong tool-family noise.\n",
    );
    text
}

/// Wraps an integer tally as a unitless `Count` metric called `name`.
fn count_metric(name: &str, value: u64) -> EvalMetric {
    let name = name.to_owned();
    let value = value as f64;
    EvalMetric {
        name,
        kind: EvalMetricKind::Count,
        value,
        unit: None,
    }
}

/// Wraps an integer value as a `Duration` metric called `name`, with unit `ms`.
fn duration_metric(name: &str, value: u64) -> EvalMetric {
    let unit = Some(String::from("ms"));
    EvalMetric {
        name: String::from(name),
        kind: EvalMetricKind::Duration,
        value: value as f64,
        unit,
    }
}

/// Wraps an integer value as a `Tokens` metric called `name`, with unit `tokens`.
fn token_metric(name: &str, value: u64) -> EvalMetric {
    let unit = Some(String::from("tokens"));
    EvalMetric {
        name: String::from(name),
        kind: EvalMetricKind::Tokens,
        value: value as f64,
        unit,
    }
}

/// Encodes a boolean as a unitless `Outcome` metric called `name`:
/// `1.0` when `passed` is true, `0.0` otherwise.
fn outcome_metric(name: &str, passed: bool) -> EvalMetric {
    let value = f64::from(u8::from(passed));
    EvalMetric {
        name: name.to_owned(),
        kind: EvalMetricKind::Outcome,
        value,
        unit: None,
    }
}

/// Returns the snake_case label used as a metric-name suffix for `mode`.
///
/// The match has no wildcard arm on purpose, so adding a retrieval mode is a
/// compile error here until it is given a label.
fn mode_label(mode: &RetrievalMode) -> &'static str {
    let label = match *mode {
        // Local code and file lookups.
        RetrievalMode::ExactText => "exact_text",
        RetrievalMode::FileName => "file_name",
        RetrievalMode::SemanticCode => "semantic_code",
        // Stored context.
        RetrievalMode::Artifact => "artifact",
        RetrievalMode::History => "history",
        // Tool discovery and promotion.
        RetrievalMode::Discovery => "discovery",
        RetrievalMode::Promotion => "promotion",
        // External lookups.
        RetrievalMode::Web => "web",
    };
    label
}

/// Returns the remainder of the first tag that starts with `prefix`, or
/// `default` when no tag carries that prefix.
fn retrieval_tag<'a>(tags: &'a [String], prefix: &str, default: &'a str) -> &'a str {
    for tag in tags {
        if let Some(suffix) = tag.strip_prefix(prefix) {
            return suffix;
        }
    }
    default
}

/// Looks up the value of the first metric called `name` in the result's
/// report, returning `0.0` when the report has no such metric.
fn metric_value(result: &EvalFixtureResult, name: &str) -> f64 {
    for metric in result.report.metrics.iter() {
        if metric.name == name {
            return metric.value;
        }
    }
    0.0
}

/// Arithmetic mean of the named metric across `rows`.
///
/// Each row contributes whatever `metric_value` reports for it; an empty
/// slice averages to `0.0` rather than dividing by zero.
fn average_metric(rows: &[&EvalFixtureResult], metric: &str) -> f64 {
    match rows.len() {
        0 => 0.0,
        count => {
            let total: f64 = rows.iter().map(|row| metric_value(row, metric)).sum();
            total / count as f64
        }
    }
}

// Unit tests for the retrieval-router grader.
#[cfg(test)]
mod tests {
    use roder_api::events::RoderEvent;
    use roder_api::retrieval::{
        RetrievalIntent, RetrievalMeasuredOutcome, RetrievalOutcomeKind, RetrievalResultUsed,
        RetrievalRoutePlan, RetrievalRoutePlanned,
    };

    use super::*;

    /// Grades a tagged fixture against one planned route and one useful
    /// discovery result, then spot-checks the resulting metric values.
    #[test]
    fn retrieval_router_grader_counts_routes_and_noise() {
        // The `retrieval-router` tag is what opts the fixture into grading;
        // the other two tags select the router/context variant flags.
        let fixture = EvalFixture {
            id: "router".to_string(),
            title: "Router".to_string(),
            prompt: "Find a tool".to_string(),
            tags: vec![
                "retrieval-router".to_string(),
                "router:on".to_string(),
                "context:deferred".to_string(),
            ],
            workspace: Default::default(),
            timeout_ms: None,
            expected: Default::default(),
            constraints: Vec::new(),
            lazy_discovery: None,
        };
        let timestamp = time::OffsetDateTime::UNIX_EPOCH;
        let events = vec![
            // Event 1: a planned route, counted under `route_planned`.
            RoderEvent::RetrievalRoutePlanned(RetrievalRoutePlanned {
                plan: RetrievalRoutePlan {
                    route_id: "route".to_string(),
                    thread_id: "thread".to_string(),
                    turn_id: "turn".to_string(),
                    intent: RetrievalIntent::InspectTool,
                    recommended: Vec::new(),
                    avoid: Vec::new(),
                    timestamp,
                },
            }),
            // Event 2: a useful discovery result that ran before tool use and
            // reports one wrong-tool-family attempt.
            RoderEvent::RetrievalResultUsed(RetrievalResultUsed {
                outcome: RetrievalMeasuredOutcome {
                    route_id: "route".to_string(),
                    mode: RetrievalMode::Discovery,
                    tool: "discovery.search".to_string(),
                    outcome: RetrievalOutcomeKind::Useful,
                    first_useful_path: Some(RetrievalMode::Discovery),
                    discovery_before_tool_use: true,
                    promotion_before_tool_use: false,
                    wrong_tool_family_attempts: 1,
                    result_count: 2,
                    latency_ms: 12,
                    bytes_returned: 100,
                    estimated_tokens_returned: 25,
                },
                thread_id: "thread".to_string(),
                turn_id: "turn".to_string(),
                timestamp,
            }),
        ];

        let metrics = grade_retrieval_router_fixture(&fixture, &events, &EvalOutcome::Pass);
        // Looks a metric up by name; a missing metric reads as 0.0, so a
        // misspelled name fails the assertions below instead of panicking.
        let value = |name: &str| {
            metrics
                .iter()
                .find(|metric| metric.name == name)
                .map(|metric| metric.value)
                .unwrap_or_default()
        };
        assert_eq!(value("retrieval_router_route_planned"), 1.0);
        assert_eq!(value("retrieval_router_useful_results"), 1.0);
        assert_eq!(value("retrieval_router_discovery_before_tool_use"), 1.0);
        // The outcome kind is `Useful`, so this 1.0 comes solely from
        // `wrong_tool_family_attempts`.
        assert_eq!(value("retrieval_router_wrong_tool_family"), 1.0);
        assert_eq!(value("retrieval_router_first_useful_path_discovery"), 1.0);
    }
}