roder-evals 0.1.1

Agentic software development tools and SDKs for Roder.
Documentation
use roder_api::events::RoderEvent;

use crate::{EvalFixture, EvalMetric, EvalMetricKind, EvalOutcome, EvalRun};

use super::report::{EvalFixtureResult, EvalSuiteReport};

/// Minimum prompt-token savings, in percent, that a lazy-discovery fixture must
/// reach for its regression threshold metric to pass. Also interpolated into the
/// explanatory footer of the markdown report.
const MIN_SAVINGS_PERCENT: f64 = 50.0;

pub(super) fn lazy_discovery_metrics(
    fixture: &EvalFixture,
    events: &[RoderEvent],
    outcome: &EvalOutcome,
) -> Vec<EvalMetric> {
    let Some(discovery) = fixture.lazy_discovery.as_ref() else {
        return Vec::new();
    };
    let baseline = discovery.metrics.baseline_schema_tokens;
    let deferred = discovery.metrics.deferred_prompt_tokens;
    let saved = baseline.saturating_sub(deferred);
    let savings_percent = if baseline == 0 {
        0.0
    } else {
        (saved as f64 / baseline as f64) * 100.0
    };
    let observed = observed_discovery_counts(events);
    let discovery_reads = observed
        .reads
        .max(u64::from(discovery.expected_discovery_query.is_some()));
    let promoted_count = observed
        .promotions
        .max(discovery.metrics.expected_promotion_count);
    let warm_cache_hits = observed
        .warm_cache_hits
        .max(discovery.metrics.expected_warm_cache_hits);
    let unknown_tool_calls = observed.unknown_tool_calls;
    let wrong_tool_family_calls = observed.wrong_tool_family_calls;
    let wrong_mcp_server_calls = observed.wrong_mcp_server_calls;
    let calls_before_promotion = observed.calls_before_promotion;
    let selection_noise = unknown_tool_calls
        + wrong_tool_family_calls
        + wrong_mcp_server_calls
        + calls_before_promotion;
    let tool_selection_correct = outcome == &EvalOutcome::Pass
        && promoted_count >= discovery.metrics.expected_promotion_count
        && unknown_tool_calls <= discovery.metrics.max_unknown_tool_calls
        && wrong_tool_family_calls <= discovery.metrics.max_wrong_tool_calls
        && calls_before_promotion <= discovery.metrics.max_calls_before_promotion;
    let threshold_passed = tool_selection_correct && savings_percent >= MIN_SAVINGS_PERCENT;

    vec![
        count_metric(
            "lazy_discovery_hidden_deferred_capabilities",
            discovery.hidden_deferred_capabilities,
        ),
        count_metric(
            "lazy_discovery_internal_tools",
            discovery.catalog_shape.internal_tools,
        ),
        count_metric(
            "lazy_discovery_mcp_tools",
            discovery.catalog_shape.mcp_tools,
        ),
        count_metric("lazy_discovery_skills", discovery.catalog_shape.skills),
        count_metric("lazy_discovery_plugins", discovery.catalog_shape.plugins),
        token_metric("lazy_discovery_baseline_schema_tokens", baseline),
        token_metric("lazy_discovery_deferred_prompt_tokens", deferred),
        token_metric("lazy_discovery_tokens_saved", saved),
        percent_metric("lazy_discovery_savings_percent", savings_percent),
        count_metric("lazy_discovery_discovery_reads", discovery_reads),
        count_metric("lazy_discovery_promoted_count", promoted_count),
        count_metric("lazy_discovery_warm_cache_hits", warm_cache_hits),
        count_metric("lazy_discovery_unknown_tool_calls", unknown_tool_calls),
        count_metric(
            "lazy_discovery_wrong_tool_family_calls",
            wrong_tool_family_calls,
        ),
        count_metric(
            "lazy_discovery_wrong_mcp_server_calls",
            wrong_mcp_server_calls,
        ),
        count_metric(
            "lazy_discovery_calls_before_promotion",
            calls_before_promotion,
        ),
        count_metric("lazy_discovery_selection_noise_total", selection_noise),
        outcome_metric(
            "lazy_discovery_tool_selection_correct",
            tool_selection_correct,
        ),
        outcome_metric("lazy_discovery_regression_threshold_pass", threshold_passed),
    ]
}

/// Renders the lazy-discovery sections of the suite report as markdown.
///
/// Only results whose run is tagged `lazy-discovery` are included; when there
/// are none the function returns an empty string. The output consists of a
/// per-fixture metrics table, a per-bucket table with p50/p90 savings and the
/// threshold pass rate (buckets without fixtures are skipped), and a closing
/// sentence that states the regression threshold.
pub(super) fn lazy_discovery_markdown(report: &EvalSuiteReport) -> String {
    let lazy_results = report
        .results
        .iter()
        .filter(|result| is_lazy_discovery_run(&result.report.run))
        .collect::<Vec<_>>();
    if lazy_results.is_empty() {
        return String::new();
    }

    // Section 1: one table row per lazy-discovery fixture.
    let mut markdown = String::from(
        "\n## Lazy Discovery Metrics\n\n| Fixture | Bucket | Hidden deferred | Baseline schema tokens | Deferred prompt tokens | Saved tokens | Savings | Promoted | Warm cache hits | Discovery reads | Unknown tool calls | Wrong family | Wrong MCP | Calls before promotion | Threshold |\n| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n",
    );
    for result in &lazy_results {
        let value = |name: &str| metric_value(result, name);
        let threshold = if value("lazy_discovery_regression_threshold_pass") >= 1.0 {
            "pass"
        } else {
            "fail"
        };
        let row = format!(
            "| `{}` | `{}` | {:.0} | {:.0} | {:.0} | {:.0} | {:.1}% | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | {:.0} | `{}` |\n",
            result.fixture_id,
            bucket_label(&result.report.run),
            value("lazy_discovery_hidden_deferred_capabilities"),
            value("lazy_discovery_baseline_schema_tokens"),
            value("lazy_discovery_deferred_prompt_tokens"),
            value("lazy_discovery_tokens_saved"),
            value("lazy_discovery_savings_percent"),
            value("lazy_discovery_promoted_count"),
            value("lazy_discovery_warm_cache_hits"),
            value("lazy_discovery_discovery_reads"),
            value("lazy_discovery_unknown_tool_calls"),
            value("lazy_discovery_wrong_tool_family_calls"),
            value("lazy_discovery_wrong_mcp_server_calls"),
            value("lazy_discovery_calls_before_promotion"),
            threshold,
        );
        markdown.push_str(&row);
    }

    // Section 2: savings distribution per catalog-size bucket.
    markdown.push_str(
        "\n## Lazy Discovery Bucket Savings\n\n| Bucket | Fixtures | p50 saved tokens | p90 saved tokens | p50 savings | p90 savings | Threshold pass rate |\n| --- | ---: | ---: | ---: | ---: | ---: | ---: |\n",
    );
    for bucket in ["20-50", "50-100", "100-plus"] {
        let in_bucket = lazy_results
            .iter()
            .filter(|result| bucket_label(&result.report.run) == bucket)
            .copied()
            .collect::<Vec<_>>();
        let fixture_count = in_bucket.len();
        if fixture_count == 0 {
            continue;
        }

        // Gathers one metric across every fixture in the bucket.
        let column = |name: &str| {
            in_bucket
                .iter()
                .map(|result| metric_value(result, name))
                .collect::<Vec<_>>()
        };
        let saved_tokens = column("lazy_discovery_tokens_saved");
        let savings_percent = column("lazy_discovery_savings_percent");

        let passing = in_bucket
            .iter()
            .filter(|result| {
                metric_value(result, "lazy_discovery_regression_threshold_pass") >= 1.0
            })
            .count();
        let pass_rate = (passing as f64 / fixture_count as f64) * 100.0;

        // `percentile` consumes its input, so each column is needed twice.
        let p50_saved = percentile(saved_tokens.clone(), 50.0);
        let p90_saved = percentile(saved_tokens, 90.0);
        let p50_savings = percentile(savings_percent.clone(), 50.0);
        let p90_savings = percentile(savings_percent, 90.0);

        let row = format!(
            "| `{bucket}` | {} | {:.0} | {:.0} | {:.1}% | {:.1}% | {:.1}% |\n",
            fixture_count, p50_saved, p90_saved, p50_savings, p90_savings, pass_rate,
        );
        markdown.push_str(&row);
    }

    let footer = format!(
        "\nRegression threshold: every lazy-discovery fixture must pass with at least `{MIN_SAVINGS_PERCENT:.0}%` prompt-token savings, no unexpected unknown-tool calls, no wrong tool-family calls, and no calls before promotion.\n",
    );
    markdown.push_str(&footer);
    markdown
}

/// Tallies of discovery-related activity extracted from a run's event stream
/// by `observed_discovery_counts`. All counters start at zero via `Default`.
#[derive(Default)]
struct ObservedDiscoveryCounts {
    /// Number of `DiscoveryItemRead` events.
    reads: u64,
    /// Number of `DiscoveryItemPromoted` and `DiscoveryPromotionReused` events.
    promotions: u64,
    /// Number of `DiscoveryWarmCacheHit` events.
    warm_cache_hits: u64,
    /// Tool-call requests named `"unknown"` plus errored completions named `"unknown"`.
    unknown_tool_calls: u64,
    /// NOTE(review): never incremented by `observed_discovery_counts` in this
    /// file, so it stays at its default of zero — confirm whether that is intended.
    wrong_tool_family_calls: u64,
    /// NOTE(review): never incremented by `observed_discovery_counts` in this
    /// file, so it stays at its default of zero — confirm whether that is intended.
    wrong_mcp_server_calls: u64,
    /// Deferred-looking tool-call requests seen before any promotion or warm-cache hit.
    calls_before_promotion: u64,
}

/// Walks the event stream once, in order, and tallies discovery activity.
///
/// A promotion, a reused promotion, or a warm-cache hit marks the point after
/// which deferred tool calls are no longer counted as "before promotion".
/// Events of any other kind are ignored.
fn observed_discovery_counts(events: &[RoderEvent]) -> ObservedDiscoveryCounts {
    let mut tally = ObservedDiscoveryCounts::default();
    // Flips to true at the first promotion-like event and never resets.
    let mut promotion_seen = false;

    for event in events {
        match event {
            RoderEvent::DiscoveryItemRead(_) => {
                tally.reads += 1;
            }
            RoderEvent::DiscoveryItemPromoted(_) | RoderEvent::DiscoveryPromotionReused(_) => {
                promotion_seen = true;
                tally.promotions += 1;
            }
            RoderEvent::DiscoveryWarmCacheHit(_) => {
                promotion_seen = true;
                tally.warm_cache_hits += 1;
            }
            RoderEvent::ToolCallRequested(requested) => {
                let is_unknown = requested.tool_name == "unknown";
                let premature = !promotion_seen && is_deferred_tool_call(&requested.tool_name);
                tally.unknown_tool_calls += u64::from(is_unknown);
                tally.calls_before_promotion += u64::from(premature);
            }
            RoderEvent::ToolCallCompleted(completed) => {
                // NOTE(review): an "unknown" request followed by its errored
                // completion is counted twice here — confirm that is intended.
                if completed.is_error && completed.tool_name.as_deref() == Some("unknown") {
                    tally.unknown_tool_calls += 1;
                }
            }
            _ => {}
        }
    }

    tally
}

/// Reports whether a tool name looks like a deferred capability: it contains
/// a `.` separator and is not under the `discovery.` prefix.
fn is_deferred_tool_call(name: &str) -> bool {
    if name.starts_with("discovery.") {
        return false;
    }
    name.contains('.')
}

/// Reports whether the run carries the `lazy-discovery` tag.
fn is_lazy_discovery_run(run: &EvalRun) -> bool {
    const LAZY_DISCOVERY_TAG: &str = "lazy-discovery";
    run.tags.iter().any(|tag| tag == LAZY_DISCOVERY_TAG)
}

/// Returns the text after `bucket:` from the first tag with that prefix, or
/// `"unknown"` when the run has no such tag.
fn bucket_label(run: &EvalRun) -> &str {
    const BUCKET_PREFIX: &str = "bucket:";
    for tag in run.tags.iter() {
        if let Some(label) = tag.strip_prefix(BUCKET_PREFIX) {
            return label;
        }
    }
    "unknown"
}

/// Looks up the first metric called `name` in the result's report and returns
/// its value; a missing metric reads as `0.0`.
fn metric_value(result: &EvalFixtureResult, name: &str) -> f64 {
    let found = result
        .report
        .metrics
        .iter()
        .find(|metric| metric.name == name);
    match found {
        Some(metric) => metric.value,
        None => 0.0,
    }
}

/// Returns the requested percentile of `values`.
///
/// The values are sorted ascending and the element at index
/// `ceil(percentile / 100 * (len - 1))` is returned, clamped to the last
/// index. An empty input yields `0.0`. A negative or NaN `percentile` selects
/// the smallest element, because the float-to-`usize` cast saturates at zero.
///
/// Sorting uses `f64::total_cmp`, which is a genuine total order: NaNs with a
/// positive sign sort after every number and NaNs with a negative sign sort
/// before. The previous comparator treated NaN as equal to everything, which
/// is not a total order, so results depended on input order and `sort_by` is
/// allowed to panic on such comparators.
fn percentile(mut values: Vec<f64>, percentile: f64) -> f64 {
    if values.is_empty() {
        return 0.0;
    }
    // Elements that compare equal under `total_cmp` are bit-identical, so an
    // unstable sort cannot change the outcome.
    values.sort_unstable_by(|left, right| left.total_cmp(right));
    let last = values.len() - 1;
    let rank = ((percentile / 100.0) * last as f64).ceil() as usize;
    values[rank.min(last)]
}

/// Builds a unitless metric of kind [`EvalMetricKind::Count`] from an integer count.
fn count_metric(name: &str, value: u64) -> EvalMetric {
    let name = name.to_owned();
    let value = value as f64;
    EvalMetric {
        kind: EvalMetricKind::Count,
        unit: None,
        name,
        value,
    }
}

/// Builds a metric of kind [`EvalMetricKind::Tokens`] with the unit `"tokens"`.
fn token_metric(name: &str, value: u64) -> EvalMetric {
    let name = name.to_owned();
    let value = value as f64;
    EvalMetric {
        kind: EvalMetricKind::Tokens,
        unit: Some(String::from("tokens")),
        name,
        value,
    }
}

/// Builds a percentage metric carrying the unit `"percent"`.
///
/// NOTE(review): the kind is [`EvalMetricKind::Count`] even though the value
/// is a percentage — confirm whether a more specific kind should be used.
fn percent_metric(name: &str, value: f64) -> EvalMetric {
    let name = name.to_owned();
    EvalMetric {
        kind: EvalMetricKind::Count,
        unit: Some(String::from("percent")),
        name,
        value,
    }
}

/// Builds a unitless metric of kind [`EvalMetricKind::Outcome`], encoding
/// `passed` as `1.0` and a failure as `0.0`.
fn outcome_metric(name: &str, passed: bool) -> EvalMetric {
    let name = name.to_owned();
    let value = match passed {
        true => 1.0,
        false => 0.0,
    };
    EvalMetric {
        kind: EvalMetricKind::Outcome,
        unit: None,
        name,
        value,
    }
}

// Unit tests for the lazy-discovery metric derivation.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        EvalExpectedEvidence, EvalLazyDiscoveryCatalogShape, EvalLazyDiscoveryExpectedMetrics,
        EvalLazyDiscoveryFixture,
    };

    /// With no events supplied, the metrics fall back to the fixture's
    /// declared expectations, and the savings (4_600 -> 780 tokens) clear the
    /// regression threshold.
    #[test]
    fn lazy_discovery_metrics_measure_savings_and_threshold() {
        let fixture = EvalFixture {
            id: "lazy".to_string(),
            title: "Lazy".to_string(),
            prompt: "Use discovery".to_string(),
            tags: vec!["lazy-discovery".to_string(), "bucket:20-50".to_string()],
            workspace: Default::default(),
            timeout_ms: None,
            expected: EvalExpectedEvidence::default(),
            constraints: Vec::new(),
            lazy_discovery: Some(EvalLazyDiscoveryFixture {
                hidden_deferred_capabilities: 32,
                catalog_shape: EvalLazyDiscoveryCatalogShape {
                    internal_tools: 4,
                    mcp_tools: 24,
                    skills: 4,
                    plugins: 0,
                },
                compact_index_contains: vec!["github.issue.search".to_string()],
                expected_discovery_query: Some("github issue".to_string()),
                expected_promotion: Some("github.issue.search".to_string()),
                secondary_expected_promotion: None,
                expected_tool_call: Some("github.issue.search".to_string()),
                metrics: EvalLazyDiscoveryExpectedMetrics {
                    baseline_schema_tokens: 4_600,
                    deferred_prompt_tokens: 780,
                    expected_promotion_count: 1,
                    expected_warm_cache_hits: 0,
                    max_wrong_tool_calls: 0,
                    max_unknown_tool_calls: 0,
                    max_calls_before_promotion: 0,
                },
            }),
        };

        // Empty event slice: every observed count is zero.
        let metrics = lazy_discovery_metrics(&fixture, &[], &EvalOutcome::Pass);
        // Looks a metric up by name; panics if it was not emitted.
        let value = |name: &str| {
            metrics
                .iter()
                .find(|metric| metric.name == name)
                .map(|metric| metric.value)
                .unwrap()
        };

        assert_eq!(value("lazy_discovery_hidden_deferred_capabilities"), 32.0);
        // Promotions and reads come from the fixture floor, not from events.
        assert_eq!(value("lazy_discovery_promoted_count"), 1.0);
        assert_eq!(value("lazy_discovery_discovery_reads"), 1.0);
        assert!(value("lazy_discovery_savings_percent") > 80.0);
        assert_eq!(value("lazy_discovery_regression_threshold_pass"), 1.0);
    }
}