#![cfg(test)]
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use async_trait::async_trait;
use car_inference::{GenerateRequest, InferenceResult};
use serde_json::{json, Value};
use crate::coder::native_loop::TurnGenerator;
use crate::coder::rpc::{
approve_merge_session, confirm_session, start_session, CoderSessionEntry, StartArgs,
};
use crate::coder::router::EngineChoice;
use crate::coder::session::CoderState;
use crate::session::ServerState;
/// Scripted stand-in for a model: hands back pre-built turns one at a time
/// through its `TurnGenerator` implementation.
struct Script {
/// Canned results, returned in order, one per `generate` call.
turns: Vec<InferenceResult>,
/// Index of the next turn to hand out; bumped atomically on every call.
cursor: AtomicUsize,
}
/// Builds one scripted `InferenceResult` by deserializing a JSON payload.
///
/// `text` and `tool_calls` are caller-supplied; the trace id, model name and
/// latency are fixed bench placeholders.
///
/// # Panics
///
/// Panics if the payload no longer deserializes into `InferenceResult`.
fn turn(text: &str, tool_calls: Value) -> InferenceResult {
    let payload = json!({
        "text": text,
        "tool_calls": tool_calls,
        "trace_id": "bench",
        "model_used": "scripted",
        "latency_ms": 0,
    });
    let parsed: Result<InferenceResult, _> = serde_json::from_value(payload);
    parsed.expect("scripted InferenceResult shape")
}
#[async_trait]
impl TurnGenerator for Script {
    /// Returns the next scripted turn and advances the cursor.
    ///
    /// The request is ignored. Once every turn has been handed out, each
    /// further call fails with `"bench script exhausted"`.
    async fn generate(&self, _req: GenerateRequest) -> Result<InferenceResult, String> {
        // Claim a slot first so concurrent callers never receive the same turn.
        let slot = self.cursor.fetch_add(1, Ordering::SeqCst);
        match self.turns.get(slot) {
            Some(scripted) => Ok(scripted.clone()),
            None => Err("bench script exhausted".to_string()),
        }
    }
}
/// One benchmark task: the repository seed, the request given to the coder,
/// the scripted turns for the native engine, and an independent grader.
struct CoderBenchTask {
/// Task identifier; copied into `BenchResult::task_id`.
id: &'static str,
/// `(relative path, content)` pairs written into the repo before the seed commit.
seed: &'static [(&'static str, &'static str)],
/// Request text passed to the session as `StartArgs::intent`.
intent: &'static str,
/// Produces the scripted turns that back the `Script` generator for this task.
native_turns: fn() -> Vec<InferenceResult>,
/// Shell command run with `sh -c` in a checkout of the landed branch; exit
/// status 0 counts as a pass.
grade_cmd: &'static str,
}
/// Outcome of running one bench task through one engine.
#[derive(Debug, Clone)]
struct BenchResult {
    /// Identifier of the task this row describes.
    task_id: String,
    /// Whether the session ended up waiting for approval.
    reached_approval: bool,
    /// Iteration count reported by the session.
    iterations: u32,
    /// Whether the merged branch was found in the repository.
    branch_landed: bool,
    /// Whether the task's grading command passed on the landed branch.
    graded_pass: bool,
    /// Failure detail, when one was recorded.
    error: Option<String>,
}

impl BenchResult {
    /// A task is solved only when it reached approval, its branch landed,
    /// and the grader passed. A recorded `error` does not affect the verdict.
    fn solved(&self) -> bool {
        matches!(
            (self.reached_approval, self.branch_landed, self.graded_pass),
            (true, true, true)
        )
    }
}
/// Per-engine collection of task results plus a plain-text report.
#[derive(Debug, Default)]
struct BenchScorecard {
    /// Label of the engine the rows were produced with.
    engine: String,
    /// One row per task, in the order the tasks were run.
    rows: Vec<BenchResult>,
}

impl BenchScorecard {
    /// Number of rows whose `solved()` verdict is true.
    fn solved_count(&self) -> usize {
        self.rows.iter().map(|row| usize::from(row.solved())).sum()
    }

    /// Renders a header line (`engine=… solved=N/M`) followed by one line
    /// per row; a row's error, when present, is appended as ` err=…`.
    fn render(&self) -> String {
        let header = format!(
            "engine={} solved={}/{}\n",
            self.engine,
            self.solved_count(),
            self.rows.len()
        );
        self.rows.iter().fold(header, |mut report, row| {
            let err_suffix = row
                .error
                .as_ref()
                .map(|e| format!(" err={e}"))
                .unwrap_or_default();
            let line = format!(
                " {:<28} approval={} branch={} graded={} iters={}{}\n",
                row.task_id,
                row.reached_approval,
                row.branch_landed,
                row.graded_pass,
                row.iterations,
                err_suffix
            );
            report.push_str(&line);
            report
        })
    }
}
fn task_create_file() -> CoderBenchTask {
CoderBenchTask {
id: "create_file_greeting",
seed: &[],
intent: "create greeting.txt containing the text 'hello from car coder'",
native_turns: || {
vec![
turn(
r#"{"description": "greeting.txt contains the greeting",
"checks": [{"name": "content",
"command": "grep -q 'hello from car coder' greeting.txt"}]}"#,
json!([]),
),
turn(
"creating greeting.txt",
json!([{
"id": "w1", "name": "write_file",
"arguments": {"path": "greeting.txt", "content": "hello from car coder"}
}]),
),
turn("done — greeting.txt written", json!([])),
]
},
grade_cmd: "grep -q 'hello from car coder' greeting.txt",
}
}
fn task_make_test_pass() -> CoderBenchTask {
CoderBenchTask {
id: "add_function_pass_test",
seed: &[
("mathlib.sh", "# add(a, b) goes here\n"),
(
"test.sh",
"#!/bin/sh\n. ./mathlib.sh\n[ \"$(add 2 3)\" = \"5\" ] || exit 1\n",
),
],
intent: "implement an `add` shell function in mathlib.sh so that `sh test.sh` passes",
native_turns: || {
vec![
turn(
r#"{"description": "the provided test passes",
"checks": [{"name": "test", "command": "sh test.sh"}]}"#,
json!([]),
),
turn(
"adding the add() function",
json!([{
"id": "w1", "name": "write_file",
"arguments": {
"path": "mathlib.sh",
"content": "add() { echo $(( $1 + $2 )); }\n"
}
}]),
),
turn("done — add() implemented", json!([])),
]
},
grade_cmd: "sh test.sh",
}
}
/// The full bench suite, in the order the scorecards list it.
fn bench_tasks() -> Vec<CoderBenchTask> {
    Vec::from([task_create_file(), task_make_test_pass()])
}
/// Creates a throwaway git repository for `task`.
///
/// The repository is initialised on branch `main`, every seed file is
/// written (creating parent directories as needed), and everything is
/// committed as `seed`. `--allow-empty` keeps the commit valid for tasks
/// with no seed files.
///
/// # Panics
///
/// Panics if the temp dir cannot be created, a seed file cannot be
/// written, or any git command fails.
fn provision_repo(task: &CoderBenchTask) -> tempfile::TempDir {
    let repo = tempfile::tempdir().unwrap();
    let root = repo.path();
    git(root, &["init", "-q", "-b", "main"]);

    for &(rel, content) in task.seed.iter() {
        let target = root.join(rel);
        if let Some(parent) = target.parent() {
            std::fs::create_dir_all(parent).unwrap();
        }
        std::fs::write(&target, content).unwrap();
    }

    git(root, &["add", "-A"]);
    // Identity is supplied inline so the commit does not depend on the
    // host's git configuration for user.name / user.email.
    let seed_commit = [
        "-c",
        "user.name=bench",
        "-c",
        "user.email=bench@car",
        "commit",
        "-q",
        "--allow-empty",
        "-m",
        "seed",
    ];
    git(root, &seed_commit);
    repo
}
/// Runs `git -C <dir> <args…>` and insists that it succeeds.
///
/// # Panics
///
/// Panics if git cannot be spawned, or if it exits unsuccessfully (the
/// panic message carries the arguments and git's stderr).
fn git(dir: &Path, args: &[&str]) {
    let mut cmd = std::process::Command::new("git");
    cmd.arg("-C").arg(dir).args(args);
    let out = cmd.output().unwrap();
    let stderr = String::from_utf8_lossy(&out.stderr);
    assert!(out.status.success(), "git {args:?}: {}", stderr);
}
/// Looks up the registered session entry for `id`.
///
/// # Panics
///
/// Panics with `"session registered"` when no session with that id exists.
async fn get_entry(state: &Arc<ServerState>, id: &str) -> Arc<CoderSessionEntry> {
    let sessions = state.coder_sessions.lock().await;
    let found = sessions.get(id).map(Arc::clone);
    found.expect("session registered")
}
/// Runs one bench task through one engine and reports the outcome as a row.
///
/// Steps: provision a seeded repo, start a session, confirm it, wait for the
/// session's background task, read the final session state, and — when the
/// session is waiting for approval — approve the merge and grade the landed
/// branch with the task's independent grader.
///
/// Failures at any step are recorded in the returned row's `error` rather
/// than panicking, so one broken task cannot hide the rest of a scorecard.
/// An error reported by the session itself takes precedence over errors
/// observed by this harness.
///
/// # Panics
///
/// Still panics on harness-level breakage: temp-dir creation, a poisoned
/// task mutex, or a session id that is not registered.
async fn run_arm(
    task: &CoderBenchTask,
    engine: EngineChoice,
    generator: Arc<dyn TurnGenerator>,
) -> BenchResult {
    // Row for an arm that failed before the session produced any state.
    let aborted = |reason: String| BenchResult {
        task_id: task.id.into(),
        reached_approval: false,
        iterations: 0,
        branch_landed: false,
        graded_pass: false,
        error: Some(reason),
    };

    let repo = provision_repo(task);
    let state_dir = tempfile::tempdir().unwrap();
    let journal = tempfile::tempdir().unwrap();
    let state = Arc::new(ServerState::standalone(journal.path().to_path_buf()));

    let start = start_session(
        &state,
        StartArgs {
            repo: repo.path().to_path_buf(),
            intent: task.intent.into(),
            engine,
            max_iterations: Some(6),
            state_dir: state_dir.path().to_path_buf(),
            project: None,
        },
        generator,
    )
    .await;
    let response = match start {
        Ok(v) => v,
        Err(e) => return aborted(format!("start: {e}")),
    };
    let session_id = match response["session_id"].as_str() {
        Some(id) => id.to_string(),
        None => {
            return aborted(format!(
                "start: response carries no session_id: {response}"
            ))
        }
    };

    if let Err(e) = confirm_session(&state, &session_id, None).await {
        return aborted(format!("confirm: {e:?}"));
    }

    let entry = get_entry(&state, &session_id).await;
    // Take the handle in its own statement so the std mutex guard is
    // released before the await below.
    let handle = entry.task.lock().unwrap().take();
    let join_error = match handle {
        Some(handle) => handle
            .await
            .err()
            .map(|e| format!("session task: {e:?}")),
        None => None,
    };

    let (reached_approval, iterations, session_error) = {
        let session = entry.session.lock().await;
        (
            session.state == CoderState::NeedsApproval,
            session.iterations,
            session.error.clone(),
        )
    };
    let mut error = session_error.or(join_error);

    let mut branch_landed = false;
    let mut graded_pass = false;
    if reached_approval {
        match approve_merge_session(&state, &session_id, true).await {
            Ok(merged) if merged["state"] == "merged" => match merged["branch"].as_str() {
                Some(branch) => {
                    branch_landed = branch_starts_clean(repo.path(), branch);
                    graded_pass = grade_on_branch(repo.path(), branch, task.grade_cmd);
                }
                None => {
                    error = error.or_else(|| {
                        Some("approve_merge: merged response names no branch".to_string())
                    });
                }
            },
            Ok(other) => {
                error = error.or_else(|| {
                    Some(format!(
                        "approve_merge: unexpected state {}",
                        other["state"]
                    ))
                });
            }
            Err(e) => {
                error = error.or_else(|| Some(format!("approve_merge: {e:?}")));
            }
        }
    }

    BenchResult {
        task_id: task.id.into(),
        reached_approval,
        iterations,
        branch_landed,
        graded_pass,
        error,
    }
}
/// Reports whether `branch` resolves in `repo` and the repository's working
/// tree has no pending changes.
///
/// Returns `false` when:
/// - git cannot be spawned,
/// - `git rev-parse --verify <branch>` fails,
/// - `git status --porcelain` fails, or
/// - `git status --porcelain` lists any entry.
///
/// A failing `git status` is deliberately *not* treated as clean: its
/// stdout is empty in that case too, so the exit status must be checked.
fn branch_starts_clean(repo: &Path, branch: &str) -> bool {
    // Runs `git -C <repo> <args…>`; yields stdout only when git was spawned
    // and exited successfully.
    let git_stdout = |args: &[&str]| -> Option<Vec<u8>> {
        let out = std::process::Command::new("git")
            .arg("-C")
            .arg(repo)
            .args(args)
            .output()
            .ok()?;
        if out.status.success() {
            Some(out.stdout)
        } else {
            None
        }
    };

    if git_stdout(&["rev-parse", "--verify", branch]).is_none() {
        return false;
    }
    match git_stdout(&["status", "--porcelain"]) {
        Some(listing) => listing.is_empty(),
        None => false,
    }
}
/// Runs the grading command `cmd` against a fresh, detached checkout of
/// `branch` and reports whether it exited successfully.
///
/// The checkout is a temporary git worktree, so the grader sees exactly
/// what landed on the branch. Returns `false` if the worktree cannot be
/// created, or if the grading command fails or cannot be spawned.
///
/// # Panics
///
/// Panics if the temp dir cannot be created or git cannot be spawned for
/// `worktree add`.
fn grade_on_branch(repo: &Path, branch: &str, cmd: &str) -> bool {
    let work = tempfile::tempdir().unwrap();
    let add = std::process::Command::new("git")
        .arg("-C")
        .arg(repo)
        .args(["worktree", "add", "-q", "--detach"])
        .arg(work.path())
        .arg(branch)
        .output()
        .unwrap();
    if !add.status.success() {
        return false;
    }

    let graded = std::process::Command::new("sh")
        .arg("-c")
        .arg(cmd)
        .current_dir(work.path())
        .output()
        .map(|o| o.status.success())
        .unwrap_or(false);

    // `git worktree prune` only drops registrations whose directory is
    // gone, so delete the checkout first; otherwise the prune is a no-op
    // and the worktree stays registered in the repo.
    drop(work);
    // Best-effort cleanup: a failed prune must not change the verdict.
    let _ = std::process::Command::new("git")
        .arg("-C")
        .arg(repo)
        .args(["worktree", "prune"])
        .output();
    graded
}
/// Runs every bench task through the native engine, each backed by a fresh
/// `Script` of that task's scripted turns, and collects the rows into a
/// scorecard labelled `native`.
async fn native_scorecard() -> BenchScorecard {
    let mut rows = Vec::new();
    for task in bench_tasks() {
        let script = Script {
            turns: (task.native_turns)(),
            cursor: AtomicUsize::new(0),
        };
        let generator: Arc<dyn TurnGenerator> = Arc::new(script);
        rows.push(run_arm(&task, EngineChoice::Native, generator).await);
    }
    BenchScorecard {
        engine: "native".into(),
        rows,
    }
}
/// The native engine must solve every task in the suite; the rendered
/// scorecard is printed and repeated in the failure message.
#[tokio::test]
async fn native_suite_solves_all_tasks() {
    let card = native_scorecard().await;
    let report = card.render();
    println!("{}", report);

    let (solved, total) = (card.solved_count(), card.rows.len());
    assert_eq!(
        solved, total,
        "native engine failed a bench task:\n{}",
        report
    );
}
/// Checks each native row individually: approval reached, branch landed,
/// grader passed, and exactly one iteration used.
#[tokio::test]
async fn native_per_task_invariants_hold() {
    let card = native_scorecard().await;
    card.rows.iter().for_each(|row| {
        let id = &row.task_id;
        assert!(
            row.reached_approval,
            "[{}] did not reach approval: {:?}",
            id, row.error
        );
        assert!(row.branch_landed, "[{}] branch did not land", id);
        assert!(
            row.graded_pass,
            "[{}] independent grader failed on landed branch",
            id
        );
        assert_eq!(
            row.iterations, 1,
            "[{}] expected a clean single-iteration solve, got {}",
            id, row.iterations
        );
    });
}
/// Live comparison of the native engine against the foreman engine driving
/// the first ready external coding CLI.
///
/// Opt-in twice over: the test is `#[ignore]`d and additionally returns
/// early unless `CAR_CODER_BENCH_LIVE=1`. It also returns early when no
/// ready CLI is detected.
///
/// Asserts that native solves everything, that foreman solves at least one
/// task, and that foreman never lands a branch the grader rejects.
#[tokio::test]
#[ignore = "spends external-CLI quota; run by hand with CAR_CODER_BENCH_LIVE=1"]
async fn foreman_vs_native_live() {
    let opted_in = std::env::var("CAR_CODER_BENCH_LIVE")
        .map(|v| v == "1")
        .unwrap_or(false);
    if !opted_in {
        eprintln!("skipping: set CAR_CODER_BENCH_LIVE=1 to run the live foreman comparison");
        return;
    }

    let detected = crate::coder::router::detect_ready_agents().await;
    let first_ready = detected
        .iter()
        .find(|agent| agent.ready)
        .map(|agent| agent.id.clone());
    let cli = match first_ready {
        Some(id) => id,
        None => {
            eprintln!("skipping: no ready external coding CLI detected");
            return;
        }
    };
    eprintln!("live foreman comparison against CLI: {cli}");

    let native = native_scorecard().await;

    let mut foreman_rows = Vec::new();
    for task in bench_tasks() {
        let script = Script {
            turns: (task.native_turns)(),
            cursor: AtomicUsize::new(0),
        };
        let generator: Arc<dyn TurnGenerator> = Arc::new(script);
        let row = run_arm(&task, EngineChoice::Foreman(cli.clone()), generator).await;
        foreman_rows.push(row);
    }
    let foreman = BenchScorecard {
        engine: format!("foreman:{cli}"),
        rows: foreman_rows,
    };

    println!("{}", native.render());
    println!("{}", foreman.render());

    assert_eq!(
        native.solved_count(),
        native.rows.len(),
        "native regressed:\n{}",
        native.render()
    );
    assert!(
        foreman.solved_count() >= 1,
        "foreman solved nothing — delegation floor not met:\n{}",
        foreman.render()
    );
    for row in foreman.rows.iter() {
        // Landing implies passing: a landed branch that fails the grader
        // means breakage was shipped.
        assert!(
            !row.branch_landed || row.graded_pass,
            "[{}] foreman landed a branch that fails the grader (shipped breakage):\n{}",
            row.task_id,
            foreman.render()
        );
    }
}