car-server-core 0.24.1

//! A focused coding-bench suite for the coder engines.
//!
//! This exercises a couple of small, deterministic coding tasks **through the
//! coder's own contract-evaluated session loop** — the same `start_session →
//! confirm → run → approve` path the daemon drives — and records a comparable
//! scorecard per engine. The point is to *compare engines* on identical tasks,
//! not to re-test any one stage in isolation.
//!
//! ## What is and isn't live here
//!
//! The honest split (see the shakedown findings that motivated this module):
//!
//! - **native** runs fully here, deterministically, with no model and no
//!   network. The `native` arm scripts the model seam ([`TurnGenerator`]) so
//!   the plan→edit→verify→repair loop is driven by canned turns, but **every
//!   other stage is the real thing**: the real contract derivation plumbing,
//!   the real policy-gated [`WorktreeExecutor`], the real contract evaluation
//!   (real `grep`/`cargo`/`test` in a real git worktree), the real merge that
//!   publishes a `car/coder/<id>` branch. So a green `native` arm proves the
//!   contract loop reached `NeedsApproval` AND the branch actually landed with
//!   the right content — the two things the daemon promises.
//!
//! - **foreman** cannot be scripted: it delegates execution to a real external
//!   CLI (Claude Code / Codex / Gemini), which spends agent quota and is
//!   non-deterministic. So the foreman comparison arm is a separate,
//!   double-gated test ([`foreman_vs_native_live`]): `#[ignore]` **and** an
//!   env flag, run by hand, only when a CLI is detected. It populates the
//!   *same* [`BenchScorecard`] so the two engines line up column-for-column.
//!
//! ## Running it
//!
//! ```bash
//! # The deterministic native suite (part of the normal test path):
//! ./scripts/cargo-shared-target.sh test -p car-server-core coder::bench
//!
//! # The live native-vs-foreman comparison (spends CLI quota; needs a CLI):
//! CAR_CODER_BENCH_LIVE=1 ./scripts/cargo-shared-target.sh test -p car-server-core \
//!     coder::bench::foreman_vs_native_live -- --ignored --nocapture
//! ```

#![cfg(test)]

use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

use async_trait::async_trait;
use car_inference::{GenerateRequest, InferenceResult};
use serde_json::{json, Value};

use crate::coder::native_loop::TurnGenerator;
use crate::coder::rpc::{
    approve_merge_session, confirm_session, start_session, CoderSessionEntry, StartArgs,
};
use crate::coder::router::EngineChoice;
use crate::coder::session::CoderState;
use crate::session::ServerState;

// ---------------------------------------------------------------------------
// Scripted model seam (shared with the native_loop / rpc test style).
// ---------------------------------------------------------------------------

/// Pops pre-canned turns in order; the same seam the rest of the coder tests
/// use to make the model deterministic.
struct Script {
    /// The canned turns, in the order they are handed out.
    turns: Vec<InferenceResult>,
    /// Index of the next turn to hand out. It is bumped atomically on every
    /// `generate` call, which lets the script advance through `&self`.
    cursor: AtomicUsize,
}

/// Build one scripted [`InferenceResult`] carrying `text` and `tool_calls`.
///
/// The remaining fields are fixed bench placeholders: `trace_id` is
/// `"bench"`, `model_used` is `"scripted"` and `latency_ms` is `0`.
///
/// # Panics
///
/// Panics if the assembled JSON does not deserialize into an
/// `InferenceResult`.
fn turn(text: &str, tool_calls: Value) -> InferenceResult {
    let payload = json!({
        "text": text,
        "tool_calls": tool_calls,
        "trace_id": "bench",
        "model_used": "scripted",
        "latency_ms": 0,
    });
    let parsed: Result<InferenceResult, _> = serde_json::from_value(payload);
    parsed.expect("scripted InferenceResult shape")
}

#[async_trait]
impl TurnGenerator for Script {
    /// Hand out the next canned turn; the request itself is ignored.
    ///
    /// Each call advances the cursor by one. Once every turn has been
    /// consumed, the call fails with `"bench script exhausted"`.
    async fn generate(&self, _req: GenerateRequest) -> Result<InferenceResult, String> {
        let next = self.cursor.fetch_add(1, Ordering::SeqCst);
        match self.turns.get(next) {
            Some(canned) => Ok(canned.clone()),
            None => Err("bench script exhausted".to_string()),
        }
    }
}

// ---------------------------------------------------------------------------
// Bench task model + scorecard.
// ---------------------------------------------------------------------------

/// One small, deterministic coding task the bench drives through a coder engine.
///
/// Every field is `'static` data or a plain `fn` pointer, so a task is a
/// self-contained table entry with no captured state.
struct CoderBenchTask {
    /// Stable identifier (used as the scorecard row key).
    id: &'static str,
    /// Files to seed the base repo with before running: `(relpath, content)`.
    /// Missing parent directories are created when the repo is provisioned.
    /// May be empty, in which case the base repo holds only an empty `seed`
    /// commit.
    seed: &'static [(&'static str, &'static str)],
    /// The natural-language intent handed to the coder.
    intent: &'static str,
    /// The model turns the *native* arm replays. The FIRST turn is the contract
    /// derivation (a JSON object); the rest drive the plan→edit→verify loop.
    /// Foreman ignores this (a real CLI executes there) but reuses turn 0 as the
    /// derivation seed.
    native_turns: fn() -> Vec<InferenceResult>,
    /// Independent grade run against the *landed branch* worktree: a shell
    /// command that must exit zero for the task to count as solved. This is the
    /// engine-agnostic oracle — both arms are graded by the same command. It is
    /// executed via `sh -c` from the root of a worktree checked out at that
    /// branch.
    grade_cmd: &'static str,
}

/// The outcome of running a single task on a single engine. Every engine
/// fills in the same columns, so `native` and `foreman` rows compare directly.
#[derive(Debug, Clone)]
struct BenchResult {
    /// Identifier of the task this row belongs to.
    task_id: String,
    /// Whether the contract loop ended in `NeedsApproval`, i.e. every contract
    /// check was green.
    reached_approval: bool,
    /// How many loop iterations were spent; lower is better, and a high count
    /// points at repair churn.
    iterations: u32,
    /// Whether the `car/coder/<id>` branch was published in the user's repo.
    branch_landed: bool,
    /// Whether the independent grade command passed against the landed branch.
    graded_pass: bool,
    /// The session's terminal error, if any; shown for diagnostics on a red row.
    error: Option<String>,
}

impl BenchResult {
    /// `true` only when all three stages succeeded: the contract loop accepted
    /// the work, the branch landed, and the independent grader agreed.
    fn solved(&self) -> bool {
        matches!(
            (self.reached_approval, self.branch_landed, self.graded_pass),
            (true, true, true)
        )
    }
}

/// The full result table for one engine: one [`BenchResult`] row per task.
#[derive(Debug, Default)]
struct BenchScorecard {
    /// Engine label printed in the rendered header.
    engine: String,
    /// The per-task rows, in the order they were pushed.
    rows: Vec<BenchResult>,
}

impl BenchScorecard {
    /// Number of rows that count as fully solved (see [`BenchResult::solved`]).
    fn solved_count(&self) -> usize {
        self.rows.iter().map(|row| usize::from(row.solved())).sum()
    }

    /// Render the scorecard as a small text table: a header line with the
    /// engine and `solved/total`, then one indented line per row. A row's
    /// error, when present, is appended as ` err=…`. Printed under
    /// `--nocapture` so a live run shows native and foreman side by side.
    fn render(&self) -> String {
        let header = format!(
            "engine={} solved={}/{}\n",
            self.engine,
            self.solved_count(),
            self.rows.len()
        );
        self.rows.iter().fold(header, |mut table, row| {
            let err_suffix = row
                .error
                .as_ref()
                .map(|e| format!(" err={e}"))
                .unwrap_or_default();
            table.push_str(&format!(
                "  {:<28} approval={} branch={} graded={} iters={}{}\n",
                row.task_id,
                row.reached_approval,
                row.branch_landed,
                row.graded_pass,
                row.iterations,
                err_suffix
            ));
            table
        })
    }
}

// ---------------------------------------------------------------------------
// The task table.
// ---------------------------------------------------------------------------

/// Task A — "create file X with content Y". The smallest possible coding task:
/// derivation yields a `grep` contract, one `write_file` satisfies it.
fn task_create_file() -> CoderBenchTask {
    CoderBenchTask {
        id: "create_file_greeting",
        seed: &[],
        intent: "create greeting.txt containing the text 'hello from car coder'",
        native_turns: || {
            vec![
                // Turn 0: contract derivation (JSON object).
                turn(
                    r#"{"description": "greeting.txt contains the greeting",
                        "checks": [{"name": "content",
                                    "command": "grep -q 'hello from car coder' greeting.txt"}]}"#,
                    json!([]),
                ),
                // Turn 1: write the file.
                turn(
                    "creating greeting.txt",
                    json!([{
                        "id": "w1", "name": "write_file",
                        "arguments": {"path": "greeting.txt", "content": "hello from car coder"}
                    }]),
                ),
                // Turn 2: declare done.
                turn("done — greeting.txt written", json!([])),
            ]
        },
        grade_cmd: "grep -q 'hello from car coder' greeting.txt",
    }
}

/// Task B — "add a function and make a provided test pass". A two-file repo with
/// a failing-by-absence test; the coder must add the implementation so the
/// provided shell-driven test passes. Uses a tiny POSIX-shell test (no toolchain
/// dependency) so the bench stays deterministic and fast on any box.
fn task_make_test_pass() -> CoderBenchTask {
    CoderBenchTask {
        id: "add_function_pass_test",
        seed: &[
            // A library file the coder must extend with an `add` function.
            ("mathlib.sh", "# add(a, b) goes here\n"),
            // The provided test: it sources the lib, calls add, checks the sum.
            // Red until `add` exists.
            (
                "test.sh",
                "#!/bin/sh\n. ./mathlib.sh\n[ \"$(add 2 3)\" = \"5\" ] || exit 1\n",
            ),
        ],
        intent: "implement an `add` shell function in mathlib.sh so that `sh test.sh` passes",
        native_turns: || {
            vec![
                // Turn 0: derivation — the contract is exactly the provided test.
                turn(
                    r#"{"description": "the provided test passes",
                        "checks": [{"name": "test", "command": "sh test.sh"}]}"#,
                    json!([]),
                ),
                // Turn 1: implement add() by overwriting mathlib.sh.
                turn(
                    "adding the add() function",
                    json!([{
                        "id": "w1", "name": "write_file",
                        "arguments": {
                            "path": "mathlib.sh",
                            "content": "add() { echo $(( $1 + $2 )); }\n"
                        }
                    }]),
                ),
                // Turn 2: declare done.
                turn("done — add() implemented", json!([])),
            ]
        },
        grade_cmd: "sh test.sh",
    }
}

/// The full task table, in the order the suites run it: the create-file task
/// first, then the make-the-test-pass task.
fn bench_tasks() -> Vec<CoderBenchTask> {
    let mut table = Vec::with_capacity(2);
    table.push(task_create_file());
    table.push(task_make_test_pass());
    table
}

// ---------------------------------------------------------------------------
// Harness: drive one task through one engine, end to end.
// ---------------------------------------------------------------------------

/// Create a throwaway git repo on branch `main`, write the task's seed files
/// into it, and commit everything as `seed` so a worktree starts from a clean
/// base. The commit is made with `--allow-empty`, so a task without seed files
/// still gets a base commit.
///
/// The returned [`tempfile::TempDir`] owns the repo directory.
///
/// # Panics
///
/// Panics if the tempdir cannot be created, a seed file cannot be written, or
/// any git command fails.
fn provision_repo(task: &CoderBenchTask) -> tempfile::TempDir {
    // Author identity is supplied inline through `-c` overrides.
    const SEED_COMMIT: [&str; 9] = [
        "-c",
        "user.name=bench",
        "-c",
        "user.email=bench@car",
        "commit",
        "-q",
        "--allow-empty",
        "-m",
        "seed",
    ];

    let repo = tempfile::tempdir().unwrap();
    let root = repo.path();

    git(root, &["init", "-q", "-b", "main"]);

    for &(rel, content) in task.seed.iter() {
        let target = root.join(rel);
        // Seed paths may be nested; make sure the directory exists first.
        if let Some(parent) = target.parent() {
            std::fs::create_dir_all(parent).unwrap();
        }
        std::fs::write(&target, content).unwrap();
    }

    git(root, &["add", "-A"]);
    git(root, &SEED_COMMIT);

    repo
}

/// Run `git -C <dir> <args…>` and wait for it to finish.
///
/// # Panics
///
/// Panics if git cannot be spawned, or if it exits non-zero; in the latter
/// case the message quotes the arguments and git's stderr.
fn git(dir: &Path, args: &[&str]) {
    let mut cmd = std::process::Command::new("git");
    cmd.arg("-C").arg(dir).args(args);

    let out = cmd.output().unwrap();
    if !out.status.success() {
        panic!("git {args:?}: {}", String::from_utf8_lossy(&out.stderr));
    }
}

/// Look up the registered session entry for `id` in the server state's
/// `coder_sessions` map and return a clone of its `Arc`.
///
/// # Panics
///
/// Panics with `"session registered"` if no session with that id exists.
async fn get_entry(state: &Arc<ServerState>, id: &str) -> Arc<CoderSessionEntry> {
    let sessions = state.coder_sessions.lock().await;
    let found = sessions.get(id).cloned();
    found.expect("session registered")
}

/// Run one task through one engine, returning a comparable [`BenchResult`].
///
/// `generator` scripts the model seam (native uses the task's canned turns;
/// foreman reuses turn 0 only — derivation — and a real CLI executes). On the
/// native arm this completes deterministically; on foreman it blocks on the CLI.
///
/// Stages, in order:
/// 1. provision a seeded repo plus throwaway state/journal directories;
/// 2. `start_session` → `confirm_session`, then wait for the loop task;
/// 3. read the session's terminal state, iteration count and error;
/// 4. if the loop reached `NeedsApproval`, approve the merge and check that
///    the branch landed and that the independent grader passes on it.
///
/// A `start_session` failure is reported as a red row with `error` set to
/// `start: …`. A merge that fails, reports a state other than `"merged"`, or
/// names no branch is likewise recorded as `merge: …` (unless the session
/// already carried an error), so a red row always explains itself.
///
/// # Panics
///
/// Panics on harness-level invariant violations: tempdir creation, a start
/// response without a `session_id`, a failing `confirm_session`, or a loop
/// task that panicked.
async fn run_arm(
    task: &CoderBenchTask,
    engine: EngineChoice,
    generator: Arc<dyn TurnGenerator>,
) -> BenchResult {
    // These guards must outlive the whole run: dropping them deletes the
    // directories the session works in.
    let repo = provision_repo(task);
    let state_dir = tempfile::tempdir().unwrap();
    let journal = tempfile::tempdir().unwrap();
    let state = Arc::new(ServerState::standalone(journal.path().to_path_buf()));

    let start = start_session(
        &state,
        StartArgs {
            repo: repo.path().to_path_buf(),
            intent: task.intent.into(),
            engine,
            max_iterations: Some(6),
            state_dir: state_dir.path().to_path_buf(),
            project: None,
        },
        generator,
    )
    .await;

    let response = match start {
        Ok(v) => v,
        Err(e) => {
            return BenchResult {
                task_id: task.id.into(),
                reached_approval: false,
                iterations: 0,
                branch_landed: false,
                graded_pass: false,
                error: Some(format!("start: {e}")),
            }
        }
    };
    let session_id = response["session_id"].as_str().unwrap().to_string();

    confirm_session(&state, &session_id, None).await.unwrap();

    // Wait for the loop task to finish.
    let entry = get_entry(&state, &session_id).await;
    let handle = entry.task.lock().unwrap().take();
    if let Some(handle) = handle {
        handle.await.unwrap();
    }

    let (reached_approval, iterations, error) = {
        let session = entry.session.lock().await;
        (
            session.state == CoderState::NeedsApproval,
            session.iterations,
            session.error.clone(),
        )
    };

    let mut branch_landed = false;
    let mut graded_pass = false;
    // Why the merge stage did not produce a gradeable branch, if it did not.
    let mut merge_error: Option<String> = None;
    if reached_approval {
        match approve_merge_session(&state, &session_id, true).await {
            Ok(merged) if merged["state"] == "merged" => match merged["branch"].as_str() {
                Some(branch) => {
                    branch_landed = branch_starts_clean(repo.path(), branch);
                    graded_pass = grade_on_branch(repo.path(), branch, task.grade_cmd);
                }
                None => merge_error = Some("merge: response names no branch".to_string()),
            },
            Ok(merged) => {
                merge_error = Some(format!("merge: unexpected state {}", merged["state"]));
            }
            Err(e) => merge_error = Some(format!("merge: {e}")),
        }
    }

    BenchResult {
        task_id: task.id.into(),
        reached_approval,
        iterations,
        branch_landed,
        graded_pass,
        // The session's own error wins; the merge diagnostic only fills a gap.
        error: error.or(merge_error),
    }
}

/// The published branch exists and the user's checkout is untouched (the merge
/// promise: work lands on `car/coder/<id>`, not on the working tree).
///
/// Returns `true` only when both hold:
/// - `git rev-parse --verify <branch>` succeeds in `repo`, and
/// - `git status --porcelain` succeeds **and** prints nothing.
///
/// The exit status of `git status` is checked explicitly: a failing
/// `git status` also leaves stdout empty, and must not be mistaken for a clean
/// working tree.
///
/// # Panics
///
/// Panics if git cannot be spawned.
fn branch_starts_clean(repo: &Path, branch: &str) -> bool {
    let run = |args: &[&str]| {
        std::process::Command::new("git")
            .arg("-C")
            .arg(repo)
            .args(args)
            .output()
            .unwrap()
    };

    let rev = run(&["rev-parse", "--verify", branch]);
    if !rev.status.success() {
        return false;
    }

    let status = run(&["status", "--porcelain"]);
    status.status.success() && status.stdout.is_empty()
}

/// Grade the task by running its command in a throwaway worktree checked out at
/// the landed branch — the engine-agnostic oracle. We grade the *branch* (not
/// the session worktree, which is reaped on the terminal transition).
///
/// `cmd` is executed via `sh -c` with the worktree as its working directory.
/// Returns `true` only if the worktree could be added and the command exited
/// zero; a command that cannot be spawned counts as a failed grade.
///
/// # Panics
///
/// Panics if the tempdir cannot be created or git cannot be spawned for
/// `worktree add`.
fn grade_on_branch(repo: &Path, branch: &str, cmd: &str) -> bool {
    let work = tempfile::tempdir().unwrap();
    let add = std::process::Command::new("git")
        .arg("-C")
        .arg(repo)
        .args(["worktree", "add", "-q", "--detach"])
        .arg(work.path())
        .arg(branch)
        .output()
        .unwrap();
    if !add.status.success() {
        return false;
    }
    let graded = std::process::Command::new("sh")
        .arg("-c")
        .arg(cmd)
        .current_dir(work.path())
        .output()
        .map(|o| o.status.success())
        .unwrap_or(false);
    // Delete the checkout BEFORE pruning: `git worktree prune` only drops
    // registrations whose working tree no longer exists, so pruning while the
    // tempdir is still on disk would leave git's bookkeeping in place.
    drop(work);
    // Best-effort cleanup of the worktree registration.
    let _ = std::process::Command::new("git")
        .arg("-C")
        .arg(repo)
        .args(["worktree", "prune"])
        .output();
    graded
}

// ---------------------------------------------------------------------------
// The deterministic native suite (normal `cargo test` path).
// ---------------------------------------------------------------------------

/// Drive every task in the table through the native engine, one after the
/// other, and collect the rows into a scorecard labelled `native`. Each task
/// gets a fresh [`Script`] replaying that task's canned turns from the start.
async fn native_scorecard() -> BenchScorecard {
    let tasks = bench_tasks();
    let mut rows = Vec::with_capacity(tasks.len());

    for task in &tasks {
        let script = Script {
            turns: (task.native_turns)(),
            cursor: AtomicUsize::new(0),
        };
        let generator: Arc<dyn TurnGenerator> = Arc::new(script);
        rows.push(run_arm(task, EngineChoice::Native, generator).await);
    }

    BenchScorecard {
        engine: String::from("native"),
        rows,
    }
}

/// The native engine must solve the whole table end to end: contract loop →
/// approval → branch lands → independent grader passes. The rendered
/// scorecard is printed, and quoted again in the failure message. This is the
/// green artifact showing the contract-evaluated loop works on real tasks
/// with no model involved.
#[tokio::test]
async fn native_suite_solves_all_tasks() {
    let card = native_scorecard().await;
    let table = card.render();
    println!("{}", table);

    let solved = card.solved_count();
    let total = card.rows.len();
    assert_eq!(
        solved, total,
        "native engine failed a bench task:\n{}",
        table
    );
}

/// Stage-by-stage assertions for each native row, so a failure names the
/// stage that broke instead of only a solved count. Every row must reach
/// approval, land its branch, grade green, and do so in exactly one iteration.
#[tokio::test]
async fn native_per_task_invariants_hold() {
    let card = native_scorecard().await;

    for row in card.rows.iter() {
        let id = &row.task_id;

        assert!(
            row.reached_approval,
            "[{}] did not reach approval: {:?}",
            id, row.error
        );
        assert!(row.branch_landed, "[{}] branch did not land", id);
        assert!(
            row.graded_pass,
            "[{}] independent grader failed on landed branch",
            id
        );
        // The scripted happy path needs a single iteration; spurious repair
        // churn introduced by a regression shows up here.
        assert_eq!(
            row.iterations, 1,
            "[{}] expected a clean single-iteration solve, got {}",
            id, row.iterations
        );
    }
}

// ---------------------------------------------------------------------------
// The live native-vs-foreman comparison (gated; spends CLI quota).
// ---------------------------------------------------------------------------

/// Compare native vs foreman on the same task table, *only* when an external
/// CLI is detected and the operator opts in. Double-gated so it can never
/// auto-spend: `#[ignore]` AND `CAR_CODER_BENCH_LIVE=1`. The foreman arm
/// delegates execution to a real CLI (non-deterministic, costs quota), so it
/// asserts only the engine-agnostic invariant both arms must satisfy — the
/// contract loop accepted the work, the branch landed, and the independent
/// grader agrees — and prints both scorecards side by side.
///
/// ```bash
/// CAR_CODER_BENCH_LIVE=1 ./scripts/cargo-shared-target.sh test -p car-server-core \
///     coder::bench::foreman_vs_native_live -- --ignored --nocapture
/// ```
#[tokio::test]
#[ignore = "spends external-CLI quota; run by hand with CAR_CODER_BENCH_LIVE=1"]
async fn foreman_vs_native_live() {
    if std::env::var("CAR_CODER_BENCH_LIVE").ok().as_deref() != Some("1") {
        eprintln!("skipping: set CAR_CODER_BENCH_LIVE=1 to run the live foreman comparison");
        return;
    }

    let detected = crate::coder::router::detect_ready_agents().await;
    let cli = match detected.iter().find(|d| d.ready) {
        Some(d) => d.id.clone(),
        None => {
            eprintln!("skipping: no ready external coding CLI detected");
            return;
        }
    };
    eprintln!("live foreman comparison against CLI: {cli}");

    // The native baseline (deterministic — identical to the normal suite).
    let native = native_scorecard().await;

    // The foreman arm: real CLI execution. The scripted generator still seeds
    // contract derivation (turn 0); the loop body is the live CLI.
    let mut foreman = BenchScorecard {
        engine: format!("foreman:{cli}"),
        ..Default::default()
    };
    for task in bench_tasks() {
        let generator: Arc<dyn TurnGenerator> = Arc::new(Script {
            turns: (task.native_turns)(),
            cursor: AtomicUsize::new(0),
        });
        let result = run_arm(
            &task,
            EngineChoice::Foreman(cli.clone()),
            generator,
        )
        .await;
        foreman.rows.push(result);
    }

    println!("{}", native.render());
    println!("{}", foreman.render());

    // Native must be flawless (it's deterministic). Foreman must solve at least
    // the trivial create-file task — the floor any working delegation clears —
    // and never *regress* a task native solved into a landed-but-wrong branch.
    assert_eq!(
        native.solved_count(),
        native.rows.len(),
        "native regressed:\n{}",
        native.render()
    );
    assert!(
        foreman.solved_count() >= 1,
        "foreman solved nothing — delegation floor not met:\n{}",
        foreman.render()
    );
    for r in &foreman.rows {
        // A landed branch that fails the grader is the worst outcome: shipped
        // breakage. The contract gate is supposed to prevent exactly this.
        assert!(
            !(r.branch_landed && !r.graded_pass),
            "[{}] foreman landed a branch that fails the grader (shipped breakage):\n{}",
            r.task_id,
            foreman.render()
        );
    }
}