// car-server-core 0.24.1

//! Outcome contracts — the verifiable definition of "done" for a coder session.
//!
//! A contract is a set of shell commands that must pass inside the worktree.
//! It is derived from the user's intent by a model (with a bounded repair loop
//! mirroring `car-builder`: generation is an injected closure, so tests run
//! without inference) and then becomes the trust boundary for the whole
//! session: whatever engine did the work — the native loop or an external CLI
//! — the runtime re-runs the checks itself before asking for merge approval.

use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::future::Future;

use super::session::{CoderEventKind, EventSink};
use super::shell_tool::WorktreeExecutor;

/// Serde default for boolean fields that are enabled unless the input says
/// otherwise (used by `ContractCheck::expect_exit_zero`).
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}

/// Serde default for `ContractCheck::timeout_secs`: two minutes, expressed in
/// seconds.
fn default_check_timeout() -> u64 {
    const SECS_PER_MINUTE: u64 = 60;
    2 * SECS_PER_MINUTE
}

/// The verifiable definition of done for a coding session.
///
/// Both fields are required when deserializing — neither carries a serde
/// default. Structural sanity is checked by [`OutcomeContract::validate`];
/// the checks themselves are executed by [`evaluate_contract`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct OutcomeContract {
    /// Human summary of what success means.
    pub description: String,
    /// Checks that must all pass. Evaluated through the policy-gated shell
    /// tool, so a malicious "check" cannot do what the agent itself couldn't.
    pub checks: Vec<ContractCheck>,
}

/// One acceptance check: a shell command run at the worktree root.
///
/// Only `name` and `command` are required in the serialized form; the other
/// fields fall back to the serde defaults noted on each of them.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ContractCheck {
    /// Short, unique label ("tests_pass", "file_created").
    pub name: String,
    /// Command run via the worktree shell tool.
    pub command: String,
    /// Require exit code 0 (default true).
    #[serde(default = "default_true")]
    pub expect_exit_zero: bool,
    /// Additionally require this substring in the combined output.
    /// Defaults to `None` and is left out of the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub output_contains: Option<String>,
    /// Per-check timeout (default 120s; the shell tool clamps further).
    #[serde(default = "default_check_timeout")]
    pub timeout_secs: u64,
}

/// Result of evaluating one [`ContractCheck`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct CheckResult {
    /// Name of the check this result belongs to (copied from the check).
    pub name: String,
    /// Whether every assertion of the check held and the run did not time out.
    pub passed: bool,
    /// None when the command could not run at all (spawn/policy failure), or
    /// when the shell tool's result carried no integer exit code.
    pub exit_code: Option<i64>,
    /// Tail of combined stdout+stderr — enough for repair prompts and the UI.
    /// Holds the failure message instead when the command could not be run.
    pub output_tail: String,
    /// Wall-clock time spent in the shell-tool call, in milliseconds.
    pub duration_ms: u64,
}

impl OutcomeContract {
    /// Structural problems that make a contract unusable. Empty = valid.
    ///
    /// Beyond pure structure (empty/duplicate names, assertion-less checks)
    /// this also rejects two failure modes seen live from small local models
    /// (issue #168 follow-up): the prompt's literal placeholder name leaking
    /// through verbatim, and "toolchain-only" no-op commands like
    /// `cargo --version` that prove nothing about the change. Both pass the
    /// structural checks but make a contract that gates nothing, so they're
    /// surfaced as validation issues to drive the repair loop rather than
    /// silently becoming the trust boundary.
    ///
    /// A check counts as assertion-less when `expect_exit_zero` is false and
    /// `output_contains` is either absent or the empty string: an empty needle
    /// is contained in every output, so it constrains nothing.
    ///
    /// Names are compared after trimming surrounding whitespace; the messages
    /// quote the name exactly as it was supplied.
    pub fn validate(&self) -> Vec<String> {
        let mut issues = Vec::new();
        if self.checks.is_empty() {
            issues.push("contract has no checks — at least one is required".to_string());
        }
        // Borrowed names are enough for duplicate detection; no need to
        // allocate a `String` per check.
        let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
        for (i, c) in self.checks.iter().enumerate() {
            let name = c.name.trim();
            if name.is_empty() {
                issues.push(format!("check #{i} has an empty name"));
            }
            if name == "unique_snake_case_label" {
                issues.push(format!(
                    "check #{i} kept the literal placeholder name \
                     'unique_snake_case_label' — give it a real descriptive label"
                ));
            }
            if c.command.trim().is_empty() {
                issues.push(format!("check '{}' has an empty command", c.name));
            } else if is_toolchain_only(c.command.trim()) {
                issues.push(format!(
                    "check '{}' runs a toolchain-only no-op (`{}`) that verifies the \
                     tool is installed, not the task — replace it with a command that \
                     exercises the actual change",
                    c.name,
                    c.command.trim()
                ));
            }
            if !seen.insert(name) {
                issues.push(format!("duplicate check name '{}'", c.name));
            }
            if !c.expect_exit_zero {
                match c.output_contains.as_deref() {
                    None => issues.push(format!(
                        "check '{}' asserts nothing (expect_exit_zero=false and no output_contains)",
                        c.name
                    )),
                    // `str::contains("")` is always true, so an empty needle
                    // is as vacuous as no needle at all.
                    Some("") => issues.push(format!(
                        "check '{}' asserts nothing (expect_exit_zero=false and output_contains \
                         is empty, which matches any output)",
                        c.name
                    )),
                    Some(_) => {}
                }
            }
        }
        issues
    }

    /// Render for prompts and CLI display.
    ///
    /// Produces the trimmed description, a `Checks:` header, and one
    /// `- name: `command`` line per check, followed by the expectations that
    /// apply to it (`exit 0` and/or the required output substring).
    pub fn render(&self) -> String {
        let mut out = format!("{}\nChecks:\n", self.description.trim());
        for c in &self.checks {
            out.push_str(&format!("- {}: `{}`", c.name, c.command));
            let mut expects = Vec::new();
            if c.expect_exit_zero {
                expects.push("exit 0".to_string());
            }
            if let Some(s) = &c.output_contains {
                // Debug formatting quotes and escapes the needle.
                expects.push(format!("output contains {s:?}"));
            }
            if !expects.is_empty() {
                out.push_str(&format!(" (expects {})", expects.join(", ")));
            }
            out.push('\n');
        }
        out
    }
}

/// Detects a bare "is the tool installed?" probe such as `cargo --version`,
/// `rustc --version` or `node -v`, which says nothing about the task itself.
///
/// Deliberately conservative: the command must consist of exactly two
/// whitespace-separated words — a known build tool followed by a known
/// version/help flag — and must contain no shell composition (`&&`, `||`, `|`,
/// `;`, newline). Real checks such as `cargo run -- --version`, `cargo build`
/// or `cargo test --version-of-something` are therefore never flagged.
fn is_toolchain_only(command: &str) -> bool {
    const TOOLS: &[&str] = &[
        "cargo", "rustc", "rustup", "node", "npm", "npx", "yarn", "pnpm", "python", "python3",
        "pip", "pip3", "go", "java", "javac", "ruby", "gem", "dotnet", "deno", "bun", "tsc", "gcc",
        "clang", "make", "cmake",
    ];
    const FLAGS: &[&str] = &["--version", "-V", "-v", "--help", "-h", "version"];
    const COMPOSITION: [&str; 5] = ["&&", "||", "|", ";", "\n"];

    // Anything chained, piped or multi-line does more than a version probe.
    if COMPOSITION.iter().any(|op| command.contains(*op)) {
        return false;
    }

    // Exactly `<tool> <flag>`: a third word means a subcommand or argument,
    // which makes it a real command.
    let mut words = command.split_whitespace();
    match (words.next(), words.next(), words.next()) {
        (Some(tool), Some(flag), None) => TOOLS.contains(&tool) && FLAGS.contains(&flag),
        _ => false,
    }
}

/// Assemble the prompt that asks the model to derive an outcome contract.
///
/// The prompt embeds `intent` and `repo_summary`, spells out the expected JSON
/// shape and the rules for good checks, and — when `issues` is non-empty —
/// ends with a bulleted list of the previous attempt's validation failures so
/// the model can repair them (car-builder pattern).
fn build_contract_prompt(intent: &str, repo_summary: &str, issues: &[String]) -> String {
    let mut prompt = format!(
        "You are deriving an OUTCOME CONTRACT for a coding task: a small set of shell \
         commands that objectively verify the task is done. The commands run at the root of a \
         fresh git checkout of the repository, non-interactively, with no TTY.\n\n\
         Task intent:\n{intent}\n\n\
         Repository summary:\n{repo_summary}\n\n\
         Respond with ONLY a JSON object, no prose, no markdown fences, in this shape:\n\
         {{\n  \"description\": \"one-sentence definition of done\",\n  \"checks\": [\n    \
         {{\"name\": \"unique_snake_case_label\", \"command\": \"shell command\", \
         \"expect_exit_zero\": true, \"output_contains\": null, \"timeout_secs\": 120}}\n  ]\n}}\n\n\
         Rules:\n\
         - 1 to 5 checks. Each must verify THE TASK ITSELF, not just that the toolchain works \
           (e.g. `rustc --version` or `cargo --version` prove nothing about the change).\n\
         - At least one check should exercise the actual new behaviour the intent describes \
           (run the program/test that the change affects).\n\
         - `name` must be a real, descriptive snake_case label unique within the contract — \
           never the literal placeholder `unique_snake_case_label`.\n\
         - Every command must run non-interactively and deterministically (no prompts, no \
           watchers, no servers that don't exit). Use the repo's own build/test commands when \
           the summary reveals them — a build that must compile the change is a strong check.\n\
         - `expect_exit_zero: true` (the default) is usually enough. Only set `output_contains` \
           to a substring you are CERTAIN will appear verbatim in stdout/stderr; if unsure, \
           leave it null. Do NOT invent example output or placeholder values.\n\
         - Never use git push, network access, sudo, or anything destructive outside the \
           checkout. Timeouts are in seconds; keep them realistic for a build.\n"
    );

    // First attempt: nothing to repair, the base prompt is complete.
    if issues.is_empty() {
        return prompt;
    }

    prompt.push_str("\nYour previous attempt FAILED validation with these issues — fix them:\n");
    for issue in issues {
        prompt.push_str("- ");
        prompt.push_str(issue);
        prompt.push('\n');
    }
    prompt
}

/// Extract the first JSON object from model output, tolerating code fences and
/// surrounding prose.
///
/// Parsing starts at the first `{` and stops at the end of the first complete
/// JSON value, so whatever follows the object — a closing fence, commentary
/// that itself contains braces, or a second object — is ignored instead of
/// turning a good answer into a parse failure.
///
/// # Errors
///
/// Returns a human-readable message when the text has no `{`, has no `}`, has
/// its last `}` before the first `{`, or when the text starting at the first
/// `{` is not valid JSON.
pub(crate) fn extract_json_object(text: &str) -> Result<Value, String> {
    let start = text.find('{').ok_or("no JSON object found in output")?;
    let end = text.rfind('}').ok_or("no closing brace found in output")?;
    if end < start {
        return Err("malformed JSON object in output".to_string());
    }
    // A streaming deserializer yields the first complete value and leaves the
    // remainder of the input alone, whereas `from_str` rejects any trailing
    // non-whitespace. Slicing up to the *last* `}` would drag trailing prose
    // or a second object into the parse.
    let mut values = serde_json::Deserializer::from_str(&text[start..]).into_iter::<Value>();
    match values.next() {
        Some(Ok(value)) => Ok(value),
        Some(Err(e)) => Err(format!("invalid JSON: {e}")),
        // The slice begins with `{`, so the stream cannot be empty; kept as a
        // defensive fallback rather than a panic.
        None => Err("no JSON object found in output".to_string()),
    }
}

/// Ask the injected `generate` closure for a contract covering `intent`, and
/// keep asking — feeding back what was wrong — until one validates or the
/// attempt budget runs out.
///
/// `max_attempts` is treated as at least 1. Each attempt's prompt carries the
/// issues from the most recent attempt that produced output (parse failure,
/// schema mismatch, or validation problems). A failed `generate` call consumes
/// an attempt but leaves those issues untouched.
///
/// # Errors
///
/// Returns a message naming the number of attempts and the last failure when
/// no attempt produced a valid contract.
pub async fn derive_contract<F, Fut>(
    generate: F,
    intent: &str,
    repo_summary: &str,
    max_attempts: u32,
) -> Result<OutcomeContract, String>
where
    F: Fn(String) -> Fut + Send + Sync,
    Fut: Future<Output = Result<String, String>> + Send,
{
    let max = max_attempts.max(1);
    let mut issues: Vec<String> = Vec::new();
    let mut last_err = String::new();

    for _ in 0..max {
        let prompt = build_contract_prompt(intent, repo_summary, &issues);
        let raw = match generate(prompt).await {
            Ok(raw) => raw,
            Err(e) => {
                // Transient model/transport failure: the next attempt reuses
                // the same feedback, hence the same prompt.
                last_err = format!("generation failed: {e}");
                continue;
            }
        };

        // Text → JSON → typed contract; either step failing becomes a single
        // piece of repair feedback.
        let parsed = extract_json_object(&raw)
            .map_err(|e| format!("output did not parse: {e}. Return ONLY the JSON object."))
            .and_then(|value| {
                serde_json::from_value::<OutcomeContract>(value)
                    .map_err(|e| format!("JSON did not match the contract schema: {e}"))
            });

        let feedback = match parsed {
            Ok(contract) => {
                let problems = contract.validate();
                if problems.is_empty() {
                    return Ok(contract);
                }
                problems
            }
            Err(message) => vec![message],
        };
        last_err = feedback.join("; ");
        issues = feedback;
    }

    Err(format!(
        "could not derive a valid outcome contract after {max} attempts: {last_err}"
    ))
}

/// Execute each check of `contract` through the worktree shell tool, in order,
/// and collect one [`CheckResult`] per check.
///
/// A failing check does not stop the run: repair prompts and the UI want the
/// full picture, and checks are independent by construction. A
/// `CheckStarted` event is emitted before each check and a `CheckCompleted`
/// event, carrying the result, after it.
pub async fn evaluate_contract(
    contract: &OutcomeContract,
    executor: &WorktreeExecutor,
    sink: &EventSink,
) -> Vec<CheckResult> {
    let mut results = Vec::with_capacity(contract.checks.len());
    for check in &contract.checks {
        let name = check.name.clone();
        sink.emit(CoderEventKind::CheckStarted { name: name.clone() });

        let clock = std::time::Instant::now();
        let outcome = executor
            .run_shell(&check.command, Some(check.timeout_secs))
            .await;
        let duration_ms = clock.elapsed().as_millis() as u64;

        let (passed, exit_code, output_tail) = match outcome {
            // The command never ran: report the reason in place of output.
            Err(e) => (false, None, format!("check failed to run: {e}")),
            Ok(report) => {
                let exit_code = report.get("exit_code").and_then(Value::as_i64);
                let output = report
                    .get("output")
                    .and_then(Value::as_str)
                    .unwrap_or_default();
                let timed_out = report.get("timed_out").and_then(Value::as_bool) == Some(true);

                let exit_ok = exit_code == Some(0) || !check.expect_exit_zero;
                let contains_ok = match check.output_contains.as_deref() {
                    Some(needle) => output.contains(needle),
                    None => true,
                };
                (
                    // A timeout fails the check even if its assertions held.
                    !timed_out && exit_ok && contains_ok,
                    exit_code,
                    super::shell_tool::tail(output, 4 * 1024),
                )
            }
        };

        let result = CheckResult {
            name,
            passed,
            exit_code,
            output_tail,
            duration_ms,
        };
        sink.emit(CoderEventKind::CheckCompleted {
            result: result.clone(),
        });
        results.push(result);
    }
    results
}

// Unit tests for contract derivation, validation and evaluation. Derivation
// tests inject a closure in place of a model, so no inference runs here.
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicUsize, Ordering};

    // Minimal valid contract; relies on the serde defaults for
    // `expect_exit_zero` and `timeout_secs`.
    const VALID: &str = r#"{
        "description": "file exists",
        "checks": [{"name": "exists", "command": "test -f x.txt"}]
    }"#;

    /// The base prompt carries the task, the repo summary and the guard rails.
    #[test]
    fn prompt_steers_toward_verifying_the_task_and_real_labels() {
        let p = build_contract_prompt(
            "add a --version flag",
            "Top-level entries: Cargo.toml, src\nBuild systems detected: Rust (cargo)",
            &[],
        );
        // Carries the task and repo orientation.
        assert!(p.contains("add a --version flag"));
        assert!(p.contains("Rust (cargo)"));
        // Steers away from toolchain-only checks and placeholder labels.
        assert!(p.contains("verify THE TASK ITSELF"));
        assert!(p.contains("rustc --version"), "names the toolchain-only anti-pattern");
        assert!(p.contains("never the literal placeholder"));
        // Guards the common small-model failure modes.
        assert!(p.contains("non-interactively"));
        assert!(p.contains("CERTAIN will appear"), "output_contains caution present");
        assert!(p.contains("no markdown fences"));
    }

    /// Prior validation issues are appended to the prompt as repair feedback.
    #[test]
    fn repair_prompt_appends_prior_issues() {
        let p = build_contract_prompt("t", "r", &["check 'a' has an empty command".into()]);
        assert!(p.contains("FAILED validation"));
        assert!(p.contains("empty command"));
    }

    /// A valid first answer is returned as-is, with serde defaults applied.
    #[tokio::test]
    async fn derives_on_first_valid_attempt() {
        let c = derive_contract(|_p| async { Ok::<_, String>(VALID.into()) }, "make x", "repo", 3)
            .await
            .unwrap();
        assert_eq!(c.checks.len(), 1);
        assert!(c.checks[0].expect_exit_zero, "default applies");
        assert_eq!(c.checks[0].timeout_secs, 120);
    }

    /// Code fences and chatty prose around the JSON are tolerated.
    #[tokio::test]
    async fn repairs_fenced_and_chatty_output() {
        let fenced = format!("Sure! Here is the contract:\n```json\n{VALID}\n```");
        let c = derive_contract(
            |_p| {
                let text = fenced.clone();
                async move { Ok::<_, String>(text) }
            },
            "x",
            "r",
            3,
        )
        .await
        .unwrap();
        assert_eq!(c.checks[0].name, "exists");
    }

    /// An invalid first answer triggers a second attempt whose prompt carries
    /// the validation issues.
    #[tokio::test]
    async fn invalid_then_repaired() {
        let calls = AtomicUsize::new(0);
        let c = derive_contract(
            |prompt: String| {
                let n = calls.fetch_add(1, Ordering::SeqCst) + 1;
                async move {
                    if n == 1 {
                        Ok::<_, String>(r#"{"description": "no checks", "checks": []}"#.into())
                    } else {
                        assert!(prompt.contains("FAILED validation"), "repair prompt carries issues");
                        Ok(VALID.into())
                    }
                }
            },
            "x",
            "r",
            3,
        )
        .await
        .unwrap();
        assert_eq!(c.checks.len(), 1);
    }

    /// The loop is bounded: unparseable output on every attempt yields an error.
    #[tokio::test]
    async fn gives_up_with_error_after_max() {
        let err = derive_contract(
            |_p| async { Ok::<_, String>("not json at all".into()) },
            "x",
            "r",
            2,
        )
        .await
        .unwrap_err();
        assert!(err.contains("after 2 attempts"), "{err}");
    }

    /// Only bare `<tool> <version-or-help-flag>` commands count as no-ops.
    #[test]
    fn is_toolchain_only_flags_bare_version_probes_only() {
        // Bare version/help probes of build tools — these gate nothing.
        for c in [
            "cargo --version",
            "cargo -V",
            "rustc --version",
            "node -v",
            "npm --version",
            "python3 --version",
            "go version",
            "make --help",
        ] {
            assert!(is_toolchain_only(c), "should flag `{c}`");
        }
        // Real checks that exercise the change must NOT be flagged.
        for c in [
            "cargo build",
            "cargo test",
            "cargo run -- --version",
            "cargo run --release -- --version",
            "./target/debug/greeter --version",
            "cargo --version && cargo build",
            "test -f src/main.rs",
            "grep -q version Cargo.toml",
            "rustc src/main.rs -o /tmp/x",
        ] {
            assert!(!is_toolchain_only(c), "should NOT flag `{c}`");
        }
    }

    /// Both the placeholder name and the toolchain-only command are reported.
    #[test]
    fn validate_rejects_toolchain_only_and_placeholder_name() {
        let c = OutcomeContract {
            description: "d".into(),
            checks: vec![ContractCheck {
                // The literal placeholder leaking through, paired with a
                // toolchain-only command — both seen live from a 1.7B model.
                name: "unique_snake_case_label".into(),
                command: "cargo --version".into(),
                expect_exit_zero: true,
                output_contains: None,
                timeout_secs: 120,
            }],
        };
        let issues = c.validate();
        assert!(
            issues.iter().any(|i| i.contains("placeholder name")),
            "{issues:?}"
        );
        assert!(
            issues.iter().any(|i| i.contains("toolchain-only no-op")),
            "{issues:?}"
        );
    }

    /// End to end: a rejected first attempt is repaired in exactly one retry.
    #[tokio::test]
    async fn derive_repairs_a_toolchain_only_first_attempt() {
        let calls = AtomicUsize::new(0);
        let toolchain_only = r#"{"description":"v","checks":[
            {"name":"unique_snake_case_label","command":"cargo --version"}]}"#;
        let real = r#"{"description":"v","checks":[
            {"name":"version_flag_prints","command":"cargo run -- --version"}]}"#;
        let c = derive_contract(
            |prompt: String| {
                let n = calls.fetch_add(1, Ordering::SeqCst) + 1;
                async move {
                    if n == 1 {
                        Ok::<_, String>(toolchain_only.into())
                    } else {
                        // Repair prompt must carry both rejections.
                        assert!(prompt.contains("toolchain-only no-op"), "{prompt}");
                        assert!(prompt.contains("placeholder name"), "{prompt}");
                        Ok(real.into())
                    }
                }
            },
            "add a --version flag",
            "Rust (cargo)",
            3,
        )
        .await
        .unwrap();
        assert_eq!(c.checks[0].command, "cargo run -- --version");
        assert_eq!(calls.load(Ordering::SeqCst), 2, "took exactly one repair");
    }

    /// Structural rules: assertion-less check, empty command, duplicate name.
    #[test]
    fn validate_catches_empty_and_duplicate_and_assertless() {
        let c = OutcomeContract {
            description: "d".into(),
            checks: vec![
                ContractCheck {
                    name: "a".into(),
                    command: "true".into(),
                    expect_exit_zero: false,
                    output_contains: None,
                    timeout_secs: 5,
                },
                ContractCheck {
                    name: "a".into(),
                    command: "".into(),
                    expect_exit_zero: true,
                    output_contains: None,
                    timeout_secs: 5,
                },
            ],
        };
        let issues = c.validate();
        assert!(issues.iter().any(|i| i.contains("asserts nothing")));
        assert!(issues.iter().any(|i| i.contains("empty command")));
        assert!(issues.iter().any(|i| i.contains("duplicate")));
    }

    /// Runs real commands in a temp directory: two passing checks and one
    /// failing check, all of which must be reported.
    #[tokio::test]
    async fn evaluate_passes_and_fails_checks_in_a_real_dir() {
        let dir = tempfile::tempdir().unwrap();
        std::fs::write(dir.path().join("present.txt"), "hello needle").unwrap();
        let exec = WorktreeExecutor::new(dir.path());
        let sink = EventSink::test_sink();
        let contract = OutcomeContract {
            description: "d".into(),
            checks: vec![
                ContractCheck {
                    name: "exists".into(),
                    command: "test -f present.txt".into(),
                    expect_exit_zero: true,
                    output_contains: None,
                    timeout_secs: 10,
                },
                ContractCheck {
                    name: "content".into(),
                    command: "cat present.txt".into(),
                    expect_exit_zero: true,
                    output_contains: Some("needle".into()),
                    timeout_secs: 10,
                },
                ContractCheck {
                    name: "missing".into(),
                    command: "test -f absent.txt".into(),
                    expect_exit_zero: true,
                    output_contains: None,
                    timeout_secs: 10,
                },
            ],
        };
        let results = evaluate_contract(&contract, &exec, &sink).await;
        assert_eq!(results.len(), 3, "all checks run even after a failure");
        assert!(results[0].passed);
        assert!(results[1].passed);
        assert!(!results[2].passed);
        assert_eq!(results[2].exit_code, Some(1));
    }

    /// Exit code 0 is not enough when the required substring is absent.
    #[tokio::test]
    async fn evaluate_fails_on_missing_substring() {
        let dir = tempfile::tempdir().unwrap();
        let exec = WorktreeExecutor::new(dir.path());
        let sink = EventSink::test_sink();
        let contract = OutcomeContract {
            description: "d".into(),
            checks: vec![ContractCheck {
                name: "needle".into(),
                command: "echo haystack".into(),
                expect_exit_zero: true,
                output_contains: Some("needle".into()),
                timeout_secs: 10,
            }],
        };
        let results = evaluate_contract(&contract, &exec, &sink).await;
        assert!(!results[0].passed, "exit 0 but substring missing must fail");
        assert_eq!(results[0].exit_code, Some(0));
    }
}