// nornir 0.4.32 - Docs.rs

//! **Compile-and-test judge** for the bake-off (the missing quality scorer).
//!
//! A bake-off records each model's answer plus its economics, but for a REAL
//! ollama run the quality `score` was left `0.0` (no judge wired) — so real
//! leaderboards ranked only by speed. This module fills that gap with a *pure
//! Rust* judge ([`CodegenJudge`]) that **does not call an LLM**: it takes an
//! answer that was already produced (a full file body, or a unified diff/patch),
//! drops it into a scratch copy of a target crate, runs `cargo build` + an
//! acceptance command (default `cargo test`), and turns the outcome into a
//! `0.0..=1.0` quality score.
//!
//! - `0.0` — the answer doesn't apply, or it applies but doesn't compile.
//! - partial — it compiles but some acceptance tests fail (the *fraction* that
//!   pass; `0.5` if the count can't be parsed but the command failed).
//! - `1.0` — `cargo build` + every acceptance test pass.
//!
//! The judge is the [`AnswerJudge`] trait so the **funnel epic (#45, task
//! decomposition)** reuses the SAME evaluator as its accept/reject oracle —
//! [`CodegenJudge`] is the first impl. The bake-off wires it through
//! [`super::agent_model_runs::score_rows_with_judge`].

use std::path::{Path, PathBuf};
use std::process::Command;

use anyhow::{anyhow, Context, Result};

/// The structured outcome of judging one answer.
///
/// `score` is the only field the bake-off needs (it replaces the row's `0.0`),
/// but `compiled` / `tests_passed` / `tests_total` / `message` make the verdict
/// explainable in a CLI / viz and let the funnel oracle decide accept/reject on
/// more than a float.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct Verdict {
    /// Did `cargo build` succeed after the answer was applied?
    pub compiled: bool,
    /// Acceptance tests that passed (0 when it never compiled, or when no
    /// count could be parsed from the acceptance command's output).
    pub tests_passed: u32,
    /// Acceptance tests discovered (0 when it never compiled, or the count was
    /// unparseable).
    pub tests_total: u32,
    /// Quality score in `0.0..=1.0` — what the bake-off row's `score` becomes.
    pub score: f64,
    /// Human-readable one-liner explaining the verdict.
    pub message: String,
}

impl Verdict {
    /// A `0.0` verdict for an answer that never got to build (didn't apply, or
    /// the build failed) — carries the reason in `message`.
    pub fn rejected(message: impl Into<String>) -> Self {
        Verdict {
            compiled: false,
            tests_passed: 0,
            tests_total: 0,
            score: 0.0,
            message: message.into(),
        }
    }

    /// Did this verdict accept the answer (compiled AND every test passed)? The
    /// funnel #45 oracle uses this as its accept/reject decision.
    pub fn accepted(&self) -> bool {
        self.compiled && self.tests_total > 0 && self.tests_passed == self.tests_total
    }
}

/// A pluggable answer evaluator.
///
/// The bake-off scores each `ModelAnswer.output` through this; the funnel #45
/// task-decomposition oracle reuses the SAME trait to accept/reject a
/// decomposed sub-task's produced code. [`CodegenJudge`] is the first
/// (compile-and-test) impl.
pub trait AnswerJudge {
    /// Judge `answer` (a code answer — a full file body, or a unified diff), and
    /// return its [`Verdict`].
    ///
    /// An evaluator must NOT panic on a bad answer: a non-applying /
    /// non-compiling answer is a `0.0` [`Verdict::rejected`], not an `Err`.
    ///
    /// # Errors
    ///
    /// `Err` is reserved for the judge's own infrastructure failing (e.g. the
    /// target crate couldn't be copied).
    fn judge(&self, answer: &str) -> Result<Verdict>;
}

/// How an answer is dropped into the target crate before building.
///
/// The variant decides how the raw answer text is interpreted: as the complete
/// new contents of one file, or as a patch.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum ApplyMode {
    /// The answer is a full file body; overwrite this path (relative to the
    /// crate root) with it.
    WriteFile { rel_path: String },
    /// The answer is a unified diff/patch; apply it with `git apply` at the
    /// crate root.
    ApplyPatch,
}

/// The spec describing how to evaluate a codegen answer: which crate to copy,
/// how to drop the answer in, and the acceptance command to gate on.
///
/// Build one with [`CodegenSpec::write_file`] or [`CodegenSpec::apply_patch`];
/// both start from the default acceptance command.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct CodegenSpec {
    /// Path to the target crate/dir (the thing copied into a scratch dir).
    pub crate_dir: PathBuf,
    /// How the answer is applied to the scratch copy.
    pub apply: ApplyMode,
    /// The acceptance command tokens, e.g. `["cargo", "test"]` or
    /// `["cargo", "test", "--test", "foo"]`. Defaults to `cargo test`. The first
    /// token is the program, the rest are its arguments.
    pub accept_cmd: Vec<String>,
}

impl CodegenSpec {
    /// Spec for a full-file answer: the answer replaces `rel_path` inside the
    /// copy of `crate_dir`, and acceptance is the default `cargo test`.
    pub fn write_file(crate_dir: impl Into<PathBuf>, rel_path: impl Into<String>) -> Self {
        let crate_dir = crate_dir.into();
        let apply = ApplyMode::WriteFile { rel_path: rel_path.into() };
        Self { crate_dir, apply, accept_cmd: default_accept_cmd() }
    }

    /// Spec for a patch answer: the answer is fed to `git apply` inside the
    /// copy of `crate_dir`, and acceptance is the default `cargo test`.
    pub fn apply_patch(crate_dir: impl Into<PathBuf>) -> Self {
        let crate_dir = crate_dir.into();
        Self { crate_dir, apply: ApplyMode::ApplyPatch, accept_cmd: default_accept_cmd() }
    }

    /// Replace the acceptance command (e.g. a specific `--test` or a doctest
    /// run). An empty `cmd` resets it to the default `cargo test`. Consumes and
    /// returns `self` so calls can be chained.
    pub fn with_accept_cmd(mut self, cmd: Vec<String>) -> Self {
        if cmd.is_empty() {
            self.accept_cmd = default_accept_cmd();
        } else {
            self.accept_cmd = cmd;
        }
        self
    }
}

/// Tokens of the default acceptance command, `cargo test`: the program name
/// followed by its single argument.
pub fn default_accept_cmd() -> Vec<String> {
    ["cargo", "test"].iter().map(|token| token.to_string()).collect()
}

/// The compile-and-test judge.
///
/// Copies the target crate to a temp scratch dir, applies the answer, runs
/// `cargo build` then the acceptance command, and scores the result. No LLM is
/// involved — it evaluates an already-produced answer.
#[derive(Debug, Clone)]
pub struct CodegenJudge {
    /// What to copy, how to apply the answer, and which command gates acceptance.
    spec: CodegenSpec,
}

impl CodegenJudge {
    /// Create a judge that evaluates answers against `spec`.
    pub fn new(spec: CodegenSpec) -> Self {
        Self { spec }
    }

    /// The spec this judge evaluates against.
    pub fn spec(&self) -> &CodegenSpec {
        &self.spec
    }

    /// Copy the target crate into `dest` (a fresh scratch dir), skipping the
    /// `target/` build dir and `.git/` so the copy is fast and clean.
    ///
    /// # Errors
    ///
    /// Fails when the spec's `crate_dir` has no `Cargo.toml`, or when the copy
    /// itself fails; both are infrastructure errors, not verdicts.
    fn copy_crate(&self, dest: &Path) -> Result<()> {
        let src = &self.spec.crate_dir;
        if !src.join("Cargo.toml").is_file() {
            return Err(anyhow!(
                "judge target `{}` is not a crate (no Cargo.toml)",
                src.display()
            ));
        }
        copy_dir_filtered(src, dest)
            .with_context(|| format!("copy crate `{}` → scratch", src.display()))
    }

    /// Apply `answer` to the scratch copy at `root`, per the spec's [`ApplyMode`].
    ///
    /// Returns `Ok(None)` on success, `Ok(Some(reason))` if the answer couldn't
    /// be applied (a `0.0` verdict, not an infra error).
    ///
    /// # Errors
    ///
    /// Returns `Err` only for failures of the judge itself: a `rel_path` that
    /// would leave the scratch crate, a file that cannot be written, or `git`
    /// that cannot be spawned / waited on.
    fn apply_answer(&self, root: &Path, answer: &str) -> Result<Option<String>> {
        match &self.spec.apply {
            ApplyMode::WriteFile { rel_path } => {
                use std::path::Component;

                // `Path::join` with an absolute path discards `root`, and `..`
                // can climb out of it. Either would overwrite a file outside
                // the scratch copy while the build still runs on the unmodified
                // crate, so refuse such a spec up front.
                let stays_inside = Path::new(rel_path)
                    .components()
                    .all(|part| matches!(part, Component::Normal(_) | Component::CurDir));
                if !stays_inside {
                    return Err(anyhow!(
                        "judge rel_path `{rel_path}` must be a relative path inside the crate root"
                    ));
                }

                let target = root.join(rel_path);
                if let Some(parent) = target.parent() {
                    std::fs::create_dir_all(parent)
                        .with_context(|| format!("mkdir for {}", target.display()))?;
                }
                std::fs::write(&target, answer)
                    .with_context(|| format!("write answer → {}", target.display()))?;
                Ok(None)
            }
            ApplyMode::ApplyPatch => {
                // `git apply` works in a plain dir (no repo needed) when fed the
                // patch on stdin; a patch that doesn't apply is a 0.0 verdict.
                use std::io::Write;
                let mut child = Command::new("git")
                    .args(["apply", "--whitespace=nowarn", "-"])
                    .current_dir(root)
                    .stdin(std::process::Stdio::piped())
                    .stdout(std::process::Stdio::piped())
                    .stderr(std::process::Stdio::piped())
                    .spawn()
                    .context("spawn `git apply`")?;
                let mut stdin = child
                    .stdin
                    .take()
                    .ok_or_else(|| anyhow!("git apply: no stdin"))?;
                let fed = stdin.write_all(answer.as_bytes());
                // Close our end so git sees EOF, then always reap the child —
                // even when the write failed — so no process is left behind.
                drop(stdin);
                let out = child.wait_with_output().context("wait for git apply")?;

                // A broken pipe means git stopped reading before the whole
                // patch was delivered. That is git turning the answer down, so
                // it becomes a verdict; any other write error is ours.
                let delivered = match fed {
                    Ok(()) => true,
                    Err(e) if e.kind() == std::io::ErrorKind::BrokenPipe => false,
                    Err(e) => return Err(e).context("write patch to git apply stdin"),
                };
                if delivered && out.status.success() {
                    Ok(None)
                } else {
                    let err = String::from_utf8_lossy(&out.stderr);
                    Ok(Some(format!("patch did not apply: {}", err.trim())))
                }
            }
        }
    }
}

impl AnswerJudge for CodegenJudge {
    /// Run the full pipeline for one answer: copy the crate into a scratch
    /// dir, apply the answer, `cargo build`, run the acceptance command, and
    /// convert the outcome into a [`Verdict`].
    fn judge(&self, answer: &str) -> Result<Verdict> {
        let workdir = tempfile::tempdir().context("create judge scratch dir")?;
        let crate_root = workdir.path().join("crate");
        self.copy_crate(&crate_root)?;

        // Step 1 — drop the answer in; an answer that can't be applied is a
        // rejection, not an error.
        if let Some(why) = self.apply_answer(&crate_root, answer)? {
            return Ok(Verdict::rejected(why));
        }

        // Step 2 — the build gate: anything that doesn't compile scores 0.0.
        let build_out = Command::new("cargo")
            .arg("build")
            .current_dir(&crate_root)
            .output()
            .context("run `cargo build` in scratch crate")?;
        if !build_out.status.success() {
            let build_err = String::from_utf8_lossy(&build_out.stderr);
            let reason = format!("did not compile: {}", last_error_line(&build_err));
            return Ok(Verdict::rejected(reason));
        }

        // Step 3 — the acceptance command; an empty spec falls back to the
        // default. `fallback` only exists to own the default tokens when needed.
        let fallback;
        let tokens: &[String] = if self.spec.accept_cmd.is_empty() {
            fallback = default_accept_cmd();
            &fallback
        } else {
            &self.spec.accept_cmd
        };
        let (program, args) = tokens.split_first().expect("accept_cmd is non-empty");
        let run = Command::new(program)
            .args(args)
            .current_dir(&crate_root)
            .output()
            .with_context(|| format!("run acceptance command `{}`", tokens.join(" ")))?;

        let out_text = String::from_utf8_lossy(&run.stdout);
        let err_text = String::from_utf8_lossy(&run.stderr);
        let (passed, total) = parse_cargo_test_counts(&out_text);
        let verdict = score_from_run(run.status.success(), passed, total, &out_text, &err_text);
        Ok(verdict)
    }
}

/// Turn an acceptance-command outcome into a [`Verdict`]. Pulled out so a test
/// can assert the scoring math without spawning cargo.
///
/// * `cmd_ok` — whether the acceptance command exited successfully.
/// * `passed` / `total` — test counts parsed from its output (`total == 0`
///   means no count could be parsed).
/// * `stdout` / `stderr` — the command's output, used only to build the
///   message when the command failed.
///
/// Scoring, always with `compiled: true`:
///
/// * counts parsed, command ok (or some parsed tests failed) — the pass
///   fraction, capped at `1.0`;
/// * counts parsed, every reported test passed, but the command FAILED — the
///   failure is counted as one extra, unreported failing test, so the score
///   stays below `1.0` and [`Verdict::accepted`] is `false`;
/// * no counts, command ok — `1.0`;
/// * no counts, command failed — `0.5`.
pub fn score_from_run(
    cmd_ok: bool,
    passed: u32,
    total: u32,
    stdout: &str,
    stderr: &str,
) -> Verdict {
    if total > 0 {
        if !cmd_ok && passed >= total {
            // The summaries we could read all say "passed", yet the command
            // exited non-zero — e.g. a test binary died before printing its
            // summary line. Trusting the counts alone would award 1.0 and mark
            // the answer accepted, so book the failure as one more failed test.
            let tail = last_error_line(if stderr.trim().is_empty() { stdout } else { stderr });
            let total = total.saturating_add(1);
            // Keeps `passed < total` even if `total` saturated at `u32::MAX`.
            let passed = passed.min(total - 1);
            return Verdict {
                compiled: true,
                tests_passed: passed,
                tests_total: total,
                score: f64::from(passed) / f64::from(total),
                message: format!(
                    "compiled; all {passed} reported acceptance tests passed but the acceptance command failed: {tail}"
                ),
            };
        }

        // We could parse the counts: score is the pass fraction. The cap keeps
        // the documented `0.0..=1.0` range if a caller passes `passed > total`.
        let score = (f64::from(passed) / f64::from(total)).min(1.0);
        let message = if cmd_ok && passed == total {
            format!("compiled; all {total} acceptance tests passed")
        } else {
            format!("compiled; {passed}/{total} acceptance tests passed")
        };
        Verdict { compiled: true, tests_passed: passed, tests_total: total, score, message }
    } else if cmd_ok {
        // Compiled, command succeeded, but no test count (e.g. a check-only
        // accept cmd, or 0 tests): a pass is full marks.
        Verdict {
            compiled: true,
            tests_passed: 0,
            tests_total: 0,
            score: 1.0,
            message: "compiled; acceptance command passed".to_string(),
        }
    } else {
        // Compiled but the command failed and we couldn't parse a count:
        // half-credit (compiles-but-fails). Prefer stderr for the reason and
        // fall back to stdout when stderr is blank.
        let tail = last_error_line(if stderr.trim().is_empty() { stdout } else { stderr });
        Verdict {
            compiled: true,
            tests_passed: 0,
            tests_total: 0,
            score: 0.5,
            message: format!("compiled but acceptance command failed: {tail}"),
        }
    }
}

/// Parse `passed; failed` test counts from `cargo test` stdout.
///
/// Sums every `test result: ok. N passed; M failed; …` line (unit +
/// integration + doc suites each print one). Leading/trailing whitespace on a
/// line is ignored. Within a summary line, a number counts only when the token
/// right after it starts with `passed` or `failed`; other labels (`ignored`,
/// `measured`, …) are skipped.
///
/// Returns `(passed, passed + failed)`; `(0, 0)` when no such line is present.
/// Sums saturate at `u32::MAX` instead of overflowing, so absurd counts in the
/// output cannot make the judge panic.
pub fn parse_cargo_test_counts(stdout: &str) -> (u32, u32) {
    let mut passed = 0u32;
    let mut failed = 0u32;

    // e.g. "test result: ok. 3 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out"
    let summaries = stdout
        .lines()
        .filter_map(|line| line.trim().strip_prefix("test result:"));

    for summary in summaries {
        let toks: Vec<&str> = summary.split_whitespace().collect();
        // Each count is immediately followed by its label, so walk the tokens
        // as (number, label) pairs.
        for pair in toks.windows(2) {
            let (count, label) = (pair[0], pair[1]);
            if let Ok(n) = count.parse::<u32>() {
                if label.starts_with("passed") {
                    passed = passed.saturating_add(n);
                } else if label.starts_with("failed") {
                    failed = failed.saturating_add(n);
                }
            }
        }
    }

    (passed, passed.saturating_add(failed))
}

/// Pick one line of compiler/test output for a verdict message: the last line
/// that starts with `error` or contains `error[`, or — when there is none — the
/// last non-empty line. The result is trimmed and cut to at most 200 characters
/// so the message stays a one-liner; empty input yields an empty string.
fn last_error_line(s: &str) -> String {
    let mut last_nonempty: Option<&str> = None;
    let mut errorish: Option<&str> = None;

    // Walk from the end: stop at the first error-looking line, remembering the
    // first non-empty line seen on the way as the fallback.
    for line in s.lines().rev().map(str::trim) {
        if line.starts_with("error") || line.contains("error[") {
            errorish = Some(line);
            break;
        }
        if last_nonempty.is_none() && !line.is_empty() {
            last_nonempty = Some(line);
        }
    }

    truncate(errorish.or(last_nonempty).unwrap_or(""), 200)
}

/// Limit `s` to `n` characters (not bytes). A string that already fits is
/// returned unchanged; a longer one keeps its first `n - 1` characters and
/// gets a trailing `…`.
fn truncate(s: &str, n: usize) -> String {
    // No character at position `n` means the string has at most `n` chars.
    if s.char_indices().nth(n).is_none() {
        return s.to_string();
    }

    // Byte offset where the kept prefix of `n - 1` characters ends.
    let kept_chars = n.saturating_sub(1);
    let cut = s
        .char_indices()
        .nth(kept_chars)
        .map_or(s.len(), |(byte_idx, _)| byte_idx);

    let mut out = String::with_capacity(cut + '…'.len_utf8());
    out.push_str(&s[..cut]);
    out.push('…');
    out
}

/// Recursively copy the directory `src` into `dst`, creating `dst` as needed.
///
/// Entries named `target` or `.git` (the heavy, regenerable dirs) are left
/// out so the scratch crate is a clean, buildable copy. The name filter is
/// applied at every level of the tree, so a nested entry with one of those
/// names is skipped too. Symlinks are replaced by a copy of what they point
/// at, which keeps the scratch copy self-contained.
fn copy_dir_filtered(src: &Path, dst: &Path) -> Result<()> {
    const SKIPPED: [&str; 2] = ["target", ".git"];

    std::fs::create_dir_all(dst)?;
    for item in std::fs::read_dir(src)? {
        let item = item?;
        let file_name = item.file_name();
        let lossy = file_name.to_string_lossy();
        if SKIPPED.iter().any(|skip| lossy == *skip) {
            continue;
        }

        let source = item.path();
        let dest = dst.join(&file_name);
        let kind = item.file_type()?;

        if kind.is_dir() {
            copy_dir_filtered(&source, &dest)?;
            continue;
        }

        // A symlink is copied from its resolved target; anything else from
        // the entry itself.
        let followed_link = kind.is_symlink();
        let origin = if followed_link { std::fs::canonicalize(&source)? } else { source };
        if followed_link && origin.is_dir() {
            copy_dir_filtered(&origin, &dest)?;
        } else {
            std::fs::copy(&origin, &dest)?;
        }
    }
    Ok(())
}

// Unit tests: the first group checks the pure parsing/scoring helpers without
// spawning any process; the second group runs the whole judge against tiny
// throwaway crates and therefore needs `cargo` (and `git` for the patch case).
#[cfg(test)]
mod tests {
    use super::*;

    // ─── pure scoring math (no cargo spawn) ──────────────────────────────

    /// Counts from several `test result:` lines are added together, whether
    /// the suite reported `ok` or `FAILED`.
    #[test]
    fn parse_counts_sums_suites() {
        let out = "\
running 2 tests
test result: ok. 2 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out
running 1 test
test result: FAILED. 0 passed; 1 failed; 0 ignored; 0 measured; 0 filtered out
";
        let (p, t) = parse_cargo_test_counts(out);
        assert_eq!((p, t), (2, 3), "2 passed across suites, 3 total");
    }

    /// One assertion group per scoring branch of `score_from_run`.
    #[test]
    fn score_from_run_math() {
        // all pass → 1.0, accepted
        let v = score_from_run(true, 3, 3, "", "");
        assert_eq!(v.score, 1.0);
        assert!(v.compiled && v.accepted());

        // partial → fraction
        let v = score_from_run(false, 2, 3, "", "");
        assert!((v.score - 2.0 / 3.0).abs() < 1e-9);
        assert!(v.compiled && !v.accepted());

        // compiled, cmd failed, no count → 0.5
        let v = score_from_run(false, 0, 0, "", "boom");
        assert_eq!(v.score, 0.5);
        assert!(v.compiled && !v.accepted());

        // compiled, cmd ok, no count → 1.0
        let v = score_from_run(true, 0, 0, "", "");
        assert_eq!(v.score, 1.0);
    }

    /// `Verdict::rejected` yields a zero-score, not-compiled, not-accepted
    /// verdict that keeps the given reason in its message.
    #[test]
    fn rejected_verdict_is_zero() {
        let v = Verdict::rejected("did not compile: error[E0599]");
        assert_eq!(v.score, 0.0);
        assert!(!v.compiled);
        assert!(!v.accepted());
        assert!(v.message.contains("E0599"));
    }

    // ─── end-to-end judge against a real scratch crate ───────────────────
    //
    // LAW (inject-assert): feed the judge a KNOWN-GOOD, a NON-COMPILING, and a
    // COMPILING-BUT-FAILING answer; assert the real scores (1.0 / 0.0 / partial).
    // Each test builds a tiny throwaway crate so it never touches the repo and
    // needs only cargo (already required to run these tests).

    /// Build a minimal crate whose `src/lib.rs` is `body`. The crate lives in
    /// the returned temp dir, so the caller must keep that value alive while
    /// the judge reads from it.
    fn scratch_crate(body: &str) -> tempfile::TempDir {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();
        std::fs::write(
            root.join("Cargo.toml"),
            "[package]\nname = \"judgee\"\nversion = \"0.0.0\"\nedition = \"2021\"\n\n[lib]\npath = \"src/lib.rs\"\n",
        )
        .unwrap();
        std::fs::create_dir_all(root.join("src")).unwrap();
        std::fs::write(root.join("src/lib.rs"), body).unwrap();
        dir
    }

    /// The acceptance tests, baked into the answer file (so overwriting the
    /// whole `lib.rs` keeps them). `t_ok` always passes; `t_target` asserts
    /// `add(2, 2) == 4`.
    const ACCEPT_TESTS: &str = r#"
#[cfg(test)]
mod judge_tests {
    use super::*;
    #[test]
    fn t_ok() { assert!(true); }
    #[test]
    fn t_target() { assert_eq!(add(2, 2), 4); }
}
"#;

    /// An answer with a correct `add`, followed by the acceptance tests.
    fn good_answer() -> String {
        format!("pub fn add(a: i32, b: i32) -> i32 {{ a + b }}\n{ACCEPT_TESTS}")
    }
    /// An answer that fails to type-check, followed by the acceptance tests.
    fn noncompiling_answer() -> String {
        // `add` returns a &str but is typed i32 → type error.
        format!("pub fn add(_a: i32, _b: i32) -> i32 {{ \"nope\" }}\n{ACCEPT_TESTS}")
    }
    /// An answer that builds but computes the wrong sum, followed by the
    /// acceptance tests.
    fn failing_answer() -> String {
        // Compiles fine, but add(2,2) == 5 → t_target fails, t_ok passes → 1/2.
        format!("pub fn add(a: i32, b: i32) -> i32 {{ a + b + 1 }}\n{ACCEPT_TESTS}")
    }

    /// A write-file judge that overwrites `src/lib.rs` of the crate at `dir`.
    fn judge_for(dir: &Path) -> CodegenJudge {
        CodegenJudge::new(CodegenSpec::write_file(dir.to_path_buf(), "src/lib.rs"))
    }

    /// Known-good answer: compiles, both tests pass, score 1.0, accepted.
    #[test]
    fn known_good_answer_scores_one() {
        let scratch = scratch_crate("pub fn add(_a: i32, _b: i32) -> i32 { 0 }\n");
        let judge = judge_for(scratch.path());
        let v = judge.judge(&good_answer()).unwrap();
        assert!(v.compiled, "good answer compiles: {}", v.message);
        assert_eq!(v.tests_total, 2, "two acceptance tests");
        assert_eq!(v.tests_passed, 2);
        assert_eq!(v.score, 1.0, "all pass → 1.0: {}", v.message);
        assert!(v.accepted());
    }

    /// Non-compiling answer: rejected with score 0.0.
    #[test]
    fn noncompiling_answer_scores_zero() {
        let scratch = scratch_crate("pub fn add(_a: i32, _b: i32) -> i32 { 0 }\n");
        let judge = judge_for(scratch.path());
        let v = judge.judge(&noncompiling_answer()).unwrap();
        assert!(!v.compiled, "type-error answer must not compile");
        assert_eq!(v.score, 0.0, "non-compiling → 0.0: {}", v.message);
        assert!(!v.accepted());
    }

    /// Compiling-but-failing answer: one of two tests passes, score 0.5.
    #[test]
    fn compiling_but_failing_answer_scores_partial() {
        let scratch = scratch_crate("pub fn add(_a: i32, _b: i32) -> i32 { 0 }\n");
        let judge = judge_for(scratch.path());
        let v = judge.judge(&failing_answer()).unwrap();
        assert!(v.compiled, "answer compiles: {}", v.message);
        assert_eq!(v.tests_total, 2);
        assert_eq!(v.tests_passed, 1, "only t_ok passes; t_target fails");
        assert!((v.score - 0.5).abs() < 1e-9, "1/2 tests → 0.5: {}", v.message);
        assert!(!v.accepted());
    }

    /// Patch mode: a patch that cannot be applied is a 0.0 verdict (not an
    /// `Err`) and is reported as not compiled.
    #[test]
    fn patch_that_does_not_apply_scores_zero() {
        let scratch = scratch_crate("pub fn add(_a: i32, _b: i32) -> i32 { 0 }\n");
        let judge = CodegenJudge::new(CodegenSpec::apply_patch(scratch.path().to_path_buf()));
        // A bogus patch that references a non-existent file → git apply fails.
        let bad = "--- a/nope.rs\n+++ b/nope.rs\n@@ -1 +1 @@\n-x\n+y\n";
        let v = judge.judge(bad).unwrap();
        assert_eq!(v.score, 0.0, "unappliable patch → 0.0: {}", v.message);
        assert!(!v.compiled);
    }
}