// roder-evals 0.1.1
//
// Agentic software development tools and SDKs for Roder.
// Documentation
use std::collections::BTreeSet;
use std::path::{Path, PathBuf};
use std::process::Command;

use anyhow::Context;

use crate::{EvalFailureClass, EvalFixture};

/// Handle to the on-disk directory an eval fixture runs in.
///
/// The `Drop` implementation for this type removes the directory tree at
/// `path` on a best-effort basis, so the handle must be kept alive for as long
/// as the workspace is in use.
pub(super) struct EvalWorkspace {
    /// Root directory of the workspace.
    pub path: PathBuf,
}

impl Drop for EvalWorkspace {
    /// Removes the workspace directory tree.
    ///
    /// Cleanup is best effort: a destructor has no way to report failure, so
    /// any error from the removal is deliberately discarded.
    fn drop(&mut self) {
        std::fs::remove_dir_all(self.path.as_path()).ok();
    }
}

pub(super) fn create_workspace(fixture: &EvalFixture) -> anyhow::Result<EvalWorkspace> {
    let path = std::env::temp_dir().join(format!(
        "roder-eval-{}-{}",
        fixture.id,
        uuid::Uuid::new_v4()
    ));
    std::fs::create_dir_all(&path)?;
    for file in &fixture.workspace.files {
        let file_path = safe_workspace_path(&path, &file.path)?;
        if let Some(parent) = file_path.parent() {
            std::fs::create_dir_all(parent)?;
        }
        std::fs::write(file_path, &file.contents)?;
    }
    Ok(EvalWorkspace { path })
}

/// Runs every setup command declared in `fixture.workspace.commands`, in order,
/// with `workspace` as the working directory.
///
/// # Errors
///
/// Stops at the first command that cannot be started or that exits
/// unsuccessfully; later commands are not run.
pub(super) fn run_workspace_setup(fixture: &EvalFixture, workspace: &Path) -> anyhow::Result<()> {
    for command in &fixture.workspace.commands {
        let status = run_shell_command(command, workspace)?.status;
        anyhow::ensure!(
            status.success(),
            "setup command `{}` failed with status {:?}",
            command,
            status.code()
        );
    }
    Ok(())
}

pub(super) fn grade_expected_evidence(
    fixture: &EvalFixture,
    workspace: &Path,
    final_answer: &str,
) -> anyhow::Result<()> {
    for needle in &fixture.expected.final_answer_contains {
        if !final_answer.contains(needle) {
            anyhow::bail!("final answer did not contain `{needle}`");
        }
    }
    for expected in &fixture.expected.files {
        let path = safe_workspace_path(workspace, &expected.path)?;
        if expected.exists && !path.exists() {
            anyhow::bail!("expected file missing: {}", expected.path.display());
        }
        if !expected.exists && path.exists() {
            anyhow::bail!("file should not exist: {}", expected.path.display());
        }
        if path.exists() {
            let contents = std::fs::read_to_string(&path)
                .with_context(|| format!("failed to read {}", expected.path.display()))?;
            if let Some(exact) = &expected.exact_contents
                && contents != *exact
            {
                anyhow::bail!(
                    "{} did not exactly match expected contents",
                    expected.path.display()
                );
            }
            if let Some(max_bytes) = expected.max_bytes {
                let actual = contents.len() as u64;
                if actual > max_bytes {
                    anyhow::bail!(
                        "{} was {actual} bytes, exceeding maxBytes {max_bytes}",
                        expected.path.display()
                    );
                }
            }
            if let Some(allowed_chars) = &expected.allowed_chars {
                let allowed = allowed_chars.chars().collect::<BTreeSet<_>>();
                for character in contents.chars() {
                    if !allowed.contains(&character) {
                        anyhow::bail!(
                            "{} contained disallowed character {:?}",
                            expected.path.display(),
                            character
                        );
                    }
                }
            }
            if !expected.json_array_fields.is_empty() {
                let json: serde_json::Value =
                    serde_json::from_str(&contents).with_context(|| {
                        format!("{} did not contain valid JSON", expected.path.display())
                    })?;
                for field in &expected.json_array_fields {
                    let Some(value) = json_field(&json, field) else {
                        anyhow::bail!("{} missing JSON field `{field}`", expected.path.display());
                    };
                    if !value.is_array() {
                        anyhow::bail!(
                            "{} JSON field `{field}` was not an array",
                            expected.path.display()
                        );
                    }
                }
            }
            for needle in &expected.contains {
                if !contents.contains(needle) {
                    anyhow::bail!("{} did not contain `{needle}`", expected.path.display());
                }
            }
        }
    }
    for check in &fixture.expected.command_checks {
        let output = run_shell_command(&check.command, workspace)?;
        let code = output.status.code().unwrap_or(-1);
        if code != check.expected_exit_code {
            anyhow::bail!(
                "command `{}` exited {code}, expected {}",
                check.command,
                check.expected_exit_code
            );
        }
        let stdout = String::from_utf8_lossy(&output.stdout);
        for needle in &check.stdout_contains {
            if !stdout.contains(needle) {
                anyhow::bail!("command `{}` stdout missing `{needle}`", check.command);
            }
        }
        let stderr = String::from_utf8_lossy(&output.stderr);
        for needle in &check.stderr_contains {
            if !stderr.contains(needle) {
                anyhow::bail!("command `{}` stderr missing `{needle}`", check.command);
            }
        }
    }
    Ok(())
}

/// Maps a fixture's tags to the failure class reported for it.
///
/// `tool-misuse` takes precedence and yields [`EvalFailureClass::ToolSchema`];
/// otherwise `verification-before-final` yields [`EvalFailureClass::Verifier`];
/// with neither tag present the class is [`EvalFailureClass::Model`].
pub(super) fn failure_class_for_fixture(fixture: &EvalFixture) -> EvalFailureClass {
    let has_tag = |wanted: &str| fixture.tags.iter().any(|tag| tag == wanted);

    if has_tag("tool-misuse") {
        return EvalFailureClass::ToolSchema;
    }
    if has_tag("verification-before-final") {
        return EvalFailureClass::Verifier;
    }
    EvalFailureClass::Model
}

/// Runs `command` through the platform shell with `cwd` as the working
/// directory and collects its exit status, stdout and stderr.
///
/// The shell is `cmd /C` on Windows and `sh -c` everywhere else. A command
/// that runs but exits unsuccessfully is still `Ok`; callers inspect the
/// returned status themselves.
///
/// # Errors
///
/// Returns an error only when the shell process could not be run at all.
fn run_shell_command(command: &str, cwd: &Path) -> anyhow::Result<std::process::Output> {
    let (program, run_flag) = if cfg!(windows) {
        ("cmd", "/C")
    } else {
        ("sh", "-c")
    };

    Command::new(program)
        .arg(run_flag)
        .arg(command)
        .current_dir(cwd)
        .output()
        .with_context(|| format!("failed to run `{command}`"))
}

/// Joins `relative` onto `root`, refusing any path that could resolve outside
/// of `root`.
///
/// Only plain name components and `.` are accepted. Everything else is
/// rejected: `..`, a leading root (`/foo`, or `\foo` on Windows) and Windows
/// prefixes (`C:foo`, `C:\foo`, UNC paths). Checking `is_absolute()` alone is
/// not enough on Windows, where rooted and drive-relative paths are *not*
/// absolute yet still make `Path::join` discard all or part of `root`.
///
/// This is a purely lexical check; symbolic links inside the workspace are
/// not resolved.
///
/// # Errors
///
/// Returns an error if `relative` contains any rejected component.
fn safe_workspace_path(root: &Path, relative: &Path) -> anyhow::Result<PathBuf> {
    use std::path::Component;

    let stays_inside = relative
        .components()
        .all(|part| matches!(part, Component::Normal(_) | Component::CurDir));
    if !stays_inside {
        anyhow::bail!("workspace paths must be relative and stay inside the temp workspace");
    }
    Ok(root.join(relative))
}

/// Looks up a nested field in `value` using a dot-separated `path`.
///
/// Each segment of `path` is used as an object key on the value reached so
/// far. Returns `None` as soon as the current value is not a JSON object or
/// does not have the requested key.
fn json_field<'a>(value: &'a serde_json::Value, path: &str) -> Option<&'a serde_json::Value> {
    let mut current = value;
    for key in path.split('.') {
        current = current.as_object()?.get(key)?;
    }
    Some(current)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Creates a uniquely named scratch directory under the system temp dir.
    ///
    /// The directory is wrapped in an [`EvalWorkspace`] so that it is removed
    /// when the guard is dropped, including when a test assertion panics.
    fn scratch_workspace(label: &str) -> EvalWorkspace {
        let path = std::env::temp_dir().join(format!("{label}-{}", uuid::Uuid::new_v4()));
        std::fs::create_dir_all(&path).unwrap();
        EvalWorkspace { path }
    }

    #[test]
    fn expected_file_contract_accepts_exact_json_array_size_and_allowed_chars() {
        let workspace = scratch_workspace("roder-tbench-contract-ok");
        let root = &workspace.path;
        std::fs::write(root.join("out.txt"), "flag{gc0d3_iz_ch4LLenGiNg}\n").unwrap();
        std::fs::write(
            root.join("sam.json"),
            r#"{"coords_x":[1,2],"coords_y":[3,4]}"#,
        )
        .unwrap();
        std::fs::write(root.join("gblock.txt"), "ACGTACGT\n").unwrap();
        let fixture: EvalFixture = serde_json::from_value(serde_json::json!({
            "id": "tbench-contract-ok",
            "title": "TBench exact file contract",
            "prompt": "Check the output contracts.",
            "expected": {
                "files": [
                    {
                        "path": "out.txt",
                        "exactContents": "flag{gc0d3_iz_ch4LLenGiNg}\n"
                    },
                    {
                        "path": "sam.json",
                        "jsonArrayFields": ["coords_x", "coords_y"]
                    },
                    {
                        "path": "gblock.txt",
                        "maxBytes": 3000,
                        "allowedChars": "ACGT\n"
                    }
                ]
            }
        }))
        .unwrap();

        let result = grade_expected_evidence(&fixture, root, "done");

        assert!(result.is_ok(), "{result:?}");
    }

    #[test]
    fn expected_file_contract_rejects_non_array_json_field() {
        let workspace = scratch_workspace("roder-tbench-contract-bad-json");
        let root = &workspace.path;
        std::fs::write(root.join("sam.json"), r#"{"coords_x":"(1, 2)"}"#).unwrap();
        let fixture: EvalFixture = serde_json::from_value(serde_json::json!({
            "id": "tbench-contract-bad-json",
            "title": "TBench bad JSON contract",
            "prompt": "Check the output contracts.",
            "expected": {
                "files": [
                    {
                        "path": "sam.json",
                        "jsonArrayFields": ["coords_x"]
                    }
                ]
            }
        }))
        .unwrap();

        let error = grade_expected_evidence(&fixture, root, "done").unwrap_err();

        assert!(
            error
                .to_string()
                .contains("sam.json JSON field `coords_x` was not an array"),
            "{error}"
        );
    }

    #[test]
    fn expected_file_contract_rejects_overlong_or_disallowed_sequence() {
        let workspace = scratch_workspace("roder-tbench-contract-bad-sequence");
        let root = &workspace.path;
        std::fs::write(root.join("gblock.txt"), "ACGTNN\n").unwrap();
        let fixture: EvalFixture = serde_json::from_value(serde_json::json!({
            "id": "tbench-contract-bad-sequence",
            "title": "TBench bad sequence contract",
            "prompt": "Check the output contracts.",
            "expected": {
                "files": [
                    {
                        "path": "gblock.txt",
                        "maxBytes": 4,
                        "allowedChars": "ACGT\n"
                    }
                ]
            }
        }))
        .unwrap();

        let error = grade_expected_evidence(&fixture, root, "done").unwrap_err();

        // The file violates both constraints; either report is acceptable.
        let message = error.to_string();
        assert!(
            message.contains("gblock.txt was 7 bytes, exceeding maxBytes 4")
                || message.contains("gblock.txt contained disallowed character"),
            "{error}"
        );
    }
}