// Coding-agent harness quality suite used by `harn eval coding-agent`.
//
// Each fixture stays tiny and deterministic so provider/model/tool-format
// differences are visible without turning this command into a full coding
// benchmark.
import { agent_host_tools } from "std/agent/host_tools"
import { audit_agent, repair_agent, summary_agent } from "std/agent/presets"
import { parse_args } from "std/cli"
import { command_run } from "std/command"
import { edit_apply_old_new_patch } from "std/edit"
import { write_json } from "std/fs"
import { write_jsonl } from "std/jsonl"
// Reports whether any "/"-separated segment of `path` is exactly "..".
// A nil `path` is treated as the empty string.
fn __has_parent_segment(path: string) -> bool {
  let raw = path ?? ""
  // Wrapping the path in separators lets a single substring probe match a
  // ".." segment at the start, in the middle, or at the end of the path.
  let wrapped = "/" + raw + "/"
  return contains(wrapped, "/../")
}
// Normalizes a tool-supplied path and rejects anything that is not a plain
// relative path.
//
// The input is trimmed and backslashes are rewritten to "/". Throws when the
// result is empty, starts with "/", or contains a ".." segment; otherwise the
// normalized path is returned.
fn __safe_relative_path(path: string) -> string {
  let trimmed = trim(path ?? "")
  let candidate = replace(trimmed, "\\", "/")
  if candidate == "" {
    throw "path must be non-empty"
  }
  let escape_message = "path must be relative and must not contain '..'"
  if starts_with(candidate, "/") {
    throw escape_message
  }
  if __has_parent_segment(candidate) {
    throw escape_message
  }
  return candidate
}
// Joins `root` with `path` after `path` has passed `__safe_relative_path`,
// so a rejected path throws before any join happens.
fn __workspace_path(root: string, path: string) -> string {
  let relative = __safe_relative_path(path)
  return path_join(root, relative)
}
// Seeds the `python-add` fixture under `root`: a deliberately broken
// `math_utils.py` (`add` subtracts), a unittest module in `tests/` that
// expects `add(2, 3) == 5`, and a short README.
fn __seed_python_add(harness: Harness, root: string) {
  harness.fs.mkdir(path_join(root, "tests"))
  harness.fs.write_text(path_join(root, "math_utils.py"), "def add(a, b):\n return a - b\n")
  // The test method's body must be indented deeper than its `def`; with both
  // at the same depth Python raises IndentationError on import and the suite
  // could never pass, no matter how `add` is fixed. The fixture files use one
  // space per nesting level, so the assertion gets two.
  harness.fs.write_text(
    path_join(root, "tests", "test_math_utils.py"),
    "import unittest\n\nfrom math_utils import add\n\n\nclass MathUtilsTest(unittest.TestCase):\n def test_adds_two_numbers(self):\n  self.assertEqual(add(2, 3), 5)\n\n\nif __name__ == \"__main__\":\n unittest.main()\n",
  )
  harness.fs.write_text(
    path_join(root, "README.md"),
    "# Mini coding-agent fixture\n\nThe `add` helper is intentionally broken.\n",
  )
}
// Seeds the `cli-help-flag` fixture under `root`: an argparse-based `app.py`
// that prints a greeting for a positional name, plus a README describing it.
fn __seed_cli_help_flag(harness: Harness, root: string) {
  let app_source = "import argparse\n\n\ndef build_parser():\n parser = argparse.ArgumentParser(description=\"Print a greeting.\")\n parser.add_argument(\"name\", help=\"Name to greet.\")\n return parser\n\n\ndef main(argv=None):\n args = build_parser().parse_args(argv)\n print(f\"Hello, {args.name}!\")\n\n\nif __name__ == \"__main__\":\n main()\n"
  let readme_source = "# Greeting CLI\n\nRun `python app.py Ada` to print `Hello, Ada!`.\n"
  harness.fs.write_text(path_join(root, "app.py"), app_source)
  harness.fs.write_text(path_join(root, "README.md"), readme_source)
}
// Seeds the `test-output-first` fixture under `root`: a `median` helper that
// always returns the upper-middle element, and a unittest module in `tests/`
// covering both an even-length and an odd-length input.
fn __seed_test_output_first(harness: Harness, root: string) {
  harness.fs.mkdir(path_join(root, "tests"))
  harness.fs.write_text(
    path_join(root, "stats_utils.py"),
    "def median(values):\n ordered = sorted(values)\n mid = len(ordered) // 2\n return ordered[mid]\n",
  )
  // Each test method's body must be indented deeper than its `def`; with both
  // at the same depth Python raises IndentationError on import, so the agent
  // would see an import error instead of the intended failing assertion. The
  // fixture files use one space per nesting level, so the assertions get two.
  harness.fs.write_text(
    path_join(root, "tests", "test_stats_utils.py"),
    "import unittest\n\nfrom stats_utils import median\n\n\nclass StatsUtilsTest(unittest.TestCase):\n def test_even_count_uses_average_of_middle_pair(self):\n  self.assertEqual(median([10, 2, 4, 6]), 5)\n\n def test_odd_count_uses_middle_value(self):\n  self.assertEqual(median([9, 1, 4]), 4)\n\n\nif __name__ == \"__main__\":\n unittest.main()\n",
  )
}
// Seeds the `docs-symbol-rename` fixture under `root`: `greeter.py` already
// defines `format_greeting`, while `example.py` and the README still refer to
// the old name `make_greeting`.
fn __seed_docs_symbol_rename(harness: Harness, root: string) {
  let greeter_source = "def format_greeting(name):\n return f\"Hello, {name}!\"\n"
  let example_source = "from greeter import make_greeting\n\n\nprint(make_greeting(\"Ada\"))\n"
  let readme_source = "# Greeter\n\nUse `make_greeting(name)` to build the greeting shown in `example.py`.\n"
  harness.fs.write_text(path_join(root, "greeter.py"), greeter_source)
  harness.fs.write_text(path_join(root, "example.py"), example_source)
  harness.fs.write_text(path_join(root, "README.md"), readme_source)
}
// Seeds the `read-only-audit` fixture under `root`: a settings module and a
// README that both state a 30-second default timeout.
fn __seed_read_only_audit(harness: Harness, root: string) {
  let settings_file = path_join(root, "settings.py")
  harness.fs.write_text(settings_file, "DEFAULT_TIMEOUT_SECONDS = 30\n")
  let readme_file = path_join(root, "README.md")
  harness.fs.write_text(readme_file, "# Settings\n\nThe default timeout is 30 seconds.\n")
}
// Creates `root` and populates it with the files for `fixture`.
// Throws "unknown fixture: <name>" (after creating `root`) when the fixture
// id is not recognized.
fn __seed_workspace(harness: Harness, root: string, fixture: string) {
  harness.fs.mkdir(root)
  // `no-tool-diagnosis` is a known fixture that needs no files beyond `root`.
  var recognized = fixture == "no-tool-diagnosis"
  if fixture == "python-add" {
    __seed_python_add(harness, root)
    recognized = true
  }
  if fixture == "cli-help-flag" {
    __seed_cli_help_flag(harness, root)
    recognized = true
  }
  if fixture == "test-output-first" {
    __seed_test_output_first(harness, root)
    recognized = true
  }
  if fixture == "docs-symbol-rename" {
    __seed_docs_symbol_rename(harness, root)
    recognized = true
  }
  if fixture == "read-only-audit" {
    __seed_read_only_audit(harness, root)
    recognized = true
  }
  if !recognized {
    throw "unknown fixture: " + fixture
  }
}
// Defines the `replace_in_file` tool on `registry` and returns the result of
// `tool_define`.
//
// The handler resolves `args.path` against `root` (rejecting unsafe paths via
// `__workspace_path`), applies an old/new text patch to the file contents, and
// returns a JSON string describing the outcome.
fn __replace_tool(harness: Harness, registry, root: string) {
  let apply_edit = { args ->
    let target = __workspace_path(root, args.path)
    let original = harness.fs.read_text(target)
    let outcome = edit_apply_old_new_patch(
      original,
      args.old_text,
      args.new_text,
      {strip_line_numbers: true, structural_require_anchored_lines: "either"},
    )
    // The file is only rewritten when the patch reports success.
    if outcome.ok {
      harness.fs.write_text(target, outcome.patched)
    }
    let report = {
      ok: outcome.ok,
      changed: outcome.changed,
      match_kind: outcome.match_kind,
      error_code: outcome?.error_code,
      message: outcome?.message,
      changed_regions: outcome.changed_regions,
      before_sha256: outcome.before_sha256,
      after_sha256: outcome.after_sha256,
    }
    return json_stringify(report)
  }
  let spec = {
    parameters: {path: {type: "string"}, old_text: {type: "string"}, new_text: {type: "string"}},
    returns: {type: "object"},
    annotations: {kind: "edit", side_effect_level: "workspace_write"},
    handler: apply_edit,
  }
  return tool_define(
    registry,
    "replace_in_file",
    "Replace one exact or structurally matched text region in a UTF-8 file under the workspace root.",
    spec,
  )
}
// Builds the tool registry handed to the agent for `fixture`.
//
// - `no-tool-diagnosis`: nil (no tools).
// - `read-only-audit`: host tools limited to `read_file`.
// - anything else: read/list/search/run host tools, with `run_command`
//   restricted to argv starting with `python`, plus `replace_in_file`.
fn __fixture_tools(harness: Harness, root: string, python: string, fixture: string) {
  if fixture == "no-tool-diagnosis" {
    return nil
  }
  if fixture == "read-only-audit" {
    let audit_config = {root: root, cwd: root, max_inline_bytes: 4000, enabled_tools: ["read_file"]}
    return agent_host_tools(nil, audit_config)
  }
  let repair_config = {
    root: root,
    cwd: root,
    max_inline_bytes: 8000,
    allow_argv_prefixes: [[python]],
    enabled_tools: ["read_file", "list_directory", "search_files", "run_command"],
    search_exclude_globs: ["__pycache__/**"],
  }
  let host_registry = agent_host_tools(nil, repair_config)
  return __replace_tool(harness, host_registry, root)
}
// Registers, in order, the scripted native tool-call responses for the
// `python-add` fixture: read the module, swap `-` for `+`, run the unittest
// suite with `python`, then give the final answer.
fn __seed_python_add_native_mock(python: string) {
  let read_call = {id: "read_math", name: "read_file", arguments: {path: "math_utils.py"}}
  let edit_call = {
    id: "replace_math",
    name: "replace_in_file",
    arguments: {
      path: "math_utils.py",
      old_text: "def add(a, b):\n return a - b\n",
      new_text: "def add(a, b):\n return a + b\n",
    },
  }
  let test_call = {
    id: "run_tests",
    name: "run_command",
    arguments: {argv: [python, "-m", "unittest", "discover", "-s", "tests"], capture: {max_inline_bytes: 4000}},
  }
  llm_mock({text: "", tool_calls: [read_call]})
  llm_mock({text: "", tool_calls: [edit_call]})
  llm_mock({text: "", tool_calls: [test_call]})
  llm_mock({text: "Fixed add and verified the unittest suite passes."})
}
// Registers, in order, the scripted text-format responses for the
// `python-add` fixture. Tool calls are embedded in `<tool_call>` blocks and
// the last response carries the `##DONE##` sentinel. `python` is JSON-encoded
// before being spliced into the `run_command` argv.
fn __seed_python_add_text_mock(python: string) {
  let python_literal = json_stringify(python)
  let read_step = "<tool_call>\nread_file({ path: \"math_utils.py\" })\n</tool_call>"
  let edit_step = "<tool_call>\nreplace_in_file({ path: \"math_utils.py\", old_text: \"def add(a, b):\\n return a - b\\n\", new_text: \"def add(a, b):\\n return a + b\\n\" })\n</tool_call>"
  let test_step = "<tool_call>\nrun_command({ argv: ["
    + python_literal
    + ", \"-m\", \"unittest\", \"discover\", \"-s\", \"tests\"], capture: { max_inline_bytes: 4000 } })\n</tool_call>"
  let final_step = "<user_response>Fixed add and verified the unittest suite passes.</user_response>\n<done>##DONE##</done>"
  llm_mock({text: read_step})
  llm_mock({text: edit_step})
  llm_mock({text: test_step})
  llm_mock({text: final_step})
}
// Registers, in order, the scripted native tool-call responses for the
// `cli-help-flag` fixture: read `app.py`, add the `--shout` flag, document it
// in the README, run the CLI with `--shout`, then give the final answer.
fn __seed_cli_help_flag_native_mock(python: string) {
  llm_mock({text: "", tool_calls: [{id: "read_app", name: "read_file", arguments: {path: "app.py"}}]})
  // In `new_text` the statement under `if args.shout:` must be indented deeper
  // than the `if`; at the same depth the patched app.py fails to parse with
  // IndentationError and verification can never succeed. The fixture uses one
  // space per nesting level, so the nested assignment gets two.
  let app_edit = {
    id: "replace_app",
    name: "replace_in_file",
    arguments: {
      path: "app.py",
      old_text: "def build_parser():\n parser = argparse.ArgumentParser(description=\"Print a greeting.\")\n parser.add_argument(\"name\", help=\"Name to greet.\")\n return parser\n\n\ndef main(argv=None):\n args = build_parser().parse_args(argv)\n print(f\"Hello, {args.name}!\")\n",
      new_text: "def build_parser():\n parser = argparse.ArgumentParser(description=\"Print a greeting.\")\n parser.add_argument(\"--shout\", action=\"store_true\", help=\"Print the greeting in uppercase.\")\n parser.add_argument(\"name\", help=\"Name to greet.\")\n return parser\n\n\ndef main(argv=None):\n args = build_parser().parse_args(argv)\n greeting = f\"Hello, {args.name}!\"\n if args.shout:\n  greeting = greeting.upper()\n print(greeting)\n",
    },
  }
  llm_mock({text: "", tool_calls: [app_edit]})
  let readme_edit = {
    id: "replace_readme",
    name: "replace_in_file",
    arguments: {
      path: "README.md",
      old_text: "# Greeting CLI\n\nRun `python app.py Ada` to print `Hello, Ada!`.\n",
      new_text: "# Greeting CLI\n\nRun `python app.py Ada` to print `Hello, Ada!`.\nRun `python app.py --shout Ada` to print `HELLO, ADA!`.\n",
    },
  }
  llm_mock({text: "", tool_calls: [readme_edit]})
  let shout_run = {
    id: "run_shout",
    name: "run_command",
    arguments: {argv: [python, "app.py", "--shout", "Ada"], capture: {max_inline_bytes: 4000}},
  }
  llm_mock({text: "", tool_calls: [shout_run]})
  llm_mock({text: "Added --shout, documented it, and verified uppercase output."})
}
// Registers, in order, the scripted text-format responses for the
// `cli-help-flag` fixture. Tool calls are embedded in `<tool_call>` blocks and
// the last response carries the `##DONE##` sentinel. `python` is JSON-encoded
// before being spliced into the `run_command` argv.
fn __seed_cli_help_flag_text_mock(python: string) {
  let python_literal = json_stringify(python)
  llm_mock({text: "<tool_call>\nread_file({ path: \"app.py\" })\n</tool_call>"})
  // In `new_text` the statement under `if args.shout:` must be indented deeper
  // than the `if`; at the same depth the patched app.py fails to parse with
  // IndentationError and verification can never succeed. The fixture uses one
  // space per nesting level, so the nested assignment gets two.
  llm_mock(
    {
      text: "<tool_call>\nreplace_in_file({ path: \"app.py\", old_text: \"def build_parser():\\n parser = argparse.ArgumentParser(description=\\\"Print a greeting.\\\")\\n parser.add_argument(\\\"name\\\", help=\\\"Name to greet.\\\")\\n return parser\\n\\n\\ndef main(argv=None):\\n args = build_parser().parse_args(argv)\\n print(f\\\"Hello, {args.name}!\\\")\\n\", new_text: \"def build_parser():\\n parser = argparse.ArgumentParser(description=\\\"Print a greeting.\\\")\\n parser.add_argument(\\\"--shout\\\", action=\\\"store_true\\\", help=\\\"Print the greeting in uppercase.\\\")\\n parser.add_argument(\\\"name\\\", help=\\\"Name to greet.\\\")\\n return parser\\n\\n\\ndef main(argv=None):\\n args = build_parser().parse_args(argv)\\n greeting = f\\\"Hello, {args.name}!\\\"\\n if args.shout:\\n  greeting = greeting.upper()\\n print(greeting)\\n\" })\n</tool_call>",
    },
  )
  llm_mock(
    {
      text: "<tool_call>\nreplace_in_file({ path: \"README.md\", old_text: \"# Greeting CLI\\n\\nRun `python app.py Ada` to print `Hello, Ada!`.\\n\", new_text: \"# Greeting CLI\\n\\nRun `python app.py Ada` to print `Hello, Ada!`.\\nRun `python app.py --shout Ada` to print `HELLO, ADA!`.\\n\" })\n</tool_call>",
    },
  )
  llm_mock(
    {
      text: "<tool_call>\nrun_command({ argv: [" + python_literal
        + ", \"app.py\", \"--shout\", \"Ada\"], capture: { max_inline_bytes: 4000 } })\n</tool_call>",
    },
  )
  llm_mock(
    {
      text: "<user_response>Added --shout, documented it, and verified uppercase output.</user_response>\n<done>##DONE##</done>",
    },
  )
}
// Registers, in order, the scripted native tool-call responses for the
// `test-output-first` fixture: run the failing suite, read the module, patch
// `median` for even-length input, re-run the suite, then give the final
// answer.
fn __seed_test_output_first_native_mock(python: string) {
  let unittest_argv = [python, "-m", "unittest", "discover", "-s", "tests"]
  llm_mock(
    {
      text: "",
      tool_calls: [
        {
          id: "run_failing_tests",
          name: "run_command",
          arguments: {argv: unittest_argv, capture: {max_inline_bytes: 4000}},
        },
      ],
    },
  )
  llm_mock(
    {text: "", tool_calls: [{id: "read_stats", name: "read_file", arguments: {path: "stats_utils.py"}}]},
  )
  // In `new_text` the `return` under `if len(ordered) % 2 == 0:` must be
  // indented deeper than the `if`; at the same depth the patched module fails
  // to parse with IndentationError and the re-run can never pass. The fixture
  // uses one space per nesting level, so the nested `return` gets two.
  llm_mock(
    {
      text: "",
      tool_calls: [
        {
          id: "replace_stats",
          name: "replace_in_file",
          arguments: {
            path: "stats_utils.py",
            old_text: "def median(values):\n ordered = sorted(values)\n mid = len(ordered) // 2\n return ordered[mid]\n",
            new_text: "def median(values):\n ordered = sorted(values)\n mid = len(ordered) // 2\n if len(ordered) % 2 == 0:\n  return (ordered[mid - 1] + ordered[mid]) / 2\n return ordered[mid]\n",
          },
        },
      ],
    },
  )
  llm_mock(
    {
      text: "",
      tool_calls: [
        {
          id: "run_passing_tests",
          name: "run_command",
          arguments: {argv: unittest_argv, capture: {max_inline_bytes: 4000}},
        },
      ],
    },
  )
  llm_mock({text: "Used the failing test output, fixed median, and re-ran the suite."})
}
// Registers, in order, the scripted text-format responses for the
// `test-output-first` fixture. The same `run_command` payload is used for the
// initial failing run and the final re-run; the last response carries the
// `##DONE##` sentinel. `python` is JSON-encoded before being spliced into the
// argv.
fn __seed_test_output_first_text_mock(python: string) {
  let python_literal = json_stringify(python)
  let run_tests = "<tool_call>\nrun_command({ argv: [" + python_literal
    + ", \"-m\", \"unittest\", \"discover\", \"-s\", \"tests\"], capture: { max_inline_bytes: 4000 } })\n</tool_call>"
  llm_mock({text: run_tests})
  llm_mock({text: "<tool_call>\nread_file({ path: \"stats_utils.py\" })\n</tool_call>"})
  // In `new_text` the `return` under `if len(ordered) % 2 == 0:` must be
  // indented deeper than the `if`; at the same depth the patched module fails
  // to parse with IndentationError and the re-run can never pass. The fixture
  // uses one space per nesting level, so the nested `return` gets two.
  llm_mock(
    {
      text: "<tool_call>\nreplace_in_file({ path: \"stats_utils.py\", old_text: \"def median(values):\\n ordered = sorted(values)\\n mid = len(ordered) // 2\\n return ordered[mid]\\n\", new_text: \"def median(values):\\n ordered = sorted(values)\\n mid = len(ordered) // 2\\n if len(ordered) % 2 == 0:\\n  return (ordered[mid - 1] + ordered[mid]) / 2\\n return ordered[mid]\\n\" })\n</tool_call>",
    },
  )
  llm_mock({text: run_tests})
  llm_mock(
    {
      text: "<user_response>Used the failing test output, fixed median, and re-ran the suite.</user_response>\n<done>##DONE##</done>",
    },
  )
}
// Registers, in order, the scripted native tool-call responses for the
// `docs-symbol-rename` fixture: read `greeter.py` and `example.py`, rename the
// symbol in the example and the README, run the example with `python`, then
// give the final answer.
fn __seed_docs_symbol_rename_native_mock(python: string) {
  let read_greeter = {id: "read_greeter", name: "read_file", arguments: {path: "greeter.py"}}
  let read_example = {id: "read_example", name: "read_file", arguments: {path: "example.py"}}
  let edit_example = {
    id: "replace_example",
    name: "replace_in_file",
    arguments: {
      path: "example.py",
      old_text: "from greeter import make_greeting\n\n\nprint(make_greeting(\"Ada\"))\n",
      new_text: "from greeter import format_greeting\n\n\nprint(format_greeting(\"Ada\"))\n",
    },
  }
  let edit_readme = {
    id: "replace_readme",
    name: "replace_in_file",
    arguments: {
      path: "README.md",
      old_text: "# Greeter\n\nUse `make_greeting(name)` to build the greeting shown in `example.py`.\n",
      new_text: "# Greeter\n\nUse `format_greeting(name)` to build the greeting shown in `example.py`.\n",
    },
  }
  let run_example = {
    id: "run_example",
    name: "run_command",
    arguments: {argv: [python, "example.py"], capture: {max_inline_bytes: 4000}},
  }
  llm_mock({text: "", tool_calls: [read_greeter]})
  llm_mock({text: "", tool_calls: [read_example]})
  llm_mock({text: "", tool_calls: [edit_example]})
  llm_mock({text: "", tool_calls: [edit_readme]})
  llm_mock({text: "", tool_calls: [run_example]})
  llm_mock({text: "Updated the docs and example to use format_greeting."})
}
// Registers, in order, the scripted text-format responses for the
// `docs-symbol-rename` fixture. Tool calls are embedded in `<tool_call>`
// blocks and the last response carries the `##DONE##` sentinel. `python` is
// JSON-encoded before being spliced into the `run_command` argv.
fn __seed_docs_symbol_rename_text_mock(python: string) {
  let python_literal = json_stringify(python)
  let read_greeter = "<tool_call>\nread_file({ path: \"greeter.py\" })\n</tool_call>"
  let read_example = "<tool_call>\nread_file({ path: \"example.py\" })\n</tool_call>"
  let edit_example = "<tool_call>\nreplace_in_file({ path: \"example.py\", old_text: \"from greeter import make_greeting\\n\\n\\nprint(make_greeting(\\\"Ada\\\"))\\n\", new_text: \"from greeter import format_greeting\\n\\n\\nprint(format_greeting(\\\"Ada\\\"))\\n\" })\n</tool_call>"
  let edit_readme = "<tool_call>\nreplace_in_file({ path: \"README.md\", old_text: \"# Greeter\\n\\nUse `make_greeting(name)` to build the greeting shown in `example.py`.\\n\", new_text: \"# Greeter\\n\\nUse `format_greeting(name)` to build the greeting shown in `example.py`.\\n\" })\n</tool_call>"
  let run_example = "<tool_call>\nrun_command({ argv: [" + python_literal
    + ", \"example.py\"], capture: { max_inline_bytes: 4000 } })\n</tool_call>"
  let final_step = "<user_response>Updated the docs and example to use format_greeting.</user_response>\n<done>##DONE##</done>"
  llm_mock({text: read_greeter})
  llm_mock({text: read_example})
  llm_mock({text: edit_example})
  llm_mock({text: edit_readme})
  llm_mock({text: run_example})
  llm_mock({text: final_step})
}
// Registers the two scripted native responses for the `read-only-audit`
// fixture: a single `read_file` call on the README, then the AUDIT_OK verdict.
fn __seed_read_only_audit_native_mock() {
  let read_call = {id: "read_readme", name: "read_file", arguments: {path: "README.md"}}
  llm_mock({text: "", tool_calls: [read_call]})
  let verdict = "AUDIT_OK: README matches the configured default timeout; no edits are needed."
  llm_mock({text: verdict})
}
// Registers the two scripted text-format responses for the `read-only-audit`
// fixture: a `<tool_call>` reading the README, then the AUDIT_OK verdict with
// the `##DONE##` sentinel.
fn __seed_read_only_audit_text_mock() {
  let read_step = "<tool_call>\nread_file({ path: \"README.md\" })\n</tool_call>"
  let verdict_step = "<user_response>AUDIT_OK: README matches the configured default timeout; no edits are needed.</user_response>\n<done>##DONE##</done>"
  llm_mock({text: read_step})
  llm_mock({text: verdict_step})
}
// Registers the single scripted native response for the `no-tool-diagnosis`
// fixture: a PATCH_HINT answer with no tool calls.
fn __seed_no_tool_native_mock() {
  let hint = "PATCH_HINT: change `return a - b` to `return a + b`."
  llm_mock({text: hint})
}
// Registers the single scripted text-format response for the
// `no-tool-diagnosis` fixture: a PATCH_HINT answer followed by the `##DONE##`
// sentinel on its own line.
fn __seed_no_tool_text_mock() {
  let hint = "PATCH_HINT: change `return a - b` to `return a + b`.\n##DONE##"
  llm_mock({text: hint})
}
// Clears any previously registered mock responses, then registers the script
// for `fixture`. The native script is used when `tool_format` is exactly
// "native"; every other value selects the text-format script.
// Throws "unknown fixture: <name>" (after clearing) for an unrecognized id.
fn __seed_mock(fixture: string, tool_format: string, python: string) {
  llm_mock_clear()
  let native = tool_format == "native"
  if fixture == "python-add" {
    if native {
      __seed_python_add_native_mock(python)
    } else {
      __seed_python_add_text_mock(python)
    }
    return
  }
  if fixture == "cli-help-flag" {
    if native {
      __seed_cli_help_flag_native_mock(python)
    } else {
      __seed_cli_help_flag_text_mock(python)
    }
    return
  }
  if fixture == "test-output-first" {
    if native {
      __seed_test_output_first_native_mock(python)
    } else {
      __seed_test_output_first_text_mock(python)
    }
    return
  }
  if fixture == "docs-symbol-rename" {
    if native {
      __seed_docs_symbol_rename_native_mock(python)
    } else {
      __seed_docs_symbol_rename_text_mock(python)
    }
    return
  }
  if fixture == "read-only-audit" {
    if native {
      __seed_read_only_audit_native_mock()
    } else {
      __seed_read_only_audit_text_mock()
    }
    return
  }
  if fixture == "no-tool-diagnosis" {
    if native {
      __seed_no_tool_native_mock()
    } else {
      __seed_no_tool_text_mock()
    }
    return
  }
  throw "unknown fixture: " + fixture
}
// Packs a verification outcome into the record shape shared by every fixture
// verifier. `combined` is `stdout` immediately followed by `stderr`.
fn __command_verification(
  success: bool,
  exit_code: int,
  stdout: string,
  stderr: string,
  duration_ms: int,
) {
  let merged_output = stdout + stderr
  let record = {
    success: success,
    exit_code: exit_code,
    stdout: stdout,
    stderr: stderr,
    combined: merged_output,
    duration_ms: duration_ms,
  }
  return record
}
// Runs `python -m unittest discover -s tests` with `root` as the working
// directory and returns the `command_run` result. Output capture is limited
// to 12000 inline bytes and the run is given a 30000 ms timeout.
fn __run_unittest(root: string, python: string) {
  let request = {argv: [python, "-m", "unittest", "discover", "-s", "tests"], cwd: root}
  let limits = {capture: {max_inline_bytes: 12000}, timeout_ms: 30000}
  return command_run(request, limits)
}
// Verifies the `python-add` fixture: the unittest suite must succeed and
// `math_utils.py` must contain the text "return a + b". Exit code, output,
// and duration are taken from the unittest run.
fn __verify_python_add(harness: Harness, root: string, python: string) {
  let run = __run_unittest(root, python)
  let module_text = harness.fs.read_text(path_join(root, "math_utils.py"))
  let passed = run.success && contains(module_text, "return a + b")
  return __command_verification(passed, run.exit_code, run.stdout, run.stderr, run.duration_ms)
}
// Verifies the `cli-help-flag` fixture by running `app.py --help` and
// `app.py --shout Ada` in `root`. Passes only when both commands succeed,
// the help text mentions `--shout`, the shout run prints "HELLO, ADA!", and
// the README mentions `--shout`.
//
// The reported exit code is the help run's when the shout run succeeded and
// the shout run's otherwise. Stdout/stderr of the two runs are joined with a
// newline and their durations are summed.
fn __verify_cli_help_flag(harness: Harness, root: string, python: string) {
  let limits = {capture: {max_inline_bytes: 12000}, timeout_ms: 30000}
  let help_run = command_run({argv: [python, "app.py", "--help"], cwd: root}, limits)
  let shout_run = command_run({argv: [python, "app.py", "--shout", "Ada"], cwd: root}, limits)
  let readme_text = harness.fs.read_text(path_join(root, "README.md"))
  // Output checks stay behind the success checks so they are skipped when a
  // command failed.
  let passed = help_run.success && shout_run.success
    && contains(help_run.stdout, "--shout")
    && contains(shout_run.stdout, "HELLO, ADA!")
    && contains(readme_text, "--shout")
  let reported_exit = if shout_run.success {
    help_run.exit_code
  } else {
    shout_run.exit_code
  }
  let merged_stdout = help_run.stdout + "\n" + shout_run.stdout
  let merged_stderr = help_run.stderr + "\n" + shout_run.stderr
  let total_ms = help_run.duration_ms + shout_run.duration_ms
  return __command_verification(passed, reported_exit, merged_stdout, merged_stderr, total_ms)
}
// Verifies the `test-output-first` fixture: the unittest suite must succeed
// and `stats_utils.py` must contain the even-length averaging expression.
// Exit code, output, and duration are taken from the unittest run.
fn __verify_test_output_first(harness: Harness, root: string, python: string) {
  let run = __run_unittest(root, python)
  let module_text = harness.fs.read_text(path_join(root, "stats_utils.py"))
  let expected_line = "return (ordered[mid - 1] + ordered[mid]) / 2"
  let passed = run.success && contains(module_text, expected_line)
  return __command_verification(passed, run.exit_code, run.stdout, run.stderr, run.duration_ms)
}
// Verifies the `docs-symbol-rename` fixture by running `example.py` in
// `root`. Passes only when the run succeeds and prints "Hello, Ada!", the
// example and README mention `format_greeting` and no longer mention
// `make_greeting`, and `greeter.py` still defines `format_greeting`.
fn __verify_docs_symbol_rename(harness: Harness, root: string, python: string) {
  let run = command_run(
    {argv: [python, "example.py"], cwd: root},
    {capture: {max_inline_bytes: 12000}, timeout_ms: 30000},
  )
  let example_text = harness.fs.read_text(path_join(root, "example.py"))
  let readme_text = harness.fs.read_text(path_join(root, "README.md"))
  let greeter_text = harness.fs.read_text(path_join(root, "greeter.py"))
  let new_name = "format_greeting"
  let old_name = "make_greeting"
  let passed = run.success
    && contains(run.stdout, "Hello, Ada!")
    && contains(example_text, new_name)
    && contains(readme_text, new_name)
    && !contains(example_text, old_name)
    && !contains(readme_text, old_name)
    && contains(greeter_text, "def format_greeting")
  return __command_verification(passed, run.exit_code, run.stdout, run.stderr, run.duration_ms)
}
// Returns the number of entries in `result.tools.calls`, or 0 when `result`,
// `tools`, or `calls` is nil.
fn __tool_call_count(result) -> int {
  let recorded_calls = result?.tools?.calls ?? []
  return len(recorded_calls)
}
// Returns the agent's final text: `result.visible_text` when it is not nil,
// otherwise `result.text`, otherwise the empty string.
fn __final_text(result) -> string {
  let fallback = result.text ?? ""
  return result.visible_text ?? fallback
}
// Verifies the `read-only-audit` fixture without running any command. Passes
// only when the agent made exactly one tool call, its final text contains
// AUDIT_OK, and both seeded files still contain their original timeout text.
// The final text is reported as stdout; exit code is 0 on success and 1
// otherwise; duration is reported as 0.
fn __verify_read_only_audit(harness: Harness, root: string, result) {
  let readme_text = harness.fs.read_text(path_join(root, "README.md"))
  let settings_text = harness.fs.read_text(path_join(root, "settings.py"))
  let answer = __final_text(result)
  let passed = __tool_call_count(result) == 1
    && contains(answer, "AUDIT_OK")
    && contains(readme_text, "30 seconds")
    && contains(settings_text, "DEFAULT_TIMEOUT_SECONDS = 30")
  let synthetic_exit = if passed {
    0
  } else {
    1
  }
  return __command_verification(passed, synthetic_exit, answer, "", 0)
}
// Verifies the `no-tool-diagnosis` fixture. Passes only when the agent made
// no tool calls and its final text contains both PATCH_HINT and
// "return a + b". The final text is reported as stdout; exit code is 0 on
// success and 1 otherwise; duration is reported as 0.
fn __verify_no_tool_diagnosis(result) {
  let answer = __final_text(result)
  let passed = __tool_call_count(result) == 0
    && contains(answer, "PATCH_HINT")
    && contains(answer, "return a + b")
  let synthetic_exit = if passed {
    0
  } else {
    1
  }
  return __command_verification(passed, synthetic_exit, answer, "", 0)
}
// Dispatches to the verifier for `fixture` and returns its verification
// record. Throws "unknown fixture: <name>" for an unrecognized id.
fn __verify_fixture(harness: Harness, root: string, fixture: string, python: string, result) {
  // Fixtures judged from the agent's result rather than by running Python.
  if fixture == "no-tool-diagnosis" {
    return __verify_no_tool_diagnosis(result)
  }
  if fixture == "read-only-audit" {
    return __verify_read_only_audit(harness, root, result)
  }
  // Fixtures judged by running Python inside the workspace.
  if fixture == "docs-symbol-rename" {
    return __verify_docs_symbol_rename(harness, root, python)
  }
  if fixture == "test-output-first" {
    return __verify_test_output_first(harness, root, python)
  }
  if fixture == "cli-help-flag" {
    return __verify_cli_help_flag(harness, root, python)
  }
  if fixture == "python-add" {
    return __verify_python_add(harness, root, python)
  }
  throw "unknown fixture: " + fixture
}
// Returns the user-facing task prompt for `fixture`. Prompts that ask the
// agent to run the suite embed `python` in the unittest command.
// Throws "unknown fixture: <name>" for an unrecognized id.
fn __task_prompt(fixture: string, python: string) -> string {
  let test_cmd = python + " -m unittest discover -s tests"
  let quoted_cmd_tail = test_cmd + "`."
  if fixture == "python-add" {
    let lead = "Fix the repository so the test suite passes. Inspect files before editing, make the smallest correct code change, then run `"
    return lead + quoted_cmd_tail
  }
  if fixture == "cli-help-flag" {
    return "Add a `--shout` flag to the greeting CLI. The flag should print the greeting in uppercase, appear in `--help`, and be documented in README.md. Verify it with the Python CLI."
  }
  if fixture == "test-output-first" {
    let lead = "Run the unittest suite first and use the failing output to choose the fix. Then make the smallest implementation change and re-run `"
    return lead + quoted_cmd_tail
  }
  if fixture == "docs-symbol-rename" {
    return "The public helper was renamed to `format_greeting`. Update the docs and example to use the renamed symbol. Do not edit `greeter.py`; verify the example runs."
  }
  if fixture == "read-only-audit" {
    return "Read README.md and decide whether it matches the configured default timeout. Do not edit files. If no edits are needed, include the exact token AUDIT_OK in your final answer."
  }
  if fixture == "no-tool-diagnosis" {
    return "No tools are available. Given this snippet: `def add(a, b): return a - b`, and this failing expectation: `add(2, 3) == 5`, state the smallest code change. Include the exact token PATCH_HINT."
  }
  throw "unknown fixture: " + fixture
}
// Returns the system prompt for `fixture`: a context-only assistant prompt
// for `no-tool-diagnosis`, and the tool-using coding-agent prompt for every
// other fixture id.
fn __system_prompt(fixture: string) -> string {
  return if fixture == "no-tool-diagnosis" {
    "You are a concise coding assistant. Answer from the prompt context only."
  } else {
    "You are a focused coding agent inside a tiny repository. Use tools for inspection, edits, and verification. Do not explain provider, model, harness, or system-prompt details to the user."
  }
}
// Runs the agent preset that matches `fixture` and returns its result:
// `audit_agent` for `read-only-audit`, `summary_agent` (capped at a single
// iteration) for `no-tool-diagnosis`, and `repair_agent` otherwise.
//
// The `##DONE##` sentinel is enabled only when `tool_format` is "text".
// `step_judge_config` is added to the options only when it is not nil.
fn __run_fixture_agent(
  fixture: string,
  task: string,
  provider: string,
  model: string,
  tool_format: string,
  tools,
  max_iterations: int,
  step_judge_config,
) {
  let sentinel = if tool_format == "text" {
    "##DONE##"
  } else {
    nil
  }
  let base_options = {
    provider: provider,
    model: model,
    tool_format: tool_format,
    done_sentinel: sentinel,
    system: __system_prompt(fixture),
    tools: tools,
    max_iterations: max_iterations,
    iteration_budget: {mode: "fixed", initial: max_iterations, max: max_iterations, extend_by: 0},
    turn_policy: {allow_done_sentinel: tool_format == "text", max_prose_chars: 30000},
    llm_options: {temperature: 0, max_tokens: 2048},
  }
  let options = if step_judge_config != nil {
    base_options + {step_judge: step_judge_config}
  } else {
    base_options
  }
  if fixture == "read-only-audit" {
    return audit_agent(task, options)
  }
  if fixture == "no-tool-diagnosis" {
    // A tool-less diagnosis gets exactly one turn regardless of the caller's
    // `max_iterations`.
    let single_turn = {max_iterations: 1, iteration_budget: {mode: "fixed", initial: 1, max: 1, extend_by: 0}}
    return summary_agent(task, options + single_turn)
  }
  return repair_agent(task, options)
}
// Returns a dict with the final contents of the files relevant to `fixture`,
// keyed by a snake_case form of each file name. Fixtures with no listed files
// (including unrecognized ids) yield an empty dict.
fn __final_files(harness: Harness, root: string, fixture: string) {
  let slurp = { name -> harness.fs.read_text(path_join(root, name)) }
  if fixture == "python-add" {
    return {math_utils_py: slurp("math_utils.py")}
  }
  if fixture == "cli-help-flag" {
    return {app_py: slurp("app.py"), readme_md: slurp("README.md")}
  }
  if fixture == "test-output-first" {
    return {stats_utils_py: slurp("stats_utils.py")}
  }
  if fixture == "docs-symbol-rename" {
    return {
      greeter_py: slurp("greeter.py"),
      example_py: slurp("example.py"),
      readme_md: slurp("README.md"),
    }
  }
  if fixture == "read-only-audit" {
    return {settings_py: slurp("settings.py"), readme_md: slurp("README.md")}
  }
  return {}
}
// Builds the summary record for one fixture run (schema version 2).
//
// `passed` mirrors `verify.success`. `status` is the agent's own status when
// verification passed and "failed" otherwise. The record also carries the
// transcript event count and kinds, the verification record, and the final
// contents of the fixture's files.
fn __summary(
  harness: Harness,
  root: string,
  output_dir: string,
  fixture: string,
  provider: string,
  model: string,
  tool_format: string,
  result,
  verify,
  duration_ms: int,
) {
  let events = transcript_events(result.transcript)
  let reported_status = if verify.success {
    result.status
  } else {
    "failed"
  }
  let answer = __final_text(result)
  let event_kinds = events.map({ entry -> entry.kind })
  let files = __final_files(harness, root, fixture)
  return {
    schema_version: 2,
    fixture_id: fixture,
    provider: provider,
    model: model,
    tool_format: tool_format,
    workspace_root: root,
    output_dir: output_dir,
    passed: verify.success,
    status: reported_status,
    final_text: answer,
    duration_ms: duration_ms,
    llm: result.llm,
    tools: result.tools,
    transcript_event_count: len(events),
    transcript_event_kinds: event_kinds,
    verification: verify,
    final_files: files,
  }
}
// Entry point: parses CLI options, seeds the fixture workspace (and the mock
// LLM script when `--seed-mock` is set), runs the fixture's agent, verifies
// the outcome, and writes `summary.json`, `result.json`, and
// `transcript_events.jsonl` into the output directory. The summary is also
// printed as one JSON line.
//
// Returns 0 when verification passed and 1 otherwise. Throws the joined
// parser errors when argument parsing reports any.
fn main(harness: Harness) {
  let option_specs = [
    {kind: "option", name: "fixture", flags: ["--fixture"], required: true},
    {kind: "option", name: "workspace_root", flags: ["--workspace-root"]},
    {kind: "option", name: "output_dir", flags: ["--output-dir"], required: true},
    {kind: "option", name: "provider", flags: ["--provider"], default: "mock"},
    {kind: "option", name: "model", flags: ["--model"], default: "mock"},
    {kind: "option", name: "tool_format", flags: ["--tool-format"], default: "native"},
    {kind: "option", name: "max_iterations", flags: ["--max-iterations"], parse: "int", default: 8},
    {kind: "option", name: "python", flags: ["--python"], default: "python3"},
    {kind: "flag", name: "seed_mock", flags: ["--seed-mock"]},
    {kind: "option", name: "step_judge_json", flags: ["--step-judge-json"], default: ""},
  ]
  let args = parse_args(argv, option_specs)
  if len(args._errors) > 0 {
    throw join(args._errors, "\n")
  }

  let output_dir = args.output_dir
  harness.fs.mkdir(output_dir)
  // Without --workspace-root the workspace lives inside the output directory.
  let root = args.workspace_root ?? path_join(output_dir, "workspace")
  __seed_workspace(harness, root, args.fixture)
  if args.seed_mock {
    __seed_mock(args.fixture, args.tool_format, args.python)
  }

  let started_ms = harness.clock.now_ms()
  // An empty --step-judge-json means no step judge is configured.
  let step_judge_config = if args.step_judge_json == "" {
    nil
  } else {
    json_parse(args.step_judge_json)
  }
  let task = __task_prompt(args.fixture, args.python)
  let tools = __fixture_tools(harness, root, args.python, args.fixture)
  let result = __run_fixture_agent(
    args.fixture,
    task,
    args.provider,
    args.model,
    args.tool_format,
    tools,
    args.max_iterations,
    step_judge_config,
  )
  let finished_ms = harness.clock.now_ms()
  let duration_ms = finished_ms - started_ms

  let verify = __verify_fixture(harness, root, args.fixture, args.python, result)
  let summary = __summary(
    harness,
    root,
    output_dir,
    args.fixture,
    args.provider,
    args.model,
    args.tool_format,
    result,
    verify,
    duration_ms,
  )

  let pretty = {pretty: true}
  write_json(path_join(output_dir, "summary.json"), summary, pretty)
  write_json(path_join(output_dir, "result.json"), result, pretty)
  let events = transcript_events(result.transcript)
  write_jsonl(path_join(output_dir, "transcript_events.jsonl"), events)
  harness.stdio.println(json_stringify(summary))

  if summary.passed {
    return 0
  }
  return 1
}