harn-cli 0.8.35

// Coding-agent harness quality suite used by `harn eval coding-agent`.
//
// Each fixture stays tiny and deterministic so provider/model/tool-format
// differences are visible without turning this command into a full coding
// benchmark.
import { agent_host_tools } from "std/agent/host_tools"
import { audit_agent, repair_agent, summary_agent } from "std/agent/presets"
import { parse_args } from "std/cli"
import { command_run } from "std/command"
import { edit_apply_old_new_patch } from "std/edit"
import { write_json } from "std/fs"
import { write_jsonl } from "std/jsonl"

// Reports whether any "/"-separated segment of `path` is exactly "..".
// A nil `path` is treated as the empty string.
fn __has_parent_segment(path: string) -> bool {
  var found = false
  let parts = split(path ?? "", "/")
  for part in parts {
    if part == ".." {
      found = true
    }
  }
  return found
}

// Normalizes a caller-supplied path and rejects anything that could point
// outside the workspace.
//
// The path is trimmed and backslashes are converted to "/" before checking.
// Returns the normalized relative path.
// Throws when the path is empty, contains ":", starts with "/", or has a
// ".." segment.
fn __safe_relative_path(path: string) -> string {
  let normalized = replace(trim(path ?? ""), "\\", "/")
  if normalized == "" {
    throw "path must be non-empty"
  }
  // Drive-qualified inputs such as `C:\x` become `C:/x` after normalization and
  // would pass the leading-"/" check below.
  // NOTE(review): assumes path_join may let such a path replace `root` on
  // Windows -- confirm against the path_join implementation.
  if contains(normalized, ":") {
    throw "path must be relative and must not contain ':'"
  }
  if starts_with(normalized, "/") || __has_parent_segment(normalized) {
    throw "path must be relative and must not contain '..'"
  }
  return normalized
}

// Joins `root` with `path` after `path` has been validated as a safe
// relative path; validation failures propagate as thrown errors.
fn __workspace_path(root: string, path: string) -> string {
  let relative = __safe_relative_path(path)
  return path_join(root, relative)
}

// Seeds the `python-add` fixture under `root`: a broken `add` helper, a
// unittest that exercises it, and a README describing the fixture.
fn __seed_python_add(harness: Harness, root: string) {
  let tests_dir = path_join(root, "tests")
  let module_path = path_join(root, "math_utils.py")
  let test_path = path_join(root, "tests", "test_math_utils.py")
  let readme_path = path_join(root, "README.md")

  // The helper subtracts instead of adding; the README calls this intentional.
  let module_source = "def add(a, b):\n    return a - b\n"
  let test_source = "import unittest\n\nfrom math_utils import add\n\n\nclass MathUtilsTest(unittest.TestCase):\n    def test_adds_two_numbers(self):\n        self.assertEqual(add(2, 3), 5)\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"
  let readme_text = "# Mini coding-agent fixture\n\nThe `add` helper is intentionally broken.\n"

  harness.fs.mkdir(tests_dir)
  harness.fs.write_text(module_path, module_source)
  harness.fs.write_text(test_path, test_source)
  harness.fs.write_text(readme_path, readme_text)
}

// Seeds the `cli-help-flag` fixture under `root`: an argparse greeting CLI
// and a README showing how to run it.
fn __seed_cli_help_flag(harness: Harness, root: string) {
  let app_path = path_join(root, "app.py")
  let readme_path = path_join(root, "README.md")
  let app_source = "import argparse\n\n\ndef build_parser():\n    parser = argparse.ArgumentParser(description=\"Print a greeting.\")\n    parser.add_argument(\"name\", help=\"Name to greet.\")\n    return parser\n\n\ndef main(argv=None):\n    args = build_parser().parse_args(argv)\n    print(f\"Hello, {args.name}!\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  let readme_text = "# Greeting CLI\n\nRun `python app.py Ada` to print `Hello, Ada!`.\n"

  harness.fs.write_text(app_path, app_source)
  harness.fs.write_text(readme_path, readme_text)
}

// Seeds the `test-output-first` fixture under `root`: a `median` helper that
// always returns the element at the middle index, plus unittests covering
// even- and odd-length inputs.
fn __seed_test_output_first(harness: Harness, root: string) {
  let tests_dir = path_join(root, "tests")
  let module_path = path_join(root, "stats_utils.py")
  let test_path = path_join(root, "tests", "test_stats_utils.py")
  let module_source = "def median(values):\n    ordered = sorted(values)\n    mid = len(ordered) // 2\n    return ordered[mid]\n"
  let test_source = "import unittest\n\nfrom stats_utils import median\n\n\nclass StatsUtilsTest(unittest.TestCase):\n    def test_even_count_uses_average_of_middle_pair(self):\n        self.assertEqual(median([10, 2, 4, 6]), 5)\n\n    def test_odd_count_uses_middle_value(self):\n        self.assertEqual(median([9, 1, 4]), 4)\n\n\nif __name__ == \"__main__\":\n    unittest.main()\n"

  harness.fs.mkdir(tests_dir)
  harness.fs.write_text(module_path, module_source)
  harness.fs.write_text(test_path, test_source)
}

// Seeds the `docs-symbol-rename` fixture under `root`: `greeter.py` defines
// `format_greeting`, while `example.py` and the README still refer to the
// old `make_greeting` name.
fn __seed_docs_symbol_rename(harness: Harness, root: string) {
  let greeter_path = path_join(root, "greeter.py")
  let example_path = path_join(root, "example.py")
  let readme_path = path_join(root, "README.md")
  let greeter_source = "def format_greeting(name):\n    return f\"Hello, {name}!\"\n"
  let example_source = "from greeter import make_greeting\n\n\nprint(make_greeting(\"Ada\"))\n"
  let readme_text = "# Greeter\n\nUse `make_greeting(name)` to build the greeting shown in `example.py`.\n"

  harness.fs.write_text(greeter_path, greeter_source)
  harness.fs.write_text(example_path, example_source)
  harness.fs.write_text(readme_path, readme_text)
}

// Seeds the `read-only-audit` fixture under `root`: a settings module and a
// README that both state a 30-second default timeout.
fn __seed_read_only_audit(harness: Harness, root: string) {
  let settings_path = path_join(root, "settings.py")
  let readme_path = path_join(root, "README.md")
  let settings_source = "DEFAULT_TIMEOUT_SECONDS = 30\n"
  let readme_text = "# Settings\n\nThe default timeout is 30 seconds.\n"

  harness.fs.write_text(settings_path, settings_source)
  harness.fs.write_text(readme_path, readme_text)
}

// Creates `root` and populates it with the files for `fixture`.
// Throws "unknown fixture: <name>" for an unrecognized fixture id; the root
// directory has already been created at that point.
fn __seed_workspace(harness: Harness, root: string, fixture: string) {
  harness.fs.mkdir(root)

  // The diagnosis fixture needs no files beyond the workspace directory itself.
  if fixture == "no-tool-diagnosis" {
    return
  }
  if fixture == "read-only-audit" {
    __seed_read_only_audit(harness, root)
    return
  }
  if fixture == "docs-symbol-rename" {
    __seed_docs_symbol_rename(harness, root)
    return
  }
  if fixture == "test-output-first" {
    __seed_test_output_first(harness, root)
    return
  }
  if fixture == "cli-help-flag" {
    __seed_cli_help_flag(harness, root)
    return
  }
  if fixture == "python-add" {
    __seed_python_add(harness, root)
    return
  }
  throw "unknown fixture: " + fixture
}

// Registers the `replace_in_file` tool on `registry` and returns whatever
// `tool_define` returns.
//
// The handler resolves `args.path` inside `root`, applies an old/new text
// patch to the file contents, writes the patched text back only when the
// patch reports `ok`, and returns a JSON string describing the outcome.
fn __replace_tool(harness: Harness, registry, root: string) {
  let apply_replace = { args ->
    let target = __workspace_path(root, args.path)
    let original = harness.fs.read_text(target)
    let patch_options = {strip_line_numbers: true, structural_require_anchored_lines: "either"}
    let outcome = edit_apply_old_new_patch(original, args.old_text, args.new_text, patch_options)
    if outcome.ok {
      harness.fs.write_text(target, outcome.patched)
    }
    // Key order is kept stable so the serialized report stays the same.
    let report = {
      ok: outcome.ok,
      changed: outcome.changed,
      match_kind: outcome.match_kind,
      error_code: outcome?.error_code,
      message: outcome?.message,
      changed_regions: outcome.changed_regions,
      before_sha256: outcome.before_sha256,
      after_sha256: outcome.after_sha256,
    }
    return json_stringify(report)
  }
  let schema = {path: {type: "string"}, old_text: {type: "string"}, new_text: {type: "string"}}
  let definition = {
    parameters: schema,
    returns: {type: "object"},
    annotations: {kind: "edit", side_effect_level: "workspace_write"},
    handler: apply_replace,
  }
  return tool_define(
    registry,
    "replace_in_file",
    "Replace one exact or structurally matched text region in a UTF-8 file under the workspace root.",
    definition,
  )
}

// Builds the tool registry offered to the agent for `fixture`.
//
// - `no-tool-diagnosis`: returns nil (no tools).
// - `read-only-audit`: host tools limited to `read_file`.
// - everything else: read/list/search/run host tools, with commands limited
//   to the `python` argv prefix, plus the `replace_in_file` edit tool.
fn __fixture_tools(harness: Harness, root: string, python: string, fixture: string) {
  if fixture == "no-tool-diagnosis" {
    return nil
  }
  if fixture == "read-only-audit" {
    let audit_config = {root: root, cwd: root, max_inline_bytes: 4000, enabled_tools: ["read_file"]}
    return agent_host_tools(nil, audit_config)
  }
  let host_config = {
    root: root,
    cwd: root,
    max_inline_bytes: 8000,
    allow_argv_prefixes: [[python]],
    enabled_tools: ["read_file", "list_directory", "search_files", "run_command"],
    search_exclude_globs: ["__pycache__/**"],
  }
  let registry = agent_host_tools(nil, host_config)
  return __replace_tool(harness, registry, root)
}

// Queues the native tool-call mock script for `python-add`:
// read the module, replace the broken body, run the tests, final answer.
fn __seed_python_add_native_mock(python: string) {
  let read_call = {id: "read_math", name: "read_file", arguments: {path: "math_utils.py"}}
  let replace_call = {
    id: "replace_math",
    name: "replace_in_file",
    arguments: {
      path: "math_utils.py",
      old_text: "def add(a, b):\n    return a - b\n",
      new_text: "def add(a, b):\n    return a + b\n",
    },
  }
  let test_call = {
    id: "run_tests",
    name: "run_command",
    arguments: {argv: [python, "-m", "unittest", "discover", "-s", "tests"], capture: {max_inline_bytes: 4000}},
  }

  llm_mock({text: "", tool_calls: [read_call]})
  llm_mock({text: "", tool_calls: [replace_call]})
  llm_mock({text: "", tool_calls: [test_call]})
  llm_mock({text: "Fixed add and verified the unittest suite passes."})
}

// Queues the text-format mock script for `python-add`. Each reply embeds the
// tool call in `<tool_call>` tags; the last reply carries the done sentinel.
fn __seed_python_add_text_mock(python: string) {
  // JSON-encode the interpreter path so it can be embedded as a quoted argv entry.
  let python_literal = json_stringify(python)
  let read_reply = "<tool_call>\nread_file({ path: \"math_utils.py\" })\n</tool_call>"
  let replace_reply = "<tool_call>\nreplace_in_file({ path: \"math_utils.py\", old_text: \"def add(a, b):\\n    return a - b\\n\", new_text: \"def add(a, b):\\n    return a + b\\n\" })\n</tool_call>"
  let test_reply = "<tool_call>\nrun_command({ argv: ["
    + python_literal
    + ", \"-m\", \"unittest\", \"discover\", \"-s\", \"tests\"], capture: { max_inline_bytes: 4000 } })\n</tool_call>"
  let final_reply = "<user_response>Fixed add and verified the unittest suite passes.</user_response>\n<done>##DONE##</done>"

  llm_mock({text: read_reply})
  llm_mock({text: replace_reply})
  llm_mock({text: test_reply})
  llm_mock({text: final_reply})
}

// Queues the native tool-call mock script for `cli-help-flag`:
// read app.py, add the `--shout` flag, document it in the README, run the
// CLI with `--shout`, final answer.
fn __seed_cli_help_flag_native_mock(python: string) {
  let read_call = {id: "read_app", name: "read_file", arguments: {path: "app.py"}}
  let app_edit = {
    id: "replace_app",
    name: "replace_in_file",
    arguments: {
      path: "app.py",
      old_text: "def build_parser():\n    parser = argparse.ArgumentParser(description=\"Print a greeting.\")\n    parser.add_argument(\"name\", help=\"Name to greet.\")\n    return parser\n\n\ndef main(argv=None):\n    args = build_parser().parse_args(argv)\n    print(f\"Hello, {args.name}!\")\n",
      new_text: "def build_parser():\n    parser = argparse.ArgumentParser(description=\"Print a greeting.\")\n    parser.add_argument(\"--shout\", action=\"store_true\", help=\"Print the greeting in uppercase.\")\n    parser.add_argument(\"name\", help=\"Name to greet.\")\n    return parser\n\n\ndef main(argv=None):\n    args = build_parser().parse_args(argv)\n    greeting = f\"Hello, {args.name}!\"\n    if args.shout:\n        greeting = greeting.upper()\n    print(greeting)\n",
    },
  }
  let readme_edit = {
    id: "replace_readme",
    name: "replace_in_file",
    arguments: {
      path: "README.md",
      old_text: "# Greeting CLI\n\nRun `python app.py Ada` to print `Hello, Ada!`.\n",
      new_text: "# Greeting CLI\n\nRun `python app.py Ada` to print `Hello, Ada!`.\nRun `python app.py --shout Ada` to print `HELLO, ADA!`.\n",
    },
  }
  let shout_call = {
    id: "run_shout",
    name: "run_command",
    arguments: {argv: [python, "app.py", "--shout", "Ada"], capture: {max_inline_bytes: 4000}},
  }

  llm_mock({text: "", tool_calls: [read_call]})
  llm_mock({text: "", tool_calls: [app_edit]})
  llm_mock({text: "", tool_calls: [readme_edit]})
  llm_mock({text: "", tool_calls: [shout_call]})
  llm_mock({text: "Added --shout, documented it, and verified uppercase output."})
}

// Queues the text-format mock script for `cli-help-flag`, mirroring the
// native script step for step with `<tool_call>`-tagged replies.
fn __seed_cli_help_flag_text_mock(python: string) {
  // JSON-encode the interpreter path so it can be embedded as a quoted argv entry.
  let python_literal = json_stringify(python)
  let read_reply = "<tool_call>\nread_file({ path: \"app.py\" })\n</tool_call>"
  let app_edit_reply = "<tool_call>\nreplace_in_file({ path: \"app.py\", old_text: \"def build_parser():\\n    parser = argparse.ArgumentParser(description=\\\"Print a greeting.\\\")\\n    parser.add_argument(\\\"name\\\", help=\\\"Name to greet.\\\")\\n    return parser\\n\\n\\ndef main(argv=None):\\n    args = build_parser().parse_args(argv)\\n    print(f\\\"Hello, {args.name}!\\\")\\n\", new_text: \"def build_parser():\\n    parser = argparse.ArgumentParser(description=\\\"Print a greeting.\\\")\\n    parser.add_argument(\\\"--shout\\\", action=\\\"store_true\\\", help=\\\"Print the greeting in uppercase.\\\")\\n    parser.add_argument(\\\"name\\\", help=\\\"Name to greet.\\\")\\n    return parser\\n\\n\\ndef main(argv=None):\\n    args = build_parser().parse_args(argv)\\n    greeting = f\\\"Hello, {args.name}!\\\"\\n    if args.shout:\\n        greeting = greeting.upper()\\n    print(greeting)\\n\" })\n</tool_call>"
  let readme_edit_reply = "<tool_call>\nreplace_in_file({ path: \"README.md\", old_text: \"# Greeting CLI\\n\\nRun `python app.py Ada` to print `Hello, Ada!`.\\n\", new_text: \"# Greeting CLI\\n\\nRun `python app.py Ada` to print `Hello, Ada!`.\\nRun `python app.py --shout Ada` to print `HELLO, ADA!`.\\n\" })\n</tool_call>"
  let shout_reply = "<tool_call>\nrun_command({ argv: [" + python_literal
    + ", \"app.py\", \"--shout\", \"Ada\"], capture: { max_inline_bytes: 4000 } })\n</tool_call>"
  let final_reply = "<user_response>Added --shout, documented it, and verified uppercase output.</user_response>\n<done>##DONE##</done>"

  llm_mock({text: read_reply})
  llm_mock({text: app_edit_reply})
  llm_mock({text: readme_edit_reply})
  llm_mock({text: shout_reply})
  llm_mock({text: final_reply})
}

// Queues the native tool-call mock script for `test-output-first`:
// run the tests, read the module, patch `median`, re-run the tests,
// final answer.
fn __seed_test_output_first_native_mock(python: string) {
  let first_run = {
    id: "run_failing_tests",
    name: "run_command",
    arguments: {argv: [python, "-m", "unittest", "discover", "-s", "tests"], capture: {max_inline_bytes: 4000}},
  }
  let read_call = {id: "read_stats", name: "read_file", arguments: {path: "stats_utils.py"}}
  let median_edit = {
    id: "replace_stats",
    name: "replace_in_file",
    arguments: {
      path: "stats_utils.py",
      old_text: "def median(values):\n    ordered = sorted(values)\n    mid = len(ordered) // 2\n    return ordered[mid]\n",
      new_text: "def median(values):\n    ordered = sorted(values)\n    mid = len(ordered) // 2\n    if len(ordered) % 2 == 0:\n        return (ordered[mid - 1] + ordered[mid]) / 2\n    return ordered[mid]\n",
    },
  }
  let second_run = {
    id: "run_passing_tests",
    name: "run_command",
    arguments: {argv: [python, "-m", "unittest", "discover", "-s", "tests"], capture: {max_inline_bytes: 4000}},
  }

  llm_mock({text: "", tool_calls: [first_run]})
  llm_mock({text: "", tool_calls: [read_call]})
  llm_mock({text: "", tool_calls: [median_edit]})
  llm_mock({text: "", tool_calls: [second_run]})
  llm_mock({text: "Used the failing test output, fixed median, and re-ran the suite."})
}

// Queues the text-format mock script for `test-output-first`. The same
// run-tests reply is queued twice: once before the fix and once after it.
fn __seed_test_output_first_text_mock(python: string) {
  // JSON-encode the interpreter path so it can be embedded as a quoted argv entry.
  let python_literal = json_stringify(python)
  let test_reply = "<tool_call>\nrun_command({ argv: [" + python_literal
    + ", \"-m\", \"unittest\", \"discover\", \"-s\", \"tests\"], capture: { max_inline_bytes: 4000 } })\n</tool_call>"
  let read_reply = "<tool_call>\nread_file({ path: \"stats_utils.py\" })\n</tool_call>"
  let edit_reply = "<tool_call>\nreplace_in_file({ path: \"stats_utils.py\", old_text: \"def median(values):\\n    ordered = sorted(values)\\n    mid = len(ordered) // 2\\n    return ordered[mid]\\n\", new_text: \"def median(values):\\n    ordered = sorted(values)\\n    mid = len(ordered) // 2\\n    if len(ordered) % 2 == 0:\\n        return (ordered[mid - 1] + ordered[mid]) / 2\\n    return ordered[mid]\\n\" })\n</tool_call>"
  let final_reply = "<user_response>Used the failing test output, fixed median, and re-ran the suite.</user_response>\n<done>##DONE##</done>"

  llm_mock({text: test_reply})
  llm_mock({text: read_reply})
  llm_mock({text: edit_reply})
  llm_mock({text: test_reply})
  llm_mock({text: final_reply})
}

// Queues the native tool-call mock script for `docs-symbol-rename`:
// read greeter.py and example.py, rewrite the example and README to use
// `format_greeting`, run the example, final answer.
fn __seed_docs_symbol_rename_native_mock(python: string) {
  let read_greeter = {id: "read_greeter", name: "read_file", arguments: {path: "greeter.py"}}
  let read_example = {id: "read_example", name: "read_file", arguments: {path: "example.py"}}
  let example_edit = {
    id: "replace_example",
    name: "replace_in_file",
    arguments: {
      path: "example.py",
      old_text: "from greeter import make_greeting\n\n\nprint(make_greeting(\"Ada\"))\n",
      new_text: "from greeter import format_greeting\n\n\nprint(format_greeting(\"Ada\"))\n",
    },
  }
  let readme_edit = {
    id: "replace_readme",
    name: "replace_in_file",
    arguments: {
      path: "README.md",
      old_text: "# Greeter\n\nUse `make_greeting(name)` to build the greeting shown in `example.py`.\n",
      new_text: "# Greeter\n\nUse `format_greeting(name)` to build the greeting shown in `example.py`.\n",
    },
  }
  let run_call = {
    id: "run_example",
    name: "run_command",
    arguments: {argv: [python, "example.py"], capture: {max_inline_bytes: 4000}},
  }

  llm_mock({text: "", tool_calls: [read_greeter]})
  llm_mock({text: "", tool_calls: [read_example]})
  llm_mock({text: "", tool_calls: [example_edit]})
  llm_mock({text: "", tool_calls: [readme_edit]})
  llm_mock({text: "", tool_calls: [run_call]})
  llm_mock({text: "Updated the docs and example to use format_greeting."})
}

// Queues the text-format mock script for `docs-symbol-rename`, mirroring the
// native script step for step with `<tool_call>`-tagged replies.
fn __seed_docs_symbol_rename_text_mock(python: string) {
  // JSON-encode the interpreter path so it can be embedded as a quoted argv entry.
  let python_literal = json_stringify(python)
  let read_greeter_reply = "<tool_call>\nread_file({ path: \"greeter.py\" })\n</tool_call>"
  let read_example_reply = "<tool_call>\nread_file({ path: \"example.py\" })\n</tool_call>"
  let example_edit_reply = "<tool_call>\nreplace_in_file({ path: \"example.py\", old_text: \"from greeter import make_greeting\\n\\n\\nprint(make_greeting(\\\"Ada\\\"))\\n\", new_text: \"from greeter import format_greeting\\n\\n\\nprint(format_greeting(\\\"Ada\\\"))\\n\" })\n</tool_call>"
  let readme_edit_reply = "<tool_call>\nreplace_in_file({ path: \"README.md\", old_text: \"# Greeter\\n\\nUse `make_greeting(name)` to build the greeting shown in `example.py`.\\n\", new_text: \"# Greeter\\n\\nUse `format_greeting(name)` to build the greeting shown in `example.py`.\\n\" })\n</tool_call>"
  let run_reply = "<tool_call>\nrun_command({ argv: [" + python_literal
    + ", \"example.py\"], capture: { max_inline_bytes: 4000 } })\n</tool_call>"
  let final_reply = "<user_response>Updated the docs and example to use format_greeting.</user_response>\n<done>##DONE##</done>"

  llm_mock({text: read_greeter_reply})
  llm_mock({text: read_example_reply})
  llm_mock({text: example_edit_reply})
  llm_mock({text: readme_edit_reply})
  llm_mock({text: run_reply})
  llm_mock({text: final_reply})
}

// Queues the native mock script for `read-only-audit`: a single README read
// followed by a final answer containing the AUDIT_OK token.
fn __seed_read_only_audit_native_mock() {
  let read_call = {id: "read_readme", name: "read_file", arguments: {path: "README.md"}}
  let verdict = "AUDIT_OK: README matches the configured default timeout; no edits are needed."

  llm_mock({text: "", tool_calls: [read_call]})
  llm_mock({text: verdict})
}

// Queues the text-format mock script for `read-only-audit`: a tagged README
// read followed by a tagged AUDIT_OK answer with the done sentinel.
fn __seed_read_only_audit_text_mock() {
  let read_reply = "<tool_call>\nread_file({ path: \"README.md\" })\n</tool_call>"
  let final_reply = "<user_response>AUDIT_OK: README matches the configured default timeout; no edits are needed.</user_response>\n<done>##DONE##</done>"

  llm_mock({text: read_reply})
  llm_mock({text: final_reply})
}

// Queues the single native-format reply for `no-tool-diagnosis`: a
// PATCH_HINT answer with no tool calls.
fn __seed_no_tool_native_mock() {
  let reply = {text: "PATCH_HINT: change `return a - b` to `return a + b`."}
  llm_mock(reply)
}

// Queues the single text-format reply for `no-tool-diagnosis`: the
// PATCH_HINT answer followed by the bare done sentinel on its own line.
fn __seed_no_tool_text_mock() {
  let reply = {text: "PATCH_HINT: change `return a - b` to `return a + b`.\n##DONE##"}
  llm_mock(reply)
}

// Clears any queued LLM mocks and queues the scripted replies for `fixture`.
//
// `tool_format == "native"` selects the native tool-call script; every other
// value selects the text-format script. Throws "unknown fixture: <name>"
// (after the mocks have been cleared) when the fixture id is not recognized.
fn __seed_mock(fixture: string, tool_format: string, python: string) {
  llm_mock_clear()
  let native = tool_format == "native"

  if fixture == "python-add" {
    if native {
      __seed_python_add_native_mock(python)
    } else {
      __seed_python_add_text_mock(python)
    }
    return
  }
  if fixture == "cli-help-flag" {
    if native {
      __seed_cli_help_flag_native_mock(python)
    } else {
      __seed_cli_help_flag_text_mock(python)
    }
    return
  }
  if fixture == "test-output-first" {
    if native {
      __seed_test_output_first_native_mock(python)
    } else {
      __seed_test_output_first_text_mock(python)
    }
    return
  }
  if fixture == "docs-symbol-rename" {
    if native {
      __seed_docs_symbol_rename_native_mock(python)
    } else {
      __seed_docs_symbol_rename_text_mock(python)
    }
    return
  }
  if fixture == "read-only-audit" {
    if native {
      __seed_read_only_audit_native_mock()
    } else {
      __seed_read_only_audit_text_mock()
    }
    return
  }
  if fixture == "no-tool-diagnosis" {
    if native {
      __seed_no_tool_native_mock()
    } else {
      __seed_no_tool_text_mock()
    }
    return
  }
  throw "unknown fixture: " + fixture
}

// Packs a verification outcome into the record shape stored in the summary.
// `combined` is stdout immediately followed by stderr, with no separator.
fn __command_verification(
  success: bool,
  exit_code: int,
  stdout: string,
  stderr: string,
  duration_ms: int,
) {
  let merged_output = stdout + stderr
  let record = {
    success: success,
    exit_code: exit_code,
    stdout: stdout,
    stderr: stderr,
    combined: merged_output,
    duration_ms: duration_ms,
  }
  return record
}

// Runs `<python> -m unittest discover -s tests` with `root` as the working
// directory and returns the `command_run` result (30 s timeout, up to
// 12000 bytes of inline capture).
fn __run_unittest(root: string, python: string) {
  let invocation = {argv: [python, "-m", "unittest", "discover", "-s", "tests"], cwd: root}
  let run_options = {capture: {max_inline_bytes: 12000}, timeout_ms: 30000}
  return command_run(invocation, run_options)
}

// Verifies the `python-add` fixture: the unittest run must succeed and
// `math_utils.py` must contain the corrected `return a + b` body.
// Exit code, output, and duration are taken from the unittest run.
fn __verify_python_add(harness: Harness, root: string, python: string) {
  let run = __run_unittest(root, python)
  let module_text = harness.fs.read_text(path_join(root, "math_utils.py"))
  let has_fix = contains(module_text, "return a + b")
  let passed = run.success && has_fix
  return __command_verification(passed, run.exit_code, run.stdout, run.stderr, run.duration_ms)
}

// Verifies the `cli-help-flag` fixture by running `app.py --help` and
// `app.py --shout Ada`, then checking that help mentions `--shout`, the shout
// output contains "HELLO, ADA!", and the README mentions `--shout`.
//
// The reported exit code is the `--shout` run's code when that run failed,
// otherwise the `--help` run's code. Outputs of both runs are joined with a
// newline and their durations are summed.
fn __verify_cli_help_flag(harness: Harness, root: string, python: string) {
  let run_options = {capture: {max_inline_bytes: 12000}, timeout_ms: 30000}
  let help = command_run({argv: [python, "app.py", "--help"], cwd: root}, run_options)
  let shout = command_run({argv: [python, "app.py", "--shout", "Ada"], cwd: root}, run_options)
  let readme = harness.fs.read_text(path_join(root, "README.md"))

  // Command output is only inspected once both runs are known to have succeeded.
  let commands_ok = help.success && shout.success && contains(help.stdout, "--shout")
    && contains(shout.stdout, "HELLO, ADA!")
  let documented = contains(readme, "--shout")
  let passed = commands_ok && documented

  let exit_code = if shout.success {
    help.exit_code
  } else {
    shout.exit_code
  }
  let stdout_text = help.stdout + "\n" + shout.stdout
  let stderr_text = help.stderr + "\n" + shout.stderr
  let total_ms = help.duration_ms + shout.duration_ms
  return __command_verification(passed, exit_code, stdout_text, stderr_text, total_ms)
}

// Verifies the `test-output-first` fixture: the unittest run must succeed and
// `stats_utils.py` must contain the even-length averaging line.
// Exit code, output, and duration are taken from the unittest run.
fn __verify_test_output_first(harness: Harness, root: string, python: string) {
  let run = __run_unittest(root, python)
  let module_text = harness.fs.read_text(path_join(root, "stats_utils.py"))
  let has_fix = contains(module_text, "return (ordered[mid - 1] + ordered[mid]) / 2")
  let passed = run.success && has_fix
  return __command_verification(passed, run.exit_code, run.stdout, run.stderr, run.duration_ms)
}

// Verifies the `docs-symbol-rename` fixture: running `example.py` must succeed
// and print "Hello, Ada!", the example and README must mention
// `format_greeting` and no longer mention `make_greeting`, and `greeter.py`
// must still define `format_greeting`.
fn __verify_docs_symbol_rename(harness: Harness, root: string, python: string) {
  let run = command_run(
    {argv: [python, "example.py"], cwd: root},
    {capture: {max_inline_bytes: 12000}, timeout_ms: 30000},
  )
  let example_text = harness.fs.read_text(path_join(root, "example.py"))
  let readme_text = harness.fs.read_text(path_join(root, "README.md"))
  let greeter_text = harness.fs.read_text(path_join(root, "greeter.py"))

  // Stdout is only inspected when the run itself succeeded.
  let ran_ok = run.success && contains(run.stdout, "Hello, Ada!")
  let uses_new_name = contains(example_text, "format_greeting") && contains(readme_text, "format_greeting")
  let old_name_gone = !contains(example_text, "make_greeting") && !contains(readme_text, "make_greeting")
  let implementation_intact = contains(greeter_text, "def format_greeting")

  let passed = ran_ok && uses_new_name && old_name_gone && implementation_intact
  return __command_verification(passed, run.exit_code, run.stdout, run.stderr, run.duration_ms)
}

// Counts the entries in `result.tools.calls`; a nil result, nil `tools`, or
// nil `calls` counts as zero.
fn __tool_call_count(result) -> int {
  let calls = result?.tools?.calls ?? []
  return len(calls)
}

// Returns the agent's final answer text: `visible_text` when it is non-nil,
// otherwise `text`, otherwise the empty string.
//
// Access is nil-safe, matching `__tool_call_count`, so a nil `result` yields
// "" instead of failing on the field access.
fn __final_text(result) -> string {
  return result?.visible_text ?? result?.text ?? ""
}

// Verifies the `read-only-audit` fixture: exactly one tool call was made, the
// final answer contains AUDIT_OK, and both seeded files still carry their
// 30-second timeout text. The exit code is synthesized (0 on pass, 1 on
// fail), the final answer is reported as stdout, and the duration is 0.
fn __verify_read_only_audit(harness: Harness, root: string, result) {
  let readme_text = harness.fs.read_text(path_join(root, "README.md"))
  let settings_text = harness.fs.read_text(path_join(root, "settings.py"))
  let answer = __final_text(result)

  let single_call = __tool_call_count(result) == 1
  let token_present = contains(answer, "AUDIT_OK")
  let files_untouched = contains(readme_text, "30 seconds")
    && contains(settings_text, "DEFAULT_TIMEOUT_SECONDS = 30")
  let passed = single_call && token_present && files_untouched

  let exit_code = if passed {
    0
  } else {
    1
  }
  return __command_verification(passed, exit_code, answer, "", 0)
}

// Verifies the `no-tool-diagnosis` fixture: no tool calls were made and the
// final answer contains both PATCH_HINT and `return a + b`. The exit code is
// synthesized (0 on pass, 1 on fail), the final answer is reported as stdout,
// and the duration is 0.
fn __verify_no_tool_diagnosis(result) {
  let answer = __final_text(result)
  let no_calls = __tool_call_count(result) == 0
  let has_hint = contains(answer, "PATCH_HINT") && contains(answer, "return a + b")
  let passed = no_calls && has_hint

  let exit_code = if passed {
    0
  } else {
    1
  }
  return __command_verification(passed, exit_code, answer, "", 0)
}

// Dispatches to the verifier for `fixture` and returns its verification
// record. Throws "unknown fixture: <name>" for an unrecognized fixture id.
fn __verify_fixture(harness: Harness, root: string, fixture: string, python: string, result) {
  // These two fixtures are judged from the agent result, not by running Python.
  if fixture == "no-tool-diagnosis" {
    return __verify_no_tool_diagnosis(result)
  }
  if fixture == "read-only-audit" {
    return __verify_read_only_audit(harness, root, result)
  }

  // The remaining fixtures are judged by running Python in the workspace.
  if fixture == "docs-symbol-rename" {
    return __verify_docs_symbol_rename(harness, root, python)
  }
  if fixture == "test-output-first" {
    return __verify_test_output_first(harness, root, python)
  }
  if fixture == "cli-help-flag" {
    return __verify_cli_help_flag(harness, root, python)
  }
  if fixture == "python-add" {
    return __verify_python_add(harness, root, python)
  }
  throw "unknown fixture: " + fixture
}

// Returns the user task prompt for `fixture`. Prompts that mention the test
// command embed `<python> -m unittest discover -s tests`.
// Throws "unknown fixture: <name>" for an unrecognized fixture id.
fn __task_prompt(fixture: string, python: string) -> string {
  let unittest_cmd = python + " -m unittest discover -s tests"

  // Prompts with no interpolation.
  if fixture == "no-tool-diagnosis" {
    return "No tools are available. Given this snippet: `def add(a, b): return a - b`, and this failing expectation: `add(2, 3) == 5`, state the smallest code change. Include the exact token PATCH_HINT."
  }
  if fixture == "read-only-audit" {
    return "Read README.md and decide whether it matches the configured default timeout. Do not edit files. If no edits are needed, include the exact token AUDIT_OK in your final answer."
  }
  if fixture == "docs-symbol-rename" {
    return "The public helper was renamed to `format_greeting`. Update the docs and example to use the renamed symbol. Do not edit `greeter.py`; verify the example runs."
  }
  if fixture == "cli-help-flag" {
    return "Add a `--shout` flag to the greeting CLI. The flag should print the greeting in uppercase, appear in `--help`, and be documented in README.md. Verify it with the Python CLI."
  }

  // Prompts that quote the unittest command.
  if fixture == "test-output-first" {
    let lead = "Run the unittest suite first and use the failing output to choose the fix. Then make the smallest implementation change and re-run `"
    return lead + unittest_cmd + "`."
  }
  if fixture == "python-add" {
    let lead = "Fix the repository so the test suite passes. Inspect files before editing, make the smallest correct code change, then run `"
    return lead + unittest_cmd + "`."
  }
  throw "unknown fixture: " + fixture
}

// Returns the system prompt for `fixture`: a prompt-only assistant prompt for
// `no-tool-diagnosis`, and the tool-using agent prompt for every other value.
fn __system_prompt(fixture: string) -> string {
  return if fixture == "no-tool-diagnosis" {
    "You are a concise coding assistant. Answer from the prompt context only."
  } else {
    "You are a focused coding agent inside a tiny repository. Use tools for inspection, edits, and verification. Do not explain provider, model, harness, or system-prompt details to the user."
  }
}

// Runs the agent preset that matches `fixture` and returns its result.
//
// - `read-only-audit` uses `audit_agent`.
// - `no-tool-diagnosis` uses `summary_agent`, capped at a single iteration.
// - every other fixture uses `repair_agent`.
//
// The done sentinel "##DONE##" is configured and allowed only when
// `tool_format` is "text". `step_judge_config` is attached as `step_judge`
// when it is non-nil.
fn __run_fixture_agent(
  fixture: string,
  task: string,
  provider: string,
  model: string,
  tool_format: string,
  tools,
  max_iterations: int,
  step_judge_config,
) {
  let text_mode = tool_format == "text"
  let sentinel = if text_mode {
    "##DONE##"
  } else {
    nil
  }
  let base_options = {
    provider: provider,
    model: model,
    tool_format: tool_format,
    done_sentinel: sentinel,
    system: __system_prompt(fixture),
    tools: tools,
    max_iterations: max_iterations,
    iteration_budget: {mode: "fixed", initial: max_iterations, max: max_iterations, extend_by: 0},
    turn_policy: {allow_done_sentinel: text_mode, max_prose_chars: 30000},
    llm_options: {temperature: 0, max_tokens: 2048},
  }
  let options = if step_judge_config != nil {
    base_options + {step_judge: step_judge_config}
  } else {
    base_options
  }

  if fixture == "no-tool-diagnosis" {
    // Override the caller's iteration settings with a one-iteration budget.
    let single_turn = {max_iterations: 1, iteration_budget: {mode: "fixed", initial: 1, max: 1, extend_by: 0}}
    return summary_agent(task, options + single_turn)
  }
  if fixture == "read-only-audit" {
    return audit_agent(task, options)
  }
  return repair_agent(task, options)
}

// Reads back the fixture's files of interest from `root` so the summary can
// include their final contents. Returns an empty record for fixtures with no
// tracked files, which includes any unrecognized fixture id.
fn __final_files(harness: Harness, root: string, fixture: string) {
  if fixture == "python-add" {
    let math_utils = harness.fs.read_text(path_join(root, "math_utils.py"))
    return {math_utils_py: math_utils}
  }
  if fixture == "test-output-first" {
    let stats_utils = harness.fs.read_text(path_join(root, "stats_utils.py"))
    return {stats_utils_py: stats_utils}
  }
  if fixture == "cli-help-flag" {
    let app = harness.fs.read_text(path_join(root, "app.py"))
    let readme = harness.fs.read_text(path_join(root, "README.md"))
    return {app_py: app, readme_md: readme}
  }
  if fixture == "read-only-audit" {
    let settings = harness.fs.read_text(path_join(root, "settings.py"))
    let readme = harness.fs.read_text(path_join(root, "README.md"))
    return {settings_py: settings, readme_md: readme}
  }
  if fixture == "docs-symbol-rename" {
    let greeter = harness.fs.read_text(path_join(root, "greeter.py"))
    let example = harness.fs.read_text(path_join(root, "example.py"))
    let readme = harness.fs.read_text(path_join(root, "README.md"))
    return {greeter_py: greeter, example_py: example, readme_md: readme}
  }
  return {}
}

// Builds the schema-version-2 summary record for one fixture run.
//
// `passed` mirrors `verify.success`. `status` is the agent's own status when
// verification passed and "failed" otherwise. Transcript events are
// summarized by count and by kind, and the final workspace files are
// included.
fn __summary(
  harness: Harness,
  root: string,
  output_dir: string,
  fixture: string,
  provider: string,
  model: string,
  tool_format: string,
  result,
  verify,
  duration_ms: int,
) {
  let events = transcript_events(result.transcript)
  let passed = verify.success
  let status = if passed {
    result.status
  } else {
    "failed"
  }
  let answer = __final_text(result)
  let event_kinds = events.map({ event -> event.kind })
  let files = __final_files(harness, root, fixture)
  return {
    schema_version: 2,
    fixture_id: fixture,
    provider: provider,
    model: model,
    tool_format: tool_format,
    workspace_root: root,
    output_dir: output_dir,
    passed: passed,
    status: status,
    final_text: answer,
    duration_ms: duration_ms,
    llm: result.llm,
    tools: result.tools,
    transcript_event_count: len(events),
    transcript_event_kinds: event_kinds,
    verification: verify,
    final_files: files,
  }
}

// Entry point: parses CLI options, seeds the fixture workspace (and the LLM
// mocks when `--seed-mock` is given), runs the agent, verifies the outcome,
// and writes `summary.json`, `result.json`, and `transcript_events.jsonl`
// into the output directory. The summary is also printed as one JSON line.
//
// Returns 0 when verification passed and 1 otherwise. Throws the joined
// parser errors when argument parsing reports any.
fn main(harness: Harness) {
  let option_specs = [
    {kind: "option", name: "fixture", flags: ["--fixture"], required: true},
    {kind: "option", name: "workspace_root", flags: ["--workspace-root"]},
    {kind: "option", name: "output_dir", flags: ["--output-dir"], required: true},
    {kind: "option", name: "provider", flags: ["--provider"], default: "mock"},
    {kind: "option", name: "model", flags: ["--model"], default: "mock"},
    {kind: "option", name: "tool_format", flags: ["--tool-format"], default: "native"},
    {kind: "option", name: "max_iterations", flags: ["--max-iterations"], parse: "int", default: 8},
    {kind: "option", name: "python", flags: ["--python"], default: "python3"},
    {kind: "flag", name: "seed_mock", flags: ["--seed-mock"]},
    {kind: "option", name: "step_judge_json", flags: ["--step-judge-json"], default: ""},
  ]
  let args = parse_args(argv, option_specs)
  if len(args._errors) > 0 {
    throw join(args._errors, "\n")
  }

  let fixture = args.fixture
  let python = args.python
  let tool_format = args.tool_format
  let output_dir = args.output_dir

  // Workspace defaults to `<output_dir>/workspace` when no root is supplied.
  harness.fs.mkdir(output_dir)
  let root = args.workspace_root ?? path_join(output_dir, "workspace")
  __seed_workspace(harness, root, fixture)
  if args.seed_mock {
    __seed_mock(fixture, tool_format, python)
  }

  // The timed window covers step-judge parsing, prompt and tool construction,
  // and the agent run; verification happens after it closes.
  let started_ms = harness.clock.now_ms()
  let step_judge_config = if args.step_judge_json == "" {
    nil
  } else {
    json_parse(args.step_judge_json)
  }
  let task = __task_prompt(fixture, python)
  let tools = __fixture_tools(harness, root, python, fixture)
  let result = __run_fixture_agent(
    fixture,
    task,
    args.provider,
    args.model,
    tool_format,
    tools,
    args.max_iterations,
    step_judge_config,
  )
  let duration_ms = harness.clock.now_ms() - started_ms

  let verify = __verify_fixture(harness, root, fixture, python, result)
  let summary = __summary(
    harness,
    root,
    output_dir,
    fixture,
    args.provider,
    args.model,
    tool_format,
    result,
    verify,
    duration_ms,
  )

  let summary_path = path_join(output_dir, "summary.json")
  let result_path = path_join(output_dir, "result.json")
  let events_path = path_join(output_dir, "transcript_events.jsonl")
  write_json(summary_path, summary, {pretty: true})
  write_json(result_path, result, {pretty: true})
  write_jsonl(events_path, transcript_events(result.transcript))
  harness.stdio.println(json_stringify(summary))

  if summary.passed {
    return 0
  }
  return 1
}