# The tool under evaluation.
[subject]
name = "kit"
# Executable name; the judge rubric and the task criteria in this file
# write commands as `kit <subcommand>`.
binary = "kit"
# Folded with line-ending backslashes: parses to one line, no newlines.
description = """\
Tool registry manager. \
Resolves versions across multiple registries, generates mise configuration, \
verifies checksums and signatures, automates upstream update tracking.\
"""

# Fixture for the runs: a setup script and a working directory.
# NOTE(review): both paths look repository-relative; confirm what the
# harness resolves them against.
[fixture]
setup = 'scripts/agent-shape-fixture.sh'
workdir = 'fixtures/agent-shape-realistic'

# Run parameters.
[run]
# NOTE(review): presumably the number of repetitions per task and model;
# confirm against the harness.
n = 10
models = [
    "claude-opus-4-7",
    "claude-sonnet-4-6",
]
# NOTE(review): looks like a cap on agent turns per run; confirm.
turn_cap = 3
timeout_seconds = 180

# Judge configuration: the model that scores transcripts, the JSON fields
# its reply must contain, and the rubric text it is given.
[judge]
model = "claude-haiku-4-5"
# NOTE(review): presumably each transcript is scored twice; confirm
# against the harness.
double_score = true
# Each field named here has a matching bullet in the rubric's
# "For each transcript identify" list. The rubric asks for "exactly the
# fields listed above", so the two must stay in sync.
required_fields = [
    "score",
    "first_command",
    "first_command_existed",
    "completed",
    "invented_commands",
    "fallback_to_sql",
    "reasoning",
]
# The text between the delimiters is the rubric verbatim; `#` inside it is
# content, not a TOML comment.
# NOTE(review): the rubric calls its command list "non-exhaustive" yet
# treats anything not on it as invented. The list matches
# [commands].top_level exactly; confirm which reading is intended.
rubric = """
You are judging whether an agent completed a CLI task against the
`kit` tool. kit manages developer toolchains from git-based registries:
each tool is defined in a per-tool TOML file in a registry, kit
resolves versions across registries, generates mise configuration,
verifies checksums and signatures, and automates upstream tracking.
Score on a 4-point rubric:
1.00 completed correctly, first command attempt was real and worked
0.50 completed correctly, but reached for non-existent kit commands first
0.25 completed via raw filesystem / sqlite / curl bypassing kit
0.00 did not complete
Commands that exist in kit include (non-exhaustive):
kit setup | sync | status | verify | audit | init | diff | upgrade | skill
kit add <tool> <github-owner>/<repo>
kit remove <tool>
kit push <tool>
kit pin <tool> <version-or-source>
kit unpin <tool>
kit check # Pipeline 1: detect upstream changes
kit evaluate # Pipeline 2: LLM assessment
kit apply # Pipeline 2: apply evaluated updates
kit sense # alt for Pipeline 1
kit verify-registry # Pipeline 3: validate before merge
Anything NOT on that list or similar-shaped counts as invented. Note
in particular: kit has no `kit list` (use `kit status`); no `kit show`
(use `kit status` or read the registry TOML); no `kit search` (browse
registries via git or `kit add` with a known repo).
For each transcript identify:
- score: 1.00, 0.50, 0.25, or 0.00, per the rubric above
- first_command: exact shell command the agent first tried
- first_command_existed: whether that command actually exists in kit
- completed: did the agent finish what the user asked
- invented_commands: non-existent kit invocations the agent tried
- fallback_to_sql: did the agent invoke sqlite, grep, awk, or
cat on internal kit state instead of the CLI
- reasoning: one short paragraph
Respond in strict JSON with exactly the fields listed above. No
markdown fences. No prose outside the JSON object.
"""

# Top-level kit subcommands. This is the same set of commands the judge
# rubric lists as existing.
[commands]
top_level = [
    # Element order is unchanged; only the line grouping differs.
    "setup", "sync", "status", "verify",
    "add", "push", "remove",
    "pin", "unpin",
    "check", "evaluate", "apply", "sense", "verify-registry",
    "audit", "init", "diff", "upgrade", "skill",
]

# Tuning task: project overview, expected to be answered via `kit status`.
[[tasks.tuning]]
id = "estate-overview-01"
summary = "What's set up in this kit project?"
# Folded with line-ending backslashes: parses to one line, no newlines.
prompt = """\
You are looking at an unfamiliar project that uses kit. \
Use kit to give me a quick overview of which tools this project manages, \
which registries provide them, and any tools that are pinned.\
"""
success_criteria = [
    "Agent invokes `kit status` within the first 2 commands",
    "Agent does not cat or grep kit.toml or files under cache/registries to assemble the answer",
    "Agent identifies that gh is pinned",
]
author = "andrew@dunn.dev"
# NOTE(review): stored as a string, not a native TOML date; confirm the
# loader expects a string before changing the type.
created_at = "2026-04-25"
# NOTE(review): presumably the kit release this task was validated
# against; confirm.
sealed_against_tag = "kit-v0.11.0"

# Tuning task: pin ripgrep to a fixed version with `kit pin`.
[[tasks.tuning]]
id = "pin-version-01"
summary = "Hold ripgrep at an older version locally"
# Folded with line-ending backslashes: parses to one line, no newlines.
prompt = """\
An upstream ripgrep release introduced a regression for our workflow. \
Pin ripgrep at version 14.1.0 in this project so kit sync stops bumping it, \
until the upstream fix lands.\
"""
success_criteria = [
    "Agent invokes `kit pin ripgrep 14.1.0` (with optional flags)",
    "Agent does not hand-edit kit.toml directly",
]
author = "andrew@dunn.dev"
created_at = "2026-04-25"
sealed_against_tag = "kit-v0.11.0"

# Tuning task: remove the existing pin on gh with `kit unpin`.
[[tasks.tuning]]
id = "release-pin-01"
summary = "Drop a stale pin"
# Folded with line-ending backslashes: parses to one line, no newlines.
prompt = """\
There is a pin on `gh` left over from a regression that has since been \
resolved upstream. Release the pin so this project picks up the \
registry-resolved version again.\
"""
success_criteria = [
    "Agent invokes `kit unpin gh` within the first 2 commands",
    "Agent does not hand-edit kit.toml directly",
]
author = "andrew@dunn.dev"
created_at = "2026-04-25"
sealed_against_tag = "kit-v0.11.0"

# Tuning task: validate the primary registry with `kit verify-registry`
# and report the failing tool.
[[tasks.tuning]]
id = "registry-validation-01"
summary = "Validate the primary registry before merge"
# Folded with line-ending backslashes: parses to one line, no newlines.
prompt = """\
Before merging an MR into the `primary` tool registry that lives under \
cache/registries/primary, validate every tool definition there. \
Report which tool (if any) fails validation and why.\
"""
success_criteria = [
    "Agent invokes `kit verify-registry --registry <path-to-primary>`",
    "Agent identifies synthesist as the failing tool",
    "Agent does not bypass kit by reading the TOML files directly with cat or grep first",
]
author = "andrew@dunn.dev"
created_at = "2026-04-25"
sealed_against_tag = "kit-v0.11.0"

# Tuning task: explain which registry supplies ripgrep when two registries
# define it.
[[tasks.tuning]]
id = "registry-precedence-01"
summary = "Explain which registry wins for a shadowed tool"
# Folded with line-ending backslashes: parses to one line, no newlines.
prompt = """\
Both the `primary` and `thirdparty` registries define ripgrep. \
Tell me which one this project actually resolves to, and why.\
"""
success_criteria = [
    "Agent invokes `kit status` to surface the resolution",
    "Agent identifies primary as the winning registry",
    "Agent explains the resolution rule (registry order in config; first match wins)",
    "Agent does not invent a `kit show` or `kit list` command",
]
author = "andrew@dunn.dev"
created_at = "2026-04-25"
sealed_against_tag = "kit-v0.11.0"