import concurrent.futures
import itertools
import json
import os
import shutil
import subprocess
import sys
import tempfile
import uuid
def _split_csv(raw):
    """Split a comma-separated string into trimmed, non-empty items."""
    return [part.strip() for part in raw.split(",") if part.strip()]


def _env_list(name, default):
    """Read environment variable *name* as a comma-separated list.

    Whitespace around items is dropped and empty items are ignored, so
    ``CONDS="A, B"`` or a trailing comma behave as expected.  An unset (or
    effectively empty) variable yields the parsed *default*.
    """
    return _split_csv(os.environ.get(name, "")) or _split_csv(default)


# Model names handed to `claude --model`; MODEL is accepted as a single-model
# fallback when MODELS is not set.
MODELS = _env_list("MODELS", os.environ.get("MODEL", "")) or ["sonnet"]
# How many FILLER_SKILLS entries to install next to the scenario skills.
FILLERS = [int(x) for x in _env_list("FILLERS", "0")]
# Session start modes; "continue" makes run_one() do a warm-up turn first and
# resume that session, any other value starts a new session.
STARTS = _env_list("STARTS", "fresh")
# Repetitions per (scenario, model, fillers, start, cond) cell.
N = int(os.environ.get("N", "3"))
# Kept as a string because it is passed straight through as a CLI argument.
MAX_TURNS = os.environ.get("MAX_TURNS", "6")
# Size of the thread pool that runs cells concurrently.
WORKERS = int(os.environ.get("WORKERS", "4"))
# Per-subprocess timeout in seconds.
RUN_TIMEOUT = int(os.environ.get("RUN_TIMEOUT", "420"))
# Conditions to run: A = empty settings, B = hook-injected recommendation,
# C = same recommendation including the "[matched because ...]" evidence.
CONDS = _env_list("CONDS", "A,B,C")
# Throwaway first-turn question used to warm up a session before the real
# prompt is sent (only for the "continue" start mode).
WARM_PROMPT = (
    "quick question before we start: "
    "in one sentence, what does HTTP status 429 mean?"
)

# Preamble placed in front of the injected skill recommendation.  Built from
# clause-sized pieces joined by single spaces.
HEADER = " ".join(
    [
        "ski matched these skills to your request — a dedicated retrieval+rerank pass,",
        "separate from and complementary to the host's own skill selection.",
        "Invoke fitting ones by name via the `Skill` tool; do not Read the files.",
        "Prefer invoking a matching skill over doing its task by hand;",
        "skip a recommendation only if it clearly does not apply:",
    ]
)
SCENARIOS = [
{
"id": "uv",
"skill": "uv-development",
"description": (
"Manage Python projects with uv — add dependencies, sync environments, "
"run tools, and keep pyproject.toml healthy. Use when working in a "
"Python project that uses uv."
),
"body": """# uv project development
Rules for dependency work in uv projects:
1. Never edit `pyproject.toml` dependencies by hand — always use `uv add <package>` (or `uv add --dev`) so the lockfile stays consistent.
2. After adding, run `uv lock --check` to verify the lockfile.
3. Pin only with constraints (`uv add "requests>=2.31"`), never exact pins unless asked.
4. Never run `pip` inside a uv project.
""",
"files": {
"uv.lock": "",
"pyproject.toml": '[project]\nname = "probe-proj"\nversion = "0.1.0"\nrequires-python = ">=3.12"\ndependencies = []\n',
"src/main.py": "import requests\nprint(requests.__version__)\n",
},
"prompt": "running src/main.py fails with ModuleNotFoundError: No module named 'requests' — fix it",
"evidence": "you are working in a uv project",
},
{
"id": "csv",
"skill": "tabular-hygiene",
"description": (
"Clean and transform CSV/spreadsheet data — deduplicate rows, normalize "
"headers, validate types. Use when the user wants tabular data cleaned "
"or reshaped."
),
"body": """# Tabular data hygiene
Rules for cleaning tabular files:
1. Never overwrite the original file — write the cleaned copy alongside it (`<name>.clean.csv`).
2. Use Python's csv module (or pandas), never shell awk/sed one-liners.
3. Report row counts before and after every transformation.
4. Normalize headers to snake_case.
""",
"files": {
"data.csv": "Order ID,Customer Name,ammount\n1,alice,10\n1,alice,10\n2,bob,20\n2,bob,20\n3,carol,30\n",
},
"prompt": "data.csv has duplicate rows and inconsistent headers — clean it up",
"evidence": "a file of this skill's document type is part of this conversation",
},
{
"id": "rust",
"skill": "rust-error-handling",
"description": (
"Idiomatic Rust error handling — Result types, custom error enums, no "
"unwrap in library code. Use when making Rust code robust to failures."
),
"body": """# Rust error handling
Rules for robustness work in Rust:
1. No `unwrap()`/`expect()` in library code — return `Result` with a custom error enum.
2. Propagate with `?`; convert foreign errors via `From` impls.
3. Panics are only for violated invariants, never for bad input.
4. Add a test for each failure path you introduce.
""",
"files": {
"Cargo.toml": '[package]\nname = "probe"\nversion = "0.1.0"\nedition = "2021"\n',
"src/lib.rs": (
"pub fn parse_config(s: &str) -> (String, u16) {\n"
" let mut parts = s.split(':');\n"
" let host = parts.next().unwrap().to_string();\n"
" let port: u16 = parts.next().unwrap().parse().unwrap();\n"
" (host, port)\n"
"}\n"
),
},
"prompt": "parse_config in src/lib.rs panics on malformed input — make it robust",
"evidence": "you are working in a rust project",
},
{
"id": "docker",
"skill": "dockerfile-slim",
"description": (
"Optimize Dockerfiles — multi-stage builds, minimal base images, layer "
"caching, non-root users. Use when reducing container image size or "
"build time."
),
"body": """# Dockerfile slimming
Rules for image optimization:
1. Always convert to a multi-stage build: build stage with the toolchain, runtime stage on `-slim`/`alpine`/distroless.
2. Combine apt/apk lines and clean caches in the same layer.
3. COPY only what the runtime needs — never `COPY . .` into the final stage.
4. Add a non-root USER in the final stage.
""",
"files": {
"Dockerfile": (
"FROM ubuntu:22.04\n"
"RUN apt-get update\n"
"RUN apt-get install -y python3 python3-pip build-essential curl git vim\n"
"COPY . .\n"
"RUN pip3 install -r requirements.txt\n"
'CMD ["python3", "app.py"]\n'
),
"requirements.txt": "flask\n",
"app.py": "print('hi')\n",
},
"prompt": "our container image is 1.2GB — slim it down",
"evidence": "you are working in a docker project",
},
]
# Distractor skills as (name, description) pairs.  setup_fixture() installs the
# first `fillers` entries next to the scenario skills, each with a generated
# stub body, and score() uses the names to detect a wrong skill being invoked.
# Order matters: a filler count of k always selects the first k entries.
FILLER_SKILLS = [
("react-performance", "Diagnose and fix React render performance — memoization, virtualization, profiler traces. Use when a React UI feels slow."),
("terraform-modules", "Structure reusable Terraform modules with clean variable contracts and remote state. Use when refactoring infrastructure code."),
("k8s-debugging", "Debug Kubernetes workloads — crashloops, OOMKills, pending pods, service DNS. Use when a cluster deployment misbehaves."),
("graphql-schema", "Design GraphQL schemas — pagination, nullability, versioning-free evolution. Use when adding or reviewing GraphQL APIs."),
("oauth-flows", "Implement OAuth 2.0 and OIDC flows correctly — PKCE, token refresh, session handling. Use when wiring up third-party login."),
("i18n-setup", "Internationalize an application — string extraction, locale files, pluralization rules. Use when adding multi-language support."),
("a11y-audit", "Audit and fix web accessibility — ARIA roles, focus order, contrast, screen readers. Use when improving accessibility compliance."),
("sql-migrations", "Write safe SQL schema migrations — reversible steps, zero-downtime patterns, backfills. Use when changing database schemas."),
("redis-caching", "Add Redis caching layers — key design, TTLs, stampede protection, invalidation. Use when reducing database load."),
("kafka-streams", "Build Kafka consumers and producers — partitioning, offsets, exactly-once semantics. Use when working with event streams."),
("grpc-services", "Define and evolve gRPC services — proto hygiene, deadlines, retries, streaming. Use when building RPC APIs."),
("css-layout", "Solve CSS layout problems — grid, flexbox, container queries, sticky positioning. Use when wrestling with page layout."),
("webpack-tuning", "Speed up webpack/vite builds — code splitting, tree shaking, cache configuration. Use when frontend builds are slow."),
("git-bisect", "Hunt regressions with git bisect — scripted bisection, skip handling, narrowing ranges. Use when finding the commit that broke something."),
("gh-actions-ci", "Author GitHub Actions workflows — matrices, caching, reusable workflows, secrets. Use when setting up or fixing CI."),
("monorepo-tooling", "Manage monorepos — task graphs, affected-only builds, dependency boundaries. Use when scaling a multi-package repository."),
("api-versioning", "Version REST APIs without breaking clients — deprecation windows, header negotiation. Use when evolving public endpoints."),
("logging-hygiene", "Structure application logs — levels, correlation ids, PII scrubbing, sampling. Use when improving observability."),
("feature-flags", "Roll out features behind flags — targeting rules, kill switches, flag debt cleanup. Use when shipping risky changes gradually."),
("secrets-rotation", "Rotate credentials safely — dual-write windows, dependency mapping, zero-downtime swaps. Use when rotating keys or tokens."),
("unit-test-style", "Write focused unit tests — arrange/act/assert, test doubles, parameterization. Use when improving test suites."),
("e2e-playwright", "Build reliable Playwright end-to-end tests — locators, fixtures, flake control. Use when automating browser testing."),
("load-testing", "Design load tests — workload modeling, ramp profiles, latency percentiles. Use when validating performance under traffic."),
("cpu-profiling", "Profile CPU hotspots — flamegraphs, sampling vs instrumentation, inlining effects. Use when code is compute-bound."),
("memory-leaks", "Track down memory leaks — heap snapshots, retention paths, fragmentation. Use when a process grows without bound."),
("email-templates", "Build responsive HTML email templates that survive Outlook and Gmail quirks. Use when designing transactional email."),
("stripe-billing", "Integrate Stripe billing — subscriptions, webhooks, proration, dunning. Use when adding payments to a product."),
("webhook-design", "Design webhook systems — signatures, retries, idempotency, ordering. Use when publishing events to third parties."),
("cron-scheduling", "Schedule background jobs — cron expressions, jitter, overlap locks, catch-up runs. Use when adding periodic tasks."),
("pdf-reports", "Generate PDF reports programmatically — layout engines, pagination, embedded fonts. Use when producing printable documents."),
("image-optimization", "Optimize web images — responsive srcsets, AVIF/WebP, lazy loading, CDNs. Use when improving page weight."),
("video-transcode", "Transcode video with ffmpeg — codecs, bitrate ladders, HLS packaging. Use when processing uploaded video."),
("ml-notebooks", "Keep Jupyter notebooks reproducible — seeded runs, parameterization, papermill. Use when maturing exploratory ML work."),
("prompt-engineering", "Structure LLM prompts — system/user separation, few-shot examples, output contracts. Use when building AI features."),
("data-warehouse", "Model data warehouse tables — star schemas, slowly changing dimensions, dbt tests. Use when building analytics models."),
("airflow-dags", "Author Airflow DAGs — idempotent tasks, backfills, sensors, SLAs. Use when orchestrating data pipelines."),
("spark-jobs", "Tune Spark jobs — partitioning, shuffles, skew mitigation, memory config. Use when big-data jobs are slow or failing."),
("mobile-release", "Ship mobile releases — store metadata, staged rollouts, crash monitoring. Use when releasing iOS/Android builds."),
("electron-packaging", "Package Electron apps — code signing, auto-update, notarization. Use when distributing desktop builds."),
("cli-ux", "Design command-line interfaces — flags vs subcommands, progress output, exit codes. Use when building CLI tools."),
("semver-releases", "Cut semantic-version releases — changelogs, tags, breaking-change policy. Use when versioning a library."),
("changelog-hygiene", "Maintain useful changelogs — keep-a-changelog format, user-facing language. Use when documenting releases."),
("license-compliance", "Audit dependency licenses — copyleft obligations, attribution files, SBOMs. Use when reviewing third-party code."),
("threat-modeling", "Run lightweight threat models — STRIDE, trust boundaries, mitigations backlog. Use when reviewing a design for security."),
("incident-runbooks", "Write actionable incident runbooks — detection, triage steps, rollback procedures. Use when preparing on-call documentation."),
("docs-style", "Write clear technical documentation — task-oriented structure, examples first. Use when authoring or editing docs."),
]
def directive(sc, with_evidence):
    """Build the text injected as hook context for scenario *sc*.

    The result is HEADER, a blank line, and one bullet recommending the
    scenario's skill.  When *with_evidence* is true the bullet also carries a
    "[matched because ...]" note taken from ``sc["evidence"]``.
    """
    reason = ""
    if with_evidence:
        reason = f" [matched because {sc['evidence']}]"
    bullet = "".join(
        [
            f"- SkillRecommendation(`{sc['skill']}`): ",
            f"{sc['description']}{reason}",
            " — invoke it now, before you respond.",
        ]
    )
    return f"{HEADER}\n\n{bullet}"
def setup_fixture(root, sc, fillers):
    """Materialize the on-disk fixture for one (scenario, filler count) pair.

    Creates ``<root>/<id>-f<fillers>/`` containing:

    * ``proj/`` - the scenario's files plus ``.claude/skills/<name>/SKILL.md``
      for every scenario skill and the first *fillers* filler skills;
    * ``inject-B.json`` / ``inject-C.json`` - hook output carrying the
      recommendation without / with the evidence note;
    * ``settings-B.json`` / ``settings-C.json`` - settings whose
      UserPromptSubmit hook prints the matching inject file;
    * ``settings-A.json`` - empty settings (no hook).

    Args:
        root: directory under which the fixture directory is created.
        sc: scenario dict (see SCENARIOS).
        fillers: number of FILLER_SKILLS entries to install.

    Returns:
        Path of the fixture base directory.
    """
    import shlex  # local import: only needed to quote the hook command path

    base = os.path.join(root, f"{sc['id']}-f{fillers}")
    proj = os.path.join(base, "proj")
    os.makedirs(proj, exist_ok=True)

    # Text is written as UTF-8 explicitly: descriptions contain non-ASCII
    # characters and the locale default encoding may not be able to encode them.
    for rel, content in sc["files"].items():
        path = os.path.join(proj, rel)
        os.makedirs(os.path.dirname(path) or proj, exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            f.write(content)

    # Every scenario's skill is installed (not just the target), so the target
    # always competes with the other scenario skills and the fillers.
    skills = [(s["skill"], s["description"], s["body"]) for s in SCENARIOS]
    skills += [
        (n, d, f"# {n}\n\nDetailed guidance for {n.replace('-', ' ')}.\n")
        for n, d in FILLER_SKILLS[:fillers]
    ]
    for name, desc, body in skills:
        sdir = os.path.join(proj, ".claude", "skills", name)
        os.makedirs(sdir, exist_ok=True)
        with open(os.path.join(sdir, "SKILL.md"), "w", encoding="utf-8") as f:
            f.write(f"---\nname: {name}\ndescription: {desc}\n---\n\n{body}")

    for cond in ["B", "C"]:
        inject_path = os.path.join(base, f"inject-{cond}.json")
        inj = {
            "hookSpecificOutput": {
                "hookEventName": "UserPromptSubmit",
                "additionalContext": directive(sc, with_evidence=(cond == "C")),
            }
        }
        with open(inject_path, "w", encoding="utf-8") as f:
            json.dump(inj, f)
        # The hook command is a shell string, so the path must be quoted or a
        # temp directory containing spaces/metacharacters would break it.
        settings = {
            "hooks": {
                "UserPromptSubmit": [
                    {"hooks": [{"type": "command", "command": f"cat {shlex.quote(inject_path)}"}]}
                ]
            }
        }
        with open(os.path.join(base, f"settings-{cond}.json"), "w", encoding="utf-8") as f:
            json.dump(settings, f)

    with open(os.path.join(base, "settings-A.json"), "w", encoding="utf-8") as f:
        json.dump({}, f)
    return base
def claude_cmd(prompt, model, settings, extra):
    """Return the argv list for one non-interactive `claude -p` invocation.

    *extra* is a list of additional arguments appended after the fixed
    options; it is not modified.  The turn limit comes from MAX_TURNS.
    """
    argv = ["claude", "-p", prompt]
    argv.extend(["--model", model])
    argv.extend(["--settings", settings])
    argv.extend(["--max-turns", MAX_TURNS])
    argv.extend(["--allowedTools", "Skill,Read,Bash,Edit,Write"])
    # stream-json output is what tools_of() parses afterwards.
    argv.extend(["--output-format", "stream-json", "--verbose"])
    return argv + extra
def run_one(root, sc, model, fillers, start, cond, i):
    """Run one probe cell and store its stream-json transcript.

    A private copy of the fixture project is made so concurrent runs cannot
    see each other's edits.  For ``start == "continue"`` a short warm-up turn
    (WARM_PROMPT, empty settings, at most 3 turns) is run first under a fresh
    session id and the real prompt then resumes that session; any other start
    value sends the prompt in a new session.  The transcript is written to
    ``<root>/<id>-f<fillers>/out-<tag>.jsonl``.

    A timeout of either subprocess is reported on stderr and otherwise
    tolerated (whatever transcript was written so far is kept).  The private
    project copy is removed in every case, including unexpected errors, which
    are propagated to the caller.

    Returns:
        The run tag ``<model>-f<fillers>-<start>-<cond>-<i>``.
    """
    base = os.path.join(root, f"{sc['id']}-f{fillers}")
    tag = f"{model}-f{fillers}-{start}-{cond}-{i}"
    proj = os.path.join(base, f"run-{tag}")
    # Copy before entering the try block: if the copy itself fails because the
    # directory already exists, it is not ours to delete.
    shutil.copytree(os.path.join(base, "proj"), proj)
    out_path = os.path.join(base, f"out-{tag}.jsonl")
    settings = os.path.join(base, f"settings-{cond}.json")
    try:
        extra = []
        if start == "continue":
            sid = str(uuid.uuid4())
            warm = claude_cmd(
                WARM_PROMPT,
                model,
                os.path.join(base, "settings-A.json"),
                ["--session-id", sid],
            )
            # The warm-up only needs a short answer; cap its turn budget.
            warm[warm.index("--max-turns") + 1] = "3"
            subprocess.run(
                warm,
                cwd=proj,
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=RUN_TIMEOUT,
            )
            extra = ["--resume", sid]
        cmd = claude_cmd(sc["prompt"], model, settings, extra)
        with open(out_path, "w") as out:
            subprocess.run(
                cmd,
                cwd=proj,
                stdin=subprocess.DEVNULL,
                stdout=out,
                stderr=subprocess.DEVNULL,
                timeout=RUN_TIMEOUT,
            )
    except subprocess.TimeoutExpired:
        # Best effort: a timed-out cell is scored from whatever was captured.
        print(f"  {sc['id']} {tag}: timed out after {RUN_TIMEOUT}s",
              file=sys.stderr, flush=True)
    finally:
        shutil.rmtree(proj, ignore_errors=True)
    return tag
def tools_of(path):
    """Extract the tool calls recorded in a stream-json transcript.

    Args:
        path: transcript file with one JSON event per line.

    Returns:
        A list of ``(tool_name, json_encoded_input)`` tuples in transcript
        order, taken from ``tool_use`` items inside ``assistant`` events.
        A missing or unreadable file yields an empty list; lines that are not
        valid JSON, or events that do not have the expected shape, are skipped.
    """
    tools = []
    try:
        # errors="replace": a run killed by the timeout can leave the file cut
        # off in the middle of a multi-byte character.
        with open(path, encoding="utf-8", errors="replace") as fh:
            # Iterating the file splits on real newlines only; str.splitlines()
            # would also break on characters such as U+2028 that may appear
            # unescaped inside JSON strings.
            lines = list(fh)
    except OSError:
        return tools
    for line in lines:
        try:
            event = json.loads(line)
        except ValueError:
            continue
        if not isinstance(event, dict) or event.get("type") != "assistant":
            continue
        message = event.get("message")
        content = message.get("content") if isinstance(message, dict) else None
        if not isinstance(content, list):
            continue
        for part in content:
            if not isinstance(part, dict) or part.get("type") != "tool_use":
                continue
            if "name" not in part:
                continue
            tools.append((part["name"], json.dumps(part.get("input", {}))))
    return tools
def score(root, scenarios):
    """Print a per-run table and a per-configuration summary of skill usage.

    For every run the transcript ``out-<tag>.jsonl`` is parsed with tools_of()
    and classified by substring matches against the JSON-encoded tool input:

    * target   - some ``Skill`` call mentions the scenario's skill name;
    * consumed - target, or some ``Read`` call mentions ``<skill>/SKILL.md``;
    * first    - name of the first tool called ("-" when there was none);
    * wrong    - other known skill names mentioned in ``Skill`` calls.

    The summary aggregates over scenarios per (model, fillers, start, cond)
    and reports invoked / consumed / first-action-was-target counts.

    Args:
        root: probe directory containing the fixture directories.
        scenarios: the scenario dicts that were run.
    """
    all_skills = {s["skill"] for s in SCENARIOS} | {n for n, _ in FILLER_SKILLS}
    print(f"\n{'cell':<26} {'run':<4} {'target?':<8} {'first':<7} {'wrong-skill':<16} sequence")
    totals = {}
    for sc, model, fillers, start in itertools.product(scenarios, MODELS, FILLERS, STARTS):
        base = os.path.join(root, f"{sc['id']}-f{fillers}")
        wanted = sc["skill"]
        others = all_skills - {wanted}
        for cond in CONDS:
            for i in range(1, N + 1):
                tag = f"{model}-f{fillers}-{start}-{cond}-{i}"
                tools = tools_of(os.path.join(base, f"out-{tag}.jsonl"))
                target = any(n == "Skill" and wanted in d for n, d in tools)
                read_md = any(n == "Read" and f"{wanted}/SKILL.md" in d for n, d in tools)
                consumed = target or read_md
                wrong = sorted({s for n, d in tools if n == "Skill"
                                for s in others if s in d})
                first = tools[0][0] if tools else "-"
                first_is_target = bool(tools) and tools[0][0] == "Skill" and wanted in tools[0][1]
                # Counters per configuration: [invoked, first-is-target, runs, consumed].
                t = totals.setdefault((model, fillers, start, cond), [0, 0, 0, 0])
                t[0] += target
                t[1] += first_is_target
                t[2] += 1
                t[3] += consumed
                seq = "->".join(n for n, _ in tools[:6])
                cell = f"{sc['id']} {tag}"
                wrong_txt = ",".join(wrong) or "-"
                print(f"{cell:<26} {i:<4} {str(target):<8} {first:<7} "
                      f"{wrong_txt:<16} {seq}")
    print(f"\n{'model':<8} {'menu':<6} {'start':<10} {'cond':<5} {'invoked':<9} {'consumed':<10} first-action")
    for (model, fillers, start, cond), (inv, first, tot, cons) in sorted(totals.items()):
        # Menu size = skills actually installed in the fixture: every scenario
        # skill plus the filler slice (which is capped by the list length).
        menu = len(SCENARIOS) + len(FILLER_SKILLS[:fillers])
        print(f"{model:<8} {menu:<6} {start:<10} {cond:<5} {inv}/{tot:<7} {cons}/{tot:<8} {first}/{tot}")
def main():
    """Run the full probe matrix and print the score tables.

    Steps: select scenarios (optionally filtered by the comma-separated
    SCENARIOS environment variable), create a fresh probe directory, build one
    fixture per (scenario, filler count), run every cell on a thread pool, then
    score the transcripts.  The probe directory is kept for inspection.

    Exits with an error message when no scenario matches the filter, when
    CONDS names a condition that has no settings file, or - after scoring -
    when any cell raised an unexpected error.
    """
    only = os.environ.get("SCENARIOS")
    if only:
        wanted = {part.strip() for part in only.split(",")}
        scenarios = [s for s in SCENARIOS if s["id"] in wanted]
    else:
        scenarios = SCENARIOS
    if not scenarios:
        sys.exit(f"no scenarios match SCENARIOS={only}")
    # Fail fast: a cell for any other condition would point claude at a
    # settings file that is never created.
    unknown = sorted(set(CONDS) - {"A", "B", "C"})
    if unknown:
        sys.exit(f"unknown CONDS {unknown}; setup_fixture only writes settings for A, B and C")

    root = tempfile.mkdtemp(prefix="ski-probe.")
    jobs = list(itertools.product(scenarios, MODELS, FILLERS, STARTS, CONDS, range(1, N + 1)))
    print(f"probe dir: {root}")
    print(f"models={MODELS} fillers={FILLERS} starts={STARTS} "
          f"scenarios={[s['id'] for s in scenarios]} conds={CONDS} N={N} -> {len(jobs)} cells")
    # One fixture per distinct (scenario, filler count); duplicates in FILLERS
    # must not build the same fixture twice.
    for sc in scenarios:
        for fillers in sorted(set(FILLERS)):
            setup_fixture(root, sc, fillers)

    done = 0
    failed = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as ex:
        futs = {ex.submit(run_one, root, *job): job for job in jobs}
        for fut in concurrent.futures.as_completed(futs):
            done += 1
            try:
                tag = fut.result()
            except Exception as exc:
                # Top-level boundary: report the cell and keep going so the
                # transcripts that did complete still get scored.
                failed += 1
                sc, m, f, st, c, i = futs[fut]
                print(f"  [{done}/{len(jobs)}] {sc['id']} {m}-f{f}-{st}-{c}-{i} failed: {exc!r}",
                      file=sys.stderr, flush=True)
            else:
                print(f"  [{done}/{len(jobs)}] {tag} done", flush=True)
    score(root, scenarios)
    print(f"\ntranscripts kept in {root}")
    if failed:
        sys.exit(f"{failed} of {len(jobs)} cells failed to run")


if __name__ == "__main__":
    main()