harn_cli/cli/eval.rs
1//! Clap definitions for `harn eval` and its subcommands.
2//!
3//! The bare form `harn eval <path>` evaluates a run record, run directory,
4//! eval manifest, or `.harn` pipeline (legacy entrypoint, dispatched through
5//! `eval_run_record`). The `harn eval prompt <file> --fleet <models>`
6//! subcommand renders (and optionally runs / judges) a single
7//! `.harn.prompt` template against a fleet of models so authors can compare
8//! the wire envelope each capability profile materializes.
9
10use std::path::PathBuf;
11
12use clap::{Args, Subcommand, ValueEnum};
13
14#[derive(Debug, Args)]
15#[command(args_conflicts_with_subcommands = true)]
16pub struct EvalArgs {
17 /// Run record path, run directory, eval manifest path, or `.harn` pipeline.
18 /// Required unless a subcommand (e.g. `prompt`) is used.
19 pub path: Option<String>,
20 /// Optional baseline run record for diffing.
21 #[arg(long)]
22 pub compare: Option<String>,
23 /// Run a pipeline twice and compare the baseline against this structural experiment.
24 #[arg(long = "structural-experiment")]
25 pub structural_experiment: Option<String>,
26 /// Replay LLM responses from a JSONL fixture file when `path` is a `.harn` pipeline.
27 #[arg(
28 long = "llm-mock",
29 value_name = "PATH",
30 conflicts_with = "llm_mock_record"
31 )]
32 pub llm_mock: Option<String>,
33 /// Record executed LLM responses into a JSONL fixture file when `path` is a `.harn` pipeline.
34 #[arg(
35 long = "llm-mock-record",
36 value_name = "PATH",
37 conflicts_with = "llm_mock"
38 )]
39 pub llm_mock_record: Option<String>,
40 /// Positional arguments forwarded to `harn run <pipeline.harn> -- ...` when
41 /// `path` is a pipeline file and `--structural-experiment` is set.
42 #[arg(last = true)]
43 pub argv: Vec<String>,
44 #[command(subcommand)]
45 pub command: Option<EvalCommand>,
46}
47
48#[derive(Debug, Subcommand)]
49pub enum EvalCommand {
50 /// Benchmark coding-agent fixtures across providers and tool formats.
51 CodingAgent(EvalCodingAgentArgs),
52 /// Run deterministic context-engineering modes over task fixtures.
53 Context(EvalContextArgs),
54 /// Render and optionally run a `.harn.prompt` across a fleet of models.
55 Prompt(EvalPromptArgs),
56 /// Gate skill/guidance variants on contamination-safe held-out eval results.
57 #[command(name = "skill-gate", visible_alias = "skill_gate")]
58 SkillGate(EvalSkillGateArgs),
59 /// Measure pre-turn scope-triage savings and false-positive rates.
60 #[command(name = "scope_triage", visible_alias = "scope-triage")]
61 ScopeTriage(EvalScopeTriageArgs),
62 /// Run tool-call accuracy, latency, and cost evals over a dataset.
63 ToolCalls(EvalToolCallsArgs),
64}
65
66#[derive(Debug, Args)]
67pub struct EvalContextArgs {
68 /// Context eval manifest JSON or TOML.
69 pub manifest: PathBuf,
70 /// Output directory for summary.json, per_run.jsonl, and summary.md.
71 #[arg(long)]
72 pub output: Option<PathBuf>,
73 /// Print the aggregate summary JSON to stdout.
74 #[arg(long)]
75 pub json: bool,
76}
77
78#[derive(Debug, Args)]
79pub struct EvalSkillGateArgs {
80 /// Skill gate manifest JSON or TOML.
81 pub manifest: PathBuf,
82 /// Output directory for summary.json, per_case.jsonl, receipt.json, and summary.md.
83 #[arg(long)]
84 pub output: Option<PathBuf>,
85 /// Print the aggregate report JSON to stdout.
86 #[arg(long)]
87 pub json: bool,
88}
89
90#[derive(Debug, Args)]
91pub struct EvalScopeTriageArgs {
92 /// Scope-triage dataset JSON.
93 #[arg(long, default_value = "evals/scope_triage/dataset.json")]
94 pub dataset: PathBuf,
95 /// Output directory for summary.json, per_case.jsonl, and summary.md.
96 #[arg(long)]
97 pub output: Option<PathBuf>,
98 /// Print the aggregate summary JSON to stdout.
99 #[arg(long)]
100 pub json: bool,
101 /// Run the live default classifier model instead of the deterministic reference classifier.
102 #[arg(long)]
103 pub live: bool,
104 /// Live classifier model selector.
105 #[arg(long, default_value = "ollama:qwen3:1.7b")]
106 pub model: String,
107 /// Confidence threshold below which labels become escalate.
108 #[arg(long = "confidence-threshold", default_value_t = 0.65)]
109 pub confidence_threshold: f64,
110 /// Stop after N cases, useful for smoke runs.
111 #[arg(long = "max-cases")]
112 pub max_cases: Option<usize>,
113}
114
115#[derive(Debug, Args)]
116pub struct EvalCodingAgentArgs {
117 /// Fixture ids to run (comma-separated, repeatable). Use `all` for the full suite.
118 #[arg(long = "fixture", value_delimiter = ',', default_value = "all")]
119 pub fixtures: Vec<String>,
120 /// Model selectors to run (comma-separated, repeatable). Each entry may be
121 /// an alias, `provider:model`, or `provider=...,model=...`.
122 #[arg(long = "model", value_delimiter = ',', default_value = "mock:mock")]
123 pub models: Vec<String>,
124 /// Tool-call rendering modes to compare.
125 #[arg(
126 long = "tool-format",
127 value_delimiter = ',',
128 default_value = "native,text"
129 )]
130 pub tool_formats: Vec<String>,
131 /// Output directory for summary.json, per_run.jsonl, transcripts, and markdown reports.
132 #[arg(long)]
133 pub output: Option<PathBuf>,
134 /// Optional .env file(s) to load for provider credentials. Values are never written to artifacts.
135 #[arg(long = "env-file")]
136 pub env_files: Vec<PathBuf>,
137 /// Append reachable local Ollama/llama.cpp/MLX/vLLM models to the selected matrix.
138 #[arg(long = "include-local")]
139 pub include_local: bool,
140 /// Restrict local discovery to one provider id. Repeatable.
141 #[arg(long = "local-provider")]
142 pub local_providers: Vec<String>,
143 /// Maximum discovered local models to append.
144 #[arg(long = "max-local-models", default_value_t = 2)]
145 pub max_local_models: usize,
146 /// Leave newly-loaded Ollama models running after each local benchmark run.
147 #[arg(long = "keep-local-after-run")]
148 pub keep_local_after_run: bool,
149 /// Stop after N matrix entries, useful for cost-capped smoke runs.
150 #[arg(long = "max-runs")]
151 pub max_runs: Option<usize>,
152 /// Maximum repair-agent loop iterations per run.
153 #[arg(long = "max-iterations", default_value_t = 8)]
154 pub max_iterations: usize,
155 /// Python executable used by the fixture and verification command.
156 #[arg(long, default_value = "python3")]
157 pub python: String,
158 /// Treat missing credentials as an error instead of skipping the run.
159 #[arg(long = "fail-on-unauthorized")]
160 pub fail_on_unauthorized: bool,
161 /// Print the aggregate summary JSON to stdout.
162 #[arg(long)]
163 pub json: bool,
164 /// Optional step_judge config applied to every run in this invocation.
165 /// Accepts a preset name (`symmetric-cheap`, `asymmetric`,
166 /// `symmetric-strong`) which expands to a known {model, provider}
167 /// pair, or `custom:<json>` for a literal JSON object passed verbatim
168 /// to `agent_loop({step_judge: ...})`. Omit (or pass `none` / `off`) to disable.
169 /// For matrix sweeps across presets, the step-judge experiment driver
170 /// at experiments/step-judge/run.sh invokes the eval runner once per
171 /// preset and aggregates.
172 #[arg(long = "step-judge")]
173 pub step_judge: Option<String>,
174 /// Override the on_veto remediation shape for the step-judge config
175 /// (`replace` or `retain`). Default is `replace`.
176 #[arg(long = "step-judge-on-veto")]
177 pub step_judge_on_veto: Option<String>,
178 /// Use the adversarial rubric variant.
179 #[arg(long = "step-judge-adversarial")]
180 pub step_judge_adversarial: bool,
181 /// Free-form reason attached when forcing a tool format against catalog guidance.
182 #[arg(long = "override-reason")]
183 pub override_reason: Option<String>,
184 /// Structural-validator config applied to every run in this invocation.
185 /// Omit to use the suite default (currently the 4-rule validator).
186 /// Accepts `on` / `default`, `off` / `none`, or `custom:<json>` for a
187 /// literal JSON object passed to `with_structural_validator(...)`.
188 #[arg(long = "structural-validator")]
189 pub structural_validator: Option<String>,
190 /// Free-form label persisted in summary.json for grouping repeat runs
191 /// (e.g. "replicate-1", "probe-judge-arch-gpt"). Defaults to empty.
192 #[arg(long = "run-label", default_value = "")]
193 pub run_label: String,
194 /// Path to a previous coding-agent `summary.json` (or its parent dir).
195 /// When present, the new summary embeds a `baseline_comparison` block
196 /// listing per-fixture regressions (baseline passed but this cell
197 /// failed) and recoveries (baseline failed but this cell passed),
198 /// plus aggregate counts and a net lift in percentage points.
199 /// Useful for cross-cell A/Bs (provider sweep, prompt change, step
200 /// judge on/off) where net pass-rate hides destructive interactions
201 /// like the cli-help-flag regression the step-judge experiment
202 /// surfaced (harn#2318).
203 #[arg(long = "baseline-comparison-against")]
204 pub baseline_comparison_against: Option<PathBuf>,
205}
206
207#[derive(Debug, Args)]
208pub struct EvalToolCallsArgs {
209 #[command(subcommand)]
210 pub command: Option<EvalToolCallsCommand>,
211 /// Dataset directory or JSON file. Directories prefer a `cases/` child.
212 #[arg(long, default_value = "conformance/tool-call-eval")]
213 pub dataset: PathBuf,
214 /// Planner model selector: alias, `provider:model`, or `provider=...,model=...`.
215 #[arg(long)]
216 pub planner: Option<String>,
217 /// Optional binder model selector. When set, a second model canonicalizes
218 /// the planner's response into a call/refusal decision before scoring.
219 #[arg(long)]
220 pub binder: Option<String>,
221 /// Judge model used only for predicate cases.
222 #[arg(long = "judge-model", default_value = "claude-opus-4-7")]
223 pub judge_model: String,
224 /// Output directory for `summary.json` and `per_case.jsonl`.
225 #[arg(long)]
226 pub output: Option<PathBuf>,
227 /// Override tool rendering for the planner (`native` or `text`).
228 #[arg(long = "tool-format")]
229 pub tool_format: Option<String>,
230 /// Maximum planner response tokens.
231 #[arg(long = "max-tokens", default_value_t = 512)]
232 pub max_tokens: i64,
233 /// Maximum binder response tokens. Default is sized to leave room for
234 /// reasoning-emitting models (e.g. GPT-OSS-120B emits ~200 tokens of
235 /// chain-of-thought before the JSON payload); non-reasoning binders
236 /// will under-fill this budget at no extra cost.
237 #[arg(long = "binder-max-tokens", default_value_t = 1024)]
238 pub binder_max_tokens: i64,
239 /// Run only cases whose id or tag contains this string.
240 #[arg(long)]
241 pub filter: Option<String>,
242 /// Stop after N selected cases, useful for smoke runs.
243 #[arg(long = "max-cases")]
244 pub max_cases: Option<usize>,
245 /// Treat missing credentials as an immediate preflight error.
246 #[arg(long = "fail-on-unauthorized")]
247 pub fail_on_unauthorized: bool,
248}
249
250#[derive(Debug, Subcommand)]
251pub enum EvalToolCallsCommand {
252 /// Compare a current summary against a pinned baseline.
253 RegressionCheck(EvalToolCallsRegressionArgs),
254}
255
256#[derive(Debug, Args)]
257pub struct EvalToolCallsRegressionArgs {
258 /// Current run summary. Defaults to `.harn-runs/tool-call-eval/latest/summary.json`.
259 #[arg(long)]
260 pub current: Option<PathBuf>,
261 /// Optional planner label for diagnostics.
262 #[arg(long)]
263 pub planner: Option<String>,
264 /// Baseline summary JSON to compare against.
265 #[arg(long)]
266 pub against: PathBuf,
267 /// Maximum allowed pass-rate drop in percentage points.
268 #[arg(long = "max-drop-pp", default_value_t = 2.0)]
269 pub max_drop_pp: f64,
270}
271
272#[derive(Debug, Args)]
273pub struct EvalPromptArgs {
274 /// Path to a `.harn.prompt` (or `.prompt`) template.
275 pub file: PathBuf,
276 /// Fleet of model selectors (comma-separated, repeatable).
277 /// Each entry is either a model alias (`claude-opus-4-7`) or a
278 /// `provider:model` selector (`ollama:qwen3.5`). Mutually exclusive
279 /// with `--fleet-name`.
280 #[arg(
281 long,
282 value_delimiter = ',',
283 required_unless_present = "fleet_name",
284 conflicts_with = "fleet_name"
285 )]
286 pub fleet: Vec<String>,
287 /// Named fleet from `harn.toml` `[eval.fleets.<name>]`.
288 #[arg(long = "fleet-name")]
289 pub fleet_name: Option<String>,
290 /// JSON file with bindings injected into the template scope.
291 #[arg(long)]
292 pub bindings: Option<PathBuf>,
293 /// Prompt context-quality fixture(s) that score artifact selection,
294 /// stale/noisy rejection, budget adherence, and logical-section shape.
295 #[arg(long = "context-fixture")]
296 pub context_fixture: Vec<PathBuf>,
297 /// Evaluation mode.
298 #[arg(long, value_enum, default_value_t = EvalPromptMode::Render)]
299 pub mode: EvalPromptMode,
300 /// Output format.
301 #[arg(long, value_enum, default_value_t = EvalPromptOutput::Terminal)]
302 pub output: EvalPromptOutput,
303 /// Output destination for HTML / JSON (defaults to stdout).
304 #[arg(long = "out-file", short = 'o')]
305 pub out_file: Option<PathBuf>,
306 /// Maximum concurrent model invocations in run/judge modes.
307 #[arg(long, default_value_t = 4)]
308 pub max_concurrent: usize,
309 /// Optional judge prompt template. When unset, a built-in equivalence
310 /// judge is used.
311 #[arg(long = "judge-template")]
312 pub judge_template: Option<PathBuf>,
313 /// Model used for `--mode judge` evaluation.
314 #[arg(long = "judge-model", default_value = "claude-opus-4-7")]
315 pub judge_model: String,
316 /// Maximum tokens for `--mode run` / `--mode judge` calls.
317 #[arg(long = "max-tokens", default_value_t = 1024)]
318 pub max_tokens: i64,
319 /// Treat unauthenticated providers as errors rather than skipping them.
320 #[arg(long = "fail-on-unauthorized")]
321 pub fail_on_unauthorized: bool,
322}
323
324#[derive(Debug, Clone, Copy, ValueEnum)]
325pub enum EvalPromptMode {
326 /// Render the template against each model's capability profile.
327 Render,
328 /// Render + execute against each model and collect outputs.
329 Run,
330 /// Render + run + LLM-as-judge equivalence scoring.
331 Judge,
332}
333
334#[derive(Debug, Clone, Copy, ValueEnum)]
335pub enum EvalPromptOutput {
336 Terminal,
337 Json,
338 Html,
339}