1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
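//! Clap definitions for `harn eval` and its subcommands.
//!
//! The bare form `harn eval <path>` evaluates a run record, run directory,
//! eval manifest, or `.harn` pipeline (legacy entrypoint, dispatched through
//! `eval_run_record`). The `harn eval prompt <file> --fleet <models>`
//! subcommand renders (and optionally runs / judges) a single
//! `.harn.prompt` template against a fleet of models so authors can compare
//! the wire envelope each capability profile materializes.
//!
//! The remaining subcommands defined here are `coding-agent` (coding-agent
//! fixture benchmarks across providers and tool formats), `context`
//! (deterministic context-engineering modes over task fixtures), and
//! `tool-calls` (tool-call accuracy, latency, and cost evals over a dataset).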
use std::path::PathBuf;
use clap::{Args, Subcommand, ValueEnum};
// Top-level arguments for `harn eval`.
//
// Maintainer notes in this block use plain `//` comments on purpose: clap's
// derive macros turn `///` doc comments into `--help` text, so anything
// written as `///` becomes user-facing output.
//
// `args_conflicts_with_subcommands = true` tells clap to reject command lines
// that mix the bare-form arguments below with one of the subcommands.
#[derive(Debug, Args)]
#[command(args_conflicts_with_subcommands = true)]
pub struct EvalArgs {
// Positional and optional at the parser level. Nothing in this struct makes
// clap enforce the "required unless a subcommand is used" rule from the help
// text — presumably the dispatcher checks it; TODO confirm.
/// Run record path, run directory, eval manifest path, or `.harn` pipeline.
/// Required unless a subcommand (e.g. `prompt`) is used.
pub path: Option<String>,
/// Optional baseline run record for diffing.
#[arg(long)]
pub compare: Option<String>,
/// Run a pipeline twice and compare the baseline against this structural experiment.
#[arg(long = "structural-experiment")]
pub structural_experiment: Option<String>,
// `--llm-mock` (replay) and `--llm-mock-record` (record) are mutually
// exclusive; each `conflicts_with` names the other field's argument id.
/// Replay LLM responses from a JSONL fixture file when `path` is a `.harn` pipeline.
#[arg(
long = "llm-mock",
value_name = "PATH",
conflicts_with = "llm_mock_record"
)]
pub llm_mock: Option<String>,
/// Record executed LLM responses into a JSONL fixture file when `path` is a `.harn` pipeline.
#[arg(
long = "llm-mock-record",
value_name = "PATH",
conflicts_with = "llm_mock"
)]
pub llm_mock_record: Option<String>,
// `last = true`: these values are only accepted after a literal `--` on the
// command line.
/// Positional arguments forwarded to `harn run <pipeline.harn> -- ...` when
/// `path` is a pipeline file and `--structural-experiment` is set.
#[arg(last = true)]
pub argv: Vec<String>,
// `None` when no subcommand was given on the command line.
#[command(subcommand)]
pub command: Option<EvalCommand>,
}
// Subcommands of `harn eval`.
//
// Each variant wraps the argument struct parsed for that subcommand. With
// clap's default naming the variants are exposed on the command line in
// kebab-case: `coding-agent`, `context`, `prompt`, and `tool-calls`.
// The `///` line on each variant is the subcommand's `--help` summary, so it
// is user-facing text rather than an internal comment.
#[derive(Debug, Subcommand)]
pub enum EvalCommand {
/// Benchmark coding-agent fixtures across providers and tool formats.
CodingAgent(EvalCodingAgentArgs),
/// Run deterministic context-engineering modes over task fixtures.
Context(EvalContextArgs),
/// Render and optionally run a `.harn.prompt` across a fleet of models.
Prompt(EvalPromptArgs),
/// Run tool-call accuracy, latency, and cost evals over a dataset.
ToolCalls(EvalToolCallsArgs),
}
// Arguments for the `context` subcommand of `harn eval`.
//
// `///` comments below are rendered by clap as `--help` text; maintainer notes
// are therefore written as plain `//` comments.
#[derive(Debug, Args)]
pub struct EvalContextArgs {
// Required positional argument: the field has no `#[arg]` attribute and is
// not wrapped in `Option`.
/// Context eval manifest JSON or TOML.
pub manifest: PathBuf,
// `--output <DIR>`; `None` when the flag is omitted (any fallback location is
// chosen by the caller, not here).
/// Output directory for summary.json, per_run.jsonl, and summary.md.
#[arg(long)]
pub output: Option<PathBuf>,
// Boolean switch `--json`; `false` unless the flag is present.
/// Print the aggregate summary JSON to stdout.
#[arg(long)]
pub json: bool,
}
// Arguments for the `coding-agent` subcommand of `harn eval`.
//
// `///` comments below are rendered by clap as `--help` text; maintainer notes
// are therefore written as plain `//` comments.
//
// `--fixture`, `--model`, and `--tool-format` are list-valued: each occurrence
// is split on `,` (`value_delimiter`) and the flag may be repeated.
#[derive(Debug, Args)]
pub struct EvalCodingAgentArgs {
/// Fixture ids to run (comma-separated, repeatable). Use `all` for the full suite.
#[arg(long = "fixture", value_delimiter = ',', default_value = "all")]
pub fixtures: Vec<String>,
// Selectors are kept as raw strings; the three accepted spellings listed in
// the help text are not parsed or validated in this file.
/// Model selectors to run (comma-separated, repeatable). Each entry may be
/// an alias, `provider:model`, or `provider=...,model=...`.
#[arg(long = "model", value_delimiter = ',', default_value = "mock:mock")]
pub models: Vec<String>,
/// Tool-call rendering modes to compare.
#[arg(
long = "tool-format",
value_delimiter = ',',
default_value = "native,text"
)]
pub tool_formats: Vec<String>,
/// Output directory for summary.json, per_run.jsonl, transcripts, and markdown reports.
#[arg(long)]
pub output: Option<PathBuf>,
// No `value_delimiter` here: pass `--env-file` once per file.
/// Optional .env file(s) to load for provider credentials. Values are never written to artifacts.
#[arg(long = "env-file")]
pub env_files: Vec<PathBuf>,
/// Append reachable local Ollama/llama.cpp/MLX/vLLM models to the selected matrix.
#[arg(long = "include-local")]
pub include_local: bool,
/// Restrict local discovery to one provider id. Repeatable.
#[arg(long = "local-provider")]
pub local_providers: Vec<String>,
/// Maximum discovered local models to append.
#[arg(long = "max-local-models", default_value_t = 2)]
pub max_local_models: usize,
/// Leave newly-loaded Ollama models running after each local benchmark run.
#[arg(long = "keep-local-after-run")]
pub keep_local_after_run: bool,
// `None` means no cap was requested on the command line.
/// Stop after N matrix entries, useful for cost-capped smoke runs.
#[arg(long = "max-runs")]
pub max_runs: Option<usize>,
/// Maximum repair-agent loop iterations per run.
#[arg(long = "max-iterations", default_value_t = 8)]
pub max_iterations: usize,
/// Python executable used by the fixture and verification command.
#[arg(long, default_value = "python3")]
pub python: String,
/// Treat missing credentials as an error instead of skipping the run.
#[arg(long = "fail-on-unauthorized")]
pub fail_on_unauthorized: bool,
/// Print the aggregate summary JSON to stdout.
#[arg(long)]
pub json: bool,
// Stored as an unparsed string: preset names, `custom:<json>`, and the
// `none` / `off` spellings are not validated by clap, so interpretation
// happens in code outside this file.
/// Optional step_judge config applied to every run in this invocation.
/// Accepts a preset name (`symmetric-cheap`, `asymmetric`,
/// `symmetric-strong`) which expands to a known {model, provider}
/// pair, or `custom:<json>` for a literal JSON object passed verbatim
/// to `agent_loop({step_judge: ...})`. Omit (or pass `none` / `off`) to disable.
/// For matrix sweeps across presets, the step-judge experiment driver
/// at experiments/step-judge/run.sh invokes the eval runner once per
/// preset and aggregates.
#[arg(long = "step-judge")]
pub step_judge: Option<String>,
// Free-form at the parser level: the `replace` / `retain` vocabulary and the
// documented `replace` default are not enforced or applied by clap here.
/// Override the on_veto remediation shape for the step-judge config
/// (`replace` or `retain`). Default is `replace`.
#[arg(long = "step-judge-on-veto")]
pub step_judge_on_veto: Option<String>,
/// Use the adversarial rubric variant.
#[arg(long = "step-judge-adversarial")]
pub step_judge_adversarial: bool,
/// Free-form reason attached when forcing a tool format against catalog guidance.
#[arg(long = "override-reason")]
pub override_reason: Option<String>,
// Same pattern as `--step-judge`: an unvalidated string whose accepted
// spellings are interpreted downstream.
/// Structural-validator config applied to every run in this invocation.
/// Omit to use the suite default (currently the 4-rule validator).
/// Accepts `on` / `default`, `off` / `none`, or `custom:<json>` for a
/// literal JSON object passed to `with_structural_validator(...)`.
#[arg(long = "structural-validator")]
pub structural_validator: Option<String>,
// Plain `String` with an empty default rather than `Option<String>`, so an
// omitted flag and an explicitly empty label are indistinguishable.
/// Free-form label persisted in summary.json for grouping repeat runs
/// (e.g. "replicate-1", "probe-judge-arch-gpt"). Defaults to empty.
#[arg(long = "run-label", default_value = "")]
pub run_label: String,
/// Path to a previous coding-agent `summary.json` (or its parent dir).
/// When present, the new summary embeds a `baseline_comparison` block
/// listing per-fixture regressions (baseline passed but this cell
/// failed) and recoveries (baseline failed but this cell passed),
/// plus aggregate counts and a net lift in percentage points.
/// Useful for cross-cell A/Bs (provider sweep, prompt change, step
/// judge on/off) where net pass-rate hides destructive interactions
/// like the cli-help-flag regression the step-judge experiment
/// surfaced (harn#2318).
#[arg(long = "baseline-comparison-against")]
pub baseline_comparison_against: Option<PathBuf>,
}
// Arguments for the `tool-calls` subcommand of `harn eval`.
//
// `///` comments below are rendered by clap as `--help` text; maintainer notes
// are therefore written as plain `//` comments.
//
// This struct carries both the eval-run options and an optional nested
// subcommand. It does not set `args_conflicts_with_subcommands`, so the parser
// does not reject the options below when the nested subcommand is also given.
// NOTE(review): confirm whether they are meant to be ignored in that case.
#[derive(Debug, Args)]
pub struct EvalToolCallsArgs {
// `None` when no nested subcommand was given.
#[command(subcommand)]
pub command: Option<EvalToolCallsCommand>,
/// Dataset directory or JSON file. Directories prefer a `cases/` child.
#[arg(long, default_value = "conformance/tool-call-eval")]
pub dataset: PathBuf,
// Optional at the parser level; whether a planner is mandatory for an eval
// run is decided outside this file.
/// Planner model selector: alias, `provider:model`, or `provider=...,model=...`.
#[arg(long)]
pub planner: Option<String>,
/// Optional binder model selector. When set, a second model canonicalizes
/// the planner's response into a call/refusal decision before scoring.
#[arg(long)]
pub binder: Option<String>,
/// Judge model used only for predicate cases.
#[arg(long = "judge-model", default_value = "claude-opus-4-7")]
pub judge_model: String,
/// Output directory for `summary.json` and `per_case.jsonl`.
#[arg(long)]
pub output: Option<PathBuf>,
// Unvalidated string: the `native` / `text` vocabulary is not enforced by
// clap here.
/// Override tool rendering for the planner (`native` or `text`).
#[arg(long = "tool-format")]
pub tool_format: Option<String>,
// `i64` does not by itself rule out zero or negative budgets.
// NOTE(review): confirm the runner validates this value.
/// Maximum planner response tokens.
#[arg(long = "max-tokens", default_value_t = 512)]
pub max_tokens: i64,
// Same signed type as `max_tokens`; the same validation caveat applies.
/// Maximum binder response tokens. Default is sized to leave room for
/// reasoning-emitting models (e.g. GPT-OSS-120B emits ~200 tokens of
/// chain-of-thought before the JSON payload); non-reasoning binders
/// will under-fill this budget at no extra cost.
#[arg(long = "binder-max-tokens", default_value_t = 1024)]
pub binder_max_tokens: i64,
/// Run only cases whose id or tag contains this string.
#[arg(long)]
pub filter: Option<String>,
/// Stop after N selected cases, useful for smoke runs.
#[arg(long = "max-cases")]
pub max_cases: Option<usize>,
/// Treat missing credentials as an immediate preflight error.
#[arg(long = "fail-on-unauthorized")]
pub fail_on_unauthorized: bool,
}
// Nested subcommands of `harn eval tool-calls`.
//
// With clap's default naming the single variant is exposed as
// `regression-check`. Its `///` line is the subcommand's `--help` summary.
#[derive(Debug, Subcommand)]
pub enum EvalToolCallsCommand {
/// Compare a current summary against a pinned baseline.
RegressionCheck(EvalToolCallsRegressionArgs),
}
// Arguments for `harn eval tool-calls regression-check`.
//
// `///` comments below are rendered by clap as `--help` text; maintainer notes
// are therefore written as plain `//` comments.
#[derive(Debug, Args)]
pub struct EvalToolCallsRegressionArgs {
// The default path named in the help text is not applied by clap (the field
// is a bare `Option`); presumably the caller substitutes it — TODO confirm.
/// Current run summary. Defaults to `.harn-runs/tool-call-eval/latest/summary.json`.
#[arg(long)]
pub current: Option<PathBuf>,
/// Optional planner label for diagnostics.
#[arg(long)]
pub planner: Option<String>,
// Required option: a non-`Option` field with no default, so clap rejects the
// command line when `--against` is missing.
/// Baseline summary JSON to compare against.
#[arg(long)]
pub against: PathBuf,
// Threshold expressed in percentage points, per the help text.
/// Maximum allowed pass-rate drop in percentage points.
#[arg(long = "max-drop-pp", default_value_t = 2.0)]
pub max_drop_pp: f64,
}
// Arguments for the `prompt` subcommand of `harn eval`.
//
// `///` comments below are rendered by clap as `--help` text; maintainer notes
// are therefore written as plain `//` comments.
#[derive(Debug, Args)]
pub struct EvalPromptArgs {
// Required positional argument.
/// Path to a `.harn.prompt` (or `.prompt`) template.
pub file: PathBuf,
// `required_unless_present` plus `conflicts_with` together mean exactly one
// of `--fleet` / `--fleet-name` must be supplied. Values are split on `,`
// and the flag may be repeated.
/// Fleet of model selectors (comma-separated, repeatable).
/// Each entry is either a model alias (`claude-opus-4-7`) or a
/// `provider:model` selector (`ollama:qwen3.5`). Mutually exclusive
/// with `--fleet-name`.
#[arg(
long,
value_delimiter = ',',
required_unless_present = "fleet_name",
conflicts_with = "fleet_name"
)]
pub fleet: Vec<String>,
// Only the name is captured here; the lookup in `harn.toml` happens outside
// this file.
/// Named fleet from `harn.toml` `[eval.fleets.<name>]`.
#[arg(long = "fleet-name")]
pub fleet_name: Option<String>,
/// JSON file with bindings injected into the template scope.
#[arg(long)]
pub bindings: Option<PathBuf>,
// Repeatable (`Vec`), without a value delimiter: one path per occurrence.
/// Prompt context-quality fixture(s) that score artifact selection,
/// stale/noisy rejection, budget adherence, and logical-section shape.
#[arg(long = "context-fixture")]
pub context_fixture: Vec<PathBuf>,
// Parsed through the `ValueEnum` impl of `EvalPromptMode`; defaults to
// `Render` when `--mode` is omitted.
/// Evaluation mode.
#[arg(long, value_enum, default_value_t = EvalPromptMode::Render)]
pub mode: EvalPromptMode,
// Parsed through the `ValueEnum` impl of `EvalPromptOutput`; defaults to
// `Terminal` when `--output` is omitted.
/// Output format.
#[arg(long, value_enum, default_value_t = EvalPromptOutput::Terminal)]
pub output: EvalPromptOutput,
// Available as both `--out-file` and `-o`.
/// Output destination for HTML / JSON (defaults to stdout).
#[arg(long = "out-file", short = 'o')]
pub out_file: Option<PathBuf>,
// Bare `long` derives the flag name from the field: `--max-concurrent`.
// The parser does not reject `0`. NOTE(review): confirm the runner treats a
// zero limit sensibly.
/// Maximum concurrent model invocations in run/judge modes.
#[arg(long, default_value_t = 4)]
pub max_concurrent: usize,
/// Optional judge prompt template. When unset, a built-in equivalence
/// judge is used.
#[arg(long = "judge-template")]
pub judge_template: Option<PathBuf>,
/// Model used for `--mode judge` evaluation.
#[arg(long = "judge-model", default_value = "claude-opus-4-7")]
pub judge_model: String,
// `i64` does not by itself rule out zero or negative budgets.
// NOTE(review): confirm downstream validation.
/// Maximum tokens for `--mode run` / `--mode judge` calls.
#[arg(long = "max-tokens", default_value_t = 1024)]
pub max_tokens: i64,
/// Treat unauthenticated providers as errors rather than skipping them.
#[arg(long = "fail-on-unauthorized")]
pub fail_on_unauthorized: bool,
}
// Evaluation depth selected with `--mode` on `harn eval prompt`.
//
// clap exposes the variants as `render`, `run`, and `judge`; the `///` line on
// each variant is shown next to that value in `--help`, so it is user-facing
// text and is kept verbatim.
//
// `PartialEq` / `Eq` are derived so callers can compare the parsed mode with
// `==` (and use it in `assert_eq!`) instead of needing `matches!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub enum EvalPromptMode {
    /// Render the template against each model's capability profile.
    Render,
    /// Render + execute against each model and collect outputs.
    Run,
    /// Render + run + LLM-as-judge equivalence scoring.
    Judge,
}
// Report format selected with `--output` on `harn eval prompt`.
//
// clap exposes the variants as `terminal`, `json`, and `html`. The variants
// carry no `///` docs, so `--help` lists only the bare value names; notes are
// kept as plain `//` comments to leave that listing unchanged.
//
// `PartialEq` / `Eq` are derived so callers can compare the parsed format with
// `==` (and use it in `assert_eq!`) instead of needing `matches!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
pub enum EvalPromptOutput {
    Terminal,
    Json,
    Html,
}