Skip to main content

harn_cli/cli/
eval.rs

1//! Clap definitions for `harn eval` and its subcommands.
2//!
3//! The bare form `harn eval <path>` evaluates a run record, run directory,
4//! eval manifest, or `.harn` pipeline (legacy entrypoint, dispatched through
5//! `eval_run_record`). The `harn eval prompt <file> --fleet <models>`
6//! subcommand renders (and optionally runs / judges) a single
7//! `.harn.prompt` template against a fleet of models so authors can compare
8//! the wire envelope each capability profile materializes.
9
10use std::path::PathBuf;
11
12use clap::{Args, Subcommand, ValueEnum};
13
14#[derive(Debug, Args)]
15#[command(args_conflicts_with_subcommands = true)]
16pub struct EvalArgs {
17    /// Run record path, run directory, eval manifest path, or `.harn` pipeline.
18    /// Required unless a subcommand (e.g. `prompt`) is used.
19    pub path: Option<String>,
20    /// Optional baseline run record for diffing.
21    #[arg(long)]
22    pub compare: Option<String>,
23    /// Run a pipeline twice and compare the baseline against this structural experiment.
24    #[arg(long = "structural-experiment")]
25    pub structural_experiment: Option<String>,
26    /// Replay LLM responses from a JSONL fixture file when `path` is a `.harn` pipeline.
27    #[arg(
28        long = "llm-mock",
29        value_name = "PATH",
30        conflicts_with = "llm_mock_record"
31    )]
32    pub llm_mock: Option<String>,
33    /// Record executed LLM responses into a JSONL fixture file when `path` is a `.harn` pipeline.
34    #[arg(
35        long = "llm-mock-record",
36        value_name = "PATH",
37        conflicts_with = "llm_mock"
38    )]
39    pub llm_mock_record: Option<String>,
40    /// Positional arguments forwarded to `harn run <pipeline.harn> -- ...` when
41    /// `path` is a pipeline file and `--structural-experiment` is set.
42    #[arg(last = true)]
43    pub argv: Vec<String>,
44    #[command(subcommand)]
45    pub command: Option<EvalCommand>,
46}
47
48#[derive(Debug, Subcommand)]
49pub enum EvalCommand {
50    /// Benchmark coding-agent fixtures across providers and tool formats.
51    CodingAgent(EvalCodingAgentArgs),
52    /// Run deterministic context-engineering modes over task fixtures.
53    Context(EvalContextArgs),
54    /// Render and optionally run a `.harn.prompt` across a fleet of models.
55    Prompt(EvalPromptArgs),
56    /// Measure pre-turn scope-triage savings and false-positive rates.
57    #[command(name = "scope_triage", visible_alias = "scope-triage")]
58    ScopeTriage(EvalScopeTriageArgs),
59    /// Run tool-call accuracy, latency, and cost evals over a dataset.
60    ToolCalls(EvalToolCallsArgs),
61}
62
63#[derive(Debug, Args)]
64pub struct EvalContextArgs {
65    /// Context eval manifest JSON or TOML.
66    pub manifest: PathBuf,
67    /// Output directory for summary.json, per_run.jsonl, and summary.md.
68    #[arg(long)]
69    pub output: Option<PathBuf>,
70    /// Print the aggregate summary JSON to stdout.
71    #[arg(long)]
72    pub json: bool,
73}
74
75#[derive(Debug, Args)]
76pub struct EvalScopeTriageArgs {
77    /// Scope-triage dataset JSON.
78    #[arg(long, default_value = "evals/scope_triage/dataset.json")]
79    pub dataset: PathBuf,
80    /// Output directory for summary.json, per_case.jsonl, and summary.md.
81    #[arg(long)]
82    pub output: Option<PathBuf>,
83    /// Print the aggregate summary JSON to stdout.
84    #[arg(long)]
85    pub json: bool,
86    /// Run the live default classifier model instead of the deterministic reference classifier.
87    #[arg(long)]
88    pub live: bool,
89    /// Live classifier model selector.
90    #[arg(long, default_value = "ollama:qwen3:1.7b")]
91    pub model: String,
92    /// Confidence threshold below which labels become escalate.
93    #[arg(long = "confidence-threshold", default_value_t = 0.65)]
94    pub confidence_threshold: f64,
95    /// Stop after N cases, useful for smoke runs.
96    #[arg(long = "max-cases")]
97    pub max_cases: Option<usize>,
98}
99
100#[derive(Debug, Args)]
101pub struct EvalCodingAgentArgs {
102    /// Fixture ids to run (comma-separated, repeatable). Use `all` for the full suite.
103    #[arg(long = "fixture", value_delimiter = ',', default_value = "all")]
104    pub fixtures: Vec<String>,
105    /// Model selectors to run (comma-separated, repeatable). Each entry may be
106    /// an alias, `provider:model`, or `provider=...,model=...`.
107    #[arg(long = "model", value_delimiter = ',', default_value = "mock:mock")]
108    pub models: Vec<String>,
109    /// Tool-call rendering modes to compare.
110    #[arg(
111        long = "tool-format",
112        value_delimiter = ',',
113        default_value = "native,text"
114    )]
115    pub tool_formats: Vec<String>,
116    /// Output directory for summary.json, per_run.jsonl, transcripts, and markdown reports.
117    #[arg(long)]
118    pub output: Option<PathBuf>,
119    /// Optional .env file(s) to load for provider credentials. Values are never written to artifacts.
120    #[arg(long = "env-file")]
121    pub env_files: Vec<PathBuf>,
122    /// Append reachable local Ollama/llama.cpp/MLX/vLLM models to the selected matrix.
123    #[arg(long = "include-local")]
124    pub include_local: bool,
125    /// Restrict local discovery to one provider id. Repeatable.
126    #[arg(long = "local-provider")]
127    pub local_providers: Vec<String>,
128    /// Maximum discovered local models to append.
129    #[arg(long = "max-local-models", default_value_t = 2)]
130    pub max_local_models: usize,
131    /// Leave newly-loaded Ollama models running after each local benchmark run.
132    #[arg(long = "keep-local-after-run")]
133    pub keep_local_after_run: bool,
134    /// Stop after N matrix entries, useful for cost-capped smoke runs.
135    #[arg(long = "max-runs")]
136    pub max_runs: Option<usize>,
137    /// Maximum repair-agent loop iterations per run.
138    #[arg(long = "max-iterations", default_value_t = 8)]
139    pub max_iterations: usize,
140    /// Python executable used by the fixture and verification command.
141    #[arg(long, default_value = "python3")]
142    pub python: String,
143    /// Treat missing credentials as an error instead of skipping the run.
144    #[arg(long = "fail-on-unauthorized")]
145    pub fail_on_unauthorized: bool,
146    /// Print the aggregate summary JSON to stdout.
147    #[arg(long)]
148    pub json: bool,
149    /// Optional step_judge config applied to every run in this invocation.
150    /// Accepts a preset name (`symmetric-cheap`, `asymmetric`,
151    /// `symmetric-strong`) which expands to a known {model, provider}
152    /// pair, or `custom:<json>` for a literal JSON object passed verbatim
153    /// to `agent_loop({step_judge: ...})`. Omit (or pass `none` / `off`) to disable.
154    /// For matrix sweeps across presets, the step-judge experiment driver
155    /// at experiments/step-judge/run.sh invokes the eval runner once per
156    /// preset and aggregates.
157    #[arg(long = "step-judge")]
158    pub step_judge: Option<String>,
159    /// Override the on_veto remediation shape for the step-judge config
160    /// (`replace` or `retain`). Default is `replace`.
161    #[arg(long = "step-judge-on-veto")]
162    pub step_judge_on_veto: Option<String>,
163    /// Use the adversarial rubric variant.
164    #[arg(long = "step-judge-adversarial")]
165    pub step_judge_adversarial: bool,
166    /// Free-form reason attached when forcing a tool format against catalog guidance.
167    #[arg(long = "override-reason")]
168    pub override_reason: Option<String>,
169    /// Structural-validator config applied to every run in this invocation.
170    /// Omit to use the suite default (currently the 4-rule validator).
171    /// Accepts `on` / `default`, `off` / `none`, or `custom:<json>` for a
172    /// literal JSON object passed to `with_structural_validator(...)`.
173    #[arg(long = "structural-validator")]
174    pub structural_validator: Option<String>,
175    /// Free-form label persisted in summary.json for grouping repeat runs
176    /// (e.g. "replicate-1", "probe-judge-arch-gpt"). Defaults to empty.
177    #[arg(long = "run-label", default_value = "")]
178    pub run_label: String,
179    /// Path to a previous coding-agent `summary.json` (or its parent dir).
180    /// When present, the new summary embeds a `baseline_comparison` block
181    /// listing per-fixture regressions (baseline passed but this cell
182    /// failed) and recoveries (baseline failed but this cell passed),
183    /// plus aggregate counts and a net lift in percentage points.
184    /// Useful for cross-cell A/Bs (provider sweep, prompt change, step
185    /// judge on/off) where net pass-rate hides destructive interactions
186    /// like the cli-help-flag regression the step-judge experiment
187    /// surfaced (harn#2318).
188    #[arg(long = "baseline-comparison-against")]
189    pub baseline_comparison_against: Option<PathBuf>,
190}
191
192#[derive(Debug, Args)]
193pub struct EvalToolCallsArgs {
194    #[command(subcommand)]
195    pub command: Option<EvalToolCallsCommand>,
196    /// Dataset directory or JSON file. Directories prefer a `cases/` child.
197    #[arg(long, default_value = "conformance/tool-call-eval")]
198    pub dataset: PathBuf,
199    /// Planner model selector: alias, `provider:model`, or `provider=...,model=...`.
200    #[arg(long)]
201    pub planner: Option<String>,
202    /// Optional binder model selector. When set, a second model canonicalizes
203    /// the planner's response into a call/refusal decision before scoring.
204    #[arg(long)]
205    pub binder: Option<String>,
206    /// Judge model used only for predicate cases.
207    #[arg(long = "judge-model", default_value = "claude-opus-4-7")]
208    pub judge_model: String,
209    /// Output directory for `summary.json` and `per_case.jsonl`.
210    #[arg(long)]
211    pub output: Option<PathBuf>,
212    /// Override tool rendering for the planner (`native` or `text`).
213    #[arg(long = "tool-format")]
214    pub tool_format: Option<String>,
215    /// Maximum planner response tokens.
216    #[arg(long = "max-tokens", default_value_t = 512)]
217    pub max_tokens: i64,
218    /// Maximum binder response tokens. Default is sized to leave room for
219    /// reasoning-emitting models (e.g. GPT-OSS-120B emits ~200 tokens of
220    /// chain-of-thought before the JSON payload); non-reasoning binders
221    /// will under-fill this budget at no extra cost.
222    #[arg(long = "binder-max-tokens", default_value_t = 1024)]
223    pub binder_max_tokens: i64,
224    /// Run only cases whose id or tag contains this string.
225    #[arg(long)]
226    pub filter: Option<String>,
227    /// Stop after N selected cases, useful for smoke runs.
228    #[arg(long = "max-cases")]
229    pub max_cases: Option<usize>,
230    /// Treat missing credentials as an immediate preflight error.
231    #[arg(long = "fail-on-unauthorized")]
232    pub fail_on_unauthorized: bool,
233}
234
235#[derive(Debug, Subcommand)]
236pub enum EvalToolCallsCommand {
237    /// Compare a current summary against a pinned baseline.
238    RegressionCheck(EvalToolCallsRegressionArgs),
239}
240
241#[derive(Debug, Args)]
242pub struct EvalToolCallsRegressionArgs {
243    /// Current run summary. Defaults to `.harn-runs/tool-call-eval/latest/summary.json`.
244    #[arg(long)]
245    pub current: Option<PathBuf>,
246    /// Optional planner label for diagnostics.
247    #[arg(long)]
248    pub planner: Option<String>,
249    /// Baseline summary JSON to compare against.
250    #[arg(long)]
251    pub against: PathBuf,
252    /// Maximum allowed pass-rate drop in percentage points.
253    #[arg(long = "max-drop-pp", default_value_t = 2.0)]
254    pub max_drop_pp: f64,
255}
256
257#[derive(Debug, Args)]
258pub struct EvalPromptArgs {
259    /// Path to a `.harn.prompt` (or `.prompt`) template.
260    pub file: PathBuf,
261    /// Fleet of model selectors (comma-separated, repeatable).
262    /// Each entry is either a model alias (`claude-opus-4-7`) or a
263    /// `provider:model` selector (`ollama:qwen3.5`). Mutually exclusive
264    /// with `--fleet-name`.
265    #[arg(
266        long,
267        value_delimiter = ',',
268        required_unless_present = "fleet_name",
269        conflicts_with = "fleet_name"
270    )]
271    pub fleet: Vec<String>,
272    /// Named fleet from `harn.toml` `[eval.fleets.<name>]`.
273    #[arg(long = "fleet-name")]
274    pub fleet_name: Option<String>,
275    /// JSON file with bindings injected into the template scope.
276    #[arg(long)]
277    pub bindings: Option<PathBuf>,
278    /// Prompt context-quality fixture(s) that score artifact selection,
279    /// stale/noisy rejection, budget adherence, and logical-section shape.
280    #[arg(long = "context-fixture")]
281    pub context_fixture: Vec<PathBuf>,
282    /// Evaluation mode.
283    #[arg(long, value_enum, default_value_t = EvalPromptMode::Render)]
284    pub mode: EvalPromptMode,
285    /// Output format.
286    #[arg(long, value_enum, default_value_t = EvalPromptOutput::Terminal)]
287    pub output: EvalPromptOutput,
288    /// Output destination for HTML / JSON (defaults to stdout).
289    #[arg(long = "out-file", short = 'o')]
290    pub out_file: Option<PathBuf>,
291    /// Maximum concurrent model invocations in run/judge modes.
292    #[arg(long, default_value_t = 4)]
293    pub max_concurrent: usize,
294    /// Optional judge prompt template. When unset, a built-in equivalence
295    /// judge is used.
296    #[arg(long = "judge-template")]
297    pub judge_template: Option<PathBuf>,
298    /// Model used for `--mode judge` evaluation.
299    #[arg(long = "judge-model", default_value = "claude-opus-4-7")]
300    pub judge_model: String,
301    /// Maximum tokens for `--mode run` / `--mode judge` calls.
302    #[arg(long = "max-tokens", default_value_t = 1024)]
303    pub max_tokens: i64,
304    /// Treat unauthenticated providers as errors rather than skipping them.
305    #[arg(long = "fail-on-unauthorized")]
306    pub fail_on_unauthorized: bool,
307}
308
309#[derive(Debug, Clone, Copy, ValueEnum)]
310pub enum EvalPromptMode {
311    /// Render the template against each model's capability profile.
312    Render,
313    /// Render + execute against each model and collect outputs.
314    Run,
315    /// Render + run + LLM-as-judge equivalence scoring.
316    Judge,
317}
318
319#[derive(Debug, Clone, Copy, ValueEnum)]
320pub enum EvalPromptOutput {
321    Terminal,
322    Json,
323    Html,
324}