harn_cli/cli/eval.rs
1//! Clap definitions for `harn eval` and its subcommands.
2//!
3//! The bare form `harn eval <path>` evaluates a run record, run directory,
4//! eval manifest, or `.harn` pipeline (legacy entrypoint, dispatched through
5//! `eval_run_record`). The `harn eval prompt <file> --fleet <models>`
6//! subcommand renders (and optionally runs / judges) a single
7//! `.harn.prompt` template against a fleet of models so authors can compare
8//! the wire envelope each capability profile materializes.
9
10use std::path::PathBuf;
11
12use clap::{Args, Subcommand, ValueEnum};
13
// Top-level argument surface for `harn eval`.
// `args_conflicts_with_subcommands` makes the bare form (`harn eval <path>`)
// and the subcommand form (`harn eval prompt ...`) mutually exclusive: clap
// rejects mixing the flags below with a subcommand invocation.
#[derive(Debug, Args)]
#[command(args_conflicts_with_subcommands = true)]
pub struct EvalArgs {
    /// Run record path, run directory, eval manifest path, or `.harn` pipeline.
    /// Required unless a subcommand (e.g. `prompt`) is used.
    // NOTE(review): the "required unless subcommand" rule cannot be expressed
    // to clap here — the positional must stay Option so the subcommand form
    // parses. Presumably the handler errors when both `path` and `command`
    // are None; confirm downstream.
    pub path: Option<String>,
    /// Optional baseline run record for diffing.
    #[arg(long)]
    pub compare: Option<String>,
    /// Run a pipeline twice and compare the baseline against this structural experiment.
    #[arg(long = "structural-experiment")]
    pub structural_experiment: Option<String>,
    /// Replay LLM responses from a JSONL fixture file when `path` is a `.harn` pipeline.
    // Replay and record are symmetric `conflicts_with` declarations; one side
    // would suffice for clap, but declaring both keeps each field
    // self-documenting.
    #[arg(
        long = "llm-mock",
        value_name = "PATH",
        conflicts_with = "llm_mock_record"
    )]
    pub llm_mock: Option<String>,
    /// Record executed LLM responses into a JSONL fixture file when `path` is a `.harn` pipeline.
    #[arg(
        long = "llm-mock-record",
        value_name = "PATH",
        conflicts_with = "llm_mock"
    )]
    pub llm_mock_record: Option<String>,
    /// Positional arguments forwarded to `harn run <pipeline.harn> -- ...` when
    /// `path` is a pipeline file and `--structural-experiment` is set.
    // `last = true`: these are only captured after a literal `--` separator.
    #[arg(last = true)]
    pub argv: Vec<String>,
    #[command(subcommand)]
    pub command: Option<EvalCommand>,
}
47
// Subcommands of `harn eval`. Clap derives kebab-case command names from the
// variant identifiers: `prompt` and `tool-calls`.
#[derive(Debug, Subcommand)]
pub enum EvalCommand {
    /// Render and optionally run a `.harn.prompt` across a fleet of models.
    Prompt(EvalPromptArgs),
    /// Run tool-call accuracy, latency, and cost evals over a dataset.
    ToolCalls(EvalToolCallsArgs),
}
55
56#[derive(Debug, Args)]
57pub struct EvalToolCallsArgs {
58 #[command(subcommand)]
59 pub command: Option<EvalToolCallsCommand>,
60 /// Dataset directory or JSON file. Directories prefer a `cases/` child.
61 #[arg(long, default_value = "conformance/tool-call-eval")]
62 pub dataset: PathBuf,
63 /// Planner model selector: alias, `provider:model`, or `provider=...,model=...`.
64 #[arg(long)]
65 pub planner: Option<String>,
66 /// Optional binder model selector. When set, a second model canonicalizes
67 /// the planner's response into a call/refusal decision before scoring.
68 #[arg(long)]
69 pub binder: Option<String>,
70 /// Judge model used only for predicate cases.
71 #[arg(long = "judge-model", default_value = "claude-opus-4-7")]
72 pub judge_model: String,
73 /// Output directory for `summary.json` and `per_case.jsonl`.
74 #[arg(long)]
75 pub output: Option<PathBuf>,
76 /// Override tool rendering for the planner (`native` or `text`).
77 #[arg(long = "tool-format")]
78 pub tool_format: Option<String>,
79 /// Maximum planner response tokens.
80 #[arg(long = "max-tokens", default_value_t = 512)]
81 pub max_tokens: i64,
82 /// Maximum binder response tokens.
83 #[arg(long = "binder-max-tokens", default_value_t = 256)]
84 pub binder_max_tokens: i64,
85 /// Run only cases whose id or tag contains this string.
86 #[arg(long)]
87 pub filter: Option<String>,
88 /// Stop after N selected cases, useful for smoke runs.
89 #[arg(long = "max-cases")]
90 pub max_cases: Option<usize>,
91 /// Treat missing credentials as an immediate preflight error.
92 #[arg(long = "fail-on-unauthorized")]
93 pub fail_on_unauthorized: bool,
94}
95
// Subcommands of `harn eval tool-calls`; the variant maps to the
// `regression-check` CLI name.
#[derive(Debug, Subcommand)]
pub enum EvalToolCallsCommand {
    /// Compare a current summary against a pinned baseline.
    RegressionCheck(EvalToolCallsRegressionArgs),
}
101
// Arguments for `harn eval tool-calls regression-check`. Only `--against`
// is mandatory; `--current` has a documented fallback resolved by the handler.
#[derive(Debug, Args)]
pub struct EvalToolCallsRegressionArgs {
    /// Current run summary. Defaults to `.harn-runs/tool-call-eval/latest/summary.json`.
    // The default lives in the handler, not here, so `None` vs explicit path
    // stays distinguishable.
    #[arg(long)]
    pub current: Option<PathBuf>,
    /// Optional planner label for diagnostics.
    #[arg(long)]
    pub planner: Option<String>,
    /// Baseline summary JSON to compare against.
    #[arg(long)]
    pub against: PathBuf,
    /// Maximum allowed pass-rate drop in percentage points.
    // NOTE(review): negative values are accepted; possibly deliberate (a
    // negative budget would demand improvement over baseline) — confirm
    // before adding a range validator.
    #[arg(long = "max-drop-pp", default_value_t = 2.0)]
    pub max_drop_pp: f64,
}
117
118#[derive(Debug, Args)]
119pub struct EvalPromptArgs {
120 /// Path to a `.harn.prompt` (or `.prompt`) template.
121 pub file: PathBuf,
122 /// Fleet of model selectors (comma-separated, repeatable).
123 /// Each entry is either a model alias (`claude-opus-4-7`) or a
124 /// `provider:model` selector (`ollama:qwen3.5`). Mutually exclusive
125 /// with `--fleet-name`.
126 #[arg(
127 long,
128 value_delimiter = ',',
129 required_unless_present = "fleet_name",
130 conflicts_with = "fleet_name"
131 )]
132 pub fleet: Vec<String>,
133 /// Named fleet from `harn.toml` `[eval.fleets.<name>]`.
134 #[arg(long = "fleet-name")]
135 pub fleet_name: Option<String>,
136 /// JSON file with bindings injected into the template scope.
137 #[arg(long)]
138 pub bindings: Option<PathBuf>,
139 /// Prompt context-quality fixture(s) that score artifact selection,
140 /// stale/noisy rejection, budget adherence, and logical-section shape.
141 #[arg(long = "context-fixture")]
142 pub context_fixture: Vec<PathBuf>,
143 /// Evaluation mode.
144 #[arg(long, value_enum, default_value_t = EvalPromptMode::Render)]
145 pub mode: EvalPromptMode,
146 /// Output format.
147 #[arg(long, value_enum, default_value_t = EvalPromptOutput::Terminal)]
148 pub output: EvalPromptOutput,
149 /// Output destination for HTML / JSON (defaults to stdout).
150 #[arg(long = "out-file", short = 'o')]
151 pub out_file: Option<PathBuf>,
152 /// Maximum concurrent model invocations in run/judge modes.
153 #[arg(long, default_value_t = 4)]
154 pub max_concurrent: usize,
155 /// Optional judge prompt template. When unset, a built-in equivalence
156 /// judge is used.
157 #[arg(long = "judge-template")]
158 pub judge_template: Option<PathBuf>,
159 /// Model used for `--mode judge` evaluation.
160 #[arg(long = "judge-model", default_value = "claude-opus-4-7")]
161 pub judge_model: String,
162 /// Maximum tokens for `--mode run` / `--mode judge` calls.
163 #[arg(long = "max-tokens", default_value_t = 1024)]
164 pub max_tokens: i64,
165 /// Treat unauthenticated providers as errors rather than skipping them.
166 #[arg(long = "fail-on-unauthorized")]
167 pub fail_on_unauthorized: bool,
168}
169
// `--mode` values for `harn eval prompt`. ValueEnum derives the CLI names
// `render`, `run`, `judge`; each mode is a superset of the previous one's
// work, per the variant docs below.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum EvalPromptMode {
    /// Render the template against each model's capability profile.
    Render,
    /// Render + execute against each model and collect outputs.
    Run,
    /// Render + run + LLM-as-judge equivalence scoring.
    Judge,
}
179
// `--output` format values for `harn eval prompt`; CLI names are
// `terminal`, `json`, `html`. Html/Json honor `--out-file`.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum EvalPromptOutput {
    Terminal,
    Json,
    Html,
}
185}