Skip to main content

harn_cli/cli/
eval.rs

//! Clap definitions for `harn eval` and its subcommands.
//!
//! The bare form `harn eval <path>` evaluates a run record, run directory,
//! eval manifest, or `.harn` pipeline (legacy entrypoint, dispatched through
//! `eval_run_record`). The `harn eval prompt <file> --fleet <models>`
//! subcommand renders (and optionally runs / judges) a single
//! `.harn.prompt` template against a fleet of models so authors can compare
//! the wire envelope each capability profile materializes. The
//! `harn eval tool-calls` subcommand runs tool-call accuracy, latency, and
//! cost evals over a dataset; its nested `regression-check` subcommand
//! compares a run summary against a pinned baseline.
9
10use std::path::PathBuf;
11
12use clap::{Args, Subcommand, ValueEnum};
13
14#[derive(Debug, Args)]
15#[command(args_conflicts_with_subcommands = true)]
16pub struct EvalArgs {
17    /// Run record path, run directory, eval manifest path, or `.harn` pipeline.
18    /// Required unless a subcommand (e.g. `prompt`) is used.
19    pub path: Option<String>,
20    /// Optional baseline run record for diffing.
21    #[arg(long)]
22    pub compare: Option<String>,
23    /// Run a pipeline twice and compare the baseline against this structural experiment.
24    #[arg(long = "structural-experiment")]
25    pub structural_experiment: Option<String>,
26    /// Replay LLM responses from a JSONL fixture file when `path` is a `.harn` pipeline.
27    #[arg(
28        long = "llm-mock",
29        value_name = "PATH",
30        conflicts_with = "llm_mock_record"
31    )]
32    pub llm_mock: Option<String>,
33    /// Record executed LLM responses into a JSONL fixture file when `path` is a `.harn` pipeline.
34    #[arg(
35        long = "llm-mock-record",
36        value_name = "PATH",
37        conflicts_with = "llm_mock"
38    )]
39    pub llm_mock_record: Option<String>,
40    /// Positional arguments forwarded to `harn run <pipeline.harn> -- ...` when
41    /// `path` is a pipeline file and `--structural-experiment` is set.
42    #[arg(last = true)]
43    pub argv: Vec<String>,
44    #[command(subcommand)]
45    pub command: Option<EvalCommand>,
46}
47
48#[derive(Debug, Subcommand)]
49pub enum EvalCommand {
50    /// Render and optionally run a `.harn.prompt` across a fleet of models.
51    Prompt(EvalPromptArgs),
52    /// Run tool-call accuracy, latency, and cost evals over a dataset.
53    ToolCalls(EvalToolCallsArgs),
54}
55
56#[derive(Debug, Args)]
57pub struct EvalToolCallsArgs {
58    #[command(subcommand)]
59    pub command: Option<EvalToolCallsCommand>,
60    /// Dataset directory or JSON file. Directories prefer a `cases/` child.
61    #[arg(long, default_value = "conformance/tool-call-eval")]
62    pub dataset: PathBuf,
63    /// Planner model selector: alias, `provider:model`, or `provider=...,model=...`.
64    #[arg(long)]
65    pub planner: Option<String>,
66    /// Optional binder model selector. When set, a second model canonicalizes
67    /// the planner's response into a call/refusal decision before scoring.
68    #[arg(long)]
69    pub binder: Option<String>,
70    /// Judge model used only for predicate cases.
71    #[arg(long = "judge-model", default_value = "claude-opus-4-7")]
72    pub judge_model: String,
73    /// Output directory for `summary.json` and `per_case.jsonl`.
74    #[arg(long)]
75    pub output: Option<PathBuf>,
76    /// Override tool rendering for the planner (`native` or `text`).
77    #[arg(long = "tool-format")]
78    pub tool_format: Option<String>,
79    /// Maximum planner response tokens.
80    #[arg(long = "max-tokens", default_value_t = 512)]
81    pub max_tokens: i64,
82    /// Maximum binder response tokens. Default is sized to leave room for
83    /// reasoning-emitting models (e.g. GPT-OSS-120B emits ~200 tokens of
84    /// chain-of-thought before the JSON payload); non-reasoning binders
85    /// will under-fill this budget at no extra cost.
86    #[arg(long = "binder-max-tokens", default_value_t = 1024)]
87    pub binder_max_tokens: i64,
88    /// Run only cases whose id or tag contains this string.
89    #[arg(long)]
90    pub filter: Option<String>,
91    /// Stop after N selected cases, useful for smoke runs.
92    #[arg(long = "max-cases")]
93    pub max_cases: Option<usize>,
94    /// Treat missing credentials as an immediate preflight error.
95    #[arg(long = "fail-on-unauthorized")]
96    pub fail_on_unauthorized: bool,
97}
98
99#[derive(Debug, Subcommand)]
100pub enum EvalToolCallsCommand {
101    /// Compare a current summary against a pinned baseline.
102    RegressionCheck(EvalToolCallsRegressionArgs),
103}
104
105#[derive(Debug, Args)]
106pub struct EvalToolCallsRegressionArgs {
107    /// Current run summary. Defaults to `.harn-runs/tool-call-eval/latest/summary.json`.
108    #[arg(long)]
109    pub current: Option<PathBuf>,
110    /// Optional planner label for diagnostics.
111    #[arg(long)]
112    pub planner: Option<String>,
113    /// Baseline summary JSON to compare against.
114    #[arg(long)]
115    pub against: PathBuf,
116    /// Maximum allowed pass-rate drop in percentage points.
117    #[arg(long = "max-drop-pp", default_value_t = 2.0)]
118    pub max_drop_pp: f64,
119}
120
121#[derive(Debug, Args)]
122pub struct EvalPromptArgs {
123    /// Path to a `.harn.prompt` (or `.prompt`) template.
124    pub file: PathBuf,
125    /// Fleet of model selectors (comma-separated, repeatable).
126    /// Each entry is either a model alias (`claude-opus-4-7`) or a
127    /// `provider:model` selector (`ollama:qwen3.5`). Mutually exclusive
128    /// with `--fleet-name`.
129    #[arg(
130        long,
131        value_delimiter = ',',
132        required_unless_present = "fleet_name",
133        conflicts_with = "fleet_name"
134    )]
135    pub fleet: Vec<String>,
136    /// Named fleet from `harn.toml` `[eval.fleets.<name>]`.
137    #[arg(long = "fleet-name")]
138    pub fleet_name: Option<String>,
139    /// JSON file with bindings injected into the template scope.
140    #[arg(long)]
141    pub bindings: Option<PathBuf>,
142    /// Prompt context-quality fixture(s) that score artifact selection,
143    /// stale/noisy rejection, budget adherence, and logical-section shape.
144    #[arg(long = "context-fixture")]
145    pub context_fixture: Vec<PathBuf>,
146    /// Evaluation mode.
147    #[arg(long, value_enum, default_value_t = EvalPromptMode::Render)]
148    pub mode: EvalPromptMode,
149    /// Output format.
150    #[arg(long, value_enum, default_value_t = EvalPromptOutput::Terminal)]
151    pub output: EvalPromptOutput,
152    /// Output destination for HTML / JSON (defaults to stdout).
153    #[arg(long = "out-file", short = 'o')]
154    pub out_file: Option<PathBuf>,
155    /// Maximum concurrent model invocations in run/judge modes.
156    #[arg(long, default_value_t = 4)]
157    pub max_concurrent: usize,
158    /// Optional judge prompt template. When unset, a built-in equivalence
159    /// judge is used.
160    #[arg(long = "judge-template")]
161    pub judge_template: Option<PathBuf>,
162    /// Model used for `--mode judge` evaluation.
163    #[arg(long = "judge-model", default_value = "claude-opus-4-7")]
164    pub judge_model: String,
165    /// Maximum tokens for `--mode run` / `--mode judge` calls.
166    #[arg(long = "max-tokens", default_value_t = 1024)]
167    pub max_tokens: i64,
168    /// Treat unauthenticated providers as errors rather than skipping them.
169    #[arg(long = "fail-on-unauthorized")]
170    pub fail_on_unauthorized: bool,
171}
172
173#[derive(Debug, Clone, Copy, ValueEnum)]
174pub enum EvalPromptMode {
175    /// Render the template against each model's capability profile.
176    Render,
177    /// Render + execute against each model and collect outputs.
178    Run,
179    /// Render + run + LLM-as-judge equivalence scoring.
180    Judge,
181}
182
183#[derive(Debug, Clone, Copy, ValueEnum)]
184pub enum EvalPromptOutput {
185    Terminal,
186    Json,
187    Html,
188}