1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
//! Clap definitions for `harn eval` and its subcommands.
//!
//! The bare form `harn eval <path>` evaluates a run record, run directory,
//! eval manifest, or `.harn` pipeline (legacy entrypoint, dispatched through
//! `eval_run_record`). The `harn eval prompt <file> --fleet <models>`
//! subcommand renders (and optionally runs / judges) a single
//! `.harn.prompt` template against a fleet of models so authors can compare
//! the wire envelope each capability profile materializes.
use std::path::PathBuf;
use clap::{Args, Subcommand, ValueEnum};
// Top-level arguments for `harn eval`.
//
// Two shapes are accepted: the bare form (positional `path` plus the flags
// below) or a subcommand stored in `command`. The
// `args_conflicts_with_subcommands` setting tells clap to reject command
// lines that mix the two shapes.
//
// NOTE: the `///` comments on the fields are consumed by clap's derive as
// `--help` text, so maintainer notes in this block are deliberately plain
// `//` comments to leave the rendered help unchanged.
#[derive(Debug, Args)]
#[command(args_conflicts_with_subcommands = true)]
pub struct EvalArgs {
// Optional at the parser level: no attribute here enforces the "required
// unless a subcommand is used" rule stated in the help text, so that check
// presumably lives in the dispatch code — TODO confirm.
/// Run record path, run directory, eval manifest path, or `.harn` pipeline.
/// Required unless a subcommand (e.g. `prompt`) is used.
pub path: Option<String>,
/// Optional baseline run record for diffing.
#[arg(long)]
pub compare: Option<String>,
/// Run a pipeline twice and compare the baseline against this structural experiment.
#[arg(long = "structural-experiment")]
pub structural_experiment: Option<String>,
// `--llm-mock` (replay) and `--llm-mock-record` (record) are mutually
// exclusive; the conflict is declared on both sides.
/// Replay LLM responses from a JSONL fixture file when `path` is a `.harn` pipeline.
#[arg(
long = "llm-mock",
value_name = "PATH",
conflicts_with = "llm_mock_record"
)]
pub llm_mock: Option<String>,
/// Record executed LLM responses into a JSONL fixture file when `path` is a `.harn` pipeline.
#[arg(
long = "llm-mock-record",
value_name = "PATH",
conflicts_with = "llm_mock"
)]
pub llm_mock_record: Option<String>,
// `last = true` means these values are only accepted after a literal `--`
// on the command line. Nothing in this struct ties them to
// `--structural-experiment`; that restriction, if enforced, happens in the
// consumer — TODO confirm.
/// Positional arguments forwarded to `harn run <pipeline.harn> -- ...` when
/// `path` is a pipeline file and `--structural-experiment` is set.
#[arg(last = true)]
pub argv: Vec<String>,
// `None` selects the bare `harn eval <path>` form.
#[command(subcommand)]
pub command: Option<EvalCommand>,
}
// Subcommands of `harn eval`. Currently there is a single variant, `Prompt`,
// which carries its own argument struct.
//
// NOTE: the `///` comment on the variant is used by clap as the subcommand's
// help text, so it is left untouched and notes here use plain `//` comments.
#[derive(Debug, Subcommand)]
pub enum EvalCommand {
/// Render and optionally run a `.harn.prompt` across a fleet of models.
Prompt(EvalPromptArgs),
}
// Arguments for the `harn eval prompt` subcommand.
//
// NOTE: the `///` comments on the fields are consumed by clap's derive as
// `--help` text, so maintainer notes in this block are deliberately plain
// `//` comments to leave the rendered help unchanged.
#[derive(Debug, Args)]
pub struct EvalPromptArgs {
// Required positional argument (non-`Option` field without a default).
/// Path to a `.harn.prompt` (or `.prompt`) template.
pub file: PathBuf,
// Exactly one of `--fleet` / `--fleet-name` must be supplied:
// `required_unless_present` makes `--fleet` mandatory when `--fleet-name`
// is absent, and `conflicts_with` rejects passing both. Each occurrence is
// split on `,` and all resulting entries are collected into the vector.
/// Fleet of model selectors (comma-separated, repeatable).
/// Each entry is either a model alias (`claude-opus-4-7`) or a
/// `provider:model` selector (`ollama:qwen3.5`). Mutually exclusive
/// with `--fleet-name`.
#[arg(
long,
value_delimiter = ',',
required_unless_present = "fleet_name",
conflicts_with = "fleet_name"
)]
pub fleet: Vec<String>,
/// Named fleet from `harn.toml` `[eval.fleets.<name>]`.
#[arg(long = "fleet-name")]
pub fleet_name: Option<String>,
/// JSON file with bindings injected into the template scope.
#[arg(long)]
pub bindings: Option<PathBuf>,
// Defaults to `EvalPromptMode::Render` when `--mode` is omitted.
/// Evaluation mode.
#[arg(long, value_enum, default_value_t = EvalPromptMode::Render)]
pub mode: EvalPromptMode,
// Defaults to `EvalPromptOutput::Terminal` when `--output` is omitted.
/// Output format.
#[arg(long, value_enum, default_value_t = EvalPromptOutput::Terminal)]
pub output: EvalPromptOutput,
// Available as both `--out-file` and `-o`.
/// Output destination for HTML / JSON (defaults to stdout).
#[arg(long = "out-file", short = 'o')]
pub out_file: Option<PathBuf>,
// NOTE(review): no lower bound is declared, so `0` parses successfully.
// Whether the consumer tolerates zero concurrency is not visible here —
// TODO confirm.
/// Maximum concurrent model invocations in run/judge modes.
#[arg(long, default_value_t = 4)]
pub max_concurrent: usize,
/// Optional judge prompt template. When unset, a built-in equivalence
/// judge is used.
#[arg(long = "judge-template")]
pub judge_template: Option<PathBuf>,
/// Model used for `--mode judge` evaluation.
#[arg(long = "judge-model", default_value = "claude-opus-4-7")]
pub judge_model: String,
// NOTE(review): the field is a signed `i64` with no range constraint, so
// the type admits zero and negative values. Any validation must happen in
// the consumer — TODO confirm.
/// Maximum tokens for `--mode run` / `--mode judge` calls.
#[arg(long = "max-tokens", default_value_t = 1024)]
pub max_tokens: i64,
// Boolean switch: `false` unless `--fail-on-unauthorized` is passed.
/// Treat unauthenticated providers as errors rather than skipping them.
#[arg(long = "fail-on-unauthorized")]
pub fail_on_unauthorized: bool,
}
// Values accepted by `harn eval prompt --mode`; `Render` is the default
// declared on `EvalPromptArgs::mode`.
//
// Each mode builds on the previous one, per the variant descriptions:
// render only, render + execute, render + execute + judge.
//
// NOTE: the `///` comments on the variants are picked up by clap's
// `ValueEnum` derive as help for each possible value, so they are left
// untouched and notes here use plain `//` comments.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum EvalPromptMode {
/// Render the template against each model's capability profile.
Render,
/// Render + execute against each model and collect outputs.
Run,
/// Render + run + LLM-as-judge equivalence scoring.
Judge,
}
// Values accepted by `harn eval prompt --output`; `Terminal` is the default
// declared on `EvalPromptArgs::output`.
//
// The variants intentionally carry no `///` comments here: adding them would
// change the possible-value help that clap renders, so descriptions are kept
// as plain `//` comments.
#[derive(Debug, Clone, Copy, ValueEnum)]
pub enum EvalPromptOutput {
// Output intended for the terminal (the default format).
Terminal,
// JSON output; per the `--out-file` help it goes to stdout unless a
// destination file is given.
Json,
// HTML output; per the `--out-file` help it goes to stdout unless a
// destination file is given.
Html,
}