harn_cli/commands/
eval_skill_gate.rs1use std::fs;
4use std::io::Write as _;
5use std::path::{Path, PathBuf};
6
7use harn_vm::orchestration::{
8 evaluate_skill_gate_manifest, load_skill_gate_manifest, SkillGateCaseReport, SkillGateReport,
9 SkillGateVariantReport,
10};
11
12use crate::cli::EvalSkillGateArgs;
13
14pub async fn run(args: EvalSkillGateArgs) -> i32 {
15 let manifest = match load_skill_gate_manifest(&args.manifest) {
16 Ok(manifest) => manifest,
17 Err(error) => {
18 eprintln!("error: {error}");
19 return 1;
20 }
21 };
22 let report = match evaluate_skill_gate_manifest(&manifest) {
23 Ok(report) => report,
24 Err(error) => {
25 eprintln!("error: {error}");
26 return 1;
27 }
28 };
29 let output_dir = args.output.unwrap_or_else(|| default_output_dir(&report));
30 if let Err(error) = fs::create_dir_all(&output_dir) {
31 eprintln!("error: failed to create {}: {error}", output_dir.display());
32 return 1;
33 }
34 if let Err(error) = write_outputs(&output_dir, &report) {
35 eprintln!("error: failed to write skill gate outputs: {error}");
36 return 1;
37 }
38 eprintln!(
39 "wrote {}, {}, {}, and {}",
40 output_dir.join("summary.json").display(),
41 output_dir.join("per_case.jsonl").display(),
42 output_dir.join("receipt.json").display(),
43 output_dir.join("summary.md").display()
44 );
45 if args.json {
46 match serde_json::to_string_pretty(&report) {
47 Ok(payload) => println!("{payload}"),
48 Err(error) => {
49 eprintln!("error: failed to serialize skill gate report: {error}");
50 return 1;
51 }
52 }
53 } else {
54 println!(
55 "skill gate: {} selected={} variants={} included={} excluded={} tamper={}",
56 if report.pass { "PASS" } else { "FAIL" },
57 report.selected_variant_id.as_deref().unwrap_or("none"),
58 report.variants.len(),
59 report.included_task_count,
60 report.excluded_task_count,
61 if report.tamper.pass { "pass" } else { "fail" }
62 );
63 }
64 i32::from(!report.pass)
65}
66
67fn default_output_dir(report: &SkillGateReport) -> PathBuf {
68 Path::new(".harn-runs")
69 .join("skill-gate")
70 .join(&report.manifest_id)
71}
72
73fn write_outputs(output_dir: &Path, report: &SkillGateReport) -> Result<(), String> {
74 write_json(output_dir.join("summary.json"), report)?;
75 write_per_case(output_dir.join("per_case.jsonl"), report)?;
76 write_json(output_dir.join("receipt.json"), &report.receipt)?;
77 fs::write(output_dir.join("summary.md"), render_markdown(report))
78 .map_err(|error| error.to_string())
79}
80
81fn write_json<T: serde::Serialize>(path: PathBuf, value: &T) -> Result<(), String> {
82 let payload = serde_json::to_string_pretty(value).map_err(|error| error.to_string())?;
83 fs::write(path, payload).map_err(|error| error.to_string())
84}
85
86fn write_per_case(path: PathBuf, report: &SkillGateReport) -> Result<(), String> {
87 let mut file = fs::File::create(path).map_err(|error| error.to_string())?;
88 for variant in &report.variants {
89 for case in &variant.cases {
90 let line = serde_json::to_string(&PerCaseLine {
91 variant_id: &variant.id,
92 accepted: variant.accepted,
93 case,
94 })
95 .map_err(|error| error.to_string())?;
96 file.write_all(line.as_bytes())
97 .map_err(|error| error.to_string())?;
98 file.write_all(b"\n").map_err(|error| error.to_string())?;
99 }
100 }
101 Ok(())
102}
103
104#[derive(serde::Serialize)]
105struct PerCaseLine<'a> {
106 variant_id: &'a str,
107 accepted: bool,
108 #[serde(flatten)]
109 case: &'a SkillGateCaseReport,
110}
111
112fn render_markdown(report: &SkillGateReport) -> String {
113 let mut out = String::new();
114 out.push_str(&format!("# Skill Gate: {}\n\n", report.manifest_id));
115 out.push_str(&format!(
116 "- status: {}\n- target model: `{}`\n- selected variant: `{}`\n- included tasks: {}\n- excluded tasks: {}\n- tamper: {}\n- pareto frontier: {}\n\n",
117 if report.pass { "PASS" } else { "FAIL" },
118 escape_md(&report.target_model.id),
119 escape_md(report.selected_variant_id.as_deref().unwrap_or("none")),
120 report.included_task_count,
121 report.excluded_task_count,
122 if report.tamper.pass { "pass" } else { "fail" },
123 if report.pareto_frontier.is_empty() {
124 "none".to_string()
125 } else {
126 report.pareto_frontier.join(", ")
127 }
128 ));
129 out.push_str(
130 "| variant | decision | lift | gap recovery | regressions | context delta | failures |\n",
131 );
132 out.push_str("|---|---|---:|---:|---:|---:|---|\n");
133 for variant in &report.variants {
134 out.push_str(&variant_row(variant));
135 }
136 if !report.task_safety.is_empty() {
137 out.push_str("\n## Held-out Filter\n\n");
138 out.push_str("| task | cluster | included | reason |\n");
139 out.push_str("|---|---|---:|---|\n");
140 for task in &report.task_safety {
141 out.push_str(&format!(
142 "| {} | {} | {} | {} |\n",
143 escape_md(&task.task_id),
144 escape_md(&task.cluster),
145 if task.included { "yes" } else { "no" },
146 escape_md(task.exclusion_reason.as_deref().unwrap_or(""))
147 ));
148 }
149 }
150 if !report.tamper.checks.is_empty() {
151 out.push_str("\n## Immutable Grader Checks\n\n");
152 out.push_str("| path | status | actual sha256 |\n");
153 out.push_str("|---|---|---|\n");
154 for check in &report.tamper.checks {
155 out.push_str(&format!(
156 "| {} | {} | `{}` |\n",
157 escape_md(&check.path),
158 escape_md(&check.status),
159 check.actual_sha256.as_deref().unwrap_or("")
160 ));
161 }
162 }
163 out
164}
165
166fn variant_row(variant: &SkillGateVariantReport) -> String {
167 format!(
168 "| {} | {} | {:.4} | {:.4} | {}/{} | {} | {} |\n",
169 escape_md(&variant.id),
170 if variant.accepted {
171 "accepted"
172 } else {
173 "rejected"
174 },
175 variant.metrics.mean_score_lift,
176 variant.metrics.mean_gap_recovery,
177 variant.metrics.regression_count,
178 variant.metrics.regression_denominator,
179 variant.context.delta_tokens,
180 escape_md(&variant.failures.join("; "))
181 )
182}
183
/// Makes `value` safe to embed in a single Markdown table cell.
///
/// * `|` is backslash-escaped so it is not read as a column separator.
/// * Line breaks (`\r\n`, `\n`, or a lone `\r`) are each collapsed to one
///   space, because a raw newline would end the table row.
///
/// All other characters are copied through unchanged.
fn escape_md(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    let mut chars = value.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\r' => {
                // Treat CRLF as a single line break.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                escaped.push(' ');
            }
            '\n' => escaped.push(' '),
            other => escaped.push(other),
        }
    }
    escaped
}