Skip to main content

harn_cli/commands/
eval_skill_gate.rs

1//! `harn eval skill-gate` - contamination-safe gate reports for skill/guidance candidates.
2
3use std::fs;
4use std::io::Write as _;
5use std::path::{Path, PathBuf};
6
7use harn_vm::orchestration::{
8    evaluate_skill_gate_manifest, load_skill_gate_manifest, SkillGateCaseReport, SkillGateReport,
9    SkillGateVariantReport,
10};
11
12use crate::cli::EvalSkillGateArgs;
13
14pub async fn run(args: EvalSkillGateArgs) -> i32 {
15    let manifest = match load_skill_gate_manifest(&args.manifest) {
16        Ok(manifest) => manifest,
17        Err(error) => {
18            eprintln!("error: {error}");
19            return 1;
20        }
21    };
22    let report = match evaluate_skill_gate_manifest(&manifest) {
23        Ok(report) => report,
24        Err(error) => {
25            eprintln!("error: {error}");
26            return 1;
27        }
28    };
29    let output_dir = args.output.unwrap_or_else(|| default_output_dir(&report));
30    if let Err(error) = fs::create_dir_all(&output_dir) {
31        eprintln!("error: failed to create {}: {error}", output_dir.display());
32        return 1;
33    }
34    if let Err(error) = write_outputs(&output_dir, &report) {
35        eprintln!("error: failed to write skill gate outputs: {error}");
36        return 1;
37    }
38    eprintln!(
39        "wrote {}, {}, {}, and {}",
40        output_dir.join("summary.json").display(),
41        output_dir.join("per_case.jsonl").display(),
42        output_dir.join("receipt.json").display(),
43        output_dir.join("summary.md").display()
44    );
45    if args.json {
46        match serde_json::to_string_pretty(&report) {
47            Ok(payload) => println!("{payload}"),
48            Err(error) => {
49                eprintln!("error: failed to serialize skill gate report: {error}");
50                return 1;
51            }
52        }
53    } else {
54        println!(
55            "skill gate: {} selected={} variants={} included={} excluded={} tamper={}",
56            if report.pass { "PASS" } else { "FAIL" },
57            report.selected_variant_id.as_deref().unwrap_or("none"),
58            report.variants.len(),
59            report.included_task_count,
60            report.excluded_task_count,
61            if report.tamper.pass { "pass" } else { "fail" }
62        );
63    }
64    i32::from(!report.pass)
65}
66
67fn default_output_dir(report: &SkillGateReport) -> PathBuf {
68    Path::new(".harn-runs")
69        .join("skill-gate")
70        .join(&report.manifest_id)
71}
72
73fn write_outputs(output_dir: &Path, report: &SkillGateReport) -> Result<(), String> {
74    write_json(output_dir.join("summary.json"), report)?;
75    write_per_case(output_dir.join("per_case.jsonl"), report)?;
76    write_json(output_dir.join("receipt.json"), &report.receipt)?;
77    fs::write(output_dir.join("summary.md"), render_markdown(report))
78        .map_err(|error| error.to_string())
79}
80
81fn write_json<T: serde::Serialize>(path: PathBuf, value: &T) -> Result<(), String> {
82    let payload = serde_json::to_string_pretty(value).map_err(|error| error.to_string())?;
83    fs::write(path, payload).map_err(|error| error.to_string())
84}
85
86fn write_per_case(path: PathBuf, report: &SkillGateReport) -> Result<(), String> {
87    let mut file = fs::File::create(path).map_err(|error| error.to_string())?;
88    for variant in &report.variants {
89        for case in &variant.cases {
90            let line = serde_json::to_string(&PerCaseLine {
91                variant_id: &variant.id,
92                accepted: variant.accepted,
93                case,
94            })
95            .map_err(|error| error.to_string())?;
96            file.write_all(line.as_bytes())
97                .map_err(|error| error.to_string())?;
98            file.write_all(b"\n").map_err(|error| error.to_string())?;
99        }
100    }
101    Ok(())
102}
103
104#[derive(serde::Serialize)]
105struct PerCaseLine<'a> {
106    variant_id: &'a str,
107    accepted: bool,
108    #[serde(flatten)]
109    case: &'a SkillGateCaseReport,
110}
111
112fn render_markdown(report: &SkillGateReport) -> String {
113    let mut out = String::new();
114    out.push_str(&format!("# Skill Gate: {}\n\n", report.manifest_id));
115    out.push_str(&format!(
116        "- status: {}\n- target model: `{}`\n- selected variant: `{}`\n- included tasks: {}\n- excluded tasks: {}\n- tamper: {}\n- pareto frontier: {}\n\n",
117        if report.pass { "PASS" } else { "FAIL" },
118        escape_md(&report.target_model.id),
119        escape_md(report.selected_variant_id.as_deref().unwrap_or("none")),
120        report.included_task_count,
121        report.excluded_task_count,
122        if report.tamper.pass { "pass" } else { "fail" },
123        if report.pareto_frontier.is_empty() {
124            "none".to_string()
125        } else {
126            report.pareto_frontier.join(", ")
127        }
128    ));
129    out.push_str(
130        "| variant | decision | lift | gap recovery | regressions | context delta | failures |\n",
131    );
132    out.push_str("|---|---|---:|---:|---:|---:|---|\n");
133    for variant in &report.variants {
134        out.push_str(&variant_row(variant));
135    }
136    if !report.task_safety.is_empty() {
137        out.push_str("\n## Held-out Filter\n\n");
138        out.push_str("| task | cluster | included | reason |\n");
139        out.push_str("|---|---|---:|---|\n");
140        for task in &report.task_safety {
141            out.push_str(&format!(
142                "| {} | {} | {} | {} |\n",
143                escape_md(&task.task_id),
144                escape_md(&task.cluster),
145                if task.included { "yes" } else { "no" },
146                escape_md(task.exclusion_reason.as_deref().unwrap_or(""))
147            ));
148        }
149    }
150    if !report.tamper.checks.is_empty() {
151        out.push_str("\n## Immutable Grader Checks\n\n");
152        out.push_str("| path | status | actual sha256 |\n");
153        out.push_str("|---|---|---|\n");
154        for check in &report.tamper.checks {
155            out.push_str(&format!(
156                "| {} | {} | `{}` |\n",
157                escape_md(&check.path),
158                escape_md(&check.status),
159                check.actual_sha256.as_deref().unwrap_or("")
160            ));
161        }
162    }
163    out
164}
165
166fn variant_row(variant: &SkillGateVariantReport) -> String {
167    format!(
168        "| {} | {} | {:.4} | {:.4} | {}/{} | {} | {} |\n",
169        escape_md(&variant.id),
170        if variant.accepted {
171            "accepted"
172        } else {
173            "rejected"
174        },
175        variant.metrics.mean_score_lift,
176        variant.metrics.mean_gap_recovery,
177        variant.metrics.regression_count,
178        variant.metrics.regression_denominator,
179        variant.context.delta_tokens,
180        escape_md(&variant.failures.join("; "))
181    )
182}
183
/// Makes `value` safe to place inside a single Markdown table cell.
///
/// - `|` becomes `\|` so it is not read as a column separator.
/// - Each line break (`\r\n`, `\n`, or a lone `\r`) becomes one space, because
///   a raw line break ends the table row and corrupts every following column.
///
/// All other characters are copied through unchanged, so input without line
/// breaks yields exactly the pipe-escaped text.
fn escape_md(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    let mut chars = value.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '|' => escaped.push_str("\\|"),
            '\r' => {
                // Treat CRLF as one line break, not two.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                escaped.push(' ');
            }
            '\n' => escaped.push(' '),
            other => escaped.push(other),
        }
    }
    escaped
}