Skip to main content

harn_cli/commands/
eval_context.rs

//! `harn eval context` — deterministic context-engineering mode runner.
//!
//! ## .harn dispatch (W6 partial port — see harn#2306)
//!
//! The **evaluation pipeline** (manifest load, `evaluate_context_eval_manifest`
//! invocation, per-run scoring) stays in Rust — it reaches into
//! `harn_vm::orchestration::context_eval` internals (mode runs,
//! projection policies, scoring) that aren't reachable from script-land
//! today without G4 (#2297) exposing the orchestration surface.
//!
//! The **rendering layer** (the markdown body of `summary.md`, the
//! one-line human summary, the `--json` pretty form) is delegated to
//! `crates/harn-stdlib/src/stdlib/cli/eval/context.harn`. The Rust shim
//! pre-serialises the `ContextEvalReport` to JSON, forwards it via
//! [`CONTEXT_REPORT_ENV`], dispatches twice per invocation (markdown for
//! `summary.md`, then either the one-line summary or — when `--json` is
//! set — the pretty JSON for stdout), and writes the captured payloads
//! to disk / real stdout. The artifacts that need byte-identical serde
//! output (`summary.json`, `per_run.jsonl`) stay on the Rust side because
//! Harn's `json_stringify_pretty` sorts dict keys alphabetically and
//! the on-disk format is consumed by the regression-check / hosted
//! ingestion paths that depend on the serde struct-field order.
//!
//! `HARN_CLI_IMPL=rust` keeps the legacy direct-render path for the
//! parity-snapshot harness (#2299) until the C1 ratchet (#2314) lands.
26
27use std::fs;
28use std::io::Write as _;
29use std::path::{Path, PathBuf};
30
31use harn_vm::orchestration::{
32    context_eval_default_output_dir, evaluate_context_eval_manifest, load_context_eval_manifest,
33    ContextEvalReport, ContextEvalRunReport,
34};
35
36use crate::cli::EvalContextArgs;
37use crate::dispatch;
38use crate::env_guard::ScopedEnvVar;
39
/// Name of the environment variable that carries the `ContextEvalReport`,
/// already serialised to JSON, from the Rust shim to the embedded
/// `cli/eval/context` script. All pipeline work happens on the Rust
/// side; the script only has to format the report it is handed.
const CONTEXT_REPORT_ENV: &str = "HARN_EVAL_CONTEXT_REPORT_JSON";
45
/// Name of the environment variable that tells the script which of the
/// three renderings to produce: `"markdown"` (body of `summary.md`),
/// `"summary"` (one-line stdout summary), or `"json"` (the `--json`
/// pretty form).
const CONTEXT_OUTPUT_MODE_ENV: &str = "HARN_EVAL_CONTEXT_OUTPUT_MODE";
50
51/// Serialises the dispatch-render path so concurrent in-process callers
52/// (the existing `eval_context_cli` integration tests, plus any future
53/// fanout caller) don't race on the process-global env vars the Rust
54/// shim sets to hand the report off to the .harn script. The CLI binary
55/// itself is single-call, so this mutex is uncontended in production;
56/// in tests it serialises the dispatch window only — aggregation still
57/// runs freely.
58///
59/// Mirrors the pattern W5's `eval_prompt.rs` uses (see harn#2305) so
60/// the cross-script env-var hand-off stays consistent across the eval
61/// cluster.
62static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());
63
64pub async fn run(args: EvalContextArgs) -> i32 {
65    let report = match aggregate_report(&args) {
66        Ok(report) => report,
67        Err(code) => return code,
68    };
69
70    let output_dir = args.output.unwrap_or_else(context_eval_default_output_dir);
71    if let Err(error) = fs::create_dir_all(&output_dir) {
72        eprintln!("error: failed to create {}: {error}", output_dir.display());
73        return 1;
74    }
75
76    // The JSON artifacts (summary.json, per_run.jsonl) always stay on the
77    // serde-driven Rust path — see module docstring for the byte-format
78    // rationale. They write before any rendering so a render failure
79    // doesn't leave a partially-written report directory.
80    if let Err(error) = write_json_artifacts(&output_dir, &report) {
81        eprintln!("error: failed to write context eval outputs: {error}");
82        return 1;
83    }
84
85    // `HARN_CLI_IMPL=rust` keeps the legacy direct-render path so the
86    // parity-snapshot harness (#2299) can compare both impls byte-for-byte
87    // until C1 (#2314) deletes it.
88    let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
89
90    if use_legacy {
91        if let Err(error) = write_markdown_legacy(&output_dir, &report) {
92            eprintln!("error: failed to write context eval markdown: {error}");
93            return 1;
94        }
95        announce_output_paths(&output_dir);
96        if args.json {
97            if let Err(code) = print_json_legacy(&report) {
98                return code;
99            }
100        } else {
101            print_summary_legacy(&report);
102        }
103        return post_render_exit_code(&report);
104    }
105
106    match write_markdown_dispatch(&output_dir, &report).await {
107        Ok(()) => {}
108        Err(code) => return code,
109    }
110    announce_output_paths(&output_dir);
111    if args.json {
112        if let Err(code) = print_json_dispatch(&report).await {
113            return code;
114        }
115    } else if let Err(code) = print_summary_dispatch(&report).await {
116        return code;
117    }
118    post_render_exit_code(&report)
119}
120
121/// Build the aggregated [`ContextEvalReport`] without any rendering.
122/// Pulled out of [`run`] so both the legacy direct-render path and the
123/// .harn dispatch path see the same report.
124fn aggregate_report(args: &EvalContextArgs) -> Result<ContextEvalReport, i32> {
125    let manifest = match load_context_eval_manifest(&args.manifest) {
126        Ok(manifest) => manifest,
127        Err(error) => {
128            eprintln!("error: {error}");
129            return Err(1);
130        }
131    };
132    let report = match evaluate_context_eval_manifest(&manifest) {
133        Ok(report) => report,
134        Err(error) => {
135            eprintln!("error: {error}");
136            return Err(1);
137        }
138    };
139    Ok(report)
140}
141
142fn post_render_exit_code(report: &ContextEvalReport) -> i32 {
143    i32::from(!report.pass)
144}
145
/// Print a single stderr line naming the three artifact paths
/// (`summary.json`, `per_run.jsonl`, `summary.md`) under `output_dir`.
fn announce_output_paths(output_dir: &Path) {
    let [summary_json, per_run_jsonl, summary_md] =
        ["summary.json", "per_run.jsonl", "summary.md"].map(|name| output_dir.join(name));
    eprintln!(
        "wrote {}, {}, and {}",
        summary_json.display(),
        per_run_jsonl.display(),
        summary_md.display()
    );
}
154
155fn write_json_artifacts(output_dir: &Path, report: &ContextEvalReport) -> Result<(), String> {
156    write_json(output_dir.join("summary.json"), report)?;
157    write_jsonl(output_dir.join("per_run.jsonl"), &report.runs)
158}
159
160fn write_json(path: PathBuf, report: &ContextEvalReport) -> Result<(), String> {
161    let payload = serde_json::to_string_pretty(report).map_err(|error| error.to_string())?;
162    fs::write(path, payload).map_err(|error| error.to_string())
163}
164
165fn write_jsonl(path: PathBuf, runs: &[ContextEvalRunReport]) -> Result<(), String> {
166    let mut file = fs::File::create(path).map_err(|error| error.to_string())?;
167    for run in runs {
168        let line = serde_json::to_string(run).map_err(|error| error.to_string())?;
169        file.write_all(line.as_bytes())
170            .map_err(|error| error.to_string())?;
171        file.write_all(b"\n").map_err(|error| error.to_string())?;
172    }
173    Ok(())
174}
175
// ─── Legacy direct-render path (gated by HARN_CLI_IMPL=rust) ────────────
177
178fn write_markdown_legacy(output_dir: &Path, report: &ContextEvalReport) -> Result<(), String> {
179    fs::write(
180        output_dir.join("summary.md"),
181        legacy_render_markdown(report),
182    )
183    .map_err(|error| error.to_string())
184}
185
186fn print_json_legacy(report: &ContextEvalReport) -> Result<(), i32> {
187    match serde_json::to_string_pretty(report) {
188        Ok(payload) => {
189            println!("{payload}");
190            Ok(())
191        }
192        Err(error) => {
193            eprintln!("error: failed to serialize context eval summary: {error}");
194            Err(1)
195        }
196    }
197}
198
199fn print_summary_legacy(report: &ContextEvalReport) {
200    println!(
201        "context eval: {}/{} passed, mean_correctness={:.2}, mean_tool_quality={:.2}",
202        report.passed_runs,
203        report.total_runs,
204        report.aggregate.mean_final_correctness,
205        report.aggregate.mean_tool_call_quality
206    );
207}
208
209fn legacy_render_markdown(report: &ContextEvalReport) -> String {
210    let mut out = String::new();
211    out.push_str(&format!(
212        "# Context Eval: {}\n\n",
213        report
214            .manifest_name
215            .as_deref()
216            .unwrap_or(report.manifest_id.as_str())
217    ));
218    out.push_str(&format!(
219        "- status: {}\n- runs: {}/{} passed\n- mean correctness: {:.4}\n- mean tool quality: {:.4}\n- input tokens: {}\n- output tokens: {}\n- cost USD: {:.6}\n\n",
220        if report.pass { "PASS" } else { "FAIL" },
221        report.passed_runs,
222        report.total_runs,
223        report.aggregate.mean_final_correctness,
224        report.aggregate.mean_tool_call_quality,
225        report.aggregate.total_input_tokens,
226        report.aggregate.total_output_tokens,
227        report.aggregate.total_cost_usd,
228    ));
229    out.push_str("| task | mode | pass | correctness | tools | reads before edit | input tokens | compactions | cache key |\n");
230    out.push_str("|---|---|---:|---:|---:|---:|---:|---:|---|\n");
231    for run in &report.runs {
232        out.push_str(&format!(
233            "| {} | {} | {} | {:.4} | {:.4} | {} | {} | {} | `{}` |\n",
234            escape_md(&run.task_id),
235            escape_md(&run.mode_id),
236            if run.passed { "yes" } else { "no" },
237            run.final_correctness.score,
238            run.tool_call_quality.score,
239            run.reads_before_first_edit,
240            run.input_tokens,
241            run.compaction_count,
242            run.cache.key,
243        ));
244    }
245    out
246}
247
/// Escape a value for use inside a markdown table cell by prefixing
/// every `|` with a backslash. No other character is altered.
fn escape_md(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch == '|' {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
251
// ─── Dispatch (.harn) render path ────────────────────────────────────────
253
254async fn write_markdown_dispatch(output_dir: &Path, report: &ContextEvalReport) -> Result<(), i32> {
255    let payload = render_via_dispatch(report, "markdown").await?;
256    if let Err(error) = fs::write(output_dir.join("summary.md"), payload) {
257        eprintln!("error: failed to write context eval markdown: {error}");
258        return Err(1);
259    }
260    Ok(())
261}
262
263async fn print_summary_dispatch(report: &ContextEvalReport) -> Result<(), i32> {
264    let payload = render_via_dispatch(report, "summary").await?;
265    print!("{payload}");
266    // The script emits exactly the legacy summary line (no trailing
267    // newline); add one to match the legacy `println!` semantics.
268    if !payload.ends_with('\n') {
269        println!();
270    }
271    Ok(())
272}
273
274async fn print_json_dispatch(report: &ContextEvalReport) -> Result<(), i32> {
275    let payload = render_via_dispatch(report, "json").await?;
276    print!("{payload}");
277    if !payload.ends_with('\n') {
278        println!();
279    }
280    Ok(())
281}
282
283/// Dispatch to the embedded `cli/eval/context.harn` script for one of
284/// the three rendering modes (markdown / summary / json). Returns the
285/// captured stdout on success, or a propagated exit code on failure.
286///
287/// **Concurrency.** Held under [`DISPATCH_RENDER_LOCK`] so concurrent
288/// in-process callers don't race on the global env vars the Rust shim
289/// sets to hand the report to the script. See the lock's docstring for
290/// the trade-off rationale.
291async fn render_via_dispatch(report: &ContextEvalReport, mode: &str) -> Result<String, i32> {
292    let report_json = match serde_json::to_string(report) {
293        Ok(json) => json,
294        Err(error) => {
295            eprintln!("error: failed to serialise ContextEvalReport for dispatch: {error}");
296            return Err(1);
297        }
298    };
299
300    let _guard = DISPATCH_RENDER_LOCK.lock().await;
301    let _report = ScopedEnvVar::set(CONTEXT_REPORT_ENV, &report_json);
302    let _mode = ScopedEnvVar::set(CONTEXT_OUTPUT_MODE_ENV, mode);
303
304    let outcome = dispatch::run_embedded_script("eval/context", Vec::new(), false).await;
305    if !outcome.stderr.is_empty() {
306        use std::io::Write as _;
307        let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
308    }
309    if outcome.exit_code != 0 {
310        return Err(outcome.exit_code);
311    }
312    Ok(outcome.stdout)
313}