Skip to main content

harn_cli/commands/
eval_context.rs

1//! `harn eval context` — deterministic context-engineering mode runner.
2//!
3//! ## .harn dispatch (W6 partial port — see harn#2306)
4//!
5//! The **evaluation pipeline** (manifest load, `evaluate_context_eval_manifest`
6//! invocation, per-run scoring) stays in Rust — it reaches into
7//! `harn_vm::orchestration::context_eval` internals (mode runs,
8//! projection policies, scoring) that aren't reachable from script-land
9//! today without G4 (#2297) exposing the orchestration surface.
10//!
11//! The **rendering layer** (the markdown body of `summary.md`, the
12//! one-line human summary, the `--json` pretty form) is delegated to
13//! `crates/harn-stdlib/src/stdlib/cli/eval/context.harn`. The Rust shim
//! pre-serialises the `ContextEvalReport` to JSON, forwards it via
//! [`CONTEXT_REPORT_ENV`], dispatches twice per invocation (markdown
//! for `summary.md`, then either the one-line summary or, when
//! `--json` is set, the pretty JSON form for stdout), and writes the
//! captured payloads to disk / real stdout. The artifacts that need
//! byte-identical serde output
19//! (`summary.json`, `per_run.jsonl`) stay on the Rust side because
20//! Harn's `json_stringify_pretty` sorts dict keys alphabetically and
21//! the on-disk format is consumed by the regression-check / hosted
22//! ingestion paths that depend on the serde struct-field order.
23//!
24//! `HARN_CLI_IMPL=rust` keeps the legacy direct-render path for the
25//! parity-snapshot harness (#2299) until the C1 ratchet (#2314) lands.
26
27use std::fs;
28use std::io::Write as _;
29use std::path::{Path, PathBuf};
30
31use harn_vm::orchestration::{
32    context_eval_default_output_dir, evaluate_context_eval_manifest, load_context_eval_manifest,
33    ContextEvalReport, ContextEvalRunReport,
34};
35
36use crate::cli::EvalContextArgs;
37use crate::dispatch;
38use crate::env_guard::ScopedEnvVar;
39
/// Env var the embedded `cli/eval/context` script reads to pick up the
/// pre-serialised `ContextEvalReport`. The Rust shim does all the
/// pipeline work and hands the script the assembled report so it only
/// has to format it.
///
/// `render_via_dispatch` sets this to the compact `serde_json::to_string`
/// form of the report immediately before each script dispatch.
const CONTEXT_REPORT_ENV: &str = "HARN_EVAL_CONTEXT_REPORT_JSON";
45
/// Env var the script reads to select between the three rendering
/// modes (`"markdown"` for `summary.md`, `"summary"` for the one-line
/// stdout summary, `"json"` for the `--json` pretty form).
///
/// `render_via_dispatch` sets this from its `mode` argument; the callers
/// in this file pass exactly those three literals.
const CONTEXT_OUTPUT_MODE_ENV: &str = "HARN_EVAL_CONTEXT_OUTPUT_MODE";
50
/// Serialises the dispatch-render path so concurrent in-process callers
/// (the existing `eval_context_cli` integration tests, plus any future
/// fanout caller) don't race on the process-global env vars the Rust
/// shim sets to hand the report off to the .harn script. The CLI binary
/// itself is single-call, so this mutex is uncontended in production;
/// in tests it serialises the dispatch window only — aggregation still
/// runs freely.
///
/// Mirrors the pattern W5's `eval_prompt.rs` uses (see harn#2305) so
/// the cross-script env-var hand-off stays consistent across the eval
/// cluster.
///
/// The lock is acquired only inside `render_via_dispatch`, and its guard
/// is held across the `.await` on the script dispatch, which is why this
/// is an async (`tokio::sync`) mutex.
static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());
63
64pub async fn run(args: EvalContextArgs) -> i32 {
65    let report = match aggregate_report(&args) {
66        Ok(report) => report,
67        Err(code) => return code,
68    };
69
70    let output_dir = args.output.unwrap_or_else(context_eval_default_output_dir);
71    if let Err(error) = fs::create_dir_all(&output_dir) {
72        eprintln!("error: failed to create {}: {error}", output_dir.display());
73        return 1;
74    }
75
76    // The JSON artifacts (summary.json, per_run.jsonl) always stay on the
77    // serde-driven Rust path — see module docstring for the byte-format
78    // rationale. They write before any rendering so a render failure
79    // doesn't leave a partially-written report directory.
80    if let Err(error) = write_json_artifacts(&output_dir, &report) {
81        eprintln!("error: failed to write context eval outputs: {error}");
82        return 1;
83    }
84
85    // `HARN_CLI_IMPL=rust` keeps the legacy direct-render path so the
86    // parity-snapshot harness (#2299) can compare both impls byte-for-byte
87    // until C1 (#2314) deletes it.
88    let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
89
90    if use_legacy {
91        if let Err(error) = write_markdown_legacy(&output_dir, &report) {
92            eprintln!("error: failed to write context eval markdown: {error}");
93            return 1;
94        }
95        announce_output_paths(&output_dir);
96        if args.json {
97            if let Err(code) = print_json_legacy(&report) {
98                return code;
99            }
100        } else {
101            print_summary_legacy(&report);
102        }
103        return post_render_exit_code(&report);
104    }
105
106    match write_markdown_dispatch(&output_dir, &report).await {
107        Ok(()) => {}
108        Err(code) => return code,
109    }
110    announce_output_paths(&output_dir);
111    if args.json {
112        if let Err(code) = print_json_dispatch(&report).await {
113            return code;
114        }
115    } else if let Err(code) = print_summary_dispatch(&report).await {
116        return code;
117    }
118    post_render_exit_code(&report)
119}
120
121/// Build the aggregated [`ContextEvalReport`] without any rendering.
122/// Pulled out of [`run`] so both the legacy direct-render path and the
123/// .harn dispatch path see the same report.
124fn aggregate_report(args: &EvalContextArgs) -> Result<ContextEvalReport, i32> {
125    let manifest = match load_context_eval_manifest(&args.manifest) {
126        Ok(manifest) => manifest,
127        Err(error) => {
128            eprintln!("error: {error}");
129            return Err(1);
130        }
131    };
132    let report = match evaluate_context_eval_manifest(&manifest) {
133        Ok(report) => report,
134        Err(error) => {
135            eprintln!("error: {error}");
136            return Err(1);
137        }
138    };
139    Ok(report)
140}
141
142fn post_render_exit_code(report: &ContextEvalReport) -> i32 {
143    if report.pass {
144        0
145    } else {
146        1
147    }
148}
149
/// Print to stderr the paths of the three report files under
/// `output_dir` (`summary.json`, `per_run.jsonl`, `summary.md`).
///
/// This only formats the paths; it does not check that the files exist.
fn announce_output_paths(output_dir: &Path) {
    let summary_json = output_dir.join("summary.json");
    let per_run_jsonl = output_dir.join("per_run.jsonl");
    let summary_md = output_dir.join("summary.md");
    eprintln!(
        "wrote {}, {}, and {}",
        summary_json.display(),
        per_run_jsonl.display(),
        summary_md.display()
    );
}
158
159fn write_json_artifacts(output_dir: &Path, report: &ContextEvalReport) -> Result<(), String> {
160    write_json(output_dir.join("summary.json"), report)?;
161    write_jsonl(output_dir.join("per_run.jsonl"), &report.runs)
162}
163
164fn write_json(path: PathBuf, report: &ContextEvalReport) -> Result<(), String> {
165    let payload = serde_json::to_string_pretty(report).map_err(|error| error.to_string())?;
166    fs::write(path, payload).map_err(|error| error.to_string())
167}
168
169fn write_jsonl(path: PathBuf, runs: &[ContextEvalRunReport]) -> Result<(), String> {
170    let mut file = fs::File::create(path).map_err(|error| error.to_string())?;
171    for run in runs {
172        let line = serde_json::to_string(run).map_err(|error| error.to_string())?;
173        file.write_all(line.as_bytes())
174            .map_err(|error| error.to_string())?;
175        file.write_all(b"\n").map_err(|error| error.to_string())?;
176    }
177    Ok(())
178}
179
180// ─── Legacy direct-render path (gated by HARN_CLI_IMPL=rust) ────────────
181
182fn write_markdown_legacy(output_dir: &Path, report: &ContextEvalReport) -> Result<(), String> {
183    fs::write(
184        output_dir.join("summary.md"),
185        legacy_render_markdown(report),
186    )
187    .map_err(|error| error.to_string())
188}
189
190fn print_json_legacy(report: &ContextEvalReport) -> Result<(), i32> {
191    match serde_json::to_string_pretty(report) {
192        Ok(payload) => {
193            println!("{payload}");
194            Ok(())
195        }
196        Err(error) => {
197            eprintln!("error: failed to serialize context eval summary: {error}");
198            Err(1)
199        }
200    }
201}
202
203fn print_summary_legacy(report: &ContextEvalReport) {
204    println!(
205        "context eval: {}/{} passed, mean_correctness={:.2}, mean_tool_quality={:.2}",
206        report.passed_runs,
207        report.total_runs,
208        report.aggregate.mean_final_correctness,
209        report.aggregate.mean_tool_call_quality
210    );
211}
212
213fn legacy_render_markdown(report: &ContextEvalReport) -> String {
214    let mut out = String::new();
215    out.push_str(&format!(
216        "# Context Eval: {}\n\n",
217        report
218            .manifest_name
219            .as_deref()
220            .unwrap_or(report.manifest_id.as_str())
221    ));
222    out.push_str(&format!(
223        "- status: {}\n- runs: {}/{} passed\n- mean correctness: {:.4}\n- mean tool quality: {:.4}\n- input tokens: {}\n- output tokens: {}\n- cost USD: {:.6}\n\n",
224        if report.pass { "PASS" } else { "FAIL" },
225        report.passed_runs,
226        report.total_runs,
227        report.aggregate.mean_final_correctness,
228        report.aggregate.mean_tool_call_quality,
229        report.aggregate.total_input_tokens,
230        report.aggregate.total_output_tokens,
231        report.aggregate.total_cost_usd,
232    ));
233    out.push_str("| task | mode | pass | correctness | tools | reads before edit | input tokens | compactions | cache key |\n");
234    out.push_str("|---|---|---:|---:|---:|---:|---:|---:|---|\n");
235    for run in &report.runs {
236        out.push_str(&format!(
237            "| {} | {} | {} | {:.4} | {:.4} | {} | {} | {} | `{}` |\n",
238            escape_md(&run.task_id),
239            escape_md(&run.mode_id),
240            if run.passed { "yes" } else { "no" },
241            run.final_correctness.score,
242            run.tool_call_quality.score,
243            run.reads_before_first_edit,
244            run.input_tokens,
245            run.compaction_count,
246            run.cache.key,
247        ));
248    }
249    out
250}
251
/// Escape `value` for use inside a markdown table cell by putting a
/// backslash in front of every `|`. All other characters, including
/// existing backslashes, are copied through unchanged.
fn escape_md(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch == '|' {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
255
256// ─── Dispatch (.harn) render path ────────────────────────────────────────
257
258async fn write_markdown_dispatch(output_dir: &Path, report: &ContextEvalReport) -> Result<(), i32> {
259    let payload = render_via_dispatch(report, "markdown").await?;
260    if let Err(error) = fs::write(output_dir.join("summary.md"), payload) {
261        eprintln!("error: failed to write context eval markdown: {error}");
262        return Err(1);
263    }
264    Ok(())
265}
266
267async fn print_summary_dispatch(report: &ContextEvalReport) -> Result<(), i32> {
268    let payload = render_via_dispatch(report, "summary").await?;
269    print!("{payload}");
270    // The script emits exactly the legacy summary line (no trailing
271    // newline); add one to match the legacy `println!` semantics.
272    if !payload.ends_with('\n') {
273        println!();
274    }
275    Ok(())
276}
277
278async fn print_json_dispatch(report: &ContextEvalReport) -> Result<(), i32> {
279    let payload = render_via_dispatch(report, "json").await?;
280    print!("{payload}");
281    if !payload.ends_with('\n') {
282        println!();
283    }
284    Ok(())
285}
286
287/// Dispatch to the embedded `cli/eval/context.harn` script for one of
288/// the three rendering modes (markdown / summary / json). Returns the
289/// captured stdout on success, or a propagated exit code on failure.
290///
291/// **Concurrency.** Held under [`DISPATCH_RENDER_LOCK`] so concurrent
292/// in-process callers don't race on the global env vars the Rust shim
293/// sets to hand the report to the script. See the lock's docstring for
294/// the trade-off rationale.
295async fn render_via_dispatch(report: &ContextEvalReport, mode: &str) -> Result<String, i32> {
296    let report_json = match serde_json::to_string(report) {
297        Ok(json) => json,
298        Err(error) => {
299            eprintln!("error: failed to serialise ContextEvalReport for dispatch: {error}");
300            return Err(1);
301        }
302    };
303
304    let _guard = DISPATCH_RENDER_LOCK.lock().await;
305    let _report = ScopedEnvVar::set(CONTEXT_REPORT_ENV, &report_json);
306    let _mode = ScopedEnvVar::set(CONTEXT_OUTPUT_MODE_ENV, mode);
307
308    let outcome = dispatch::run_embedded_script("eval/context", Vec::new(), false).await;
309    if !outcome.stderr.is_empty() {
310        use std::io::Write as _;
311        let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
312    }
313    if outcome.exit_code != 0 {
314        return Err(outcome.exit_code);
315    }
316    Ok(outcome.stdout)
317}