/**
* `harn eval coding-agent` rendering layer ported to .harn — see
* harn#2307 (W7).
*
* **Pragmatic partial port.** The Rust handler in
* `crates/harn-cli/src/commands/eval_coding_agent.rs` is ~2.2k LOC and
* is tightly entangled with VM internals: it builds a fixture/model/
* tool-format matrix, invokes `execute_run` per cell (which spins up
* the embedded coding-agent driver against live or mock provider
* credentials), snapshots Ollama state via `local::runtime`, fans out
* step-judge presets, scores per-run summaries, builds rollups + a
* native/text comparison + follow-up suggestions, and writes
* `summary.json`, `per_run.jsonl`, and `local_readiness.json` to disk.
* None of that is reachable from script-land today without exposing a
* wider VM surface than W7 should land — the same constraint that
* shaped W5 / W6.
*
* What this script owns: the **rendering / reporting layer** — the
* `summary.md` body, the `followups.md` body, the post-run human one-
* line summary, and the `--json` pretty form of the assembled
* `EvalSummary`. The Rust shim does all the matrix execution, run
* scoring, rollup, comparison, and follow-up generation, collects the
* result into an `EvalSummary`, serialises it to JSON, and hands it
* off here.
*
* The on-disk JSON artifacts (`summary.json`, `per_run.jsonl`,
* `local_readiness.json`) stay on the serde-driven Rust path because
* Harn's `json_stringify_pretty` sorts dict keys alphabetically while
* serde emits struct fields in declaration order — and those artifacts
* are consumed by regression-check / hosted ingestion / the
* experiment driver in `experiments/step-judge/run.sh`, which depend
* on the serde struct-field byte order.
*
* Inputs (from the dispatch shim in crates/harn-cli/src/commands/eval_coding_agent.rs):
* HARN_EVAL_CODING_AGENT_SUMMARY_JSON — serialised `EvalSummary`.
* HARN_EVAL_CODING_AGENT_MODE — one of:
* "markdown" — render the summary.md body to stdout.
* "followups" — render the followups.md body to stdout.
* "summary" — render the one-line human summary to stdout.
* "json" — render the JSON pretty form to stdout for `--json`.
*
* The Rust shim sets these env vars while holding a tokio Mutex so
* concurrent in-process callers (the existing eval_coding_agent_cli
* test plus any future fanout caller) don't clobber each other mid-
* dispatch. Pattern mirrors W5 / W6.
*
* The wider port (replacing the Rust shim) is gated on G4 (#2297)
* exposing `execute_run`, `snapshot_provider`, `llm_pricing_per_1k`,
* and `provider_key_available` to script-land. C1 (#2314) will delete
* the `HARN_CLI_IMPL=rust` escape hatch.
*/
/**
 * Return `value` unchanged when `type_of` reports it as a string;
 * any other kind of value is replaced by `fallback`.
 */
fn __safe_string(value, fallback: string) -> string {
  if type_of(value) != "string" {
    return fallback
  }
  return value
}
/**
 * Return `value` unchanged when `type_of` reports it as a list;
 * any other kind of value is replaced by an empty list.
 */
fn __safe_list(value) -> list {
  if type_of(value) != "list" {
    return []
  }
  return value
}
/**
 * Return `value` unchanged when `type_of` reports it as a dict;
 * any other kind of value is replaced by an empty dict.
 */
fn __safe_dict(value) -> dict {
  if type_of(value) != "dict" {
    return {}
  }
  return value
}
/**
 * Render a number as `"X.YYYYYY"`: integer part, a dot, and exactly
 * six zero-padded fractional digits. Intended to stand in for Rust's
 * `format!("{:.6}", x)` so rendered markdown/summary lines match the
 * legacy path.
 *
 * Inputs that `to_float` cannot convert are treated as `0.0`. A
 * negative input whose rounded magnitude is zero is printed without
 * a leading `-`.
 *
 * NOTE(review): the value is scaled by 10^6 and passed through
 * `round`; this may not agree with Rust's formatter for inputs that
 * sit exactly on a rounding boundary — confirm if byte parity matters
 * for such values.
 */
fn __format_float_6(value) -> string {
  let parsed = to_float(value) ?? 0.0
  let is_negative = parsed < 0.0
  var magnitude = parsed
  if is_negative {
    magnitude = -parsed
  }
  // Work in integer millionths so the two halves can be split with
  // plain integer arithmetic.
  let millionths = to_int(round(magnitude * 1000000.0)) ?? 0
  let whole_part = millionths / 1000000
  let frac_part = millionths - whole_part * 1000000
  var digits = to_string(frac_part)
  while len(digits) < 6 {
    digits = "0" + digits
  }
  let unsigned = to_string(whole_part) + "." + digits
  // Only keep the minus sign when something non-zero survived rounding.
  if is_negative && (frac_part != 0 || whole_part != 0) {
    return "-" + unsigned
  }
  return unsigned
}
/**
 * Render a number as `"+X.Y"` / `"-X.Y"`: an explicit sign, the
 * integer part, a dot, and one fractional digit. Intended to stand in
 * for Rust's `format!("{:+.1}", x)` on the baseline-comparison
 * `net lift: **{:+.1}pp**` line.
 *
 * Inputs that `to_float` cannot convert are treated as `0.0`. Zero,
 * positive values, and negative values whose rounded magnitude is
 * zero all get a `+`; only a negative value that is still non-zero
 * after rounding gets a `-`.
 *
 * NOTE(review): the value is scaled by 10 and passed through `round`;
 * this may not agree with Rust's formatter on exact rounding
 * boundaries or on negative values that round to zero — confirm if
 * byte parity matters for such values.
 */
fn __format_float_signed_1(value) -> string {
  let parsed = to_float(value) ?? 0.0
  let is_negative = parsed < 0.0
  var magnitude = parsed
  if is_negative {
    magnitude = -parsed
  }
  // Work in integer tenths so the two halves can be split with plain
  // integer arithmetic.
  let tenths = to_int(round(magnitude * 10.0)) ?? 0
  let whole_part = tenths / 10
  let frac_part = tenths - whole_part * 10
  let unsigned = to_string(whole_part) + "." + to_string(frac_part)
  if is_negative && (frac_part != 0 || whole_part != 0) {
    return "-" + unsigned
  }
  return "+" + unsigned
}
/**
 * Render a number as `"X.Y"`: integer part, a dot, and one fractional
 * digit. Used for the parity-report percentages, standing in for
 * Rust's `"{:.1}%"`.
 *
 * Inputs that `to_float` cannot convert are treated as `0.0`. A
 * negative input whose rounded magnitude is zero is printed without
 * a leading `-`.
 *
 * NOTE(review): the value is scaled by 10 and passed through `round`;
 * this may not agree with Rust's formatter for inputs that sit
 * exactly on a rounding boundary — confirm if byte parity matters for
 * such values.
 */
fn __format_float_1(value) -> string {
  let parsed = to_float(value) ?? 0.0
  let is_negative = parsed < 0.0
  var magnitude = parsed
  if is_negative {
    magnitude = -parsed
  }
  // Work in integer tenths so the two halves can be split with plain
  // integer arithmetic.
  let tenths = to_int(round(magnitude * 10.0)) ?? 0
  let whole_part = tenths / 10
  let frac_part = tenths - whole_part * 10
  let unsigned = to_string(whole_part) + "." + to_string(frac_part)
  if is_negative && (frac_part != 0 || whole_part != 0) {
    return "-" + unsigned
  }
  return unsigned
}
/**
 * Make a string safe to place inside a Markdown table cell by turning
 * every `|` into `\|`. Counterpart of Rust's
 * `value.replace('|', "\\|")`.
 */
fn __md_escape(s: string) -> string {
  let escaped = s.replace("|", "\\|")
  return escaped
}
/**
 * Produce the display label for a selector dict, following Rust's
 * `selector_label` helper. The serialised `EvalSummary` already emits
 * a `selector.selector` field that matches the label, so a string
 * found there is returned as-is; otherwise the label is assembled as
 * `"provider:model"`, with a missing or non-string part contributing
 * an empty string.
 */
fn __selector_label(selector: dict) -> string {
  let precomputed = selector["selector"]
  if type_of(precomputed) != "string" {
    let provider = __safe_string(selector["provider"], "")
    let model = __safe_string(selector["model"], "")
    return provider + ":" + model
  }
  return precomputed
}
/**
 * Render an `Option<i64>`-shaped value for the native/text comparison
 * table: an int is printed via `to_string`, a float is first passed
 * through `to_int` (falling back to `0` if that yields nothing), and
 * every other kind of value becomes `"-"`. Counterpart of the legacy
 * `value.map(|v| v.to_string()).unwrap_or_else(|| "-".to_string())`.
 */
fn __optional_int_mark(value) -> string {
  let kind = type_of(value)
  if kind == "int" {
    return to_string(value)
  }
  if kind == "float" {
    let as_int = to_int(value) ?? 0
    return to_string(as_int)
  }
  return "-"
}
/**
 * Render an `Option<bool>`-shaped value: `true` becomes `"yes"`,
 * `false` becomes `"no"`, and any non-bool value becomes `"-"`.
 * Counterpart of Rust's `optional_bool_mark`.
 */
fn __optional_bool_mark(value) -> string {
  if type_of(value) != "bool" {
    return "-"
  }
  let mark = if value {
    "yes"
  } else {
    "no"
  }
  return mark
}
/**
 * Build a Markdown link `[label](target)` the way Rust's
 * `markdown_link` does. In the target, spaces become `%20`, `(`
 * becomes `%28` and `)` becomes `%29`; no other characters are
 * encoded. The label has its `|` characters escaped so the link can
 * sit inside a table cell.
 */
fn __markdown_link(label: string, target: string) -> string {
  let without_spaces = target.replace(" ", "%20")
  let without_open_paren = without_spaces.replace("(", "%28")
  let href = without_open_paren.replace(")", "%29")
  return "[" + __md_escape(label) + "](" + href + ")"
}
/**
 * Render the evidence cell for one native/text comparison. A
 * `native` link is emitted when `native_evidence_path` is a string
 * and a `text` link when `text_evidence_path` is a string, in that
 * order, separated by `<br>`. When neither path is a string the cell
 * is `"-"`.
 */
fn __comparison_evidence_links(comparison: dict) -> string {
  let native_path = comparison["native_evidence_path"]
  let text_path = comparison["text_evidence_path"]
  var rendered = []
  if type_of(native_path) == "string" {
    rendered = rendered + [__markdown_link("native", native_path)]
  }
  if type_of(text_path) == "string" {
    rendered = rendered + [__markdown_link("text", text_path)]
  }
  if len(rendered) > 0 {
    return join(rendered, "<br>")
  }
  return "-"
}
/**
 * Render one rollup section of `summary.md`: a `## <title>` heading,
 * a six-column table header, one row per rollup entry, and a trailing
 * blank line.
 *
 * `rollups` is expected to be a list of dicts; a non-list renders as
 * a table with no rows, and a non-dict entry renders as a row of
 * defaults. Only the key cell is Markdown-escaped. Missing counts
 * default to `0` and a missing cost to `0.0`.
 */
fn __render_rollup_table(title: string, rollups) -> string {
  let header = "## " + title + "\n\n"
    + "| key | passed | failed | skipped | total | cost |\n"
    + "|---|---:|---:|---:|---:|---:|\n"
  var rows = ""
  for entry in __safe_list(rollups) {
    let rollup = __safe_dict(entry)
    let cells = [
      "`" + __md_escape(__safe_string(rollup["key"], "")) + "`",
      to_string(rollup["passed_runs"] ?? 0),
      to_string(rollup["failed_runs"] ?? 0),
      to_string(rollup["skipped_runs"] ?? 0),
      to_string(rollup["total_runs"] ?? 0),
      __format_float_6(rollup["total_cost_usd"] ?? 0.0),
    ]
    rows = rows + "| " + join(cells, " | ") + " |\n"
  }
  return header + rows + "\n"
}
/**
 * Render the `## Runs` section of `summary.md`: a thirteen-column
 * table with one row per run.
 *
 * `runs` is expected to be a list of dicts; a non-list renders as a
 * table with no rows. Per row:
 * - the tool-calls cell is the run's `tool_sequence` joined with
 *   `", "` and Markdown-escaped, or `-` when the sequence is empty;
 * - the tokens cell is `input_tokens + output_tokens`, each
 *   defaulting to `0`;
 * - the transcript cell is a link whose label is
 *   `transcript_event_count` and whose target is
 *   `transcript_events_path`.
 * Of the plain string cells only the model is Markdown-escaped; the
 * others are emitted as-is.
 */
fn __render_runs_table(runs) -> string {
  var table = "\n## Runs\n\n"
    + "| fixture | run | provider | model | tool format | fixture sequence | tool calls | status | iterations | tokens | cost | transcript | output |\n"
    + "|---|---|---|---|---|---|---|---|---:|---:|---:|---|---|\n"
  for entry in __safe_list(runs) {
    let run_row = __safe_dict(entry)
    let selector = __safe_dict(run_row["selector"])
    let calls = __safe_list(run_row["tool_sequence"])
    var calls_cell = "-"
    if len(calls) != 0 {
      calls_cell = __md_escape(join(calls, ", "))
    }
    let total_tokens = (run_row["input_tokens"] ?? 0) + (run_row["output_tokens"] ?? 0)
    let transcript_link = __markdown_link(
      to_string(run_row["transcript_event_count"] ?? 0),
      __safe_string(run_row["transcript_events_path"], ""),
    )
    let cells = [
      "`" + __safe_string(run_row["fixture_id"], "") + "`",
      "`" + __safe_string(run_row["run_id"], "") + "`",
      "`" + __safe_string(selector["provider"], "") + "`",
      "`" + __md_escape(__safe_string(selector["model"], "")) + "`",
      "`" + __safe_string(run_row["tool_format"], "") + "`",
      "`" + __safe_string(run_row["fixture_tool_sequence"], "") + "`",
      "`" + calls_cell + "`",
      __safe_string(run_row["status"], ""),
      to_string(run_row["iterations"] ?? 0),
      to_string(total_tokens),
      __format_float_6(run_row["cost_usd"] ?? 0.0),
      transcript_link,
      "`" + __safe_string(run_row["output_dir"], "") + "`",
    ]
    table = table + "| " + join(cells, " | ") + " |\n"
  }
  return table
}
/**
 * Render one `### <heading>` subsection listing per-fixture status
 * changes as `fixture: baseline_status → cell_status` bullets,
 * followed by a blank line. Returns an empty string when `deltas` is
 * empty so the caller can append the result unconditionally.
 */
fn __render_baseline_deltas(heading: string, deltas: list) -> string {
  if len(deltas) == 0 {
    return ""
  }
  var section = "### " + heading + "\n\n"
  for entry in deltas {
    let delta = __safe_dict(entry)
    section = section + "- `"
      + __safe_string(delta["fixture_id"], "")
      + "`: `"
      + __safe_string(delta["baseline_status"], "")
      + "` → `"
      + __safe_string(delta["cell_status"], "")
      + "`\n"
  }
  return section + "\n"
}
/**
 * Render the `## Baseline Comparison` section of `summary.md`.
 *
 * Output, in order: the heading; a "Compared against" line naming
 * `baseline_path` (plus the label in parentheses when
 * `baseline_label` is a non-empty string); the regression count,
 * recovery count and signed net lift in percentage points; then the
 * optional `Regressions` and `Recoveries` subsections, each emitted
 * only when its list is non-empty.
 */
fn __render_baseline_comparison(comparison: dict) -> string {
  let label = __safe_string(comparison["baseline_label"], "")
  var label_suffix = ""
  if label != "" {
    label_suffix = " (label: `" + label + "`)"
  }
  let intro = "Compared against `"
    + __safe_string(comparison["baseline_path"], "")
    + "`"
    + label_suffix
    + ".\n\n"
  let regressions_line = "- regressions: **"
    + to_string(comparison["regressions_count"] ?? 0)
    + "** (baseline passed, this cell failed)\n"
  let recoveries_line = "- recoveries: **"
    + to_string(comparison["recoveries_count"] ?? 0)
    + "** (baseline failed, this cell passed)\n"
  let lift_line = "- net lift: **"
    + __format_float_signed_1(comparison["net_lift_pp"] ?? 0.0)
    + "pp**\n\n"
  return "\n## Baseline Comparison\n\n"
    + intro
    + regressions_line
    + recoveries_line
    + lift_line
    + __render_baseline_deltas("Regressions", __safe_list(comparison["regressions"]))
    + __render_baseline_deltas("Recoveries", __safe_list(comparison["recoveries"]))
}
/**
 * Render the `## Native/Text Comparison` section of `summary.md`: an
 * eleven-column table with one row per comparison entry.
 *
 * `comparisons` is expected to be a list of dicts; a non-list renders
 * as a table with no rows. The native and text status cells fall back
 * to `-` when the field is not a string, the three match columns go
 * through `__optional_bool_mark`, the three delta columns through
 * `__optional_int_mark`, and the last column holds the evidence
 * links.
 */
fn __render_comparison_table(comparisons) -> string {
  var table = "\n## Native/Text Comparison\n\n"
    + "| fixture | selector | native | text | equivalent | verifier | tools | rejected delta | token delta | iteration delta | evidence |\n"
    + "|---|---|---|---|---|---|---|---:|---:|---:|---|\n"
  for entry in __safe_list(comparisons) {
    let row = __safe_dict(entry)
    let cells = [
      "`" + __safe_string(row["fixture_id"], "") + "`",
      "`" + __selector_label(__safe_dict(row["selector"])) + "`",
      __safe_string(row["native_status"], "-"),
      __safe_string(row["text_status"], "-"),
      __optional_bool_mark(row["equivalent"]),
      __optional_bool_mark(row["verifier_match"]),
      __optional_bool_mark(row["tool_sequence_match"]),
      __optional_int_mark(row["rejected_tool_call_delta_text_minus_native"]),
      __optional_int_mark(row["token_delta_text_minus_native"]),
      __optional_int_mark(row["iteration_delta_text_minus_native"]),
      __comparison_evidence_links(row),
    ]
    table = table + "| " + join(cells, " | ") + " |\n"
  }
  return table
}
/**
 * Render the `## Native/Text Divergence Evidence` section of
 * `summary.md`.
 *
 * Only comparisons with a non-empty `divergence_reasons` list
 * contribute. Each one becomes a bullet naming the fixture and
 * selector followed by the reasons joined with `"; "`. When the
 * comparison's `evidence_paths` list is non-empty, an `Evidence:`
 * line with the native/text links follows. If no comparison diverged
 * the whole section is omitted and an empty string is returned.
 */
fn __render_divergence_evidence(comparisons) -> string {
  var bullets = ""
  for entry in __safe_list(comparisons) {
    let comparison = __safe_dict(entry)
    let reasons = __safe_list(comparison["divergence_reasons"])
    if len(reasons) > 0 {
      bullets = bullets + "- `"
        + __safe_string(comparison["fixture_id"], "")
        + "` `"
        + __selector_label(__safe_dict(comparison["selector"]))
        + "`: "
        + join(reasons, "; ")
        + "\n"
      if len(__safe_list(comparison["evidence_paths"])) > 0 {
        bullets = bullets + " Evidence: "
          + __comparison_evidence_links(comparison)
          + "\n"
      }
    }
  }
  // Every diverged comparison appends at least its bullet prefix, so
  // an empty accumulator means nothing diverged.
  if bullets == "" {
    return ""
  }
  return "\n## Native/Text Divergence Evidence\n\n" + bullets
}
/**
 * Render the `## Parity report — native vs text` section of
 * `summary.md`: a ten-column table with one row per provider/model
 * pair.
 *
 * `parity_by_pair` is expected to be a list of dicts; a non-list
 * renders as a table with no rows. The selector cell is
 * `provider:model`. The four rate columns are multiplied by 100 and
 * printed with one decimal and a `%` suffix; a rate that `to_float`
 * cannot convert counts as `0.0`. Missing counts default to `0`.
 */
fn __render_parity_report(parity_by_pair) -> string {
  var table = "\n## Parity report — native vs text\n\n"
    + "| selector | sample | native pass | text pass | agreement | verifier divergence | native_only | text_only | both_pass | both_fail |\n"
    + "|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|\n"
  for entry in __safe_list(parity_by_pair) {
    let pair = __safe_dict(entry)
    let pair_label = __safe_string(pair["provider"], "") + ":" + __safe_string(pair["model"], "")
    let counts = __safe_dict(pair["divergence_counts"])
    let native_stats = __safe_dict(pair["native"])
    let text_stats = __safe_dict(pair["text"])
    let native_pct = __format_float_1((to_float(native_stats["pass_rate"]) ?? 0.0) * 100.0)
    let text_pct = __format_float_1((to_float(text_stats["pass_rate"]) ?? 0.0) * 100.0)
    let agreement_pct = __format_float_1((to_float(pair["agreement_rate"]) ?? 0.0) * 100.0)
    let divergence_pct = __format_float_1((to_float(pair["verifier_divergence_rate"]) ?? 0.0) * 100.0)
    let cells = [
      "`" + pair_label + "`",
      to_string(pair["sample_size"] ?? 0),
      native_pct + "%",
      text_pct + "%",
      agreement_pct + "%",
      divergence_pct + "%",
      to_string(counts["native_only_pass"] ?? 0),
      to_string(counts["text_only_pass"] ?? 0),
      to_string(counts["both_pass"] ?? 0),
      to_string(counts["both_fail"] ?? 0),
    ]
    table = table + "| " + join(cells, " | ") + " |\n"
  }
  return table
}
/**
 * Assemble the full `summary.md` body from a parsed `EvalSummary`.
 *
 * Sections, in order: the title and headline counters; the five
 * rollup tables (fixture, provider, model, tool format, tool
 * sequence); the runs table; the baseline comparison (only when
 * `baseline_comparison` is a dict); the native/text comparison table
 * (only when `comparisons` is non-empty); the parity report (only
 * when `parity_by_pair` is non-empty); and the divergence evidence,
 * which renders as nothing when no comparison diverged.
 */
fn __render_markdown(summary: dict) -> string {
  let fixtures_line = "- fixtures: `" + join(__safe_list(summary["fixture_ids"]), "`, `") + "`\n"
  let passed_line = "- passed: "
    + to_string(summary["passed_runs"] ?? 0)
    + "/"
    + to_string(summary["total_runs"] ?? 0)
    + "\n"
  let skipped_line = "- skipped: " + to_string(summary["skipped_runs"] ?? 0) + "\n"
  let cost_line = "- total_cost_usd: " + __format_float_6(summary["total_cost_usd"] ?? 0.0) + "\n\n"
  var doc = "# Coding Agent Harness Quality Suite\n\n"
    + fixtures_line
    + passed_line
    + skipped_line
    + cost_line

  let rollups = __safe_dict(summary["rollups"])
  doc = doc
    + __render_rollup_table("By Fixture", rollups["by_fixture"])
    + __render_rollup_table("By Provider", rollups["by_provider"])
    + __render_rollup_table("By Model", rollups["by_model"])
    + __render_rollup_table("By Tool Format", rollups["by_tool_format"])
    + __render_rollup_table("By Tool Sequence", rollups["by_tool_sequence"])
    + __render_runs_table(summary["runs"])

  let baseline = summary["baseline_comparison"]
  if type_of(baseline) == "dict" {
    doc = doc + __render_baseline_comparison(baseline)
  }
  let comparisons = __safe_list(summary["comparisons"])
  if len(comparisons) > 0 {
    doc = doc + __render_comparison_table(comparisons)
  }
  let parity_pairs = __safe_list(summary["parity_by_pair"])
  if len(parity_pairs) > 0 {
    doc = doc + __render_parity_report(parity_pairs)
  }
  return doc + __render_divergence_evidence(comparisons)
}
/**
 * Assemble the `followups.md` body from a parsed `EvalSummary`.
 *
 * With no follow-ups, the document is the title plus a single
 * "nothing generated" sentence. Otherwise each follow-up becomes a
 * `## <title>` section containing its body, then optional `run_ids`
 * and `labels` bullets (each emitted only when its list is
 * non-empty), then a blank line.
 */
fn __render_followups(summary: dict) -> string {
  let title = "# Follow-up Issue Candidates\n\n"
  let candidates = __safe_list(summary["followups"])
  if len(candidates) == 0 {
    return title + "No follow-up issue candidates were generated from this run.\n"
  }
  var doc = title
  for candidate in candidates {
    let entry = __safe_dict(candidate)
    var section = "## "
      + __safe_string(entry["title"], "")
      + "\n\n"
      + __safe_string(entry["body"], "")
      + "\n\n"
    let run_ids = __safe_list(entry["run_ids"])
    if len(run_ids) > 0 {
      section = section + "- run_ids: `" + join(run_ids, "`, `") + "`\n"
    }
    let labels = __safe_list(entry["labels"])
    if len(labels) > 0 {
      section = section + "- labels: `" + join(labels, "`, `") + "`\n"
    }
    doc = doc + section + "\n"
  }
  return doc
}
/**
 * Build the one-line human summary printed after a run:
 * `coding-agent eval: P/T passed, S skipped, total_cost_usd=C`.
 * Missing counters default to `0` and a missing cost to `0.0`; the
 * cost is printed with six decimals. No trailing newline is added.
 */
fn __render_summary_line(summary: dict) -> string {
  let passed = to_string(summary["passed_runs"] ?? 0)
  let total = to_string(summary["total_runs"] ?? 0)
  let skipped = to_string(summary["skipped_runs"] ?? 0)
  let cost = __format_float_6(summary["total_cost_usd"] ?? 0.0)
  return "coding-agent eval: " + passed + "/" + total + " passed, " + skipped
    + " skipped, total_cost_usd="
    + cost
}
/**
 * Script entrypoint: read the serialised `EvalSummary` and the render
 * mode from the environment, render the requested form, and print it.
 *
 * Exit codes: `0` on success; `70` when
 * `HARN_EVAL_CODING_AGENT_SUMMARY_JSON` is empty/unset or cannot be
 * parsed as JSON (a diagnostic goes to stderr in both cases).
 *
 * `HARN_EVAL_CODING_AGENT_MODE` defaults to `"summary"`; any value
 * other than `"markdown"`, `"followups"` or `"json"` also renders the
 * one-line summary.
 *
 * The exit code is returned rather than passed to `exit()` so the
 * dispatch wedge's captured stdout/stderr buffers flush back to the
 * Rust shim — `exit()` in the embedded `harn run` pipeline calls
 * `std::process::exit`, which terminates the host binary mid-render
 * and drops the captured streams.
 */
fn main(harness: Harness) -> int {
  let summary_json = harness.env.get_or("HARN_EVAL_CODING_AGENT_SUMMARY_JSON", "")
  if summary_json == "" {
    harness.stdio.eprintln("internal error: HARN_EVAL_CODING_AGENT_SUMMARY_JSON not set by dispatch shim")
    return 70
  }
  let summary = try {
    json_parse(summary_json)
  } catch (e) {
    harness.stdio.eprintln("internal error: failed to parse EvalSummary: " + to_string(e))
    return 70
  }
  let mode = harness.env.get_or("HARN_EVAL_CODING_AGENT_MODE", "summary")
  var rendered = ""
  if mode == "markdown" {
    rendered = __render_markdown(summary)
  } else if mode == "followups" {
    rendered = __render_followups(summary)
  } else if mode == "json" {
    rendered = json_stringify_pretty(summary)
  } else {
    // "summary" and every unrecognised mode land here.
    rendered = __render_summary_line(summary)
  }
  __io_print(rendered)
  return 0
}