/**
* `harn eval prompt` reporting layer ported to .harn — see harn#2305
* (W5).
*
* **Pragmatic partial port.** The full `eval prompt` Rust handler is
* ~1000 LOC and is tightly entangled with VM internals (it pushes
* `LlmRenderContext` guards around `render_template_to_string`, builds
* tempfile harn pipelines to drive `llm_call`/judge fanout, reads the
* provider catalog through `llm_config`, and routes context-fixture
* evaluation through `harn_vm::orchestration::assemble_context`). None
* of that is reachable from script-land today without exposing a wider
* VM surface than W5 should land.
*
* What this script owns: the **rendering / reporting layer** — the
* three output formats (terminal, JSON, HTML) and writing them to
* stdout or `--out-file`. The Rust shim does the fleet resolution,
* fleet rendering, run/judge fanout, and context-fixture evaluation,
* collects the result into a `PromptReport`, serialises it to JSON,
* and hands it off here.
*
* The wider port (replacing `eval_prompt.rs` with a thin shim) is
* deferred until G4 (#2297) exposes the missing host capabilities
* (template render with `LlmRenderContext`, provider-catalog
* resolution, `llm_call` parameterised by provider/model). That work
* is intentionally scoped out of W5 to keep this PR reviewable.
*
* Inputs (from the dispatch shim in crates/harn-cli/src/commands/eval_prompt.rs):
* HARN_EVAL_PROMPT_REPORT_JSON — serialised `PromptReport` (see the
* struct in eval_prompt.rs for the canonical shape). Top-level
* keys: `template_path`, `mode`, `renders`, `runs` (optional),
* `judge` (optional), `context_eval` (optional).
* HARN_EVAL_PROMPT_OUTPUT — "terminal" | "json" | "html".
*
* The Rust shim serialises both env vars under a tokio Mutex so
* concurrent in-process callers (the existing eval_prompt_cli tests)
* don't clobber each other mid-dispatch.
*
* The script always emits its rendered payload to stdout. `--out-file`
* is handled on the Rust side after dispatch returns: the script runs
* under the standard `harn run` sandbox where `harness.fs.write_text`
* is restricted to `workspace_roots`, but users invoke `--out-file
* /tmp/...` all the time. Capturing the script's stdout and writing it
* from the unsandboxed shim preserves that behavior.
*/
/**
 * Coerce an untyped JSON value to a string.
 *
 * Returns `value` unchanged when `type_of` reports it as a string;
 * any other type (including nil) yields `fallback`.
 */
fn __safe_string(value, fallback: string) -> string {
  if type_of(value) != "string" {
    return fallback
  }
  return value
}
/**
 * Coerce an untyped JSON value to a list.
 *
 * Returns `value` unchanged when it is a list; every other type
 * (including nil) yields an empty list so callers can iterate safely.
 */
fn __safe_list(value) -> list {
  let is_list = type_of(value) == "list"
  if !is_list {
    return []
  }
  return value
}
/**
 * Coerce an untyped JSON value to a dict.
 *
 * Returns `value` unchanged when it is a dict; every other type
 * (including nil) yields an empty dict so key lookups return nil
 * instead of failing.
 */
fn __safe_dict(value) -> dict {
  if type_of(value) != "dict" {
    return {}
  }
  return value
}
/**
 * Report whether `s` ends in a "\n" character.
 *
 * The empty string is handled up front so the last-element index is
 * never computed for it.
 */
fn __ends_with_newline(s: string) -> bool {
  let size = len(s)
  if size > 0 {
    return s[size - 1] == "\n"
  }
  return false
}
/**
 * Render a number with exactly three decimal places (`"X.YYY"`),
 * zero-padding the fractional part. Intended to match the output of
 * Rust's `format!("{:.3}", x)`, which Harn's own `to_string` for
 * floats cannot produce because it does not pad.
 *
 * A value that `to_float` cannot convert is treated as 0.0. The sign
 * is only emitted when the rounded result is non-zero, so a tiny
 * negative input renders as "0.000".
 */
fn __format_float_3(value) -> string {
  let number = to_float(value) ?? 0.0
  let is_negative = number < 0.0
  var magnitude = number
  if is_negative {
    magnitude = -number
  }
  // Work in integer thousandths so padding is a plain string operation.
  let thousandths = to_int(round(magnitude * 1000.0)) ?? 0
  let int_part = thousandths / 1000
  let frac_part = thousandths - int_part * 1000
  var digits = to_string(frac_part)
  while len(digits) < 3 {
    digits = "0" + digits
  }
  var prefix = ""
  if is_negative && (int_part != 0 || frac_part != 0) {
    prefix = "-"
  }
  return prefix + to_string(int_part) + "." + digits
}
/**
* ─── Terminal rendering ───────────────────────────────────────────────────
*/
/**
 * Summarise how two lists of lines differ, in one human-readable line.
 *
 * Lines are compared as sets (duplicates collapse), following the
 * BTreeSet semantics of the Rust implementation this was ported from.
 * Returns:
 *   - "" when both sides have the same unique lines and the same
 *     raw line count;
 *   - an "N vs M lines ..." note when the unique lines agree but the
 *     raw counts differ;
 *   - otherwise the number of unique lines exclusive to each side.
 */
fn __line_diff_summary(baseline, candidate) -> string {
  // Dicts keyed by line text stand in for sets.
  var seen_baseline = {}
  for ln in baseline {
    seen_baseline = seen_baseline + {[ln]: true}
  }
  var seen_candidate = {}
  for ln in candidate {
    seen_candidate = seen_candidate + {[ln]: true}
  }

  var missing_here = 0
  for unique in keys(seen_baseline) {
    if seen_candidate[unique] == nil {
      missing_here = missing_here + 1
    }
  }
  var extra_here = 0
  for unique in keys(seen_candidate) {
    if seen_baseline[unique] == nil {
      extra_here = extra_here + 1
    }
  }

  if missing_here != 0 || extra_here != 0 {
    return to_string(missing_here) + " line(s) only in baseline, "
      + to_string(extra_here)
      + " line(s) only here"
  }

  // Same unique content; only a raw-count mismatch is worth reporting.
  let baseline_size = len(baseline)
  let candidate_size = len(candidate)
  if baseline_size != candidate_size {
    return to_string(baseline_size) + " vs " + to_string(candidate_size)
      + " lines (same content set, different ordering or repeats)"
  }
  return ""
}
/**
 * Find the first entry in `renders` whose `rendered` field is a string
 * and return that text. Entries that are not dicts, or that lack a
 * string `rendered`, are skipped. Returns "" when none qualifies.
 */
fn __first_rendered(renders) -> string {
  for candidate in renders {
    let fields = __safe_dict(candidate)
    let body = fields["rendered"]
    if type_of(body) == "string" {
      return body
    }
  }
  return ""
}
/**
 * Build the terminal block for one render entry.
 *
 * Parameters:
 *   render         — dict describing one render (selector, provider,
 *                    model, family, auth_available, error, rendered).
 *   idx            — position of this render in the report; printed in
 *                    the heading and used to decide whether to diff.
 *   baseline_lines — lines of the baseline render; a diff summary is
 *                    appended only when idx > 0 and this is non-empty.
 *
 * A render carrying a string `error` prints that error and stops. A
 * render with neither an error nor a string `rendered` prints only the
 * heading (plus the auth note, if any).
 */
fn __terminal_render_section(render, idx, baseline_lines) -> string {
  let selector = __safe_string(render["selector"], "")
  let provider = __safe_string(render["provider"], "")
  let model = __safe_string(render["model"], "")
  let family = __safe_string(render["family"], "")
  var section = "## [" + to_string(idx) + "] " + selector + " (" + provider + "/" + model
    + ") family="
    + family
    + "\n"

  // A missing auth_available flag counts as "available".
  if !(render["auth_available"] ?? true) {
    section = section + " auth: not configured\n"
  }

  let failure = render["error"]
  if type_of(failure) == "string" {
    return section + " render error: " + failure + "\n\n"
  }

  let body = render["rendered"]
  if type_of(body) != "string" {
    return section
  }

  // Fence the rendered text, making sure the closing fence starts on
  // its own line.
  section = section + "---\n" + body
  if !__ends_with_newline(body) {
    section = section + "\n"
  }
  section = section + "---\n"

  let wants_diff = idx > 0 && len(baseline_lines) > 0
  if wants_diff {
    // Drop the empty tail that splitting a "\n"-terminated string
    // leaves behind, so the result matches Rust's `lines()`.
    var own_lines = split(body, "\n")
    let own_size = len(own_lines)
    if own_size > 0 && own_lines[own_size - 1] == "" {
      own_lines = own_lines[0:own_size - 1]
    }
    let summary = __line_diff_summary(baseline_lines, own_lines)
    if summary != "" {
      section = section + " diff vs #0: " + summary + "\n"
    }
  }

  return section + "\n"
}
/**
 * Build the terminal "Context fixture gates" section.
 *
 * Emits an overall passed/total heading, then one sub-heading per
 * fixture and one bullet per case showing pass/fail, the overall
 * score to three decimals, the selected artifact ids, and token usage
 * against budget. Each entry of a case's `failures` list is printed on
 * its own line beneath the bullet. Missing counters default to 0 and a
 * missing `pass` flag is treated as a failure.
 */
fn __terminal_render_context_eval(context_eval) -> string {
  let gates_passed = to_string(context_eval["passed"] ?? 0)
  let gates_total = to_string(context_eval["total"] ?? 0)
  var section = "\n# Context fixture gates: " + gates_passed + " passed / " + gates_total
    + " total\n"

  for fixture in __safe_list(context_eval["fixtures"]) {
    let fx = __safe_dict(fixture)
    let fx_path = __safe_string(fx["path"], "")
    let fx_passed = to_string(fx["passed"] ?? 0)
    let fx_total = to_string(fx["total"] ?? 0)
    section = section + "\n## " + fx_path + " (" + fx_passed + " passed / " + fx_total
      + " total)\n"

    for case in __safe_list(fx["cases"]) {
      let cs = __safe_dict(case)
      let score = __safe_dict(cs["score"])
      let budget = __safe_dict(cs["budget"])
      let selected = __safe_list(cs["selected_artifact_ids"])

      var outcome = "fail"
      if cs["pass"] ?? false {
        outcome = "pass"
      }

      let case_id = __safe_string(cs["id"], "")
      let score_text = __format_float_3(score["overall"] ?? 0.0)
      let picked = join(selected, ", ")
      let used_tokens = to_string(budget["total_tokens"] ?? 0)
      let budget_tokens = to_string(budget["budget_tokens"] ?? 0)
      section = section + "- " + case_id + ": " + outcome + " score=" + score_text
        + " selected=["
        + picked
        + "] tokens="
        + used_tokens
        + "/"
        + budget_tokens
        + "\n"

      for failure in __safe_list(cs["failures"]) {
        section = section + " failure: " + to_string(failure) + "\n"
      }
    }
  }
  return section
}
/**
 * Build the terminal "Model responses" section.
 *
 * Returns "" when the report has no `runs` entries. Otherwise walks
 * `renders` in order and, for each selector that has a run, prints a
 * heading followed by exactly one of: a skipped note, the run error,
 * or the fenced response text. Renders without a run are omitted.
 */
fn __terminal_render_runs(report) -> string {
  let runs = __safe_dict(report["runs"])
  if len(keys(runs)) == 0 {
    return ""
  }

  var section = "\n# Model responses\n"
  for render in __safe_list(report["renders"]) {
    let rd = __safe_dict(render)
    let selector = __safe_string(rd["selector"], "")
    let run = runs[selector]
    if run != nil {
      let run_d = __safe_dict(run)
      let run_error = run_d["error"]
      let response = run_d["response"]
      section = section + "\n## " + selector + " (" + __safe_string(rd["model"], "") + ")\n"

      // Precedence: skipped, then error, then response.
      if run_d["skipped"] ?? false {
        section = section + " skipped: unauthenticated provider\n"
      } else if type_of(run_error) == "string" {
        section = section + " error: " + run_error + "\n"
      } else if type_of(response) == "string" {
        section = section + "---\n" + response
        if !__ends_with_newline(response) {
          section = section + "\n"
        }
        section = section + "---\n"
      }
    }
  }
  return section
}
/**
 * Render the whole report as plain text for the terminal.
 *
 * Layout: a title line with the template path and mode, one section
 * per render (each diffed against the first render that produced
 * text), then the optional context-fixture, model-response and judge
 * sections in that order.
 */
fn __render_terminal(report) -> string {
  let template_path = __safe_string(report["template_path"], "")
  let mode = __safe_string(report["mode"], "")
  var page = "# harn eval prompt — " + template_path + " (mode: " + mode + ")\n\n"

  let renders = __safe_list(report["renders"])

  // Pre-split the baseline once; drop the empty tail left by a
  // terminating "\n".
  let baseline_text = __first_rendered(renders)
  var baseline_lines = []
  if baseline_text != "" {
    baseline_lines = split(baseline_text, "\n")
    let piece_total = len(baseline_lines)
    if piece_total > 0 && baseline_lines[piece_total - 1] == "" {
      baseline_lines = baseline_lines[0:piece_total - 1]
    }
  }

  var position = 0
  while position < len(renders) {
    page = page + __terminal_render_section(__safe_dict(renders[position]), position, baseline_lines)
    position = position + 1
  }

  let context_eval = report["context_eval"]
  if type_of(context_eval) == "dict" {
    page = page + __terminal_render_context_eval(context_eval)
  }

  page = page + __terminal_render_runs(report)

  let judge = report["judge"]
  if type_of(judge) == "dict" {
    let judge_model = __safe_string(judge["judge_model"], "")
    let verdict = __safe_string(judge["verdict"], "")
    page = page + "\n# Judge verdict (" + judge_model + "): \n" + verdict + "\n"
  }
  return page
}
/**
* ─── JSON rendering ───────────────────────────────────────────────────────
*/
/**
 * Render the report as pretty-printed JSON followed by one newline.
 *
 * Per the original port notes, `json_stringify_pretty` orders keys
 * alphabetically, so the bytes differ from serde's struct-field order
 * while the parsed value is the same; the parity tests compare
 * structure rather than bytes.
 */
fn __render_json(report) -> string {
  let pretty = json_stringify_pretty(report)
  // Trailing newline matches the Rust side's `format!("{s}\n")`.
  return pretty + "\n"
}
/**
* ─── HTML rendering ───────────────────────────────────────────────────────
*/
/**
 * Escape text for safe inclusion in HTML element content or a quoted
 * attribute value.
 *
 * Replaces the five HTML-significant characters with entities:
 *   &  ->  &amp;
 *   <  ->  &lt;
 *   >  ->  &gt;
 *   "  ->  &quot;
 *   '  ->  &#39;
 *
 * The replacement strings had been entity-decoded into the bare
 * characters, which turned every replace into a no-op (and left the
 * double-quote case as an invalid literal); they are restored here.
 */
fn __html_escape(s: string) -> string {
  // The ampersand replace must run first: the later replacements
  // introduce ampersands of their own, which must not be re-escaped.
  return s
    .replace("&", "&amp;")
    .replace("<", "&lt;")
    .replace(">", "&gt;")
    .replace("\"", "&quot;")
    .replace("'", "&#39;")
}
/**
 * Build the HTML card (`<div class="card">`) for one render entry.
 *
 * Parameters:
 *   render — dict describing one render (selector, provider, model,
 *            family, auth_available, error, rendered).
 *   runs   — dict of run results keyed by selector; the entry for this
 *            render's selector, if it is a dict, is appended to the card.
 *
 * All report-supplied text is passed through `__html_escape`. For both
 * the render and the run, an error takes precedence over the text
 * payload, and a skipped run takes precedence over either.
 */
fn __html_render_card(render, runs) -> string {
  let selector = __safe_string(render["selector"], "")
  let provider = __safe_string(render["provider"], "")
  let model = __safe_string(render["model"], "")
  let family = __safe_string(render["family"], "")
  var card = "<div class=\"card\"><h2>" + __html_escape(selector) + " <span class=\"meta\">("
    + __html_escape(provider)
    + " / "
    + __html_escape(model)
    + " · "
    + __html_escape(family)
    + ")</span></h2>"

  // A missing auth_available flag counts as "available".
  let has_auth = render["auth_available"] ?? true
  if !has_auth {
    card = card + "<p class=\"skip\">auth: not configured</p>"
  }

  let render_error = render["error"]
  let rendered = render["rendered"]
  if type_of(render_error) == "string" {
    card = card + "<p class=\"err\">render error: " + __html_escape(render_error) + "</p>"
  } else if type_of(rendered) == "string" {
    card = card + "<pre>" + __html_escape(rendered) + "</pre>"
  }

  let run = runs[selector]
  if type_of(run) != "dict" {
    return card + "</div>"
  }
  if run["skipped"] ?? false {
    return card + "<p class=\"skip\">run: skipped (no credentials)</p>" + "</div>"
  }

  let run_error = run["error"]
  let response = run["response"]
  if type_of(run_error) == "string" {
    card = card + "<p class=\"err\">run error: " + __html_escape(run_error) + "</p>"
  } else if type_of(response) == "string" {
    card = card + "<h3>response</h3><pre>" + __html_escape(response) + "</pre>"
  }
  return card + "</div>"
}
/**
 * Render the whole report as a single self-contained HTML document
 * (inline stylesheet, no external assets), terminated by a newline.
 *
 * Layout: page header with template path and mode, a grid with one
 * card per render (including its run result, if any), then the
 * optional context-fixture summary and judge verdict.
 *
 * Every string field taken from the report is HTML-escaped before
 * being inserted. That includes `mode`, which was previously the one
 * string field inserted raw; output is unchanged for mode values
 * without HTML-significant characters.
 */
fn __render_html(report) -> string {
  // Document head and inline stylesheet.
  var out = "<!doctype html><html><head><meta charset=\"utf-8\"><title>"
    + "harn eval prompt report</title>"
  out = out + "<style>body{font-family:system-ui,sans-serif;margin:2rem;color:#222}h1{margin-bottom:0}"
  out = out + ".meta{color:#666;font-size:0.9rem;margin-bottom:1.5rem}"
  out = out + ".grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(28rem,1fr));gap:1rem}"
  out = out + ".card{border:1px solid #ddd;border-radius:6px;padding:1rem;background:#fafafa}"
  out = out + ".card h2{margin-top:0;font-size:1rem}"
  out = out
    + "pre{background:#fff;border:1px solid #eee;padding:0.75rem;overflow:auto;white-space:pre-wrap;font-size:0.85rem}"
  out = out + ".err{color:#b00}.skip{color:#888;font-style:italic}"
  out = out + "</style></head><body>"

  // Page header: template path and mode, both escaped.
  out = out + "<h1>harn eval prompt</h1><div class=\"meta\">"
    + __html_escape(__safe_string(report["template_path"], ""))
    + " · mode: "
    + __html_escape(__safe_string(report["mode"], ""))
    + "</div>"

  // One card per render, in report order.
  out = out + "<div class=\"grid\">"
  let runs = __safe_dict(report["runs"])
  for render in __safe_list(report["renders"]) {
    out = out + __html_render_card(__safe_dict(render), runs)
  }
  out = out + "</div>"

  // Optional context-fixture gate summary.
  let context_eval = report["context_eval"]
  if type_of(context_eval) == "dict" {
    out = out + "<h2>Context fixture gates</h2><p>"
      + to_string(context_eval["passed"] ?? 0)
      + " passed / "
      + to_string(context_eval["total"] ?? 0)
      + " total</p>"
    for fixture in __safe_list(context_eval["fixtures"]) {
      let fx = __safe_dict(fixture)
      out = out + "<h3>" + __html_escape(__safe_string(fx["path"], "")) + "</h3><ul>"
      for case in __safe_list(fx["cases"]) {
        let cs = __safe_dict(case)
        let score = __safe_dict(cs["score"])
        let budget = __safe_dict(cs["budget"])
        // A missing `pass` flag is reported as a failure.
        let pass_label = if cs["pass"] ?? false {
          "pass"
        } else {
          "fail"
        }
        let selected = __safe_list(cs["selected_artifact_ids"])
        out = out + "<li><strong>" + __html_escape(__safe_string(cs["id"], ""))
          + "</strong>: "
          + pass_label
          + " · score "
          + __format_float_3(score["overall"] ?? 0.0)
          + " · selected ["
          + __html_escape(join(selected, ", "))
          + "] · tokens "
          + to_string(budget["total_tokens"] ?? 0)
          + "/"
          + to_string(budget["budget_tokens"] ?? 0)
          + "</li>"
      }
      out = out + "</ul>"
    }
  }

  // Optional judge verdict.
  let judge = report["judge"]
  if type_of(judge) == "dict" {
    out = out + "<h2>Judge (" + __html_escape(__safe_string(judge["judge_model"], "")) + ")</h2>"
      + "<pre>"
      + __html_escape(__safe_string(judge["verdict"], ""))
      + "</pre>"
  }
  out = out + "</body></html>\n"
  return out
}
/**
* ─── Entrypoint ───────────────────────────────────────────────────────────
*/
/**
 * Script entrypoint: read the serialised report from the environment,
 * render it in the requested format, and print it to stdout.
 *
 * Reads HARN_EVAL_PROMPT_REPORT_JSON (required) and
 * HARN_EVAL_PROMPT_OUTPUT (defaults to "terminal"; any value other
 * than "json" or "html" also selects the terminal renderer). A missing
 * or unparseable report is reported on stderr and exits with status 70.
 */
fn main(harness: Harness) {
  let report_json = harness.env.get_or("HARN_EVAL_PROMPT_REPORT_JSON", "")
  if report_json == "" {
    harness.stdio.eprintln("internal error: HARN_EVAL_PROMPT_REPORT_JSON not set by dispatch shim")
    exit(70)
  }

  let report = try {
    json_parse(report_json)
  } catch (parse_err) {
    harness.stdio.eprintln("internal error: failed to parse PromptReport: " + to_string(parse_err))
    exit(70)
  }

  let output_kind = harness.env.get_or("HARN_EVAL_PROMPT_OUTPUT", "terminal")
  var payload = ""
  if output_kind == "html" {
    payload = __render_html(report)
  } else if output_kind == "json" {
    payload = __render_json(report)
  } else {
    payload = __render_terminal(report)
  }

  // Every renderer ends its payload with a newline, so print without
  // adding one. Per the file header, the Rust shim captures stdout and
  // handles `--out-file` itself.
  __io_print(payload)
}