harn_cli/commands/
eval_context.rs1use std::fs;
28use std::io::Write as _;
29use std::path::{Path, PathBuf};
30
31use harn_vm::orchestration::{
32 context_eval_default_output_dir, evaluate_context_eval_manifest, load_context_eval_manifest,
33 ContextEvalReport, ContextEvalRunReport,
34};
35
36use crate::cli::EvalContextArgs;
37use crate::dispatch;
38use crate::env_guard::ScopedEnvVar;
39
/// Name of the environment variable that `render_via_dispatch` sets to the
/// serialized `ContextEvalReport` JSON before running the embedded script.
/// NOTE(review): presumably the `eval/context` script reads this — confirm.
const CONTEXT_REPORT_ENV: &str = "HARN_EVAL_CONTEXT_REPORT_JSON";

/// Name of the environment variable that `render_via_dispatch` sets to the
/// requested render mode; callers in this file pass `"markdown"`,
/// `"summary"`, or `"json"`.
const CONTEXT_OUTPUT_MODE_ENV: &str = "HARN_EVAL_CONTEXT_OUTPUT_MODE";

/// Async mutex held by `render_via_dispatch` while the two environment
/// variables above are set and the embedded script runs, so that renders in
/// this process do not overlap.
static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());
63
64pub async fn run(args: EvalContextArgs) -> i32 {
65 let report = match aggregate_report(&args) {
66 Ok(report) => report,
67 Err(code) => return code,
68 };
69
70 let output_dir = args.output.unwrap_or_else(context_eval_default_output_dir);
71 if let Err(error) = fs::create_dir_all(&output_dir) {
72 eprintln!("error: failed to create {}: {error}", output_dir.display());
73 return 1;
74 }
75
76 if let Err(error) = write_json_artifacts(&output_dir, &report) {
81 eprintln!("error: failed to write context eval outputs: {error}");
82 return 1;
83 }
84
85 let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
89
90 if use_legacy {
91 if let Err(error) = write_markdown_legacy(&output_dir, &report) {
92 eprintln!("error: failed to write context eval markdown: {error}");
93 return 1;
94 }
95 announce_output_paths(&output_dir);
96 if args.json {
97 if let Err(code) = print_json_legacy(&report) {
98 return code;
99 }
100 } else {
101 print_summary_legacy(&report);
102 }
103 return post_render_exit_code(&report);
104 }
105
106 match write_markdown_dispatch(&output_dir, &report).await {
107 Ok(()) => {}
108 Err(code) => return code,
109 }
110 announce_output_paths(&output_dir);
111 if args.json {
112 if let Err(code) = print_json_dispatch(&report).await {
113 return code;
114 }
115 } else if let Err(code) = print_summary_dispatch(&report).await {
116 return code;
117 }
118 post_render_exit_code(&report)
119}
120
121fn aggregate_report(args: &EvalContextArgs) -> Result<ContextEvalReport, i32> {
125 let manifest = match load_context_eval_manifest(&args.manifest) {
126 Ok(manifest) => manifest,
127 Err(error) => {
128 eprintln!("error: {error}");
129 return Err(1);
130 }
131 };
132 let report = match evaluate_context_eval_manifest(&manifest) {
133 Ok(report) => report,
134 Err(error) => {
135 eprintln!("error: {error}");
136 return Err(1);
137 }
138 };
139 Ok(report)
140}
141
142fn post_render_exit_code(report: &ContextEvalReport) -> i32 {
143 if report.pass {
144 0
145 } else {
146 1
147 }
148}
149
/// Prints to stderr the paths of the three artifacts (`summary.json`,
/// `per_run.jsonl`, `summary.md`) under `output_dir`.
fn announce_output_paths(output_dir: &Path) {
    let [summary_json, per_run, summary_md] =
        ["summary.json", "per_run.jsonl", "summary.md"].map(|name| output_dir.join(name));
    eprintln!(
        "wrote {}, {}, and {}",
        summary_json.display(),
        per_run.display(),
        summary_md.display()
    );
}
158
159fn write_json_artifacts(output_dir: &Path, report: &ContextEvalReport) -> Result<(), String> {
160 write_json(output_dir.join("summary.json"), report)?;
161 write_jsonl(output_dir.join("per_run.jsonl"), &report.runs)
162}
163
164fn write_json(path: PathBuf, report: &ContextEvalReport) -> Result<(), String> {
165 let payload = serde_json::to_string_pretty(report).map_err(|error| error.to_string())?;
166 fs::write(path, payload).map_err(|error| error.to_string())
167}
168
169fn write_jsonl(path: PathBuf, runs: &[ContextEvalRunReport]) -> Result<(), String> {
170 let mut file = fs::File::create(path).map_err(|error| error.to_string())?;
171 for run in runs {
172 let line = serde_json::to_string(run).map_err(|error| error.to_string())?;
173 file.write_all(line.as_bytes())
174 .map_err(|error| error.to_string())?;
175 file.write_all(b"\n").map_err(|error| error.to_string())?;
176 }
177 Ok(())
178}
179
180fn write_markdown_legacy(output_dir: &Path, report: &ContextEvalReport) -> Result<(), String> {
183 fs::write(
184 output_dir.join("summary.md"),
185 legacy_render_markdown(report),
186 )
187 .map_err(|error| error.to_string())
188}
189
190fn print_json_legacy(report: &ContextEvalReport) -> Result<(), i32> {
191 match serde_json::to_string_pretty(report) {
192 Ok(payload) => {
193 println!("{payload}");
194 Ok(())
195 }
196 Err(error) => {
197 eprintln!("error: failed to serialize context eval summary: {error}");
198 Err(1)
199 }
200 }
201}
202
203fn print_summary_legacy(report: &ContextEvalReport) {
204 println!(
205 "context eval: {}/{} passed, mean_correctness={:.2}, mean_tool_quality={:.2}",
206 report.passed_runs,
207 report.total_runs,
208 report.aggregate.mean_final_correctness,
209 report.aggregate.mean_tool_call_quality
210 );
211}
212
213fn legacy_render_markdown(report: &ContextEvalReport) -> String {
214 let mut out = String::new();
215 out.push_str(&format!(
216 "# Context Eval: {}\n\n",
217 report
218 .manifest_name
219 .as_deref()
220 .unwrap_or(report.manifest_id.as_str())
221 ));
222 out.push_str(&format!(
223 "- status: {}\n- runs: {}/{} passed\n- mean correctness: {:.4}\n- mean tool quality: {:.4}\n- input tokens: {}\n- output tokens: {}\n- cost USD: {:.6}\n\n",
224 if report.pass { "PASS" } else { "FAIL" },
225 report.passed_runs,
226 report.total_runs,
227 report.aggregate.mean_final_correctness,
228 report.aggregate.mean_tool_call_quality,
229 report.aggregate.total_input_tokens,
230 report.aggregate.total_output_tokens,
231 report.aggregate.total_cost_usd,
232 ));
233 out.push_str("| task | mode | pass | correctness | tools | reads before edit | input tokens | compactions | cache key |\n");
234 out.push_str("|---|---|---:|---:|---:|---:|---:|---:|---|\n");
235 for run in &report.runs {
236 out.push_str(&format!(
237 "| {} | {} | {} | {:.4} | {:.4} | {} | {} | {} | `{}` |\n",
238 escape_md(&run.task_id),
239 escape_md(&run.mode_id),
240 if run.passed { "yes" } else { "no" },
241 run.final_correctness.score,
242 run.tool_call_quality.score,
243 run.reads_before_first_edit,
244 run.input_tokens,
245 run.compaction_count,
246 run.cache.key,
247 ));
248 }
249 out
250}
251
/// Escapes `value` for use in a markdown table cell by prefixing every `|`
/// with a backslash. All other characters are copied through unchanged.
fn escape_md(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch == '|' {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
255
256async fn write_markdown_dispatch(output_dir: &Path, report: &ContextEvalReport) -> Result<(), i32> {
259 let payload = render_via_dispatch(report, "markdown").await?;
260 if let Err(error) = fs::write(output_dir.join("summary.md"), payload) {
261 eprintln!("error: failed to write context eval markdown: {error}");
262 return Err(1);
263 }
264 Ok(())
265}
266
267async fn print_summary_dispatch(report: &ContextEvalReport) -> Result<(), i32> {
268 let payload = render_via_dispatch(report, "summary").await?;
269 print!("{payload}");
270 if !payload.ends_with('\n') {
273 println!();
274 }
275 Ok(())
276}
277
278async fn print_json_dispatch(report: &ContextEvalReport) -> Result<(), i32> {
279 let payload = render_via_dispatch(report, "json").await?;
280 print!("{payload}");
281 if !payload.ends_with('\n') {
282 println!();
283 }
284 Ok(())
285}
286
287async fn render_via_dispatch(report: &ContextEvalReport, mode: &str) -> Result<String, i32> {
296 let report_json = match serde_json::to_string(report) {
297 Ok(json) => json,
298 Err(error) => {
299 eprintln!("error: failed to serialise ContextEvalReport for dispatch: {error}");
300 return Err(1);
301 }
302 };
303
304 let _guard = DISPATCH_RENDER_LOCK.lock().await;
305 let _report = ScopedEnvVar::set(CONTEXT_REPORT_ENV, &report_json);
306 let _mode = ScopedEnvVar::set(CONTEXT_OUTPUT_MODE_ENV, mode);
307
308 let outcome = dispatch::run_embedded_script("eval/context", Vec::new(), false).await;
309 if !outcome.stderr.is_empty() {
310 use std::io::Write as _;
311 let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
312 }
313 if outcome.exit_code != 0 {
314 return Err(outcome.exit_code);
315 }
316 Ok(outcome.stdout)
317}