harn_cli/commands/
eval_context.rs1use std::fs;
28use std::io::Write as _;
29use std::path::{Path, PathBuf};
30
31use harn_vm::orchestration::{
32 context_eval_default_output_dir, evaluate_context_eval_manifest, load_context_eval_manifest,
33 ContextEvalReport, ContextEvalRunReport,
34};
35
36use crate::cli::EvalContextArgs;
37use crate::dispatch;
38use crate::env_guard::ScopedEnvVar;
39
/// Name of the environment variable that carries the serialized
/// `ContextEvalReport` JSON to the embedded `eval/context` script.
const CONTEXT_REPORT_ENV: &str = "HARN_EVAL_CONTEXT_REPORT_JSON";

/// Name of the environment variable that tells the embedded `eval/context`
/// script which rendering to produce (this file passes `"markdown"`,
/// `"summary"`, or `"json"`).
const CONTEXT_OUTPUT_MODE_ENV: &str = "HARN_EVAL_CONTEXT_OUTPUT_MODE";
50
51static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());
63
64pub async fn run(args: EvalContextArgs) -> i32 {
65 let report = match aggregate_report(&args) {
66 Ok(report) => report,
67 Err(code) => return code,
68 };
69
70 let output_dir = args.output.unwrap_or_else(context_eval_default_output_dir);
71 if let Err(error) = fs::create_dir_all(&output_dir) {
72 eprintln!("error: failed to create {}: {error}", output_dir.display());
73 return 1;
74 }
75
76 if let Err(error) = write_json_artifacts(&output_dir, &report) {
81 eprintln!("error: failed to write context eval outputs: {error}");
82 return 1;
83 }
84
85 let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
89
90 if use_legacy {
91 if let Err(error) = write_markdown_legacy(&output_dir, &report) {
92 eprintln!("error: failed to write context eval markdown: {error}");
93 return 1;
94 }
95 announce_output_paths(&output_dir);
96 if args.json {
97 if let Err(code) = print_json_legacy(&report) {
98 return code;
99 }
100 } else {
101 print_summary_legacy(&report);
102 }
103 return post_render_exit_code(&report);
104 }
105
106 match write_markdown_dispatch(&output_dir, &report).await {
107 Ok(()) => {}
108 Err(code) => return code,
109 }
110 announce_output_paths(&output_dir);
111 if args.json {
112 if let Err(code) = print_json_dispatch(&report).await {
113 return code;
114 }
115 } else if let Err(code) = print_summary_dispatch(&report).await {
116 return code;
117 }
118 post_render_exit_code(&report)
119}
120
121fn aggregate_report(args: &EvalContextArgs) -> Result<ContextEvalReport, i32> {
125 let manifest = match load_context_eval_manifest(&args.manifest) {
126 Ok(manifest) => manifest,
127 Err(error) => {
128 eprintln!("error: {error}");
129 return Err(1);
130 }
131 };
132 let report = match evaluate_context_eval_manifest(&manifest) {
133 Ok(report) => report,
134 Err(error) => {
135 eprintln!("error: {error}");
136 return Err(1);
137 }
138 };
139 Ok(report)
140}
141
142fn post_render_exit_code(report: &ContextEvalReport) -> i32 {
143 i32::from(!report.pass)
144}
145
/// Prints a single line to stderr naming the three artifact paths
/// (`summary.json`, `per_run.jsonl`, `summary.md`) under `output_dir`.
fn announce_output_paths(output_dir: &Path) {
    let summary_json = output_dir.join("summary.json");
    let per_run_jsonl = output_dir.join("per_run.jsonl");
    let summary_md = output_dir.join("summary.md");

    eprintln!(
        "wrote {}, {}, and {}",
        summary_json.display(),
        per_run_jsonl.display(),
        summary_md.display()
    );
}
154
155fn write_json_artifacts(output_dir: &Path, report: &ContextEvalReport) -> Result<(), String> {
156 write_json(output_dir.join("summary.json"), report)?;
157 write_jsonl(output_dir.join("per_run.jsonl"), &report.runs)
158}
159
160fn write_json(path: PathBuf, report: &ContextEvalReport) -> Result<(), String> {
161 let payload = serde_json::to_string_pretty(report).map_err(|error| error.to_string())?;
162 fs::write(path, payload).map_err(|error| error.to_string())
163}
164
165fn write_jsonl(path: PathBuf, runs: &[ContextEvalRunReport]) -> Result<(), String> {
166 let mut file = fs::File::create(path).map_err(|error| error.to_string())?;
167 for run in runs {
168 let line = serde_json::to_string(run).map_err(|error| error.to_string())?;
169 file.write_all(line.as_bytes())
170 .map_err(|error| error.to_string())?;
171 file.write_all(b"\n").map_err(|error| error.to_string())?;
172 }
173 Ok(())
174}
175
176fn write_markdown_legacy(output_dir: &Path, report: &ContextEvalReport) -> Result<(), String> {
179 fs::write(
180 output_dir.join("summary.md"),
181 legacy_render_markdown(report),
182 )
183 .map_err(|error| error.to_string())
184}
185
186fn print_json_legacy(report: &ContextEvalReport) -> Result<(), i32> {
187 match serde_json::to_string_pretty(report) {
188 Ok(payload) => {
189 println!("{payload}");
190 Ok(())
191 }
192 Err(error) => {
193 eprintln!("error: failed to serialize context eval summary: {error}");
194 Err(1)
195 }
196 }
197}
198
199fn print_summary_legacy(report: &ContextEvalReport) {
200 println!(
201 "context eval: {}/{} passed, mean_correctness={:.2}, mean_tool_quality={:.2}",
202 report.passed_runs,
203 report.total_runs,
204 report.aggregate.mean_final_correctness,
205 report.aggregate.mean_tool_call_quality
206 );
207}
208
209fn legacy_render_markdown(report: &ContextEvalReport) -> String {
210 let mut out = String::new();
211 out.push_str(&format!(
212 "# Context Eval: {}\n\n",
213 report
214 .manifest_name
215 .as_deref()
216 .unwrap_or(report.manifest_id.as_str())
217 ));
218 out.push_str(&format!(
219 "- status: {}\n- runs: {}/{} passed\n- mean correctness: {:.4}\n- mean tool quality: {:.4}\n- input tokens: {}\n- output tokens: {}\n- cost USD: {:.6}\n\n",
220 if report.pass { "PASS" } else { "FAIL" },
221 report.passed_runs,
222 report.total_runs,
223 report.aggregate.mean_final_correctness,
224 report.aggregate.mean_tool_call_quality,
225 report.aggregate.total_input_tokens,
226 report.aggregate.total_output_tokens,
227 report.aggregate.total_cost_usd,
228 ));
229 out.push_str("| task | mode | pass | correctness | tools | reads before edit | input tokens | compactions | cache key |\n");
230 out.push_str("|---|---|---:|---:|---:|---:|---:|---:|---|\n");
231 for run in &report.runs {
232 out.push_str(&format!(
233 "| {} | {} | {} | {:.4} | {:.4} | {} | {} | {} | `{}` |\n",
234 escape_md(&run.task_id),
235 escape_md(&run.mode_id),
236 if run.passed { "yes" } else { "no" },
237 run.final_correctness.score,
238 run.tool_call_quality.score,
239 run.reads_before_first_edit,
240 run.input_tokens,
241 run.compaction_count,
242 run.cache.key,
243 ));
244 }
245 out
246}
247
/// Escapes `value` for use inside a Markdown table cell by prefixing every
/// `|` with a backslash. All other characters are copied through unchanged.
fn escape_md(value: &str) -> String {
    let mut escaped = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch == '|' {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
251
252async fn write_markdown_dispatch(output_dir: &Path, report: &ContextEvalReport) -> Result<(), i32> {
255 let payload = render_via_dispatch(report, "markdown").await?;
256 if let Err(error) = fs::write(output_dir.join("summary.md"), payload) {
257 eprintln!("error: failed to write context eval markdown: {error}");
258 return Err(1);
259 }
260 Ok(())
261}
262
263async fn print_summary_dispatch(report: &ContextEvalReport) -> Result<(), i32> {
264 let payload = render_via_dispatch(report, "summary").await?;
265 print!("{payload}");
266 if !payload.ends_with('\n') {
269 println!();
270 }
271 Ok(())
272}
273
274async fn print_json_dispatch(report: &ContextEvalReport) -> Result<(), i32> {
275 let payload = render_via_dispatch(report, "json").await?;
276 print!("{payload}");
277 if !payload.ends_with('\n') {
278 println!();
279 }
280 Ok(())
281}
282
283async fn render_via_dispatch(report: &ContextEvalReport, mode: &str) -> Result<String, i32> {
292 let report_json = match serde_json::to_string(report) {
293 Ok(json) => json,
294 Err(error) => {
295 eprintln!("error: failed to serialise ContextEvalReport for dispatch: {error}");
296 return Err(1);
297 }
298 };
299
300 let _guard = DISPATCH_RENDER_LOCK.lock().await;
301 let _report = ScopedEnvVar::set(CONTEXT_REPORT_ENV, &report_json);
302 let _mode = ScopedEnvVar::set(CONTEXT_OUTPUT_MODE_ENV, mode);
303
304 let outcome = dispatch::run_embedded_script("eval/context", Vec::new(), false).await;
305 if !outcome.stderr.is_empty() {
306 use std::io::Write as _;
307 let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
308 }
309 if outcome.exit_code != 0 {
310 return Err(outcome.exit_code);
311 }
312 Ok(outcome.stdout)
313}