1use std::collections::{BTreeMap, BTreeSet, HashSet};
37use std::ffi::OsString;
38use std::fs;
39use std::io::Write as _;
40use std::path::{Path, PathBuf};
41
42use harn_vm::clock::{Clock, RealClock};
43use serde::Serialize;
44use serde_json::Value as JsonValue;
45
46use crate::cli::EvalCodingAgentArgs;
47use crate::commands::eval_coding_agent_preset::{
48 resolve_step_judge_json, resolve_structural_validator_json,
49};
50use crate::commands::eval_model_selector::{
51 resolve_selector, selector_is_local, selector_label, ModelSelector,
52};
53use crate::commands::local::runtime::{
54 local_provider_ids, ollama_unload_model, snapshot_provider, LocalProviderSnapshot,
55};
56use crate::commands::local_readiness;
57use crate::commands::run::{
58 execute_run_with_sandbox_options, CliLlmMockMode, RunProfileOptions, RunSandboxOptions,
59};
60use crate::commands::tool_mode_parity::{
61 self, ToolModeParityFixtureInput, ToolModeParityPairSummary, TOOL_MODE_PARITY_DIRECTORY,
62 TOOL_MODE_PARITY_FIXTURE_SUITE, TOOL_MODE_PARITY_OVERLAY_FILENAME,
63};
64use crate::dispatch;
65use crate::env_guard::ScopedEnvVar;
66
/// Environment variable name `HARN_EVAL_CODING_AGENT_SUMMARY_JSON`.
// NOTE(review): not referenced in the visible part of this file; presumably
// carries the serialized eval summary to a renderer — confirm at its use site.
const CODING_AGENT_SUMMARY_ENV: &str = "HARN_EVAL_CODING_AGENT_SUMMARY_JSON";

/// Environment variable name `HARN_EVAL_CODING_AGENT_MODE`.
// NOTE(review): not referenced in the visible part of this file — confirm how
// the mode value is produced and consumed.
const CODING_AGENT_MODE_ENV: &str = "HARN_EVAL_CODING_AGENT_MODE";

/// Process-wide async mutex with no payload.
// NOTE(review): judging by the name it serializes dispatch-based rendering;
// the locking sites are outside the visible part of the file — confirm.
static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());

/// Benchmark harness script, embedded at compile time. `run_matrix_entry`
/// writes it into every run directory as `coding_agent_suite.harn`.
const CODING_AGENT_SUITE_HARN: &str = include_str!("../../assets/evals/coding_agent_suite.harn");
/// Prefix of the stderr line that `tool_format_override_warning_line` looks for.
const TOOL_FORMAT_OVERRIDE_WARNING_PREFIX: &str = "warning: tool_format override:";
94
/// Compile-time description of one built-in coding-agent benchmark fixture.
#[derive(Clone, Copy, Debug)]
struct FixtureDefinition {
    /// Identifier matched against `--fixture` values and forwarded to the
    /// harness script.
    id: &'static str,
    /// Human-readable fixture title.
    name: &'static str,
    /// Tool-usage category label (the built-in table uses `multi-tool`,
    /// `one-tool` and `no-tool`).
    tool_sequence: &'static str,
    /// One-sentence summary of what the fixture exercises.
    description: &'static str,
}
102
/// Table of every built-in fixture, in the order returned when `--fixture all`
/// is requested (see `resolve_fixtures`).
static FIXTURE_DEFINITIONS: &[FixtureDefinition] = &[
    FixtureDefinition {
        id: "python-add",
        name: "Python add repair",
        tool_sequence: "multi-tool",
        description: "One-file Python bug fix verified by unittest output.",
    },
    FixtureDefinition {
        id: "cli-help-flag",
        name: "CLI help flag",
        tool_sequence: "multi-tool",
        description: "Add a tiny CLI flag, update help-facing docs, and verify behavior.",
    },
    FixtureDefinition {
        id: "test-output-first",
        name: "Test-output-first repair",
        tool_sequence: "multi-tool",
        description: "Run a failing test first, then edit the implementation and re-run it.",
    },
    FixtureDefinition {
        id: "docs-symbol-rename",
        name: "Docs symbol rename",
        tool_sequence: "multi-tool",
        description:
            "Update docs and an example after a symbol rename without touching implementation.",
    },
    FixtureDefinition {
        id: "read-only-audit",
        name: "Read-only audit",
        tool_sequence: "one-tool",
        description: "Inspect a file and report that no edits are needed.",
    },
    FixtureDefinition {
        id: "no-tool-diagnosis",
        name: "No-tool diagnosis",
        tool_sequence: "no-tool",
        description: "Answer from prompt-only context without any tools.",
    },
];
142
143#[derive(Debug, Clone, Serialize)]
144struct LoadedEnvKey {
145 key: String,
146 source: String,
147}
148
/// Guard that puts saved environment values back when it goes out of scope.
#[derive(Debug)]
struct EnvOverlay {
    /// `(variable, saved value)` pairs; a saved value of `None` means the
    /// variable is removed on restore.
    previous: Vec<(OsString, Option<OsString>)>,
}

impl Drop for EnvOverlay {
    /// Replays the saved entries in reverse order, so that when a variable
    /// appears more than once the earliest saved entry is applied last.
    fn drop(&mut self) {
        for (name, saved) in self.previous.iter().rev() {
            match saved {
                Some(original) => std::env::set_var(name, original),
                None => std::env::remove_var(name),
            }
        }
    }
}
165
166#[derive(Debug, Clone, Serialize)]
167struct RunReport {
168 run_id: String,
169 fixture_id: String,
170 fixture_name: String,
171 fixture_tool_sequence: String,
172 selector: ModelSelector,
173 tool_format: String,
174 status: String,
175 passed: bool,
176 skipped: bool,
177 #[serde(skip_serializing_if = "Option::is_none")]
178 skipped_reason: Option<String>,
179 output_dir: String,
180 transcript_events_path: String,
181 workspace_root: Option<String>,
182 elapsed_ms: u64,
183 duration_ms: u64,
184 iterations: i64,
185 input_tokens: i64,
186 output_tokens: i64,
187 cost_usd: f64,
188 pricing_known: bool,
189 tool_calls: usize,
190 rejected_tool_calls: usize,
191 tool_sequence: Vec<String>,
192 successful_tools: Vec<String>,
193 transcript_event_count: usize,
194 verification_success: bool,
195 harn_exit_code: i32,
196 #[serde(skip_serializing_if = "Option::is_none")]
197 error: Option<String>,
198 #[serde(skip_serializing_if = "Option::is_none")]
199 stderr_excerpt: Option<String>,
200 local_cleanup: Option<LocalCleanupReport>,
201}
202
203#[derive(Debug, Clone, Serialize)]
204struct LocalCleanupReport {
205 provider: String,
206 model: String,
207 initially_loaded: bool,
208 action: String,
209 #[serde(skip_serializing_if = "Option::is_none")]
210 detail: Option<String>,
211}
212
213#[derive(Debug, Clone, Serialize)]
214struct FormatComparison {
215 fixture_id: String,
216 selector: ModelSelector,
217 native_run_id: Option<String>,
218 text_run_id: Option<String>,
219 native_evidence_path: Option<String>,
220 text_evidence_path: Option<String>,
221 native_status: Option<String>,
222 text_status: Option<String>,
223 native_passed: Option<bool>,
224 text_passed: Option<bool>,
225 native_tool_call_count: Option<usize>,
226 text_tool_call_count: Option<usize>,
227 native_rejected_tool_call_count: Option<usize>,
228 text_rejected_tool_call_count: Option<usize>,
229 verifier_match: Option<bool>,
230 tool_sequence_match: Option<bool>,
231 rejected_tool_call_delta_text_minus_native: Option<i64>,
232 token_delta_text_minus_native: Option<i64>,
233 iteration_delta_text_minus_native: Option<i64>,
234 equivalent: Option<bool>,
235 divergence_reasons: Vec<String>,
236 evidence_paths: Vec<String>,
237}
238
239#[derive(Debug, Clone, Serialize)]
240struct FollowupSuggestion {
241 title: String,
242 body: String,
243 labels: Vec<String>,
244 run_ids: Vec<String>,
245}
246
247#[derive(Debug, Clone, Serialize)]
248struct FixtureReport {
249 id: String,
250 name: String,
251 tool_sequence: String,
252 description: String,
253}
254
255#[derive(Debug, Clone, Serialize)]
256struct RollupReport {
257 key: String,
258 total_runs: usize,
259 passed_runs: usize,
260 failed_runs: usize,
261 skipped_runs: usize,
262 total_cost_usd: f64,
263}
264
265#[derive(Debug, Clone, Serialize)]
266struct EvalRollups {
267 by_fixture: Vec<RollupReport>,
268 by_provider: Vec<RollupReport>,
269 by_model: Vec<RollupReport>,
270 by_tool_format: Vec<RollupReport>,
271 by_tool_sequence: Vec<RollupReport>,
272}
273
274#[derive(Debug, Clone, Serialize)]
275struct EvalSummary {
276 schema_version: u32,
277 fixture_ids: Vec<String>,
278 fixtures: Vec<FixtureReport>,
279 output_dir: String,
280 models: Vec<ModelSelector>,
281 tool_formats: Vec<String>,
282 env_keys_loaded: Vec<LoadedEnvKey>,
283 total_runs: usize,
284 passed_runs: usize,
285 failed_runs: usize,
286 skipped_runs: usize,
287 diverged_comparisons: usize,
288 total_cost_usd: f64,
289 rollups: EvalRollups,
290 runs: Vec<RunReport>,
291 comparisons: Vec<FormatComparison>,
292 parity_by_pair: Vec<ToolModeParityPairSummary>,
293 followups: Vec<FollowupSuggestion>,
294 #[serde(skip_serializing_if = "Option::is_none")]
298 step_judge_preset: Option<String>,
299 #[serde(skip_serializing_if = "String::is_empty")]
302 run_label: String,
303 #[serde(skip_serializing_if = "Option::is_none")]
309 baseline_comparison: Option<BaselineComparison>,
310}
311
312#[derive(Debug, Clone, Serialize, Default)]
313struct BaselineComparison {
314 baseline_label: String,
316 baseline_path: String,
318 regressions: Vec<FixtureStatusDelta>,
319 recoveries: Vec<FixtureStatusDelta>,
320 unchanged_passes: Vec<String>,
322 unchanged_failures: Vec<String>,
324 missing_in_baseline: Vec<String>,
327 missing_in_cell: Vec<String>,
328 regressions_count: usize,
329 recoveries_count: usize,
330 net_lift_pp: f64,
334}
335
336#[derive(Debug, Clone, Serialize)]
337struct FixtureStatusDelta {
338 fixture_id: String,
339 baseline_status: String,
340 cell_status: String,
341}
342
343struct LocalRunGuard {
344 selector: ModelSelector,
345 stop_after: bool,
346 snapshot: Option<LocalProviderSnapshot>,
347}
348
349struct RunSummaryContext {
350 run_id: String,
351 fixture: FixtureDefinition,
352 selector: ModelSelector,
353 tool_format: String,
354 run_dir: PathBuf,
355 elapsed_ms: u64,
356 exit_code: i32,
357 stderr: String,
358 local_cleanup: Option<LocalCleanupReport>,
359}
360
361pub async fn run(args: EvalCodingAgentArgs) -> i32 {
362 let output_dir = args.output.clone().unwrap_or_else(default_output_dir);
363 if let Err(error) = fs::create_dir_all(&output_dir) {
364 eprintln!("error: failed to create {}: {error}", output_dir.display());
365 return 1;
366 }
367
368 let (_env_guard, env_keys_loaded) = match load_env_files(&args.env_files) {
369 Ok(loaded) => loaded,
370 Err(error) => {
371 eprintln!("error: {error}");
372 return 1;
373 }
374 };
375
376 let fixtures = match resolve_fixtures(&args.fixtures) {
377 Ok(fixtures) => fixtures,
378 Err(error) => {
379 eprintln!("error: {error}");
380 return 2;
381 }
382 };
383 let models = match resolve_models(&args).await {
384 Ok(models) => models,
385 Err(error) => {
386 eprintln!("error: {error}");
387 return 1;
388 }
389 };
390 let tool_formats = match normalize_tool_formats(&args.tool_formats) {
391 Ok(formats) => formats,
392 Err(error) => {
393 eprintln!("error: {error}");
394 return 2;
395 }
396 };
397 let matrix = build_matrix(&fixtures, &models, &tool_formats, args.max_runs);
398 if matrix.is_empty() {
399 eprintln!("error: no coding-agent benchmark runs selected");
400 return 2;
401 }
402
403 let mut reports = Vec::new();
404 let mut had_error = false;
405 for (fixture, selector, tool_format) in matrix {
406 let report = run_matrix_entry(&args, &output_dir, fixture, selector, tool_format).await;
407 if !report.passed && !report.skipped {
408 had_error = true;
409 }
410 if report.skipped && args.fail_on_unauthorized {
411 had_error = true;
412 }
413 eprintln!(
414 "{} {} {}: {}",
415 report.fixture_id,
416 selector_label(&report.selector),
417 report.tool_format,
418 report.status
419 );
420 reports.push(report);
421 }
422
423 let baseline_comparison = match &args.baseline_comparison_against {
424 Some(path) => match load_baseline_comparison(path, &reports) {
425 Ok(comparison) => Some(comparison),
426 Err(error) => {
427 eprintln!("error: --baseline-comparison-against: {error}");
428 return 1;
429 }
430 },
431 None => None,
432 };
433 let summary = build_summary(
434 &output_dir,
435 fixtures,
436 models,
437 tool_formats,
438 env_keys_loaded,
439 reports,
440 args.step_judge
441 .clone()
442 .filter(|s| !s.is_empty() && s != "none"),
443 args.run_label.clone(),
444 baseline_comparison,
445 );
446 if let Err(error) = write_json_artifacts(&output_dir, &summary) {
452 eprintln!("error: failed to write benchmark outputs: {error}");
453 return 1;
454 }
455
456 let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
460
461 if use_legacy {
462 if let Err(error) = write_markdown_artifacts_legacy(&output_dir, &summary) {
463 eprintln!("error: {error}");
464 return 1;
465 }
466 announce_output_paths(&output_dir);
467 if args.json {
468 print_json_legacy(&summary);
469 } else {
470 print_summary_legacy(&summary);
471 }
472 return if had_error { 1 } else { 0 };
473 }
474
475 if let Err(code) = write_markdown_artifacts_dispatch(&output_dir, &summary).await {
476 return code;
477 }
478 announce_output_paths(&output_dir);
479 if args.json {
480 if let Err(code) = print_json_dispatch(&summary).await {
481 return code;
482 }
483 } else if let Err(code) = print_summary_dispatch(&summary).await {
484 return code;
485 }
486
487 if had_error {
488 1
489 } else {
490 0
491 }
492}
493
494async fn run_matrix_entry(
495 args: &EvalCodingAgentArgs,
496 output_dir: &Path,
497 fixture: FixtureDefinition,
498 selector: ModelSelector,
499 tool_format: String,
500) -> RunReport {
501 let run_id = run_id_for(fixture, &selector, &tool_format);
502 let run_dir = output_dir.join(&run_id);
503 if let Err(error) = reset_dir(&run_dir) {
504 return error_report(
505 run_id,
506 fixture,
507 selector,
508 tool_format,
509 run_dir,
510 format!("failed to prepare run directory: {error}"),
511 );
512 }
513
514 if !provider_available(&selector) {
515 let reason = format!(
516 "provider `{}` has no configured credentials",
517 selector.provider
518 );
519 return skipped_report(run_id, fixture, selector, tool_format, run_dir, reason);
520 }
521
522 let script_path = run_dir.join("coding_agent_suite.harn");
523 if let Err(error) = fs::write(&script_path, CODING_AGENT_SUITE_HARN) {
524 return error_report(
525 run_id,
526 fixture,
527 selector,
528 tool_format,
529 run_dir,
530 format!("failed to write benchmark harness: {error}"),
531 );
532 }
533
534 let local_guard = LocalRunGuard::before(&selector, !args.keep_local_after_run).await;
535 let argv = script_argv(args, fixture, &selector, &tool_format, &run_dir);
536 let clock = RealClock::new();
537 let started_ms = clock.monotonic_ms();
538 let outcome = execute_run_with_sandbox_options(
539 &script_path.to_string_lossy(),
540 false,
541 HashSet::new(),
542 argv,
543 Vec::new(),
544 CliLlmMockMode::Off,
545 None,
546 RunProfileOptions::default(),
547 RunSandboxOptions::default().with_workspace_root(run_dir.clone()),
548 )
549 .await;
550 if let Some(line) = tool_format_override_warning_line(&outcome.stderr) {
551 eprintln!("{line}");
552 }
553 let elapsed_ms = clock
554 .monotonic_ms()
555 .saturating_sub(started_ms)
556 .try_into()
557 .unwrap_or(0);
558 let local_cleanup = if let Some(guard) = local_guard {
559 guard.cleanup().await
560 } else {
561 None
562 };
563
564 let summary_value =
565 read_run_summary(&run_dir).or_else(|| parse_last_json_line(&outcome.stdout));
566 let Some(summary) = summary_value else {
567 return RunReport {
568 run_id,
569 fixture_id: fixture.id.to_string(),
570 fixture_name: fixture.name.to_string(),
571 fixture_tool_sequence: fixture.tool_sequence.to_string(),
572 selector,
573 tool_format,
574 status: "infra_error".to_string(),
575 passed: false,
576 skipped: false,
577 skipped_reason: None,
578 output_dir: run_dir.display().to_string(),
579 transcript_events_path: run_dir
580 .join("transcript_events.jsonl")
581 .display()
582 .to_string(),
583 workspace_root: None,
584 elapsed_ms,
585 duration_ms: 0,
586 iterations: 0,
587 input_tokens: 0,
588 output_tokens: 0,
589 cost_usd: 0.0,
590 pricing_known: false,
591 tool_calls: 0,
592 rejected_tool_calls: 0,
593 tool_sequence: Vec::new(),
594 successful_tools: Vec::new(),
595 transcript_event_count: 0,
596 verification_success: false,
597 harn_exit_code: outcome.exit_code,
598 error: Some("benchmark harness produced no summary JSON".to_string()),
599 stderr_excerpt: excerpt(&outcome.stderr),
600 local_cleanup,
601 };
602 };
603
604 report_from_summary(
605 RunSummaryContext {
606 run_id,
607 fixture,
608 selector,
609 tool_format,
610 run_dir,
611 elapsed_ms,
612 exit_code: outcome.exit_code,
613 stderr: outcome.stderr,
614 local_cleanup,
615 },
616 summary,
617 )
618}
619
620fn report_from_summary(ctx: RunSummaryContext, summary: JsonValue) -> RunReport {
621 let passed = summary
622 .get("passed")
623 .and_then(JsonValue::as_bool)
624 .unwrap_or(false)
625 && ctx.exit_code == 0;
626 let input_tokens = summary
627 .pointer("/llm/input_tokens")
628 .and_then(JsonValue::as_i64)
629 .unwrap_or(0);
630 let output_tokens = summary
631 .pointer("/llm/output_tokens")
632 .and_then(JsonValue::as_i64)
633 .unwrap_or(0);
634 let pricing = harn_vm::llm::llm_pricing_per_1k(&ctx.selector.provider, &ctx.selector.model);
635 let cost_usd = pricing
636 .map(|(input, output)| {
637 (input_tokens.max(0) as f64 * input + output_tokens.max(0) as f64 * output) / 1000.0
638 })
639 .unwrap_or(0.0);
640 let status = if passed {
641 "passed".to_string()
642 } else if ctx.exit_code == 0 {
643 "failed".to_string()
644 } else {
645 summary
646 .get("status")
647 .and_then(JsonValue::as_str)
648 .unwrap_or("failed")
649 .to_string()
650 };
651 RunReport {
652 run_id: ctx.run_id,
653 fixture_id: ctx.fixture.id.to_string(),
654 fixture_name: ctx.fixture.name.to_string(),
655 fixture_tool_sequence: ctx.fixture.tool_sequence.to_string(),
656 selector: ctx.selector,
657 tool_format: ctx.tool_format,
658 status,
659 passed,
660 skipped: false,
661 skipped_reason: None,
662 output_dir: ctx.run_dir.display().to_string(),
663 transcript_events_path: ctx
664 .run_dir
665 .join("transcript_events.jsonl")
666 .display()
667 .to_string(),
668 workspace_root: summary
669 .get("workspace_root")
670 .and_then(JsonValue::as_str)
671 .map(str::to_string),
672 elapsed_ms: ctx.elapsed_ms,
673 duration_ms: summary
674 .get("duration_ms")
675 .and_then(JsonValue::as_u64)
676 .unwrap_or(ctx.elapsed_ms),
677 iterations: summary
678 .pointer("/llm/iterations")
679 .and_then(JsonValue::as_i64)
680 .unwrap_or(0),
681 input_tokens,
682 output_tokens,
683 cost_usd,
684 pricing_known: pricing.is_some(),
685 tool_calls: summary
686 .pointer("/tools/calls")
687 .and_then(JsonValue::as_array)
688 .map(Vec::len)
689 .unwrap_or(0),
690 rejected_tool_calls: summary
691 .pointer("/tools/rejected")
692 .and_then(JsonValue::as_array)
693 .map(Vec::len)
694 .unwrap_or(0),
695 tool_sequence: tool_call_sequence(summary.pointer("/tools/calls"))
696 .or_else(|| non_empty_string_array(summary.pointer("/tools/successful")))
697 .unwrap_or_default(),
698 successful_tools: string_array(summary.pointer("/tools/successful")),
699 transcript_event_count: summary
700 .get("transcript_event_count")
701 .and_then(JsonValue::as_u64)
702 .unwrap_or(0) as usize,
703 verification_success: summary
704 .pointer("/verification/success")
705 .and_then(JsonValue::as_bool)
706 .unwrap_or(false),
707 harn_exit_code: ctx.exit_code,
708 error: (!passed).then(|| {
709 summary
710 .get("status")
711 .and_then(JsonValue::as_str)
712 .unwrap_or("benchmark failed")
713 .to_string()
714 }),
715 stderr_excerpt: excerpt(&ctx.stderr),
716 local_cleanup: ctx.local_cleanup,
717 }
718}
719
720impl LocalRunGuard {
721 async fn before(selector: &ModelSelector, stop_after: bool) -> Option<Self> {
722 if !selector_is_local(selector) {
723 return None;
724 }
725 let snapshot = snapshot_provider(&selector.provider, Path::new("."))
726 .await
727 .ok();
728 Some(Self {
729 selector: selector.clone(),
730 stop_after,
731 snapshot,
732 })
733 }
734
735 async fn cleanup(self) -> Option<LocalCleanupReport> {
736 let snapshot = self.snapshot?;
737 if self.selector.provider != "ollama" {
738 return Some(LocalCleanupReport {
739 provider: self.selector.provider,
740 model: self.selector.model,
741 initially_loaded: false,
742 action: "not_applicable".to_string(),
743 detail: Some(
744 "non-Ollama local providers are only stopped when Harn launched a managed server"
745 .to_string(),
746 ),
747 });
748 }
749 let initially_loaded = snapshot
750 .loaded_models
751 .iter()
752 .any(|loaded| loaded.name == self.selector.model);
753 if !self.stop_after {
754 return Some(LocalCleanupReport {
755 provider: self.selector.provider,
756 model: self.selector.model,
757 initially_loaded,
758 action: "left_running".to_string(),
759 detail: Some("--keep-local-after-run".to_string()),
760 });
761 }
762 if initially_loaded {
763 return Some(LocalCleanupReport {
764 provider: self.selector.provider,
765 model: self.selector.model,
766 initially_loaded,
767 action: "left_preexisting".to_string(),
768 detail: None,
769 });
770 }
771 match ollama_unload_model(&snapshot.base_url, &self.selector.model).await {
772 Ok(()) => Some(LocalCleanupReport {
773 provider: self.selector.provider,
774 model: self.selector.model,
775 initially_loaded,
776 action: "unloaded".to_string(),
777 detail: None,
778 }),
779 Err(error) => Some(LocalCleanupReport {
780 provider: self.selector.provider,
781 model: self.selector.model,
782 initially_loaded,
783 action: "unload_failed".to_string(),
784 detail: Some(error),
785 }),
786 }
787 }
788}
789
790fn script_argv(
791 args: &EvalCodingAgentArgs,
792 fixture: FixtureDefinition,
793 selector: &ModelSelector,
794 tool_format: &str,
795 run_dir: &Path,
796) -> Vec<String> {
797 let mut argv = vec![
798 "--fixture".to_string(),
799 fixture.id.to_string(),
800 "--output-dir".to_string(),
801 run_dir.display().to_string(),
802 "--provider".to_string(),
803 selector.provider.clone(),
804 "--model".to_string(),
805 selector.model.clone(),
806 "--tool-format".to_string(),
807 tool_format.to_string(),
808 "--max-iterations".to_string(),
809 args.max_iterations.to_string(),
810 "--python".to_string(),
811 args.python.clone(),
812 ];
813 if selector.provider == "mock" {
814 argv.push("--seed-mock".to_string());
815 }
816 if let Some(json) = resolve_step_judge_json(args, selector) {
817 argv.push("--step-judge-json".to_string());
818 argv.push(json);
819 }
820 if let Some(reason) = args
821 .override_reason
822 .as_deref()
823 .map(str::trim)
824 .filter(|reason| !reason.is_empty())
825 {
826 argv.push("--override-reason".to_string());
827 argv.push(reason.to_string());
828 }
829 if let Some(json) = resolve_structural_validator_json(args) {
830 argv.push("--structural-validator-json".to_string());
831 argv.push(json);
832 }
833 argv
834}
835
836fn tool_format_override_warning_line(stderr: &str) -> Option<&str> {
837 stderr
838 .lines()
839 .map(str::trim)
840 .find(|line| line.starts_with(TOOL_FORMAT_OVERRIDE_WARNING_PREFIX))
841}
842
843fn error_report(
844 run_id: String,
845 fixture: FixtureDefinition,
846 selector: ModelSelector,
847 tool_format: String,
848 run_dir: PathBuf,
849 error: String,
850) -> RunReport {
851 RunReport {
852 run_id,
853 fixture_id: fixture.id.to_string(),
854 fixture_name: fixture.name.to_string(),
855 fixture_tool_sequence: fixture.tool_sequence.to_string(),
856 selector,
857 tool_format,
858 status: "infra_error".to_string(),
859 passed: false,
860 skipped: false,
861 skipped_reason: None,
862 output_dir: run_dir.display().to_string(),
863 transcript_events_path: run_dir
864 .join("transcript_events.jsonl")
865 .display()
866 .to_string(),
867 workspace_root: None,
868 elapsed_ms: 0,
869 duration_ms: 0,
870 iterations: 0,
871 input_tokens: 0,
872 output_tokens: 0,
873 cost_usd: 0.0,
874 pricing_known: false,
875 tool_calls: 0,
876 rejected_tool_calls: 0,
877 tool_sequence: Vec::new(),
878 successful_tools: Vec::new(),
879 transcript_event_count: 0,
880 verification_success: false,
881 harn_exit_code: 1,
882 error: Some(error),
883 stderr_excerpt: None,
884 local_cleanup: None,
885 }
886}
887
888fn skipped_report(
889 run_id: String,
890 fixture: FixtureDefinition,
891 selector: ModelSelector,
892 tool_format: String,
893 run_dir: PathBuf,
894 reason: String,
895) -> RunReport {
896 RunReport {
897 run_id,
898 fixture_id: fixture.id.to_string(),
899 fixture_name: fixture.name.to_string(),
900 fixture_tool_sequence: fixture.tool_sequence.to_string(),
901 selector,
902 tool_format,
903 status: "skipped".to_string(),
904 passed: false,
905 skipped: true,
906 skipped_reason: Some(reason),
907 output_dir: run_dir.display().to_string(),
908 transcript_events_path: run_dir
909 .join("transcript_events.jsonl")
910 .display()
911 .to_string(),
912 workspace_root: None,
913 elapsed_ms: 0,
914 duration_ms: 0,
915 iterations: 0,
916 input_tokens: 0,
917 output_tokens: 0,
918 cost_usd: 0.0,
919 pricing_known: false,
920 tool_calls: 0,
921 rejected_tool_calls: 0,
922 tool_sequence: Vec::new(),
923 successful_tools: Vec::new(),
924 transcript_event_count: 0,
925 verification_success: false,
926 harn_exit_code: 0,
927 error: None,
928 stderr_excerpt: None,
929 local_cleanup: None,
930 }
931}
932
933fn provider_available(selector: &ModelSelector) -> bool {
934 if matches!(selector.provider.as_str(), "mock" | "fake") || selector_is_local(selector) {
935 return true;
936 }
937 harn_vm::llm_config::provider_key_available(&selector.provider)
938}
939
940fn resolve_fixtures(raw_fixtures: &[String]) -> Result<Vec<FixtureDefinition>, String> {
941 let mut seen = BTreeSet::new();
942 let mut out = Vec::new();
943 for raw in raw_fixtures {
944 let fixture = raw.trim().to_ascii_lowercase();
945 if fixture.is_empty() {
946 continue;
947 }
948 if fixture == "all" {
949 return Ok(FIXTURE_DEFINITIONS.to_vec());
950 }
951 let Some(definition) = fixture_definition(&fixture) else {
952 return Err(format!(
953 "unsupported --fixture `{fixture}`; expected one of: all, {}",
954 FIXTURE_DEFINITIONS
955 .iter()
956 .map(|definition| definition.id)
957 .collect::<Vec<_>>()
958 .join(", ")
959 ));
960 };
961 if seen.insert(definition.id) {
962 out.push(definition);
963 }
964 }
965 if out.is_empty() {
966 return Err("at least one coding-agent fixture must be selected".to_string());
967 }
968 Ok(out)
969}
970
971fn fixture_definition(id: &str) -> Option<FixtureDefinition> {
972 FIXTURE_DEFINITIONS
973 .iter()
974 .copied()
975 .find(|definition| definition.id == id)
976}
977
978async fn resolve_models(args: &EvalCodingAgentArgs) -> Result<Vec<ModelSelector>, String> {
979 let mut seen = BTreeSet::new();
980 let mut out = Vec::new();
981 for raw in normalize_model_selector_args(&args.models) {
982 let trimmed = raw.trim();
983 if trimmed.is_empty() {
984 continue;
985 }
986 let selector = resolve_selector(trimmed);
987 if seen.insert(selector_label(&selector)) {
988 out.push(selector);
989 }
990 }
991 if args.include_local {
992 for selector in discover_local_models(args).await {
993 if seen.insert(selector_label(&selector)) {
994 out.push(selector);
995 }
996 }
997 }
998 Ok(out)
999}
1000
/// Re-joins selectors that were split into two adjacent values.
///
/// Every value is trimmed. A value starting with `provider=` that is
/// directly followed by one starting with `model=` is merged into a single
/// `provider=…,model=…` entry; all other values are passed through
/// unchanged (apart from the trimming).
fn normalize_model_selector_args(raw_models: &[String]) -> Vec<String> {
    let mut merged = Vec::new();
    let mut pending = raw_models.iter().map(|raw| raw.trim()).peekable();
    while let Some(entry) = pending.next() {
        if entry.starts_with("provider=") {
            // Consume the following value only when it is the model half.
            if let Some(model_part) = pending.next_if(|next| next.starts_with("model=")) {
                merged.push(format!("{entry},{model_part}"));
                continue;
            }
        }
        merged.push(entry.to_string());
    }
    merged
}
1019
1020async fn discover_local_models(args: &EvalCodingAgentArgs) -> Vec<ModelSelector> {
1021 let providers = if args.local_providers.is_empty() {
1022 local_provider_ids(None)
1023 } else {
1024 args.local_providers.clone()
1025 };
1026 let mut selectors = Vec::new();
1027 let mut seen = BTreeSet::new();
1028 for provider in providers {
1029 if selectors.len() >= args.max_local_models {
1030 break;
1031 }
1032 let Ok(snapshot) = snapshot_provider(&provider, Path::new(".")).await else {
1033 continue;
1034 };
1035 if !snapshot.reachable {
1036 continue;
1037 }
1038 let mut models = snapshot
1039 .loaded_models
1040 .iter()
1041 .map(|model| model.name.clone())
1042 .collect::<Vec<_>>();
1043 models.extend(snapshot.served_models);
1044 for model in models {
1045 if selectors.len() >= args.max_local_models {
1046 break;
1047 }
1048 let selector = ModelSelector {
1049 selector: format!("{provider}:{model}"),
1050 provider: provider.clone(),
1051 model,
1052 };
1053 if seen.insert(selector_label(&selector)) {
1054 selectors.push(selector);
1055 }
1056 }
1057 }
1058 selectors
1059}
1060
/// Validates and de-duplicates raw `--tool-format` values.
///
/// Values are trimmed and lower-cased; empty values are ignored and
/// duplicates are dropped, keeping first-seen order. An empty result is not
/// an error here.
///
/// # Errors
///
/// Returns a message naming the first value that is neither `native` nor
/// `text`.
fn normalize_tool_formats(raw_formats: &[String]) -> Result<Vec<String>, String> {
    let mut already_listed = BTreeSet::new();
    let mut accepted = Vec::new();

    let candidates = raw_formats
        .iter()
        .map(|raw| raw.trim().to_ascii_lowercase())
        .filter(|candidate| !candidate.is_empty());
    for candidate in candidates {
        if !matches!(candidate.as_str(), "native" | "text") {
            return Err(format!(
                "unsupported --tool-format `{candidate}`; expected `native` or `text`"
            ));
        }
        if already_listed.insert(candidate.clone()) {
            accepted.push(candidate);
        }
    }
    Ok(accepted)
}
1080
1081fn build_matrix(
1082 fixtures: &[FixtureDefinition],
1083 models: &[ModelSelector],
1084 tool_formats: &[String],
1085 max_runs: Option<usize>,
1086) -> Vec<(FixtureDefinition, ModelSelector, String)> {
1087 if max_runs == Some(0) {
1088 return Vec::new();
1089 }
1090 let mut matrix = Vec::new();
1091 for fixture in fixtures {
1092 for selector in models {
1093 for tool_format in tool_formats {
1094 matrix.push((*fixture, selector.clone(), tool_format.clone()));
1095 if max_runs.is_some_and(|limit| matrix.len() >= limit) {
1096 return matrix;
1097 }
1098 }
1099 }
1100 }
1101 matrix
1102}
1103
1104#[allow(clippy::too_many_arguments)]
1105fn build_summary(
1106 output_dir: &Path,
1107 fixtures: Vec<FixtureDefinition>,
1108 models: Vec<ModelSelector>,
1109 tool_formats: Vec<String>,
1110 env_keys_loaded: Vec<LoadedEnvKey>,
1111 runs: Vec<RunReport>,
1112 step_judge_preset: Option<String>,
1113 run_label: String,
1114 baseline_comparison: Option<BaselineComparison>,
1115) -> EvalSummary {
1116 let passed_runs = runs.iter().filter(|run| run.passed).count();
1117 let skipped_runs = runs.iter().filter(|run| run.skipped).count();
1118 let failed_runs = runs
1119 .iter()
1120 .filter(|run| !run.passed && !run.skipped)
1121 .count();
1122 let total_cost_usd = runs.iter().map(|run| run.cost_usd).sum();
1123 let rollups = build_rollups(&runs);
1124 let comparisons = compare_formats(&runs);
1125 let parity_by_pair = build_parity_by_pair(&comparisons);
1126 let diverged_comparisons = comparisons
1127 .iter()
1128 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1129 .count();
1130 let followups = suggest_followups(&runs, &comparisons);
1131 EvalSummary {
1132 schema_version: 3,
1133 fixture_ids: fixtures
1134 .iter()
1135 .map(|fixture| fixture.id.to_string())
1136 .collect(),
1137 fixtures: fixtures
1138 .iter()
1139 .map(|fixture| FixtureReport {
1140 id: fixture.id.to_string(),
1141 name: fixture.name.to_string(),
1142 tool_sequence: fixture.tool_sequence.to_string(),
1143 description: fixture.description.to_string(),
1144 })
1145 .collect(),
1146 output_dir: output_dir.display().to_string(),
1147 models,
1148 tool_formats,
1149 env_keys_loaded,
1150 total_runs: runs.len(),
1151 passed_runs,
1152 failed_runs,
1153 skipped_runs,
1154 diverged_comparisons,
1155 total_cost_usd,
1156 rollups,
1157 runs,
1158 comparisons,
1159 parity_by_pair,
1160 followups,
1161 step_judge_preset,
1162 run_label,
1163 baseline_comparison,
1164 }
1165}
1166
1167fn load_baseline_comparison(path: &Path, runs: &[RunReport]) -> Result<BaselineComparison, String> {
1168 let resolved = if path.is_dir() {
1169 path.join("summary.json")
1170 } else {
1171 path.to_path_buf()
1172 };
1173 let raw = fs::read_to_string(&resolved)
1174 .map_err(|e| format!("failed to read {}: {e}", resolved.display()))?;
1175 let baseline: serde_json::Value = serde_json::from_str(&raw)
1176 .map_err(|e| format!("failed to parse {} as JSON: {e}", resolved.display()))?;
1177 let baseline_runs = baseline
1178 .get("runs")
1179 .and_then(|v| v.as_array())
1180 .ok_or_else(|| format!("{} has no `runs` array", resolved.display()))?;
1181 let mut baseline_status: BTreeMap<String, &str> = BTreeMap::new();
1185 for run in baseline_runs {
1186 let fixture_id = match run.get("fixture_id").and_then(|v| v.as_str()) {
1187 Some(id) => id.to_string(),
1188 None => continue,
1189 };
1190 let passed = run.get("passed").and_then(|v| v.as_bool()).unwrap_or(false);
1191 let skipped = run
1192 .get("skipped")
1193 .and_then(|v| v.as_bool())
1194 .unwrap_or(false);
1195 let status = if skipped {
1196 "skipped"
1197 } else if passed {
1198 "passed"
1199 } else {
1200 "failed"
1201 };
1202 baseline_status
1203 .entry(fixture_id)
1204 .and_modify(|existing| {
1205 if *existing != "passed" && status == "passed" {
1206 *existing = status;
1207 }
1208 })
1209 .or_insert(status);
1210 }
1211 let mut cell_status: BTreeMap<String, &str> = BTreeMap::new();
1212 for run in runs {
1213 let status = if run.skipped {
1214 "skipped"
1215 } else if run.passed {
1216 "passed"
1217 } else {
1218 "failed"
1219 };
1220 cell_status
1221 .entry(run.fixture_id.clone())
1222 .and_modify(|existing| {
1223 if *existing != "passed" && status == "passed" {
1224 *existing = status;
1225 }
1226 })
1227 .or_insert(status);
1228 }
1229 let mut regressions = Vec::new();
1230 let mut recoveries = Vec::new();
1231 let mut unchanged_passes = Vec::new();
1232 let mut unchanged_failures = Vec::new();
1233 let mut missing_in_baseline = Vec::new();
1234 let mut missing_in_cell = Vec::new();
1235 for (fixture, cell) in &cell_status {
1236 match baseline_status.get(fixture) {
1237 None => missing_in_baseline.push(fixture.clone()),
1238 Some(base) => match (*base, *cell) {
1239 ("passed", "passed") => unchanged_passes.push(fixture.clone()),
1240 ("passed", _) => regressions.push(FixtureStatusDelta {
1241 fixture_id: fixture.clone(),
1242 baseline_status: (*base).to_string(),
1243 cell_status: (*cell).to_string(),
1244 }),
1245 (_, "passed") => recoveries.push(FixtureStatusDelta {
1246 fixture_id: fixture.clone(),
1247 baseline_status: (*base).to_string(),
1248 cell_status: (*cell).to_string(),
1249 }),
1250 _ => unchanged_failures.push(fixture.clone()),
1251 },
1252 }
1253 }
1254 for fixture in baseline_status.keys() {
1255 if !cell_status.contains_key(fixture) {
1256 missing_in_cell.push(fixture.clone());
1257 }
1258 }
1259 let baseline_label = baseline
1260 .get("run_label")
1261 .and_then(|v| v.as_str())
1262 .filter(|s| !s.is_empty())
1263 .or_else(|| baseline.get("output_dir").and_then(|v| v.as_str()))
1264 .unwrap_or("")
1265 .to_string();
1266 let regressions_count = regressions.len();
1267 let recoveries_count = recoveries.len();
1268 let total_compared =
1269 regressions_count + recoveries_count + unchanged_passes.len() + unchanged_failures.len();
1270 let net_lift_pp = if total_compared == 0 {
1271 0.0
1272 } else {
1273 let raw =
1274 (recoveries_count as f64 - regressions_count as f64) / total_compared as f64 * 100.0;
1275 (raw * 10.0).round() / 10.0
1276 };
1277 Ok(BaselineComparison {
1278 baseline_label,
1279 baseline_path: resolved.display().to_string(),
1280 regressions,
1281 recoveries,
1282 unchanged_passes,
1283 unchanged_failures,
1284 missing_in_baseline,
1285 missing_in_cell,
1286 regressions_count,
1287 recoveries_count,
1288 net_lift_pp,
1289 })
1290}
1291
1292fn build_rollups(runs: &[RunReport]) -> EvalRollups {
1293 EvalRollups {
1294 by_fixture: rollup_by(runs, |run| run.fixture_id.clone()),
1295 by_provider: rollup_by(runs, |run| run.selector.provider.clone()),
1296 by_model: rollup_by(runs, |run| run.selector.model.clone()),
1297 by_tool_format: rollup_by(runs, |run| run.tool_format.clone()),
1298 by_tool_sequence: rollup_by(runs, |run| run.fixture_tool_sequence.clone()),
1299 }
1300}
1301
1302fn rollup_by<F>(runs: &[RunReport], key_for: F) -> Vec<RollupReport>
1303where
1304 F: Fn(&RunReport) -> String,
1305{
1306 let mut grouped: BTreeMap<String, RollupReport> = BTreeMap::new();
1307 for run in runs {
1308 let key = key_for(run);
1309 let entry = grouped.entry(key.clone()).or_insert_with(|| RollupReport {
1310 key,
1311 total_runs: 0,
1312 passed_runs: 0,
1313 failed_runs: 0,
1314 skipped_runs: 0,
1315 total_cost_usd: 0.0,
1316 });
1317 entry.total_runs += 1;
1318 if run.passed {
1319 entry.passed_runs += 1;
1320 } else if run.skipped {
1321 entry.skipped_runs += 1;
1322 } else {
1323 entry.failed_runs += 1;
1324 }
1325 entry.total_cost_usd += run.cost_usd;
1326 }
1327 grouped.into_values().collect()
1328}
1329
1330fn compare_formats(runs: &[RunReport]) -> Vec<FormatComparison> {
1331 let mut grouped: BTreeMap<String, Vec<&RunReport>> = BTreeMap::new();
1332 for run in runs {
1333 grouped
1334 .entry(format!(
1335 "{}\0{}",
1336 run.fixture_id,
1337 selector_label(&run.selector)
1338 ))
1339 .or_default()
1340 .push(run);
1341 }
1342 let mut out = Vec::new();
1343 for group in grouped.values() {
1344 let Some(first) = group.first() else {
1345 continue;
1346 };
1347 let native = group
1348 .iter()
1349 .find(|run| run.tool_format == "native")
1350 .copied();
1351 let text = group.iter().find(|run| run.tool_format == "text").copied();
1352 if native.is_none() && text.is_none() {
1353 continue;
1354 }
1355 let pair = native.zip(text);
1356 let mut divergence_reasons = Vec::new();
1357 if let Some((native, text)) = pair {
1358 if native.status != text.status {
1359 divergence_reasons.push(format!(
1360 "status differs: native={} text={}",
1361 native.status, text.status
1362 ));
1363 }
1364 if native.passed != text.passed {
1365 divergence_reasons.push(format!(
1366 "pass result differs: native={} text={}",
1367 native.passed, text.passed
1368 ));
1369 }
1370 if native.verification_success != text.verification_success {
1371 divergence_reasons.push(format!(
1372 "verifier result differs: native={} text={}",
1373 native.verification_success, text.verification_success
1374 ));
1375 }
1376 if native.tool_sequence != text.tool_sequence {
1377 divergence_reasons.push(format!(
1378 "tool sequence differs: native=[{}] text=[{}]",
1379 native.tool_sequence.join(", "),
1380 text.tool_sequence.join(", ")
1381 ));
1382 }
1383 if native.rejected_tool_calls != text.rejected_tool_calls {
1384 divergence_reasons.push(format!(
1385 "rejected tool-call recovery differs: native={} text={}",
1386 native.rejected_tool_calls, text.rejected_tool_calls
1387 ));
1388 }
1389 }
1390 let evidence_paths = [native, text]
1391 .into_iter()
1392 .flatten()
1393 .map(|run| run.transcript_events_path.clone())
1394 .collect::<Vec<_>>();
1395 out.push(FormatComparison {
1396 fixture_id: first.fixture_id.clone(),
1397 selector: first.selector.clone(),
1398 native_run_id: native.map(|run| run.run_id.clone()),
1399 text_run_id: text.map(|run| run.run_id.clone()),
1400 native_evidence_path: native.map(|run| run.transcript_events_path.clone()),
1401 text_evidence_path: text.map(|run| run.transcript_events_path.clone()),
1402 native_status: native.map(|run| run.status.clone()),
1403 text_status: text.map(|run| run.status.clone()),
1404 native_passed: native.map(|run| run.passed),
1405 text_passed: text.map(|run| run.passed),
1406 native_tool_call_count: native.map(|run| run.tool_calls),
1407 text_tool_call_count: text.map(|run| run.tool_calls),
1408 native_rejected_tool_call_count: native.map(|run| run.rejected_tool_calls),
1409 text_rejected_tool_call_count: text.map(|run| run.rejected_tool_calls),
1410 verifier_match: pair
1411 .map(|(native, text)| native.verification_success == text.verification_success),
1412 tool_sequence_match: pair
1413 .map(|(native, text)| native.tool_sequence == text.tool_sequence),
1414 rejected_tool_call_delta_text_minus_native: pair.map(|(native, text)| {
1415 text.rejected_tool_calls as i64 - native.rejected_tool_calls as i64
1416 }),
1417 token_delta_text_minus_native: pair.map(|(native, text)| {
1418 (text.input_tokens + text.output_tokens)
1419 - (native.input_tokens + native.output_tokens)
1420 }),
1421 iteration_delta_text_minus_native: pair
1422 .map(|(native, text)| text.iterations - native.iterations),
1423 equivalent: pair.map(|(native, text)| {
1424 native.status == text.status
1425 && native.passed == text.passed
1426 && native.skipped == text.skipped
1427 && native.verification_success == text.verification_success
1428 && native.tool_sequence == text.tool_sequence
1429 && native.rejected_tool_calls == text.rejected_tool_calls
1430 }),
1431 divergence_reasons,
1432 evidence_paths,
1433 });
1434 }
1435 out
1436}
1437
1438fn build_parity_by_pair(comparisons: &[FormatComparison]) -> Vec<ToolModeParityPairSummary> {
1439 let fixture_inputs = comparisons
1440 .iter()
1441 .filter_map(parity_fixture_input)
1442 .collect::<Vec<_>>();
1443 let fixture_reports = tool_mode_parity::build_fixture_reports(&fixture_inputs);
1444 tool_mode_parity::build_pair_summaries(&fixture_reports)
1445}
1446
1447fn parity_fixture_input(comparison: &FormatComparison) -> Option<ToolModeParityFixtureInput> {
1448 let native_verdict = comparison.native_status.clone()?;
1449 let text_verdict = comparison.text_status.clone()?;
1450 if native_verdict == "skipped" || text_verdict == "skipped" {
1451 return None;
1452 }
1453 Some(ToolModeParityFixtureInput {
1454 provider: comparison.selector.provider.clone(),
1455 model: comparison.selector.model.clone(),
1456 fixture_id: comparison.fixture_id.clone(),
1457 native_verdict,
1458 text_verdict,
1459 native_passed: comparison.native_passed?,
1460 text_passed: comparison.text_passed?,
1461 agreement: comparison.equivalent?,
1462 verifier_agreement: comparison.verifier_match?,
1463 native_tool_call_count: comparison.native_tool_call_count?,
1464 text_tool_call_count: comparison.text_tool_call_count?,
1465 native_rejected_tool_call_count: comparison.native_rejected_tool_call_count?,
1466 text_rejected_tool_call_count: comparison.text_rejected_tool_call_count?,
1467 native_evidence_path: comparison.native_evidence_path.clone()?,
1468 text_evidence_path: comparison.text_evidence_path.clone()?,
1469 })
1470}
1471
1472fn suggest_followups(
1473 runs: &[RunReport],
1474 comparisons: &[FormatComparison],
1475) -> Vec<FollowupSuggestion> {
1476 let mut out = Vec::new();
1477 let failed = runs
1478 .iter()
1479 .filter(|run| !run.passed && !run.skipped)
1480 .map(|run| run.run_id.clone())
1481 .collect::<Vec<_>>();
1482 if !failed.is_empty() {
1483 out.push(FollowupSuggestion {
1484 title: "Normalize coding-agent fixture failures across provider presets".to_string(),
1485 body: "One or more fixture/provider/tool-format runs failed. Inspect the run directories and decide whether the gap belongs in provider adapters, preset prompting, transcript handling, or host-tool ergonomics.".to_string(),
1486 labels: vec!["eval".to_string(), "providers".to_string()],
1487 run_ids: failed,
1488 });
1489 }
1490
1491 let rejected = runs
1492 .iter()
1493 .filter(|run| run.rejected_tool_calls > 0)
1494 .map(|run| run.run_id.clone())
1495 .collect::<Vec<_>>();
1496 if !rejected.is_empty() {
1497 out.push(FollowupSuggestion {
1498 title: "Abstract rejected tool-call recovery in agent transcripts".to_string(),
1499 body: "Some runs recovered after rejected tool calls. Add runtime support or preset guidance so harness authors can distinguish recoverable provider/tool-shape noise from user-relevant transcript events.".to_string(),
1500 labels: vec!["agents".to_string(), "transcripts".to_string()],
1501 run_ids: rejected,
1502 });
1503 }
1504
1505 let mismatched = comparisons
1506 .iter()
1507 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1508 .map(|comparison| {
1509 format!(
1510 "{}:{} ({})",
1511 comparison.fixture_id,
1512 selector_label(&comparison.selector),
1513 comparison.divergence_reasons.join("; ")
1514 )
1515 })
1516 .collect::<Vec<_>>();
1517 if !mismatched.is_empty() {
1518 let run_ids = comparisons
1519 .iter()
1520 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1521 .flat_map(|comparison| {
1522 [
1523 comparison.native_run_id.clone(),
1524 comparison.text_run_id.clone(),
1525 ]
1526 })
1527 .flatten()
1528 .collect::<Vec<_>>();
1529 out.push(FollowupSuggestion {
1530 title: "Make native/text tool modes behaviorally interchangeable for preset harnesses"
1531 .to_string(),
1532 body: format!(
1533 "Native and text tool modes diverged for: {}. The preset/runtime boundary should hide provider tool-channel differences where possible.",
1534 mismatched.join(", ")
1535 ),
1536 labels: vec!["agents".to_string(), "tools".to_string()],
1537 run_ids,
1538 });
1539 }
1540
1541 let unknown_pricing = runs
1542 .iter()
1543 .filter(|run| {
1544 !run.skipped
1545 && !run.pricing_known
1546 && !matches!(run.selector.provider.as_str(), "mock" | "fake")
1547 && !selector_is_local(&run.selector)
1548 })
1549 .map(|run| run.run_id.clone())
1550 .collect::<Vec<_>>();
1551 if !unknown_pricing.is_empty() {
1552 out.push(FollowupSuggestion {
1553 title: "Fill provider pricing metadata for benchmarked models".to_string(),
1554 body: "At least one live provider/model produced usage metrics but had no pricing entry, which weakens cost comparisons in eval reports.".to_string(),
1555 labels: vec!["providers".to_string(), "docs".to_string()],
1556 run_ids: unknown_pricing,
1557 });
1558 }
1559 out
1560}
1561
1562fn write_json_artifacts(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1563 write_json_pretty(&output_dir.join("summary.json"), summary)?;
1564 write_jsonl(&output_dir.join("per_run.jsonl"), &summary.runs)?;
1565 let summary_value = serde_json::to_value(summary).map_err(|error| error.to_string())?;
1566 let readiness = local_readiness::report_from_summary_json(
1567 &summary_value,
1568 output_dir.display().to_string(),
1569 )?;
1570 write_json_pretty(&output_dir.join("local_readiness.json"), &readiness)?;
1571 let generated_at = RealClock::new()
1572 .now_utc()
1573 .format(&time::format_description::well_known::Rfc3339)
1574 .map_err(|error| format!("failed to format parity overlay timestamp: {error}"))?;
1575 let parity_dir = output_dir.join(TOOL_MODE_PARITY_DIRECTORY);
1576 let parity_reports = tool_mode_parity::build_fixture_reports(
1577 &summary
1578 .comparisons
1579 .iter()
1580 .filter_map(parity_fixture_input)
1581 .collect::<Vec<_>>(),
1582 );
1583 for report in &parity_reports {
1584 let path = parity_dir
1585 .join(sanitize_id(&format!(
1586 "{}__{}:{}",
1587 report.fixture_id, report.provider, report.model
1588 )))
1589 .join("parity.json");
1590 tool_mode_parity::write_fixture_report(&path, report)?;
1591 }
1592 let overlay = tool_mode_parity::build_overlay(
1593 &summary.parity_by_pair,
1594 &generated_at,
1595 TOOL_MODE_PARITY_FIXTURE_SUITE,
1596 output_dir,
1597 );
1598 tool_mode_parity::write_overlay(
1599 &output_dir.join(TOOL_MODE_PARITY_OVERLAY_FILENAME),
1600 &overlay,
1601 )?;
1602 Ok(())
1603}
1604
1605fn announce_output_paths(output_dir: &Path) {
1606 eprintln!(
1607 "wrote {}, {}, {}, {}, {}, {}, and {}",
1608 output_dir.join("summary.json").display(),
1609 output_dir.join("per_run.jsonl").display(),
1610 output_dir.join("local_readiness.json").display(),
1611 output_dir.join(TOOL_MODE_PARITY_DIRECTORY).display(),
1612 output_dir.join(TOOL_MODE_PARITY_OVERLAY_FILENAME).display(),
1613 output_dir.join("summary.md").display(),
1614 output_dir.join("followups.md").display()
1615 );
1616}
1617
1618fn write_markdown_artifacts_legacy(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1621 fs::write(output_dir.join("summary.md"), render_markdown(summary))
1622 .map_err(|error| format!("failed to write summary.md: {error}"))?;
1623 fs::write(output_dir.join("followups.md"), render_followups(summary))
1624 .map_err(|error| format!("failed to write followups.md: {error}"))?;
1625 Ok(())
1626}
1627
1628fn print_summary_legacy(summary: &EvalSummary) {
1629 println!(
1630 "coding-agent eval: {}/{} passed, {} skipped, total_cost_usd={:.6}",
1631 summary.passed_runs, summary.total_runs, summary.skipped_runs, summary.total_cost_usd
1632 );
1633}
1634
1635fn print_json_legacy(summary: &EvalSummary) {
1636 match serde_json::to_string_pretty(summary) {
1637 Ok(payload) => println!("{payload}"),
1638 Err(error) => eprintln!("warning: failed to render summary JSON: {error}"),
1639 }
1640}
1641
1642async fn write_markdown_artifacts_dispatch(
1645 output_dir: &Path,
1646 summary: &EvalSummary,
1647) -> Result<(), i32> {
1648 let markdown = render_via_dispatch(summary, "markdown").await?;
1649 if let Err(error) = fs::write(output_dir.join("summary.md"), markdown) {
1650 eprintln!("error: failed to write summary.md: {error}");
1651 return Err(1);
1652 }
1653 let followups = render_via_dispatch(summary, "followups").await?;
1654 if let Err(error) = fs::write(output_dir.join("followups.md"), followups) {
1655 eprintln!("error: failed to write followups.md: {error}");
1656 return Err(1);
1657 }
1658 Ok(())
1659}
1660
1661async fn print_summary_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1662 let payload = render_via_dispatch(summary, "summary").await?;
1663 print!("{payload}");
1664 if !payload.ends_with('\n') {
1667 println!();
1668 }
1669 Ok(())
1670}
1671
1672async fn print_json_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1673 let payload = render_via_dispatch(summary, "json").await?;
1674 print!("{payload}");
1675 if !payload.ends_with('\n') {
1676 println!();
1677 }
1678 Ok(())
1679}
1680
1681async fn render_via_dispatch(summary: &EvalSummary, mode: &str) -> Result<String, i32> {
1691 let summary_json = match serde_json::to_string(summary) {
1692 Ok(json) => json,
1693 Err(error) => {
1694 eprintln!("error: failed to serialise EvalSummary for dispatch: {error}");
1695 return Err(1);
1696 }
1697 };
1698 let _guard = DISPATCH_RENDER_LOCK.lock().await;
1699 let _summary = ScopedEnvVar::set(CODING_AGENT_SUMMARY_ENV, &summary_json);
1700 let _mode = ScopedEnvVar::set(CODING_AGENT_MODE_ENV, mode);
1701
1702 let outcome = dispatch::run_embedded_script("eval/coding_agent", Vec::new(), false).await;
1703 if !outcome.stderr.is_empty() {
1704 let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
1705 }
1706 if outcome.exit_code != 0 {
1707 return Err(outcome.exit_code);
1708 }
1709 Ok(outcome.stdout)
1710}
1711
1712fn write_json_pretty<T: Serialize>(path: &Path, value: &T) -> Result<(), String> {
1713 let body = serde_json::to_string_pretty(value).map_err(|error| error.to_string())?;
1714 fs::write(path, format!("{body}\n")).map_err(|error| error.to_string())
1715}
1716
1717fn write_jsonl<T: Serialize>(path: &Path, items: &[T]) -> Result<(), String> {
1718 let mut body = String::new();
1719 for item in items {
1720 let line = serde_json::to_string(item).map_err(|error| error.to_string())?;
1721 body.push_str(&line);
1722 body.push('\n');
1723 }
1724 fs::write(path, body).map_err(|error| error.to_string())
1725}
1726
1727fn render_markdown(summary: &EvalSummary) -> String {
1728 let mut out = String::new();
1729 out.push_str("# Coding Agent Harness Quality Suite\n\n");
1730 out.push_str(&format!(
1731 "- fixtures: `{}`\n- passed: {}/{}\n- skipped: {}\n- total_cost_usd: {:.6}\n\n",
1732 summary.fixture_ids.join("`, `"),
1733 summary.passed_runs,
1734 summary.total_runs,
1735 summary.skipped_runs,
1736 summary.total_cost_usd
1737 ));
1738 render_rollup_table(&mut out, "By Fixture", &summary.rollups.by_fixture);
1739 render_rollup_table(&mut out, "By Provider", &summary.rollups.by_provider);
1740 render_rollup_table(&mut out, "By Model", &summary.rollups.by_model);
1741 render_rollup_table(&mut out, "By Tool Format", &summary.rollups.by_tool_format);
1742 render_rollup_table(
1743 &mut out,
1744 "By Tool Sequence",
1745 &summary.rollups.by_tool_sequence,
1746 );
1747
1748 out.push_str("\n## Runs\n\n");
1749 out.push_str("| fixture | run | provider | model | tool format | fixture sequence | tool calls | status | iterations | tokens | cost | transcript | output |\n");
1750 out.push_str("|---|---|---|---|---|---|---|---|---:|---:|---:|---|---|\n");
1751 for run in &summary.runs {
1752 let tool_sequence = if run.tool_sequence.is_empty() {
1753 "-".to_string()
1754 } else {
1755 run.tool_sequence.join(", ").replace('|', "\\|")
1756 };
1757 out.push_str(&format!(
1758 "| `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {:.6} | {} | `{}` |\n",
1759 run.fixture_id,
1760 run.run_id,
1761 run.selector.provider,
1762 run.selector.model.replace('|', "\\|"),
1763 run.tool_format,
1764 run.fixture_tool_sequence,
1765 tool_sequence,
1766 run.status,
1767 run.iterations,
1768 run.input_tokens + run.output_tokens,
1769 run.cost_usd,
1770 markdown_link(
1771 &run.transcript_event_count.to_string(),
1772 &run.transcript_events_path
1773 ),
1774 run.output_dir
1775 ));
1776 }
1777 if let Some(comparison) = &summary.baseline_comparison {
1778 out.push_str("\n## Baseline Comparison\n\n");
1779 out.push_str(&format!(
1780 "Compared against `{}`{}.\n\n",
1781 comparison.baseline_path,
1782 if comparison.baseline_label.is_empty() {
1783 String::new()
1784 } else {
1785 format!(" (label: `{}`)", comparison.baseline_label)
1786 },
1787 ));
1788 out.push_str(&format!(
1789 "- regressions: **{}** (baseline passed, this cell failed)\n- recoveries: **{}** (baseline failed, this cell passed)\n- net lift: **{:+.1}pp**\n\n",
1790 comparison.regressions_count,
1791 comparison.recoveries_count,
1792 comparison.net_lift_pp,
1793 ));
1794 if !comparison.regressions.is_empty() {
1795 out.push_str("### Regressions\n\n");
1796 for delta in &comparison.regressions {
1797 out.push_str(&format!(
1798 "- `{}`: `{}` → `{}`\n",
1799 delta.fixture_id, delta.baseline_status, delta.cell_status,
1800 ));
1801 }
1802 out.push('\n');
1803 }
1804 if !comparison.recoveries.is_empty() {
1805 out.push_str("### Recoveries\n\n");
1806 for delta in &comparison.recoveries {
1807 out.push_str(&format!(
1808 "- `{}`: `{}` → `{}`\n",
1809 delta.fixture_id, delta.baseline_status, delta.cell_status,
1810 ));
1811 }
1812 out.push('\n');
1813 }
1814 }
1815 if !summary.comparisons.is_empty() {
1816 out.push_str("\n## Native/Text Comparison\n\n");
1817 out.push_str("| fixture | selector | native | text | equivalent | verifier | tools | rejected delta | token delta | iteration delta | evidence |\n");
1818 out.push_str("|---|---|---|---|---|---|---|---:|---:|---:|---|\n");
1819 for comparison in &summary.comparisons {
1820 out.push_str(&format!(
1821 "| `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
1822 comparison.fixture_id,
1823 selector_label(&comparison.selector),
1824 comparison
1825 .native_status
1826 .clone()
1827 .unwrap_or_else(|| "-".to_string()),
1828 comparison
1829 .text_status
1830 .clone()
1831 .unwrap_or_else(|| "-".to_string()),
1832 optional_bool_mark(comparison.equivalent),
1833 optional_bool_mark(comparison.verifier_match),
1834 optional_bool_mark(comparison.tool_sequence_match),
1835 comparison
1836 .rejected_tool_call_delta_text_minus_native
1837 .map(|v| v.to_string())
1838 .unwrap_or_else(|| "-".to_string()),
1839 comparison
1840 .token_delta_text_minus_native
1841 .map(|v| v.to_string())
1842 .unwrap_or_else(|| "-".to_string()),
1843 comparison
1844 .iteration_delta_text_minus_native
1845 .map(|v| v.to_string())
1846 .unwrap_or_else(|| "-".to_string()),
1847 comparison_evidence_links(comparison)
1848 ));
1849 }
1850 }
1851 if !summary.parity_by_pair.is_empty() {
1852 out.push_str("\n## Parity report — native vs text\n\n");
1853 out.push_str("| selector | sample | native pass | text pass | agreement | verifier divergence | native_only | text_only | both_pass | both_fail |\n");
1854 out.push_str("|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|\n");
1855 for pair in &summary.parity_by_pair {
1856 out.push_str(&format!(
1857 "| `{}` | {} | {:.1}% | {:.1}% | {:.1}% | {:.1}% | {} | {} | {} | {} |\n",
1858 selector_label(&ModelSelector {
1859 selector: format!("{}:{}", pair.provider, pair.model),
1860 provider: pair.provider.clone(),
1861 model: pair.model.clone(),
1862 }),
1863 pair.sample_size,
1864 pair.native.pass_rate * 100.0,
1865 pair.text.pass_rate * 100.0,
1866 pair.agreement_rate * 100.0,
1867 pair.verifier_divergence_rate * 100.0,
1868 pair.divergence_counts.native_only_pass,
1869 pair.divergence_counts.text_only_pass,
1870 pair.divergence_counts.both_pass,
1871 pair.divergence_counts.both_fail,
1872 ));
1873 }
1874 }
1875 let diverged = summary
1876 .comparisons
1877 .iter()
1878 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1879 .collect::<Vec<_>>();
1880 if !diverged.is_empty() {
1881 out.push_str("\n## Native/Text Divergence Evidence\n\n");
1882 for comparison in diverged {
1883 out.push_str(&format!(
1884 "- `{}` `{}`: {}\n",
1885 comparison.fixture_id,
1886 selector_label(&comparison.selector),
1887 comparison.divergence_reasons.join("; ")
1888 ));
1889 if !comparison.evidence_paths.is_empty() {
1890 out.push_str(&format!(
1891 " Evidence: {}\n",
1892 comparison_evidence_links(comparison)
1893 ));
1894 }
1895 }
1896 }
1897 out
1898}
1899
1900fn render_rollup_table(out: &mut String, title: &str, rollups: &[RollupReport]) {
1901 out.push_str(&format!("## {title}\n\n"));
1902 out.push_str("| key | passed | failed | skipped | total | cost |\n");
1903 out.push_str("|---|---:|---:|---:|---:|---:|\n");
1904 for rollup in rollups {
1905 out.push_str(&format!(
1906 "| `{}` | {} | {} | {} | {} | {:.6} |\n",
1907 rollup.key.replace('|', "\\|"),
1908 rollup.passed_runs,
1909 rollup.failed_runs,
1910 rollup.skipped_runs,
1911 rollup.total_runs,
1912 rollup.total_cost_usd
1913 ));
1914 }
1915 out.push('\n');
1916}
1917
1918fn render_followups(summary: &EvalSummary) -> String {
1919 let mut out = String::new();
1920 out.push_str("# Follow-up Issue Candidates\n\n");
1921 if summary.followups.is_empty() {
1922 out.push_str("No follow-up issue candidates were generated from this run.\n");
1923 return out;
1924 }
1925 for followup in &summary.followups {
1926 out.push_str(&format!("## {}\n\n{}\n\n", followup.title, followup.body));
1927 if !followup.run_ids.is_empty() {
1928 out.push_str(&format!("- run_ids: `{}`\n", followup.run_ids.join("`, `")));
1929 }
1930 if !followup.labels.is_empty() {
1931 out.push_str(&format!("- labels: `{}`\n", followup.labels.join("`, `")));
1932 }
1933 out.push('\n');
1934 }
1935 out
1936}
1937
1938fn read_run_summary(run_dir: &Path) -> Option<JsonValue> {
1939 let raw = fs::read_to_string(run_dir.join("summary.json")).ok()?;
1940 serde_json::from_str(&raw).ok()
1941}
1942
1943fn parse_last_json_line(stdout: &str) -> Option<JsonValue> {
1944 stdout
1945 .lines()
1946 .rev()
1947 .map(str::trim)
1948 .filter(|line| !line.is_empty())
1949 .find_map(|line| serde_json::from_str::<JsonValue>(line).ok())
1950}
1951
1952fn string_array(value: Option<&JsonValue>) -> Vec<String> {
1953 value
1954 .and_then(JsonValue::as_array)
1955 .map(|values| {
1956 values
1957 .iter()
1958 .filter_map(JsonValue::as_str)
1959 .map(str::to_string)
1960 .collect()
1961 })
1962 .unwrap_or_default()
1963}
1964
1965fn non_empty_string_array(value: Option<&JsonValue>) -> Option<Vec<String>> {
1966 let values = string_array(value);
1967 (!values.is_empty()).then_some(values)
1968}
1969
1970fn tool_call_sequence(value: Option<&JsonValue>) -> Option<Vec<String>> {
1971 let calls = value.and_then(JsonValue::as_array)?;
1972 let mut sequence = Vec::new();
1973 for call in calls {
1974 if let Some(name) = call
1975 .get("name")
1976 .or_else(|| call.get("tool_name"))
1977 .and_then(JsonValue::as_str)
1978 {
1979 sequence.push(name.to_string());
1980 }
1981 }
1982 (!sequence.is_empty()).then_some(sequence)
1983}
1984
/// Renders an optional flag for a table cell: `yes`, `no`, or `-` when unknown.
fn optional_bool_mark(value: Option<bool>) -> &'static str {
    let Some(flag) = value else {
        return "-";
    };
    if flag {
        "yes"
    } else {
        "no"
    }
}
1992
1993fn comparison_evidence_links(comparison: &FormatComparison) -> String {
1994 let mut links = Vec::new();
1995 if let Some(native) = comparison.native_evidence_path.as_deref() {
1996 links.push(markdown_link("native", native));
1997 }
1998 if let Some(text) = comparison.text_evidence_path.as_deref() {
1999 links.push(markdown_link("text", text));
2000 }
2001 if links.is_empty() {
2002 "-".to_string()
2003 } else {
2004 links.join("<br>")
2005 }
2006}
2007
/// Formats a Markdown link `[label](target)`.
///
/// Pipes in the label are backslash-escaped; spaces and parentheses in the target are
/// percent-encoded.
fn markdown_link(label: &str, target: &str) -> String {
    let safe_label = label.replace('|', "\\|");
    let mut safe_target = String::with_capacity(target.len());
    for ch in target.chars() {
        match ch {
            ' ' => safe_target.push_str("%20"),
            '(' => safe_target.push_str("%28"),
            ')' => safe_target.push_str("%29"),
            other => safe_target.push(other),
        }
    }
    format!("[{safe_label}]({safe_target})")
}
2018
/// Leaves `path` as an existing, empty directory: anything already there is removed
/// recursively, then the directory (and any missing parents) is created.
///
/// # Errors
/// Returns the underlying I/O error rendered as a string.
fn reset_dir(path: &Path) -> Result<(), String> {
    let describe = |error: std::io::Error| error.to_string();
    if path.exists() {
        fs::remove_dir_all(path).map_err(describe)?;
    }
    fs::create_dir_all(path).map_err(describe)
}
2025
2026fn run_id_for(fixture: FixtureDefinition, selector: &ModelSelector, tool_format: &str) -> String {
2027 sanitize_id(&format!(
2028 "{}__{}__{}",
2029 fixture.id,
2030 selector_label(selector),
2031 tool_format
2032 ))
2033}
2034
/// Makes `raw` safe to use as an identifier or path component.
///
/// Every character other than ASCII letters, digits, `-`, and `_` becomes `_`; leading
/// and trailing underscores are then stripped.
fn sanitize_id(raw: &str) -> String {
    let replaced: String = raw
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') {
                ch
            } else {
                '_'
            }
        })
        .collect();
    replaced.trim_matches('_').to_string()
}
2046
/// Returns the relative output directory used by default:
/// `.harn-runs/coding-agent-bench/latest`.
fn default_output_dir() -> PathBuf {
    [".harn-runs", "coding-agent-bench", "latest"]
        .iter()
        .collect()
}
2052
/// Returns a trimmed excerpt of `text`, capped at 4000 characters.
///
/// Returns `None` when `text` is empty or whitespace-only. Text longer than the cap is
/// cut after the 4000th character and suffixed with `...`; shorter text is returned
/// unchanged.
///
/// The cap is measured in characters, not bytes, so multi-byte text that fits within the
/// cap is never given a misleading `...` suffix, and the cut always falls on a character
/// boundary.
fn excerpt(text: &str) -> Option<String> {
    const MAX_CHARS: usize = 4000;

    let trimmed = text.trim();
    if trimmed.is_empty() {
        return None;
    }
    // Byte offset of the first character beyond the cap; `None` means the text fits.
    match trimmed.char_indices().nth(MAX_CHARS) {
        None => Some(trimmed.to_string()),
        Some((cut, _)) => {
            let mut truncated = String::with_capacity(cut + 3);
            truncated.push_str(&trimmed[..cut]);
            truncated.push_str("...");
            Some(truncated)
        }
    }
}
2069
2070fn load_env_files(paths: &[PathBuf]) -> Result<(EnvOverlay, Vec<LoadedEnvKey>), String> {
2071 let mut previous = Vec::new();
2072 let mut loaded = Vec::new();
2073 let mut touched = BTreeSet::new();
2074 for path in paths {
2075 let path = expand_home(path);
2076 let raw = fs::read_to_string(&path)
2077 .map_err(|error| format!("failed to read env file {}: {error}", path.display()))?;
2078 for (line_no, line) in raw.lines().enumerate() {
2079 let Some((key, value)) = parse_env_line(line).map_err(|error| {
2080 format!("{}:{}: {error}", path.display(), line_no.saturating_add(1))
2081 })?
2082 else {
2083 continue;
2084 };
2085 if touched.insert(key.clone()) {
2086 previous.push((OsString::from(&key), std::env::var_os(&key)));
2087 }
2088 std::env::set_var(&key, value);
2089 loaded.push(LoadedEnvKey {
2090 key,
2091 source: path.display().to_string(),
2092 });
2093 }
2094 }
2095 Ok((EnvOverlay { previous }, loaded))
2096}
2097
2098fn parse_env_line(line: &str) -> Result<Option<(String, String)>, String> {
2099 let trimmed = line.trim();
2100 if trimmed.is_empty() || trimmed.starts_with('#') {
2101 return Ok(None);
2102 }
2103 let trimmed = trimmed.strip_prefix("export ").unwrap_or(trimmed).trim();
2104 let Some((key, value)) = trimmed.split_once('=') else {
2105 return Err("expected KEY=VALUE".to_string());
2106 };
2107 let key = key.trim();
2108 if key.is_empty() {
2109 return Err("empty key".to_string());
2110 }
2111 if !key
2112 .chars()
2113 .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
2114 {
2115 return Err(format!("invalid key `{key}`"));
2116 }
2117 Ok(Some((key.to_string(), unquote_env_value(value.trim()))))
2118}
2119
/// Removes one pair of matching surrounding quotes (`"…"` or `'…'`) from `value`.
///
/// Values that are not wrapped in a matching pair, including a lone quote character, are
/// returned unchanged.
fn unquote_env_value(value: &str) -> String {
    let inner = ['"', '\'']
        .into_iter()
        .find_map(|quote| value.strip_prefix(quote)?.strip_suffix(quote));
    inner.unwrap_or(value).to_string()
}
2131
/// Expands a leading `~` or `~/` in `path` using the `HOME` environment variable.
///
/// Any other path, including `~user/...`, is returned unchanged, as is a tilde path when
/// `HOME` is not set.
fn expand_home(path: &Path) -> PathBuf {
    let raw = path.to_string_lossy();
    let home_dir = || std::env::var_os("HOME").map(PathBuf::from);
    let expanded = if raw == "~" {
        home_dir()
    } else {
        raw.strip_prefix("~/")
            .and_then(|rest| home_dir().map(|home| home.join(rest)))
    };
    expanded.unwrap_or_else(|| path.to_path_buf())
}
2146
2147#[cfg(test)]
2148#[path = "eval_coding_agent_tests.rs"]
2149mod tests;