1use std::collections::{BTreeMap, BTreeSet, HashSet};
37use std::ffi::OsString;
38use std::fs;
39use std::io::Write as _;
40use std::path::{Path, PathBuf};
41
42use harn_vm::clock::{Clock, RealClock};
43use serde::Serialize;
44use serde_json::Value as JsonValue;
45
46use crate::cli::EvalCodingAgentArgs;
47use crate::commands::eval_coding_agent_preset::{
48 resolve_step_judge_json, resolve_structural_validator_json,
49};
50use crate::commands::eval_model_selector::{
51 resolve_selector, selector_is_local, selector_label, ModelSelector,
52};
53use crate::commands::local::runtime::{
54 local_provider_ids, ollama_unload_model, snapshot_provider, LocalProviderSnapshot,
55};
56use crate::commands::local_readiness;
57use crate::commands::run::{
58 execute_run_with_sandbox_options, CliLlmMockMode, RunProfileOptions, RunSandboxOptions,
59};
60use crate::commands::tool_mode_parity::{
61 self, ToolModeParityFixtureInput, ToolModeParityPairSummary, TOOL_MODE_PARITY_DIRECTORY,
62 TOOL_MODE_PARITY_FIXTURE_SUITE, TOOL_MODE_PARITY_OVERLAY_FILENAME,
63};
64use crate::dispatch;
65use crate::env_guard::ScopedEnvVar;
66
/// Name of an environment variable related to the coding-agent eval summary JSON.
/// NOTE(review): the reader/writer of this variable is not in this part of the file —
/// confirm whether it carries a path or an inline payload.
const CODING_AGENT_SUMMARY_ENV: &str = "HARN_EVAL_CODING_AGENT_SUMMARY_JSON";

/// Name of an environment variable related to the coding-agent eval mode.
/// NOTE(review): its consumers are not visible here — confirm the accepted values.
const CODING_AGENT_MODE_ENV: &str = "HARN_EVAL_CODING_AGENT_MODE";

/// Process-wide async mutex (const-initialised, guards no data of its own).
/// NOTE(review): presumably serialises the dispatch-based rendering helpers — confirm
/// against the code that locks it.
static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());

/// Source text of the benchmark harness script, embedded at compile time. Each run
/// writes it into its run directory as `coding_agent_suite.harn` before executing it.
const CODING_AGENT_SUITE_HARN: &str = include_str!("../../assets/evals/coding_agent_suite.harn");
/// Prefix identifying the harness stderr line that is echoed to the CLI's own stderr
/// (see `tool_format_override_warning_line`).
const TOOL_FORMAT_OVERRIDE_WARNING_PREFIX: &str = "warning: tool_format override:";
94
/// Static description of one built-in coding-agent benchmark fixture.
///
/// All fields are `'static` strings so the definition is `Copy` and can be handed
/// around by value.
#[derive(Debug, Clone, Copy)]
struct FixtureDefinition {
    /// Stable identifier; matched against `--fixture` values and copied into reports.
    id: &'static str,
    /// Human-readable title of the fixture.
    name: &'static str,
    /// Tool-usage category label; the built-in table uses `multi-tool`, `one-tool`
    /// and `no-tool`.
    tool_sequence: &'static str,
    /// One-sentence summary of what the fixture asks the agent to do.
    description: &'static str,
}
102
/// The built-in fixture catalogue, in the order used when `--fixture all` is requested.
///
/// `resolve_fixtures` looks entries up by `id` and also lists these ids in its
/// "unsupported --fixture" error message.
static FIXTURE_DEFINITIONS: &[FixtureDefinition] = &[
    FixtureDefinition {
        id: "python-add",
        name: "Python add repair",
        tool_sequence: "multi-tool",
        description: "One-file Python bug fix verified by unittest output.",
    },
    FixtureDefinition {
        id: "cli-help-flag",
        name: "CLI help flag",
        tool_sequence: "multi-tool",
        description: "Add a tiny CLI flag, update help-facing docs, and verify behavior.",
    },
    FixtureDefinition {
        id: "test-output-first",
        name: "Test-output-first repair",
        tool_sequence: "multi-tool",
        description: "Run a failing test first, then edit the implementation and re-run it.",
    },
    FixtureDefinition {
        id: "docs-symbol-rename",
        name: "Docs symbol rename",
        tool_sequence: "multi-tool",
        description:
            "Update docs and an example after a symbol rename without touching implementation.",
    },
    FixtureDefinition {
        id: "read-only-audit",
        name: "Read-only audit",
        tool_sequence: "one-tool",
        description: "Inspect a file and report that no edits are needed.",
    },
    FixtureDefinition {
        id: "no-tool-diagnosis",
        name: "No-tool diagnosis",
        tool_sequence: "no-tool",
        description: "Answer from prompt-only context without any tools.",
    },
];
142
/// Record of one environment key loaded for the eval; serialised into the summary's
/// `env_keys_loaded` list.
#[derive(Debug, Clone, Serialize)]
struct LoadedEnvKey {
    /// Name of the environment variable.
    key: String,
    /// Where the key came from.
    /// NOTE(review): presumably the env-file path; it is filled in by `load_env_files`,
    /// which is not visible here — confirm.
    source: String,
}
148
/// Guard that remembers the prior state of environment variables and puts it back
/// when dropped.
#[derive(Debug)]
struct EnvOverlay {
    /// `(variable name, value before the overlay)` pairs in the order they were
    /// recorded; `None` means the variable was unset beforehand.
    previous: Vec<(OsString, Option<OsString>)>,
}

impl Drop for EnvOverlay {
    /// Restores every recorded variable, newest entry first, so that when a name was
    /// recorded more than once the earliest recorded value is the one left in place.
    fn drop(&mut self) {
        self.previous
            .iter()
            .rev()
            .for_each(|(name, saved)| match saved {
                Some(original) => std::env::set_var(name, original),
                None => std::env::remove_var(name),
            });
    }
}
165
/// Outcome of a single (fixture, model selector, tool format) benchmark run.
///
/// Built by `report_from_summary` for runs that produced a harness summary, and by
/// `error_report` / `skipped_report` (or the inline no-summary path) otherwise.
#[derive(Debug, Clone, Serialize)]
struct RunReport {
    /// Identifier from `run_id_for`; also the name of the run's directory under the
    /// eval output directory.
    run_id: String,
    /// `FixtureDefinition::id` of the fixture that was run.
    fixture_id: String,
    /// `FixtureDefinition::name` of the fixture that was run.
    fixture_name: String,
    /// `FixtureDefinition::tool_sequence` category of the fixture.
    fixture_tool_sequence: String,
    /// Provider/model selector the run targeted.
    selector: ModelSelector,
    /// Tool format used for the run (`native` or `text` after normalisation).
    tool_format: String,
    /// `passed`, `failed`, `skipped`, `infra_error`, or a status string reported by the
    /// harness when it exited non-zero.
    status: String,
    /// True only when the harness summary reports `passed` and the process exit code is 0.
    passed: bool,
    /// True when the run was not attempted (provider without configured credentials).
    skipped: bool,
    /// Why the run was skipped; omitted from JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    skipped_reason: Option<String>,
    /// Path of the run directory.
    output_dir: String,
    /// Path of `transcript_events.jsonl` inside the run directory (the file may not
    /// exist for skipped or errored runs).
    transcript_events_path: String,
    /// `workspace_root` string from the harness summary, when present.
    workspace_root: Option<String>,
    /// Time measured by the CLI around the harness execution, in milliseconds; 0 for
    /// runs that never started.
    elapsed_ms: u64,
    /// `duration_ms` from the harness summary, falling back to `elapsed_ms`.
    duration_ms: u64,
    /// `/llm/iterations` from the harness summary (0 when missing).
    iterations: i64,
    /// `/llm/input_tokens` from the harness summary (0 when missing).
    input_tokens: i64,
    /// `/llm/output_tokens` from the harness summary (0 when missing).
    output_tokens: i64,
    /// Cost computed from per-1k-token pricing; 0.0 when pricing is unknown.
    cost_usd: f64,
    /// Whether pricing was available for the provider/model pair.
    pricing_known: bool,
    /// Number of entries in the summary's `/tools/calls` array.
    tool_calls: usize,
    /// Number of entries in the summary's `/tools/rejected` array.
    rejected_tool_calls: usize,
    /// Tool names derived from `/tools/calls`, falling back to a non-empty
    /// `/tools/successful` list.
    tool_sequence: Vec<String>,
    /// Strings from the summary's `/tools/successful` array.
    successful_tools: Vec<String>,
    /// `transcript_event_count` from the harness summary (0 when missing).
    transcript_event_count: usize,
    /// `/verification/success` from the harness summary (false when missing).
    verification_success: bool,
    /// Exit code of the harness process (1 for pre-launch infra errors, 0 for skips).
    harn_exit_code: i32,
    /// Failure description for runs that did not pass; omitted from JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Excerpt of the harness stderr produced by `excerpt`; omitted from JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    stderr_excerpt: Option<String>,
    /// What happened to the local model after the run, for local providers.
    local_cleanup: Option<LocalCleanupReport>,
}
202
/// Describes what `LocalRunGuard::cleanup` did with a local model after a run.
#[derive(Debug, Clone, Serialize)]
struct LocalCleanupReport {
    /// Local provider id of the selector that was run.
    provider: String,
    /// Model name of the selector that was run.
    model: String,
    /// Whether the model was already listed as loaded in the pre-run snapshot
    /// (always false for non-Ollama providers).
    initially_loaded: bool,
    /// One of `not_applicable`, `left_running`, `left_preexisting`, `unloaded`,
    /// `unload_failed`.
    action: String,
    /// Extra context for the action (e.g. the unload error); omitted from JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    detail: Option<String>,
}
212
/// Comparison record pairing a `native` tool-format run with a `text` tool-format run
/// for one fixture and model selector.
///
/// NOTE(review): instances are produced by `compare_formats`, which is not visible in
/// this part of the file; the exact rules behind the `*_match`, delta and `equivalent`
/// fields should be confirmed there. The `native_*` / `text_*` fields are `Option`s,
/// presumably `None` when that side of the pair was not run.
#[derive(Debug, Clone, Serialize)]
struct FormatComparison {
    fixture_id: String,
    selector: ModelSelector,
    native_run_id: Option<String>,
    text_run_id: Option<String>,
    native_evidence_path: Option<String>,
    text_evidence_path: Option<String>,
    native_status: Option<String>,
    text_status: Option<String>,
    native_passed: Option<bool>,
    text_passed: Option<bool>,
    native_tool_call_count: Option<usize>,
    text_tool_call_count: Option<usize>,
    native_rejected_tool_call_count: Option<usize>,
    text_rejected_tool_call_count: Option<usize>,
    verifier_match: Option<bool>,
    tool_sequence_match: Option<bool>,
    rejected_tool_call_delta_text_minus_native: Option<i64>,
    token_delta_text_minus_native: Option<i64>,
    iteration_delta_text_minus_native: Option<i64>,
    equivalent: Option<bool>,
    /// Reasons the two runs diverged; `build_summary` counts a comparison as diverged
    /// when this list is non-empty.
    divergence_reasons: Vec<String>,
    evidence_paths: Vec<String>,
}
238
/// A suggested follow-up item attached to the eval summary.
///
/// NOTE(review): produced by `suggest_followups` (not visible here); the field names
/// suggest an issue-tracker style entry — confirm the intended consumer.
#[derive(Debug, Clone, Serialize)]
struct FollowupSuggestion {
    title: String,
    body: String,
    labels: Vec<String>,
    /// Ids of the runs this suggestion refers to.
    run_ids: Vec<String>,
}
246
/// Owned, serialisable copy of a `FixtureDefinition`, emitted in the summary's
/// `fixtures` list by `build_summary`.
#[derive(Debug, Clone, Serialize)]
struct FixtureReport {
    /// Copied from `FixtureDefinition::id`.
    id: String,
    /// Copied from `FixtureDefinition::name`.
    name: String,
    /// Copied from `FixtureDefinition::tool_sequence`.
    tool_sequence: String,
    /// Copied from `FixtureDefinition::description`.
    description: String,
}
254
/// Aggregated run counts and cost for one grouping key.
///
/// NOTE(review): populated by `build_rollups`, which is not visible here; the field
/// meanings below are taken from the names and mirror the totals on `EvalSummary`.
#[derive(Debug, Clone, Serialize)]
struct RollupReport {
    /// Value of the grouping dimension this row aggregates.
    key: String,
    total_runs: usize,
    passed_runs: usize,
    failed_runs: usize,
    skipped_runs: usize,
    total_cost_usd: f64,
}
264
/// Rollup tables for the eval, one list of `RollupReport` rows per grouping dimension.
///
/// NOTE(review): built by `build_rollups` (not visible here) — confirm row ordering and
/// how each dimension's key is derived.
#[derive(Debug, Clone, Serialize)]
struct EvalRollups {
    by_fixture: Vec<RollupReport>,
    by_provider: Vec<RollupReport>,
    by_model: Vec<RollupReport>,
    by_tool_format: Vec<RollupReport>,
    by_tool_sequence: Vec<RollupReport>,
}
273
/// Top-level result of one eval invocation; assembled by `build_summary` and written
/// out as the JSON artifact.
#[derive(Debug, Clone, Serialize)]
struct EvalSummary {
    /// Version of this summary layout (`build_summary` currently writes 3).
    schema_version: u32,
    /// Ids of the selected fixtures, in selection order.
    fixture_ids: Vec<String>,
    /// Full descriptions of the selected fixtures.
    fixtures: Vec<FixtureReport>,
    /// Directory that holds all artifacts of this eval.
    output_dir: String,
    /// Model selectors that were part of the matrix.
    models: Vec<ModelSelector>,
    /// Normalised tool formats that were part of the matrix.
    tool_formats: Vec<String>,
    /// Environment keys loaded from the env files given on the command line.
    env_keys_loaded: Vec<LoadedEnvKey>,
    /// Number of entries in `runs`.
    total_runs: usize,
    /// Runs with `passed == true`.
    passed_runs: usize,
    /// Runs that neither passed nor were skipped.
    failed_runs: usize,
    /// Runs with `skipped == true`.
    skipped_runs: usize,
    /// Number of comparisons whose `divergence_reasons` list is non-empty.
    diverged_comparisons: usize,
    /// Sum of `cost_usd` over all runs.
    total_cost_usd: f64,
    /// Aggregations of `runs` produced by `build_rollups`.
    rollups: EvalRollups,
    /// One report per matrix entry, in execution order.
    runs: Vec<RunReport>,
    /// Tool-format comparisons produced by `compare_formats`.
    comparisons: Vec<FormatComparison>,
    /// Parity summaries derived from `comparisons` by `build_parity_by_pair`.
    parity_by_pair: Vec<ToolModeParityPairSummary>,
    /// Follow-up suggestions produced by `suggest_followups`.
    followups: Vec<FollowupSuggestion>,
    /// The `--step-judge` value when it is neither empty nor `none`; omitted otherwise.
    #[serde(skip_serializing_if = "Option::is_none")]
    step_judge_preset: Option<String>,
    /// Caller-supplied run label; omitted from JSON when empty.
    #[serde(skip_serializing_if = "String::is_empty")]
    run_label: String,
    /// Comparison against a baseline summary, present only when a baseline path was
    /// given on the command line.
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_comparison: Option<BaselineComparison>,
}
311
/// Per-fixture diff between this eval (the "cell") and a previously written baseline
/// summary, as computed by `load_baseline_comparison`.
///
/// Each side is first reduced to one status per fixture id (`passed` wins over any
/// other status when a fixture has several runs).
#[derive(Debug, Clone, Serialize, Default)]
struct BaselineComparison {
    /// The baseline's non-empty `run_label`, else its `output_dir`, else an empty string.
    baseline_label: String,
    /// Path of the baseline JSON file that was read (`summary.json` is appended when a
    /// directory was given).
    baseline_path: String,
    /// Fixtures that passed in the baseline but not in the cell.
    regressions: Vec<FixtureStatusDelta>,
    /// Fixtures that did not pass in the baseline but pass in the cell.
    recoveries: Vec<FixtureStatusDelta>,
    /// Fixture ids that pass on both sides.
    unchanged_passes: Vec<String>,
    /// Fixture ids that pass on neither side.
    unchanged_failures: Vec<String>,
    /// Fixture ids present in the cell but absent from the baseline.
    missing_in_baseline: Vec<String>,
    /// Fixture ids present in the baseline but absent from the cell.
    missing_in_cell: Vec<String>,
    /// Length of `regressions`.
    regressions_count: usize,
    /// Length of `recoveries`.
    recoveries_count: usize,
    /// `(recoveries - regressions) / compared fixtures * 100`, rounded to one decimal;
    /// 0.0 when no fixture could be compared.
    net_lift_pp: f64,
}
335
/// One fixture whose pass/fail outcome differs between the baseline and the cell.
#[derive(Debug, Clone, Serialize)]
struct FixtureStatusDelta {
    /// Id of the fixture.
    fixture_id: String,
    /// Baseline-side status: `passed`, `failed` or `skipped`.
    baseline_status: String,
    /// Cell-side status: `passed`, `failed` or `skipped`.
    cell_status: String,
}
342
/// State captured before a run against a local provider so the model can be cleaned
/// up afterwards (see `LocalRunGuard::before` and `LocalRunGuard::cleanup`).
struct LocalRunGuard {
    /// Selector of the local model being run.
    selector: ModelSelector,
    /// Whether cleanup may unload the model after the run.
    stop_after: bool,
    /// Provider snapshot taken before the run; `None` when the snapshot call failed.
    snapshot: Option<LocalProviderSnapshot>,
}
348
/// Everything `report_from_summary` needs besides the harness summary JSON itself.
struct RunSummaryContext {
    /// Identifier of the run.
    run_id: String,
    /// Fixture that was executed.
    fixture: FixtureDefinition,
    /// Model selector that was executed.
    selector: ModelSelector,
    /// Tool format that was executed.
    tool_format: String,
    /// Directory holding the run's artifacts.
    run_dir: PathBuf,
    /// Time measured by the CLI around the harness execution, in milliseconds.
    elapsed_ms: u64,
    /// Exit code of the harness process.
    exit_code: i32,
    /// Captured stderr of the harness process.
    stderr: String,
    /// Result of the local-model cleanup, if the run targeted a local provider.
    local_cleanup: Option<LocalCleanupReport>,
}
360
361pub async fn run(args: EvalCodingAgentArgs) -> i32 {
362 let output_dir = args.output.clone().unwrap_or_else(default_output_dir);
363 if let Err(error) = fs::create_dir_all(&output_dir) {
364 eprintln!("error: failed to create {}: {error}", output_dir.display());
365 return 1;
366 }
367
368 let (_env_guard, env_keys_loaded) = match load_env_files(&args.env_files) {
369 Ok(loaded) => loaded,
370 Err(error) => {
371 eprintln!("error: {error}");
372 return 1;
373 }
374 };
375
376 let fixtures = match resolve_fixtures(&args.fixtures) {
377 Ok(fixtures) => fixtures,
378 Err(error) => {
379 eprintln!("error: {error}");
380 return 2;
381 }
382 };
383 let models = match resolve_models(&args).await {
384 Ok(models) => models,
385 Err(error) => {
386 eprintln!("error: {error}");
387 return 1;
388 }
389 };
390 let tool_formats = match normalize_tool_formats(&args.tool_formats) {
391 Ok(formats) => formats,
392 Err(error) => {
393 eprintln!("error: {error}");
394 return 2;
395 }
396 };
397 let matrix = build_matrix(&fixtures, &models, &tool_formats, args.max_runs);
398 if matrix.is_empty() {
399 eprintln!("error: no coding-agent benchmark runs selected");
400 return 2;
401 }
402
403 let mut reports = Vec::new();
404 let mut had_error = false;
405 for (fixture, selector, tool_format) in matrix {
406 let report = run_matrix_entry(&args, &output_dir, fixture, selector, tool_format).await;
407 if !report.passed && !report.skipped {
408 had_error = true;
409 }
410 if report.skipped && args.fail_on_unauthorized {
411 had_error = true;
412 }
413 eprintln!(
414 "{} {} {}: {}",
415 report.fixture_id,
416 selector_label(&report.selector),
417 report.tool_format,
418 report.status
419 );
420 reports.push(report);
421 }
422
423 let baseline_comparison = match &args.baseline_comparison_against {
424 Some(path) => match load_baseline_comparison(path, &reports) {
425 Ok(comparison) => Some(comparison),
426 Err(error) => {
427 eprintln!("error: --baseline-comparison-against: {error}");
428 return 1;
429 }
430 },
431 None => None,
432 };
433 let summary = build_summary(
434 &output_dir,
435 fixtures,
436 models,
437 tool_formats,
438 env_keys_loaded,
439 reports,
440 args.step_judge
441 .clone()
442 .filter(|s| !s.is_empty() && s != "none"),
443 args.run_label.clone(),
444 baseline_comparison,
445 );
446 if let Err(error) = write_json_artifacts(&output_dir, &summary) {
452 eprintln!("error: failed to write benchmark outputs: {error}");
453 return 1;
454 }
455
456 let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
460
461 if use_legacy {
462 if let Err(error) = write_markdown_artifacts_legacy(&output_dir, &summary) {
463 eprintln!("error: {error}");
464 return 1;
465 }
466 announce_output_paths(&output_dir);
467 if args.json {
468 print_json_legacy(&summary);
469 } else {
470 print_summary_legacy(&summary);
471 }
472 return i32::from(had_error);
473 }
474
475 if let Err(code) = write_markdown_artifacts_dispatch(&output_dir, &summary).await {
476 return code;
477 }
478 announce_output_paths(&output_dir);
479 if args.json {
480 if let Err(code) = print_json_dispatch(&summary).await {
481 return code;
482 }
483 } else if let Err(code) = print_summary_dispatch(&summary).await {
484 return code;
485 }
486
487 i32::from(had_error)
488}
489
490async fn run_matrix_entry(
491 args: &EvalCodingAgentArgs,
492 output_dir: &Path,
493 fixture: FixtureDefinition,
494 selector: ModelSelector,
495 tool_format: String,
496) -> RunReport {
497 let run_id = run_id_for(fixture, &selector, &tool_format);
498 let run_dir = output_dir.join(&run_id);
499 if let Err(error) = reset_dir(&run_dir) {
500 return error_report(
501 run_id,
502 fixture,
503 selector,
504 tool_format,
505 run_dir,
506 format!("failed to prepare run directory: {error}"),
507 );
508 }
509
510 if !provider_available(&selector) {
511 let reason = format!(
512 "provider `{}` has no configured credentials",
513 selector.provider
514 );
515 return skipped_report(run_id, fixture, selector, tool_format, run_dir, reason);
516 }
517
518 let script_path = run_dir.join("coding_agent_suite.harn");
519 if let Err(error) = fs::write(&script_path, CODING_AGENT_SUITE_HARN) {
520 return error_report(
521 run_id,
522 fixture,
523 selector,
524 tool_format,
525 run_dir,
526 format!("failed to write benchmark harness: {error}"),
527 );
528 }
529
530 let local_guard = LocalRunGuard::before(&selector, !args.keep_local_after_run).await;
531 let argv = script_argv(args, fixture, &selector, &tool_format, &run_dir);
532 let clock = RealClock::new();
533 let started_ms = clock.monotonic_ms();
534 let outcome = execute_run_with_sandbox_options(
535 &script_path.to_string_lossy(),
536 false,
537 HashSet::new(),
538 argv,
539 Vec::new(),
540 CliLlmMockMode::Off,
541 None,
542 RunProfileOptions::default(),
543 RunSandboxOptions::default().with_workspace_root(run_dir.clone()),
544 )
545 .await;
546 if let Some(line) = tool_format_override_warning_line(&outcome.stderr) {
547 eprintln!("{line}");
548 }
549 let elapsed_ms = clock
550 .monotonic_ms()
551 .saturating_sub(started_ms)
552 .try_into()
553 .unwrap_or(0);
554 let local_cleanup = if let Some(guard) = local_guard {
555 guard.cleanup().await
556 } else {
557 None
558 };
559
560 let summary_value =
561 read_run_summary(&run_dir).or_else(|| parse_last_json_line(&outcome.stdout));
562 let Some(summary) = summary_value else {
563 return RunReport {
564 run_id,
565 fixture_id: fixture.id.to_string(),
566 fixture_name: fixture.name.to_string(),
567 fixture_tool_sequence: fixture.tool_sequence.to_string(),
568 selector,
569 tool_format,
570 status: "infra_error".to_string(),
571 passed: false,
572 skipped: false,
573 skipped_reason: None,
574 output_dir: run_dir.display().to_string(),
575 transcript_events_path: run_dir
576 .join("transcript_events.jsonl")
577 .display()
578 .to_string(),
579 workspace_root: None,
580 elapsed_ms,
581 duration_ms: 0,
582 iterations: 0,
583 input_tokens: 0,
584 output_tokens: 0,
585 cost_usd: 0.0,
586 pricing_known: false,
587 tool_calls: 0,
588 rejected_tool_calls: 0,
589 tool_sequence: Vec::new(),
590 successful_tools: Vec::new(),
591 transcript_event_count: 0,
592 verification_success: false,
593 harn_exit_code: outcome.exit_code,
594 error: Some("benchmark harness produced no summary JSON".to_string()),
595 stderr_excerpt: excerpt(&outcome.stderr),
596 local_cleanup,
597 };
598 };
599
600 report_from_summary(
601 RunSummaryContext {
602 run_id,
603 fixture,
604 selector,
605 tool_format,
606 run_dir,
607 elapsed_ms,
608 exit_code: outcome.exit_code,
609 stderr: outcome.stderr,
610 local_cleanup,
611 },
612 summary,
613 )
614}
615
616fn report_from_summary(ctx: RunSummaryContext, summary: JsonValue) -> RunReport {
617 let passed = summary
618 .get("passed")
619 .and_then(JsonValue::as_bool)
620 .unwrap_or(false)
621 && ctx.exit_code == 0;
622 let input_tokens = summary
623 .pointer("/llm/input_tokens")
624 .and_then(JsonValue::as_i64)
625 .unwrap_or(0);
626 let output_tokens = summary
627 .pointer("/llm/output_tokens")
628 .and_then(JsonValue::as_i64)
629 .unwrap_or(0);
630 let pricing = harn_vm::llm::llm_pricing_per_1k(&ctx.selector.provider, &ctx.selector.model);
631 let cost_usd = pricing
632 .map(|(input, output)| {
633 (input_tokens.max(0) as f64 * input + output_tokens.max(0) as f64 * output) / 1000.0
634 })
635 .unwrap_or(0.0);
636 let status = if passed {
637 "passed".to_string()
638 } else if ctx.exit_code == 0 {
639 "failed".to_string()
640 } else {
641 summary
642 .get("status")
643 .and_then(JsonValue::as_str)
644 .unwrap_or("failed")
645 .to_string()
646 };
647 RunReport {
648 run_id: ctx.run_id,
649 fixture_id: ctx.fixture.id.to_string(),
650 fixture_name: ctx.fixture.name.to_string(),
651 fixture_tool_sequence: ctx.fixture.tool_sequence.to_string(),
652 selector: ctx.selector,
653 tool_format: ctx.tool_format,
654 status,
655 passed,
656 skipped: false,
657 skipped_reason: None,
658 output_dir: ctx.run_dir.display().to_string(),
659 transcript_events_path: ctx
660 .run_dir
661 .join("transcript_events.jsonl")
662 .display()
663 .to_string(),
664 workspace_root: summary
665 .get("workspace_root")
666 .and_then(JsonValue::as_str)
667 .map(str::to_string),
668 elapsed_ms: ctx.elapsed_ms,
669 duration_ms: summary
670 .get("duration_ms")
671 .and_then(JsonValue::as_u64)
672 .unwrap_or(ctx.elapsed_ms),
673 iterations: summary
674 .pointer("/llm/iterations")
675 .and_then(JsonValue::as_i64)
676 .unwrap_or(0),
677 input_tokens,
678 output_tokens,
679 cost_usd,
680 pricing_known: pricing.is_some(),
681 tool_calls: summary
682 .pointer("/tools/calls")
683 .and_then(JsonValue::as_array)
684 .map(Vec::len)
685 .unwrap_or(0),
686 rejected_tool_calls: summary
687 .pointer("/tools/rejected")
688 .and_then(JsonValue::as_array)
689 .map(Vec::len)
690 .unwrap_or(0),
691 tool_sequence: tool_call_sequence(summary.pointer("/tools/calls"))
692 .or_else(|| non_empty_string_array(summary.pointer("/tools/successful")))
693 .unwrap_or_default(),
694 successful_tools: string_array(summary.pointer("/tools/successful")),
695 transcript_event_count: summary
696 .get("transcript_event_count")
697 .and_then(JsonValue::as_u64)
698 .unwrap_or(0) as usize,
699 verification_success: summary
700 .pointer("/verification/success")
701 .and_then(JsonValue::as_bool)
702 .unwrap_or(false),
703 harn_exit_code: ctx.exit_code,
704 error: (!passed).then(|| {
705 summary
706 .get("status")
707 .and_then(JsonValue::as_str)
708 .unwrap_or("benchmark failed")
709 .to_string()
710 }),
711 stderr_excerpt: excerpt(&ctx.stderr),
712 local_cleanup: ctx.local_cleanup,
713 }
714}
715
716impl LocalRunGuard {
717 async fn before(selector: &ModelSelector, stop_after: bool) -> Option<Self> {
718 if !selector_is_local(selector) {
719 return None;
720 }
721 let snapshot = snapshot_provider(&selector.provider, Path::new("."))
722 .await
723 .ok();
724 Some(Self {
725 selector: selector.clone(),
726 stop_after,
727 snapshot,
728 })
729 }
730
731 async fn cleanup(self) -> Option<LocalCleanupReport> {
732 let snapshot = self.snapshot?;
733 if self.selector.provider != "ollama" {
734 return Some(LocalCleanupReport {
735 provider: self.selector.provider,
736 model: self.selector.model,
737 initially_loaded: false,
738 action: "not_applicable".to_string(),
739 detail: Some(
740 "non-Ollama local providers are only stopped when Harn launched a managed server"
741 .to_string(),
742 ),
743 });
744 }
745 let initially_loaded = snapshot
746 .loaded_models
747 .iter()
748 .any(|loaded| loaded.name == self.selector.model);
749 if !self.stop_after {
750 return Some(LocalCleanupReport {
751 provider: self.selector.provider,
752 model: self.selector.model,
753 initially_loaded,
754 action: "left_running".to_string(),
755 detail: Some("--keep-local-after-run".to_string()),
756 });
757 }
758 if initially_loaded {
759 return Some(LocalCleanupReport {
760 provider: self.selector.provider,
761 model: self.selector.model,
762 initially_loaded,
763 action: "left_preexisting".to_string(),
764 detail: None,
765 });
766 }
767 match ollama_unload_model(&snapshot.base_url, &self.selector.model).await {
768 Ok(()) => Some(LocalCleanupReport {
769 provider: self.selector.provider,
770 model: self.selector.model,
771 initially_loaded,
772 action: "unloaded".to_string(),
773 detail: None,
774 }),
775 Err(error) => Some(LocalCleanupReport {
776 provider: self.selector.provider,
777 model: self.selector.model,
778 initially_loaded,
779 action: "unload_failed".to_string(),
780 detail: Some(error),
781 }),
782 }
783 }
784}
785
786fn script_argv(
787 args: &EvalCodingAgentArgs,
788 fixture: FixtureDefinition,
789 selector: &ModelSelector,
790 tool_format: &str,
791 run_dir: &Path,
792) -> Vec<String> {
793 let mut argv = vec![
794 "--fixture".to_string(),
795 fixture.id.to_string(),
796 "--output-dir".to_string(),
797 run_dir.display().to_string(),
798 "--provider".to_string(),
799 selector.provider.clone(),
800 "--model".to_string(),
801 selector.model.clone(),
802 "--tool-format".to_string(),
803 tool_format.to_string(),
804 "--max-iterations".to_string(),
805 args.max_iterations.to_string(),
806 "--python".to_string(),
807 args.python.clone(),
808 ];
809 if selector.provider == "mock" {
810 argv.push("--seed-mock".to_string());
811 }
812 if let Some(json) = resolve_step_judge_json(args, selector) {
813 argv.push("--step-judge-json".to_string());
814 argv.push(json);
815 }
816 if let Some(reason) = args
817 .override_reason
818 .as_deref()
819 .map(str::trim)
820 .filter(|reason| !reason.is_empty())
821 {
822 argv.push("--override-reason".to_string());
823 argv.push(reason.to_string());
824 }
825 if let Some(json) = resolve_structural_validator_json(args) {
826 argv.push("--structural-validator-json".to_string());
827 argv.push(json);
828 }
829 argv
830}
831
832fn tool_format_override_warning_line(stderr: &str) -> Option<&str> {
833 stderr
834 .lines()
835 .map(str::trim)
836 .find(|line| line.starts_with(TOOL_FORMAT_OVERRIDE_WARNING_PREFIX))
837}
838
839fn error_report(
840 run_id: String,
841 fixture: FixtureDefinition,
842 selector: ModelSelector,
843 tool_format: String,
844 run_dir: PathBuf,
845 error: String,
846) -> RunReport {
847 RunReport {
848 run_id,
849 fixture_id: fixture.id.to_string(),
850 fixture_name: fixture.name.to_string(),
851 fixture_tool_sequence: fixture.tool_sequence.to_string(),
852 selector,
853 tool_format,
854 status: "infra_error".to_string(),
855 passed: false,
856 skipped: false,
857 skipped_reason: None,
858 output_dir: run_dir.display().to_string(),
859 transcript_events_path: run_dir
860 .join("transcript_events.jsonl")
861 .display()
862 .to_string(),
863 workspace_root: None,
864 elapsed_ms: 0,
865 duration_ms: 0,
866 iterations: 0,
867 input_tokens: 0,
868 output_tokens: 0,
869 cost_usd: 0.0,
870 pricing_known: false,
871 tool_calls: 0,
872 rejected_tool_calls: 0,
873 tool_sequence: Vec::new(),
874 successful_tools: Vec::new(),
875 transcript_event_count: 0,
876 verification_success: false,
877 harn_exit_code: 1,
878 error: Some(error),
879 stderr_excerpt: None,
880 local_cleanup: None,
881 }
882}
883
884fn skipped_report(
885 run_id: String,
886 fixture: FixtureDefinition,
887 selector: ModelSelector,
888 tool_format: String,
889 run_dir: PathBuf,
890 reason: String,
891) -> RunReport {
892 RunReport {
893 run_id,
894 fixture_id: fixture.id.to_string(),
895 fixture_name: fixture.name.to_string(),
896 fixture_tool_sequence: fixture.tool_sequence.to_string(),
897 selector,
898 tool_format,
899 status: "skipped".to_string(),
900 passed: false,
901 skipped: true,
902 skipped_reason: Some(reason),
903 output_dir: run_dir.display().to_string(),
904 transcript_events_path: run_dir
905 .join("transcript_events.jsonl")
906 .display()
907 .to_string(),
908 workspace_root: None,
909 elapsed_ms: 0,
910 duration_ms: 0,
911 iterations: 0,
912 input_tokens: 0,
913 output_tokens: 0,
914 cost_usd: 0.0,
915 pricing_known: false,
916 tool_calls: 0,
917 rejected_tool_calls: 0,
918 tool_sequence: Vec::new(),
919 successful_tools: Vec::new(),
920 transcript_event_count: 0,
921 verification_success: false,
922 harn_exit_code: 0,
923 error: None,
924 stderr_excerpt: None,
925 local_cleanup: None,
926 }
927}
928
929fn provider_available(selector: &ModelSelector) -> bool {
930 if matches!(selector.provider.as_str(), "mock" | "fake") || selector_is_local(selector) {
931 return true;
932 }
933 harn_vm::llm_config::provider_key_available(&selector.provider)
934}
935
936fn resolve_fixtures(raw_fixtures: &[String]) -> Result<Vec<FixtureDefinition>, String> {
937 let mut seen = BTreeSet::new();
938 let mut out = Vec::new();
939 for raw in raw_fixtures {
940 let fixture = raw.trim().to_ascii_lowercase();
941 if fixture.is_empty() {
942 continue;
943 }
944 if fixture == "all" {
945 return Ok(FIXTURE_DEFINITIONS.to_vec());
946 }
947 let Some(definition) = fixture_definition(&fixture) else {
948 return Err(format!(
949 "unsupported --fixture `{fixture}`; expected one of: all, {}",
950 FIXTURE_DEFINITIONS
951 .iter()
952 .map(|definition| definition.id)
953 .collect::<Vec<_>>()
954 .join(", ")
955 ));
956 };
957 if seen.insert(definition.id) {
958 out.push(definition);
959 }
960 }
961 if out.is_empty() {
962 return Err("at least one coding-agent fixture must be selected".to_string());
963 }
964 Ok(out)
965}
966
967fn fixture_definition(id: &str) -> Option<FixtureDefinition> {
968 FIXTURE_DEFINITIONS
969 .iter()
970 .copied()
971 .find(|definition| definition.id == id)
972}
973
974async fn resolve_models(args: &EvalCodingAgentArgs) -> Result<Vec<ModelSelector>, String> {
975 let mut seen = BTreeSet::new();
976 let mut out = Vec::new();
977 for raw in normalize_model_selector_args(&args.models) {
978 let trimmed = raw.trim();
979 if trimmed.is_empty() {
980 continue;
981 }
982 let selector = resolve_selector(trimmed);
983 if seen.insert(selector_label(&selector)) {
984 out.push(selector);
985 }
986 }
987 if args.include_local {
988 for selector in discover_local_models(args).await {
989 if seen.insert(selector_label(&selector)) {
990 out.push(selector);
991 }
992 }
993 }
994 Ok(out)
995}
996
/// Re-joins model selectors that arrived split across two arguments.
///
/// Every entry is trimmed. A `provider=…` entry immediately followed by a `model=…`
/// entry is merged into a single `provider=…,model=…` entry; all other entries are
/// passed through unchanged (including empty ones).
fn normalize_model_selector_args(raw_models: &[String]) -> Vec<String> {
    let mut merged = Vec::new();
    let mut entries = raw_models
        .iter()
        .map(String::as_str)
        .map(str::trim)
        .peekable();
    while let Some(entry) = entries.next() {
        if entry.starts_with("provider=") {
            // Consume the following entry only when it is the matching `model=` half.
            if let Some(model_part) = entries.next_if(|next| next.starts_with("model=")) {
                merged.push(format!("{entry},{model_part}"));
                continue;
            }
        }
        merged.push(entry.to_string());
    }
    merged
}
1015
1016async fn discover_local_models(args: &EvalCodingAgentArgs) -> Vec<ModelSelector> {
1017 let providers = if args.local_providers.is_empty() {
1018 local_provider_ids(None)
1019 } else {
1020 args.local_providers.clone()
1021 };
1022 let mut selectors = Vec::new();
1023 let mut seen = BTreeSet::new();
1024 for provider in providers {
1025 if selectors.len() >= args.max_local_models {
1026 break;
1027 }
1028 let Ok(snapshot) = snapshot_provider(&provider, Path::new(".")).await else {
1029 continue;
1030 };
1031 if !snapshot.reachable {
1032 continue;
1033 }
1034 let mut models = snapshot
1035 .loaded_models
1036 .iter()
1037 .map(|model| model.name.clone())
1038 .collect::<Vec<_>>();
1039 models.extend(snapshot.served_models);
1040 for model in models {
1041 if selectors.len() >= args.max_local_models {
1042 break;
1043 }
1044 let selector = ModelSelector {
1045 selector: format!("{provider}:{model}"),
1046 provider: provider.clone(),
1047 model,
1048 };
1049 if seen.insert(selector_label(&selector)) {
1050 selectors.push(selector);
1051 }
1052 }
1053 }
1054 selectors
1055}
1056
/// Normalises raw `--tool-format` values.
///
/// Values are trimmed and lower-cased; blank values are ignored and duplicates keep
/// their first position. Only `native` and `text` are accepted — the first other
/// value aborts with an error naming it.
fn normalize_tool_formats(raw_formats: &[String]) -> Result<Vec<String>, String> {
    let mut already_listed = BTreeSet::new();
    let mut normalized = Vec::new();
    let candidates = raw_formats
        .iter()
        .map(|raw| raw.trim().to_ascii_lowercase())
        .filter(|candidate| !candidate.is_empty());
    for format in candidates {
        if !matches!(format.as_str(), "native" | "text") {
            return Err(format!(
                "unsupported --tool-format `{format}`; expected `native` or `text`"
            ));
        }
        let first_occurrence = already_listed.insert(format.clone());
        if first_occurrence {
            normalized.push(format);
        }
    }
    Ok(normalized)
}
1076
1077fn build_matrix(
1078 fixtures: &[FixtureDefinition],
1079 models: &[ModelSelector],
1080 tool_formats: &[String],
1081 max_runs: Option<usize>,
1082) -> Vec<(FixtureDefinition, ModelSelector, String)> {
1083 if max_runs == Some(0) {
1084 return Vec::new();
1085 }
1086 let mut matrix = Vec::new();
1087 for fixture in fixtures {
1088 for selector in models {
1089 for tool_format in tool_formats {
1090 matrix.push((*fixture, selector.clone(), tool_format.clone()));
1091 if max_runs.is_some_and(|limit| matrix.len() >= limit) {
1092 return matrix;
1093 }
1094 }
1095 }
1096 }
1097 matrix
1098}
1099
1100#[allow(clippy::too_many_arguments)]
1101fn build_summary(
1102 output_dir: &Path,
1103 fixtures: Vec<FixtureDefinition>,
1104 models: Vec<ModelSelector>,
1105 tool_formats: Vec<String>,
1106 env_keys_loaded: Vec<LoadedEnvKey>,
1107 runs: Vec<RunReport>,
1108 step_judge_preset: Option<String>,
1109 run_label: String,
1110 baseline_comparison: Option<BaselineComparison>,
1111) -> EvalSummary {
1112 let passed_runs = runs.iter().filter(|run| run.passed).count();
1113 let skipped_runs = runs.iter().filter(|run| run.skipped).count();
1114 let failed_runs = runs
1115 .iter()
1116 .filter(|run| !run.passed && !run.skipped)
1117 .count();
1118 let total_cost_usd = runs.iter().map(|run| run.cost_usd).sum();
1119 let rollups = build_rollups(&runs);
1120 let comparisons = compare_formats(&runs);
1121 let parity_by_pair = build_parity_by_pair(&comparisons);
1122 let diverged_comparisons = comparisons
1123 .iter()
1124 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1125 .count();
1126 let followups = suggest_followups(&runs, &comparisons);
1127 EvalSummary {
1128 schema_version: 3,
1129 fixture_ids: fixtures
1130 .iter()
1131 .map(|fixture| fixture.id.to_string())
1132 .collect(),
1133 fixtures: fixtures
1134 .iter()
1135 .map(|fixture| FixtureReport {
1136 id: fixture.id.to_string(),
1137 name: fixture.name.to_string(),
1138 tool_sequence: fixture.tool_sequence.to_string(),
1139 description: fixture.description.to_string(),
1140 })
1141 .collect(),
1142 output_dir: output_dir.display().to_string(),
1143 models,
1144 tool_formats,
1145 env_keys_loaded,
1146 total_runs: runs.len(),
1147 passed_runs,
1148 failed_runs,
1149 skipped_runs,
1150 diverged_comparisons,
1151 total_cost_usd,
1152 rollups,
1153 runs,
1154 comparisons,
1155 parity_by_pair,
1156 followups,
1157 step_judge_preset,
1158 run_label,
1159 baseline_comparison,
1160 }
1161}
1162
1163fn load_baseline_comparison(path: &Path, runs: &[RunReport]) -> Result<BaselineComparison, String> {
1164 let resolved = if path.is_dir() {
1165 path.join("summary.json")
1166 } else {
1167 path.to_path_buf()
1168 };
1169 let raw = fs::read_to_string(&resolved)
1170 .map_err(|e| format!("failed to read {}: {e}", resolved.display()))?;
1171 let baseline: serde_json::Value = serde_json::from_str(&raw)
1172 .map_err(|e| format!("failed to parse {} as JSON: {e}", resolved.display()))?;
1173 let baseline_runs = baseline
1174 .get("runs")
1175 .and_then(|v| v.as_array())
1176 .ok_or_else(|| format!("{} has no `runs` array", resolved.display()))?;
1177 let mut baseline_status: BTreeMap<String, &str> = BTreeMap::new();
1181 for run in baseline_runs {
1182 let fixture_id = match run.get("fixture_id").and_then(|v| v.as_str()) {
1183 Some(id) => id.to_string(),
1184 None => continue,
1185 };
1186 let passed = run.get("passed").and_then(|v| v.as_bool()).unwrap_or(false);
1187 let skipped = run
1188 .get("skipped")
1189 .and_then(|v| v.as_bool())
1190 .unwrap_or(false);
1191 let status = if skipped {
1192 "skipped"
1193 } else if passed {
1194 "passed"
1195 } else {
1196 "failed"
1197 };
1198 baseline_status
1199 .entry(fixture_id)
1200 .and_modify(|existing| {
1201 if *existing != "passed" && status == "passed" {
1202 *existing = status;
1203 }
1204 })
1205 .or_insert(status);
1206 }
1207 let mut cell_status: BTreeMap<String, &str> = BTreeMap::new();
1208 for run in runs {
1209 let status = if run.skipped {
1210 "skipped"
1211 } else if run.passed {
1212 "passed"
1213 } else {
1214 "failed"
1215 };
1216 cell_status
1217 .entry(run.fixture_id.clone())
1218 .and_modify(|existing| {
1219 if *existing != "passed" && status == "passed" {
1220 *existing = status;
1221 }
1222 })
1223 .or_insert(status);
1224 }
1225 let mut regressions = Vec::new();
1226 let mut recoveries = Vec::new();
1227 let mut unchanged_passes = Vec::new();
1228 let mut unchanged_failures = Vec::new();
1229 let mut missing_in_baseline = Vec::new();
1230 let mut missing_in_cell = Vec::new();
1231 for (fixture, cell) in &cell_status {
1232 match baseline_status.get(fixture) {
1233 None => missing_in_baseline.push(fixture.clone()),
1234 Some(base) => match (*base, *cell) {
1235 ("passed", "passed") => unchanged_passes.push(fixture.clone()),
1236 ("passed", _) => regressions.push(FixtureStatusDelta {
1237 fixture_id: fixture.clone(),
1238 baseline_status: (*base).to_string(),
1239 cell_status: (*cell).to_string(),
1240 }),
1241 (_, "passed") => recoveries.push(FixtureStatusDelta {
1242 fixture_id: fixture.clone(),
1243 baseline_status: (*base).to_string(),
1244 cell_status: (*cell).to_string(),
1245 }),
1246 _ => unchanged_failures.push(fixture.clone()),
1247 },
1248 }
1249 }
1250 for fixture in baseline_status.keys() {
1251 if !cell_status.contains_key(fixture) {
1252 missing_in_cell.push(fixture.clone());
1253 }
1254 }
1255 let baseline_label = baseline
1256 .get("run_label")
1257 .and_then(|v| v.as_str())
1258 .filter(|s| !s.is_empty())
1259 .or_else(|| baseline.get("output_dir").and_then(|v| v.as_str()))
1260 .unwrap_or("")
1261 .to_string();
1262 let regressions_count = regressions.len();
1263 let recoveries_count = recoveries.len();
1264 let total_compared =
1265 regressions_count + recoveries_count + unchanged_passes.len() + unchanged_failures.len();
1266 let net_lift_pp = if total_compared == 0 {
1267 0.0
1268 } else {
1269 let raw =
1270 (recoveries_count as f64 - regressions_count as f64) / total_compared as f64 * 100.0;
1271 (raw * 10.0).round() / 10.0
1272 };
1273 Ok(BaselineComparison {
1274 baseline_label,
1275 baseline_path: resolved.display().to_string(),
1276 regressions,
1277 recoveries,
1278 unchanged_passes,
1279 unchanged_failures,
1280 missing_in_baseline,
1281 missing_in_cell,
1282 regressions_count,
1283 recoveries_count,
1284 net_lift_pp,
1285 })
1286}
1287
1288fn build_rollups(runs: &[RunReport]) -> EvalRollups {
1289 EvalRollups {
1290 by_fixture: rollup_by(runs, |run| run.fixture_id.clone()),
1291 by_provider: rollup_by(runs, |run| run.selector.provider.clone()),
1292 by_model: rollup_by(runs, |run| run.selector.model.clone()),
1293 by_tool_format: rollup_by(runs, |run| run.tool_format.clone()),
1294 by_tool_sequence: rollup_by(runs, |run| run.fixture_tool_sequence.clone()),
1295 }
1296}
1297
1298fn rollup_by<F>(runs: &[RunReport], key_for: F) -> Vec<RollupReport>
1299where
1300 F: Fn(&RunReport) -> String,
1301{
1302 let mut grouped: BTreeMap<String, RollupReport> = BTreeMap::new();
1303 for run in runs {
1304 let key = key_for(run);
1305 let entry = grouped.entry(key.clone()).or_insert_with(|| RollupReport {
1306 key,
1307 total_runs: 0,
1308 passed_runs: 0,
1309 failed_runs: 0,
1310 skipped_runs: 0,
1311 total_cost_usd: 0.0,
1312 });
1313 entry.total_runs += 1;
1314 if run.passed {
1315 entry.passed_runs += 1;
1316 } else if run.skipped {
1317 entry.skipped_runs += 1;
1318 } else {
1319 entry.failed_runs += 1;
1320 }
1321 entry.total_cost_usd += run.cost_usd;
1322 }
1323 grouped.into_values().collect()
1324}
1325
1326fn compare_formats(runs: &[RunReport]) -> Vec<FormatComparison> {
1327 let mut grouped: BTreeMap<String, Vec<&RunReport>> = BTreeMap::new();
1328 for run in runs {
1329 grouped
1330 .entry(format!(
1331 "{}\0{}",
1332 run.fixture_id,
1333 selector_label(&run.selector)
1334 ))
1335 .or_default()
1336 .push(run);
1337 }
1338 let mut out = Vec::new();
1339 for group in grouped.values() {
1340 let Some(first) = group.first() else {
1341 continue;
1342 };
1343 let native = group
1344 .iter()
1345 .find(|run| run.tool_format == "native")
1346 .copied();
1347 let text = group.iter().find(|run| run.tool_format == "text").copied();
1348 if native.is_none() && text.is_none() {
1349 continue;
1350 }
1351 let pair = native.zip(text);
1352 let mut divergence_reasons = Vec::new();
1353 if let Some((native, text)) = pair {
1354 if native.status != text.status {
1355 divergence_reasons.push(format!(
1356 "status differs: native={} text={}",
1357 native.status, text.status
1358 ));
1359 }
1360 if native.passed != text.passed {
1361 divergence_reasons.push(format!(
1362 "pass result differs: native={} text={}",
1363 native.passed, text.passed
1364 ));
1365 }
1366 if native.verification_success != text.verification_success {
1367 divergence_reasons.push(format!(
1368 "verifier result differs: native={} text={}",
1369 native.verification_success, text.verification_success
1370 ));
1371 }
1372 if native.tool_sequence != text.tool_sequence {
1373 divergence_reasons.push(format!(
1374 "tool sequence differs: native=[{}] text=[{}]",
1375 native.tool_sequence.join(", "),
1376 text.tool_sequence.join(", ")
1377 ));
1378 }
1379 if native.rejected_tool_calls != text.rejected_tool_calls {
1380 divergence_reasons.push(format!(
1381 "rejected tool-call recovery differs: native={} text={}",
1382 native.rejected_tool_calls, text.rejected_tool_calls
1383 ));
1384 }
1385 }
1386 let evidence_paths = [native, text]
1387 .into_iter()
1388 .flatten()
1389 .map(|run| run.transcript_events_path.clone())
1390 .collect::<Vec<_>>();
1391 out.push(FormatComparison {
1392 fixture_id: first.fixture_id.clone(),
1393 selector: first.selector.clone(),
1394 native_run_id: native.map(|run| run.run_id.clone()),
1395 text_run_id: text.map(|run| run.run_id.clone()),
1396 native_evidence_path: native.map(|run| run.transcript_events_path.clone()),
1397 text_evidence_path: text.map(|run| run.transcript_events_path.clone()),
1398 native_status: native.map(|run| run.status.clone()),
1399 text_status: text.map(|run| run.status.clone()),
1400 native_passed: native.map(|run| run.passed),
1401 text_passed: text.map(|run| run.passed),
1402 native_tool_call_count: native.map(|run| run.tool_calls),
1403 text_tool_call_count: text.map(|run| run.tool_calls),
1404 native_rejected_tool_call_count: native.map(|run| run.rejected_tool_calls),
1405 text_rejected_tool_call_count: text.map(|run| run.rejected_tool_calls),
1406 verifier_match: pair
1407 .map(|(native, text)| native.verification_success == text.verification_success),
1408 tool_sequence_match: pair
1409 .map(|(native, text)| native.tool_sequence == text.tool_sequence),
1410 rejected_tool_call_delta_text_minus_native: pair.map(|(native, text)| {
1411 text.rejected_tool_calls as i64 - native.rejected_tool_calls as i64
1412 }),
1413 token_delta_text_minus_native: pair.map(|(native, text)| {
1414 (text.input_tokens + text.output_tokens)
1415 - (native.input_tokens + native.output_tokens)
1416 }),
1417 iteration_delta_text_minus_native: pair
1418 .map(|(native, text)| text.iterations - native.iterations),
1419 equivalent: pair.map(|(native, text)| {
1420 native.status == text.status
1421 && native.passed == text.passed
1422 && native.skipped == text.skipped
1423 && native.verification_success == text.verification_success
1424 && native.tool_sequence == text.tool_sequence
1425 && native.rejected_tool_calls == text.rejected_tool_calls
1426 }),
1427 divergence_reasons,
1428 evidence_paths,
1429 });
1430 }
1431 out
1432}
1433
1434fn build_parity_by_pair(comparisons: &[FormatComparison]) -> Vec<ToolModeParityPairSummary> {
1435 let fixture_inputs = comparisons
1436 .iter()
1437 .filter_map(parity_fixture_input)
1438 .collect::<Vec<_>>();
1439 let fixture_reports = tool_mode_parity::build_fixture_reports(&fixture_inputs);
1440 tool_mode_parity::build_pair_summaries(&fixture_reports)
1441}
1442
1443fn parity_fixture_input(comparison: &FormatComparison) -> Option<ToolModeParityFixtureInput> {
1444 let native_verdict = comparison.native_status.clone()?;
1445 let text_verdict = comparison.text_status.clone()?;
1446 if native_verdict == "skipped" || text_verdict == "skipped" {
1447 return None;
1448 }
1449 Some(ToolModeParityFixtureInput {
1450 provider: comparison.selector.provider.clone(),
1451 model: comparison.selector.model.clone(),
1452 fixture_id: comparison.fixture_id.clone(),
1453 native_verdict,
1454 text_verdict,
1455 native_passed: comparison.native_passed?,
1456 text_passed: comparison.text_passed?,
1457 agreement: comparison.equivalent?,
1458 verifier_agreement: comparison.verifier_match?,
1459 native_tool_call_count: comparison.native_tool_call_count?,
1460 text_tool_call_count: comparison.text_tool_call_count?,
1461 native_rejected_tool_call_count: comparison.native_rejected_tool_call_count?,
1462 text_rejected_tool_call_count: comparison.text_rejected_tool_call_count?,
1463 native_evidence_path: comparison.native_evidence_path.clone()?,
1464 text_evidence_path: comparison.text_evidence_path.clone()?,
1465 })
1466}
1467
1468fn suggest_followups(
1469 runs: &[RunReport],
1470 comparisons: &[FormatComparison],
1471) -> Vec<FollowupSuggestion> {
1472 let mut out = Vec::new();
1473 let failed = runs
1474 .iter()
1475 .filter(|run| !run.passed && !run.skipped)
1476 .map(|run| run.run_id.clone())
1477 .collect::<Vec<_>>();
1478 if !failed.is_empty() {
1479 out.push(FollowupSuggestion {
1480 title: "Normalize coding-agent fixture failures across provider presets".to_string(),
1481 body: "One or more fixture/provider/tool-format runs failed. Inspect the run directories and decide whether the gap belongs in provider adapters, preset prompting, transcript handling, or host-tool ergonomics.".to_string(),
1482 labels: vec!["eval".to_string(), "providers".to_string()],
1483 run_ids: failed,
1484 });
1485 }
1486
1487 let rejected = runs
1488 .iter()
1489 .filter(|run| run.rejected_tool_calls > 0)
1490 .map(|run| run.run_id.clone())
1491 .collect::<Vec<_>>();
1492 if !rejected.is_empty() {
1493 out.push(FollowupSuggestion {
1494 title: "Abstract rejected tool-call recovery in agent transcripts".to_string(),
1495 body: "Some runs recovered after rejected tool calls. Add runtime support or preset guidance so harness authors can distinguish recoverable provider/tool-shape noise from user-relevant transcript events.".to_string(),
1496 labels: vec!["agents".to_string(), "transcripts".to_string()],
1497 run_ids: rejected,
1498 });
1499 }
1500
1501 let mismatched = comparisons
1502 .iter()
1503 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1504 .map(|comparison| {
1505 format!(
1506 "{}:{} ({})",
1507 comparison.fixture_id,
1508 selector_label(&comparison.selector),
1509 comparison.divergence_reasons.join("; ")
1510 )
1511 })
1512 .collect::<Vec<_>>();
1513 if !mismatched.is_empty() {
1514 let run_ids = comparisons
1515 .iter()
1516 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1517 .flat_map(|comparison| {
1518 [
1519 comparison.native_run_id.clone(),
1520 comparison.text_run_id.clone(),
1521 ]
1522 })
1523 .flatten()
1524 .collect::<Vec<_>>();
1525 out.push(FollowupSuggestion {
1526 title: "Make native/text tool modes behaviorally interchangeable for preset harnesses"
1527 .to_string(),
1528 body: format!(
1529 "Native and text tool modes diverged for: {}. The preset/runtime boundary should hide provider tool-channel differences where possible.",
1530 mismatched.join(", ")
1531 ),
1532 labels: vec!["agents".to_string(), "tools".to_string()],
1533 run_ids,
1534 });
1535 }
1536
1537 let unknown_pricing = runs
1538 .iter()
1539 .filter(|run| {
1540 !run.skipped
1541 && !run.pricing_known
1542 && !matches!(run.selector.provider.as_str(), "mock" | "fake")
1543 && !selector_is_local(&run.selector)
1544 })
1545 .map(|run| run.run_id.clone())
1546 .collect::<Vec<_>>();
1547 if !unknown_pricing.is_empty() {
1548 out.push(FollowupSuggestion {
1549 title: "Fill provider pricing metadata for benchmarked models".to_string(),
1550 body: "At least one live provider/model produced usage metrics but had no pricing entry, which weakens cost comparisons in eval reports.".to_string(),
1551 labels: vec!["providers".to_string(), "docs".to_string()],
1552 run_ids: unknown_pricing,
1553 });
1554 }
1555 out
1556}
1557
1558fn write_json_artifacts(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1559 write_json_pretty(&output_dir.join("summary.json"), summary)?;
1560 write_jsonl(&output_dir.join("per_run.jsonl"), &summary.runs)?;
1561 let summary_value = serde_json::to_value(summary).map_err(|error| error.to_string())?;
1562 let readiness = local_readiness::report_from_summary_json(
1563 &summary_value,
1564 output_dir.display().to_string(),
1565 )?;
1566 write_json_pretty(&output_dir.join("local_readiness.json"), &readiness)?;
1567 let generated_at = RealClock::new()
1568 .now_utc()
1569 .format(&time::format_description::well_known::Rfc3339)
1570 .map_err(|error| format!("failed to format parity overlay timestamp: {error}"))?;
1571 let parity_dir = output_dir.join(TOOL_MODE_PARITY_DIRECTORY);
1572 let parity_reports = tool_mode_parity::build_fixture_reports(
1573 &summary
1574 .comparisons
1575 .iter()
1576 .filter_map(parity_fixture_input)
1577 .collect::<Vec<_>>(),
1578 );
1579 for report in &parity_reports {
1580 let path = parity_dir
1581 .join(sanitize_id(&format!(
1582 "{}__{}:{}",
1583 report.fixture_id, report.provider, report.model
1584 )))
1585 .join("parity.json");
1586 tool_mode_parity::write_fixture_report(&path, report)?;
1587 }
1588 let overlay = tool_mode_parity::build_overlay(
1589 &summary.parity_by_pair,
1590 &generated_at,
1591 TOOL_MODE_PARITY_FIXTURE_SUITE,
1592 output_dir,
1593 );
1594 tool_mode_parity::write_overlay(
1595 &output_dir.join(TOOL_MODE_PARITY_OVERLAY_FILENAME),
1596 &overlay,
1597 )?;
1598 Ok(())
1599}
1600
1601fn announce_output_paths(output_dir: &Path) {
1602 eprintln!(
1603 "wrote {}, {}, {}, {}, {}, {}, and {}",
1604 output_dir.join("summary.json").display(),
1605 output_dir.join("per_run.jsonl").display(),
1606 output_dir.join("local_readiness.json").display(),
1607 output_dir.join(TOOL_MODE_PARITY_DIRECTORY).display(),
1608 output_dir.join(TOOL_MODE_PARITY_OVERLAY_FILENAME).display(),
1609 output_dir.join("summary.md").display(),
1610 output_dir.join("followups.md").display()
1611 );
1612}
1613
1614fn write_markdown_artifacts_legacy(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1617 fs::write(output_dir.join("summary.md"), render_markdown(summary))
1618 .map_err(|error| format!("failed to write summary.md: {error}"))?;
1619 fs::write(output_dir.join("followups.md"), render_followups(summary))
1620 .map_err(|error| format!("failed to write followups.md: {error}"))?;
1621 Ok(())
1622}
1623
1624fn print_summary_legacy(summary: &EvalSummary) {
1625 println!(
1626 "coding-agent eval: {}/{} passed, {} skipped, total_cost_usd={:.6}",
1627 summary.passed_runs, summary.total_runs, summary.skipped_runs, summary.total_cost_usd
1628 );
1629}
1630
1631fn print_json_legacy(summary: &EvalSummary) {
1632 match serde_json::to_string_pretty(summary) {
1633 Ok(payload) => println!("{payload}"),
1634 Err(error) => eprintln!("warning: failed to render summary JSON: {error}"),
1635 }
1636}
1637
1638async fn write_markdown_artifacts_dispatch(
1641 output_dir: &Path,
1642 summary: &EvalSummary,
1643) -> Result<(), i32> {
1644 let markdown = render_via_dispatch(summary, "markdown").await?;
1645 if let Err(error) = fs::write(output_dir.join("summary.md"), markdown) {
1646 eprintln!("error: failed to write summary.md: {error}");
1647 return Err(1);
1648 }
1649 let followups = render_via_dispatch(summary, "followups").await?;
1650 if let Err(error) = fs::write(output_dir.join("followups.md"), followups) {
1651 eprintln!("error: failed to write followups.md: {error}");
1652 return Err(1);
1653 }
1654 Ok(())
1655}
1656
1657async fn print_summary_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1658 let payload = render_via_dispatch(summary, "summary").await?;
1659 print!("{payload}");
1660 if !payload.ends_with('\n') {
1663 println!();
1664 }
1665 Ok(())
1666}
1667
1668async fn print_json_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1669 let payload = render_via_dispatch(summary, "json").await?;
1670 print!("{payload}");
1671 if !payload.ends_with('\n') {
1672 println!();
1673 }
1674 Ok(())
1675}
1676
1677async fn render_via_dispatch(summary: &EvalSummary, mode: &str) -> Result<String, i32> {
1687 let summary_json = match serde_json::to_string(summary) {
1688 Ok(json) => json,
1689 Err(error) => {
1690 eprintln!("error: failed to serialise EvalSummary for dispatch: {error}");
1691 return Err(1);
1692 }
1693 };
1694 let _guard = DISPATCH_RENDER_LOCK.lock().await;
1695 let _summary = ScopedEnvVar::set(CODING_AGENT_SUMMARY_ENV, &summary_json);
1696 let _mode = ScopedEnvVar::set(CODING_AGENT_MODE_ENV, mode);
1697
1698 let outcome = dispatch::run_embedded_script("eval/coding_agent", Vec::new(), false).await;
1699 if !outcome.stderr.is_empty() {
1700 let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
1701 }
1702 if outcome.exit_code != 0 {
1703 return Err(outcome.exit_code);
1704 }
1705 Ok(outcome.stdout)
1706}
1707
1708fn write_json_pretty<T: Serialize>(path: &Path, value: &T) -> Result<(), String> {
1709 let body = serde_json::to_string_pretty(value).map_err(|error| error.to_string())?;
1710 fs::write(path, format!("{body}\n")).map_err(|error| error.to_string())
1711}
1712
1713fn write_jsonl<T: Serialize>(path: &Path, items: &[T]) -> Result<(), String> {
1714 let mut body = String::new();
1715 for item in items {
1716 let line = serde_json::to_string(item).map_err(|error| error.to_string())?;
1717 body.push_str(&line);
1718 body.push('\n');
1719 }
1720 fs::write(path, body).map_err(|error| error.to_string())
1721}
1722
1723fn render_markdown(summary: &EvalSummary) -> String {
1724 let mut out = String::new();
1725 out.push_str("# Coding Agent Harness Quality Suite\n\n");
1726 out.push_str(&format!(
1727 "- fixtures: `{}`\n- passed: {}/{}\n- skipped: {}\n- total_cost_usd: {:.6}\n\n",
1728 summary.fixture_ids.join("`, `"),
1729 summary.passed_runs,
1730 summary.total_runs,
1731 summary.skipped_runs,
1732 summary.total_cost_usd
1733 ));
1734 render_rollup_table(&mut out, "By Fixture", &summary.rollups.by_fixture);
1735 render_rollup_table(&mut out, "By Provider", &summary.rollups.by_provider);
1736 render_rollup_table(&mut out, "By Model", &summary.rollups.by_model);
1737 render_rollup_table(&mut out, "By Tool Format", &summary.rollups.by_tool_format);
1738 render_rollup_table(
1739 &mut out,
1740 "By Tool Sequence",
1741 &summary.rollups.by_tool_sequence,
1742 );
1743
1744 out.push_str("\n## Runs\n\n");
1745 out.push_str("| fixture | run | provider | model | tool format | fixture sequence | tool calls | status | iterations | tokens | cost | transcript | output |\n");
1746 out.push_str("|---|---|---|---|---|---|---|---|---:|---:|---:|---|---|\n");
1747 for run in &summary.runs {
1748 let tool_sequence = if run.tool_sequence.is_empty() {
1749 "-".to_string()
1750 } else {
1751 run.tool_sequence.join(", ").replace('|', "\\|")
1752 };
1753 out.push_str(&format!(
1754 "| `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {:.6} | {} | `{}` |\n",
1755 run.fixture_id,
1756 run.run_id,
1757 run.selector.provider,
1758 run.selector.model.replace('|', "\\|"),
1759 run.tool_format,
1760 run.fixture_tool_sequence,
1761 tool_sequence,
1762 run.status,
1763 run.iterations,
1764 run.input_tokens + run.output_tokens,
1765 run.cost_usd,
1766 markdown_link(
1767 &run.transcript_event_count.to_string(),
1768 &run.transcript_events_path
1769 ),
1770 run.output_dir
1771 ));
1772 }
1773 if let Some(comparison) = &summary.baseline_comparison {
1774 out.push_str("\n## Baseline Comparison\n\n");
1775 out.push_str(&format!(
1776 "Compared against `{}`{}.\n\n",
1777 comparison.baseline_path,
1778 if comparison.baseline_label.is_empty() {
1779 String::new()
1780 } else {
1781 format!(" (label: `{}`)", comparison.baseline_label)
1782 },
1783 ));
1784 out.push_str(&format!(
1785 "- regressions: **{}** (baseline passed, this cell failed)\n- recoveries: **{}** (baseline failed, this cell passed)\n- net lift: **{:+.1}pp**\n\n",
1786 comparison.regressions_count,
1787 comparison.recoveries_count,
1788 comparison.net_lift_pp,
1789 ));
1790 if !comparison.regressions.is_empty() {
1791 out.push_str("### Regressions\n\n");
1792 for delta in &comparison.regressions {
1793 out.push_str(&format!(
1794 "- `{}`: `{}` → `{}`\n",
1795 delta.fixture_id, delta.baseline_status, delta.cell_status,
1796 ));
1797 }
1798 out.push('\n');
1799 }
1800 if !comparison.recoveries.is_empty() {
1801 out.push_str("### Recoveries\n\n");
1802 for delta in &comparison.recoveries {
1803 out.push_str(&format!(
1804 "- `{}`: `{}` → `{}`\n",
1805 delta.fixture_id, delta.baseline_status, delta.cell_status,
1806 ));
1807 }
1808 out.push('\n');
1809 }
1810 }
1811 if !summary.comparisons.is_empty() {
1812 out.push_str("\n## Native/Text Comparison\n\n");
1813 out.push_str("| fixture | selector | native | text | equivalent | verifier | tools | rejected delta | token delta | iteration delta | evidence |\n");
1814 out.push_str("|---|---|---|---|---|---|---|---:|---:|---:|---|\n");
1815 for comparison in &summary.comparisons {
1816 out.push_str(&format!(
1817 "| `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
1818 comparison.fixture_id,
1819 selector_label(&comparison.selector),
1820 comparison
1821 .native_status
1822 .clone()
1823 .unwrap_or_else(|| "-".to_string()),
1824 comparison
1825 .text_status
1826 .clone()
1827 .unwrap_or_else(|| "-".to_string()),
1828 optional_bool_mark(comparison.equivalent),
1829 optional_bool_mark(comparison.verifier_match),
1830 optional_bool_mark(comparison.tool_sequence_match),
1831 comparison
1832 .rejected_tool_call_delta_text_minus_native
1833 .map(|v| v.to_string())
1834 .unwrap_or_else(|| "-".to_string()),
1835 comparison
1836 .token_delta_text_minus_native
1837 .map(|v| v.to_string())
1838 .unwrap_or_else(|| "-".to_string()),
1839 comparison
1840 .iteration_delta_text_minus_native
1841 .map(|v| v.to_string())
1842 .unwrap_or_else(|| "-".to_string()),
1843 comparison_evidence_links(comparison)
1844 ));
1845 }
1846 }
1847 if !summary.parity_by_pair.is_empty() {
1848 out.push_str("\n## Parity report — native vs text\n\n");
1849 out.push_str("| selector | sample | native pass | text pass | agreement | verifier divergence | native_only | text_only | both_pass | both_fail |\n");
1850 out.push_str("|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|\n");
1851 for pair in &summary.parity_by_pair {
1852 out.push_str(&format!(
1853 "| `{}` | {} | {:.1}% | {:.1}% | {:.1}% | {:.1}% | {} | {} | {} | {} |\n",
1854 selector_label(&ModelSelector {
1855 selector: format!("{}:{}", pair.provider, pair.model),
1856 provider: pair.provider.clone(),
1857 model: pair.model.clone(),
1858 }),
1859 pair.sample_size,
1860 pair.native.pass_rate * 100.0,
1861 pair.text.pass_rate * 100.0,
1862 pair.agreement_rate * 100.0,
1863 pair.verifier_divergence_rate * 100.0,
1864 pair.divergence_counts.native_only_pass,
1865 pair.divergence_counts.text_only_pass,
1866 pair.divergence_counts.both_pass,
1867 pair.divergence_counts.both_fail,
1868 ));
1869 }
1870 }
1871 let diverged = summary
1872 .comparisons
1873 .iter()
1874 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1875 .collect::<Vec<_>>();
1876 if !diverged.is_empty() {
1877 out.push_str("\n## Native/Text Divergence Evidence\n\n");
1878 for comparison in diverged {
1879 out.push_str(&format!(
1880 "- `{}` `{}`: {}\n",
1881 comparison.fixture_id,
1882 selector_label(&comparison.selector),
1883 comparison.divergence_reasons.join("; ")
1884 ));
1885 if !comparison.evidence_paths.is_empty() {
1886 out.push_str(&format!(
1887 " Evidence: {}\n",
1888 comparison_evidence_links(comparison)
1889 ));
1890 }
1891 }
1892 }
1893 out
1894}
1895
1896fn render_rollup_table(out: &mut String, title: &str, rollups: &[RollupReport]) {
1897 out.push_str(&format!("## {title}\n\n"));
1898 out.push_str("| key | passed | failed | skipped | total | cost |\n");
1899 out.push_str("|---|---:|---:|---:|---:|---:|\n");
1900 for rollup in rollups {
1901 out.push_str(&format!(
1902 "| `{}` | {} | {} | {} | {} | {:.6} |\n",
1903 rollup.key.replace('|', "\\|"),
1904 rollup.passed_runs,
1905 rollup.failed_runs,
1906 rollup.skipped_runs,
1907 rollup.total_runs,
1908 rollup.total_cost_usd
1909 ));
1910 }
1911 out.push('\n');
1912}
1913
1914fn render_followups(summary: &EvalSummary) -> String {
1915 let mut out = String::new();
1916 out.push_str("# Follow-up Issue Candidates\n\n");
1917 if summary.followups.is_empty() {
1918 out.push_str("No follow-up issue candidates were generated from this run.\n");
1919 return out;
1920 }
1921 for followup in &summary.followups {
1922 out.push_str(&format!("## {}\n\n{}\n\n", followup.title, followup.body));
1923 if !followup.run_ids.is_empty() {
1924 out.push_str(&format!("- run_ids: `{}`\n", followup.run_ids.join("`, `")));
1925 }
1926 if !followup.labels.is_empty() {
1927 out.push_str(&format!("- labels: `{}`\n", followup.labels.join("`, `")));
1928 }
1929 out.push('\n');
1930 }
1931 out
1932}
1933
1934fn read_run_summary(run_dir: &Path) -> Option<JsonValue> {
1935 let raw = fs::read_to_string(run_dir.join("summary.json")).ok()?;
1936 serde_json::from_str(&raw).ok()
1937}
1938
1939fn parse_last_json_line(stdout: &str) -> Option<JsonValue> {
1940 stdout
1941 .lines()
1942 .rev()
1943 .map(str::trim)
1944 .filter(|line| !line.is_empty())
1945 .find_map(|line| serde_json::from_str::<JsonValue>(line).ok())
1946}
1947
1948fn string_array(value: Option<&JsonValue>) -> Vec<String> {
1949 value
1950 .and_then(JsonValue::as_array)
1951 .map(|values| {
1952 values
1953 .iter()
1954 .filter_map(JsonValue::as_str)
1955 .map(str::to_string)
1956 .collect()
1957 })
1958 .unwrap_or_default()
1959}
1960
1961fn non_empty_string_array(value: Option<&JsonValue>) -> Option<Vec<String>> {
1962 let values = string_array(value);
1963 (!values.is_empty()).then_some(values)
1964}
1965
1966fn tool_call_sequence(value: Option<&JsonValue>) -> Option<Vec<String>> {
1967 let calls = value.and_then(JsonValue::as_array)?;
1968 let mut sequence = Vec::new();
1969 for call in calls {
1970 if let Some(name) = call
1971 .get("name")
1972 .or_else(|| call.get("tool_name"))
1973 .and_then(JsonValue::as_str)
1974 {
1975 sequence.push(name.to_string());
1976 }
1977 }
1978 (!sequence.is_empty()).then_some(sequence)
1979}
1980
/// Renders an optional flag for a table cell: `yes`, `no`, or `-` when absent.
fn optional_bool_mark(value: Option<bool>) -> &'static str {
    value.map_or("-", |flag| if flag { "yes" } else { "no" })
}
1988
1989fn comparison_evidence_links(comparison: &FormatComparison) -> String {
1990 let mut links = Vec::new();
1991 if let Some(native) = comparison.native_evidence_path.as_deref() {
1992 links.push(markdown_link("native", native));
1993 }
1994 if let Some(text) = comparison.text_evidence_path.as_deref() {
1995 links.push(markdown_link("text", text));
1996 }
1997 if links.is_empty() {
1998 "-".to_string()
1999 } else {
2000 links.join("<br>")
2001 }
2002}
2003
/// Builds a markdown link `[label](target)` that is safe to place inside a table cell.
///
/// In `label`, `|` is backslash-escaped. In `target`, the characters that would break
/// the link or the surrounding table are percent-encoded in a single pass: space, `(`,
/// `)`, `|`, and `%` itself. Encoding `%` keeps a path that already contains a literal
/// `%20` (or similar) from being decoded into a different path by the renderer, and
/// encoding `|` keeps the target from splitting the table cell.
fn markdown_link(label: &str, target: &str) -> String {
    let mut encoded = String::with_capacity(target.len());
    for ch in target.chars() {
        match ch {
            '%' => encoded.push_str("%25"),
            ' ' => encoded.push_str("%20"),
            '(' => encoded.push_str("%28"),
            ')' => encoded.push_str("%29"),
            '|' => encoded.push_str("%7C"),
            other => encoded.push(other),
        }
    }
    format!("[{}]({})", label.replace('|', "\\|"), encoded)
}
2014
/// Leaves `path` as an empty directory: anything already there is removed recursively,
/// then the directory (and any missing parents) is created.
fn reset_dir(path: &Path) -> Result<(), String> {
    let describe = |error: std::io::Error| error.to_string();
    if path.exists() {
        fs::remove_dir_all(path).map_err(describe)?;
    }
    fs::create_dir_all(path).map_err(describe)
}
2021
2022fn run_id_for(fixture: FixtureDefinition, selector: &ModelSelector, tool_format: &str) -> String {
2023 sanitize_id(&format!(
2024 "{}__{}__{}",
2025 fixture.id,
2026 selector_label(selector),
2027 tool_format
2028 ))
2029}
2030
/// Turns `raw` into an identifier made only of ASCII letters, digits, `-` and `_`.
///
/// Every other character becomes `_`, and leading/trailing underscores are then trimmed,
/// so the result can be empty.
fn sanitize_id(raw: &str) -> String {
    let replaced: String = raw
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') {
                ch
            } else {
                '_'
            }
        })
        .collect();
    replaced.trim_matches('_').to_string()
}
2042
/// Returns the default output directory, `.harn-runs/coding-agent-bench/latest`, as a
/// relative path.
fn default_output_dir() -> PathBuf {
    [".harn-runs", "coding-agent-bench", "latest"]
        .iter()
        .collect()
}
2048
/// Returns a trimmed excerpt of `text`, capped at 4000 characters.
///
/// Returns `None` when `text` is empty or whitespace only. Text longer than the cap is
/// cut after its first 4000 characters and suffixed with `...`; shorter text is returned
/// unchanged. The cap is measured in characters throughout, so multi-byte text that fits
/// within it is never given a misleading `...` suffix.
fn excerpt(text: &str) -> Option<String> {
    const MAX_CHARS: usize = 4000;
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return None;
    }
    // The byte offset of the character at index MAX_CHARS exists only when the text is
    // longer than the cap; it is always a valid char boundary to slice at.
    match trimmed.char_indices().nth(MAX_CHARS) {
        None => Some(trimmed.to_string()),
        Some((cut, _)) => {
            let mut truncated = String::with_capacity(cut + 3);
            truncated.push_str(&trimmed[..cut]);
            truncated.push_str("...");
            Some(truncated)
        }
    }
}
2065
2066fn load_env_files(paths: &[PathBuf]) -> Result<(EnvOverlay, Vec<LoadedEnvKey>), String> {
2067 let mut previous = Vec::new();
2068 let mut loaded = Vec::new();
2069 let mut touched = BTreeSet::new();
2070 for path in paths {
2071 let path = expand_home(path);
2072 let raw = fs::read_to_string(&path)
2073 .map_err(|error| format!("failed to read env file {}: {error}", path.display()))?;
2074 for (line_no, line) in raw.lines().enumerate() {
2075 let Some((key, value)) = parse_env_line(line).map_err(|error| {
2076 format!("{}:{}: {error}", path.display(), line_no.saturating_add(1))
2077 })?
2078 else {
2079 continue;
2080 };
2081 if touched.insert(key.clone()) {
2082 previous.push((OsString::from(&key), std::env::var_os(&key)));
2083 }
2084 std::env::set_var(&key, value);
2085 loaded.push(LoadedEnvKey {
2086 key,
2087 source: path.display().to_string(),
2088 });
2089 }
2090 }
2091 Ok((EnvOverlay { previous }, loaded))
2092}
2093
2094fn parse_env_line(line: &str) -> Result<Option<(String, String)>, String> {
2095 let trimmed = line.trim();
2096 if trimmed.is_empty() || trimmed.starts_with('#') {
2097 return Ok(None);
2098 }
2099 let trimmed = trimmed.strip_prefix("export ").unwrap_or(trimmed).trim();
2100 let Some((key, value)) = trimmed.split_once('=') else {
2101 return Err("expected KEY=VALUE".to_string());
2102 };
2103 let key = key.trim();
2104 if key.is_empty() {
2105 return Err("empty key".to_string());
2106 }
2107 if !key
2108 .chars()
2109 .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
2110 {
2111 return Err(format!("invalid key `{key}`"));
2112 }
2113 Ok(Some((key.to_string(), unquote_env_value(value.trim()))))
2114}
2115
/// Strips one pair of matching surrounding quotes (`"…"` or `'…'`) from `value`.
///
/// Values that are not wrapped in a matching pair, including a lone quote character,
/// are returned unchanged. No escape sequences are interpreted.
fn unquote_env_value(value: &str) -> String {
    let unquoted = ['"', '\'']
        .iter()
        .find_map(|&quote| value.strip_prefix(quote)?.strip_suffix(quote));
    unquoted.unwrap_or(value).to_string()
}
2127
/// Expands a leading `~` or `~/` in `path` using the `HOME` environment variable.
///
/// Any other path, and any path when `HOME` is unset, is returned unchanged.
fn expand_home(path: &Path) -> PathBuf {
    let text = path.to_string_lossy();
    // Looked up lazily so `HOME` is only read for paths that actually start with `~`.
    let home_dir = || std::env::var_os("HOME").map(PathBuf::from);
    let expanded = if text == "~" {
        home_dir()
    } else {
        text.strip_prefix("~/")
            .and_then(|rest| home_dir().map(|home| home.join(rest)))
    };
    expanded.unwrap_or_else(|| path.to_path_buf())
}
2142
2143#[cfg(test)]
2144#[path = "eval_coding_agent_tests.rs"]
2145mod tests;