1use std::collections::{BTreeMap, BTreeSet, HashSet};
37use std::ffi::OsString;
38use std::fs;
39use std::io::Write as _;
40use std::path::{Path, PathBuf};
41
42use harn_vm::clock::{Clock, RealClock};
43use serde::Serialize;
44use serde_json::Value as JsonValue;
45
46use crate::cli::EvalCodingAgentArgs;
47use crate::commands::eval_model_selector::{
48 resolve_selector, selector_is_local, selector_label, ModelSelector,
49};
50use crate::commands::local::runtime::{
51 local_provider_ids, ollama_unload_model, snapshot_provider, LocalProviderSnapshot,
52};
53use crate::commands::local_readiness;
54use crate::commands::run::{execute_run, CliLlmMockMode, RunProfileOptions};
55use crate::dispatch;
56use crate::env_guard::ScopedEnvVar;
57
/// Name of the `HARN_EVAL_CODING_AGENT_SUMMARY_JSON` environment variable.
///
/// NOTE(review): not referenced in the visible part of this file; presumably
/// used to hand the summary JSON to the dispatch renderers. Confirm at the use
/// site.
const CODING_AGENT_SUMMARY_ENV: &str = "HARN_EVAL_CODING_AGENT_SUMMARY_JSON";

/// Name of the `HARN_EVAL_CODING_AGENT_MODE` environment variable.
///
/// NOTE(review): not referenced in the visible part of this file; confirm its
/// meaning at the use site.
const CODING_AGENT_MODE_ENV: &str = "HARN_EVAL_CODING_AGENT_MODE";

/// Process-wide async mutex, constructed at compile time via `const_new`.
///
/// NOTE(review): the name suggests it serializes dispatch-based rendering;
/// the locking sites are outside the visible source, so confirm there.
static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());

/// Benchmark harness script embedded into the binary at compile time.
///
/// `run_matrix_entry` writes this text into every run directory as
/// `coding_agent_suite.harn` and then executes that copy.
const CODING_AGENT_SUITE_HARN: &str = include_str!("../../assets/evals/coding_agent_suite.harn");
85
/// Static description of one built-in coding-agent benchmark fixture.
///
/// The type is `Copy` because every field is a `'static` string slice, so
/// definitions are passed around by value.
#[derive(Debug, Clone, Copy)]
struct FixtureDefinition {
    /// Stable identifier; matched against `--fixture` values and forwarded to
    /// the harness script.
    id: &'static str,
    /// Human-readable fixture title used in reports.
    name: &'static str,
    /// Tool-usage category; the built-in table uses `"multi-tool"`,
    /// `"one-tool"` and `"no-tool"`.
    tool_sequence: &'static str,
    /// One-sentence summary of what the fixture exercises.
    description: &'static str,
}
93
/// The built-in fixture catalogue.
///
/// `resolve_fixtures` returns this whole table (in this order) for
/// `--fixture all`, and `fixture_definition` looks individual entries up by
/// `id`. The ids are also listed in the "unsupported --fixture" error message.
static FIXTURE_DEFINITIONS: &[FixtureDefinition] = &[
    FixtureDefinition {
        id: "python-add",
        name: "Python add repair",
        tool_sequence: "multi-tool",
        description: "One-file Python bug fix verified by unittest output.",
    },
    FixtureDefinition {
        id: "cli-help-flag",
        name: "CLI help flag",
        tool_sequence: "multi-tool",
        description: "Add a tiny CLI flag, update help-facing docs, and verify behavior.",
    },
    FixtureDefinition {
        id: "test-output-first",
        name: "Test-output-first repair",
        tool_sequence: "multi-tool",
        description: "Run a failing test first, then edit the implementation and re-run it.",
    },
    FixtureDefinition {
        id: "docs-symbol-rename",
        name: "Docs symbol rename",
        tool_sequence: "multi-tool",
        description:
            "Update docs and an example after a symbol rename without touching implementation.",
    },
    FixtureDefinition {
        id: "read-only-audit",
        name: "Read-only audit",
        tool_sequence: "one-tool",
        description: "Inspect a file and report that no edits are needed.",
    },
    FixtureDefinition {
        id: "no-tool-diagnosis",
        name: "No-tool diagnosis",
        tool_sequence: "no-tool",
        description: "Answer from prompt-only context without any tools.",
    },
];
133
/// One entry of the summary's `env_keys_loaded` list.
///
/// Values are produced by `load_env_files` (outside the visible source).
#[derive(Debug, Clone, Serialize)]
struct LoadedEnvKey {
    /// Name of the environment variable.
    key: String,
    /// Where the key came from — presumably the env file path; confirm
    /// against `load_env_files`.
    source: String,
}
139
/// Records the prior state of environment variables so it can be put back.
///
/// Each entry pairs a variable name with the value it had before being
/// overridden (`None` when the variable was not set at all).
#[derive(Debug)]
struct EnvOverlay {
    previous: Vec<(OsString, Option<OsString>)>,
}

impl Drop for EnvOverlay {
    /// Restores every recorded variable, newest entry first.
    ///
    /// Walking the list in reverse means that when the same key was recorded
    /// more than once, the oldest recorded value is the one that ends up in
    /// the environment.
    fn drop(&mut self) {
        let newest_first = self.previous.iter().rev();
        for (key, saved) in newest_first {
            match saved {
                Some(original) => std::env::set_var(key, original),
                None => std::env::remove_var(key),
            }
        }
    }
}
156
/// Serialized result of one (fixture, model selector, tool format) run.
///
/// Built by `report_from_summary` for runs that produced a harness summary,
/// and by `error_report` / `skipped_report` / the fallback in
/// `run_matrix_entry` otherwise. Fields are serialized in declaration order.
#[derive(Debug, Clone, Serialize)]
struct RunReport {
    /// Identifier from `run_id_for`; also the name of the run directory.
    run_id: String,
    /// `FixtureDefinition::id` of the fixture that was run.
    fixture_id: String,
    /// `FixtureDefinition::name` of the fixture that was run.
    fixture_name: String,
    /// `FixtureDefinition::tool_sequence` (the fixture's expected category).
    fixture_tool_sequence: String,
    /// Provider/model the run targeted.
    selector: ModelSelector,
    /// Tool format used for the run (`native` or `text`).
    tool_format: String,
    /// Status label, e.g. `"passed"`, `"failed"`, `"skipped"`,
    /// `"infra_error"`, or a status string taken from the harness summary.
    status: String,
    /// True only when the harness reported a pass and exited with code 0.
    passed: bool,
    /// True when the run was not executed (see `skipped_reason`).
    skipped: bool,
    /// Why the run was skipped; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    skipped_reason: Option<String>,
    /// Run directory, rendered with `Path::display`.
    output_dir: String,
    /// Path of `transcript_events.jsonl` inside the run directory.
    transcript_events_path: String,
    /// `workspace_root` from the harness summary, when present.
    workspace_root: Option<String>,
    /// Time measured around `execute_run` by this command, in milliseconds.
    elapsed_ms: u64,
    /// `duration_ms` from the harness summary; falls back to `elapsed_ms`
    /// when a summary exists without that field, and is 0 for runs with no
    /// summary.
    duration_ms: u64,
    /// `/llm/iterations` from the harness summary.
    iterations: i64,
    /// `/llm/input_tokens` from the harness summary.
    input_tokens: i64,
    /// `/llm/output_tokens` from the harness summary.
    output_tokens: i64,
    /// Cost computed from the token counts and per-1k pricing; 0.0 when
    /// pricing is unknown.
    cost_usd: f64,
    /// Whether per-1k pricing was available for the provider/model.
    pricing_known: bool,
    /// Length of the summary's `/tools/calls` array.
    tool_calls: usize,
    /// Length of the summary's `/tools/rejected` array.
    rejected_tool_calls: usize,
    /// Tool names derived from `/tools/calls`, falling back to
    /// `/tools/successful`.
    tool_sequence: Vec<String>,
    /// Entries of the summary's `/tools/successful` array.
    successful_tools: Vec<String>,
    /// `transcript_event_count` from the harness summary.
    transcript_event_count: usize,
    /// `/verification/success` from the harness summary.
    verification_success: bool,
    /// Exit code of the harness script (1 for `error_report`, 0 for skips).
    harn_exit_code: i32,
    /// Failure description; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Excerpt of the harness stderr; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    stderr_excerpt: Option<String>,
    /// Outcome of local-provider cleanup, when a `LocalRunGuard` was active.
    local_cleanup: Option<LocalCleanupReport>,
}
193
/// What `LocalRunGuard::cleanup` did with a local model after a run.
#[derive(Debug, Clone, Serialize)]
struct LocalCleanupReport {
    /// Local provider id (e.g. `ollama`).
    provider: String,
    /// Model the run used.
    model: String,
    /// Whether the model was already loaded in the pre-run snapshot.
    initially_loaded: bool,
    /// One of `"not_applicable"`, `"left_running"`, `"left_preexisting"`,
    /// `"unloaded"` or `"unload_failed"`.
    action: String,
    /// Extra context for `action`; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    detail: Option<String>,
}
203
/// Comparison of the `native` and `text` tool-format runs.
///
/// Instances are produced by `compare_formats` (outside the visible source).
/// NOTE(review): the field set suggests one comparison per fixture/selector
/// pair with `None` for a side that has no run — confirm against
/// `compare_formats`.
#[derive(Debug, Clone, Serialize)]
struct FormatComparison {
    /// Fixture the compared runs belong to.
    fixture_id: String,
    /// Provider/model the compared runs belong to.
    selector: ModelSelector,
    /// Run id of the `native` run, if any.
    native_run_id: Option<String>,
    /// Run id of the `text` run, if any.
    text_run_id: Option<String>,
    /// Evidence path for the `native` run, if any.
    native_evidence_path: Option<String>,
    /// Evidence path for the `text` run, if any.
    text_evidence_path: Option<String>,
    /// Status of the `native` run, if any.
    native_status: Option<String>,
    /// Status of the `text` run, if any.
    text_status: Option<String>,
    /// Pass flag of the `native` run, if any.
    native_passed: Option<bool>,
    /// Pass flag of the `text` run, if any.
    text_passed: Option<bool>,
    /// Whether both runs agree on the verifier outcome.
    verifier_match: Option<bool>,
    /// Whether both runs used the same tool sequence.
    tool_sequence_match: Option<bool>,
    /// Rejected tool calls: text minus native.
    rejected_tool_call_delta_text_minus_native: Option<i64>,
    /// Token usage: text minus native.
    token_delta_text_minus_native: Option<i64>,
    /// Iterations: text minus native.
    iteration_delta_text_minus_native: Option<i64>,
    /// Overall equivalence verdict, when it could be determined.
    equivalent: Option<bool>,
    /// Reasons the formats diverged; `build_summary` counts a comparison as
    /// diverged when this list is non-empty.
    divergence_reasons: Vec<String>,
    /// Paths to supporting evidence.
    evidence_paths: Vec<String>,
}
225
/// A suggested follow-up item derived from the runs and comparisons.
///
/// Produced by `suggest_followups` (outside the visible source).
#[derive(Debug, Clone, Serialize)]
struct FollowupSuggestion {
    /// Short title of the suggestion.
    title: String,
    /// Longer description.
    body: String,
    /// Labels attached to the suggestion.
    labels: Vec<String>,
    /// Ids of the runs that motivated it.
    run_ids: Vec<String>,
}
233
/// Owned, serializable copy of a `FixtureDefinition`.
///
/// `build_summary` creates one per selected fixture, copying each field
/// verbatim.
#[derive(Debug, Clone, Serialize)]
struct FixtureReport {
    /// Fixture identifier.
    id: String,
    /// Human-readable fixture title.
    name: String,
    /// Tool-usage category of the fixture.
    tool_sequence: String,
    /// One-sentence fixture description.
    description: String,
}
241
/// Aggregated run counts and cost for one grouping key.
///
/// Produced by `build_rollups` (outside the visible source).
#[derive(Debug, Clone, Serialize)]
struct RollupReport {
    /// Value of the grouping dimension (which dimension depends on the
    /// `EvalRollups` field this entry is stored in).
    key: String,
    /// Number of runs in the group.
    total_runs: usize,
    /// Runs in the group that passed.
    passed_runs: usize,
    /// Runs in the group that failed.
    failed_runs: usize,
    /// Runs in the group that were skipped.
    skipped_runs: usize,
    /// Summed cost of the group's runs, in USD.
    total_cost_usd: f64,
}
251
/// Rollups of the run list along several dimensions.
///
/// Produced by `build_rollups` (outside the visible source).
#[derive(Debug, Clone, Serialize)]
struct EvalRollups {
    /// Grouped by fixture.
    by_fixture: Vec<RollupReport>,
    /// Grouped by provider.
    by_provider: Vec<RollupReport>,
    /// Grouped by model.
    by_model: Vec<RollupReport>,
    /// Grouped by tool format.
    by_tool_format: Vec<RollupReport>,
    /// Grouped by tool-sequence category.
    by_tool_sequence: Vec<RollupReport>,
}
260
/// Top-level benchmark summary assembled by `build_summary`.
///
/// `run` writes it out via `write_json_artifacts` and feeds it to the
/// markdown and console renderers.
#[derive(Debug, Clone, Serialize)]
struct EvalSummary {
    /// Schema version of this document; `build_summary` sets it to 2.
    schema_version: u32,
    /// Ids of the selected fixtures, in selection order.
    fixture_ids: Vec<String>,
    /// Full descriptions of the selected fixtures.
    fixtures: Vec<FixtureReport>,
    /// Root output directory of the benchmark.
    output_dir: String,
    /// Model selectors included in the matrix.
    models: Vec<ModelSelector>,
    /// Tool formats included in the matrix.
    tool_formats: Vec<String>,
    /// Environment keys reported by `load_env_files`.
    env_keys_loaded: Vec<LoadedEnvKey>,
    /// Number of entries in `runs`.
    total_runs: usize,
    /// Runs with `passed == true`.
    passed_runs: usize,
    /// Runs that neither passed nor were skipped.
    failed_runs: usize,
    /// Runs with `skipped == true`.
    skipped_runs: usize,
    /// Comparisons with at least one divergence reason.
    diverged_comparisons: usize,
    /// Sum of `cost_usd` over all runs.
    total_cost_usd: f64,
    /// Aggregations of `runs` along several dimensions.
    rollups: EvalRollups,
    /// Every run report, in execution order.
    runs: Vec<RunReport>,
    /// Native-vs-text comparisons from `compare_formats`.
    comparisons: Vec<FormatComparison>,
    /// Suggested follow-ups from `suggest_followups`.
    followups: Vec<FollowupSuggestion>,
    /// Step-judge preset requested on the command line; omitted from JSON
    /// when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    step_judge_preset: Option<String>,
    /// Caller-supplied label for this run; omitted from JSON when empty.
    #[serde(skip_serializing_if = "String::is_empty")]
    run_label: String,
    /// Comparison against a baseline summary; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_comparison: Option<BaselineComparison>,
}
297
/// Per-fixture diff between this invocation ("cell") and a baseline summary.
///
/// Built by `load_baseline_comparison`, which collapses each side to one
/// status per fixture (`"passed"` wins over any other status).
#[derive(Debug, Clone, Serialize, Default)]
struct BaselineComparison {
    /// Baseline `run_label`, or its `output_dir` when the label is missing or
    /// empty; empty string when neither is present.
    baseline_label: String,
    /// Path of the baseline summary file that was read.
    baseline_path: String,
    /// Fixtures that passed in the baseline but not in this cell.
    regressions: Vec<FixtureStatusDelta>,
    /// Fixtures that did not pass in the baseline but pass in this cell.
    recoveries: Vec<FixtureStatusDelta>,
    /// Fixture ids that pass on both sides.
    unchanged_passes: Vec<String>,
    /// Fixture ids that pass on neither side.
    unchanged_failures: Vec<String>,
    /// Fixture ids present in this cell but absent from the baseline.
    missing_in_baseline: Vec<String>,
    /// Fixture ids present in the baseline but absent from this cell.
    missing_in_cell: Vec<String>,
    /// Length of `regressions`.
    regressions_count: usize,
    /// Length of `recoveries`.
    recoveries_count: usize,
    /// `(recoveries - regressions) / compared fixtures * 100`, rounded to one
    /// decimal place; 0.0 when no fixture could be compared.
    net_lift_pp: f64,
}
321
/// A fixture whose collapsed status differs between baseline and cell.
///
/// Both status fields hold `"passed"`, `"failed"` or `"skipped"`.
#[derive(Debug, Clone, Serialize)]
struct FixtureStatusDelta {
    /// Fixture the delta applies to.
    fixture_id: String,
    /// Collapsed status in the baseline summary.
    baseline_status: String,
    /// Collapsed status in the current invocation.
    cell_status: String,
}
328
/// State captured before a run against a local provider, consumed afterwards
/// by `LocalRunGuard::cleanup`.
struct LocalRunGuard {
    /// Provider/model the run targets.
    selector: ModelSelector,
    /// Whether the model may be unloaded after the run (the inverse of
    /// `--keep-local-after-run`).
    stop_after: bool,
    /// Provider snapshot taken before the run; `None` when taking it failed.
    snapshot: Option<LocalProviderSnapshot>,
}
334
/// Everything `report_from_summary` needs besides the harness summary JSON.
struct RunSummaryContext {
    /// Identifier of the run (also its directory name).
    run_id: String,
    /// Fixture that was run.
    fixture: FixtureDefinition,
    /// Provider/model the run targeted.
    selector: ModelSelector,
    /// Tool format used for the run.
    tool_format: String,
    /// Directory holding the run's artifacts.
    run_dir: PathBuf,
    /// Time measured around `execute_run`, in milliseconds.
    elapsed_ms: u64,
    /// Exit code of the harness script.
    exit_code: i32,
    /// Captured stderr of the harness script.
    stderr: String,
    /// Result of local-provider cleanup, if a guard was active.
    local_cleanup: Option<LocalCleanupReport>,
}
346
347pub async fn run(args: EvalCodingAgentArgs) -> i32 {
348 let output_dir = args.output.clone().unwrap_or_else(default_output_dir);
349 if let Err(error) = fs::create_dir_all(&output_dir) {
350 eprintln!("error: failed to create {}: {error}", output_dir.display());
351 return 1;
352 }
353
354 let (_env_guard, env_keys_loaded) = match load_env_files(&args.env_files) {
355 Ok(loaded) => loaded,
356 Err(error) => {
357 eprintln!("error: {error}");
358 return 1;
359 }
360 };
361
362 let fixtures = match resolve_fixtures(&args.fixtures) {
363 Ok(fixtures) => fixtures,
364 Err(error) => {
365 eprintln!("error: {error}");
366 return 2;
367 }
368 };
369 let models = match resolve_models(&args).await {
370 Ok(models) => models,
371 Err(error) => {
372 eprintln!("error: {error}");
373 return 1;
374 }
375 };
376 let tool_formats = match normalize_tool_formats(&args.tool_formats) {
377 Ok(formats) => formats,
378 Err(error) => {
379 eprintln!("error: {error}");
380 return 2;
381 }
382 };
383 let matrix = build_matrix(&fixtures, &models, &tool_formats, args.max_runs);
384 if matrix.is_empty() {
385 eprintln!("error: no coding-agent benchmark runs selected");
386 return 2;
387 }
388
389 let mut reports = Vec::new();
390 let mut had_error = false;
391 for (fixture, selector, tool_format) in matrix {
392 let report = run_matrix_entry(&args, &output_dir, fixture, selector, tool_format).await;
393 if !report.passed && !report.skipped {
394 had_error = true;
395 }
396 if report.skipped && args.fail_on_unauthorized {
397 had_error = true;
398 }
399 eprintln!(
400 "{} {} {}: {}",
401 report.fixture_id,
402 selector_label(&report.selector),
403 report.tool_format,
404 report.status
405 );
406 reports.push(report);
407 }
408
409 let baseline_comparison = match &args.baseline_comparison_against {
410 Some(path) => match load_baseline_comparison(path, &reports) {
411 Ok(comparison) => Some(comparison),
412 Err(error) => {
413 eprintln!("error: --baseline-comparison-against: {error}");
414 return 1;
415 }
416 },
417 None => None,
418 };
419 let summary = build_summary(
420 &output_dir,
421 fixtures,
422 models,
423 tool_formats,
424 env_keys_loaded,
425 reports,
426 args.step_judge
427 .clone()
428 .filter(|s| !s.is_empty() && s != "none"),
429 args.run_label.clone(),
430 baseline_comparison,
431 );
432 if let Err(error) = write_json_artifacts(&output_dir, &summary) {
438 eprintln!("error: failed to write benchmark outputs: {error}");
439 return 1;
440 }
441
442 let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
446
447 if use_legacy {
448 if let Err(error) = write_markdown_artifacts_legacy(&output_dir, &summary) {
449 eprintln!("error: {error}");
450 return 1;
451 }
452 announce_output_paths(&output_dir);
453 if args.json {
454 print_json_legacy(&summary);
455 } else {
456 print_summary_legacy(&summary);
457 }
458 return if had_error { 1 } else { 0 };
459 }
460
461 if let Err(code) = write_markdown_artifacts_dispatch(&output_dir, &summary).await {
462 return code;
463 }
464 announce_output_paths(&output_dir);
465 if args.json {
466 if let Err(code) = print_json_dispatch(&summary).await {
467 return code;
468 }
469 } else if let Err(code) = print_summary_dispatch(&summary).await {
470 return code;
471 }
472
473 if had_error {
474 1
475 } else {
476 0
477 }
478}
479
480async fn run_matrix_entry(
481 args: &EvalCodingAgentArgs,
482 output_dir: &Path,
483 fixture: FixtureDefinition,
484 selector: ModelSelector,
485 tool_format: String,
486) -> RunReport {
487 let run_id = run_id_for(fixture, &selector, &tool_format);
488 let run_dir = output_dir.join(&run_id);
489 if let Err(error) = reset_dir(&run_dir) {
490 return error_report(
491 run_id,
492 fixture,
493 selector,
494 tool_format,
495 run_dir,
496 format!("failed to prepare run directory: {error}"),
497 );
498 }
499
500 if !provider_available(&selector) {
501 let reason = format!(
502 "provider `{}` has no configured credentials",
503 selector.provider
504 );
505 return skipped_report(run_id, fixture, selector, tool_format, run_dir, reason);
506 }
507
508 let script_path = run_dir.join("coding_agent_suite.harn");
509 if let Err(error) = fs::write(&script_path, CODING_AGENT_SUITE_HARN) {
510 return error_report(
511 run_id,
512 fixture,
513 selector,
514 tool_format,
515 run_dir,
516 format!("failed to write benchmark harness: {error}"),
517 );
518 }
519
520 let local_guard = LocalRunGuard::before(&selector, !args.keep_local_after_run).await;
521 let argv = script_argv(args, fixture, &selector, &tool_format, &run_dir);
522 let clock = RealClock::new();
523 let started_ms = clock.monotonic_ms();
524 let outcome = execute_run(
525 &script_path.to_string_lossy(),
526 false,
527 HashSet::new(),
528 argv,
529 Vec::new(),
530 CliLlmMockMode::Off,
531 None,
532 RunProfileOptions::default(),
533 )
534 .await;
535 let elapsed_ms = clock
536 .monotonic_ms()
537 .saturating_sub(started_ms)
538 .try_into()
539 .unwrap_or(0);
540 let local_cleanup = if let Some(guard) = local_guard {
541 guard.cleanup().await
542 } else {
543 None
544 };
545
546 let summary_value =
547 read_run_summary(&run_dir).or_else(|| parse_last_json_line(&outcome.stdout));
548 let Some(summary) = summary_value else {
549 return RunReport {
550 run_id,
551 fixture_id: fixture.id.to_string(),
552 fixture_name: fixture.name.to_string(),
553 fixture_tool_sequence: fixture.tool_sequence.to_string(),
554 selector,
555 tool_format,
556 status: "infra_error".to_string(),
557 passed: false,
558 skipped: false,
559 skipped_reason: None,
560 output_dir: run_dir.display().to_string(),
561 transcript_events_path: run_dir
562 .join("transcript_events.jsonl")
563 .display()
564 .to_string(),
565 workspace_root: None,
566 elapsed_ms,
567 duration_ms: 0,
568 iterations: 0,
569 input_tokens: 0,
570 output_tokens: 0,
571 cost_usd: 0.0,
572 pricing_known: false,
573 tool_calls: 0,
574 rejected_tool_calls: 0,
575 tool_sequence: Vec::new(),
576 successful_tools: Vec::new(),
577 transcript_event_count: 0,
578 verification_success: false,
579 harn_exit_code: outcome.exit_code,
580 error: Some("benchmark harness produced no summary JSON".to_string()),
581 stderr_excerpt: excerpt(&outcome.stderr),
582 local_cleanup,
583 };
584 };
585
586 report_from_summary(
587 RunSummaryContext {
588 run_id,
589 fixture,
590 selector,
591 tool_format,
592 run_dir,
593 elapsed_ms,
594 exit_code: outcome.exit_code,
595 stderr: outcome.stderr,
596 local_cleanup,
597 },
598 summary,
599 )
600}
601
602fn report_from_summary(ctx: RunSummaryContext, summary: JsonValue) -> RunReport {
603 let passed = summary
604 .get("passed")
605 .and_then(JsonValue::as_bool)
606 .unwrap_or(false)
607 && ctx.exit_code == 0;
608 let input_tokens = summary
609 .pointer("/llm/input_tokens")
610 .and_then(JsonValue::as_i64)
611 .unwrap_or(0);
612 let output_tokens = summary
613 .pointer("/llm/output_tokens")
614 .and_then(JsonValue::as_i64)
615 .unwrap_or(0);
616 let pricing = harn_vm::llm::llm_pricing_per_1k(&ctx.selector.provider, &ctx.selector.model);
617 let cost_usd = pricing
618 .map(|(input, output)| {
619 (input_tokens.max(0) as f64 * input + output_tokens.max(0) as f64 * output) / 1000.0
620 })
621 .unwrap_or(0.0);
622 let status = if passed {
623 "passed".to_string()
624 } else if ctx.exit_code == 0 {
625 "failed".to_string()
626 } else {
627 summary
628 .get("status")
629 .and_then(JsonValue::as_str)
630 .unwrap_or("failed")
631 .to_string()
632 };
633 RunReport {
634 run_id: ctx.run_id,
635 fixture_id: ctx.fixture.id.to_string(),
636 fixture_name: ctx.fixture.name.to_string(),
637 fixture_tool_sequence: ctx.fixture.tool_sequence.to_string(),
638 selector: ctx.selector,
639 tool_format: ctx.tool_format,
640 status,
641 passed,
642 skipped: false,
643 skipped_reason: None,
644 output_dir: ctx.run_dir.display().to_string(),
645 transcript_events_path: ctx
646 .run_dir
647 .join("transcript_events.jsonl")
648 .display()
649 .to_string(),
650 workspace_root: summary
651 .get("workspace_root")
652 .and_then(JsonValue::as_str)
653 .map(str::to_string),
654 elapsed_ms: ctx.elapsed_ms,
655 duration_ms: summary
656 .get("duration_ms")
657 .and_then(JsonValue::as_u64)
658 .unwrap_or(ctx.elapsed_ms),
659 iterations: summary
660 .pointer("/llm/iterations")
661 .and_then(JsonValue::as_i64)
662 .unwrap_or(0),
663 input_tokens,
664 output_tokens,
665 cost_usd,
666 pricing_known: pricing.is_some(),
667 tool_calls: summary
668 .pointer("/tools/calls")
669 .and_then(JsonValue::as_array)
670 .map(Vec::len)
671 .unwrap_or(0),
672 rejected_tool_calls: summary
673 .pointer("/tools/rejected")
674 .and_then(JsonValue::as_array)
675 .map(Vec::len)
676 .unwrap_or(0),
677 tool_sequence: tool_call_sequence(summary.pointer("/tools/calls"))
678 .or_else(|| non_empty_string_array(summary.pointer("/tools/successful")))
679 .unwrap_or_default(),
680 successful_tools: string_array(summary.pointer("/tools/successful")),
681 transcript_event_count: summary
682 .get("transcript_event_count")
683 .and_then(JsonValue::as_u64)
684 .unwrap_or(0) as usize,
685 verification_success: summary
686 .pointer("/verification/success")
687 .and_then(JsonValue::as_bool)
688 .unwrap_or(false),
689 harn_exit_code: ctx.exit_code,
690 error: (!passed).then(|| {
691 summary
692 .get("status")
693 .and_then(JsonValue::as_str)
694 .unwrap_or("benchmark failed")
695 .to_string()
696 }),
697 stderr_excerpt: excerpt(&ctx.stderr),
698 local_cleanup: ctx.local_cleanup,
699 }
700}
701
702impl LocalRunGuard {
703 async fn before(selector: &ModelSelector, stop_after: bool) -> Option<Self> {
704 if !selector_is_local(selector) {
705 return None;
706 }
707 let snapshot = snapshot_provider(&selector.provider, Path::new("."))
708 .await
709 .ok();
710 Some(Self {
711 selector: selector.clone(),
712 stop_after,
713 snapshot,
714 })
715 }
716
717 async fn cleanup(self) -> Option<LocalCleanupReport> {
718 let snapshot = self.snapshot?;
719 if self.selector.provider != "ollama" {
720 return Some(LocalCleanupReport {
721 provider: self.selector.provider,
722 model: self.selector.model,
723 initially_loaded: false,
724 action: "not_applicable".to_string(),
725 detail: Some(
726 "non-Ollama local providers are only stopped when Harn launched a managed server"
727 .to_string(),
728 ),
729 });
730 }
731 let initially_loaded = snapshot
732 .loaded_models
733 .iter()
734 .any(|loaded| loaded.name == self.selector.model);
735 if !self.stop_after {
736 return Some(LocalCleanupReport {
737 provider: self.selector.provider,
738 model: self.selector.model,
739 initially_loaded,
740 action: "left_running".to_string(),
741 detail: Some("--keep-local-after-run".to_string()),
742 });
743 }
744 if initially_loaded {
745 return Some(LocalCleanupReport {
746 provider: self.selector.provider,
747 model: self.selector.model,
748 initially_loaded,
749 action: "left_preexisting".to_string(),
750 detail: None,
751 });
752 }
753 match ollama_unload_model(&snapshot.base_url, &self.selector.model).await {
754 Ok(()) => Some(LocalCleanupReport {
755 provider: self.selector.provider,
756 model: self.selector.model,
757 initially_loaded,
758 action: "unloaded".to_string(),
759 detail: None,
760 }),
761 Err(error) => Some(LocalCleanupReport {
762 provider: self.selector.provider,
763 model: self.selector.model,
764 initially_loaded,
765 action: "unload_failed".to_string(),
766 detail: Some(error),
767 }),
768 }
769 }
770}
771
772fn script_argv(
773 args: &EvalCodingAgentArgs,
774 fixture: FixtureDefinition,
775 selector: &ModelSelector,
776 tool_format: &str,
777 run_dir: &Path,
778) -> Vec<String> {
779 let mut argv = vec![
780 "--fixture".to_string(),
781 fixture.id.to_string(),
782 "--output-dir".to_string(),
783 run_dir.display().to_string(),
784 "--provider".to_string(),
785 selector.provider.clone(),
786 "--model".to_string(),
787 selector.model.clone(),
788 "--tool-format".to_string(),
789 tool_format.to_string(),
790 "--max-iterations".to_string(),
791 args.max_iterations.to_string(),
792 "--python".to_string(),
793 args.python.clone(),
794 ];
795 if selector.provider == "mock" {
796 argv.push("--seed-mock".to_string());
797 }
798 if let Some(json) = resolve_step_judge_json(args, selector) {
799 argv.push("--step-judge-json".to_string());
800 argv.push(json);
801 }
802 argv
803}
804
805fn resolve_step_judge_json(args: &EvalCodingAgentArgs, selector: &ModelSelector) -> Option<String> {
817 let raw = args.step_judge.as_deref()?.trim();
818 if raw.is_empty() || raw.eq_ignore_ascii_case("none") {
819 return None;
820 }
821 let mut obj = serde_json::Map::new();
822 if let Some(rest) = raw.strip_prefix("custom:") {
823 match serde_json::from_str::<JsonValue>(rest) {
824 Ok(JsonValue::Object(map)) => obj.extend(map),
825 _ => {
826 obj.insert(
829 "model".to_string(),
830 JsonValue::String("__invalid_custom_step_judge__".to_string()),
831 );
832 }
833 }
834 } else {
835 match raw {
836 "symmetric-cheap" | "symmetric-strong" => {
837 obj.insert(
838 "model".to_string(),
839 JsonValue::String(selector.model.clone()),
840 );
841 obj.insert(
842 "provider".to_string(),
843 JsonValue::String(selector.provider.clone()),
844 );
845 }
846 "asymmetric" => {
847 obj.insert(
848 "model".to_string(),
849 JsonValue::String("anthropic/claude-sonnet-4-6".to_string()),
850 );
851 obj.insert(
852 "provider".to_string(),
853 JsonValue::String("openrouter".to_string()),
854 );
855 }
856 _other => {
857 obj.insert(
858 "model".to_string(),
859 JsonValue::String("__unknown_step_judge_preset__".to_string()),
860 );
861 }
862 }
863 }
864 if let Some(on_veto) = args.step_judge_on_veto.as_deref() {
865 obj.insert(
866 "on_veto".to_string(),
867 JsonValue::String(on_veto.to_string()),
868 );
869 }
870 if args.step_judge_adversarial {
871 obj.insert(
872 "rubric".to_string(),
873 JsonValue::String("adversarial".to_string()),
874 );
875 }
876 Some(JsonValue::Object(obj).to_string())
877}
878
879fn error_report(
880 run_id: String,
881 fixture: FixtureDefinition,
882 selector: ModelSelector,
883 tool_format: String,
884 run_dir: PathBuf,
885 error: String,
886) -> RunReport {
887 RunReport {
888 run_id,
889 fixture_id: fixture.id.to_string(),
890 fixture_name: fixture.name.to_string(),
891 fixture_tool_sequence: fixture.tool_sequence.to_string(),
892 selector,
893 tool_format,
894 status: "infra_error".to_string(),
895 passed: false,
896 skipped: false,
897 skipped_reason: None,
898 output_dir: run_dir.display().to_string(),
899 transcript_events_path: run_dir
900 .join("transcript_events.jsonl")
901 .display()
902 .to_string(),
903 workspace_root: None,
904 elapsed_ms: 0,
905 duration_ms: 0,
906 iterations: 0,
907 input_tokens: 0,
908 output_tokens: 0,
909 cost_usd: 0.0,
910 pricing_known: false,
911 tool_calls: 0,
912 rejected_tool_calls: 0,
913 tool_sequence: Vec::new(),
914 successful_tools: Vec::new(),
915 transcript_event_count: 0,
916 verification_success: false,
917 harn_exit_code: 1,
918 error: Some(error),
919 stderr_excerpt: None,
920 local_cleanup: None,
921 }
922}
923
924fn skipped_report(
925 run_id: String,
926 fixture: FixtureDefinition,
927 selector: ModelSelector,
928 tool_format: String,
929 run_dir: PathBuf,
930 reason: String,
931) -> RunReport {
932 RunReport {
933 run_id,
934 fixture_id: fixture.id.to_string(),
935 fixture_name: fixture.name.to_string(),
936 fixture_tool_sequence: fixture.tool_sequence.to_string(),
937 selector,
938 tool_format,
939 status: "skipped".to_string(),
940 passed: false,
941 skipped: true,
942 skipped_reason: Some(reason),
943 output_dir: run_dir.display().to_string(),
944 transcript_events_path: run_dir
945 .join("transcript_events.jsonl")
946 .display()
947 .to_string(),
948 workspace_root: None,
949 elapsed_ms: 0,
950 duration_ms: 0,
951 iterations: 0,
952 input_tokens: 0,
953 output_tokens: 0,
954 cost_usd: 0.0,
955 pricing_known: false,
956 tool_calls: 0,
957 rejected_tool_calls: 0,
958 tool_sequence: Vec::new(),
959 successful_tools: Vec::new(),
960 transcript_event_count: 0,
961 verification_success: false,
962 harn_exit_code: 0,
963 error: None,
964 stderr_excerpt: None,
965 local_cleanup: None,
966 }
967}
968
969fn provider_available(selector: &ModelSelector) -> bool {
970 if matches!(selector.provider.as_str(), "mock" | "fake") || selector_is_local(selector) {
971 return true;
972 }
973 harn_vm::llm_config::provider_key_available(&selector.provider)
974}
975
976fn resolve_fixtures(raw_fixtures: &[String]) -> Result<Vec<FixtureDefinition>, String> {
977 let mut seen = BTreeSet::new();
978 let mut out = Vec::new();
979 for raw in raw_fixtures {
980 let fixture = raw.trim().to_ascii_lowercase();
981 if fixture.is_empty() {
982 continue;
983 }
984 if fixture == "all" {
985 return Ok(FIXTURE_DEFINITIONS.to_vec());
986 }
987 let Some(definition) = fixture_definition(&fixture) else {
988 return Err(format!(
989 "unsupported --fixture `{fixture}`; expected one of: all, {}",
990 FIXTURE_DEFINITIONS
991 .iter()
992 .map(|definition| definition.id)
993 .collect::<Vec<_>>()
994 .join(", ")
995 ));
996 };
997 if seen.insert(definition.id) {
998 out.push(definition);
999 }
1000 }
1001 if out.is_empty() {
1002 return Err("at least one coding-agent fixture must be selected".to_string());
1003 }
1004 Ok(out)
1005}
1006
1007fn fixture_definition(id: &str) -> Option<FixtureDefinition> {
1008 FIXTURE_DEFINITIONS
1009 .iter()
1010 .copied()
1011 .find(|definition| definition.id == id)
1012}
1013
1014async fn resolve_models(args: &EvalCodingAgentArgs) -> Result<Vec<ModelSelector>, String> {
1015 let mut seen = BTreeSet::new();
1016 let mut out = Vec::new();
1017 for raw in normalize_model_selector_args(&args.models) {
1018 let trimmed = raw.trim();
1019 if trimmed.is_empty() {
1020 continue;
1021 }
1022 let selector = resolve_selector(trimmed);
1023 if seen.insert(selector_label(&selector)) {
1024 out.push(selector);
1025 }
1026 }
1027 if args.include_local {
1028 for selector in discover_local_models(args).await {
1029 if seen.insert(selector_label(&selector)) {
1030 out.push(selector);
1031 }
1032 }
1033 }
1034 Ok(out)
1035}
1036
/// Re-joins `provider=…` / `model=…` selector halves that arrived as two
/// separate arguments.
///
/// Every value is trimmed. A value starting with `provider=` that is
/// immediately followed by one starting with `model=` is merged into
/// `provider=…,model=…`; all other values pass through unchanged (including
/// empty ones).
fn normalize_model_selector_args(raw_models: &[String]) -> Vec<String> {
    let mut merged = Vec::new();
    let mut pending = raw_models.iter().map(|raw| raw.trim()).peekable();

    while let Some(entry) = pending.next() {
        // Only consume the following value when it completes a pair.
        let paired_model = if entry.starts_with("provider=") {
            pending.next_if(|candidate| candidate.starts_with("model="))
        } else {
            None
        };
        match paired_model {
            Some(model) => merged.push(format!("{entry},{model}")),
            None => merged.push(entry.to_string()),
        }
    }
    merged
}
1055
1056async fn discover_local_models(args: &EvalCodingAgentArgs) -> Vec<ModelSelector> {
1057 let providers = if args.local_providers.is_empty() {
1058 local_provider_ids(None)
1059 } else {
1060 args.local_providers.clone()
1061 };
1062 let mut selectors = Vec::new();
1063 let mut seen = BTreeSet::new();
1064 for provider in providers {
1065 if selectors.len() >= args.max_local_models {
1066 break;
1067 }
1068 let Ok(snapshot) = snapshot_provider(&provider, Path::new(".")).await else {
1069 continue;
1070 };
1071 if !snapshot.reachable {
1072 continue;
1073 }
1074 let mut models = snapshot
1075 .loaded_models
1076 .iter()
1077 .map(|model| model.name.clone())
1078 .collect::<Vec<_>>();
1079 models.extend(snapshot.served_models);
1080 for model in models {
1081 if selectors.len() >= args.max_local_models {
1082 break;
1083 }
1084 let selector = ModelSelector {
1085 selector: format!("{provider}:{model}"),
1086 provider: provider.clone(),
1087 model,
1088 };
1089 if seen.insert(selector_label(&selector)) {
1090 selectors.push(selector);
1091 }
1092 }
1093 }
1094 selectors
1095}
1096
/// Validates and de-duplicates `--tool-format` values.
///
/// Values are trimmed and lower-cased; blank values are ignored and
/// duplicates keep their first position. The result may be empty.
///
/// # Errors
/// Returns a message for the first value that is neither `native` nor `text`.
fn normalize_tool_formats(raw_formats: &[String]) -> Result<Vec<String>, String> {
    let mut accepted: Vec<String> = Vec::new();
    let mut already_listed = BTreeSet::new();

    let candidates = raw_formats
        .iter()
        .map(|raw| raw.trim().to_ascii_lowercase())
        .filter(|candidate| !candidate.is_empty());

    for candidate in candidates {
        if !matches!(candidate.as_str(), "native" | "text") {
            return Err(format!(
                "unsupported --tool-format `{candidate}`; expected `native` or `text`"
            ));
        }
        if already_listed.insert(candidate.clone()) {
            accepted.push(candidate);
        }
    }
    Ok(accepted)
}
1116
1117fn build_matrix(
1118 fixtures: &[FixtureDefinition],
1119 models: &[ModelSelector],
1120 tool_formats: &[String],
1121 max_runs: Option<usize>,
1122) -> Vec<(FixtureDefinition, ModelSelector, String)> {
1123 if max_runs == Some(0) {
1124 return Vec::new();
1125 }
1126 let mut matrix = Vec::new();
1127 for fixture in fixtures {
1128 for selector in models {
1129 for tool_format in tool_formats {
1130 matrix.push((*fixture, selector.clone(), tool_format.clone()));
1131 if max_runs.is_some_and(|limit| matrix.len() >= limit) {
1132 return matrix;
1133 }
1134 }
1135 }
1136 }
1137 matrix
1138}
1139
1140#[allow(clippy::too_many_arguments)]
1141fn build_summary(
1142 output_dir: &Path,
1143 fixtures: Vec<FixtureDefinition>,
1144 models: Vec<ModelSelector>,
1145 tool_formats: Vec<String>,
1146 env_keys_loaded: Vec<LoadedEnvKey>,
1147 runs: Vec<RunReport>,
1148 step_judge_preset: Option<String>,
1149 run_label: String,
1150 baseline_comparison: Option<BaselineComparison>,
1151) -> EvalSummary {
1152 let passed_runs = runs.iter().filter(|run| run.passed).count();
1153 let skipped_runs = runs.iter().filter(|run| run.skipped).count();
1154 let failed_runs = runs
1155 .iter()
1156 .filter(|run| !run.passed && !run.skipped)
1157 .count();
1158 let total_cost_usd = runs.iter().map(|run| run.cost_usd).sum();
1159 let rollups = build_rollups(&runs);
1160 let comparisons = compare_formats(&runs);
1161 let diverged_comparisons = comparisons
1162 .iter()
1163 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1164 .count();
1165 let followups = suggest_followups(&runs, &comparisons);
1166 EvalSummary {
1167 schema_version: 2,
1168 fixture_ids: fixtures
1169 .iter()
1170 .map(|fixture| fixture.id.to_string())
1171 .collect(),
1172 fixtures: fixtures
1173 .iter()
1174 .map(|fixture| FixtureReport {
1175 id: fixture.id.to_string(),
1176 name: fixture.name.to_string(),
1177 tool_sequence: fixture.tool_sequence.to_string(),
1178 description: fixture.description.to_string(),
1179 })
1180 .collect(),
1181 output_dir: output_dir.display().to_string(),
1182 models,
1183 tool_formats,
1184 env_keys_loaded,
1185 total_runs: runs.len(),
1186 passed_runs,
1187 failed_runs,
1188 skipped_runs,
1189 diverged_comparisons,
1190 total_cost_usd,
1191 rollups,
1192 runs,
1193 comparisons,
1194 followups,
1195 step_judge_preset,
1196 run_label,
1197 baseline_comparison,
1198 }
1199}
1200
1201fn load_baseline_comparison(path: &Path, runs: &[RunReport]) -> Result<BaselineComparison, String> {
1202 let resolved = if path.is_dir() {
1203 path.join("summary.json")
1204 } else {
1205 path.to_path_buf()
1206 };
1207 let raw = fs::read_to_string(&resolved)
1208 .map_err(|e| format!("failed to read {}: {e}", resolved.display()))?;
1209 let baseline: serde_json::Value = serde_json::from_str(&raw)
1210 .map_err(|e| format!("failed to parse {} as JSON: {e}", resolved.display()))?;
1211 let baseline_runs = baseline
1212 .get("runs")
1213 .and_then(|v| v.as_array())
1214 .ok_or_else(|| format!("{} has no `runs` array", resolved.display()))?;
1215 let mut baseline_status: BTreeMap<String, &str> = BTreeMap::new();
1219 for run in baseline_runs {
1220 let fixture_id = match run.get("fixture_id").and_then(|v| v.as_str()) {
1221 Some(id) => id.to_string(),
1222 None => continue,
1223 };
1224 let passed = run.get("passed").and_then(|v| v.as_bool()).unwrap_or(false);
1225 let skipped = run
1226 .get("skipped")
1227 .and_then(|v| v.as_bool())
1228 .unwrap_or(false);
1229 let status = if skipped {
1230 "skipped"
1231 } else if passed {
1232 "passed"
1233 } else {
1234 "failed"
1235 };
1236 baseline_status
1237 .entry(fixture_id)
1238 .and_modify(|existing| {
1239 if *existing != "passed" && status == "passed" {
1240 *existing = status;
1241 }
1242 })
1243 .or_insert(status);
1244 }
1245 let mut cell_status: BTreeMap<String, &str> = BTreeMap::new();
1246 for run in runs {
1247 let status = if run.skipped {
1248 "skipped"
1249 } else if run.passed {
1250 "passed"
1251 } else {
1252 "failed"
1253 };
1254 cell_status
1255 .entry(run.fixture_id.clone())
1256 .and_modify(|existing| {
1257 if *existing != "passed" && status == "passed" {
1258 *existing = status;
1259 }
1260 })
1261 .or_insert(status);
1262 }
1263 let mut regressions = Vec::new();
1264 let mut recoveries = Vec::new();
1265 let mut unchanged_passes = Vec::new();
1266 let mut unchanged_failures = Vec::new();
1267 let mut missing_in_baseline = Vec::new();
1268 let mut missing_in_cell = Vec::new();
1269 for (fixture, cell) in &cell_status {
1270 match baseline_status.get(fixture) {
1271 None => missing_in_baseline.push(fixture.clone()),
1272 Some(base) => match (*base, *cell) {
1273 ("passed", "passed") => unchanged_passes.push(fixture.clone()),
1274 ("passed", _) => regressions.push(FixtureStatusDelta {
1275 fixture_id: fixture.clone(),
1276 baseline_status: (*base).to_string(),
1277 cell_status: (*cell).to_string(),
1278 }),
1279 (_, "passed") => recoveries.push(FixtureStatusDelta {
1280 fixture_id: fixture.clone(),
1281 baseline_status: (*base).to_string(),
1282 cell_status: (*cell).to_string(),
1283 }),
1284 _ => unchanged_failures.push(fixture.clone()),
1285 },
1286 }
1287 }
1288 for fixture in baseline_status.keys() {
1289 if !cell_status.contains_key(fixture) {
1290 missing_in_cell.push(fixture.clone());
1291 }
1292 }
1293 let baseline_label = baseline
1294 .get("run_label")
1295 .and_then(|v| v.as_str())
1296 .filter(|s| !s.is_empty())
1297 .or_else(|| baseline.get("output_dir").and_then(|v| v.as_str()))
1298 .unwrap_or("")
1299 .to_string();
1300 let regressions_count = regressions.len();
1301 let recoveries_count = recoveries.len();
1302 let total_compared =
1303 regressions_count + recoveries_count + unchanged_passes.len() + unchanged_failures.len();
1304 let net_lift_pp = if total_compared == 0 {
1305 0.0
1306 } else {
1307 let raw =
1308 (recoveries_count as f64 - regressions_count as f64) / total_compared as f64 * 100.0;
1309 (raw * 10.0).round() / 10.0
1310 };
1311 Ok(BaselineComparison {
1312 baseline_label,
1313 baseline_path: resolved.display().to_string(),
1314 regressions,
1315 recoveries,
1316 unchanged_passes,
1317 unchanged_failures,
1318 missing_in_baseline,
1319 missing_in_cell,
1320 regressions_count,
1321 recoveries_count,
1322 net_lift_pp,
1323 })
1324}
1325
1326fn build_rollups(runs: &[RunReport]) -> EvalRollups {
1327 EvalRollups {
1328 by_fixture: rollup_by(runs, |run| run.fixture_id.clone()),
1329 by_provider: rollup_by(runs, |run| run.selector.provider.clone()),
1330 by_model: rollup_by(runs, |run| run.selector.model.clone()),
1331 by_tool_format: rollup_by(runs, |run| run.tool_format.clone()),
1332 by_tool_sequence: rollup_by(runs, |run| run.fixture_tool_sequence.clone()),
1333 }
1334}
1335
1336fn rollup_by<F>(runs: &[RunReport], key_for: F) -> Vec<RollupReport>
1337where
1338 F: Fn(&RunReport) -> String,
1339{
1340 let mut grouped: BTreeMap<String, RollupReport> = BTreeMap::new();
1341 for run in runs {
1342 let key = key_for(run);
1343 let entry = grouped.entry(key.clone()).or_insert_with(|| RollupReport {
1344 key,
1345 total_runs: 0,
1346 passed_runs: 0,
1347 failed_runs: 0,
1348 skipped_runs: 0,
1349 total_cost_usd: 0.0,
1350 });
1351 entry.total_runs += 1;
1352 if run.passed {
1353 entry.passed_runs += 1;
1354 } else if run.skipped {
1355 entry.skipped_runs += 1;
1356 } else {
1357 entry.failed_runs += 1;
1358 }
1359 entry.total_cost_usd += run.cost_usd;
1360 }
1361 grouped.into_values().collect()
1362}
1363
1364fn compare_formats(runs: &[RunReport]) -> Vec<FormatComparison> {
1365 let mut grouped: BTreeMap<String, Vec<&RunReport>> = BTreeMap::new();
1366 for run in runs {
1367 grouped
1368 .entry(format!(
1369 "{}\0{}",
1370 run.fixture_id,
1371 selector_label(&run.selector)
1372 ))
1373 .or_default()
1374 .push(run);
1375 }
1376 let mut out = Vec::new();
1377 for group in grouped.values() {
1378 let Some(first) = group.first() else {
1379 continue;
1380 };
1381 let native = group
1382 .iter()
1383 .find(|run| run.tool_format == "native")
1384 .copied();
1385 let text = group.iter().find(|run| run.tool_format == "text").copied();
1386 if native.is_none() && text.is_none() {
1387 continue;
1388 }
1389 let pair = native.zip(text);
1390 let mut divergence_reasons = Vec::new();
1391 if let Some((native, text)) = pair {
1392 if native.status != text.status {
1393 divergence_reasons.push(format!(
1394 "status differs: native={} text={}",
1395 native.status, text.status
1396 ));
1397 }
1398 if native.passed != text.passed {
1399 divergence_reasons.push(format!(
1400 "pass result differs: native={} text={}",
1401 native.passed, text.passed
1402 ));
1403 }
1404 if native.verification_success != text.verification_success {
1405 divergence_reasons.push(format!(
1406 "verifier result differs: native={} text={}",
1407 native.verification_success, text.verification_success
1408 ));
1409 }
1410 if native.tool_sequence != text.tool_sequence {
1411 divergence_reasons.push(format!(
1412 "tool sequence differs: native=[{}] text=[{}]",
1413 native.tool_sequence.join(", "),
1414 text.tool_sequence.join(", ")
1415 ));
1416 }
1417 if native.rejected_tool_calls != text.rejected_tool_calls {
1418 divergence_reasons.push(format!(
1419 "rejected tool-call recovery differs: native={} text={}",
1420 native.rejected_tool_calls, text.rejected_tool_calls
1421 ));
1422 }
1423 }
1424 let evidence_paths = [native, text]
1425 .into_iter()
1426 .flatten()
1427 .map(|run| run.transcript_events_path.clone())
1428 .collect::<Vec<_>>();
1429 out.push(FormatComparison {
1430 fixture_id: first.fixture_id.clone(),
1431 selector: first.selector.clone(),
1432 native_run_id: native.map(|run| run.run_id.clone()),
1433 text_run_id: text.map(|run| run.run_id.clone()),
1434 native_evidence_path: native.map(|run| run.transcript_events_path.clone()),
1435 text_evidence_path: text.map(|run| run.transcript_events_path.clone()),
1436 native_status: native.map(|run| run.status.clone()),
1437 text_status: text.map(|run| run.status.clone()),
1438 native_passed: native.map(|run| run.passed),
1439 text_passed: text.map(|run| run.passed),
1440 verifier_match: pair
1441 .map(|(native, text)| native.verification_success == text.verification_success),
1442 tool_sequence_match: pair
1443 .map(|(native, text)| native.tool_sequence == text.tool_sequence),
1444 rejected_tool_call_delta_text_minus_native: pair.map(|(native, text)| {
1445 text.rejected_tool_calls as i64 - native.rejected_tool_calls as i64
1446 }),
1447 token_delta_text_minus_native: pair.map(|(native, text)| {
1448 (text.input_tokens + text.output_tokens)
1449 - (native.input_tokens + native.output_tokens)
1450 }),
1451 iteration_delta_text_minus_native: pair
1452 .map(|(native, text)| text.iterations - native.iterations),
1453 equivalent: pair.map(|(native, text)| {
1454 native.status == text.status
1455 && native.passed == text.passed
1456 && native.skipped == text.skipped
1457 && native.verification_success == text.verification_success
1458 && native.tool_sequence == text.tool_sequence
1459 && native.rejected_tool_calls == text.rejected_tool_calls
1460 }),
1461 divergence_reasons,
1462 evidence_paths,
1463 });
1464 }
1465 out
1466}
1467
1468fn suggest_followups(
1469 runs: &[RunReport],
1470 comparisons: &[FormatComparison],
1471) -> Vec<FollowupSuggestion> {
1472 let mut out = Vec::new();
1473 let failed = runs
1474 .iter()
1475 .filter(|run| !run.passed && !run.skipped)
1476 .map(|run| run.run_id.clone())
1477 .collect::<Vec<_>>();
1478 if !failed.is_empty() {
1479 out.push(FollowupSuggestion {
1480 title: "Normalize coding-agent fixture failures across provider presets".to_string(),
1481 body: "One or more fixture/provider/tool-format runs failed. Inspect the run directories and decide whether the gap belongs in provider adapters, preset prompting, transcript handling, or host-tool ergonomics.".to_string(),
1482 labels: vec!["eval".to_string(), "providers".to_string()],
1483 run_ids: failed,
1484 });
1485 }
1486
1487 let rejected = runs
1488 .iter()
1489 .filter(|run| run.rejected_tool_calls > 0)
1490 .map(|run| run.run_id.clone())
1491 .collect::<Vec<_>>();
1492 if !rejected.is_empty() {
1493 out.push(FollowupSuggestion {
1494 title: "Abstract rejected tool-call recovery in agent transcripts".to_string(),
1495 body: "Some runs recovered after rejected tool calls. Add runtime support or preset guidance so harness authors can distinguish recoverable provider/tool-shape noise from user-relevant transcript events.".to_string(),
1496 labels: vec!["agents".to_string(), "transcripts".to_string()],
1497 run_ids: rejected,
1498 });
1499 }
1500
1501 let mismatched = comparisons
1502 .iter()
1503 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1504 .map(|comparison| {
1505 format!(
1506 "{}:{} ({})",
1507 comparison.fixture_id,
1508 selector_label(&comparison.selector),
1509 comparison.divergence_reasons.join("; ")
1510 )
1511 })
1512 .collect::<Vec<_>>();
1513 if !mismatched.is_empty() {
1514 let run_ids = comparisons
1515 .iter()
1516 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1517 .flat_map(|comparison| {
1518 [
1519 comparison.native_run_id.clone(),
1520 comparison.text_run_id.clone(),
1521 ]
1522 })
1523 .flatten()
1524 .collect::<Vec<_>>();
1525 out.push(FollowupSuggestion {
1526 title: "Make native/text tool modes behaviorally interchangeable for preset harnesses"
1527 .to_string(),
1528 body: format!(
1529 "Native and text tool modes diverged for: {}. The preset/runtime boundary should hide provider tool-channel differences where possible.",
1530 mismatched.join(", ")
1531 ),
1532 labels: vec!["agents".to_string(), "tools".to_string()],
1533 run_ids,
1534 });
1535 }
1536
1537 let unknown_pricing = runs
1538 .iter()
1539 .filter(|run| {
1540 !run.skipped
1541 && !run.pricing_known
1542 && !matches!(run.selector.provider.as_str(), "mock" | "fake")
1543 && !selector_is_local(&run.selector)
1544 })
1545 .map(|run| run.run_id.clone())
1546 .collect::<Vec<_>>();
1547 if !unknown_pricing.is_empty() {
1548 out.push(FollowupSuggestion {
1549 title: "Fill provider pricing metadata for benchmarked models".to_string(),
1550 body: "At least one live provider/model produced usage metrics but had no pricing entry, which weakens cost comparisons in eval reports.".to_string(),
1551 labels: vec!["providers".to_string(), "docs".to_string()],
1552 run_ids: unknown_pricing,
1553 });
1554 }
1555 out
1556}
1557
1558fn write_json_artifacts(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1559 write_json_pretty(&output_dir.join("summary.json"), summary)?;
1560 write_jsonl(&output_dir.join("per_run.jsonl"), &summary.runs)?;
1561 let summary_value = serde_json::to_value(summary).map_err(|error| error.to_string())?;
1562 let readiness = local_readiness::report_from_summary_json(
1563 &summary_value,
1564 output_dir.display().to_string(),
1565 )?;
1566 write_json_pretty(&output_dir.join("local_readiness.json"), &readiness)?;
1567 Ok(())
1568}
1569
/// Prints to stderr the paths of the five artifact files under `output_dir`.
fn announce_output_paths(output_dir: &Path) {
    let [summary_json, per_run, readiness, summary_md, followups_md] = [
        "summary.json",
        "per_run.jsonl",
        "local_readiness.json",
        "summary.md",
        "followups.md",
    ]
    .map(|file_name| output_dir.join(file_name));
    eprintln!(
        "wrote {}, {}, {}, {}, and {}",
        summary_json.display(),
        per_run.display(),
        readiness.display(),
        summary_md.display(),
        followups_md.display()
    );
}
1580
1581fn write_markdown_artifacts_legacy(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1584 fs::write(output_dir.join("summary.md"), render_markdown(summary))
1585 .map_err(|error| format!("failed to write summary.md: {error}"))?;
1586 fs::write(output_dir.join("followups.md"), render_followups(summary))
1587 .map_err(|error| format!("failed to write followups.md: {error}"))?;
1588 Ok(())
1589}
1590
1591fn print_summary_legacy(summary: &EvalSummary) {
1592 println!(
1593 "coding-agent eval: {}/{} passed, {} skipped, total_cost_usd={:.6}",
1594 summary.passed_runs, summary.total_runs, summary.skipped_runs, summary.total_cost_usd
1595 );
1596}
1597
1598fn print_json_legacy(summary: &EvalSummary) {
1599 match serde_json::to_string_pretty(summary) {
1600 Ok(payload) => println!("{payload}"),
1601 Err(error) => eprintln!("warning: failed to render summary JSON: {error}"),
1602 }
1603}
1604
1605async fn write_markdown_artifacts_dispatch(
1608 output_dir: &Path,
1609 summary: &EvalSummary,
1610) -> Result<(), i32> {
1611 let markdown = render_via_dispatch(summary, "markdown").await?;
1612 if let Err(error) = fs::write(output_dir.join("summary.md"), markdown) {
1613 eprintln!("error: failed to write summary.md: {error}");
1614 return Err(1);
1615 }
1616 let followups = render_via_dispatch(summary, "followups").await?;
1617 if let Err(error) = fs::write(output_dir.join("followups.md"), followups) {
1618 eprintln!("error: failed to write followups.md: {error}");
1619 return Err(1);
1620 }
1621 Ok(())
1622}
1623
1624async fn print_summary_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1625 let payload = render_via_dispatch(summary, "summary").await?;
1626 print!("{payload}");
1627 if !payload.ends_with('\n') {
1630 println!();
1631 }
1632 Ok(())
1633}
1634
1635async fn print_json_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1636 let payload = render_via_dispatch(summary, "json").await?;
1637 print!("{payload}");
1638 if !payload.ends_with('\n') {
1639 println!();
1640 }
1641 Ok(())
1642}
1643
/// Renders `summary` by running the embedded `eval/coding_agent` script in
/// the given `mode` and returns what the script wrote to stdout.
///
/// The summary is serialised to JSON and exposed to the script through the
/// `CODING_AGENT_SUMMARY_ENV` environment variable; `mode` is exposed through
/// `CODING_AGENT_MODE_ENV`. Anything the script wrote to stderr is forwarded
/// to this process's stderr.
///
/// # Errors
///
/// Returns `Err(1)` (after printing an error) when the summary cannot be
/// serialised, or the script's exit code when that code is non-zero.
async fn render_via_dispatch(summary: &EvalSummary, mode: &str) -> Result<String, i32> {
    let summary_json = match serde_json::to_string(summary) {
        Ok(json) => json,
        Err(error) => {
            eprintln!("error: failed to serialise EvalSummary for dispatch: {error}");
            return Err(1);
        }
    };
    // The lock is taken before the env vars are set and, being declared
    // first, is released after both env guards are dropped. Presumably this
    // keeps concurrent renders in one process from observing each other's
    // summary/mode values.
    let _guard = DISPATCH_RENDER_LOCK.lock().await;
    // NOTE(review): ScopedEnvVar presumably restores the previous value when
    // dropped; confirm against `crate::env_guard`.
    let _summary = ScopedEnvVar::set(CODING_AGENT_SUMMARY_ENV, &summary_json);
    let _mode = ScopedEnvVar::set(CODING_AGENT_MODE_ENV, mode);

    let outcome = dispatch::run_embedded_script("eval/coding_agent", Vec::new(), false).await;
    if !outcome.stderr.is_empty() {
        // Best effort: a failure to forward stderr is deliberately ignored.
        let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
    }
    if outcome.exit_code != 0 {
        return Err(outcome.exit_code);
    }
    Ok(outcome.stdout)
}
1674
1675fn write_json_pretty<T: Serialize>(path: &Path, value: &T) -> Result<(), String> {
1676 let body = serde_json::to_string_pretty(value).map_err(|error| error.to_string())?;
1677 fs::write(path, format!("{body}\n")).map_err(|error| error.to_string())
1678}
1679
1680fn write_jsonl<T: Serialize>(path: &Path, items: &[T]) -> Result<(), String> {
1681 let mut body = String::new();
1682 for item in items {
1683 let line = serde_json::to_string(item).map_err(|error| error.to_string())?;
1684 body.push_str(&line);
1685 body.push('\n');
1686 }
1687 fs::write(path, body).map_err(|error| error.to_string())
1688}
1689
1690fn render_markdown(summary: &EvalSummary) -> String {
1691 let mut out = String::new();
1692 out.push_str("# Coding Agent Harness Quality Suite\n\n");
1693 out.push_str(&format!(
1694 "- fixtures: `{}`\n- passed: {}/{}\n- skipped: {}\n- total_cost_usd: {:.6}\n\n",
1695 summary.fixture_ids.join("`, `"),
1696 summary.passed_runs,
1697 summary.total_runs,
1698 summary.skipped_runs,
1699 summary.total_cost_usd
1700 ));
1701 render_rollup_table(&mut out, "By Fixture", &summary.rollups.by_fixture);
1702 render_rollup_table(&mut out, "By Provider", &summary.rollups.by_provider);
1703 render_rollup_table(&mut out, "By Model", &summary.rollups.by_model);
1704 render_rollup_table(&mut out, "By Tool Format", &summary.rollups.by_tool_format);
1705 render_rollup_table(
1706 &mut out,
1707 "By Tool Sequence",
1708 &summary.rollups.by_tool_sequence,
1709 );
1710
1711 out.push_str("\n## Runs\n\n");
1712 out.push_str("| fixture | run | provider | model | tool format | fixture sequence | tool calls | status | iterations | tokens | cost | transcript | output |\n");
1713 out.push_str("|---|---|---|---|---|---|---|---|---:|---:|---:|---|---|\n");
1714 for run in &summary.runs {
1715 let tool_sequence = if run.tool_sequence.is_empty() {
1716 "-".to_string()
1717 } else {
1718 run.tool_sequence.join(", ").replace('|', "\\|")
1719 };
1720 out.push_str(&format!(
1721 "| `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {:.6} | {} | `{}` |\n",
1722 run.fixture_id,
1723 run.run_id,
1724 run.selector.provider,
1725 run.selector.model.replace('|', "\\|"),
1726 run.tool_format,
1727 run.fixture_tool_sequence,
1728 tool_sequence,
1729 run.status,
1730 run.iterations,
1731 run.input_tokens + run.output_tokens,
1732 run.cost_usd,
1733 markdown_link(
1734 &run.transcript_event_count.to_string(),
1735 &run.transcript_events_path
1736 ),
1737 run.output_dir
1738 ));
1739 }
1740 if let Some(comparison) = &summary.baseline_comparison {
1741 out.push_str("\n## Baseline Comparison\n\n");
1742 out.push_str(&format!(
1743 "Compared against `{}`{}.\n\n",
1744 comparison.baseline_path,
1745 if comparison.baseline_label.is_empty() {
1746 String::new()
1747 } else {
1748 format!(" (label: `{}`)", comparison.baseline_label)
1749 },
1750 ));
1751 out.push_str(&format!(
1752 "- regressions: **{}** (baseline passed, this cell failed)\n- recoveries: **{}** (baseline failed, this cell passed)\n- net lift: **{:+.1}pp**\n\n",
1753 comparison.regressions_count,
1754 comparison.recoveries_count,
1755 comparison.net_lift_pp,
1756 ));
1757 if !comparison.regressions.is_empty() {
1758 out.push_str("### Regressions\n\n");
1759 for delta in &comparison.regressions {
1760 out.push_str(&format!(
1761 "- `{}`: `{}` → `{}`\n",
1762 delta.fixture_id, delta.baseline_status, delta.cell_status,
1763 ));
1764 }
1765 out.push('\n');
1766 }
1767 if !comparison.recoveries.is_empty() {
1768 out.push_str("### Recoveries\n\n");
1769 for delta in &comparison.recoveries {
1770 out.push_str(&format!(
1771 "- `{}`: `{}` → `{}`\n",
1772 delta.fixture_id, delta.baseline_status, delta.cell_status,
1773 ));
1774 }
1775 out.push('\n');
1776 }
1777 }
1778 if !summary.comparisons.is_empty() {
1779 out.push_str("\n## Native/Text Comparison\n\n");
1780 out.push_str("| fixture | selector | native | text | equivalent | verifier | tools | rejected delta | token delta | iteration delta | evidence |\n");
1781 out.push_str("|---|---|---|---|---|---|---|---:|---:|---:|---|\n");
1782 for comparison in &summary.comparisons {
1783 out.push_str(&format!(
1784 "| `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
1785 comparison.fixture_id,
1786 selector_label(&comparison.selector),
1787 comparison
1788 .native_status
1789 .clone()
1790 .unwrap_or_else(|| "-".to_string()),
1791 comparison
1792 .text_status
1793 .clone()
1794 .unwrap_or_else(|| "-".to_string()),
1795 optional_bool_mark(comparison.equivalent),
1796 optional_bool_mark(comparison.verifier_match),
1797 optional_bool_mark(comparison.tool_sequence_match),
1798 comparison
1799 .rejected_tool_call_delta_text_minus_native
1800 .map(|v| v.to_string())
1801 .unwrap_or_else(|| "-".to_string()),
1802 comparison
1803 .token_delta_text_minus_native
1804 .map(|v| v.to_string())
1805 .unwrap_or_else(|| "-".to_string()),
1806 comparison
1807 .iteration_delta_text_minus_native
1808 .map(|v| v.to_string())
1809 .unwrap_or_else(|| "-".to_string()),
1810 comparison_evidence_links(comparison)
1811 ));
1812 }
1813 }
1814 let diverged = summary
1815 .comparisons
1816 .iter()
1817 .filter(|comparison| !comparison.divergence_reasons.is_empty())
1818 .collect::<Vec<_>>();
1819 if !diverged.is_empty() {
1820 out.push_str("\n## Native/Text Divergence Evidence\n\n");
1821 for comparison in diverged {
1822 out.push_str(&format!(
1823 "- `{}` `{}`: {}\n",
1824 comparison.fixture_id,
1825 selector_label(&comparison.selector),
1826 comparison.divergence_reasons.join("; ")
1827 ));
1828 if !comparison.evidence_paths.is_empty() {
1829 out.push_str(&format!(
1830 " Evidence: {}\n",
1831 comparison_evidence_links(comparison)
1832 ));
1833 }
1834 }
1835 }
1836 out
1837}
1838
1839fn render_rollup_table(out: &mut String, title: &str, rollups: &[RollupReport]) {
1840 out.push_str(&format!("## {title}\n\n"));
1841 out.push_str("| key | passed | failed | skipped | total | cost |\n");
1842 out.push_str("|---|---:|---:|---:|---:|---:|\n");
1843 for rollup in rollups {
1844 out.push_str(&format!(
1845 "| `{}` | {} | {} | {} | {} | {:.6} |\n",
1846 rollup.key.replace('|', "\\|"),
1847 rollup.passed_runs,
1848 rollup.failed_runs,
1849 rollup.skipped_runs,
1850 rollup.total_runs,
1851 rollup.total_cost_usd
1852 ));
1853 }
1854 out.push('\n');
1855}
1856
1857fn render_followups(summary: &EvalSummary) -> String {
1858 let mut out = String::new();
1859 out.push_str("# Follow-up Issue Candidates\n\n");
1860 if summary.followups.is_empty() {
1861 out.push_str("No follow-up issue candidates were generated from this run.\n");
1862 return out;
1863 }
1864 for followup in &summary.followups {
1865 out.push_str(&format!("## {}\n\n{}\n\n", followup.title, followup.body));
1866 if !followup.run_ids.is_empty() {
1867 out.push_str(&format!("- run_ids: `{}`\n", followup.run_ids.join("`, `")));
1868 }
1869 if !followup.labels.is_empty() {
1870 out.push_str(&format!("- labels: `{}`\n", followup.labels.join("`, `")));
1871 }
1872 out.push('\n');
1873 }
1874 out
1875}
1876
1877fn read_run_summary(run_dir: &Path) -> Option<JsonValue> {
1878 let raw = fs::read_to_string(run_dir.join("summary.json")).ok()?;
1879 serde_json::from_str(&raw).ok()
1880}
1881
1882fn parse_last_json_line(stdout: &str) -> Option<JsonValue> {
1883 stdout
1884 .lines()
1885 .rev()
1886 .map(str::trim)
1887 .filter(|line| !line.is_empty())
1888 .find_map(|line| serde_json::from_str::<JsonValue>(line).ok())
1889}
1890
1891fn string_array(value: Option<&JsonValue>) -> Vec<String> {
1892 value
1893 .and_then(JsonValue::as_array)
1894 .map(|values| {
1895 values
1896 .iter()
1897 .filter_map(JsonValue::as_str)
1898 .map(str::to_string)
1899 .collect()
1900 })
1901 .unwrap_or_default()
1902}
1903
1904fn non_empty_string_array(value: Option<&JsonValue>) -> Option<Vec<String>> {
1905 let values = string_array(value);
1906 (!values.is_empty()).then_some(values)
1907}
1908
1909fn tool_call_sequence(value: Option<&JsonValue>) -> Option<Vec<String>> {
1910 let calls = value.and_then(JsonValue::as_array)?;
1911 let mut sequence = Vec::new();
1912 for call in calls {
1913 if let Some(name) = call
1914 .get("name")
1915 .or_else(|| call.get("tool_name"))
1916 .and_then(JsonValue::as_str)
1917 {
1918 sequence.push(name.to_string());
1919 }
1920 }
1921 (!sequence.is_empty()).then_some(sequence)
1922}
1923
/// Renders an optional flag for a table cell: `yes`, `no`, or `-` when the
/// value is absent.
fn optional_bool_mark(value: Option<bool>) -> &'static str {
    value.map_or("-", |flag| if flag { "yes" } else { "no" })
}
1931
1932fn comparison_evidence_links(comparison: &FormatComparison) -> String {
1933 let mut links = Vec::new();
1934 if let Some(native) = comparison.native_evidence_path.as_deref() {
1935 links.push(markdown_link("native", native));
1936 }
1937 if let Some(text) = comparison.text_evidence_path.as_deref() {
1938 links.push(markdown_link("text", text));
1939 }
1940 if links.is_empty() {
1941 "-".to_string()
1942 } else {
1943 links.join("<br>")
1944 }
1945}
1946
/// Formats `[label](target)` so it is safe inside a markdown table cell.
///
/// `|` in the label is escaped as `\|`. In the target, characters that would
/// end the link destination or split the table cell are percent-encoded:
/// space (`%20`), `(` (`%28`), `)` (`%29`) and `|` (`%7C`).
fn markdown_link(label: &str, target: &str) -> String {
    let mut encoded = String::with_capacity(target.len());
    for ch in target.chars() {
        match ch {
            ' ' => encoded.push_str("%20"),
            '(' => encoded.push_str("%28"),
            ')' => encoded.push_str("%29"),
            // An unencoded pipe would end the table cell in the middle of the link.
            '|' => encoded.push_str("%7C"),
            other => encoded.push(other),
        }
    }
    format!("[{}]({})", label.replace('|', "\\|"), encoded)
}
1957
/// Leaves `path` as an existing, empty directory: anything already there is
/// removed recursively, then the directory (and any missing parents) is
/// created.
///
/// # Errors
///
/// Returns the I/O error rendered as a string.
fn reset_dir(path: &Path) -> Result<(), String> {
    let describe = |error: std::io::Error| error.to_string();
    if path.exists() {
        fs::remove_dir_all(path).map_err(describe)?;
    }
    fs::create_dir_all(path).map_err(describe)
}
1964
1965fn run_id_for(fixture: FixtureDefinition, selector: &ModelSelector, tool_format: &str) -> String {
1966 sanitize_id(&format!(
1967 "{}__{}__{}",
1968 fixture.id,
1969 selector_label(selector),
1970 tool_format
1971 ))
1972}
1973
/// Turns `raw` into an identifier made only of ASCII letters, digits, `-`
/// and `_`: every other character becomes `_`, and leading and trailing
/// underscores are then trimmed.
fn sanitize_id(raw: &str) -> String {
    let replaced: String = raw
        .chars()
        .map(|ch| match ch {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' => ch,
            _ => '_',
        })
        .collect();
    replaced.trim_matches('_').to_string()
}
1985
/// The relative output directory used when none is given:
/// `.harn-runs/coding-agent-bench/latest`.
fn default_output_dir() -> PathBuf {
    [".harn-runs", "coding-agent-bench", "latest"]
        .iter()
        .collect()
}
1991
/// Returns a trimmed excerpt of `text`, capped at 4000 characters.
///
/// Returns `None` when `text` is empty or whitespace only. Text longer than
/// the cap is cut after the 4000th character and gets a `...` suffix; shorter
/// text is returned unchanged. The cap counts characters rather than bytes,
/// so multi-byte text is only marked as truncated when something was
/// actually removed.
fn excerpt(text: &str) -> Option<String> {
    const MAX_CHARS: usize = 4000;
    let trimmed = text.trim();
    if trimmed.is_empty() {
        return None;
    }
    // The byte offset of character number MAX_CHARS (zero-based) is exactly
    // where the kept prefix ends; `None` means the text fits within the cap.
    match trimmed.char_indices().nth(MAX_CHARS) {
        None => Some(trimmed.to_string()),
        Some((cut, _)) => {
            let mut truncated = String::with_capacity(cut + 3);
            truncated.push_str(&trimmed[..cut]);
            truncated.push_str("...");
            Some(truncated)
        }
    }
}
2008
2009fn load_env_files(paths: &[PathBuf]) -> Result<(EnvOverlay, Vec<LoadedEnvKey>), String> {
2010 let mut previous = Vec::new();
2011 let mut loaded = Vec::new();
2012 let mut touched = BTreeSet::new();
2013 for path in paths {
2014 let path = expand_home(path);
2015 let raw = fs::read_to_string(&path)
2016 .map_err(|error| format!("failed to read env file {}: {error}", path.display()))?;
2017 for (line_no, line) in raw.lines().enumerate() {
2018 let Some((key, value)) = parse_env_line(line).map_err(|error| {
2019 format!("{}:{}: {error}", path.display(), line_no.saturating_add(1))
2020 })?
2021 else {
2022 continue;
2023 };
2024 if touched.insert(key.clone()) {
2025 previous.push((OsString::from(&key), std::env::var_os(&key)));
2026 }
2027 std::env::set_var(&key, value);
2028 loaded.push(LoadedEnvKey {
2029 key,
2030 source: path.display().to_string(),
2031 });
2032 }
2033 }
2034 Ok((EnvOverlay { previous }, loaded))
2035}
2036
2037fn parse_env_line(line: &str) -> Result<Option<(String, String)>, String> {
2038 let trimmed = line.trim();
2039 if trimmed.is_empty() || trimmed.starts_with('#') {
2040 return Ok(None);
2041 }
2042 let trimmed = trimmed.strip_prefix("export ").unwrap_or(trimmed).trim();
2043 let Some((key, value)) = trimmed.split_once('=') else {
2044 return Err("expected KEY=VALUE".to_string());
2045 };
2046 let key = key.trim();
2047 if key.is_empty() {
2048 return Err("empty key".to_string());
2049 }
2050 if !key
2051 .chars()
2052 .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
2053 {
2054 return Err(format!("invalid key `{key}`"));
2055 }
2056 Ok(Some((key.to_string(), unquote_env_value(value.trim()))))
2057}
2058
/// Strips one pair of matching surrounding quotes (`"…"` or `'…'`) from
/// `value`; anything else, including a lone quote character, is returned
/// unchanged. No escape sequences are interpreted.
fn unquote_env_value(value: &str) -> String {
    let inner = ['"', '\'']
        .iter()
        .find_map(|&quote| value.strip_prefix(quote)?.strip_suffix(quote));
    inner.unwrap_or(value).to_string()
}
2070
/// Expands a leading `~` using the `HOME` environment variable.
///
/// `~` alone becomes `$HOME` and `~/rest` becomes `$HOME/rest`. Every other
/// path, including `~user/...`, is returned unchanged, as is any path when
/// `HOME` is not set.
fn expand_home(path: &Path) -> PathBuf {
    let raw = path.to_string_lossy();
    // Looked up lazily so paths without a tilde never read the environment.
    let home = || std::env::var_os("HOME").map(PathBuf::from);
    let expanded = if raw == "~" {
        home()
    } else {
        raw.strip_prefix("~/")
            .and_then(|rest| home().map(|dir| dir.join(rest)))
    };
    expanded.unwrap_or_else(|| path.to_path_buf())
}
2085
2086#[cfg(test)]
2087mod tests {
2088 use super::*;
2089
2090 #[test]
2091 fn dotenv_parser_strips_export_and_quotes_without_leaking_values() {
2092 let parsed = parse_env_line("export TOGETHER_API_KEY=\"secret\"")
2093 .unwrap()
2094 .unwrap();
2095 assert_eq!(parsed.0, "TOGETHER_API_KEY");
2096 assert_eq!(parsed.1, "secret");
2097 assert!(parse_env_line("# comment").unwrap().is_none());
2098 }
2099
2100 #[test]
2101 fn model_selector_args_rejoin_provider_model_kv_after_clap_delimiter_split() {
2102 let normalized = normalize_model_selector_args(&[
2103 "mock:mock".to_string(),
2104 "provider=openrouter".to_string(),
2105 "model=qwen/qwen3-coder-flash".to_string(),
2106 "provider=together".to_string(),
2107 "model=Qwen/Qwen3-Coder-Next-FP8".to_string(),
2108 ]);
2109 assert_eq!(
2110 normalized,
2111 vec![
2112 "mock:mock",
2113 "provider=openrouter,model=qwen/qwen3-coder-flash",
2114 "provider=together,model=Qwen/Qwen3-Coder-Next-FP8",
2115 ]
2116 );
2117 }
2118
    /// A model name containing `|` must come out of `render_markdown` escaped
    /// as `\|`.
    #[test]
    fn markdown_escapes_model_table_pipes() {
        // The pipe lives in the model name (and therefore in the selector string).
        let selector = ModelSelector {
            selector: "provider:a|b".to_string(),
            provider: "provider".to_string(),
            model: "a|b".to_string(),
        };
        // Minimal summary: one fixture, one passing native run, no comparisons.
        let summary = EvalSummary {
            schema_version: 2,
            fixture_ids: vec!["python-add".to_string()],
            fixtures: vec![FixtureReport {
                id: "python-add".to_string(),
                name: "Python add repair".to_string(),
                tool_sequence: "multi-tool".to_string(),
                description: "One-file Python bug fix verified by unittest output.".to_string(),
            }],
            output_dir: "out".to_string(),
            models: vec![selector.clone()],
            tool_formats: vec!["native".to_string()],
            env_keys_loaded: Vec::new(),
            total_runs: 1,
            passed_runs: 1,
            failed_runs: 0,
            skipped_runs: 0,
            diverged_comparisons: 0,
            total_cost_usd: 0.0,
            rollups: EvalRollups {
                by_fixture: vec![RollupReport {
                    key: "python-add".to_string(),
                    total_runs: 1,
                    passed_runs: 1,
                    failed_runs: 0,
                    skipped_runs: 0,
                    total_cost_usd: 0.0,
                }],
                // The by-model rollup is left empty, so the only place the
                // model name is rendered is the runs table.
                by_provider: Vec::new(),
                by_model: Vec::new(),
                by_tool_format: Vec::new(),
                by_tool_sequence: Vec::new(),
            },
            runs: vec![RunReport {
                run_id: "r".to_string(),
                fixture_id: "python-add".to_string(),
                fixture_name: "Python add repair".to_string(),
                fixture_tool_sequence: "multi-tool".to_string(),
                selector,
                tool_format: "native".to_string(),
                status: "passed".to_string(),
                passed: true,
                skipped: false,
                skipped_reason: None,
                output_dir: "out/r".to_string(),
                transcript_events_path: "out/r/transcript_events.jsonl".to_string(),
                workspace_root: None,
                elapsed_ms: 1,
                duration_ms: 1,
                iterations: 1,
                input_tokens: 1,
                output_tokens: 1,
                cost_usd: 0.0,
                pricing_known: false,
                tool_calls: 0,
                rejected_tool_calls: 0,
                tool_sequence: Vec::new(),
                successful_tools: Vec::new(),
                transcript_event_count: 0,
                verification_success: true,
                harn_exit_code: 0,
                error: None,
                stderr_excerpt: None,
                local_cleanup: None,
            }],
            comparisons: Vec::new(),
            followups: Vec::new(),
            step_judge_preset: None,
            run_label: String::new(),
            baseline_comparison: None,
        };
        let md = render_markdown(&summary);
        // The escaped form must appear in the rendered output.
        assert!(md.contains("a\\|b"));
    }
2200
/// Writes a three-fixture baseline summary to a temp directory, then checks
/// that `load_baseline_comparison` buckets the current runs correctly:
/// a fixture that passed before and fails now is a regression, one that
/// failed before and passes now is a recovery, and one that passes in both
/// is an unchanged pass.
#[test]
fn baseline_comparison_reports_regressions_and_recoveries() {
    let scratch = tempfile::tempdir().expect("tempdir");
    let baseline_path = scratch.path().join("baseline_summary.json");

    // Baseline: two passes and one failure.
    let baseline = serde_json::json!({
        "schema_version": 2,
        "runs": [
            {"fixture_id": "python-add", "passed": true, "skipped": false},
            {"fixture_id": "cli-help-flag", "passed": true, "skipped": false},
            {"fixture_id": "test-output-first", "passed": false, "skipped": false},
        ],
    });
    let baseline_text = serde_json::to_string(&baseline).unwrap();
    std::fs::write(&baseline_path, baseline_text).expect("write baseline");

    let mock_selector = ModelSelector {
        selector: "mock:mock".to_string(),
        provider: "mock".to_string(),
        model: "mock".to_string(),
    };

    // Every report shares the same boilerplate; only the ids, the display
    // name and the pass/fail outcome vary between the three runs.
    let report = |run_id: &str, fixture_id: &str, fixture_name: &str, passed: bool| {
        let status = if passed { "passed" } else { "failed" };
        RunReport {
            run_id: run_id.to_string(),
            fixture_id: fixture_id.to_string(),
            fixture_name: fixture_name.to_string(),
            fixture_tool_sequence: "multi-tool".to_string(),
            selector: mock_selector.clone(),
            tool_format: "native".to_string(),
            status: status.to_string(),
            passed,
            skipped: false,
            skipped_reason: None,
            output_dir: format!("out/{run_id}"),
            transcript_events_path: format!("out/{run_id}/t.jsonl"),
            workspace_root: None,
            elapsed_ms: 0,
            duration_ms: 0,
            iterations: 0,
            input_tokens: 0,
            output_tokens: 0,
            cost_usd: 0.0,
            pricing_known: false,
            tool_calls: 0,
            rejected_tool_calls: 0,
            tool_sequence: Vec::new(),
            successful_tools: Vec::new(),
            transcript_event_count: 0,
            verification_success: passed,
            harn_exit_code: if passed { 0 } else { 1 },
            error: None,
            stderr_excerpt: None,
            local_cleanup: None,
        }
    };

    // Current runs: still passing, newly failing, newly passing.
    let runs = vec![
        report("r1", "python-add", "Python add", true),
        report("r2", "cli-help-flag", "CLI help flag", false),
        report("r3", "test-output-first", "Test output first", true),
    ];

    let comparison = load_baseline_comparison(&baseline_path, &runs).expect("compare");

    assert_eq!(comparison.regressions_count, 1);
    assert_eq!(comparison.regressions[0].fixture_id, "cli-help-flag");
    assert_eq!(comparison.recoveries_count, 1);
    assert_eq!(comparison.recoveries[0].fixture_id, "test-output-first");
    assert_eq!(comparison.unchanged_passes, vec!["python-add".to_string()]);
    assert_eq!(
        comparison.net_lift_pp, 0.0,
        "+1 recovery and -1 regression should net to 0pp lift across 3 compared fixtures"
    );
}
2332
/// Exercises `resolve_fixtures` three ways: the `all` keyword yields as many
/// fixtures as `FIXTURE_DEFINITIONS` holds, an explicit list with a repeated
/// id yields each requested fixture once in request order, and an unknown id
/// is rejected with an error naming it.
#[test]
fn fixture_selection_supports_all_and_specific_ids() {
    let everything = resolve_fixtures(&["all".to_string()]).expect("all fixtures resolve");
    assert_eq!(everything.len(), FIXTURE_DEFINITIONS.len());

    // "python-add" is listed twice on purpose; only one entry is expected back.
    let requested = ["python-add", "python-add", "read-only-audit"].map(String::from);
    let picked = resolve_fixtures(&requested).expect("specific fixtures resolve");
    let picked_ids: Vec<_> = picked.iter().map(|fixture| fixture.id).collect();
    assert_eq!(picked_ids, vec!["python-add", "read-only-audit"]);

    let rejection = resolve_fixtures(&["missing".to_string()]).expect_err("unknown fixture fails");
    assert!(rejection.contains("unsupported --fixture `missing`"));
}
2355
/// Checks that the `max_runs` argument of `build_matrix` truncates the
/// fixture x selector x tool-format product: a cap of 3 keeps the first
/// three combinations (tool formats vary fastest within a fixture), and a
/// cap of 0 produces an empty matrix.
#[test]
fn matrix_max_runs_bounds_fixture_model_tool_product() {
    let fixtures = resolve_fixtures(&["all".to_string()]).expect("fixtures");
    let selectors = vec![ModelSelector {
        selector: "mock:mock".to_string(),
        provider: "mock".to_string(),
        model: "mock".to_string(),
    }];
    let tool_formats = vec!["native".to_string(), "text".to_string()];

    let capped = build_matrix(&fixtures, &selectors, &tool_formats, Some(3));
    assert_eq!(capped.len(), 3);

    // Project each cell down to (fixture id, tool format) to pin the ordering.
    let cells: Vec<_> = capped
        .iter()
        .map(|(fixture, _selector, tool_format)| (fixture.id, tool_format.as_str()))
        .collect();
    let expected_cells = vec![
        ("python-add", "native"),
        ("python-add", "text"),
        ("cli-help-flag", "native"),
    ];
    assert_eq!(cells, expected_cells);

    let zero_capped = build_matrix(&fixtures, &selectors, &tool_formats, Some(0));
    assert!(zero_capped.is_empty());
}
2383}