Skip to main content

harn_cli/commands/
eval_coding_agent.rs

1//! `harn eval coding-agent` - empirical preset/provider benchmark for a
2//! small coding-agent fixture suite.
3//!
4//! ## Dispatch boundary
5//!
6//! The **matrix execution pipeline** (fixture resolution, model
7//! discovery, per-cell `execute_run` invocation, Ollama snapshot/
8//! cleanup, scoring, rollups, native/text comparisons, follow-up
9//! generation, baseline diff) stays in Rust. Every cell drives the
10//! embedded `coding_agent_suite.harn` driver through `execute_run`,
11//! which itself reaches into VM internals (`commands::run`,
12//! `harn_vm::llm`, `commands::local::runtime`) that are not exposed as
13//! script capabilities.
14//!
15//! The **rendering layer** (the `summary.md` body, the `followups.md`
16//! body, the one-line human stdout summary, the `--json` pretty form)
17//! is delegated to
18//! `crates/harn-stdlib/src/stdlib/cli/eval/coding_agent.harn`. The
19//! Rust shim pre-serialises the assembled `EvalSummary` to JSON,
20//! forwards it via [`CODING_AGENT_SUMMARY_ENV`], dispatches four
21//! times (markdown for `summary.md`, followups for `followups.md`,
22//! then either the summary line or the `--json` pretty form for
23//! stdout), and writes the captured payloads to disk / real stdout.
24//!
25//! The on-disk JSON artifacts (`summary.json`, `per_run.jsonl`,
26//! `local_readiness.json`) stay on the serde-driven Rust path because
27//! Harn's `json_stringify_pretty` sorts dict keys alphabetically and
28//! the on-disk format is consumed by the experiment driver in
29//! `experiments/step-judge/run.sh`, the local-readiness regression
30//! check, and hosted ingestion — all of which depend on the serde
31//! struct-field byte order.
32//!
33//! `HARN_CLI_IMPL=rust` keeps the direct-render path available for
34//! parity snapshot coverage.
35
36use std::collections::{BTreeMap, BTreeSet, HashSet};
37use std::ffi::OsString;
38use std::fs;
39use std::io::Write as _;
40use std::path::{Path, PathBuf};
41
42use harn_vm::clock::{Clock, RealClock};
43use serde::Serialize;
44use serde_json::Value as JsonValue;
45
46use crate::cli::EvalCodingAgentArgs;
47use crate::commands::eval_coding_agent_preset::{
48    resolve_step_judge_json, resolve_structural_validator_json,
49};
50use crate::commands::eval_model_selector::{
51    resolve_selector, selector_is_local, selector_label, ModelSelector,
52};
53use crate::commands::local::runtime::{
54    local_provider_ids, ollama_unload_model, snapshot_provider, LocalProviderSnapshot,
55};
56use crate::commands::local_readiness;
57use crate::commands::run::{
58    execute_run_with_sandbox_options, CliLlmMockMode, RunProfileOptions, RunSandboxOptions,
59};
60use crate::commands::tool_mode_parity::{
61    self, ToolModeParityFixtureInput, ToolModeParityPairSummary, TOOL_MODE_PARITY_DIRECTORY,
62    TOOL_MODE_PARITY_FIXTURE_SUITE, TOOL_MODE_PARITY_OVERLAY_FILENAME,
63};
64use crate::dispatch;
65use crate::env_guard::ScopedEnvVar;
66
/// Env var the embedded `cli/eval/coding_agent` script reads to pick
/// up the pre-serialised [`EvalSummary`]. The Rust shim does all the
/// matrix execution and scoring and hands the script the assembled
/// summary so it only has to format it.
const CODING_AGENT_SUMMARY_ENV: &str = "HARN_EVAL_CODING_AGENT_SUMMARY_JSON";

/// Env var the script reads to pick the rendering mode — one of
/// `"markdown"` (summary.md body), `"followups"` (followups.md body),
/// `"summary"` (one-line stdout summary), or `"json"` (--json pretty
/// form). Defaulted to `"summary"` if unset so the script stays robust
/// against future Rust-side bugs.
const CODING_AGENT_MODE_ENV: &str = "HARN_EVAL_CODING_AGENT_MODE";

/// Serialises the dispatch-render path so concurrent in-process
/// callers (the existing `eval_coding_agent_cli` integration test plus
/// any future fanout caller) don't race on the global env vars the
/// Rust shim sets to hand the report off to the .harn script. The CLI
/// binary itself is single-call, so this mutex is uncontended in
/// production; in tests it serialises the dispatch window only —
/// matrix execution still parallelises freely.
///
/// Matches the other eval render shims so the cross-script env-var handoff
/// stays consistent across the eval cluster.
static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());

/// Source text of the `coding_agent_suite.harn` driver, embedded at
/// compile time. `run_matrix_entry` writes it into each run directory
/// before executing that run.
const CODING_AGENT_SUITE_HARN: &str = include_str!("../../assets/evals/coding_agent_suite.harn");
/// Prefix that identifies a tool-format override warning line in a
/// run's captured stderr. The first matching line is re-emitted on
/// this process's stderr (see `tool_format_override_warning_line`).
const TOOL_FORMAT_OVERRIDE_WARNING_PREFIX: &str = "warning: tool_format override:";
94
/// Static description of one fixture in the built-in benchmark suite.
///
/// The type is `Copy`, so a definition is passed by value into each run.
#[derive(Debug, Clone, Copy)]
struct FixtureDefinition {
    /// Identifier forwarded to the embedded driver as `--fixture` and
    /// copied into `RunReport::fixture_id`.
    id: &'static str,
    /// Human-readable name, copied into `RunReport::fixture_name`.
    name: &'static str,
    /// Tool-usage class label, copied into
    /// `RunReport::fixture_tool_sequence`. The built-in table uses
    /// `"multi-tool"`, `"one-tool"` and `"no-tool"`.
    tool_sequence: &'static str,
    /// One-sentence description of what the fixture exercises.
    description: &'static str,
}
102
/// The built-in coding-agent fixture suite, in declaration order.
///
/// `resolve_fixtures` returns a copy of this whole table when the
/// requested fixture list contains `all`.
static FIXTURE_DEFINITIONS: &[FixtureDefinition] = &[
    FixtureDefinition {
        id: "python-add",
        name: "Python add repair",
        tool_sequence: "multi-tool",
        description: "One-file Python bug fix verified by unittest output.",
    },
    FixtureDefinition {
        id: "cli-help-flag",
        name: "CLI help flag",
        tool_sequence: "multi-tool",
        description: "Add a tiny CLI flag, update help-facing docs, and verify behavior.",
    },
    FixtureDefinition {
        id: "test-output-first",
        name: "Test-output-first repair",
        tool_sequence: "multi-tool",
        description: "Run a failing test first, then edit the implementation and re-run it.",
    },
    FixtureDefinition {
        id: "docs-symbol-rename",
        name: "Docs symbol rename",
        tool_sequence: "multi-tool",
        description:
            "Update docs and an example after a symbol rename without touching implementation.",
    },
    FixtureDefinition {
        id: "read-only-audit",
        name: "Read-only audit",
        tool_sequence: "one-tool",
        description: "Inspect a file and report that no edits are needed.",
    },
    FixtureDefinition {
        id: "no-tool-diagnosis",
        name: "No-tool diagnosis",
        tool_sequence: "no-tool",
        description: "Answer from prompt-only context without any tools.",
    },
];
142
/// One entry of the `env_keys_loaded` list that `load_env_files`
/// returns and that is serialised as `EvalSummary::env_keys_loaded`.
#[derive(Debug, Clone, Serialize)]
struct LoadedEnvKey {
    /// Name of the environment variable.
    key: String,
    /// Where the key came from.
    // NOTE(review): presumably the env file path it was read from —
    // confirm against `load_env_files`, which is outside this section.
    source: String,
}
148
/// Guard that remembers the prior state of process environment
/// variables and puts it back when dropped.
#[derive(Debug)]
struct EnvOverlay {
    /// `(key, prior value)` pairs in the order they were recorded.
    /// A prior value of `None` means the key was not set and is removed
    /// again on drop.
    previous: Vec<(OsString, Option<OsString>)>,
}

impl Drop for EnvOverlay {
    /// Restores every recorded variable, walking the list back to front
    /// so that a key recorded more than once ends up with its
    /// earliest-recorded value.
    fn drop(&mut self) {
        self.previous
            .iter()
            .rev()
            .for_each(|(key, saved)| match saved {
                Some(value) => std::env::set_var(key, value),
                None => std::env::remove_var(key),
            });
    }
}
165
/// Result record for one matrix cell (fixture × model selector × tool
/// format). Built by `run_matrix_entry` through `report_from_summary`,
/// `error_report` or `skipped_report`.
///
/// Field order is part of the serialised output; the module docs note
/// that downstream consumers depend on the serde struct-field order.
#[derive(Debug, Clone, Serialize)]
struct RunReport {
    /// Identifier from `run_id_for`; also the name of the run's
    /// directory under the invocation's output directory.
    run_id: String,
    /// Copied from `FixtureDefinition::id`.
    fixture_id: String,
    /// Copied from `FixtureDefinition::name`.
    fixture_name: String,
    /// Copied from `FixtureDefinition::tool_sequence`.
    fixture_tool_sequence: String,
    /// Provider/model pair the run targeted.
    selector: ModelSelector,
    /// Tool format requested for this run.
    tool_format: String,
    /// `"passed"`, `"failed"`, `"skipped"`, `"infra_error"`, or — when
    /// the run exited non-zero — the `status` string from the harness
    /// summary.
    status: String,
    /// True only when the harness summary reports `passed` and the run
    /// exited with code 0.
    passed: bool,
    /// True when the run was not executed (see `skipped_report`).
    skipped: bool,
    /// Why the run was skipped; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    skipped_reason: Option<String>,
    /// Display form of the run directory.
    output_dir: String,
    /// Display form of `<run dir>/transcript_events.jsonl`.
    transcript_events_path: String,
    /// `workspace_root` from the harness summary, if it reported one.
    workspace_root: Option<String>,
    /// Time measured by this process around the script execution;
    /// 0 for runs that never executed.
    elapsed_ms: u64,
    /// `duration_ms` from the harness summary, falling back to
    /// `elapsed_ms` when the summary has none.
    duration_ms: u64,
    /// `/llm/iterations` from the harness summary (0 if absent).
    iterations: i64,
    /// `/llm/input_tokens` from the harness summary (0 if absent).
    input_tokens: i64,
    /// `/llm/output_tokens` from the harness summary (0 if absent).
    output_tokens: i64,
    /// Cost computed from token counts and per-1k pricing; 0.0 when no
    /// pricing is known for the provider/model.
    cost_usd: f64,
    /// Whether pricing was found for the provider/model.
    pricing_known: bool,
    /// Number of entries in the summary's `/tools/calls` array.
    tool_calls: usize,
    /// Number of entries in the summary's `/tools/rejected` array.
    rejected_tool_calls: usize,
    /// Tool names derived from `/tools/calls`, falling back to a
    /// non-empty `/tools/successful` list, else empty.
    tool_sequence: Vec<String>,
    /// Strings from the summary's `/tools/successful` array.
    successful_tools: Vec<String>,
    /// `transcript_event_count` from the harness summary (0 if absent).
    transcript_event_count: usize,
    /// `/verification/success` from the harness summary (false if absent).
    verification_success: bool,
    /// Exit code of the script execution; 1 for reports from
    /// `error_report`, 0 for skipped runs.
    harn_exit_code: i32,
    /// Failure description; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Excerpt of the run's captured stderr; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    stderr_excerpt: Option<String>,
    /// Outcome of local-provider cleanup, when the selector is local
    /// and a provider snapshot could be taken.
    local_cleanup: Option<LocalCleanupReport>,
}
202
/// What `LocalRunGuard::cleanup` did with a local provider's model
/// after a run.
#[derive(Debug, Clone, Serialize)]
struct LocalCleanupReport {
    /// Provider id from the run's selector.
    provider: String,
    /// Model name from the run's selector.
    model: String,
    /// Whether the model was already listed as loaded in the snapshot
    /// taken before the run. Always false for non-Ollama providers.
    initially_loaded: bool,
    /// One of `"not_applicable"`, `"left_running"`, `"left_preexisting"`,
    /// `"unloaded"` or `"unload_failed"`.
    action: String,
    /// Extra context for the action (for example the unload error);
    /// omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    detail: Option<String>,
}
212
/// Side-by-side record of the native and text tool-format runs for one
/// fixture and selector (the module docs call these "native/text
/// comparisons").
///
/// The code that fills this struct is outside this section, so the
/// field notes below are limited to what the names and types show.
// NOTE(review): the `Option` fields are presumably `None` when the
// corresponding run is missing from the matrix — confirm at the
// construction site.
#[derive(Debug, Clone, Serialize)]
struct FormatComparison {
    fixture_id: String,
    selector: ModelSelector,
    native_run_id: Option<String>,
    text_run_id: Option<String>,
    native_evidence_path: Option<String>,
    text_evidence_path: Option<String>,
    native_status: Option<String>,
    text_status: Option<String>,
    native_passed: Option<bool>,
    text_passed: Option<bool>,
    native_tool_call_count: Option<usize>,
    text_tool_call_count: Option<usize>,
    native_rejected_tool_call_count: Option<usize>,
    text_rejected_tool_call_count: Option<usize>,
    verifier_match: Option<bool>,
    tool_sequence_match: Option<bool>,
    // The three deltas are named "text minus native": a positive value
    // would mean the text-format run had the larger count.
    rejected_tool_call_delta_text_minus_native: Option<i64>,
    token_delta_text_minus_native: Option<i64>,
    iteration_delta_text_minus_native: Option<i64>,
    equivalent: Option<bool>,
    divergence_reasons: Vec<String>,
    evidence_paths: Vec<String>,
}
238
/// One suggested follow-up item, serialised as part of
/// `EvalSummary::followups`.
// NOTE(review): generation happens outside this section; the fields
// look like an issue draft (title/body/labels) linked to the runs that
// motivated it — confirm against the follow-up generator.
#[derive(Debug, Clone, Serialize)]
struct FollowupSuggestion {
    title: String,
    body: String,
    labels: Vec<String>,
    /// Ids of related runs (see `RunReport::run_id`).
    run_ids: Vec<String>,
}
246
/// Owned, serialisable fixture description carried in
/// `EvalSummary::fixtures`. It has the same four fields as
/// `FixtureDefinition`, as `String`s instead of `&'static str`.
// NOTE(review): presumably converted one-to-one from
// `FixtureDefinition` in `build_summary` — confirm there.
#[derive(Debug, Clone, Serialize)]
struct FixtureReport {
    id: String,
    name: String,
    tool_sequence: String,
    description: String,
}
254
/// Aggregated run counts and cost for one grouping key; the element
/// type of every list in `EvalRollups`.
// NOTE(review): the aggregation code is outside this section; the
// counts presumably partition `total_runs` into passed/failed/skipped —
// confirm at the rollup builder.
#[derive(Debug, Clone, Serialize)]
struct RollupReport {
    /// Value of the grouping dimension this row summarises.
    key: String,
    total_runs: usize,
    passed_runs: usize,
    failed_runs: usize,
    skipped_runs: usize,
    total_cost_usd: f64,
}
264
/// Rollups of the invocation's runs, one list per grouping dimension.
/// Serialised as `EvalSummary::rollups`.
// NOTE(review): each list presumably holds one `RollupReport` per
// distinct value of the named dimension — confirm at the builder.
#[derive(Debug, Clone, Serialize)]
struct EvalRollups {
    by_fixture: Vec<RollupReport>,
    by_provider: Vec<RollupReport>,
    by_model: Vec<RollupReport>,
    by_tool_format: Vec<RollupReport>,
    by_tool_sequence: Vec<RollupReport>,
}
273
/// Aggregate result of one `harn eval coding-agent` invocation.
///
/// `run` assembles it with `build_summary`, writes it through
/// `write_json_artifacts`, and hands it to the rendering layer.
///
/// Do not reorder fields casually: the module docs state that the
/// on-disk JSON consumers depend on the serde struct-field order.
#[derive(Debug, Clone, Serialize)]
struct EvalSummary {
    schema_version: u32,
    fixture_ids: Vec<String>,
    fixtures: Vec<FixtureReport>,
    /// Output directory of the whole invocation.
    output_dir: String,
    models: Vec<ModelSelector>,
    tool_formats: Vec<String>,
    /// Environment keys reported by `load_env_files`.
    env_keys_loaded: Vec<LoadedEnvKey>,
    total_runs: usize,
    passed_runs: usize,
    failed_runs: usize,
    skipped_runs: usize,
    diverged_comparisons: usize,
    total_cost_usd: f64,
    rollups: EvalRollups,
    /// One report per executed, skipped or errored matrix cell.
    runs: Vec<RunReport>,
    comparisons: Vec<FormatComparison>,
    parity_by_pair: Vec<ToolModeParityPairSummary>,
    followups: Vec<FollowupSuggestion>,
    /// Step-judge preset applied to all runs in this invocation, if any.
    /// Used by the experiment driver (experiments/step-judge/run.sh) to
    /// group repeat invocations into cells.
    #[serde(skip_serializing_if = "Option::is_none")]
    step_judge_preset: Option<String>,
    /// Free-form label for grouping repeat invocations (e.g.
    /// "replicate-1", "probe-rubric-adversarial"). Empty when unset.
    #[serde(skip_serializing_if = "String::is_empty")]
    run_label: String,
    /// Optional per-fixture diff against a prior run's `summary.json`,
    /// listing regressions (baseline passed, this cell failed) and
    /// recoveries (baseline failed, this cell passed) plus aggregate
    /// counts and a net lift in percentage points. Populated when the
    /// caller passes `--baseline-comparison-against <path>` (harn#2318).
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_comparison: Option<BaselineComparison>,
}
311
/// Per-fixture diff of this invocation ("the cell") against a prior
/// run's `summary.json` ("the baseline"). Produced by
/// `load_baseline_comparison` and stored in
/// `EvalSummary::baseline_comparison`.
#[derive(Debug, Clone, Serialize, Default)]
struct BaselineComparison {
    /// `output_dir` or `run_label` of the baseline summary, for context.
    baseline_label: String,
    /// Resolved path to the baseline `summary.json` that was diffed against.
    baseline_path: String,
    /// Fixtures where the baseline passed and this cell failed.
    regressions: Vec<FixtureStatusDelta>,
    /// Fixtures where the baseline failed and this cell passed.
    recoveries: Vec<FixtureStatusDelta>,
    /// Fixtures that passed in both runs.
    unchanged_passes: Vec<String>,
    /// Fixtures that failed in both runs.
    unchanged_failures: Vec<String>,
    /// Fixtures present in only one of the two runs (skipped from the
    /// diff but listed for visibility).
    missing_in_baseline: Vec<String>,
    // NOTE(review): by its name, the counterpart of
    // `missing_in_baseline` — fixtures the baseline has but this cell
    // lacks; confirm in `load_baseline_comparison`.
    missing_in_cell: Vec<String>,
    regressions_count: usize,
    recoveries_count: usize,
    /// `(recoveries_count - regressions_count) / total_fixtures_compared * 100`,
    /// rounded to one decimal place. Negative when the cell regresses more
    /// than it recovers.
    net_lift_pp: f64,
}
335
/// A fixture whose status differs between the baseline and this cell;
/// the element type of `BaselineComparison::regressions` and
/// `BaselineComparison::recoveries`.
#[derive(Debug, Clone, Serialize)]
struct FixtureStatusDelta {
    /// Fixture the two statuses belong to.
    fixture_id: String,
    /// Status recorded for the fixture in the baseline summary.
    baseline_status: String,
    /// Status recorded for the fixture in this invocation.
    cell_status: String,
}
342
/// State captured before a run against a local provider so that
/// `cleanup` can decide afterwards whether to unload the model.
/// Only created for selectors that `selector_is_local` accepts.
struct LocalRunGuard {
    /// Clone of the run's provider/model selector.
    selector: ModelSelector,
    /// Whether cleanup may unload the model (the caller passes
    /// `!args.keep_local_after_run`).
    stop_after: bool,
    /// Provider snapshot taken before the run; `None` when taking the
    /// snapshot failed, in which case `cleanup` reports nothing.
    snapshot: Option<LocalProviderSnapshot>,
}
348
/// Everything `report_from_summary` needs besides the harness summary
/// JSON itself, bundled so the function takes two arguments.
struct RunSummaryContext {
    /// Identifier of the run (see `RunReport::run_id`).
    run_id: String,
    /// Fixture the run exercised.
    fixture: FixtureDefinition,
    /// Provider/model the run targeted.
    selector: ModelSelector,
    /// Tool format requested for the run.
    tool_format: String,
    /// Directory holding the run's artifacts.
    run_dir: PathBuf,
    /// Time measured by this process around the script execution.
    elapsed_ms: u64,
    /// Exit code of the script execution.
    exit_code: i32,
    /// Captured stderr of the script execution.
    stderr: String,
    /// Result of local-provider cleanup, if any was attempted.
    local_cleanup: Option<LocalCleanupReport>,
}
360
361pub async fn run(args: EvalCodingAgentArgs) -> i32 {
362    let output_dir = args.output.clone().unwrap_or_else(default_output_dir);
363    if let Err(error) = fs::create_dir_all(&output_dir) {
364        eprintln!("error: failed to create {}: {error}", output_dir.display());
365        return 1;
366    }
367
368    let (_env_guard, env_keys_loaded) = match load_env_files(&args.env_files) {
369        Ok(loaded) => loaded,
370        Err(error) => {
371            eprintln!("error: {error}");
372            return 1;
373        }
374    };
375
376    let fixtures = match resolve_fixtures(&args.fixtures) {
377        Ok(fixtures) => fixtures,
378        Err(error) => {
379            eprintln!("error: {error}");
380            return 2;
381        }
382    };
383    let models = match resolve_models(&args).await {
384        Ok(models) => models,
385        Err(error) => {
386            eprintln!("error: {error}");
387            return 1;
388        }
389    };
390    let tool_formats = match normalize_tool_formats(&args.tool_formats) {
391        Ok(formats) => formats,
392        Err(error) => {
393            eprintln!("error: {error}");
394            return 2;
395        }
396    };
397    let matrix = build_matrix(&fixtures, &models, &tool_formats, args.max_runs);
398    if matrix.is_empty() {
399        eprintln!("error: no coding-agent benchmark runs selected");
400        return 2;
401    }
402
403    let mut reports = Vec::new();
404    let mut had_error = false;
405    for (fixture, selector, tool_format) in matrix {
406        let report = run_matrix_entry(&args, &output_dir, fixture, selector, tool_format).await;
407        if !report.passed && !report.skipped {
408            had_error = true;
409        }
410        if report.skipped && args.fail_on_unauthorized {
411            had_error = true;
412        }
413        eprintln!(
414            "{} {} {}: {}",
415            report.fixture_id,
416            selector_label(&report.selector),
417            report.tool_format,
418            report.status
419        );
420        reports.push(report);
421    }
422
423    let baseline_comparison = match &args.baseline_comparison_against {
424        Some(path) => match load_baseline_comparison(path, &reports) {
425            Ok(comparison) => Some(comparison),
426            Err(error) => {
427                eprintln!("error: --baseline-comparison-against: {error}");
428                return 1;
429            }
430        },
431        None => None,
432    };
433    let summary = build_summary(
434        &output_dir,
435        fixtures,
436        models,
437        tool_formats,
438        env_keys_loaded,
439        reports,
440        args.step_judge
441            .clone()
442            .filter(|s| !s.is_empty() && s != "none"),
443        args.run_label.clone(),
444        baseline_comparison,
445    );
446    // The JSON artifacts (summary.json, per_run.jsonl,
447    // local_readiness.json) always stay on the serde-driven Rust path —
448    // see module docstring for the byte-format rationale. They write
449    // before any rendering so a render failure doesn't leave a partially
450    // written report directory.
451    if let Err(error) = write_json_artifacts(&output_dir, &summary) {
452        eprintln!("error: failed to write benchmark outputs: {error}");
453        return 1;
454    }
455
456    // `HARN_CLI_IMPL=rust` keeps the legacy direct-render path so the
457    // parity-snapshot harness (#2299) can compare both impls until C1
458    // (#2314) deletes this escape hatch.
459    let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
460
461    if use_legacy {
462        if let Err(error) = write_markdown_artifacts_legacy(&output_dir, &summary) {
463            eprintln!("error: {error}");
464            return 1;
465        }
466        announce_output_paths(&output_dir);
467        if args.json {
468            print_json_legacy(&summary);
469        } else {
470            print_summary_legacy(&summary);
471        }
472        return if had_error { 1 } else { 0 };
473    }
474
475    if let Err(code) = write_markdown_artifacts_dispatch(&output_dir, &summary).await {
476        return code;
477    }
478    announce_output_paths(&output_dir);
479    if args.json {
480        if let Err(code) = print_json_dispatch(&summary).await {
481            return code;
482        }
483    } else if let Err(code) = print_summary_dispatch(&summary).await {
484        return code;
485    }
486
487    if had_error {
488        1
489    } else {
490        0
491    }
492}
493
494async fn run_matrix_entry(
495    args: &EvalCodingAgentArgs,
496    output_dir: &Path,
497    fixture: FixtureDefinition,
498    selector: ModelSelector,
499    tool_format: String,
500) -> RunReport {
501    let run_id = run_id_for(fixture, &selector, &tool_format);
502    let run_dir = output_dir.join(&run_id);
503    if let Err(error) = reset_dir(&run_dir) {
504        return error_report(
505            run_id,
506            fixture,
507            selector,
508            tool_format,
509            run_dir,
510            format!("failed to prepare run directory: {error}"),
511        );
512    }
513
514    if !provider_available(&selector) {
515        let reason = format!(
516            "provider `{}` has no configured credentials",
517            selector.provider
518        );
519        return skipped_report(run_id, fixture, selector, tool_format, run_dir, reason);
520    }
521
522    let script_path = run_dir.join("coding_agent_suite.harn");
523    if let Err(error) = fs::write(&script_path, CODING_AGENT_SUITE_HARN) {
524        return error_report(
525            run_id,
526            fixture,
527            selector,
528            tool_format,
529            run_dir,
530            format!("failed to write benchmark harness: {error}"),
531        );
532    }
533
534    let local_guard = LocalRunGuard::before(&selector, !args.keep_local_after_run).await;
535    let argv = script_argv(args, fixture, &selector, &tool_format, &run_dir);
536    let clock = RealClock::new();
537    let started_ms = clock.monotonic_ms();
538    let outcome = execute_run_with_sandbox_options(
539        &script_path.to_string_lossy(),
540        false,
541        HashSet::new(),
542        argv,
543        Vec::new(),
544        CliLlmMockMode::Off,
545        None,
546        RunProfileOptions::default(),
547        RunSandboxOptions::default().with_workspace_root(run_dir.clone()),
548    )
549    .await;
550    if let Some(line) = tool_format_override_warning_line(&outcome.stderr) {
551        eprintln!("{line}");
552    }
553    let elapsed_ms = clock
554        .monotonic_ms()
555        .saturating_sub(started_ms)
556        .try_into()
557        .unwrap_or(0);
558    let local_cleanup = if let Some(guard) = local_guard {
559        guard.cleanup().await
560    } else {
561        None
562    };
563
564    let summary_value =
565        read_run_summary(&run_dir).or_else(|| parse_last_json_line(&outcome.stdout));
566    let Some(summary) = summary_value else {
567        return RunReport {
568            run_id,
569            fixture_id: fixture.id.to_string(),
570            fixture_name: fixture.name.to_string(),
571            fixture_tool_sequence: fixture.tool_sequence.to_string(),
572            selector,
573            tool_format,
574            status: "infra_error".to_string(),
575            passed: false,
576            skipped: false,
577            skipped_reason: None,
578            output_dir: run_dir.display().to_string(),
579            transcript_events_path: run_dir
580                .join("transcript_events.jsonl")
581                .display()
582                .to_string(),
583            workspace_root: None,
584            elapsed_ms,
585            duration_ms: 0,
586            iterations: 0,
587            input_tokens: 0,
588            output_tokens: 0,
589            cost_usd: 0.0,
590            pricing_known: false,
591            tool_calls: 0,
592            rejected_tool_calls: 0,
593            tool_sequence: Vec::new(),
594            successful_tools: Vec::new(),
595            transcript_event_count: 0,
596            verification_success: false,
597            harn_exit_code: outcome.exit_code,
598            error: Some("benchmark harness produced no summary JSON".to_string()),
599            stderr_excerpt: excerpt(&outcome.stderr),
600            local_cleanup,
601        };
602    };
603
604    report_from_summary(
605        RunSummaryContext {
606            run_id,
607            fixture,
608            selector,
609            tool_format,
610            run_dir,
611            elapsed_ms,
612            exit_code: outcome.exit_code,
613            stderr: outcome.stderr,
614            local_cleanup,
615        },
616        summary,
617    )
618}
619
620fn report_from_summary(ctx: RunSummaryContext, summary: JsonValue) -> RunReport {
621    let passed = summary
622        .get("passed")
623        .and_then(JsonValue::as_bool)
624        .unwrap_or(false)
625        && ctx.exit_code == 0;
626    let input_tokens = summary
627        .pointer("/llm/input_tokens")
628        .and_then(JsonValue::as_i64)
629        .unwrap_or(0);
630    let output_tokens = summary
631        .pointer("/llm/output_tokens")
632        .and_then(JsonValue::as_i64)
633        .unwrap_or(0);
634    let pricing = harn_vm::llm::llm_pricing_per_1k(&ctx.selector.provider, &ctx.selector.model);
635    let cost_usd = pricing
636        .map(|(input, output)| {
637            (input_tokens.max(0) as f64 * input + output_tokens.max(0) as f64 * output) / 1000.0
638        })
639        .unwrap_or(0.0);
640    let status = if passed {
641        "passed".to_string()
642    } else if ctx.exit_code == 0 {
643        "failed".to_string()
644    } else {
645        summary
646            .get("status")
647            .and_then(JsonValue::as_str)
648            .unwrap_or("failed")
649            .to_string()
650    };
651    RunReport {
652        run_id: ctx.run_id,
653        fixture_id: ctx.fixture.id.to_string(),
654        fixture_name: ctx.fixture.name.to_string(),
655        fixture_tool_sequence: ctx.fixture.tool_sequence.to_string(),
656        selector: ctx.selector,
657        tool_format: ctx.tool_format,
658        status,
659        passed,
660        skipped: false,
661        skipped_reason: None,
662        output_dir: ctx.run_dir.display().to_string(),
663        transcript_events_path: ctx
664            .run_dir
665            .join("transcript_events.jsonl")
666            .display()
667            .to_string(),
668        workspace_root: summary
669            .get("workspace_root")
670            .and_then(JsonValue::as_str)
671            .map(str::to_string),
672        elapsed_ms: ctx.elapsed_ms,
673        duration_ms: summary
674            .get("duration_ms")
675            .and_then(JsonValue::as_u64)
676            .unwrap_or(ctx.elapsed_ms),
677        iterations: summary
678            .pointer("/llm/iterations")
679            .and_then(JsonValue::as_i64)
680            .unwrap_or(0),
681        input_tokens,
682        output_tokens,
683        cost_usd,
684        pricing_known: pricing.is_some(),
685        tool_calls: summary
686            .pointer("/tools/calls")
687            .and_then(JsonValue::as_array)
688            .map(Vec::len)
689            .unwrap_or(0),
690        rejected_tool_calls: summary
691            .pointer("/tools/rejected")
692            .and_then(JsonValue::as_array)
693            .map(Vec::len)
694            .unwrap_or(0),
695        tool_sequence: tool_call_sequence(summary.pointer("/tools/calls"))
696            .or_else(|| non_empty_string_array(summary.pointer("/tools/successful")))
697            .unwrap_or_default(),
698        successful_tools: string_array(summary.pointer("/tools/successful")),
699        transcript_event_count: summary
700            .get("transcript_event_count")
701            .and_then(JsonValue::as_u64)
702            .unwrap_or(0) as usize,
703        verification_success: summary
704            .pointer("/verification/success")
705            .and_then(JsonValue::as_bool)
706            .unwrap_or(false),
707        harn_exit_code: ctx.exit_code,
708        error: (!passed).then(|| {
709            summary
710                .get("status")
711                .and_then(JsonValue::as_str)
712                .unwrap_or("benchmark failed")
713                .to_string()
714        }),
715        stderr_excerpt: excerpt(&ctx.stderr),
716        local_cleanup: ctx.local_cleanup,
717    }
718}
719
720impl LocalRunGuard {
721    async fn before(selector: &ModelSelector, stop_after: bool) -> Option<Self> {
722        if !selector_is_local(selector) {
723            return None;
724        }
725        let snapshot = snapshot_provider(&selector.provider, Path::new("."))
726            .await
727            .ok();
728        Some(Self {
729            selector: selector.clone(),
730            stop_after,
731            snapshot,
732        })
733    }
734
735    async fn cleanup(self) -> Option<LocalCleanupReport> {
736        let snapshot = self.snapshot?;
737        if self.selector.provider != "ollama" {
738            return Some(LocalCleanupReport {
739                provider: self.selector.provider,
740                model: self.selector.model,
741                initially_loaded: false,
742                action: "not_applicable".to_string(),
743                detail: Some(
744                    "non-Ollama local providers are only stopped when Harn launched a managed server"
745                        .to_string(),
746                ),
747            });
748        }
749        let initially_loaded = snapshot
750            .loaded_models
751            .iter()
752            .any(|loaded| loaded.name == self.selector.model);
753        if !self.stop_after {
754            return Some(LocalCleanupReport {
755                provider: self.selector.provider,
756                model: self.selector.model,
757                initially_loaded,
758                action: "left_running".to_string(),
759                detail: Some("--keep-local-after-run".to_string()),
760            });
761        }
762        if initially_loaded {
763            return Some(LocalCleanupReport {
764                provider: self.selector.provider,
765                model: self.selector.model,
766                initially_loaded,
767                action: "left_preexisting".to_string(),
768                detail: None,
769            });
770        }
771        match ollama_unload_model(&snapshot.base_url, &self.selector.model).await {
772            Ok(()) => Some(LocalCleanupReport {
773                provider: self.selector.provider,
774                model: self.selector.model,
775                initially_loaded,
776                action: "unloaded".to_string(),
777                detail: None,
778            }),
779            Err(error) => Some(LocalCleanupReport {
780                provider: self.selector.provider,
781                model: self.selector.model,
782                initially_loaded,
783                action: "unload_failed".to_string(),
784                detail: Some(error),
785            }),
786        }
787    }
788}
789
790fn script_argv(
791    args: &EvalCodingAgentArgs,
792    fixture: FixtureDefinition,
793    selector: &ModelSelector,
794    tool_format: &str,
795    run_dir: &Path,
796) -> Vec<String> {
797    let mut argv = vec![
798        "--fixture".to_string(),
799        fixture.id.to_string(),
800        "--output-dir".to_string(),
801        run_dir.display().to_string(),
802        "--provider".to_string(),
803        selector.provider.clone(),
804        "--model".to_string(),
805        selector.model.clone(),
806        "--tool-format".to_string(),
807        tool_format.to_string(),
808        "--max-iterations".to_string(),
809        args.max_iterations.to_string(),
810        "--python".to_string(),
811        args.python.clone(),
812    ];
813    if selector.provider == "mock" {
814        argv.push("--seed-mock".to_string());
815    }
816    if let Some(json) = resolve_step_judge_json(args, selector) {
817        argv.push("--step-judge-json".to_string());
818        argv.push(json);
819    }
820    if let Some(reason) = args
821        .override_reason
822        .as_deref()
823        .map(str::trim)
824        .filter(|reason| !reason.is_empty())
825    {
826        argv.push("--override-reason".to_string());
827        argv.push(reason.to_string());
828    }
829    if let Some(json) = resolve_structural_validator_json(args) {
830        argv.push("--structural-validator-json".to_string());
831        argv.push(json);
832    }
833    argv
834}
835
836fn tool_format_override_warning_line(stderr: &str) -> Option<&str> {
837    stderr
838        .lines()
839        .map(str::trim)
840        .find(|line| line.starts_with(TOOL_FORMAT_OVERRIDE_WARNING_PREFIX))
841}
842
843fn error_report(
844    run_id: String,
845    fixture: FixtureDefinition,
846    selector: ModelSelector,
847    tool_format: String,
848    run_dir: PathBuf,
849    error: String,
850) -> RunReport {
851    RunReport {
852        run_id,
853        fixture_id: fixture.id.to_string(),
854        fixture_name: fixture.name.to_string(),
855        fixture_tool_sequence: fixture.tool_sequence.to_string(),
856        selector,
857        tool_format,
858        status: "infra_error".to_string(),
859        passed: false,
860        skipped: false,
861        skipped_reason: None,
862        output_dir: run_dir.display().to_string(),
863        transcript_events_path: run_dir
864            .join("transcript_events.jsonl")
865            .display()
866            .to_string(),
867        workspace_root: None,
868        elapsed_ms: 0,
869        duration_ms: 0,
870        iterations: 0,
871        input_tokens: 0,
872        output_tokens: 0,
873        cost_usd: 0.0,
874        pricing_known: false,
875        tool_calls: 0,
876        rejected_tool_calls: 0,
877        tool_sequence: Vec::new(),
878        successful_tools: Vec::new(),
879        transcript_event_count: 0,
880        verification_success: false,
881        harn_exit_code: 1,
882        error: Some(error),
883        stderr_excerpt: None,
884        local_cleanup: None,
885    }
886}
887
888fn skipped_report(
889    run_id: String,
890    fixture: FixtureDefinition,
891    selector: ModelSelector,
892    tool_format: String,
893    run_dir: PathBuf,
894    reason: String,
895) -> RunReport {
896    RunReport {
897        run_id,
898        fixture_id: fixture.id.to_string(),
899        fixture_name: fixture.name.to_string(),
900        fixture_tool_sequence: fixture.tool_sequence.to_string(),
901        selector,
902        tool_format,
903        status: "skipped".to_string(),
904        passed: false,
905        skipped: true,
906        skipped_reason: Some(reason),
907        output_dir: run_dir.display().to_string(),
908        transcript_events_path: run_dir
909            .join("transcript_events.jsonl")
910            .display()
911            .to_string(),
912        workspace_root: None,
913        elapsed_ms: 0,
914        duration_ms: 0,
915        iterations: 0,
916        input_tokens: 0,
917        output_tokens: 0,
918        cost_usd: 0.0,
919        pricing_known: false,
920        tool_calls: 0,
921        rejected_tool_calls: 0,
922        tool_sequence: Vec::new(),
923        successful_tools: Vec::new(),
924        transcript_event_count: 0,
925        verification_success: false,
926        harn_exit_code: 0,
927        error: None,
928        stderr_excerpt: None,
929        local_cleanup: None,
930    }
931}
932
933fn provider_available(selector: &ModelSelector) -> bool {
934    if matches!(selector.provider.as_str(), "mock" | "fake") || selector_is_local(selector) {
935        return true;
936    }
937    harn_vm::llm_config::provider_key_available(&selector.provider)
938}
939
940fn resolve_fixtures(raw_fixtures: &[String]) -> Result<Vec<FixtureDefinition>, String> {
941    let mut seen = BTreeSet::new();
942    let mut out = Vec::new();
943    for raw in raw_fixtures {
944        let fixture = raw.trim().to_ascii_lowercase();
945        if fixture.is_empty() {
946            continue;
947        }
948        if fixture == "all" {
949            return Ok(FIXTURE_DEFINITIONS.to_vec());
950        }
951        let Some(definition) = fixture_definition(&fixture) else {
952            return Err(format!(
953                "unsupported --fixture `{fixture}`; expected one of: all, {}",
954                FIXTURE_DEFINITIONS
955                    .iter()
956                    .map(|definition| definition.id)
957                    .collect::<Vec<_>>()
958                    .join(", ")
959            ));
960        };
961        if seen.insert(definition.id) {
962            out.push(definition);
963        }
964    }
965    if out.is_empty() {
966        return Err("at least one coding-agent fixture must be selected".to_string());
967    }
968    Ok(out)
969}
970
971fn fixture_definition(id: &str) -> Option<FixtureDefinition> {
972    FIXTURE_DEFINITIONS
973        .iter()
974        .copied()
975        .find(|definition| definition.id == id)
976}
977
978async fn resolve_models(args: &EvalCodingAgentArgs) -> Result<Vec<ModelSelector>, String> {
979    let mut seen = BTreeSet::new();
980    let mut out = Vec::new();
981    for raw in normalize_model_selector_args(&args.models) {
982        let trimmed = raw.trim();
983        if trimmed.is_empty() {
984            continue;
985        }
986        let selector = resolve_selector(trimmed);
987        if seen.insert(selector_label(&selector)) {
988            out.push(selector);
989        }
990    }
991    if args.include_local {
992        for selector in discover_local_models(args).await {
993            if seen.insert(selector_label(&selector)) {
994                out.push(selector);
995            }
996        }
997    }
998    Ok(out)
999}
1000
/// Re-joins selector arguments that arrived as two separate values.
///
/// Every entry is trimmed. An entry starting with `provider=` that is
/// immediately followed by one starting with `model=` is merged into a
/// single `provider=…,model=…` string; all other entries pass through
/// unchanged (including empty ones).
fn normalize_model_selector_args(raw_models: &[String]) -> Vec<String> {
    let mut merged = Vec::new();
    let mut pending = raw_models.iter().map(|raw| raw.trim()).peekable();

    while let Some(current) = pending.next() {
        if current.starts_with("provider=") {
            // Consume the following entry only when it is the `model=` half.
            if let Some(model_part) = pending.next_if(|next| next.starts_with("model=")) {
                merged.push(format!("{current},{model_part}"));
                continue;
            }
        }
        merged.push(current.to_string());
    }

    merged
}
1019
1020async fn discover_local_models(args: &EvalCodingAgentArgs) -> Vec<ModelSelector> {
1021    let providers = if args.local_providers.is_empty() {
1022        local_provider_ids(None)
1023    } else {
1024        args.local_providers.clone()
1025    };
1026    let mut selectors = Vec::new();
1027    let mut seen = BTreeSet::new();
1028    for provider in providers {
1029        if selectors.len() >= args.max_local_models {
1030            break;
1031        }
1032        let Ok(snapshot) = snapshot_provider(&provider, Path::new(".")).await else {
1033            continue;
1034        };
1035        if !snapshot.reachable {
1036            continue;
1037        }
1038        let mut models = snapshot
1039            .loaded_models
1040            .iter()
1041            .map(|model| model.name.clone())
1042            .collect::<Vec<_>>();
1043        models.extend(snapshot.served_models);
1044        for model in models {
1045            if selectors.len() >= args.max_local_models {
1046                break;
1047            }
1048            let selector = ModelSelector {
1049                selector: format!("{provider}:{model}"),
1050                provider: provider.clone(),
1051                model,
1052            };
1053            if seen.insert(selector_label(&selector)) {
1054                selectors.push(selector);
1055            }
1056        }
1057    }
1058    selectors
1059}
1060
/// Validates and de-duplicates the raw `--tool-format` values.
///
/// Each value is trimmed and lower-cased; blank values are ignored. Only
/// `native` and `text` are accepted. The result keeps first-seen order.
///
/// # Errors
/// Returns a message naming the first unsupported value.
fn normalize_tool_formats(raw_formats: &[String]) -> Result<Vec<String>, String> {
    let mut accepted: Vec<String> = Vec::new();
    for raw in raw_formats {
        let format = raw.trim().to_ascii_lowercase();
        match format.as_str() {
            "" => continue,
            "native" | "text" => {}
            _ => {
                return Err(format!(
                    "unsupported --tool-format `{format}`; expected `native` or `text`"
                ));
            }
        }
        // At most two distinct values exist, so a linear scan is enough.
        if !accepted.contains(&format) {
            accepted.push(format);
        }
    }
    Ok(accepted)
}
1080
1081fn build_matrix(
1082    fixtures: &[FixtureDefinition],
1083    models: &[ModelSelector],
1084    tool_formats: &[String],
1085    max_runs: Option<usize>,
1086) -> Vec<(FixtureDefinition, ModelSelector, String)> {
1087    if max_runs == Some(0) {
1088        return Vec::new();
1089    }
1090    let mut matrix = Vec::new();
1091    for fixture in fixtures {
1092        for selector in models {
1093            for tool_format in tool_formats {
1094                matrix.push((*fixture, selector.clone(), tool_format.clone()));
1095                if max_runs.is_some_and(|limit| matrix.len() >= limit) {
1096                    return matrix;
1097                }
1098            }
1099        }
1100    }
1101    matrix
1102}
1103
1104#[allow(clippy::too_many_arguments)]
1105fn build_summary(
1106    output_dir: &Path,
1107    fixtures: Vec<FixtureDefinition>,
1108    models: Vec<ModelSelector>,
1109    tool_formats: Vec<String>,
1110    env_keys_loaded: Vec<LoadedEnvKey>,
1111    runs: Vec<RunReport>,
1112    step_judge_preset: Option<String>,
1113    run_label: String,
1114    baseline_comparison: Option<BaselineComparison>,
1115) -> EvalSummary {
1116    let passed_runs = runs.iter().filter(|run| run.passed).count();
1117    let skipped_runs = runs.iter().filter(|run| run.skipped).count();
1118    let failed_runs = runs
1119        .iter()
1120        .filter(|run| !run.passed && !run.skipped)
1121        .count();
1122    let total_cost_usd = runs.iter().map(|run| run.cost_usd).sum();
1123    let rollups = build_rollups(&runs);
1124    let comparisons = compare_formats(&runs);
1125    let parity_by_pair = build_parity_by_pair(&comparisons);
1126    let diverged_comparisons = comparisons
1127        .iter()
1128        .filter(|comparison| !comparison.divergence_reasons.is_empty())
1129        .count();
1130    let followups = suggest_followups(&runs, &comparisons);
1131    EvalSummary {
1132        schema_version: 3,
1133        fixture_ids: fixtures
1134            .iter()
1135            .map(|fixture| fixture.id.to_string())
1136            .collect(),
1137        fixtures: fixtures
1138            .iter()
1139            .map(|fixture| FixtureReport {
1140                id: fixture.id.to_string(),
1141                name: fixture.name.to_string(),
1142                tool_sequence: fixture.tool_sequence.to_string(),
1143                description: fixture.description.to_string(),
1144            })
1145            .collect(),
1146        output_dir: output_dir.display().to_string(),
1147        models,
1148        tool_formats,
1149        env_keys_loaded,
1150        total_runs: runs.len(),
1151        passed_runs,
1152        failed_runs,
1153        skipped_runs,
1154        diverged_comparisons,
1155        total_cost_usd,
1156        rollups,
1157        runs,
1158        comparisons,
1159        parity_by_pair,
1160        followups,
1161        step_judge_preset,
1162        run_label,
1163        baseline_comparison,
1164    }
1165}
1166
1167fn load_baseline_comparison(path: &Path, runs: &[RunReport]) -> Result<BaselineComparison, String> {
1168    let resolved = if path.is_dir() {
1169        path.join("summary.json")
1170    } else {
1171        path.to_path_buf()
1172    };
1173    let raw = fs::read_to_string(&resolved)
1174        .map_err(|e| format!("failed to read {}: {e}", resolved.display()))?;
1175    let baseline: serde_json::Value = serde_json::from_str(&raw)
1176        .map_err(|e| format!("failed to parse {} as JSON: {e}", resolved.display()))?;
1177    let baseline_runs = baseline
1178        .get("runs")
1179        .and_then(|v| v.as_array())
1180        .ok_or_else(|| format!("{} has no `runs` array", resolved.display()))?;
1181    // Index baseline status by fixture_id. When the baseline has multiple
1182    // runs per fixture (e.g. native + text), prefer the first passing run
1183    // so a fixture passes the comparison if ANY baseline variant did.
1184    let mut baseline_status: BTreeMap<String, &str> = BTreeMap::new();
1185    for run in baseline_runs {
1186        let fixture_id = match run.get("fixture_id").and_then(|v| v.as_str()) {
1187            Some(id) => id.to_string(),
1188            None => continue,
1189        };
1190        let passed = run.get("passed").and_then(|v| v.as_bool()).unwrap_or(false);
1191        let skipped = run
1192            .get("skipped")
1193            .and_then(|v| v.as_bool())
1194            .unwrap_or(false);
1195        let status = if skipped {
1196            "skipped"
1197        } else if passed {
1198            "passed"
1199        } else {
1200            "failed"
1201        };
1202        baseline_status
1203            .entry(fixture_id)
1204            .and_modify(|existing| {
1205                if *existing != "passed" && status == "passed" {
1206                    *existing = status;
1207                }
1208            })
1209            .or_insert(status);
1210    }
1211    let mut cell_status: BTreeMap<String, &str> = BTreeMap::new();
1212    for run in runs {
1213        let status = if run.skipped {
1214            "skipped"
1215        } else if run.passed {
1216            "passed"
1217        } else {
1218            "failed"
1219        };
1220        cell_status
1221            .entry(run.fixture_id.clone())
1222            .and_modify(|existing| {
1223                if *existing != "passed" && status == "passed" {
1224                    *existing = status;
1225                }
1226            })
1227            .or_insert(status);
1228    }
1229    let mut regressions = Vec::new();
1230    let mut recoveries = Vec::new();
1231    let mut unchanged_passes = Vec::new();
1232    let mut unchanged_failures = Vec::new();
1233    let mut missing_in_baseline = Vec::new();
1234    let mut missing_in_cell = Vec::new();
1235    for (fixture, cell) in &cell_status {
1236        match baseline_status.get(fixture) {
1237            None => missing_in_baseline.push(fixture.clone()),
1238            Some(base) => match (*base, *cell) {
1239                ("passed", "passed") => unchanged_passes.push(fixture.clone()),
1240                ("passed", _) => regressions.push(FixtureStatusDelta {
1241                    fixture_id: fixture.clone(),
1242                    baseline_status: (*base).to_string(),
1243                    cell_status: (*cell).to_string(),
1244                }),
1245                (_, "passed") => recoveries.push(FixtureStatusDelta {
1246                    fixture_id: fixture.clone(),
1247                    baseline_status: (*base).to_string(),
1248                    cell_status: (*cell).to_string(),
1249                }),
1250                _ => unchanged_failures.push(fixture.clone()),
1251            },
1252        }
1253    }
1254    for fixture in baseline_status.keys() {
1255        if !cell_status.contains_key(fixture) {
1256            missing_in_cell.push(fixture.clone());
1257        }
1258    }
1259    let baseline_label = baseline
1260        .get("run_label")
1261        .and_then(|v| v.as_str())
1262        .filter(|s| !s.is_empty())
1263        .or_else(|| baseline.get("output_dir").and_then(|v| v.as_str()))
1264        .unwrap_or("")
1265        .to_string();
1266    let regressions_count = regressions.len();
1267    let recoveries_count = recoveries.len();
1268    let total_compared =
1269        regressions_count + recoveries_count + unchanged_passes.len() + unchanged_failures.len();
1270    let net_lift_pp = if total_compared == 0 {
1271        0.0
1272    } else {
1273        let raw =
1274            (recoveries_count as f64 - regressions_count as f64) / total_compared as f64 * 100.0;
1275        (raw * 10.0).round() / 10.0
1276    };
1277    Ok(BaselineComparison {
1278        baseline_label,
1279        baseline_path: resolved.display().to_string(),
1280        regressions,
1281        recoveries,
1282        unchanged_passes,
1283        unchanged_failures,
1284        missing_in_baseline,
1285        missing_in_cell,
1286        regressions_count,
1287        recoveries_count,
1288        net_lift_pp,
1289    })
1290}
1291
1292fn build_rollups(runs: &[RunReport]) -> EvalRollups {
1293    EvalRollups {
1294        by_fixture: rollup_by(runs, |run| run.fixture_id.clone()),
1295        by_provider: rollup_by(runs, |run| run.selector.provider.clone()),
1296        by_model: rollup_by(runs, |run| run.selector.model.clone()),
1297        by_tool_format: rollup_by(runs, |run| run.tool_format.clone()),
1298        by_tool_sequence: rollup_by(runs, |run| run.fixture_tool_sequence.clone()),
1299    }
1300}
1301
1302fn rollup_by<F>(runs: &[RunReport], key_for: F) -> Vec<RollupReport>
1303where
1304    F: Fn(&RunReport) -> String,
1305{
1306    let mut grouped: BTreeMap<String, RollupReport> = BTreeMap::new();
1307    for run in runs {
1308        let key = key_for(run);
1309        let entry = grouped.entry(key.clone()).or_insert_with(|| RollupReport {
1310            key,
1311            total_runs: 0,
1312            passed_runs: 0,
1313            failed_runs: 0,
1314            skipped_runs: 0,
1315            total_cost_usd: 0.0,
1316        });
1317        entry.total_runs += 1;
1318        if run.passed {
1319            entry.passed_runs += 1;
1320        } else if run.skipped {
1321            entry.skipped_runs += 1;
1322        } else {
1323            entry.failed_runs += 1;
1324        }
1325        entry.total_cost_usd += run.cost_usd;
1326    }
1327    grouped.into_values().collect()
1328}
1329
1330fn compare_formats(runs: &[RunReport]) -> Vec<FormatComparison> {
1331    let mut grouped: BTreeMap<String, Vec<&RunReport>> = BTreeMap::new();
1332    for run in runs {
1333        grouped
1334            .entry(format!(
1335                "{}\0{}",
1336                run.fixture_id,
1337                selector_label(&run.selector)
1338            ))
1339            .or_default()
1340            .push(run);
1341    }
1342    let mut out = Vec::new();
1343    for group in grouped.values() {
1344        let Some(first) = group.first() else {
1345            continue;
1346        };
1347        let native = group
1348            .iter()
1349            .find(|run| run.tool_format == "native")
1350            .copied();
1351        let text = group.iter().find(|run| run.tool_format == "text").copied();
1352        if native.is_none() && text.is_none() {
1353            continue;
1354        }
1355        let pair = native.zip(text);
1356        let mut divergence_reasons = Vec::new();
1357        if let Some((native, text)) = pair {
1358            if native.status != text.status {
1359                divergence_reasons.push(format!(
1360                    "status differs: native={} text={}",
1361                    native.status, text.status
1362                ));
1363            }
1364            if native.passed != text.passed {
1365                divergence_reasons.push(format!(
1366                    "pass result differs: native={} text={}",
1367                    native.passed, text.passed
1368                ));
1369            }
1370            if native.verification_success != text.verification_success {
1371                divergence_reasons.push(format!(
1372                    "verifier result differs: native={} text={}",
1373                    native.verification_success, text.verification_success
1374                ));
1375            }
1376            if native.tool_sequence != text.tool_sequence {
1377                divergence_reasons.push(format!(
1378                    "tool sequence differs: native=[{}] text=[{}]",
1379                    native.tool_sequence.join(", "),
1380                    text.tool_sequence.join(", ")
1381                ));
1382            }
1383            if native.rejected_tool_calls != text.rejected_tool_calls {
1384                divergence_reasons.push(format!(
1385                    "rejected tool-call recovery differs: native={} text={}",
1386                    native.rejected_tool_calls, text.rejected_tool_calls
1387                ));
1388            }
1389        }
1390        let evidence_paths = [native, text]
1391            .into_iter()
1392            .flatten()
1393            .map(|run| run.transcript_events_path.clone())
1394            .collect::<Vec<_>>();
1395        out.push(FormatComparison {
1396            fixture_id: first.fixture_id.clone(),
1397            selector: first.selector.clone(),
1398            native_run_id: native.map(|run| run.run_id.clone()),
1399            text_run_id: text.map(|run| run.run_id.clone()),
1400            native_evidence_path: native.map(|run| run.transcript_events_path.clone()),
1401            text_evidence_path: text.map(|run| run.transcript_events_path.clone()),
1402            native_status: native.map(|run| run.status.clone()),
1403            text_status: text.map(|run| run.status.clone()),
1404            native_passed: native.map(|run| run.passed),
1405            text_passed: text.map(|run| run.passed),
1406            native_tool_call_count: native.map(|run| run.tool_calls),
1407            text_tool_call_count: text.map(|run| run.tool_calls),
1408            native_rejected_tool_call_count: native.map(|run| run.rejected_tool_calls),
1409            text_rejected_tool_call_count: text.map(|run| run.rejected_tool_calls),
1410            verifier_match: pair
1411                .map(|(native, text)| native.verification_success == text.verification_success),
1412            tool_sequence_match: pair
1413                .map(|(native, text)| native.tool_sequence == text.tool_sequence),
1414            rejected_tool_call_delta_text_minus_native: pair.map(|(native, text)| {
1415                text.rejected_tool_calls as i64 - native.rejected_tool_calls as i64
1416            }),
1417            token_delta_text_minus_native: pair.map(|(native, text)| {
1418                (text.input_tokens + text.output_tokens)
1419                    - (native.input_tokens + native.output_tokens)
1420            }),
1421            iteration_delta_text_minus_native: pair
1422                .map(|(native, text)| text.iterations - native.iterations),
1423            equivalent: pair.map(|(native, text)| {
1424                native.status == text.status
1425                    && native.passed == text.passed
1426                    && native.skipped == text.skipped
1427                    && native.verification_success == text.verification_success
1428                    && native.tool_sequence == text.tool_sequence
1429                    && native.rejected_tool_calls == text.rejected_tool_calls
1430            }),
1431            divergence_reasons,
1432            evidence_paths,
1433        });
1434    }
1435    out
1436}
1437
1438fn build_parity_by_pair(comparisons: &[FormatComparison]) -> Vec<ToolModeParityPairSummary> {
1439    let fixture_inputs = comparisons
1440        .iter()
1441        .filter_map(parity_fixture_input)
1442        .collect::<Vec<_>>();
1443    let fixture_reports = tool_mode_parity::build_fixture_reports(&fixture_inputs);
1444    tool_mode_parity::build_pair_summaries(&fixture_reports)
1445}
1446
1447fn parity_fixture_input(comparison: &FormatComparison) -> Option<ToolModeParityFixtureInput> {
1448    let native_verdict = comparison.native_status.clone()?;
1449    let text_verdict = comparison.text_status.clone()?;
1450    if native_verdict == "skipped" || text_verdict == "skipped" {
1451        return None;
1452    }
1453    Some(ToolModeParityFixtureInput {
1454        provider: comparison.selector.provider.clone(),
1455        model: comparison.selector.model.clone(),
1456        fixture_id: comparison.fixture_id.clone(),
1457        native_verdict,
1458        text_verdict,
1459        native_passed: comparison.native_passed?,
1460        text_passed: comparison.text_passed?,
1461        agreement: comparison.equivalent?,
1462        verifier_agreement: comparison.verifier_match?,
1463        native_tool_call_count: comparison.native_tool_call_count?,
1464        text_tool_call_count: comparison.text_tool_call_count?,
1465        native_rejected_tool_call_count: comparison.native_rejected_tool_call_count?,
1466        text_rejected_tool_call_count: comparison.text_rejected_tool_call_count?,
1467        native_evidence_path: comparison.native_evidence_path.clone()?,
1468        text_evidence_path: comparison.text_evidence_path.clone()?,
1469    })
1470}
1471
1472fn suggest_followups(
1473    runs: &[RunReport],
1474    comparisons: &[FormatComparison],
1475) -> Vec<FollowupSuggestion> {
1476    let mut out = Vec::new();
1477    let failed = runs
1478        .iter()
1479        .filter(|run| !run.passed && !run.skipped)
1480        .map(|run| run.run_id.clone())
1481        .collect::<Vec<_>>();
1482    if !failed.is_empty() {
1483        out.push(FollowupSuggestion {
1484            title: "Normalize coding-agent fixture failures across provider presets".to_string(),
1485            body: "One or more fixture/provider/tool-format runs failed. Inspect the run directories and decide whether the gap belongs in provider adapters, preset prompting, transcript handling, or host-tool ergonomics.".to_string(),
1486            labels: vec!["eval".to_string(), "providers".to_string()],
1487            run_ids: failed,
1488        });
1489    }
1490
1491    let rejected = runs
1492        .iter()
1493        .filter(|run| run.rejected_tool_calls > 0)
1494        .map(|run| run.run_id.clone())
1495        .collect::<Vec<_>>();
1496    if !rejected.is_empty() {
1497        out.push(FollowupSuggestion {
1498            title: "Abstract rejected tool-call recovery in agent transcripts".to_string(),
1499            body: "Some runs recovered after rejected tool calls. Add runtime support or preset guidance so harness authors can distinguish recoverable provider/tool-shape noise from user-relevant transcript events.".to_string(),
1500            labels: vec!["agents".to_string(), "transcripts".to_string()],
1501            run_ids: rejected,
1502        });
1503    }
1504
1505    let mismatched = comparisons
1506        .iter()
1507        .filter(|comparison| !comparison.divergence_reasons.is_empty())
1508        .map(|comparison| {
1509            format!(
1510                "{}:{} ({})",
1511                comparison.fixture_id,
1512                selector_label(&comparison.selector),
1513                comparison.divergence_reasons.join("; ")
1514            )
1515        })
1516        .collect::<Vec<_>>();
1517    if !mismatched.is_empty() {
1518        let run_ids = comparisons
1519            .iter()
1520            .filter(|comparison| !comparison.divergence_reasons.is_empty())
1521            .flat_map(|comparison| {
1522                [
1523                    comparison.native_run_id.clone(),
1524                    comparison.text_run_id.clone(),
1525                ]
1526            })
1527            .flatten()
1528            .collect::<Vec<_>>();
1529        out.push(FollowupSuggestion {
1530            title: "Make native/text tool modes behaviorally interchangeable for preset harnesses"
1531                .to_string(),
1532            body: format!(
1533                "Native and text tool modes diverged for: {}. The preset/runtime boundary should hide provider tool-channel differences where possible.",
1534                mismatched.join(", ")
1535            ),
1536            labels: vec!["agents".to_string(), "tools".to_string()],
1537            run_ids,
1538        });
1539    }
1540
1541    let unknown_pricing = runs
1542        .iter()
1543        .filter(|run| {
1544            !run.skipped
1545                && !run.pricing_known
1546                && !matches!(run.selector.provider.as_str(), "mock" | "fake")
1547                && !selector_is_local(&run.selector)
1548        })
1549        .map(|run| run.run_id.clone())
1550        .collect::<Vec<_>>();
1551    if !unknown_pricing.is_empty() {
1552        out.push(FollowupSuggestion {
1553            title: "Fill provider pricing metadata for benchmarked models".to_string(),
1554            body: "At least one live provider/model produced usage metrics but had no pricing entry, which weakens cost comparisons in eval reports.".to_string(),
1555            labels: vec!["providers".to_string(), "docs".to_string()],
1556            run_ids: unknown_pricing,
1557        });
1558    }
1559    out
1560}
1561
1562fn write_json_artifacts(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1563    write_json_pretty(&output_dir.join("summary.json"), summary)?;
1564    write_jsonl(&output_dir.join("per_run.jsonl"), &summary.runs)?;
1565    let summary_value = serde_json::to_value(summary).map_err(|error| error.to_string())?;
1566    let readiness = local_readiness::report_from_summary_json(
1567        &summary_value,
1568        output_dir.display().to_string(),
1569    )?;
1570    write_json_pretty(&output_dir.join("local_readiness.json"), &readiness)?;
1571    let generated_at = RealClock::new()
1572        .now_utc()
1573        .format(&time::format_description::well_known::Rfc3339)
1574        .map_err(|error| format!("failed to format parity overlay timestamp: {error}"))?;
1575    let parity_dir = output_dir.join(TOOL_MODE_PARITY_DIRECTORY);
1576    let parity_reports = tool_mode_parity::build_fixture_reports(
1577        &summary
1578            .comparisons
1579            .iter()
1580            .filter_map(parity_fixture_input)
1581            .collect::<Vec<_>>(),
1582    );
1583    for report in &parity_reports {
1584        let path = parity_dir
1585            .join(sanitize_id(&format!(
1586                "{}__{}:{}",
1587                report.fixture_id, report.provider, report.model
1588            )))
1589            .join("parity.json");
1590        tool_mode_parity::write_fixture_report(&path, report)?;
1591    }
1592    let overlay = tool_mode_parity::build_overlay(
1593        &summary.parity_by_pair,
1594        &generated_at,
1595        TOOL_MODE_PARITY_FIXTURE_SUITE,
1596        output_dir,
1597    );
1598    tool_mode_parity::write_overlay(
1599        &output_dir.join(TOOL_MODE_PARITY_OVERLAY_FILENAME),
1600        &overlay,
1601    )?;
1602    Ok(())
1603}
1604
1605fn announce_output_paths(output_dir: &Path) {
1606    eprintln!(
1607        "wrote {}, {}, {}, {}, {}, {}, and {}",
1608        output_dir.join("summary.json").display(),
1609        output_dir.join("per_run.jsonl").display(),
1610        output_dir.join("local_readiness.json").display(),
1611        output_dir.join(TOOL_MODE_PARITY_DIRECTORY).display(),
1612        output_dir.join(TOOL_MODE_PARITY_OVERLAY_FILENAME).display(),
1613        output_dir.join("summary.md").display(),
1614        output_dir.join("followups.md").display()
1615    );
1616}
1617
1618// ─── Legacy direct-render path (gated by HARN_CLI_IMPL=rust) ────────────
1619
1620fn write_markdown_artifacts_legacy(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1621    fs::write(output_dir.join("summary.md"), render_markdown(summary))
1622        .map_err(|error| format!("failed to write summary.md: {error}"))?;
1623    fs::write(output_dir.join("followups.md"), render_followups(summary))
1624        .map_err(|error| format!("failed to write followups.md: {error}"))?;
1625    Ok(())
1626}
1627
1628fn print_summary_legacy(summary: &EvalSummary) {
1629    println!(
1630        "coding-agent eval: {}/{} passed, {} skipped, total_cost_usd={:.6}",
1631        summary.passed_runs, summary.total_runs, summary.skipped_runs, summary.total_cost_usd
1632    );
1633}
1634
1635fn print_json_legacy(summary: &EvalSummary) {
1636    match serde_json::to_string_pretty(summary) {
1637        Ok(payload) => println!("{payload}"),
1638        Err(error) => eprintln!("warning: failed to render summary JSON: {error}"),
1639    }
1640}
1641
1642// ─── Dispatch (.harn) render path ────────────────────────────────────────
1643
1644async fn write_markdown_artifacts_dispatch(
1645    output_dir: &Path,
1646    summary: &EvalSummary,
1647) -> Result<(), i32> {
1648    let markdown = render_via_dispatch(summary, "markdown").await?;
1649    if let Err(error) = fs::write(output_dir.join("summary.md"), markdown) {
1650        eprintln!("error: failed to write summary.md: {error}");
1651        return Err(1);
1652    }
1653    let followups = render_via_dispatch(summary, "followups").await?;
1654    if let Err(error) = fs::write(output_dir.join("followups.md"), followups) {
1655        eprintln!("error: failed to write followups.md: {error}");
1656        return Err(1);
1657    }
1658    Ok(())
1659}
1660
1661async fn print_summary_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1662    let payload = render_via_dispatch(summary, "summary").await?;
1663    print!("{payload}");
1664    // The script emits exactly the legacy summary line (no trailing
1665    // newline); add one to match the legacy `println!` semantics.
1666    if !payload.ends_with('\n') {
1667        println!();
1668    }
1669    Ok(())
1670}
1671
1672async fn print_json_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1673    let payload = render_via_dispatch(summary, "json").await?;
1674    print!("{payload}");
1675    if !payload.ends_with('\n') {
1676        println!();
1677    }
1678    Ok(())
1679}
1680
1681/// Dispatch to the embedded `cli/eval/coding_agent.harn` script for one
1682/// of the four rendering modes (markdown / followups / summary / json).
1683/// Returns the captured stdout on success, or a propagated exit code
1684/// on failure.
1685///
1686/// **Concurrency.** Held under [`DISPATCH_RENDER_LOCK`] so concurrent
1687/// in-process callers don't race on the global env vars the Rust shim
1688/// sets to hand the report to the script. See the lock's docstring
1689/// for the trade-off rationale.
1690async fn render_via_dispatch(summary: &EvalSummary, mode: &str) -> Result<String, i32> {
1691    let summary_json = match serde_json::to_string(summary) {
1692        Ok(json) => json,
1693        Err(error) => {
1694            eprintln!("error: failed to serialise EvalSummary for dispatch: {error}");
1695            return Err(1);
1696        }
1697    };
1698    let _guard = DISPATCH_RENDER_LOCK.lock().await;
1699    let _summary = ScopedEnvVar::set(CODING_AGENT_SUMMARY_ENV, &summary_json);
1700    let _mode = ScopedEnvVar::set(CODING_AGENT_MODE_ENV, mode);
1701
1702    let outcome = dispatch::run_embedded_script("eval/coding_agent", Vec::new(), false).await;
1703    if !outcome.stderr.is_empty() {
1704        let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
1705    }
1706    if outcome.exit_code != 0 {
1707        return Err(outcome.exit_code);
1708    }
1709    Ok(outcome.stdout)
1710}
1711
1712fn write_json_pretty<T: Serialize>(path: &Path, value: &T) -> Result<(), String> {
1713    let body = serde_json::to_string_pretty(value).map_err(|error| error.to_string())?;
1714    fs::write(path, format!("{body}\n")).map_err(|error| error.to_string())
1715}
1716
1717fn write_jsonl<T: Serialize>(path: &Path, items: &[T]) -> Result<(), String> {
1718    let mut body = String::new();
1719    for item in items {
1720        let line = serde_json::to_string(item).map_err(|error| error.to_string())?;
1721        body.push_str(&line);
1722        body.push('\n');
1723    }
1724    fs::write(path, body).map_err(|error| error.to_string())
1725}
1726
1727fn render_markdown(summary: &EvalSummary) -> String {
1728    let mut out = String::new();
1729    out.push_str("# Coding Agent Harness Quality Suite\n\n");
1730    out.push_str(&format!(
1731        "- fixtures: `{}`\n- passed: {}/{}\n- skipped: {}\n- total_cost_usd: {:.6}\n\n",
1732        summary.fixture_ids.join("`, `"),
1733        summary.passed_runs,
1734        summary.total_runs,
1735        summary.skipped_runs,
1736        summary.total_cost_usd
1737    ));
1738    render_rollup_table(&mut out, "By Fixture", &summary.rollups.by_fixture);
1739    render_rollup_table(&mut out, "By Provider", &summary.rollups.by_provider);
1740    render_rollup_table(&mut out, "By Model", &summary.rollups.by_model);
1741    render_rollup_table(&mut out, "By Tool Format", &summary.rollups.by_tool_format);
1742    render_rollup_table(
1743        &mut out,
1744        "By Tool Sequence",
1745        &summary.rollups.by_tool_sequence,
1746    );
1747
1748    out.push_str("\n## Runs\n\n");
1749    out.push_str("| fixture | run | provider | model | tool format | fixture sequence | tool calls | status | iterations | tokens | cost | transcript | output |\n");
1750    out.push_str("|---|---|---|---|---|---|---|---|---:|---:|---:|---|---|\n");
1751    for run in &summary.runs {
1752        let tool_sequence = if run.tool_sequence.is_empty() {
1753            "-".to_string()
1754        } else {
1755            run.tool_sequence.join(", ").replace('|', "\\|")
1756        };
1757        out.push_str(&format!(
1758            "| `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {:.6} | {} | `{}` |\n",
1759            run.fixture_id,
1760            run.run_id,
1761            run.selector.provider,
1762            run.selector.model.replace('|', "\\|"),
1763            run.tool_format,
1764            run.fixture_tool_sequence,
1765            tool_sequence,
1766            run.status,
1767            run.iterations,
1768            run.input_tokens + run.output_tokens,
1769            run.cost_usd,
1770            markdown_link(
1771                &run.transcript_event_count.to_string(),
1772                &run.transcript_events_path
1773            ),
1774            run.output_dir
1775        ));
1776    }
1777    if let Some(comparison) = &summary.baseline_comparison {
1778        out.push_str("\n## Baseline Comparison\n\n");
1779        out.push_str(&format!(
1780            "Compared against `{}`{}.\n\n",
1781            comparison.baseline_path,
1782            if comparison.baseline_label.is_empty() {
1783                String::new()
1784            } else {
1785                format!(" (label: `{}`)", comparison.baseline_label)
1786            },
1787        ));
1788        out.push_str(&format!(
1789            "- regressions: **{}** (baseline passed, this cell failed)\n- recoveries: **{}** (baseline failed, this cell passed)\n- net lift: **{:+.1}pp**\n\n",
1790            comparison.regressions_count,
1791            comparison.recoveries_count,
1792            comparison.net_lift_pp,
1793        ));
1794        if !comparison.regressions.is_empty() {
1795            out.push_str("### Regressions\n\n");
1796            for delta in &comparison.regressions {
1797                out.push_str(&format!(
1798                    "- `{}`: `{}` → `{}`\n",
1799                    delta.fixture_id, delta.baseline_status, delta.cell_status,
1800                ));
1801            }
1802            out.push('\n');
1803        }
1804        if !comparison.recoveries.is_empty() {
1805            out.push_str("### Recoveries\n\n");
1806            for delta in &comparison.recoveries {
1807                out.push_str(&format!(
1808                    "- `{}`: `{}` → `{}`\n",
1809                    delta.fixture_id, delta.baseline_status, delta.cell_status,
1810                ));
1811            }
1812            out.push('\n');
1813        }
1814    }
1815    if !summary.comparisons.is_empty() {
1816        out.push_str("\n## Native/Text Comparison\n\n");
1817        out.push_str("| fixture | selector | native | text | equivalent | verifier | tools | rejected delta | token delta | iteration delta | evidence |\n");
1818        out.push_str("|---|---|---|---|---|---|---|---:|---:|---:|---|\n");
1819        for comparison in &summary.comparisons {
1820            out.push_str(&format!(
1821                "| `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
1822                comparison.fixture_id,
1823                selector_label(&comparison.selector),
1824                comparison
1825                    .native_status
1826                    .clone()
1827                    .unwrap_or_else(|| "-".to_string()),
1828                comparison
1829                    .text_status
1830                    .clone()
1831                    .unwrap_or_else(|| "-".to_string()),
1832                optional_bool_mark(comparison.equivalent),
1833                optional_bool_mark(comparison.verifier_match),
1834                optional_bool_mark(comparison.tool_sequence_match),
1835                comparison
1836                    .rejected_tool_call_delta_text_minus_native
1837                    .map(|v| v.to_string())
1838                    .unwrap_or_else(|| "-".to_string()),
1839                comparison
1840                    .token_delta_text_minus_native
1841                    .map(|v| v.to_string())
1842                    .unwrap_or_else(|| "-".to_string()),
1843                comparison
1844                    .iteration_delta_text_minus_native
1845                    .map(|v| v.to_string())
1846                    .unwrap_or_else(|| "-".to_string()),
1847                comparison_evidence_links(comparison)
1848            ));
1849        }
1850    }
1851    if !summary.parity_by_pair.is_empty() {
1852        out.push_str("\n## Parity report — native vs text\n\n");
1853        out.push_str("| selector | sample | native pass | text pass | agreement | verifier divergence | native_only | text_only | both_pass | both_fail |\n");
1854        out.push_str("|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|\n");
1855        for pair in &summary.parity_by_pair {
1856            out.push_str(&format!(
1857                "| `{}` | {} | {:.1}% | {:.1}% | {:.1}% | {:.1}% | {} | {} | {} | {} |\n",
1858                selector_label(&ModelSelector {
1859                    selector: format!("{}:{}", pair.provider, pair.model),
1860                    provider: pair.provider.clone(),
1861                    model: pair.model.clone(),
1862                }),
1863                pair.sample_size,
1864                pair.native.pass_rate * 100.0,
1865                pair.text.pass_rate * 100.0,
1866                pair.agreement_rate * 100.0,
1867                pair.verifier_divergence_rate * 100.0,
1868                pair.divergence_counts.native_only_pass,
1869                pair.divergence_counts.text_only_pass,
1870                pair.divergence_counts.both_pass,
1871                pair.divergence_counts.both_fail,
1872            ));
1873        }
1874    }
1875    let diverged = summary
1876        .comparisons
1877        .iter()
1878        .filter(|comparison| !comparison.divergence_reasons.is_empty())
1879        .collect::<Vec<_>>();
1880    if !diverged.is_empty() {
1881        out.push_str("\n## Native/Text Divergence Evidence\n\n");
1882        for comparison in diverged {
1883            out.push_str(&format!(
1884                "- `{}` `{}`: {}\n",
1885                comparison.fixture_id,
1886                selector_label(&comparison.selector),
1887                comparison.divergence_reasons.join("; ")
1888            ));
1889            if !comparison.evidence_paths.is_empty() {
1890                out.push_str(&format!(
1891                    "  Evidence: {}\n",
1892                    comparison_evidence_links(comparison)
1893                ));
1894            }
1895        }
1896    }
1897    out
1898}
1899
1900fn render_rollup_table(out: &mut String, title: &str, rollups: &[RollupReport]) {
1901    out.push_str(&format!("## {title}\n\n"));
1902    out.push_str("| key | passed | failed | skipped | total | cost |\n");
1903    out.push_str("|---|---:|---:|---:|---:|---:|\n");
1904    for rollup in rollups {
1905        out.push_str(&format!(
1906            "| `{}` | {} | {} | {} | {} | {:.6} |\n",
1907            rollup.key.replace('|', "\\|"),
1908            rollup.passed_runs,
1909            rollup.failed_runs,
1910            rollup.skipped_runs,
1911            rollup.total_runs,
1912            rollup.total_cost_usd
1913        ));
1914    }
1915    out.push('\n');
1916}
1917
1918fn render_followups(summary: &EvalSummary) -> String {
1919    let mut out = String::new();
1920    out.push_str("# Follow-up Issue Candidates\n\n");
1921    if summary.followups.is_empty() {
1922        out.push_str("No follow-up issue candidates were generated from this run.\n");
1923        return out;
1924    }
1925    for followup in &summary.followups {
1926        out.push_str(&format!("## {}\n\n{}\n\n", followup.title, followup.body));
1927        if !followup.run_ids.is_empty() {
1928            out.push_str(&format!("- run_ids: `{}`\n", followup.run_ids.join("`, `")));
1929        }
1930        if !followup.labels.is_empty() {
1931            out.push_str(&format!("- labels: `{}`\n", followup.labels.join("`, `")));
1932        }
1933        out.push('\n');
1934    }
1935    out
1936}
1937
1938fn read_run_summary(run_dir: &Path) -> Option<JsonValue> {
1939    let raw = fs::read_to_string(run_dir.join("summary.json")).ok()?;
1940    serde_json::from_str(&raw).ok()
1941}
1942
1943fn parse_last_json_line(stdout: &str) -> Option<JsonValue> {
1944    stdout
1945        .lines()
1946        .rev()
1947        .map(str::trim)
1948        .filter(|line| !line.is_empty())
1949        .find_map(|line| serde_json::from_str::<JsonValue>(line).ok())
1950}
1951
1952fn string_array(value: Option<&JsonValue>) -> Vec<String> {
1953    value
1954        .and_then(JsonValue::as_array)
1955        .map(|values| {
1956            values
1957                .iter()
1958                .filter_map(JsonValue::as_str)
1959                .map(str::to_string)
1960                .collect()
1961        })
1962        .unwrap_or_default()
1963}
1964
1965fn non_empty_string_array(value: Option<&JsonValue>) -> Option<Vec<String>> {
1966    let values = string_array(value);
1967    (!values.is_empty()).then_some(values)
1968}
1969
1970fn tool_call_sequence(value: Option<&JsonValue>) -> Option<Vec<String>> {
1971    let calls = value.and_then(JsonValue::as_array)?;
1972    let mut sequence = Vec::new();
1973    for call in calls {
1974        if let Some(name) = call
1975            .get("name")
1976            .or_else(|| call.get("tool_name"))
1977            .and_then(JsonValue::as_str)
1978        {
1979            sequence.push(name.to_string());
1980        }
1981    }
1982    (!sequence.is_empty()).then_some(sequence)
1983}
1984
/// Table-cell text for a tri-state flag: `"yes"`, `"no"`, or `"-"` when the
/// value is unknown.
fn optional_bool_mark(value: Option<bool>) -> &'static str {
    value.map_or("-", |flag| if flag { "yes" } else { "no" })
}
1992
1993fn comparison_evidence_links(comparison: &FormatComparison) -> String {
1994    let mut links = Vec::new();
1995    if let Some(native) = comparison.native_evidence_path.as_deref() {
1996        links.push(markdown_link("native", native));
1997    }
1998    if let Some(text) = comparison.text_evidence_path.as_deref() {
1999        links.push(markdown_link("text", text));
2000    }
2001    if links.is_empty() {
2002        "-".to_string()
2003    } else {
2004        links.join("<br>")
2005    }
2006}
2007
/// Formats `[label](target)` for use inside a markdown table cell.
///
/// `|` in the label is backslash-escaped. In the target, spaces and
/// parentheses are percent-encoded (`%20`, `%28`, `%29`); every other
/// character is passed through unchanged.
fn markdown_link(label: &str, target: &str) -> String {
    let mut encoded = String::with_capacity(target.len());
    for ch in target.chars() {
        match ch {
            ' ' => encoded.push_str("%20"),
            '(' => encoded.push_str("%28"),
            ')' => encoded.push_str("%29"),
            other => encoded.push(other),
        }
    }
    let escaped_label = label.replace('|', "\\|");
    format!("[{}]({})", escaped_label, encoded)
}
2018
/// Leaves `path` as an existing, empty directory.
///
/// Anything already at `path` is removed with `remove_dir_all`, then the
/// directory (and any missing parents) is created. I/O errors are returned
/// as their display strings.
fn reset_dir(path: &Path) -> Result<(), String> {
    let describe = |error: std::io::Error| error.to_string();
    if path.exists() {
        fs::remove_dir_all(path).map_err(describe)?;
    }
    fs::create_dir_all(path).map_err(describe)
}
2025
2026fn run_id_for(fixture: FixtureDefinition, selector: &ModelSelector, tool_format: &str) -> String {
2027    sanitize_id(&format!(
2028        "{}__{}__{}",
2029        fixture.id,
2030        selector_label(selector),
2031        tool_format
2032    ))
2033}
2034
/// Reduces `raw` to ASCII alphanumerics, `-` and `_`.
///
/// Every other character (including non-ASCII letters) is replaced by `_`,
/// then leading and trailing underscores are stripped. The result can be
/// empty.
fn sanitize_id(raw: &str) -> String {
    let replaced: String = raw
        .chars()
        .map(|ch| {
            if matches!(ch, '-' | '_') || ch.is_ascii_alphanumeric() {
                ch
            } else {
                '_'
            }
        })
        .collect();
    replaced.trim_matches('_').to_string()
}
2046
/// Relative output directory used when none is supplied:
/// `.harn-runs/coding-agent-bench/latest`.
fn default_output_dir() -> PathBuf {
    [".harn-runs", "coding-agent-bench", "latest"]
        .iter()
        .collect()
}
2052
/// Returns a trimmed excerpt of `text`, capped at 4000 characters.
///
/// * `None` when `text` is empty or whitespace-only.
/// * The trimmed text unchanged when it is at most 4000 characters long.
/// * Otherwise the first 4000 characters followed by `...`.
///
/// The cap is measured in `char`s for both the "does it fit" decision and
/// the cut itself. Measuring the former in bytes would append `...` to
/// multi-byte text that was never actually shortened.
fn excerpt(text: &str) -> Option<String> {
    const MAX_CHARS: usize = 4000;

    let trimmed = text.trim();
    if trimmed.is_empty() {
        return None;
    }
    // The byte offset of character number MAX_CHARS (0-based) exists only
    // when the text is longer than the cap; it is always a char boundary.
    match trimmed.char_indices().nth(MAX_CHARS) {
        None => Some(trimmed.to_string()),
        Some((cut, _)) => {
            let mut truncated = String::with_capacity(cut + 3);
            truncated.push_str(&trimmed[..cut]);
            truncated.push_str("...");
            Some(truncated)
        }
    }
}
2069
2070fn load_env_files(paths: &[PathBuf]) -> Result<(EnvOverlay, Vec<LoadedEnvKey>), String> {
2071    let mut previous = Vec::new();
2072    let mut loaded = Vec::new();
2073    let mut touched = BTreeSet::new();
2074    for path in paths {
2075        let path = expand_home(path);
2076        let raw = fs::read_to_string(&path)
2077            .map_err(|error| format!("failed to read env file {}: {error}", path.display()))?;
2078        for (line_no, line) in raw.lines().enumerate() {
2079            let Some((key, value)) = parse_env_line(line).map_err(|error| {
2080                format!("{}:{}: {error}", path.display(), line_no.saturating_add(1))
2081            })?
2082            else {
2083                continue;
2084            };
2085            if touched.insert(key.clone()) {
2086                previous.push((OsString::from(&key), std::env::var_os(&key)));
2087            }
2088            std::env::set_var(&key, value);
2089            loaded.push(LoadedEnvKey {
2090                key,
2091                source: path.display().to_string(),
2092            });
2093        }
2094    }
2095    Ok((EnvOverlay { previous }, loaded))
2096}
2097
2098fn parse_env_line(line: &str) -> Result<Option<(String, String)>, String> {
2099    let trimmed = line.trim();
2100    if trimmed.is_empty() || trimmed.starts_with('#') {
2101        return Ok(None);
2102    }
2103    let trimmed = trimmed.strip_prefix("export ").unwrap_or(trimmed).trim();
2104    let Some((key, value)) = trimmed.split_once('=') else {
2105        return Err("expected KEY=VALUE".to_string());
2106    };
2107    let key = key.trim();
2108    if key.is_empty() {
2109        return Err("empty key".to_string());
2110    }
2111    if !key
2112        .chars()
2113        .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
2114    {
2115        return Err(format!("invalid key `{key}`"));
2116    }
2117    Ok(Some((key.to_string(), unquote_env_value(value.trim()))))
2118}
2119
/// Strips one pair of matching surrounding quotes (`"…"` or `'…'`) from
/// `value`.
///
/// The value is returned unchanged when it is not wrapped in a matching
/// pair, which includes a lone quote character and mismatched quotes. No
/// escape sequences inside the quotes are interpreted.
fn unquote_env_value(value: &str) -> String {
    let inner = ['"', '\'']
        .iter()
        .find_map(|&quote| value.strip_prefix(quote)?.strip_suffix(quote));
    inner.unwrap_or(value).to_string()
}
2131
/// Expands a leading `~` using the `HOME` environment variable.
///
/// Only the exact path `~` and paths starting with `~/` are expanded;
/// forms such as `~user/...` are returned as-is. When `HOME` is unset the
/// path is also returned unchanged.
fn expand_home(path: &Path) -> PathBuf {
    let raw = path.to_string_lossy();
    // Looked up lazily so paths without a tilde never touch the environment.
    let home_dir = || std::env::var_os("HOME").map(PathBuf::from);
    let expanded = if raw == "~" {
        home_dir()
    } else if let Some(rest) = raw.strip_prefix("~/") {
        home_dir().map(|home| home.join(rest))
    } else {
        None
    };
    expanded.unwrap_or_else(|| path.to_path_buf())
}
2146
2147#[cfg(test)]
2148#[path = "eval_coding_agent_tests.rs"]
2149mod tests;