Skip to main content

harn_cli/commands/
eval_coding_agent.rs

1//! `harn eval coding-agent` - empirical preset/provider benchmark for a
2//! small coding-agent fixture suite.
3//!
4//! ## Dispatch boundary
5//!
6//! The **matrix execution pipeline** (fixture resolution, model
7//! discovery, per-cell `execute_run` invocation, Ollama snapshot/
8//! cleanup, scoring, rollups, native/text comparisons, follow-up
9//! generation, baseline diff) stays in Rust. Every cell drives the
10//! embedded `coding_agent_suite.harn` driver through `execute_run`,
11//! which itself reaches into VM internals (`commands::run`,
12//! `harn_vm::llm`, `commands::local::runtime`) that are not exposed as
13//! script capabilities.
14//!
15//! The **rendering layer** (the `summary.md` body, the `followups.md`
16//! body, the one-line human stdout summary, the `--json` pretty form)
17//! is delegated to
18//! `crates/harn-stdlib/src/stdlib/cli/eval/coding_agent.harn`. The
19//! Rust shim pre-serialises the assembled `EvalSummary` to JSON,
20//! forwards it via [`CODING_AGENT_SUMMARY_ENV`], dispatches four
21//! times (markdown for `summary.md`, followups for `followups.md`,
22//! then either the summary line or the `--json` pretty form for
23//! stdout), and writes the captured payloads to disk / real stdout.
24//!
25//! The on-disk JSON artifacts (`summary.json`, `per_run.jsonl`,
26//! `local_readiness.json`) stay on the serde-driven Rust path because
27//! Harn's `json_stringify_pretty` sorts dict keys alphabetically and
28//! the on-disk format is consumed by the experiment driver in
29//! `experiments/step-judge/run.sh`, the local-readiness regression
30//! check, and hosted ingestion — all of which depend on the serde
31//! struct-field byte order.
32//!
33//! `HARN_CLI_IMPL=rust` keeps the direct-render path available for
34//! parity snapshot coverage.
35
36use std::collections::{BTreeMap, BTreeSet, HashSet};
37use std::ffi::OsString;
38use std::fs;
39use std::io::Write as _;
40use std::path::{Path, PathBuf};
41
42use harn_vm::clock::{Clock, RealClock};
43use serde::Serialize;
44use serde_json::Value as JsonValue;
45
46use crate::cli::EvalCodingAgentArgs;
47use crate::commands::eval_coding_agent_preset::{
48    resolve_step_judge_json, resolve_structural_validator_json,
49};
50use crate::commands::eval_model_selector::{
51    resolve_selector, selector_is_local, selector_label, ModelSelector,
52};
53use crate::commands::local::runtime::{
54    local_provider_ids, ollama_unload_model, snapshot_provider, LocalProviderSnapshot,
55};
56use crate::commands::local_readiness;
57use crate::commands::run::{
58    execute_run_with_sandbox_options, CliLlmMockMode, RunProfileOptions, RunSandboxOptions,
59};
60use crate::commands::tool_mode_parity::{
61    self, ToolModeParityFixtureInput, ToolModeParityPairSummary, TOOL_MODE_PARITY_DIRECTORY,
62    TOOL_MODE_PARITY_FIXTURE_SUITE, TOOL_MODE_PARITY_OVERLAY_FILENAME,
63};
64use crate::dispatch;
65use crate::env_guard::ScopedEnvVar;
66
/// Env var the embedded `cli/eval/coding_agent` script reads to pick
/// up the pre-serialised [`EvalSummary`]. The Rust shim does all the
/// matrix execution and scoring and hands the script the assembled
/// summary so it only has to format it.
const CODING_AGENT_SUMMARY_ENV: &str = "HARN_EVAL_CODING_AGENT_SUMMARY_JSON";

/// Env var the script reads to pick the rendering mode — one of
/// `"markdown"` (summary.md body), `"followups"` (followups.md body),
/// `"summary"` (one-line stdout summary), or `"json"` (--json pretty
/// form). Defaulted to `"summary"` if unset so the script stays robust
/// against future Rust-side bugs.
const CODING_AGENT_MODE_ENV: &str = "HARN_EVAL_CODING_AGENT_MODE";

/// Serialises the dispatch-render path so concurrent in-process
/// callers (the existing `eval_coding_agent_cli` integration test plus
/// any future fanout caller) don't race on the global env vars the
/// Rust shim sets to hand the report off to the .harn script. The CLI
/// binary itself is single-call, so this mutex is uncontended in
/// production; in tests it serialises the dispatch window only —
/// matrix execution still parallelises freely.
///
/// Matches the other eval render shims so the cross-script env-var handoff
/// stays consistent across the eval cluster.
static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());

/// Source text of the benchmark driver script, embedded at compile time.
/// `run_matrix_entry` writes it into each run directory as
/// `coding_agent_suite.harn` before executing the cell.
const CODING_AGENT_SUITE_HARN: &str = include_str!("../../assets/evals/coding_agent_suite.harn");
/// Prefix of the stderr line that `tool_format_override_warning_line`
/// searches for, so the warning from a cell can be echoed on the CLI's own
/// stderr.
const TOOL_FORMAT_OVERRIDE_WARNING_PREFIX: &str = "warning: tool_format override:";
94
/// Static description of one built-in benchmark fixture.
///
/// All fields are `'static` strings so the whole catalog can live in a
/// `static` table and definitions can be copied freely.
#[derive(Debug, Clone, Copy)]
struct FixtureDefinition {
    /// Stable identifier; forwarded to the driver script as `--fixture` and
    /// recorded as `fixture_id` in run reports.
    id: &'static str,
    /// Human-readable title, recorded as `fixture_name` in run reports.
    name: &'static str,
    /// Tool-usage class of the fixture. The built-in table uses
    /// `"multi-tool"`, `"one-tool"` and `"no-tool"`.
    tool_sequence: &'static str,
    /// One-sentence summary of what the fixture asks the agent to do.
    description: &'static str,
}
102
/// Catalog of the built-in coding-agent fixtures.
///
/// `resolve_fixtures` returns this whole table, in declaration order, when
/// the caller selects `all`, so the order here is observable in the run
/// matrix.
static FIXTURE_DEFINITIONS: &[FixtureDefinition] = &[
    FixtureDefinition {
        id: "python-add",
        name: "Python add repair",
        tool_sequence: "multi-tool",
        description: "One-file Python bug fix verified by unittest output.",
    },
    FixtureDefinition {
        id: "cli-help-flag",
        name: "CLI help flag",
        tool_sequence: "multi-tool",
        description: "Add a tiny CLI flag, update help-facing docs, and verify behavior.",
    },
    FixtureDefinition {
        id: "test-output-first",
        name: "Test-output-first repair",
        tool_sequence: "multi-tool",
        description: "Run a failing test first, then edit the implementation and re-run it.",
    },
    FixtureDefinition {
        id: "docs-symbol-rename",
        name: "Docs symbol rename",
        tool_sequence: "multi-tool",
        description:
            "Update docs and an example after a symbol rename without touching implementation.",
    },
    // The only fixture classified as a single-tool sequence.
    FixtureDefinition {
        id: "read-only-audit",
        name: "Read-only audit",
        tool_sequence: "one-tool",
        description: "Inspect a file and report that no edits are needed.",
    },
    // The only fixture that expects no tool calls at all.
    FixtureDefinition {
        id: "no-tool-diagnosis",
        name: "No-tool diagnosis",
        tool_sequence: "no-tool",
        description: "Answer from prompt-only context without any tools.",
    },
];
142
/// One entry of [`EvalSummary::env_keys_loaded`], as returned by
/// `load_env_files` at the start of `run`.
#[derive(Debug, Clone, Serialize)]
struct LoadedEnvKey {
    // NOTE(review): presumably the environment variable name (not its
    // value) — confirm against `load_env_files`.
    key: String,
    // NOTE(review): presumably the env file the key came from — confirm.
    source: String,
}
148
/// Records the prior state of environment variables so it can be put back
/// when the overlay goes out of scope.
///
/// Each entry is `(variable name, value before the overlay)`; `None` means
/// the variable was not set beforehand.
#[derive(Debug)]
struct EnvOverlay {
    previous: Vec<(OsString, Option<OsString>)>,
}

impl Drop for EnvOverlay {
    /// Undoes the overlay: walks the recorded entries newest-first so that,
    /// when the same variable was recorded more than once, the oldest
    /// recorded value is the one left in place.
    fn drop(&mut self) {
        for (name, saved) in self.previous.iter().rev() {
            match saved {
                Some(original) => std::env::set_var(name, original),
                None => std::env::remove_var(name),
            }
        }
    }
}
165
/// Result of one matrix cell (fixture × model selector × tool format).
///
/// Serialized with serde; the module docs note that downstream consumers
/// depend on the struct-field order, so fields must not be reordered.
#[derive(Debug, Clone, Serialize)]
struct RunReport {
    run_id: String,
    fixture_id: String,
    fixture_name: String,
    fixture_tool_sequence: String,
    selector: ModelSelector,
    tool_format: String,
    /// `"passed"`, `"failed"`, `"skipped"`, `"infra_error"`, or a
    /// harness-reported status string when the run exited non-zero.
    status: String,
    /// True only when the harness summary reports `passed` and harn exited
    /// with code 0.
    passed: bool,
    skipped: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    skipped_reason: Option<String>,
    output_dir: String,
    /// `<run dir>/transcript_events.jsonl`; filled in even when the run
    /// never started.
    transcript_events_path: String,
    workspace_root: Option<String>,
    /// Time measured by the CLI around the driver execution; 0 for cells
    /// that were skipped or failed before execution.
    elapsed_ms: u64,
    /// Harness-reported `duration_ms`, falling back to `elapsed_ms` when the
    /// summary omits it.
    duration_ms: u64,
    iterations: i64,
    input_tokens: i64,
    output_tokens: i64,
    /// Derived from the per-1k token pricing for the selector; 0.0 when no
    /// pricing is known (see `pricing_known`).
    cost_usd: f64,
    pricing_known: bool,
    /// Length of the summary's `/tools/calls` array.
    tool_calls: usize,
    /// Length of the summary's `/tools/rejected` array.
    rejected_tool_calls: usize,
    tool_sequence: Vec<String>,
    successful_tools: Vec<String>,
    transcript_event_count: usize,
    verification_success: bool,
    harn_exit_code: i32,
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    stderr_excerpt: Option<String>,
    /// Outcome of the local-provider cleanup step; `None` when no cleanup
    /// report was produced for this cell.
    local_cleanup: Option<LocalCleanupReport>,
}
202
/// What happened to a local model after a matrix cell finished, as produced
/// by `LocalRunGuard::cleanup`.
#[derive(Debug, Clone, Serialize)]
struct LocalCleanupReport {
    provider: String,
    model: String,
    /// Whether the model already appeared in the provider snapshot taken
    /// before the run. Always `false` for non-Ollama providers.
    initially_loaded: bool,
    /// One of `"not_applicable"`, `"left_running"`, `"left_preexisting"`,
    /// `"unloaded"` or `"unload_failed"`.
    action: String,
    /// Extra context for the action (e.g. the unload error); omitted from
    /// the JSON when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    detail: Option<String>,
}
212
/// Side-by-side record for the `native` and `text` tool-format runs of one
/// fixture/selector pair; serialized as an element of
/// [`EvalSummary::comparisons`].
///
/// Every per-side field is optional.
// NOTE(review): the code that fills this struct is outside this view;
// presumably a side is `None` when that tool format was not part of the
// matrix — confirm against the comparison builder.
#[derive(Debug, Clone, Serialize)]
struct FormatComparison {
    fixture_id: String,
    selector: ModelSelector,
    native_run_id: Option<String>,
    text_run_id: Option<String>,
    native_evidence_path: Option<String>,
    text_evidence_path: Option<String>,
    native_status: Option<String>,
    text_status: Option<String>,
    native_passed: Option<bool>,
    text_passed: Option<bool>,
    native_tool_call_count: Option<usize>,
    text_tool_call_count: Option<usize>,
    native_rejected_tool_call_count: Option<usize>,
    text_rejected_tool_call_count: Option<usize>,
    verifier_match: Option<bool>,
    tool_sequence_match: Option<bool>,
    // The three deltas below are named "text minus native"; a positive value
    // would therefore mean the text-format run used more.
    rejected_tool_call_delta_text_minus_native: Option<i64>,
    token_delta_text_minus_native: Option<i64>,
    iteration_delta_text_minus_native: Option<i64>,
    equivalent: Option<bool>,
    divergence_reasons: Vec<String>,
    evidence_paths: Vec<String>,
}
238
/// One suggested follow-up item; serialized as an element of
/// [`EvalSummary::followups`].
// NOTE(review): presumably rendered into `followups.md` by the script
// layer described in the module docs — the generator is outside this view.
#[derive(Debug, Clone, Serialize)]
struct FollowupSuggestion {
    title: String,
    body: String,
    labels: Vec<String>,
    /// Identifiers of the runs this suggestion refers to.
    run_ids: Vec<String>,
}
246
/// Owned, serializable description of a fixture; serialized as an element of
/// [`EvalSummary::fixtures`]. Carries the same four fields as
/// `FixtureDefinition`.
#[derive(Debug, Clone, Serialize)]
struct FixtureReport {
    id: String,
    name: String,
    tool_sequence: String,
    description: String,
}
254
/// Aggregate counters for one group of runs; the element type of every
/// vector in [`EvalRollups`].
#[derive(Debug, Clone, Serialize)]
struct RollupReport {
    // NOTE(review): presumably the grouping value (a fixture id, provider,
    // model, tool format or tool sequence, depending on the rollup) —
    // confirm against the rollup builder.
    key: String,
    total_runs: usize,
    passed_runs: usize,
    failed_runs: usize,
    skipped_runs: usize,
    total_cost_usd: f64,
}
264
/// Run aggregates along five dimensions, serialized as
/// [`EvalSummary::rollups`].
#[derive(Debug, Clone, Serialize)]
struct EvalRollups {
    by_fixture: Vec<RollupReport>,
    by_provider: Vec<RollupReport>,
    by_model: Vec<RollupReport>,
    by_tool_format: Vec<RollupReport>,
    by_tool_sequence: Vec<RollupReport>,
}
273
/// Top-level result of one `harn eval coding-agent` invocation.
///
/// Built by `build_summary` in `run`, written through serde by
/// `write_json_artifacts`, and handed to the rendering script. Per the
/// module docs, downstream consumers depend on the serde struct-field
/// order, so fields must not be reordered.
#[derive(Debug, Clone, Serialize)]
struct EvalSummary {
    schema_version: u32,
    fixture_ids: Vec<String>,
    fixtures: Vec<FixtureReport>,
    output_dir: String,
    models: Vec<ModelSelector>,
    tool_formats: Vec<String>,
    env_keys_loaded: Vec<LoadedEnvKey>,
    total_runs: usize,
    passed_runs: usize,
    failed_runs: usize,
    skipped_runs: usize,
    diverged_comparisons: usize,
    total_cost_usd: f64,
    rollups: EvalRollups,
    runs: Vec<RunReport>,
    comparisons: Vec<FormatComparison>,
    parity_by_pair: Vec<ToolModeParityPairSummary>,
    followups: Vec<FollowupSuggestion>,
    /// Step-judge preset applied to all runs in this invocation, if any.
    /// Used by the experiment driver (experiments/step-judge/run.sh) to
    /// group repeat invocations into cells. `run` passes `None` when the
    /// flag is empty or the literal `none`.
    #[serde(skip_serializing_if = "Option::is_none")]
    step_judge_preset: Option<String>,
    /// Free-form label for grouping repeat invocations (e.g.
    /// "replicate-1", "probe-rubric-adversarial"). Empty when unset.
    #[serde(skip_serializing_if = "String::is_empty")]
    run_label: String,
    /// Optional per-fixture diff against a prior run's `summary.json`,
    /// listing regressions (baseline passed, this cell failed) and
    /// recoveries (baseline failed, this cell passed) plus aggregate
    /// counts and a net lift in percentage points. Populated when the
    /// caller passes `--baseline-comparison-against <path>` (harn#2318).
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_comparison: Option<BaselineComparison>,
}
311
/// Per-fixture diff of this invocation against an earlier `summary.json`;
/// produced by `load_baseline_comparison` and stored in
/// [`EvalSummary::baseline_comparison`].
#[derive(Debug, Clone, Serialize, Default)]
struct BaselineComparison {
    /// `output_dir` or `run_label` of the baseline summary, for context.
    baseline_label: String,
    /// Resolved path to the baseline `summary.json` that was diffed against.
    baseline_path: String,
    /// Fixtures that passed in the baseline but failed in this run.
    regressions: Vec<FixtureStatusDelta>,
    /// Fixtures that failed in the baseline but passed in this run.
    recoveries: Vec<FixtureStatusDelta>,
    /// Fixtures that passed in both runs.
    unchanged_passes: Vec<String>,
    /// Fixtures that failed in both runs.
    unchanged_failures: Vec<String>,
    /// Fixtures present in only one of the two runs (skipped from the
    /// diff but listed for visibility).
    missing_in_baseline: Vec<String>,
    missing_in_cell: Vec<String>,
    regressions_count: usize,
    recoveries_count: usize,
    /// `(recoveries_count - regressions_count) / total_fixtures_compared * 100`,
    /// rounded to one decimal place. Negative when the cell regresses more
    /// than it recovers.
    net_lift_pp: f64,
}
335
/// Status of one fixture in the baseline versus the current run; the element
/// type of `BaselineComparison::regressions` and
/// `BaselineComparison::recoveries`.
#[derive(Debug, Clone, Serialize)]
struct FixtureStatusDelta {
    fixture_id: String,
    /// Status string recorded for the fixture in the baseline summary.
    baseline_status: String,
    /// Status string recorded for the fixture in the current run.
    cell_status: String,
}
342
/// State captured before a cell runs against a local provider, consumed by
/// `LocalRunGuard::cleanup` afterwards to decide whether the model should be
/// unloaded.
struct LocalRunGuard {
    /// Selector the cell is running with (cloned from the caller).
    selector: ModelSelector,
    /// Whether cleanup may unload the model. `run_matrix_entry` passes
    /// `false` when `keep_local_after_run` is set.
    stop_after: bool,
    /// Provider state before the run; `None` when taking the snapshot
    /// failed, in which case cleanup produces no report.
    snapshot: Option<LocalProviderSnapshot>,
}
348
/// Everything `report_from_summary` needs besides the harness summary JSON:
/// the identity of the matrix cell plus what the CLI observed while running
/// it.
struct RunSummaryContext {
    run_id: String,
    fixture: FixtureDefinition,
    selector: ModelSelector,
    tool_format: String,
    /// Directory of this cell under the benchmark output directory.
    run_dir: PathBuf,
    /// Time measured by the CLI around the driver execution.
    elapsed_ms: u64,
    /// Exit code of the harn driver run.
    exit_code: i32,
    /// Captured stderr of the driver run; only an excerpt is reported.
    stderr: String,
    local_cleanup: Option<LocalCleanupReport>,
}
360
361pub async fn run(args: EvalCodingAgentArgs) -> i32 {
362    let output_dir = args.output.clone().unwrap_or_else(default_output_dir);
363    if let Err(error) = fs::create_dir_all(&output_dir) {
364        eprintln!("error: failed to create {}: {error}", output_dir.display());
365        return 1;
366    }
367
368    let (_env_guard, env_keys_loaded) = match load_env_files(&args.env_files) {
369        Ok(loaded) => loaded,
370        Err(error) => {
371            eprintln!("error: {error}");
372            return 1;
373        }
374    };
375
376    let fixtures = match resolve_fixtures(&args.fixtures) {
377        Ok(fixtures) => fixtures,
378        Err(error) => {
379            eprintln!("error: {error}");
380            return 2;
381        }
382    };
383    let models = match resolve_models(&args).await {
384        Ok(models) => models,
385        Err(error) => {
386            eprintln!("error: {error}");
387            return 1;
388        }
389    };
390    let tool_formats = match normalize_tool_formats(&args.tool_formats) {
391        Ok(formats) => formats,
392        Err(error) => {
393            eprintln!("error: {error}");
394            return 2;
395        }
396    };
397    let matrix = build_matrix(&fixtures, &models, &tool_formats, args.max_runs);
398    if matrix.is_empty() {
399        eprintln!("error: no coding-agent benchmark runs selected");
400        return 2;
401    }
402
403    let mut reports = Vec::new();
404    let mut had_error = false;
405    for (fixture, selector, tool_format) in matrix {
406        let report = run_matrix_entry(&args, &output_dir, fixture, selector, tool_format).await;
407        if !report.passed && !report.skipped {
408            had_error = true;
409        }
410        if report.skipped && args.fail_on_unauthorized {
411            had_error = true;
412        }
413        eprintln!(
414            "{} {} {}: {}",
415            report.fixture_id,
416            selector_label(&report.selector),
417            report.tool_format,
418            report.status
419        );
420        reports.push(report);
421    }
422
423    let baseline_comparison = match &args.baseline_comparison_against {
424        Some(path) => match load_baseline_comparison(path, &reports) {
425            Ok(comparison) => Some(comparison),
426            Err(error) => {
427                eprintln!("error: --baseline-comparison-against: {error}");
428                return 1;
429            }
430        },
431        None => None,
432    };
433    let summary = build_summary(
434        &output_dir,
435        fixtures,
436        models,
437        tool_formats,
438        env_keys_loaded,
439        reports,
440        args.step_judge
441            .clone()
442            .filter(|s| !s.is_empty() && s != "none"),
443        args.run_label.clone(),
444        baseline_comparison,
445    );
446    // The JSON artifacts (summary.json, per_run.jsonl,
447    // local_readiness.json) always stay on the serde-driven Rust path —
448    // see module docstring for the byte-format rationale. They write
449    // before any rendering so a render failure doesn't leave a partially
450    // written report directory.
451    if let Err(error) = write_json_artifacts(&output_dir, &summary) {
452        eprintln!("error: failed to write benchmark outputs: {error}");
453        return 1;
454    }
455
456    // `HARN_CLI_IMPL=rust` keeps the legacy direct-render path so the
457    // parity-snapshot harness (#2299) can compare both impls until C1
458    // (#2314) deletes this escape hatch.
459    let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
460
461    if use_legacy {
462        if let Err(error) = write_markdown_artifacts_legacy(&output_dir, &summary) {
463            eprintln!("error: {error}");
464            return 1;
465        }
466        announce_output_paths(&output_dir);
467        if args.json {
468            print_json_legacy(&summary);
469        } else {
470            print_summary_legacy(&summary);
471        }
472        return i32::from(had_error);
473    }
474
475    if let Err(code) = write_markdown_artifacts_dispatch(&output_dir, &summary).await {
476        return code;
477    }
478    announce_output_paths(&output_dir);
479    if args.json {
480        if let Err(code) = print_json_dispatch(&summary).await {
481            return code;
482        }
483    } else if let Err(code) = print_summary_dispatch(&summary).await {
484        return code;
485    }
486
487    i32::from(had_error)
488}
489
490async fn run_matrix_entry(
491    args: &EvalCodingAgentArgs,
492    output_dir: &Path,
493    fixture: FixtureDefinition,
494    selector: ModelSelector,
495    tool_format: String,
496) -> RunReport {
497    let run_id = run_id_for(fixture, &selector, &tool_format);
498    let run_dir = output_dir.join(&run_id);
499    if let Err(error) = reset_dir(&run_dir) {
500        return error_report(
501            run_id,
502            fixture,
503            selector,
504            tool_format,
505            run_dir,
506            format!("failed to prepare run directory: {error}"),
507        );
508    }
509
510    if !provider_available(&selector) {
511        let reason = format!(
512            "provider `{}` has no configured credentials",
513            selector.provider
514        );
515        return skipped_report(run_id, fixture, selector, tool_format, run_dir, reason);
516    }
517
518    let script_path = run_dir.join("coding_agent_suite.harn");
519    if let Err(error) = fs::write(&script_path, CODING_AGENT_SUITE_HARN) {
520        return error_report(
521            run_id,
522            fixture,
523            selector,
524            tool_format,
525            run_dir,
526            format!("failed to write benchmark harness: {error}"),
527        );
528    }
529
530    let local_guard = LocalRunGuard::before(&selector, !args.keep_local_after_run).await;
531    let argv = script_argv(args, fixture, &selector, &tool_format, &run_dir);
532    let clock = RealClock::new();
533    let started_ms = clock.monotonic_ms();
534    let outcome = execute_run_with_sandbox_options(
535        &script_path.to_string_lossy(),
536        false,
537        HashSet::new(),
538        argv,
539        Vec::new(),
540        CliLlmMockMode::Off,
541        None,
542        RunProfileOptions::default(),
543        RunSandboxOptions::default().with_workspace_root(run_dir.clone()),
544    )
545    .await;
546    if let Some(line) = tool_format_override_warning_line(&outcome.stderr) {
547        eprintln!("{line}");
548    }
549    let elapsed_ms = clock
550        .monotonic_ms()
551        .saturating_sub(started_ms)
552        .try_into()
553        .unwrap_or(0);
554    let local_cleanup = if let Some(guard) = local_guard {
555        guard.cleanup().await
556    } else {
557        None
558    };
559
560    let summary_value =
561        read_run_summary(&run_dir).or_else(|| parse_last_json_line(&outcome.stdout));
562    let Some(summary) = summary_value else {
563        return RunReport {
564            run_id,
565            fixture_id: fixture.id.to_string(),
566            fixture_name: fixture.name.to_string(),
567            fixture_tool_sequence: fixture.tool_sequence.to_string(),
568            selector,
569            tool_format,
570            status: "infra_error".to_string(),
571            passed: false,
572            skipped: false,
573            skipped_reason: None,
574            output_dir: run_dir.display().to_string(),
575            transcript_events_path: run_dir
576                .join("transcript_events.jsonl")
577                .display()
578                .to_string(),
579            workspace_root: None,
580            elapsed_ms,
581            duration_ms: 0,
582            iterations: 0,
583            input_tokens: 0,
584            output_tokens: 0,
585            cost_usd: 0.0,
586            pricing_known: false,
587            tool_calls: 0,
588            rejected_tool_calls: 0,
589            tool_sequence: Vec::new(),
590            successful_tools: Vec::new(),
591            transcript_event_count: 0,
592            verification_success: false,
593            harn_exit_code: outcome.exit_code,
594            error: Some("benchmark harness produced no summary JSON".to_string()),
595            stderr_excerpt: excerpt(&outcome.stderr),
596            local_cleanup,
597        };
598    };
599
600    report_from_summary(
601        RunSummaryContext {
602            run_id,
603            fixture,
604            selector,
605            tool_format,
606            run_dir,
607            elapsed_ms,
608            exit_code: outcome.exit_code,
609            stderr: outcome.stderr,
610            local_cleanup,
611        },
612        summary,
613    )
614}
615
616fn report_from_summary(ctx: RunSummaryContext, summary: JsonValue) -> RunReport {
617    let passed = summary
618        .get("passed")
619        .and_then(JsonValue::as_bool)
620        .unwrap_or(false)
621        && ctx.exit_code == 0;
622    let input_tokens = summary
623        .pointer("/llm/input_tokens")
624        .and_then(JsonValue::as_i64)
625        .unwrap_or(0);
626    let output_tokens = summary
627        .pointer("/llm/output_tokens")
628        .and_then(JsonValue::as_i64)
629        .unwrap_or(0);
630    let pricing = harn_vm::llm::llm_pricing_per_1k(&ctx.selector.provider, &ctx.selector.model);
631    let cost_usd = pricing
632        .map(|(input, output)| {
633            (input_tokens.max(0) as f64 * input + output_tokens.max(0) as f64 * output) / 1000.0
634        })
635        .unwrap_or(0.0);
636    let status = if passed {
637        "passed".to_string()
638    } else if ctx.exit_code == 0 {
639        "failed".to_string()
640    } else {
641        summary
642            .get("status")
643            .and_then(JsonValue::as_str)
644            .unwrap_or("failed")
645            .to_string()
646    };
647    RunReport {
648        run_id: ctx.run_id,
649        fixture_id: ctx.fixture.id.to_string(),
650        fixture_name: ctx.fixture.name.to_string(),
651        fixture_tool_sequence: ctx.fixture.tool_sequence.to_string(),
652        selector: ctx.selector,
653        tool_format: ctx.tool_format,
654        status,
655        passed,
656        skipped: false,
657        skipped_reason: None,
658        output_dir: ctx.run_dir.display().to_string(),
659        transcript_events_path: ctx
660            .run_dir
661            .join("transcript_events.jsonl")
662            .display()
663            .to_string(),
664        workspace_root: summary
665            .get("workspace_root")
666            .and_then(JsonValue::as_str)
667            .map(str::to_string),
668        elapsed_ms: ctx.elapsed_ms,
669        duration_ms: summary
670            .get("duration_ms")
671            .and_then(JsonValue::as_u64)
672            .unwrap_or(ctx.elapsed_ms),
673        iterations: summary
674            .pointer("/llm/iterations")
675            .and_then(JsonValue::as_i64)
676            .unwrap_or(0),
677        input_tokens,
678        output_tokens,
679        cost_usd,
680        pricing_known: pricing.is_some(),
681        tool_calls: summary
682            .pointer("/tools/calls")
683            .and_then(JsonValue::as_array)
684            .map(Vec::len)
685            .unwrap_or(0),
686        rejected_tool_calls: summary
687            .pointer("/tools/rejected")
688            .and_then(JsonValue::as_array)
689            .map(Vec::len)
690            .unwrap_or(0),
691        tool_sequence: tool_call_sequence(summary.pointer("/tools/calls"))
692            .or_else(|| non_empty_string_array(summary.pointer("/tools/successful")))
693            .unwrap_or_default(),
694        successful_tools: string_array(summary.pointer("/tools/successful")),
695        transcript_event_count: summary
696            .get("transcript_event_count")
697            .and_then(JsonValue::as_u64)
698            .unwrap_or(0) as usize,
699        verification_success: summary
700            .pointer("/verification/success")
701            .and_then(JsonValue::as_bool)
702            .unwrap_or(false),
703        harn_exit_code: ctx.exit_code,
704        error: (!passed).then(|| {
705            summary
706                .get("status")
707                .and_then(JsonValue::as_str)
708                .unwrap_or("benchmark failed")
709                .to_string()
710        }),
711        stderr_excerpt: excerpt(&ctx.stderr),
712        local_cleanup: ctx.local_cleanup,
713    }
714}
715
716impl LocalRunGuard {
717    async fn before(selector: &ModelSelector, stop_after: bool) -> Option<Self> {
718        if !selector_is_local(selector) {
719            return None;
720        }
721        let snapshot = snapshot_provider(&selector.provider, Path::new("."))
722            .await
723            .ok();
724        Some(Self {
725            selector: selector.clone(),
726            stop_after,
727            snapshot,
728        })
729    }
730
731    async fn cleanup(self) -> Option<LocalCleanupReport> {
732        let snapshot = self.snapshot?;
733        if self.selector.provider != "ollama" {
734            return Some(LocalCleanupReport {
735                provider: self.selector.provider,
736                model: self.selector.model,
737                initially_loaded: false,
738                action: "not_applicable".to_string(),
739                detail: Some(
740                    "non-Ollama local providers are only stopped when Harn launched a managed server"
741                        .to_string(),
742                ),
743            });
744        }
745        let initially_loaded = snapshot
746            .loaded_models
747            .iter()
748            .any(|loaded| loaded.name == self.selector.model);
749        if !self.stop_after {
750            return Some(LocalCleanupReport {
751                provider: self.selector.provider,
752                model: self.selector.model,
753                initially_loaded,
754                action: "left_running".to_string(),
755                detail: Some("--keep-local-after-run".to_string()),
756            });
757        }
758        if initially_loaded {
759            return Some(LocalCleanupReport {
760                provider: self.selector.provider,
761                model: self.selector.model,
762                initially_loaded,
763                action: "left_preexisting".to_string(),
764                detail: None,
765            });
766        }
767        match ollama_unload_model(&snapshot.base_url, &self.selector.model).await {
768            Ok(()) => Some(LocalCleanupReport {
769                provider: self.selector.provider,
770                model: self.selector.model,
771                initially_loaded,
772                action: "unloaded".to_string(),
773                detail: None,
774            }),
775            Err(error) => Some(LocalCleanupReport {
776                provider: self.selector.provider,
777                model: self.selector.model,
778                initially_loaded,
779                action: "unload_failed".to_string(),
780                detail: Some(error),
781            }),
782        }
783    }
784}
785
786fn script_argv(
787    args: &EvalCodingAgentArgs,
788    fixture: FixtureDefinition,
789    selector: &ModelSelector,
790    tool_format: &str,
791    run_dir: &Path,
792) -> Vec<String> {
793    let mut argv = vec![
794        "--fixture".to_string(),
795        fixture.id.to_string(),
796        "--output-dir".to_string(),
797        run_dir.display().to_string(),
798        "--provider".to_string(),
799        selector.provider.clone(),
800        "--model".to_string(),
801        selector.model.clone(),
802        "--tool-format".to_string(),
803        tool_format.to_string(),
804        "--max-iterations".to_string(),
805        args.max_iterations.to_string(),
806        "--python".to_string(),
807        args.python.clone(),
808    ];
809    if selector.provider == "mock" {
810        argv.push("--seed-mock".to_string());
811    }
812    if let Some(json) = resolve_step_judge_json(args, selector) {
813        argv.push("--step-judge-json".to_string());
814        argv.push(json);
815    }
816    if let Some(reason) = args
817        .override_reason
818        .as_deref()
819        .map(str::trim)
820        .filter(|reason| !reason.is_empty())
821    {
822        argv.push("--override-reason".to_string());
823        argv.push(reason.to_string());
824    }
825    if let Some(json) = resolve_structural_validator_json(args) {
826        argv.push("--structural-validator-json".to_string());
827        argv.push(json);
828    }
829    argv
830}
831
832fn tool_format_override_warning_line(stderr: &str) -> Option<&str> {
833    stderr
834        .lines()
835        .map(str::trim)
836        .find(|line| line.starts_with(TOOL_FORMAT_OVERRIDE_WARNING_PREFIX))
837}
838
839fn error_report(
840    run_id: String,
841    fixture: FixtureDefinition,
842    selector: ModelSelector,
843    tool_format: String,
844    run_dir: PathBuf,
845    error: String,
846) -> RunReport {
847    RunReport {
848        run_id,
849        fixture_id: fixture.id.to_string(),
850        fixture_name: fixture.name.to_string(),
851        fixture_tool_sequence: fixture.tool_sequence.to_string(),
852        selector,
853        tool_format,
854        status: "infra_error".to_string(),
855        passed: false,
856        skipped: false,
857        skipped_reason: None,
858        output_dir: run_dir.display().to_string(),
859        transcript_events_path: run_dir
860            .join("transcript_events.jsonl")
861            .display()
862            .to_string(),
863        workspace_root: None,
864        elapsed_ms: 0,
865        duration_ms: 0,
866        iterations: 0,
867        input_tokens: 0,
868        output_tokens: 0,
869        cost_usd: 0.0,
870        pricing_known: false,
871        tool_calls: 0,
872        rejected_tool_calls: 0,
873        tool_sequence: Vec::new(),
874        successful_tools: Vec::new(),
875        transcript_event_count: 0,
876        verification_success: false,
877        harn_exit_code: 1,
878        error: Some(error),
879        stderr_excerpt: None,
880        local_cleanup: None,
881    }
882}
883
884fn skipped_report(
885    run_id: String,
886    fixture: FixtureDefinition,
887    selector: ModelSelector,
888    tool_format: String,
889    run_dir: PathBuf,
890    reason: String,
891) -> RunReport {
892    RunReport {
893        run_id,
894        fixture_id: fixture.id.to_string(),
895        fixture_name: fixture.name.to_string(),
896        fixture_tool_sequence: fixture.tool_sequence.to_string(),
897        selector,
898        tool_format,
899        status: "skipped".to_string(),
900        passed: false,
901        skipped: true,
902        skipped_reason: Some(reason),
903        output_dir: run_dir.display().to_string(),
904        transcript_events_path: run_dir
905            .join("transcript_events.jsonl")
906            .display()
907            .to_string(),
908        workspace_root: None,
909        elapsed_ms: 0,
910        duration_ms: 0,
911        iterations: 0,
912        input_tokens: 0,
913        output_tokens: 0,
914        cost_usd: 0.0,
915        pricing_known: false,
916        tool_calls: 0,
917        rejected_tool_calls: 0,
918        tool_sequence: Vec::new(),
919        successful_tools: Vec::new(),
920        transcript_event_count: 0,
921        verification_success: false,
922        harn_exit_code: 0,
923        error: None,
924        stderr_excerpt: None,
925        local_cleanup: None,
926    }
927}
928
929fn provider_available(selector: &ModelSelector) -> bool {
930    if matches!(selector.provider.as_str(), "mock" | "fake") || selector_is_local(selector) {
931        return true;
932    }
933    harn_vm::llm_config::provider_key_available(&selector.provider)
934}
935
936fn resolve_fixtures(raw_fixtures: &[String]) -> Result<Vec<FixtureDefinition>, String> {
937    let mut seen = BTreeSet::new();
938    let mut out = Vec::new();
939    for raw in raw_fixtures {
940        let fixture = raw.trim().to_ascii_lowercase();
941        if fixture.is_empty() {
942            continue;
943        }
944        if fixture == "all" {
945            return Ok(FIXTURE_DEFINITIONS.to_vec());
946        }
947        let Some(definition) = fixture_definition(&fixture) else {
948            return Err(format!(
949                "unsupported --fixture `{fixture}`; expected one of: all, {}",
950                FIXTURE_DEFINITIONS
951                    .iter()
952                    .map(|definition| definition.id)
953                    .collect::<Vec<_>>()
954                    .join(", ")
955            ));
956        };
957        if seen.insert(definition.id) {
958            out.push(definition);
959        }
960    }
961    if out.is_empty() {
962        return Err("at least one coding-agent fixture must be selected".to_string());
963    }
964    Ok(out)
965}
966
967fn fixture_definition(id: &str) -> Option<FixtureDefinition> {
968    FIXTURE_DEFINITIONS
969        .iter()
970        .copied()
971        .find(|definition| definition.id == id)
972}
973
974async fn resolve_models(args: &EvalCodingAgentArgs) -> Result<Vec<ModelSelector>, String> {
975    let mut seen = BTreeSet::new();
976    let mut out = Vec::new();
977    for raw in normalize_model_selector_args(&args.models) {
978        let trimmed = raw.trim();
979        if trimmed.is_empty() {
980            continue;
981        }
982        let selector = resolve_selector(trimmed);
983        if seen.insert(selector_label(&selector)) {
984            out.push(selector);
985        }
986    }
987    if args.include_local {
988        for selector in discover_local_models(args).await {
989            if seen.insert(selector_label(&selector)) {
990                out.push(selector);
991            }
992        }
993    }
994    Ok(out)
995}
996
/// Trims every raw model argument and merges an adjacent `provider=…` /
/// `model=…` pair into a single `provider=…,model=…` entry.
///
/// All other entries (including a `provider=…` that is not immediately
/// followed by `model=…`) are passed through trimmed, in their original order.
fn normalize_model_selector_args(raw_models: &[String]) -> Vec<String> {
    let mut merged = Vec::new();
    let mut pending = raw_models.iter().map(|raw| raw.trim()).peekable();
    while let Some(current) = pending.next() {
        if current.starts_with("provider=") {
            // Consume the following entry only when it completes the pair.
            if let Some(next) = pending.next_if(|candidate| candidate.starts_with("model=")) {
                merged.push(format!("{current},{next}"));
                continue;
            }
        }
        merged.push(current.to_string());
    }
    merged
}
1015
1016async fn discover_local_models(args: &EvalCodingAgentArgs) -> Vec<ModelSelector> {
1017    let providers = if args.local_providers.is_empty() {
1018        local_provider_ids(None)
1019    } else {
1020        args.local_providers.clone()
1021    };
1022    let mut selectors = Vec::new();
1023    let mut seen = BTreeSet::new();
1024    for provider in providers {
1025        if selectors.len() >= args.max_local_models {
1026            break;
1027        }
1028        let Ok(snapshot) = snapshot_provider(&provider, Path::new(".")).await else {
1029            continue;
1030        };
1031        if !snapshot.reachable {
1032            continue;
1033        }
1034        let mut models = snapshot
1035            .loaded_models
1036            .iter()
1037            .map(|model| model.name.clone())
1038            .collect::<Vec<_>>();
1039        models.extend(snapshot.served_models);
1040        for model in models {
1041            if selectors.len() >= args.max_local_models {
1042                break;
1043            }
1044            let selector = ModelSelector {
1045                selector: format!("{provider}:{model}"),
1046                provider: provider.clone(),
1047                model,
1048            };
1049            if seen.insert(selector_label(&selector)) {
1050                selectors.push(selector);
1051            }
1052        }
1053    }
1054    selectors
1055}
1056
/// Normalises raw `--tool-format` values.
///
/// Each value is trimmed and lower-cased; blanks are ignored and duplicates
/// are dropped while keeping first-seen order.
///
/// # Errors
/// Returns an error for any value other than `native` or `text`.
fn normalize_tool_formats(raw_formats: &[String]) -> Result<Vec<String>, String> {
    let mut accepted = Vec::new();
    let mut already_seen = BTreeSet::new();
    for entry in raw_formats {
        let format = entry.trim().to_ascii_lowercase();
        match format.as_str() {
            "" => continue,
            "native" | "text" => {}
            _ => {
                return Err(format!(
                    "unsupported --tool-format `{format}`; expected `native` or `text`"
                ));
            }
        }
        if already_seen.insert(format.clone()) {
            accepted.push(format);
        }
    }
    Ok(accepted)
}
1076
1077fn build_matrix(
1078    fixtures: &[FixtureDefinition],
1079    models: &[ModelSelector],
1080    tool_formats: &[String],
1081    max_runs: Option<usize>,
1082) -> Vec<(FixtureDefinition, ModelSelector, String)> {
1083    if max_runs == Some(0) {
1084        return Vec::new();
1085    }
1086    let mut matrix = Vec::new();
1087    for fixture in fixtures {
1088        for selector in models {
1089            for tool_format in tool_formats {
1090                matrix.push((*fixture, selector.clone(), tool_format.clone()));
1091                if max_runs.is_some_and(|limit| matrix.len() >= limit) {
1092                    return matrix;
1093                }
1094            }
1095        }
1096    }
1097    matrix
1098}
1099
1100#[allow(clippy::too_many_arguments)]
1101fn build_summary(
1102    output_dir: &Path,
1103    fixtures: Vec<FixtureDefinition>,
1104    models: Vec<ModelSelector>,
1105    tool_formats: Vec<String>,
1106    env_keys_loaded: Vec<LoadedEnvKey>,
1107    runs: Vec<RunReport>,
1108    step_judge_preset: Option<String>,
1109    run_label: String,
1110    baseline_comparison: Option<BaselineComparison>,
1111) -> EvalSummary {
1112    let passed_runs = runs.iter().filter(|run| run.passed).count();
1113    let skipped_runs = runs.iter().filter(|run| run.skipped).count();
1114    let failed_runs = runs
1115        .iter()
1116        .filter(|run| !run.passed && !run.skipped)
1117        .count();
1118    let total_cost_usd = runs.iter().map(|run| run.cost_usd).sum();
1119    let rollups = build_rollups(&runs);
1120    let comparisons = compare_formats(&runs);
1121    let parity_by_pair = build_parity_by_pair(&comparisons);
1122    let diverged_comparisons = comparisons
1123        .iter()
1124        .filter(|comparison| !comparison.divergence_reasons.is_empty())
1125        .count();
1126    let followups = suggest_followups(&runs, &comparisons);
1127    EvalSummary {
1128        schema_version: 3,
1129        fixture_ids: fixtures
1130            .iter()
1131            .map(|fixture| fixture.id.to_string())
1132            .collect(),
1133        fixtures: fixtures
1134            .iter()
1135            .map(|fixture| FixtureReport {
1136                id: fixture.id.to_string(),
1137                name: fixture.name.to_string(),
1138                tool_sequence: fixture.tool_sequence.to_string(),
1139                description: fixture.description.to_string(),
1140            })
1141            .collect(),
1142        output_dir: output_dir.display().to_string(),
1143        models,
1144        tool_formats,
1145        env_keys_loaded,
1146        total_runs: runs.len(),
1147        passed_runs,
1148        failed_runs,
1149        skipped_runs,
1150        diverged_comparisons,
1151        total_cost_usd,
1152        rollups,
1153        runs,
1154        comparisons,
1155        parity_by_pair,
1156        followups,
1157        step_judge_preset,
1158        run_label,
1159        baseline_comparison,
1160    }
1161}
1162
1163fn load_baseline_comparison(path: &Path, runs: &[RunReport]) -> Result<BaselineComparison, String> {
1164    let resolved = if path.is_dir() {
1165        path.join("summary.json")
1166    } else {
1167        path.to_path_buf()
1168    };
1169    let raw = fs::read_to_string(&resolved)
1170        .map_err(|e| format!("failed to read {}: {e}", resolved.display()))?;
1171    let baseline: serde_json::Value = serde_json::from_str(&raw)
1172        .map_err(|e| format!("failed to parse {} as JSON: {e}", resolved.display()))?;
1173    let baseline_runs = baseline
1174        .get("runs")
1175        .and_then(|v| v.as_array())
1176        .ok_or_else(|| format!("{} has no `runs` array", resolved.display()))?;
1177    // Index baseline status by fixture_id. When the baseline has multiple
1178    // runs per fixture (e.g. native + text), prefer the first passing run
1179    // so a fixture passes the comparison if ANY baseline variant did.
1180    let mut baseline_status: BTreeMap<String, &str> = BTreeMap::new();
1181    for run in baseline_runs {
1182        let fixture_id = match run.get("fixture_id").and_then(|v| v.as_str()) {
1183            Some(id) => id.to_string(),
1184            None => continue,
1185        };
1186        let passed = run.get("passed").and_then(|v| v.as_bool()).unwrap_or(false);
1187        let skipped = run
1188            .get("skipped")
1189            .and_then(|v| v.as_bool())
1190            .unwrap_or(false);
1191        let status = if skipped {
1192            "skipped"
1193        } else if passed {
1194            "passed"
1195        } else {
1196            "failed"
1197        };
1198        baseline_status
1199            .entry(fixture_id)
1200            .and_modify(|existing| {
1201                if *existing != "passed" && status == "passed" {
1202                    *existing = status;
1203                }
1204            })
1205            .or_insert(status);
1206    }
1207    let mut cell_status: BTreeMap<String, &str> = BTreeMap::new();
1208    for run in runs {
1209        let status = if run.skipped {
1210            "skipped"
1211        } else if run.passed {
1212            "passed"
1213        } else {
1214            "failed"
1215        };
1216        cell_status
1217            .entry(run.fixture_id.clone())
1218            .and_modify(|existing| {
1219                if *existing != "passed" && status == "passed" {
1220                    *existing = status;
1221                }
1222            })
1223            .or_insert(status);
1224    }
1225    let mut regressions = Vec::new();
1226    let mut recoveries = Vec::new();
1227    let mut unchanged_passes = Vec::new();
1228    let mut unchanged_failures = Vec::new();
1229    let mut missing_in_baseline = Vec::new();
1230    let mut missing_in_cell = Vec::new();
1231    for (fixture, cell) in &cell_status {
1232        match baseline_status.get(fixture) {
1233            None => missing_in_baseline.push(fixture.clone()),
1234            Some(base) => match (*base, *cell) {
1235                ("passed", "passed") => unchanged_passes.push(fixture.clone()),
1236                ("passed", _) => regressions.push(FixtureStatusDelta {
1237                    fixture_id: fixture.clone(),
1238                    baseline_status: (*base).to_string(),
1239                    cell_status: (*cell).to_string(),
1240                }),
1241                (_, "passed") => recoveries.push(FixtureStatusDelta {
1242                    fixture_id: fixture.clone(),
1243                    baseline_status: (*base).to_string(),
1244                    cell_status: (*cell).to_string(),
1245                }),
1246                _ => unchanged_failures.push(fixture.clone()),
1247            },
1248        }
1249    }
1250    for fixture in baseline_status.keys() {
1251        if !cell_status.contains_key(fixture) {
1252            missing_in_cell.push(fixture.clone());
1253        }
1254    }
1255    let baseline_label = baseline
1256        .get("run_label")
1257        .and_then(|v| v.as_str())
1258        .filter(|s| !s.is_empty())
1259        .or_else(|| baseline.get("output_dir").and_then(|v| v.as_str()))
1260        .unwrap_or("")
1261        .to_string();
1262    let regressions_count = regressions.len();
1263    let recoveries_count = recoveries.len();
1264    let total_compared =
1265        regressions_count + recoveries_count + unchanged_passes.len() + unchanged_failures.len();
1266    let net_lift_pp = if total_compared == 0 {
1267        0.0
1268    } else {
1269        let raw =
1270            (recoveries_count as f64 - regressions_count as f64) / total_compared as f64 * 100.0;
1271        (raw * 10.0).round() / 10.0
1272    };
1273    Ok(BaselineComparison {
1274        baseline_label,
1275        baseline_path: resolved.display().to_string(),
1276        regressions,
1277        recoveries,
1278        unchanged_passes,
1279        unchanged_failures,
1280        missing_in_baseline,
1281        missing_in_cell,
1282        regressions_count,
1283        recoveries_count,
1284        net_lift_pp,
1285    })
1286}
1287
1288fn build_rollups(runs: &[RunReport]) -> EvalRollups {
1289    EvalRollups {
1290        by_fixture: rollup_by(runs, |run| run.fixture_id.clone()),
1291        by_provider: rollup_by(runs, |run| run.selector.provider.clone()),
1292        by_model: rollup_by(runs, |run| run.selector.model.clone()),
1293        by_tool_format: rollup_by(runs, |run| run.tool_format.clone()),
1294        by_tool_sequence: rollup_by(runs, |run| run.fixture_tool_sequence.clone()),
1295    }
1296}
1297
1298fn rollup_by<F>(runs: &[RunReport], key_for: F) -> Vec<RollupReport>
1299where
1300    F: Fn(&RunReport) -> String,
1301{
1302    let mut grouped: BTreeMap<String, RollupReport> = BTreeMap::new();
1303    for run in runs {
1304        let key = key_for(run);
1305        let entry = grouped.entry(key.clone()).or_insert_with(|| RollupReport {
1306            key,
1307            total_runs: 0,
1308            passed_runs: 0,
1309            failed_runs: 0,
1310            skipped_runs: 0,
1311            total_cost_usd: 0.0,
1312        });
1313        entry.total_runs += 1;
1314        if run.passed {
1315            entry.passed_runs += 1;
1316        } else if run.skipped {
1317            entry.skipped_runs += 1;
1318        } else {
1319            entry.failed_runs += 1;
1320        }
1321        entry.total_cost_usd += run.cost_usd;
1322    }
1323    grouped.into_values().collect()
1324}
1325
1326fn compare_formats(runs: &[RunReport]) -> Vec<FormatComparison> {
1327    let mut grouped: BTreeMap<String, Vec<&RunReport>> = BTreeMap::new();
1328    for run in runs {
1329        grouped
1330            .entry(format!(
1331                "{}\0{}",
1332                run.fixture_id,
1333                selector_label(&run.selector)
1334            ))
1335            .or_default()
1336            .push(run);
1337    }
1338    let mut out = Vec::new();
1339    for group in grouped.values() {
1340        let Some(first) = group.first() else {
1341            continue;
1342        };
1343        let native = group
1344            .iter()
1345            .find(|run| run.tool_format == "native")
1346            .copied();
1347        let text = group.iter().find(|run| run.tool_format == "text").copied();
1348        if native.is_none() && text.is_none() {
1349            continue;
1350        }
1351        let pair = native.zip(text);
1352        let mut divergence_reasons = Vec::new();
1353        if let Some((native, text)) = pair {
1354            if native.status != text.status {
1355                divergence_reasons.push(format!(
1356                    "status differs: native={} text={}",
1357                    native.status, text.status
1358                ));
1359            }
1360            if native.passed != text.passed {
1361                divergence_reasons.push(format!(
1362                    "pass result differs: native={} text={}",
1363                    native.passed, text.passed
1364                ));
1365            }
1366            if native.verification_success != text.verification_success {
1367                divergence_reasons.push(format!(
1368                    "verifier result differs: native={} text={}",
1369                    native.verification_success, text.verification_success
1370                ));
1371            }
1372            if native.tool_sequence != text.tool_sequence {
1373                divergence_reasons.push(format!(
1374                    "tool sequence differs: native=[{}] text=[{}]",
1375                    native.tool_sequence.join(", "),
1376                    text.tool_sequence.join(", ")
1377                ));
1378            }
1379            if native.rejected_tool_calls != text.rejected_tool_calls {
1380                divergence_reasons.push(format!(
1381                    "rejected tool-call recovery differs: native={} text={}",
1382                    native.rejected_tool_calls, text.rejected_tool_calls
1383                ));
1384            }
1385        }
1386        let evidence_paths = [native, text]
1387            .into_iter()
1388            .flatten()
1389            .map(|run| run.transcript_events_path.clone())
1390            .collect::<Vec<_>>();
1391        out.push(FormatComparison {
1392            fixture_id: first.fixture_id.clone(),
1393            selector: first.selector.clone(),
1394            native_run_id: native.map(|run| run.run_id.clone()),
1395            text_run_id: text.map(|run| run.run_id.clone()),
1396            native_evidence_path: native.map(|run| run.transcript_events_path.clone()),
1397            text_evidence_path: text.map(|run| run.transcript_events_path.clone()),
1398            native_status: native.map(|run| run.status.clone()),
1399            text_status: text.map(|run| run.status.clone()),
1400            native_passed: native.map(|run| run.passed),
1401            text_passed: text.map(|run| run.passed),
1402            native_tool_call_count: native.map(|run| run.tool_calls),
1403            text_tool_call_count: text.map(|run| run.tool_calls),
1404            native_rejected_tool_call_count: native.map(|run| run.rejected_tool_calls),
1405            text_rejected_tool_call_count: text.map(|run| run.rejected_tool_calls),
1406            verifier_match: pair
1407                .map(|(native, text)| native.verification_success == text.verification_success),
1408            tool_sequence_match: pair
1409                .map(|(native, text)| native.tool_sequence == text.tool_sequence),
1410            rejected_tool_call_delta_text_minus_native: pair.map(|(native, text)| {
1411                text.rejected_tool_calls as i64 - native.rejected_tool_calls as i64
1412            }),
1413            token_delta_text_minus_native: pair.map(|(native, text)| {
1414                (text.input_tokens + text.output_tokens)
1415                    - (native.input_tokens + native.output_tokens)
1416            }),
1417            iteration_delta_text_minus_native: pair
1418                .map(|(native, text)| text.iterations - native.iterations),
1419            equivalent: pair.map(|(native, text)| {
1420                native.status == text.status
1421                    && native.passed == text.passed
1422                    && native.skipped == text.skipped
1423                    && native.verification_success == text.verification_success
1424                    && native.tool_sequence == text.tool_sequence
1425                    && native.rejected_tool_calls == text.rejected_tool_calls
1426            }),
1427            divergence_reasons,
1428            evidence_paths,
1429        });
1430    }
1431    out
1432}
1433
1434fn build_parity_by_pair(comparisons: &[FormatComparison]) -> Vec<ToolModeParityPairSummary> {
1435    let fixture_inputs = comparisons
1436        .iter()
1437        .filter_map(parity_fixture_input)
1438        .collect::<Vec<_>>();
1439    let fixture_reports = tool_mode_parity::build_fixture_reports(&fixture_inputs);
1440    tool_mode_parity::build_pair_summaries(&fixture_reports)
1441}
1442
1443fn parity_fixture_input(comparison: &FormatComparison) -> Option<ToolModeParityFixtureInput> {
1444    let native_verdict = comparison.native_status.clone()?;
1445    let text_verdict = comparison.text_status.clone()?;
1446    if native_verdict == "skipped" || text_verdict == "skipped" {
1447        return None;
1448    }
1449    Some(ToolModeParityFixtureInput {
1450        provider: comparison.selector.provider.clone(),
1451        model: comparison.selector.model.clone(),
1452        fixture_id: comparison.fixture_id.clone(),
1453        native_verdict,
1454        text_verdict,
1455        native_passed: comparison.native_passed?,
1456        text_passed: comparison.text_passed?,
1457        agreement: comparison.equivalent?,
1458        verifier_agreement: comparison.verifier_match?,
1459        native_tool_call_count: comparison.native_tool_call_count?,
1460        text_tool_call_count: comparison.text_tool_call_count?,
1461        native_rejected_tool_call_count: comparison.native_rejected_tool_call_count?,
1462        text_rejected_tool_call_count: comparison.text_rejected_tool_call_count?,
1463        native_evidence_path: comparison.native_evidence_path.clone()?,
1464        text_evidence_path: comparison.text_evidence_path.clone()?,
1465    })
1466}
1467
1468fn suggest_followups(
1469    runs: &[RunReport],
1470    comparisons: &[FormatComparison],
1471) -> Vec<FollowupSuggestion> {
1472    let mut out = Vec::new();
1473    let failed = runs
1474        .iter()
1475        .filter(|run| !run.passed && !run.skipped)
1476        .map(|run| run.run_id.clone())
1477        .collect::<Vec<_>>();
1478    if !failed.is_empty() {
1479        out.push(FollowupSuggestion {
1480            title: "Normalize coding-agent fixture failures across provider presets".to_string(),
1481            body: "One or more fixture/provider/tool-format runs failed. Inspect the run directories and decide whether the gap belongs in provider adapters, preset prompting, transcript handling, or host-tool ergonomics.".to_string(),
1482            labels: vec!["eval".to_string(), "providers".to_string()],
1483            run_ids: failed,
1484        });
1485    }
1486
1487    let rejected = runs
1488        .iter()
1489        .filter(|run| run.rejected_tool_calls > 0)
1490        .map(|run| run.run_id.clone())
1491        .collect::<Vec<_>>();
1492    if !rejected.is_empty() {
1493        out.push(FollowupSuggestion {
1494            title: "Abstract rejected tool-call recovery in agent transcripts".to_string(),
1495            body: "Some runs recovered after rejected tool calls. Add runtime support or preset guidance so harness authors can distinguish recoverable provider/tool-shape noise from user-relevant transcript events.".to_string(),
1496            labels: vec!["agents".to_string(), "transcripts".to_string()],
1497            run_ids: rejected,
1498        });
1499    }
1500
1501    let mismatched = comparisons
1502        .iter()
1503        .filter(|comparison| !comparison.divergence_reasons.is_empty())
1504        .map(|comparison| {
1505            format!(
1506                "{}:{} ({})",
1507                comparison.fixture_id,
1508                selector_label(&comparison.selector),
1509                comparison.divergence_reasons.join("; ")
1510            )
1511        })
1512        .collect::<Vec<_>>();
1513    if !mismatched.is_empty() {
1514        let run_ids = comparisons
1515            .iter()
1516            .filter(|comparison| !comparison.divergence_reasons.is_empty())
1517            .flat_map(|comparison| {
1518                [
1519                    comparison.native_run_id.clone(),
1520                    comparison.text_run_id.clone(),
1521                ]
1522            })
1523            .flatten()
1524            .collect::<Vec<_>>();
1525        out.push(FollowupSuggestion {
1526            title: "Make native/text tool modes behaviorally interchangeable for preset harnesses"
1527                .to_string(),
1528            body: format!(
1529                "Native and text tool modes diverged for: {}. The preset/runtime boundary should hide provider tool-channel differences where possible.",
1530                mismatched.join(", ")
1531            ),
1532            labels: vec!["agents".to_string(), "tools".to_string()],
1533            run_ids,
1534        });
1535    }
1536
1537    let unknown_pricing = runs
1538        .iter()
1539        .filter(|run| {
1540            !run.skipped
1541                && !run.pricing_known
1542                && !matches!(run.selector.provider.as_str(), "mock" | "fake")
1543                && !selector_is_local(&run.selector)
1544        })
1545        .map(|run| run.run_id.clone())
1546        .collect::<Vec<_>>();
1547    if !unknown_pricing.is_empty() {
1548        out.push(FollowupSuggestion {
1549            title: "Fill provider pricing metadata for benchmarked models".to_string(),
1550            body: "At least one live provider/model produced usage metrics but had no pricing entry, which weakens cost comparisons in eval reports.".to_string(),
1551            labels: vec!["providers".to_string(), "docs".to_string()],
1552            run_ids: unknown_pricing,
1553        });
1554    }
1555    out
1556}
1557
1558fn write_json_artifacts(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1559    write_json_pretty(&output_dir.join("summary.json"), summary)?;
1560    write_jsonl(&output_dir.join("per_run.jsonl"), &summary.runs)?;
1561    let summary_value = serde_json::to_value(summary).map_err(|error| error.to_string())?;
1562    let readiness = local_readiness::report_from_summary_json(
1563        &summary_value,
1564        output_dir.display().to_string(),
1565    )?;
1566    write_json_pretty(&output_dir.join("local_readiness.json"), &readiness)?;
1567    let generated_at = RealClock::new()
1568        .now_utc()
1569        .format(&time::format_description::well_known::Rfc3339)
1570        .map_err(|error| format!("failed to format parity overlay timestamp: {error}"))?;
1571    let parity_dir = output_dir.join(TOOL_MODE_PARITY_DIRECTORY);
1572    let parity_reports = tool_mode_parity::build_fixture_reports(
1573        &summary
1574            .comparisons
1575            .iter()
1576            .filter_map(parity_fixture_input)
1577            .collect::<Vec<_>>(),
1578    );
1579    for report in &parity_reports {
1580        let path = parity_dir
1581            .join(sanitize_id(&format!(
1582                "{}__{}:{}",
1583                report.fixture_id, report.provider, report.model
1584            )))
1585            .join("parity.json");
1586        tool_mode_parity::write_fixture_report(&path, report)?;
1587    }
1588    let overlay = tool_mode_parity::build_overlay(
1589        &summary.parity_by_pair,
1590        &generated_at,
1591        TOOL_MODE_PARITY_FIXTURE_SUITE,
1592        output_dir,
1593    );
1594    tool_mode_parity::write_overlay(
1595        &output_dir.join(TOOL_MODE_PARITY_OVERLAY_FILENAME),
1596        &overlay,
1597    )?;
1598    Ok(())
1599}
1600
1601fn announce_output_paths(output_dir: &Path) {
1602    eprintln!(
1603        "wrote {}, {}, {}, {}, {}, {}, and {}",
1604        output_dir.join("summary.json").display(),
1605        output_dir.join("per_run.jsonl").display(),
1606        output_dir.join("local_readiness.json").display(),
1607        output_dir.join(TOOL_MODE_PARITY_DIRECTORY).display(),
1608        output_dir.join(TOOL_MODE_PARITY_OVERLAY_FILENAME).display(),
1609        output_dir.join("summary.md").display(),
1610        output_dir.join("followups.md").display()
1611    );
1612}
1613
1614// ─── Legacy direct-render path (gated by HARN_CLI_IMPL=rust) ────────────
1615
1616fn write_markdown_artifacts_legacy(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1617    fs::write(output_dir.join("summary.md"), render_markdown(summary))
1618        .map_err(|error| format!("failed to write summary.md: {error}"))?;
1619    fs::write(output_dir.join("followups.md"), render_followups(summary))
1620        .map_err(|error| format!("failed to write followups.md: {error}"))?;
1621    Ok(())
1622}
1623
1624fn print_summary_legacy(summary: &EvalSummary) {
1625    println!(
1626        "coding-agent eval: {}/{} passed, {} skipped, total_cost_usd={:.6}",
1627        summary.passed_runs, summary.total_runs, summary.skipped_runs, summary.total_cost_usd
1628    );
1629}
1630
1631fn print_json_legacy(summary: &EvalSummary) {
1632    match serde_json::to_string_pretty(summary) {
1633        Ok(payload) => println!("{payload}"),
1634        Err(error) => eprintln!("warning: failed to render summary JSON: {error}"),
1635    }
1636}
1637
1638// ─── Dispatch (.harn) render path ────────────────────────────────────────
1639
1640async fn write_markdown_artifacts_dispatch(
1641    output_dir: &Path,
1642    summary: &EvalSummary,
1643) -> Result<(), i32> {
1644    let markdown = render_via_dispatch(summary, "markdown").await?;
1645    if let Err(error) = fs::write(output_dir.join("summary.md"), markdown) {
1646        eprintln!("error: failed to write summary.md: {error}");
1647        return Err(1);
1648    }
1649    let followups = render_via_dispatch(summary, "followups").await?;
1650    if let Err(error) = fs::write(output_dir.join("followups.md"), followups) {
1651        eprintln!("error: failed to write followups.md: {error}");
1652        return Err(1);
1653    }
1654    Ok(())
1655}
1656
1657async fn print_summary_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1658    let payload = render_via_dispatch(summary, "summary").await?;
1659    print!("{payload}");
1660    // The script emits exactly the legacy summary line (no trailing
1661    // newline); add one to match the legacy `println!` semantics.
1662    if !payload.ends_with('\n') {
1663        println!();
1664    }
1665    Ok(())
1666}
1667
1668async fn print_json_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1669    let payload = render_via_dispatch(summary, "json").await?;
1670    print!("{payload}");
1671    if !payload.ends_with('\n') {
1672        println!();
1673    }
1674    Ok(())
1675}
1676
1677/// Dispatch to the embedded `cli/eval/coding_agent.harn` script for one
1678/// of the four rendering modes (markdown / followups / summary / json).
1679/// Returns the captured stdout on success, or a propagated exit code
1680/// on failure.
1681///
1682/// **Concurrency.** Held under [`DISPATCH_RENDER_LOCK`] so concurrent
1683/// in-process callers don't race on the global env vars the Rust shim
1684/// sets to hand the report to the script. See the lock's docstring
1685/// for the trade-off rationale.
1686async fn render_via_dispatch(summary: &EvalSummary, mode: &str) -> Result<String, i32> {
1687    let summary_json = match serde_json::to_string(summary) {
1688        Ok(json) => json,
1689        Err(error) => {
1690            eprintln!("error: failed to serialise EvalSummary for dispatch: {error}");
1691            return Err(1);
1692        }
1693    };
1694    let _guard = DISPATCH_RENDER_LOCK.lock().await;
1695    let _summary = ScopedEnvVar::set(CODING_AGENT_SUMMARY_ENV, &summary_json);
1696    let _mode = ScopedEnvVar::set(CODING_AGENT_MODE_ENV, mode);
1697
1698    let outcome = dispatch::run_embedded_script("eval/coding_agent", Vec::new(), false).await;
1699    if !outcome.stderr.is_empty() {
1700        let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
1701    }
1702    if outcome.exit_code != 0 {
1703        return Err(outcome.exit_code);
1704    }
1705    Ok(outcome.stdout)
1706}
1707
1708fn write_json_pretty<T: Serialize>(path: &Path, value: &T) -> Result<(), String> {
1709    let body = serde_json::to_string_pretty(value).map_err(|error| error.to_string())?;
1710    fs::write(path, format!("{body}\n")).map_err(|error| error.to_string())
1711}
1712
1713fn write_jsonl<T: Serialize>(path: &Path, items: &[T]) -> Result<(), String> {
1714    let mut body = String::new();
1715    for item in items {
1716        let line = serde_json::to_string(item).map_err(|error| error.to_string())?;
1717        body.push_str(&line);
1718        body.push('\n');
1719    }
1720    fs::write(path, body).map_err(|error| error.to_string())
1721}
1722
/// Renders the full `summary.md` body for an evaluation run directly in
/// Rust (the non-dispatch render path).
///
/// Sections are emitted in this fixed order:
/// 1. title and headline counters (fixtures, passed/total, skipped, cost),
/// 2. five rollup tables (fixture, provider, model, tool format, tool
///    sequence),
/// 3. the per-run table,
/// 4. the baseline comparison, only when `summary.baseline_comparison` is
///    present,
/// 5. the native/text comparison table, only when `summary.comparisons`
///    is non-empty,
/// 6. the native-vs-text parity report, only when `summary.parity_by_pair`
///    is non-empty,
/// 7. divergence evidence, only for comparisons that carry at least one
///    divergence reason.
///
/// Returns the assembled markdown; nothing is written to disk here.
fn render_markdown(summary: &EvalSummary) -> String {
    let mut out = String::new();
    // Title plus headline counters; cost is printed with six decimals.
    out.push_str("# Coding Agent Harness Quality Suite\n\n");
    out.push_str(&format!(
        "- fixtures: `{}`\n- passed: {}/{}\n- skipped: {}\n- total_cost_usd: {:.6}\n\n",
        summary.fixture_ids.join("`, `"),
        summary.passed_runs,
        summary.total_runs,
        summary.skipped_runs,
        summary.total_cost_usd
    ));
    // Rollup tables, one per grouping dimension.
    render_rollup_table(&mut out, "By Fixture", &summary.rollups.by_fixture);
    render_rollup_table(&mut out, "By Provider", &summary.rollups.by_provider);
    render_rollup_table(&mut out, "By Model", &summary.rollups.by_model);
    render_rollup_table(&mut out, "By Tool Format", &summary.rollups.by_tool_format);
    render_rollup_table(
        &mut out,
        "By Tool Sequence",
        &summary.rollups.by_tool_sequence,
    );

    // Per-run table: one row per entry in `summary.runs`.
    out.push_str("\n## Runs\n\n");
    out.push_str("| fixture | run | provider | model | tool format | fixture sequence | tool calls | status | iterations | tokens | cost | transcript | output |\n");
    out.push_str("|---|---|---|---|---|---|---|---|---:|---:|---:|---|---|\n");
    for run in &summary.runs {
        // An empty tool sequence renders as `-`; literal pipes are escaped
        // so they cannot terminate the table cell.
        let tool_sequence = if run.tool_sequence.is_empty() {
            "-".to_string()
        } else {
            run.tool_sequence.join(", ").replace('|', "\\|")
        };
        out.push_str(&format!(
            "| `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {:.6} | {} | `{}` |\n",
            run.fixture_id,
            run.run_id,
            run.selector.provider,
            run.selector.model.replace('|', "\\|"),
            run.tool_format,
            run.fixture_tool_sequence,
            tool_sequence,
            run.status,
            run.iterations,
            // The "tokens" column is input + output combined.
            run.input_tokens + run.output_tokens,
            run.cost_usd,
            // The transcript cell shows the event count, linked to the
            // transcript events path.
            markdown_link(
                &run.transcript_event_count.to_string(),
                &run.transcript_events_path
            ),
            run.output_dir
        ));
    }
    // Optional baseline comparison section.
    if let Some(comparison) = &summary.baseline_comparison {
        out.push_str("\n## Baseline Comparison\n\n");
        out.push_str(&format!(
            "Compared against `{}`{}.\n\n",
            comparison.baseline_path,
            // The label suffix is omitted entirely when the label is empty.
            if comparison.baseline_label.is_empty() {
                String::new()
            } else {
                format!(" (label: `{}`)", comparison.baseline_label)
            },
        ));
        // Net lift is printed with an explicit sign and one decimal.
        out.push_str(&format!(
            "- regressions: **{}** (baseline passed, this cell failed)\n- recoveries: **{}** (baseline failed, this cell passed)\n- net lift: **{:+.1}pp**\n\n",
            comparison.regressions_count,
            comparison.recoveries_count,
            comparison.net_lift_pp,
        ));
        if !comparison.regressions.is_empty() {
            out.push_str("### Regressions\n\n");
            for delta in &comparison.regressions {
                out.push_str(&format!(
                    "- `{}`: `{}` → `{}`\n",
                    delta.fixture_id, delta.baseline_status, delta.cell_status,
                ));
            }
            out.push('\n');
        }
        if !comparison.recoveries.is_empty() {
            out.push_str("### Recoveries\n\n");
            for delta in &comparison.recoveries {
                out.push_str(&format!(
                    "- `{}`: `{}` → `{}`\n",
                    delta.fixture_id, delta.baseline_status, delta.cell_status,
                ));
            }
            out.push('\n');
        }
    }
    // Optional native/text comparison table; missing values render as `-`.
    if !summary.comparisons.is_empty() {
        out.push_str("\n## Native/Text Comparison\n\n");
        out.push_str("| fixture | selector | native | text | equivalent | verifier | tools | rejected delta | token delta | iteration delta | evidence |\n");
        out.push_str("|---|---|---|---|---|---|---|---:|---:|---:|---|\n");
        for comparison in &summary.comparisons {
            out.push_str(&format!(
                "| `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
                comparison.fixture_id,
                selector_label(&comparison.selector),
                comparison
                    .native_status
                    .clone()
                    .unwrap_or_else(|| "-".to_string()),
                comparison
                    .text_status
                    .clone()
                    .unwrap_or_else(|| "-".to_string()),
                optional_bool_mark(comparison.equivalent),
                optional_bool_mark(comparison.verifier_match),
                optional_bool_mark(comparison.tool_sequence_match),
                comparison
                    .rejected_tool_call_delta_text_minus_native
                    .map(|v| v.to_string())
                    .unwrap_or_else(|| "-".to_string()),
                comparison
                    .token_delta_text_minus_native
                    .map(|v| v.to_string())
                    .unwrap_or_else(|| "-".to_string()),
                comparison
                    .iteration_delta_text_minus_native
                    .map(|v| v.to_string())
                    .unwrap_or_else(|| "-".to_string()),
                comparison_evidence_links(comparison)
            ));
        }
    }
    // Optional parity report; rates are multiplied by 100 and shown as
    // percentages with one decimal.
    if !summary.parity_by_pair.is_empty() {
        out.push_str("\n## Parity report — native vs text\n\n");
        out.push_str("| selector | sample | native pass | text pass | agreement | verifier divergence | native_only | text_only | both_pass | both_fail |\n");
        out.push_str("|---|---:|---:|---:|---:|---:|---:|---:|---:|---:|\n");
        for pair in &summary.parity_by_pair {
            out.push_str(&format!(
                "| `{}` | {} | {:.1}% | {:.1}% | {:.1}% | {:.1}% | {} | {} | {} | {} |\n",
                // A temporary selector is built from the pair's provider
                // and model so the row label goes through `selector_label`
                // like the other tables.
                selector_label(&ModelSelector {
                    selector: format!("{}:{}", pair.provider, pair.model),
                    provider: pair.provider.clone(),
                    model: pair.model.clone(),
                }),
                pair.sample_size,
                pair.native.pass_rate * 100.0,
                pair.text.pass_rate * 100.0,
                pair.agreement_rate * 100.0,
                pair.verifier_divergence_rate * 100.0,
                pair.divergence_counts.native_only_pass,
                pair.divergence_counts.text_only_pass,
                pair.divergence_counts.both_pass,
                pair.divergence_counts.both_fail,
            ));
        }
    }
    // Divergence evidence: only comparisons with at least one recorded
    // divergence reason are listed.
    let diverged = summary
        .comparisons
        .iter()
        .filter(|comparison| !comparison.divergence_reasons.is_empty())
        .collect::<Vec<_>>();
    if !diverged.is_empty() {
        out.push_str("\n## Native/Text Divergence Evidence\n\n");
        for comparison in diverged {
            out.push_str(&format!(
                "- `{}` `{}`: {}\n",
                comparison.fixture_id,
                selector_label(&comparison.selector),
                comparison.divergence_reasons.join("; ")
            ));
            // The indented evidence line is only added when evidence paths
            // were recorded for this comparison.
            if !comparison.evidence_paths.is_empty() {
                out.push_str(&format!(
                    "  Evidence: {}\n",
                    comparison_evidence_links(comparison)
                ));
            }
        }
    }
    out
}
1895
1896fn render_rollup_table(out: &mut String, title: &str, rollups: &[RollupReport]) {
1897    out.push_str(&format!("## {title}\n\n"));
1898    out.push_str("| key | passed | failed | skipped | total | cost |\n");
1899    out.push_str("|---|---:|---:|---:|---:|---:|\n");
1900    for rollup in rollups {
1901        out.push_str(&format!(
1902            "| `{}` | {} | {} | {} | {} | {:.6} |\n",
1903            rollup.key.replace('|', "\\|"),
1904            rollup.passed_runs,
1905            rollup.failed_runs,
1906            rollup.skipped_runs,
1907            rollup.total_runs,
1908            rollup.total_cost_usd
1909        ));
1910    }
1911    out.push('\n');
1912}
1913
1914fn render_followups(summary: &EvalSummary) -> String {
1915    let mut out = String::new();
1916    out.push_str("# Follow-up Issue Candidates\n\n");
1917    if summary.followups.is_empty() {
1918        out.push_str("No follow-up issue candidates were generated from this run.\n");
1919        return out;
1920    }
1921    for followup in &summary.followups {
1922        out.push_str(&format!("## {}\n\n{}\n\n", followup.title, followup.body));
1923        if !followup.run_ids.is_empty() {
1924            out.push_str(&format!("- run_ids: `{}`\n", followup.run_ids.join("`, `")));
1925        }
1926        if !followup.labels.is_empty() {
1927            out.push_str(&format!("- labels: `{}`\n", followup.labels.join("`, `")));
1928        }
1929        out.push('\n');
1930    }
1931    out
1932}
1933
1934fn read_run_summary(run_dir: &Path) -> Option<JsonValue> {
1935    let raw = fs::read_to_string(run_dir.join("summary.json")).ok()?;
1936    serde_json::from_str(&raw).ok()
1937}
1938
1939fn parse_last_json_line(stdout: &str) -> Option<JsonValue> {
1940    stdout
1941        .lines()
1942        .rev()
1943        .map(str::trim)
1944        .filter(|line| !line.is_empty())
1945        .find_map(|line| serde_json::from_str::<JsonValue>(line).ok())
1946}
1947
1948fn string_array(value: Option<&JsonValue>) -> Vec<String> {
1949    value
1950        .and_then(JsonValue::as_array)
1951        .map(|values| {
1952            values
1953                .iter()
1954                .filter_map(JsonValue::as_str)
1955                .map(str::to_string)
1956                .collect()
1957        })
1958        .unwrap_or_default()
1959}
1960
1961fn non_empty_string_array(value: Option<&JsonValue>) -> Option<Vec<String>> {
1962    let values = string_array(value);
1963    (!values.is_empty()).then_some(values)
1964}
1965
1966fn tool_call_sequence(value: Option<&JsonValue>) -> Option<Vec<String>> {
1967    let calls = value.and_then(JsonValue::as_array)?;
1968    let mut sequence = Vec::new();
1969    for call in calls {
1970        if let Some(name) = call
1971            .get("name")
1972            .or_else(|| call.get("tool_name"))
1973            .and_then(JsonValue::as_str)
1974        {
1975            sequence.push(name.to_string());
1976        }
1977    }
1978    (!sequence.is_empty()).then_some(sequence)
1979}
1980
/// Maps a tri-state flag to its table-cell text: `yes`, `no`, or `-` when
/// the value is unknown.
fn optional_bool_mark(value: Option<bool>) -> &'static str {
    value.map_or("-", |flag| if flag { "yes" } else { "no" })
}
1988
1989fn comparison_evidence_links(comparison: &FormatComparison) -> String {
1990    let mut links = Vec::new();
1991    if let Some(native) = comparison.native_evidence_path.as_deref() {
1992        links.push(markdown_link("native", native));
1993    }
1994    if let Some(text) = comparison.text_evidence_path.as_deref() {
1995        links.push(markdown_link("text", text));
1996    }
1997    if links.is_empty() {
1998        "-".to_string()
1999    } else {
2000        links.join("<br>")
2001    }
2002}
2003
/// Formats a markdown link `[label](target)` that is safe to place inside
/// a table cell.
///
/// Pipes in `label` are backslash-escaped; spaces and parentheses in
/// `target` are percent-encoded (`%20`, `%28`, `%29`) so they cannot end
/// the link destination early. No other characters are altered.
fn markdown_link(label: &str, target: &str) -> String {
    let mut href = String::with_capacity(target.len());
    for ch in target.chars() {
        match ch {
            ' ' => href.push_str("%20"),
            '(' => href.push_str("%28"),
            ')' => href.push_str("%29"),
            other => href.push(other),
        }
    }
    let text = label.replace('|', "\\|");
    format!("[{}]({})", text, href)
}
2014
/// Ensures `path` is an empty directory: any existing tree at `path` is
/// removed, then the directory (and missing parents) is created afresh.
///
/// I/O errors are flattened to their display string.
fn reset_dir(path: &Path) -> Result<(), String> {
    let describe = |error: std::io::Error| error.to_string();
    if path.exists() {
        fs::remove_dir_all(path).map_err(describe)?;
    }
    fs::create_dir_all(path).map_err(describe)
}
2021
2022fn run_id_for(fixture: FixtureDefinition, selector: &ModelSelector, tool_format: &str) -> String {
2023    sanitize_id(&format!(
2024        "{}__{}__{}",
2025        fixture.id,
2026        selector_label(selector),
2027        tool_format
2028    ))
2029}
2030
/// Normalises `raw` into an identifier made only of ASCII alphanumerics,
/// `-`, and `_`.
///
/// Every other character (including non-ASCII) is replaced by `_`
/// one-for-one, after which leading and trailing underscores are
/// stripped. Interior runs of underscores are left as they are.
fn sanitize_id(raw: &str) -> String {
    let mapped: String = raw
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') {
                ch
            } else {
                '_'
            }
        })
        .collect();
    mapped.trim_matches('_').to_string()
}
2042
/// Returns the default, relative output directory for benchmark runs:
/// `.harn-runs/coding-agent-bench/latest`.
fn default_output_dir() -> PathBuf {
    [".harn-runs", "coding-agent-bench", "latest"]
        .iter()
        .collect()
}
2048
/// Returns a trimmed excerpt of `text`, capped at 4000 characters.
///
/// - Whitespace-only (or empty) input yields `None`.
/// - Text of at most 4000 characters is returned whole.
/// - Longer text is cut after its first 4000 characters and suffixed with
///   `...` to signal the truncation.
///
/// The limit is measured in characters, not bytes, so multi-byte text is
/// only marked as truncated when characters were actually dropped.
fn excerpt(text: &str) -> Option<String> {
    const MAX_CHARS: usize = 4000;

    let trimmed = text.trim();
    if trimmed.is_empty() {
        return None;
    }
    // Fast path: a byte length within the limit implies the character
    // count is within the limit too.
    if trimmed.len() <= MAX_CHARS {
        return Some(trimmed.to_string());
    }
    // Byte offset of the first character past the limit, if there is one.
    match trimmed.char_indices().nth(MAX_CHARS) {
        None => Some(trimmed.to_string()),
        Some((cut, _)) => {
            let mut truncated = String::with_capacity(cut + 3);
            truncated.push_str(&trimmed[..cut]);
            truncated.push_str("...");
            Some(truncated)
        }
    }
}
2065
2066fn load_env_files(paths: &[PathBuf]) -> Result<(EnvOverlay, Vec<LoadedEnvKey>), String> {
2067    let mut previous = Vec::new();
2068    let mut loaded = Vec::new();
2069    let mut touched = BTreeSet::new();
2070    for path in paths {
2071        let path = expand_home(path);
2072        let raw = fs::read_to_string(&path)
2073            .map_err(|error| format!("failed to read env file {}: {error}", path.display()))?;
2074        for (line_no, line) in raw.lines().enumerate() {
2075            let Some((key, value)) = parse_env_line(line).map_err(|error| {
2076                format!("{}:{}: {error}", path.display(), line_no.saturating_add(1))
2077            })?
2078            else {
2079                continue;
2080            };
2081            if touched.insert(key.clone()) {
2082                previous.push((OsString::from(&key), std::env::var_os(&key)));
2083            }
2084            std::env::set_var(&key, value);
2085            loaded.push(LoadedEnvKey {
2086                key,
2087                source: path.display().to_string(),
2088            });
2089        }
2090    }
2091    Ok((EnvOverlay { previous }, loaded))
2092}
2093
2094fn parse_env_line(line: &str) -> Result<Option<(String, String)>, String> {
2095    let trimmed = line.trim();
2096    if trimmed.is_empty() || trimmed.starts_with('#') {
2097        return Ok(None);
2098    }
2099    let trimmed = trimmed.strip_prefix("export ").unwrap_or(trimmed).trim();
2100    let Some((key, value)) = trimmed.split_once('=') else {
2101        return Err("expected KEY=VALUE".to_string());
2102    };
2103    let key = key.trim();
2104    if key.is_empty() {
2105        return Err("empty key".to_string());
2106    }
2107    if !key
2108        .chars()
2109        .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
2110    {
2111        return Err(format!("invalid key `{key}`"));
2112    }
2113    Ok(Some((key.to_string(), unquote_env_value(value.trim()))))
2114}
2115
/// Removes one layer of matching surrounding quotes from `value`.
///
/// The value is unwrapped only when it both starts and ends with the same
/// quote character (`"` or `'`) and is at least two characters long;
/// anything else is returned unchanged. No escape sequences are
/// processed.
fn unquote_env_value(value: &str) -> String {
    let inner = ['"', '\''].iter().find_map(|&quote| {
        value
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
    });
    inner.unwrap_or(value).to_string()
}
2127
/// Expands a leading `~` in `path` using the `HOME` environment variable.
///
/// Exactly `~` becomes `$HOME`, and `~/rest` becomes `$HOME/rest`. Every
/// other path, including `~user/...`, is returned unchanged, as is any
/// path when `HOME` is not set.
fn expand_home(path: &Path) -> PathBuf {
    let raw = path.to_string_lossy();
    // Looked up lazily so `HOME` is only read for paths that need it.
    let home_dir = || std::env::var_os("HOME").map(PathBuf::from);

    let expanded = if raw == "~" {
        home_dir()
    } else {
        raw.strip_prefix("~/")
            .and_then(|rest| home_dir().map(|home| home.join(rest)))
    };
    expanded.unwrap_or_else(|| path.to_path_buf())
}
2142
2143#[cfg(test)]
2144#[path = "eval_coding_agent_tests.rs"]
2145mod tests;