harn_cli/commands/
eval_coding_agent.rs

1//! `harn eval coding-agent` — empirical preset/provider benchmark for a
2//! small coding-agent fixture suite.
3//!
4//! ## .harn dispatch (W7 partial port — see harn#2307)
5//!
6//! The **matrix execution pipeline** (fixture resolution, model
7//! discovery, per-cell `execute_run` invocation, Ollama snapshot/
8//! cleanup, scoring, rollups, native/text comparisons, follow-up
9//! generation, baseline diff) stays in Rust. Every cell drives the
10//! embedded `coding_agent_suite.harn` driver through `execute_run`,
11//! which itself reaches into VM internals (`commands::run`,
12//! `harn_vm::llm`, `commands::local::runtime`) that aren't reachable
13//! from script-land today — the same constraint that shaped W5 / W6.
14//!
15//! The **rendering layer** (the `summary.md` body, the `followups.md`
16//! body, the one-line human stdout summary, the `--json` pretty form)
17//! is delegated to
18//! `crates/harn-stdlib/src/stdlib/cli/eval/coding_agent.harn`. The
19//! Rust shim pre-serialises the assembled `EvalSummary` to JSON,
//! forwards it via [`CODING_AGENT_SUMMARY_ENV`], dispatches once per
//! rendering (markdown for `summary.md`, followups for `followups.md`,
//! then either the summary line or the `--json` pretty form for
//! stdout), and writes the captured payloads to disk / real stdout.
24//!
25//! The on-disk JSON artifacts (`summary.json`, `per_run.jsonl`,
26//! `local_readiness.json`) stay on the serde-driven Rust path because
27//! Harn's `json_stringify_pretty` sorts dict keys alphabetically and
28//! the on-disk format is consumed by the experiment driver in
29//! `experiments/step-judge/run.sh`, the local-readiness regression
30//! check, and hosted ingestion — all of which depend on the serde
31//! struct-field byte order.
32//!
33//! `HARN_CLI_IMPL=rust` keeps the legacy direct-render path for the
34//! parity-snapshot harness (#2299) until the C1 ratchet (#2314) lands.
35
36use std::collections::{BTreeMap, BTreeSet, HashSet};
37use std::ffi::OsString;
38use std::fs;
39use std::io::Write as _;
40use std::path::{Path, PathBuf};
41
42use harn_vm::clock::{Clock, RealClock};
43use serde::Serialize;
44use serde_json::Value as JsonValue;
45
46use crate::cli::EvalCodingAgentArgs;
47use crate::commands::eval_model_selector::{
48    resolve_selector, selector_is_local, selector_label, ModelSelector,
49};
50use crate::commands::local::runtime::{
51    local_provider_ids, ollama_unload_model, snapshot_provider, LocalProviderSnapshot,
52};
53use crate::commands::local_readiness;
54use crate::commands::run::{execute_run, CliLlmMockMode, RunProfileOptions};
55use crate::dispatch;
56use crate::env_guard::ScopedEnvVar;
57
/// Env var the embedded `cli/eval/coding_agent` script reads to pick
/// up the pre-serialised [`EvalSummary`]. The Rust shim does all the
/// matrix execution and scoring and hands the script the assembled
/// summary so it only has to format it.
const CODING_AGENT_SUMMARY_ENV: &str = "HARN_EVAL_CODING_AGENT_SUMMARY_JSON";

/// Env var the script reads to pick the rendering mode — one of
/// `"markdown"` (summary.md body), `"followups"` (followups.md body),
/// `"summary"` (one-line stdout summary), or `"json"` (--json pretty
/// form). Defaulted to `"summary"` if unset so the script stays robust
/// against future Rust-side bugs.
const CODING_AGENT_MODE_ENV: &str = "HARN_EVAL_CODING_AGENT_MODE";

/// Serialises the dispatch-render path so concurrent in-process
/// callers (the existing `eval_coding_agent_cli` integration test plus
/// any future fanout caller) don't race on the global env vars the
/// Rust shim sets to hand the report off to the .harn script. The CLI
/// binary itself is single-call, so this mutex is uncontended in
/// production; in tests it serialises the dispatch window only —
/// matrix execution still parallelises freely.
///
/// Mirrors the pattern W5's `eval_prompt.rs` and W6's `eval_context.rs`
/// / `eval_tool_calls.rs` use (see harn#2305 / #2306) so the cross-
/// script env-var hand-off stays consistent across the eval cluster.
static DISPATCH_RENDER_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());

/// Source text of the benchmark driver script, embedded at compile
/// time. `run_matrix_entry` writes a copy into each run directory as
/// `coding_agent_suite.harn` and executes that copy through
/// `execute_run`.
const CODING_AGENT_SUITE_HARN: &str = include_str!("../../assets/evals/coding_agent_suite.harn");
85
/// Static metadata for one fixture of the coding-agent suite.
///
/// All fields are `'static` string slices, which is what lets the type
/// be `Copy` and be passed around by value.
#[derive(Debug, Clone, Copy)]
struct FixtureDefinition {
    /// Stable identifier; forwarded to the driver script as `--fixture`
    /// and copied into each report as `fixture_id`.
    id: &'static str,
    /// Human-readable title of the fixture.
    name: &'static str,
    /// Tool-usage category; the built-in table uses `"multi-tool"`,
    /// `"one-tool"` and `"no-tool"`.
    tool_sequence: &'static str,
    /// One-sentence description of the task the fixture poses.
    description: &'static str,
}
93
/// The built-in fixture catalogue for the coding-agent benchmark.
///
/// NOTE(review): presumably `resolve_fixtures` selects from this table
/// by `id` — confirm against its definition.
static FIXTURE_DEFINITIONS: &[FixtureDefinition] = &[
    FixtureDefinition {
        id: "python-add",
        name: "Python add repair",
        tool_sequence: "multi-tool",
        description: "One-file Python bug fix verified by unittest output.",
    },
    FixtureDefinition {
        id: "cli-help-flag",
        name: "CLI help flag",
        tool_sequence: "multi-tool",
        description: "Add a tiny CLI flag, update help-facing docs, and verify behavior.",
    },
    FixtureDefinition {
        id: "test-output-first",
        name: "Test-output-first repair",
        tool_sequence: "multi-tool",
        description: "Run a failing test first, then edit the implementation and re-run it.",
    },
    FixtureDefinition {
        id: "docs-symbol-rename",
        name: "Docs symbol rename",
        tool_sequence: "multi-tool",
        description:
            "Update docs and an example after a symbol rename without touching implementation.",
    },
    FixtureDefinition {
        id: "read-only-audit",
        name: "Read-only audit",
        tool_sequence: "one-tool",
        description: "Inspect a file and report that no edits are needed.",
    },
    FixtureDefinition {
        id: "no-tool-diagnosis",
        name: "No-tool diagnosis",
        tool_sequence: "no-tool",
        description: "Answer from prompt-only context without any tools.",
    },
];
133
/// One entry of the summary's `env_keys_loaded` list, as returned by
/// `load_env_files` at the start of `run`.
#[derive(Debug, Clone, Serialize)]
struct LoadedEnvKey {
    /// Name of the environment variable.
    key: String,
    /// Where the key came from.
    /// NOTE(review): presumably the env file path — confirm against
    /// `load_env_files`.
    source: String,
}
139
/// Guard that remembers the environment values an overlay replaced and
/// puts them back when it goes out of scope.
#[derive(Debug)]
struct EnvOverlay {
    /// `(key, prior value)` pairs; a `None` prior value records that the
    /// key was not set and must be removed again on restore.
    previous: Vec<(OsString, Option<OsString>)>,
}

impl Drop for EnvOverlay {
    /// Restores the recorded values last-to-first, so a key that was
    /// recorded more than once ends up at its earliest recorded value.
    fn drop(&mut self) {
        for (name, saved) in self.previous.iter().rev() {
            match saved {
                Some(original) => std::env::set_var(name, original),
                None => std::env::remove_var(name),
            }
        }
    }
}
156
/// Outcome of one matrix cell (fixture × model selector × tool format).
///
/// Field declaration order is the serde output order. The module docs
/// state that on-disk consumers depend on that order, so do not
/// reorder fields.
#[derive(Debug, Clone, Serialize)]
struct RunReport {
    /// Cell identifier; also the name of the run's directory under the
    /// benchmark output directory.
    run_id: String,
    /// `id` of the fixture that was run.
    fixture_id: String,
    /// `name` of the fixture that was run.
    fixture_name: String,
    /// `tool_sequence` category declared by the fixture.
    fixture_tool_sequence: String,
    /// Provider/model the cell ran against.
    selector: ModelSelector,
    /// Tool-call format passed to the driver as `--tool-format`.
    tool_format: String,
    /// `"passed"`, `"failed"`, `"infra_error"`, or the `status` string
    /// taken from the driver summary when the run exited non-zero.
    /// Skipped runs get their status from `skipped_report`.
    status: String,
    /// True only when the driver summary reported `passed` and the run
    /// exited with code 0.
    passed: bool,
    /// True when the cell was not executed (for example when the
    /// provider has no configured credentials).
    skipped: bool,
    /// Why the cell was skipped; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    skipped_reason: Option<String>,
    /// Display form of the per-run directory.
    output_dir: String,
    /// `<run dir>/transcript_events.jsonl`; the path is recorded even
    /// when the run never started.
    transcript_events_path: String,
    /// `workspace_root` from the driver summary, when present.
    workspace_root: Option<String>,
    /// Time measured around `execute_run` with the monotonic clock;
    /// 0 for reports built before the run started.
    elapsed_ms: u64,
    /// `duration_ms` from the driver summary, falling back to
    /// `elapsed_ms` when the summary has none.
    duration_ms: u64,
    /// `/llm/iterations` from the driver summary (0 if absent).
    iterations: i64,
    /// `/llm/input_tokens` from the driver summary (0 if absent).
    input_tokens: i64,
    /// `/llm/output_tokens` from the driver summary (0 if absent).
    output_tokens: i64,
    /// Cost derived from the token counts and per-1k pricing; 0.0 when
    /// no pricing is known for the provider/model.
    cost_usd: f64,
    /// Whether pricing was found for the provider/model.
    pricing_known: bool,
    /// Number of entries in the summary's `/tools/calls` array.
    tool_calls: usize,
    /// Number of entries in the summary's `/tools/rejected` array.
    rejected_tool_calls: usize,
    /// Tool names derived from `/tools/calls`, falling back to a
    /// non-empty `/tools/successful` list.
    tool_sequence: Vec<String>,
    /// Contents of the summary's `/tools/successful` array.
    successful_tools: Vec<String>,
    /// `transcript_event_count` from the driver summary (0 if absent).
    transcript_event_count: usize,
    /// `/verification/success` from the driver summary (false if absent).
    verification_success: bool,
    /// Exit code of the driver run; 1 for reports produced by
    /// `error_report`.
    harn_exit_code: i32,
    /// Failure description; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    error: Option<String>,
    /// Excerpt of the run's stderr; omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    stderr_excerpt: Option<String>,
    /// What happened to the local model after the run, when a local
    /// provider snapshot was available.
    local_cleanup: Option<LocalCleanupReport>,
}
193
/// Describes what `LocalRunGuard::cleanup` did with a local model once
/// a run finished.
#[derive(Debug, Clone, Serialize)]
struct LocalCleanupReport {
    /// Provider id from the run's selector.
    provider: String,
    /// Model name from the run's selector.
    model: String,
    /// Whether the model was already listed as loaded in the pre-run
    /// snapshot (always false for non-Ollama providers).
    initially_loaded: bool,
    /// One of `"not_applicable"`, `"left_running"`,
    /// `"left_preexisting"`, `"unloaded"` or `"unload_failed"`.
    action: String,
    /// Extra context for `action` (for example the unload error);
    /// omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    detail: Option<String>,
}
203
/// Comparison of the native-format and text-format runs for one
/// fixture/selector pair (the module docs call these "native/text
/// comparisons").
///
/// NOTE(review): this struct is populated outside the section reviewed
/// here; the per-field meanings are taken from the field names only —
/// confirm against the code that builds it. The `*_text_minus_native`
/// fields name the subtraction direction in the identifier itself.
#[derive(Debug, Clone, Serialize)]
struct FormatComparison {
    fixture_id: String,
    selector: ModelSelector,
    native_run_id: Option<String>,
    text_run_id: Option<String>,
    native_evidence_path: Option<String>,
    text_evidence_path: Option<String>,
    native_status: Option<String>,
    text_status: Option<String>,
    native_passed: Option<bool>,
    text_passed: Option<bool>,
    verifier_match: Option<bool>,
    tool_sequence_match: Option<bool>,
    rejected_tool_call_delta_text_minus_native: Option<i64>,
    token_delta_text_minus_native: Option<i64>,
    iteration_delta_text_minus_native: Option<i64>,
    equivalent: Option<bool>,
    divergence_reasons: Vec<String>,
    evidence_paths: Vec<String>,
}
225
/// One suggested follow-up item carried in `EvalSummary::followups`.
///
/// NOTE(review): presumably the source for the `followups.md` body
/// mentioned in the module docs; built outside this section — confirm.
#[derive(Debug, Clone, Serialize)]
struct FollowupSuggestion {
    /// Short headline for the follow-up.
    title: String,
    /// Longer free-form description.
    body: String,
    /// Labels attached to the suggestion.
    labels: Vec<String>,
    /// Ids of the runs the suggestion refers to.
    run_ids: Vec<String>,
}
233
/// Owned, serialisable fixture metadata listed in
/// `EvalSummary::fixtures`. The fields carry the same names as
/// `FixtureDefinition`.
#[derive(Debug, Clone, Serialize)]
struct FixtureReport {
    /// Fixture identifier.
    id: String,
    /// Human-readable fixture title.
    name: String,
    /// Tool-usage category of the fixture.
    tool_sequence: String,
    /// One-sentence description of the fixture.
    description: String,
}
241
/// Aggregate counters for one group of runs inside [`EvalRollups`].
///
/// NOTE(review): computed outside this section; the descriptions below
/// follow the field names — confirm against the rollup builder.
#[derive(Debug, Clone, Serialize)]
struct RollupReport {
    /// Value of the grouping dimension this row aggregates.
    key: String,
    /// Number of runs in the group.
    total_runs: usize,
    /// Runs in the group that passed.
    passed_runs: usize,
    /// Runs in the group that failed.
    failed_runs: usize,
    /// Runs in the group that were skipped.
    skipped_runs: usize,
    /// Summed cost of the group's runs in USD.
    total_cost_usd: f64,
}
251
/// Rollup tables of the run results, one list per grouping dimension.
/// Each list holds one [`RollupReport`] per distinct key of that
/// dimension (as suggested by the field names; built outside this
/// section).
#[derive(Debug, Clone, Serialize)]
struct EvalRollups {
    /// Grouped by fixture.
    by_fixture: Vec<RollupReport>,
    /// Grouped by provider.
    by_provider: Vec<RollupReport>,
    /// Grouped by model.
    by_model: Vec<RollupReport>,
    /// Grouped by tool format.
    by_tool_format: Vec<RollupReport>,
    /// Grouped by tool-sequence category.
    by_tool_sequence: Vec<RollupReport>,
}
260
/// Top-level result of one `harn eval coding-agent` invocation. `run`
/// passes it to the JSON artifact writer and to the renderers.
///
/// Field declaration order is the serde output order. The module docs
/// state that on-disk consumers depend on that order, so do not
/// reorder fields.
#[derive(Debug, Clone, Serialize)]
struct EvalSummary {
    /// Version number of the summary layout.
    schema_version: u32,
    /// Ids of the fixtures selected for this invocation.
    fixture_ids: Vec<String>,
    /// Metadata for the selected fixtures.
    fixtures: Vec<FixtureReport>,
    /// Directory the benchmark outputs were written to.
    output_dir: String,
    /// Model selectors that took part in the matrix.
    models: Vec<ModelSelector>,
    /// Tool formats that took part in the matrix.
    tool_formats: Vec<String>,
    /// Environment keys loaded from the `--env-file` inputs.
    env_keys_loaded: Vec<LoadedEnvKey>,
    /// Aggregate run counters across the whole matrix.
    total_runs: usize,
    passed_runs: usize,
    failed_runs: usize,
    skipped_runs: usize,
    /// Number of native/text comparisons that diverged.
    diverged_comparisons: usize,
    /// Summed cost of all runs in USD.
    total_cost_usd: f64,
    /// Per-dimension rollups of the run results.
    rollups: EvalRollups,
    /// One report per executed or skipped matrix cell.
    runs: Vec<RunReport>,
    /// Native-versus-text comparisons.
    comparisons: Vec<FormatComparison>,
    /// Suggested follow-up items.
    followups: Vec<FollowupSuggestion>,
    /// Step-judge preset applied to all runs in this invocation, if any.
    /// Used by the experiment driver (experiments/step-judge/run.sh) to
    /// group repeat invocations into cells.
    #[serde(skip_serializing_if = "Option::is_none")]
    step_judge_preset: Option<String>,
    /// Free-form label for grouping repeat invocations (e.g.
    /// "replicate-1", "probe-rubric-adversarial"). Empty when unset.
    #[serde(skip_serializing_if = "String::is_empty")]
    run_label: String,
    /// Optional per-fixture diff against a prior run's `summary.json`,
    /// listing regressions (baseline passed, this cell failed) and
    /// recoveries (baseline failed, this cell passed) plus aggregate
    /// counts and a net lift in percentage points. Populated when the
    /// caller passes `--baseline-comparison-against <path>` (harn#2318).
    #[serde(skip_serializing_if = "Option::is_none")]
    baseline_comparison: Option<BaselineComparison>,
}
297
/// Per-fixture diff between this invocation and a baseline
/// `summary.json`, stored in `EvalSummary::baseline_comparison`.
#[derive(Debug, Clone, Serialize, Default)]
struct BaselineComparison {
    /// `output_dir` or `run_label` of the baseline summary, for context.
    baseline_label: String,
    /// Resolved path to the baseline `summary.json` that was diffed against.
    baseline_path: String,
    /// Fixtures the baseline passed and this cell failed.
    regressions: Vec<FixtureStatusDelta>,
    /// Fixtures the baseline failed and this cell passed.
    recoveries: Vec<FixtureStatusDelta>,
    /// Fixtures that passed in both runs.
    unchanged_passes: Vec<String>,
    /// Fixtures that failed in both runs.
    unchanged_failures: Vec<String>,
    /// Fixtures present in only one of the two runs (skipped from the
    /// diff but listed for visibility).
    missing_in_baseline: Vec<String>,
    missing_in_cell: Vec<String>,
    /// Length of `regressions`, stored for convenience.
    /// NOTE(review): assumed to mirror the vector length — confirm.
    regressions_count: usize,
    /// Length of `recoveries`, stored for convenience.
    /// NOTE(review): assumed to mirror the vector length — confirm.
    recoveries_count: usize,
    /// `(recoveries_count - regressions_count) / total_fixtures_compared * 100`,
    /// rounded to one decimal place. Negative when the cell regresses more
    /// than it recovers.
    net_lift_pp: f64,
}
321
/// One fixture whose outcome differs between the baseline and this
/// invocation; the element type of `BaselineComparison::regressions`
/// and `BaselineComparison::recoveries`.
#[derive(Debug, Clone, Serialize)]
struct FixtureStatusDelta {
    /// Fixture the delta refers to.
    fixture_id: String,
    /// Status recorded for the fixture in the baseline summary.
    baseline_status: String,
    /// Status recorded for the fixture in this invocation.
    cell_status: String,
}
328
/// Bookkeeping for a run against a local provider: created by
/// `LocalRunGuard::before` ahead of the run and consumed by
/// `LocalRunGuard::cleanup` afterwards.
struct LocalRunGuard {
    /// Provider/model the run targets.
    selector: ModelSelector,
    /// When false, `cleanup` leaves the model running and reports
    /// `"left_running"`.
    stop_after: bool,
    /// Provider state captured before the run; `None` when taking the
    /// snapshot failed, in which case `cleanup` reports nothing.
    snapshot: Option<LocalProviderSnapshot>,
}
334
/// Per-run values gathered by `run_matrix_entry` that
/// `report_from_summary` combines with the driver's summary JSON to
/// build a [`RunReport`].
struct RunSummaryContext {
    /// Identifier of the matrix cell.
    run_id: String,
    /// Fixture that was run.
    fixture: FixtureDefinition,
    /// Provider/model the run targeted.
    selector: ModelSelector,
    /// Tool-call format used for the run.
    tool_format: String,
    /// Directory holding the run's outputs.
    run_dir: PathBuf,
    /// Time measured around `execute_run`, in milliseconds.
    elapsed_ms: u64,
    /// Exit code reported by `execute_run`.
    exit_code: i32,
    /// Captured stderr of the run.
    stderr: String,
    /// Result of the local-provider cleanup, if any.
    local_cleanup: Option<LocalCleanupReport>,
}
346
347pub async fn run(args: EvalCodingAgentArgs) -> i32 {
348    let output_dir = args.output.clone().unwrap_or_else(default_output_dir);
349    if let Err(error) = fs::create_dir_all(&output_dir) {
350        eprintln!("error: failed to create {}: {error}", output_dir.display());
351        return 1;
352    }
353
354    let (_env_guard, env_keys_loaded) = match load_env_files(&args.env_files) {
355        Ok(loaded) => loaded,
356        Err(error) => {
357            eprintln!("error: {error}");
358            return 1;
359        }
360    };
361
362    let fixtures = match resolve_fixtures(&args.fixtures) {
363        Ok(fixtures) => fixtures,
364        Err(error) => {
365            eprintln!("error: {error}");
366            return 2;
367        }
368    };
369    let models = match resolve_models(&args).await {
370        Ok(models) => models,
371        Err(error) => {
372            eprintln!("error: {error}");
373            return 1;
374        }
375    };
376    let tool_formats = match normalize_tool_formats(&args.tool_formats) {
377        Ok(formats) => formats,
378        Err(error) => {
379            eprintln!("error: {error}");
380            return 2;
381        }
382    };
383    let matrix = build_matrix(&fixtures, &models, &tool_formats, args.max_runs);
384    if matrix.is_empty() {
385        eprintln!("error: no coding-agent benchmark runs selected");
386        return 2;
387    }
388
389    let mut reports = Vec::new();
390    let mut had_error = false;
391    for (fixture, selector, tool_format) in matrix {
392        let report = run_matrix_entry(&args, &output_dir, fixture, selector, tool_format).await;
393        if !report.passed && !report.skipped {
394            had_error = true;
395        }
396        if report.skipped && args.fail_on_unauthorized {
397            had_error = true;
398        }
399        eprintln!(
400            "{} {} {}: {}",
401            report.fixture_id,
402            selector_label(&report.selector),
403            report.tool_format,
404            report.status
405        );
406        reports.push(report);
407    }
408
409    let baseline_comparison = match &args.baseline_comparison_against {
410        Some(path) => match load_baseline_comparison(path, &reports) {
411            Ok(comparison) => Some(comparison),
412            Err(error) => {
413                eprintln!("error: --baseline-comparison-against: {error}");
414                return 1;
415            }
416        },
417        None => None,
418    };
419    let summary = build_summary(
420        &output_dir,
421        fixtures,
422        models,
423        tool_formats,
424        env_keys_loaded,
425        reports,
426        args.step_judge
427            .clone()
428            .filter(|s| !s.is_empty() && s != "none"),
429        args.run_label.clone(),
430        baseline_comparison,
431    );
432    // The JSON artifacts (summary.json, per_run.jsonl,
433    // local_readiness.json) always stay on the serde-driven Rust path —
434    // see module docstring for the byte-format rationale. They write
435    // before any rendering so a render failure doesn't leave a partially
436    // written report directory.
437    if let Err(error) = write_json_artifacts(&output_dir, &summary) {
438        eprintln!("error: failed to write benchmark outputs: {error}");
439        return 1;
440    }
441
442    // `HARN_CLI_IMPL=rust` keeps the legacy direct-render path so the
443    // parity-snapshot harness (#2299) can compare both impls until C1
444    // (#2314) deletes this escape hatch.
445    let use_legacy = std::env::var("HARN_CLI_IMPL").as_deref() == Ok("rust");
446
447    if use_legacy {
448        if let Err(error) = write_markdown_artifacts_legacy(&output_dir, &summary) {
449            eprintln!("error: {error}");
450            return 1;
451        }
452        announce_output_paths(&output_dir);
453        if args.json {
454            print_json_legacy(&summary);
455        } else {
456            print_summary_legacy(&summary);
457        }
458        return if had_error { 1 } else { 0 };
459    }
460
461    if let Err(code) = write_markdown_artifacts_dispatch(&output_dir, &summary).await {
462        return code;
463    }
464    announce_output_paths(&output_dir);
465    if args.json {
466        if let Err(code) = print_json_dispatch(&summary).await {
467            return code;
468        }
469    } else if let Err(code) = print_summary_dispatch(&summary).await {
470        return code;
471    }
472
473    if had_error {
474        1
475    } else {
476        0
477    }
478}
479
480async fn run_matrix_entry(
481    args: &EvalCodingAgentArgs,
482    output_dir: &Path,
483    fixture: FixtureDefinition,
484    selector: ModelSelector,
485    tool_format: String,
486) -> RunReport {
487    let run_id = run_id_for(fixture, &selector, &tool_format);
488    let run_dir = output_dir.join(&run_id);
489    if let Err(error) = reset_dir(&run_dir) {
490        return error_report(
491            run_id,
492            fixture,
493            selector,
494            tool_format,
495            run_dir,
496            format!("failed to prepare run directory: {error}"),
497        );
498    }
499
500    if !provider_available(&selector) {
501        let reason = format!(
502            "provider `{}` has no configured credentials",
503            selector.provider
504        );
505        return skipped_report(run_id, fixture, selector, tool_format, run_dir, reason);
506    }
507
508    let script_path = run_dir.join("coding_agent_suite.harn");
509    if let Err(error) = fs::write(&script_path, CODING_AGENT_SUITE_HARN) {
510        return error_report(
511            run_id,
512            fixture,
513            selector,
514            tool_format,
515            run_dir,
516            format!("failed to write benchmark harness: {error}"),
517        );
518    }
519
520    let local_guard = LocalRunGuard::before(&selector, !args.keep_local_after_run).await;
521    let argv = script_argv(args, fixture, &selector, &tool_format, &run_dir);
522    let clock = RealClock::new();
523    let started_ms = clock.monotonic_ms();
524    let outcome = execute_run(
525        &script_path.to_string_lossy(),
526        false,
527        HashSet::new(),
528        argv,
529        Vec::new(),
530        CliLlmMockMode::Off,
531        None,
532        RunProfileOptions::default(),
533    )
534    .await;
535    let elapsed_ms = clock
536        .monotonic_ms()
537        .saturating_sub(started_ms)
538        .try_into()
539        .unwrap_or(0);
540    let local_cleanup = if let Some(guard) = local_guard {
541        guard.cleanup().await
542    } else {
543        None
544    };
545
546    let summary_value =
547        read_run_summary(&run_dir).or_else(|| parse_last_json_line(&outcome.stdout));
548    let Some(summary) = summary_value else {
549        return RunReport {
550            run_id,
551            fixture_id: fixture.id.to_string(),
552            fixture_name: fixture.name.to_string(),
553            fixture_tool_sequence: fixture.tool_sequence.to_string(),
554            selector,
555            tool_format,
556            status: "infra_error".to_string(),
557            passed: false,
558            skipped: false,
559            skipped_reason: None,
560            output_dir: run_dir.display().to_string(),
561            transcript_events_path: run_dir
562                .join("transcript_events.jsonl")
563                .display()
564                .to_string(),
565            workspace_root: None,
566            elapsed_ms,
567            duration_ms: 0,
568            iterations: 0,
569            input_tokens: 0,
570            output_tokens: 0,
571            cost_usd: 0.0,
572            pricing_known: false,
573            tool_calls: 0,
574            rejected_tool_calls: 0,
575            tool_sequence: Vec::new(),
576            successful_tools: Vec::new(),
577            transcript_event_count: 0,
578            verification_success: false,
579            harn_exit_code: outcome.exit_code,
580            error: Some("benchmark harness produced no summary JSON".to_string()),
581            stderr_excerpt: excerpt(&outcome.stderr),
582            local_cleanup,
583        };
584    };
585
586    report_from_summary(
587        RunSummaryContext {
588            run_id,
589            fixture,
590            selector,
591            tool_format,
592            run_dir,
593            elapsed_ms,
594            exit_code: outcome.exit_code,
595            stderr: outcome.stderr,
596            local_cleanup,
597        },
598        summary,
599    )
600}
601
602fn report_from_summary(ctx: RunSummaryContext, summary: JsonValue) -> RunReport {
603    let passed = summary
604        .get("passed")
605        .and_then(JsonValue::as_bool)
606        .unwrap_or(false)
607        && ctx.exit_code == 0;
608    let input_tokens = summary
609        .pointer("/llm/input_tokens")
610        .and_then(JsonValue::as_i64)
611        .unwrap_or(0);
612    let output_tokens = summary
613        .pointer("/llm/output_tokens")
614        .and_then(JsonValue::as_i64)
615        .unwrap_or(0);
616    let pricing = harn_vm::llm::llm_pricing_per_1k(&ctx.selector.provider, &ctx.selector.model);
617    let cost_usd = pricing
618        .map(|(input, output)| {
619            (input_tokens.max(0) as f64 * input + output_tokens.max(0) as f64 * output) / 1000.0
620        })
621        .unwrap_or(0.0);
622    let status = if passed {
623        "passed".to_string()
624    } else if ctx.exit_code == 0 {
625        "failed".to_string()
626    } else {
627        summary
628            .get("status")
629            .and_then(JsonValue::as_str)
630            .unwrap_or("failed")
631            .to_string()
632    };
633    RunReport {
634        run_id: ctx.run_id,
635        fixture_id: ctx.fixture.id.to_string(),
636        fixture_name: ctx.fixture.name.to_string(),
637        fixture_tool_sequence: ctx.fixture.tool_sequence.to_string(),
638        selector: ctx.selector,
639        tool_format: ctx.tool_format,
640        status,
641        passed,
642        skipped: false,
643        skipped_reason: None,
644        output_dir: ctx.run_dir.display().to_string(),
645        transcript_events_path: ctx
646            .run_dir
647            .join("transcript_events.jsonl")
648            .display()
649            .to_string(),
650        workspace_root: summary
651            .get("workspace_root")
652            .and_then(JsonValue::as_str)
653            .map(str::to_string),
654        elapsed_ms: ctx.elapsed_ms,
655        duration_ms: summary
656            .get("duration_ms")
657            .and_then(JsonValue::as_u64)
658            .unwrap_or(ctx.elapsed_ms),
659        iterations: summary
660            .pointer("/llm/iterations")
661            .and_then(JsonValue::as_i64)
662            .unwrap_or(0),
663        input_tokens,
664        output_tokens,
665        cost_usd,
666        pricing_known: pricing.is_some(),
667        tool_calls: summary
668            .pointer("/tools/calls")
669            .and_then(JsonValue::as_array)
670            .map(Vec::len)
671            .unwrap_or(0),
672        rejected_tool_calls: summary
673            .pointer("/tools/rejected")
674            .and_then(JsonValue::as_array)
675            .map(Vec::len)
676            .unwrap_or(0),
677        tool_sequence: tool_call_sequence(summary.pointer("/tools/calls"))
678            .or_else(|| non_empty_string_array(summary.pointer("/tools/successful")))
679            .unwrap_or_default(),
680        successful_tools: string_array(summary.pointer("/tools/successful")),
681        transcript_event_count: summary
682            .get("transcript_event_count")
683            .and_then(JsonValue::as_u64)
684            .unwrap_or(0) as usize,
685        verification_success: summary
686            .pointer("/verification/success")
687            .and_then(JsonValue::as_bool)
688            .unwrap_or(false),
689        harn_exit_code: ctx.exit_code,
690        error: (!passed).then(|| {
691            summary
692                .get("status")
693                .and_then(JsonValue::as_str)
694                .unwrap_or("benchmark failed")
695                .to_string()
696        }),
697        stderr_excerpt: excerpt(&ctx.stderr),
698        local_cleanup: ctx.local_cleanup,
699    }
700}
701
702impl LocalRunGuard {
703    async fn before(selector: &ModelSelector, stop_after: bool) -> Option<Self> {
704        if !selector_is_local(selector) {
705            return None;
706        }
707        let snapshot = snapshot_provider(&selector.provider, Path::new("."))
708            .await
709            .ok();
710        Some(Self {
711            selector: selector.clone(),
712            stop_after,
713            snapshot,
714        })
715    }
716
717    async fn cleanup(self) -> Option<LocalCleanupReport> {
718        let snapshot = self.snapshot?;
719        if self.selector.provider != "ollama" {
720            return Some(LocalCleanupReport {
721                provider: self.selector.provider,
722                model: self.selector.model,
723                initially_loaded: false,
724                action: "not_applicable".to_string(),
725                detail: Some(
726                    "non-Ollama local providers are only stopped when Harn launched a managed server"
727                        .to_string(),
728                ),
729            });
730        }
731        let initially_loaded = snapshot
732            .loaded_models
733            .iter()
734            .any(|loaded| loaded.name == self.selector.model);
735        if !self.stop_after {
736            return Some(LocalCleanupReport {
737                provider: self.selector.provider,
738                model: self.selector.model,
739                initially_loaded,
740                action: "left_running".to_string(),
741                detail: Some("--keep-local-after-run".to_string()),
742            });
743        }
744        if initially_loaded {
745            return Some(LocalCleanupReport {
746                provider: self.selector.provider,
747                model: self.selector.model,
748                initially_loaded,
749                action: "left_preexisting".to_string(),
750                detail: None,
751            });
752        }
753        match ollama_unload_model(&snapshot.base_url, &self.selector.model).await {
754            Ok(()) => Some(LocalCleanupReport {
755                provider: self.selector.provider,
756                model: self.selector.model,
757                initially_loaded,
758                action: "unloaded".to_string(),
759                detail: None,
760            }),
761            Err(error) => Some(LocalCleanupReport {
762                provider: self.selector.provider,
763                model: self.selector.model,
764                initially_loaded,
765                action: "unload_failed".to_string(),
766                detail: Some(error),
767            }),
768        }
769    }
770}
771
772fn script_argv(
773    args: &EvalCodingAgentArgs,
774    fixture: FixtureDefinition,
775    selector: &ModelSelector,
776    tool_format: &str,
777    run_dir: &Path,
778) -> Vec<String> {
779    let mut argv = vec![
780        "--fixture".to_string(),
781        fixture.id.to_string(),
782        "--output-dir".to_string(),
783        run_dir.display().to_string(),
784        "--provider".to_string(),
785        selector.provider.clone(),
786        "--model".to_string(),
787        selector.model.clone(),
788        "--tool-format".to_string(),
789        tool_format.to_string(),
790        "--max-iterations".to_string(),
791        args.max_iterations.to_string(),
792        "--python".to_string(),
793        args.python.clone(),
794    ];
795    if selector.provider == "mock" {
796        argv.push("--seed-mock".to_string());
797    }
798    if let Some(json) = resolve_step_judge_json(args, selector) {
799        argv.push("--step-judge-json".to_string());
800        argv.push(json);
801    }
802    argv
803}
804
805/// Translate the `--step-judge <preset>` CLI flag into a JSON object the
806/// inner `coding_agent_suite.harn` script feeds to `agent_loop({step_judge: ...})`.
807/// Returns `None` for `None` / `"none"` / empty.
808///
809/// Preset semantics (designed for the step-judge experiment in
810/// experiments/step-judge/):
811/// - `symmetric-cheap`: judge = generator model (cheap-judges-cheap)
812/// - `asymmetric`: judge = `anthropic/claude-sonnet-4-6` via OpenRouter
813/// - `symmetric-strong`: judge = generator model (caller expected to
814///   pass --model anthropic/claude-sonnet-4-6 to make this meaningful)
815/// - `custom:<json>`: literal JSON dict passed through verbatim
816fn resolve_step_judge_json(args: &EvalCodingAgentArgs, selector: &ModelSelector) -> Option<String> {
817    let raw = args.step_judge.as_deref()?.trim();
818    if raw.is_empty() || raw.eq_ignore_ascii_case("none") {
819        return None;
820    }
821    let mut obj = serde_json::Map::new();
822    if let Some(rest) = raw.strip_prefix("custom:") {
823        match serde_json::from_str::<JsonValue>(rest) {
824            Ok(JsonValue::Object(map)) => obj.extend(map),
825            _ => {
826                // Fall through to error-style emission so the eval reports
827                // a config error rather than silently disabling the judge.
828                obj.insert(
829                    "model".to_string(),
830                    JsonValue::String("__invalid_custom_step_judge__".to_string()),
831                );
832            }
833        }
834    } else {
835        match raw {
836            "symmetric-cheap" | "symmetric-strong" => {
837                obj.insert(
838                    "model".to_string(),
839                    JsonValue::String(selector.model.clone()),
840                );
841                obj.insert(
842                    "provider".to_string(),
843                    JsonValue::String(selector.provider.clone()),
844                );
845            }
846            "asymmetric" => {
847                obj.insert(
848                    "model".to_string(),
849                    JsonValue::String("anthropic/claude-sonnet-4-6".to_string()),
850                );
851                obj.insert(
852                    "provider".to_string(),
853                    JsonValue::String("openrouter".to_string()),
854                );
855            }
856            _other => {
857                obj.insert(
858                    "model".to_string(),
859                    JsonValue::String("__unknown_step_judge_preset__".to_string()),
860                );
861            }
862        }
863    }
864    if let Some(on_veto) = args.step_judge_on_veto.as_deref() {
865        obj.insert(
866            "on_veto".to_string(),
867            JsonValue::String(on_veto.to_string()),
868        );
869    }
870    if args.step_judge_adversarial {
871        obj.insert(
872            "rubric".to_string(),
873            JsonValue::String("adversarial".to_string()),
874        );
875    }
876    Some(JsonValue::Object(obj).to_string())
877}
878
879fn error_report(
880    run_id: String,
881    fixture: FixtureDefinition,
882    selector: ModelSelector,
883    tool_format: String,
884    run_dir: PathBuf,
885    error: String,
886) -> RunReport {
887    RunReport {
888        run_id,
889        fixture_id: fixture.id.to_string(),
890        fixture_name: fixture.name.to_string(),
891        fixture_tool_sequence: fixture.tool_sequence.to_string(),
892        selector,
893        tool_format,
894        status: "infra_error".to_string(),
895        passed: false,
896        skipped: false,
897        skipped_reason: None,
898        output_dir: run_dir.display().to_string(),
899        transcript_events_path: run_dir
900            .join("transcript_events.jsonl")
901            .display()
902            .to_string(),
903        workspace_root: None,
904        elapsed_ms: 0,
905        duration_ms: 0,
906        iterations: 0,
907        input_tokens: 0,
908        output_tokens: 0,
909        cost_usd: 0.0,
910        pricing_known: false,
911        tool_calls: 0,
912        rejected_tool_calls: 0,
913        tool_sequence: Vec::new(),
914        successful_tools: Vec::new(),
915        transcript_event_count: 0,
916        verification_success: false,
917        harn_exit_code: 1,
918        error: Some(error),
919        stderr_excerpt: None,
920        local_cleanup: None,
921    }
922}
923
924fn skipped_report(
925    run_id: String,
926    fixture: FixtureDefinition,
927    selector: ModelSelector,
928    tool_format: String,
929    run_dir: PathBuf,
930    reason: String,
931) -> RunReport {
932    RunReport {
933        run_id,
934        fixture_id: fixture.id.to_string(),
935        fixture_name: fixture.name.to_string(),
936        fixture_tool_sequence: fixture.tool_sequence.to_string(),
937        selector,
938        tool_format,
939        status: "skipped".to_string(),
940        passed: false,
941        skipped: true,
942        skipped_reason: Some(reason),
943        output_dir: run_dir.display().to_string(),
944        transcript_events_path: run_dir
945            .join("transcript_events.jsonl")
946            .display()
947            .to_string(),
948        workspace_root: None,
949        elapsed_ms: 0,
950        duration_ms: 0,
951        iterations: 0,
952        input_tokens: 0,
953        output_tokens: 0,
954        cost_usd: 0.0,
955        pricing_known: false,
956        tool_calls: 0,
957        rejected_tool_calls: 0,
958        tool_sequence: Vec::new(),
959        successful_tools: Vec::new(),
960        transcript_event_count: 0,
961        verification_success: false,
962        harn_exit_code: 0,
963        error: None,
964        stderr_excerpt: None,
965        local_cleanup: None,
966    }
967}
968
969fn provider_available(selector: &ModelSelector) -> bool {
970    if matches!(selector.provider.as_str(), "mock" | "fake") || selector_is_local(selector) {
971        return true;
972    }
973    harn_vm::llm_config::provider_key_available(&selector.provider)
974}
975
976fn resolve_fixtures(raw_fixtures: &[String]) -> Result<Vec<FixtureDefinition>, String> {
977    let mut seen = BTreeSet::new();
978    let mut out = Vec::new();
979    for raw in raw_fixtures {
980        let fixture = raw.trim().to_ascii_lowercase();
981        if fixture.is_empty() {
982            continue;
983        }
984        if fixture == "all" {
985            return Ok(FIXTURE_DEFINITIONS.to_vec());
986        }
987        let Some(definition) = fixture_definition(&fixture) else {
988            return Err(format!(
989                "unsupported --fixture `{fixture}`; expected one of: all, {}",
990                FIXTURE_DEFINITIONS
991                    .iter()
992                    .map(|definition| definition.id)
993                    .collect::<Vec<_>>()
994                    .join(", ")
995            ));
996        };
997        if seen.insert(definition.id) {
998            out.push(definition);
999        }
1000    }
1001    if out.is_empty() {
1002        return Err("at least one coding-agent fixture must be selected".to_string());
1003    }
1004    Ok(out)
1005}
1006
1007fn fixture_definition(id: &str) -> Option<FixtureDefinition> {
1008    FIXTURE_DEFINITIONS
1009        .iter()
1010        .copied()
1011        .find(|definition| definition.id == id)
1012}
1013
1014async fn resolve_models(args: &EvalCodingAgentArgs) -> Result<Vec<ModelSelector>, String> {
1015    let mut seen = BTreeSet::new();
1016    let mut out = Vec::new();
1017    for raw in normalize_model_selector_args(&args.models) {
1018        let trimmed = raw.trim();
1019        if trimmed.is_empty() {
1020            continue;
1021        }
1022        let selector = resolve_selector(trimmed);
1023        if seen.insert(selector_label(&selector)) {
1024            out.push(selector);
1025        }
1026    }
1027    if args.include_local {
1028        for selector in discover_local_models(args).await {
1029            if seen.insert(selector_label(&selector)) {
1030                out.push(selector);
1031            }
1032        }
1033    }
1034    Ok(out)
1035}
1036
/// Re-joins selector arguments that arrived split in two.
///
/// Every entry is trimmed. An entry starting with `provider=` that is
/// immediately followed by one starting with `model=` is merged into a
/// single `provider=…,model=…` string; all other entries (including blank
/// ones) pass through unchanged, in order.
fn normalize_model_selector_args(raw_models: &[String]) -> Vec<String> {
    let mut merged = Vec::new();
    let mut pending = raw_models.iter().map(|raw| raw.trim()).peekable();
    while let Some(current) = pending.next() {
        if current.starts_with("provider=") {
            // Consume the follower only when it is the matching `model=` half.
            if let Some(model) = pending.next_if(|next| next.starts_with("model=")) {
                merged.push(format!("{current},{model}"));
                continue;
            }
        }
        merged.push(current.to_string());
    }
    merged
}
1055
1056async fn discover_local_models(args: &EvalCodingAgentArgs) -> Vec<ModelSelector> {
1057    let providers = if args.local_providers.is_empty() {
1058        local_provider_ids(None)
1059    } else {
1060        args.local_providers.clone()
1061    };
1062    let mut selectors = Vec::new();
1063    let mut seen = BTreeSet::new();
1064    for provider in providers {
1065        if selectors.len() >= args.max_local_models {
1066            break;
1067        }
1068        let Ok(snapshot) = snapshot_provider(&provider, Path::new(".")).await else {
1069            continue;
1070        };
1071        if !snapshot.reachable {
1072            continue;
1073        }
1074        let mut models = snapshot
1075            .loaded_models
1076            .iter()
1077            .map(|model| model.name.clone())
1078            .collect::<Vec<_>>();
1079        models.extend(snapshot.served_models);
1080        for model in models {
1081            if selectors.len() >= args.max_local_models {
1082                break;
1083            }
1084            let selector = ModelSelector {
1085                selector: format!("{provider}:{model}"),
1086                provider: provider.clone(),
1087                model,
1088            };
1089            if seen.insert(selector_label(&selector)) {
1090                selectors.push(selector);
1091            }
1092        }
1093    }
1094    selectors
1095}
1096
/// Validates and de-duplicates `--tool-format` values.
///
/// Each value is trimmed and lower-cased; blank values are ignored and
/// repeats keep their first position. An empty input yields an empty list.
///
/// # Errors
///
/// Returns a message naming the first value that is neither `native` nor
/// `text`.
fn normalize_tool_formats(raw_formats: &[String]) -> Result<Vec<String>, String> {
    let mut accepted = Vec::new();
    let mut already_seen = BTreeSet::new();
    let candidates = raw_formats
        .iter()
        .map(|raw| raw.trim().to_ascii_lowercase())
        .filter(|candidate| !candidate.is_empty());
    for format in candidates {
        if !matches!(format.as_str(), "native" | "text") {
            return Err(format!(
                "unsupported --tool-format `{format}`; expected `native` or `text`"
            ));
        }
        if already_seen.insert(format.clone()) {
            accepted.push(format);
        }
    }
    Ok(accepted)
}
1116
1117fn build_matrix(
1118    fixtures: &[FixtureDefinition],
1119    models: &[ModelSelector],
1120    tool_formats: &[String],
1121    max_runs: Option<usize>,
1122) -> Vec<(FixtureDefinition, ModelSelector, String)> {
1123    if max_runs == Some(0) {
1124        return Vec::new();
1125    }
1126    let mut matrix = Vec::new();
1127    for fixture in fixtures {
1128        for selector in models {
1129            for tool_format in tool_formats {
1130                matrix.push((*fixture, selector.clone(), tool_format.clone()));
1131                if max_runs.is_some_and(|limit| matrix.len() >= limit) {
1132                    return matrix;
1133                }
1134            }
1135        }
1136    }
1137    matrix
1138}
1139
1140#[allow(clippy::too_many_arguments)]
1141fn build_summary(
1142    output_dir: &Path,
1143    fixtures: Vec<FixtureDefinition>,
1144    models: Vec<ModelSelector>,
1145    tool_formats: Vec<String>,
1146    env_keys_loaded: Vec<LoadedEnvKey>,
1147    runs: Vec<RunReport>,
1148    step_judge_preset: Option<String>,
1149    run_label: String,
1150    baseline_comparison: Option<BaselineComparison>,
1151) -> EvalSummary {
1152    let passed_runs = runs.iter().filter(|run| run.passed).count();
1153    let skipped_runs = runs.iter().filter(|run| run.skipped).count();
1154    let failed_runs = runs
1155        .iter()
1156        .filter(|run| !run.passed && !run.skipped)
1157        .count();
1158    let total_cost_usd = runs.iter().map(|run| run.cost_usd).sum();
1159    let rollups = build_rollups(&runs);
1160    let comparisons = compare_formats(&runs);
1161    let diverged_comparisons = comparisons
1162        .iter()
1163        .filter(|comparison| !comparison.divergence_reasons.is_empty())
1164        .count();
1165    let followups = suggest_followups(&runs, &comparisons);
1166    EvalSummary {
1167        schema_version: 2,
1168        fixture_ids: fixtures
1169            .iter()
1170            .map(|fixture| fixture.id.to_string())
1171            .collect(),
1172        fixtures: fixtures
1173            .iter()
1174            .map(|fixture| FixtureReport {
1175                id: fixture.id.to_string(),
1176                name: fixture.name.to_string(),
1177                tool_sequence: fixture.tool_sequence.to_string(),
1178                description: fixture.description.to_string(),
1179            })
1180            .collect(),
1181        output_dir: output_dir.display().to_string(),
1182        models,
1183        tool_formats,
1184        env_keys_loaded,
1185        total_runs: runs.len(),
1186        passed_runs,
1187        failed_runs,
1188        skipped_runs,
1189        diverged_comparisons,
1190        total_cost_usd,
1191        rollups,
1192        runs,
1193        comparisons,
1194        followups,
1195        step_judge_preset,
1196        run_label,
1197        baseline_comparison,
1198    }
1199}
1200
1201fn load_baseline_comparison(path: &Path, runs: &[RunReport]) -> Result<BaselineComparison, String> {
1202    let resolved = if path.is_dir() {
1203        path.join("summary.json")
1204    } else {
1205        path.to_path_buf()
1206    };
1207    let raw = fs::read_to_string(&resolved)
1208        .map_err(|e| format!("failed to read {}: {e}", resolved.display()))?;
1209    let baseline: serde_json::Value = serde_json::from_str(&raw)
1210        .map_err(|e| format!("failed to parse {} as JSON: {e}", resolved.display()))?;
1211    let baseline_runs = baseline
1212        .get("runs")
1213        .and_then(|v| v.as_array())
1214        .ok_or_else(|| format!("{} has no `runs` array", resolved.display()))?;
1215    // Index baseline status by fixture_id. When the baseline has multiple
1216    // runs per fixture (e.g. native + text), prefer the first passing run
1217    // so a fixture passes the comparison if ANY baseline variant did.
1218    let mut baseline_status: BTreeMap<String, &str> = BTreeMap::new();
1219    for run in baseline_runs {
1220        let fixture_id = match run.get("fixture_id").and_then(|v| v.as_str()) {
1221            Some(id) => id.to_string(),
1222            None => continue,
1223        };
1224        let passed = run.get("passed").and_then(|v| v.as_bool()).unwrap_or(false);
1225        let skipped = run
1226            .get("skipped")
1227            .and_then(|v| v.as_bool())
1228            .unwrap_or(false);
1229        let status = if skipped {
1230            "skipped"
1231        } else if passed {
1232            "passed"
1233        } else {
1234            "failed"
1235        };
1236        baseline_status
1237            .entry(fixture_id)
1238            .and_modify(|existing| {
1239                if *existing != "passed" && status == "passed" {
1240                    *existing = status;
1241                }
1242            })
1243            .or_insert(status);
1244    }
1245    let mut cell_status: BTreeMap<String, &str> = BTreeMap::new();
1246    for run in runs {
1247        let status = if run.skipped {
1248            "skipped"
1249        } else if run.passed {
1250            "passed"
1251        } else {
1252            "failed"
1253        };
1254        cell_status
1255            .entry(run.fixture_id.clone())
1256            .and_modify(|existing| {
1257                if *existing != "passed" && status == "passed" {
1258                    *existing = status;
1259                }
1260            })
1261            .or_insert(status);
1262    }
1263    let mut regressions = Vec::new();
1264    let mut recoveries = Vec::new();
1265    let mut unchanged_passes = Vec::new();
1266    let mut unchanged_failures = Vec::new();
1267    let mut missing_in_baseline = Vec::new();
1268    let mut missing_in_cell = Vec::new();
1269    for (fixture, cell) in &cell_status {
1270        match baseline_status.get(fixture) {
1271            None => missing_in_baseline.push(fixture.clone()),
1272            Some(base) => match (*base, *cell) {
1273                ("passed", "passed") => unchanged_passes.push(fixture.clone()),
1274                ("passed", _) => regressions.push(FixtureStatusDelta {
1275                    fixture_id: fixture.clone(),
1276                    baseline_status: (*base).to_string(),
1277                    cell_status: (*cell).to_string(),
1278                }),
1279                (_, "passed") => recoveries.push(FixtureStatusDelta {
1280                    fixture_id: fixture.clone(),
1281                    baseline_status: (*base).to_string(),
1282                    cell_status: (*cell).to_string(),
1283                }),
1284                _ => unchanged_failures.push(fixture.clone()),
1285            },
1286        }
1287    }
1288    for fixture in baseline_status.keys() {
1289        if !cell_status.contains_key(fixture) {
1290            missing_in_cell.push(fixture.clone());
1291        }
1292    }
1293    let baseline_label = baseline
1294        .get("run_label")
1295        .and_then(|v| v.as_str())
1296        .filter(|s| !s.is_empty())
1297        .or_else(|| baseline.get("output_dir").and_then(|v| v.as_str()))
1298        .unwrap_or("")
1299        .to_string();
1300    let regressions_count = regressions.len();
1301    let recoveries_count = recoveries.len();
1302    let total_compared =
1303        regressions_count + recoveries_count + unchanged_passes.len() + unchanged_failures.len();
1304    let net_lift_pp = if total_compared == 0 {
1305        0.0
1306    } else {
1307        let raw =
1308            (recoveries_count as f64 - regressions_count as f64) / total_compared as f64 * 100.0;
1309        (raw * 10.0).round() / 10.0
1310    };
1311    Ok(BaselineComparison {
1312        baseline_label,
1313        baseline_path: resolved.display().to_string(),
1314        regressions,
1315        recoveries,
1316        unchanged_passes,
1317        unchanged_failures,
1318        missing_in_baseline,
1319        missing_in_cell,
1320        regressions_count,
1321        recoveries_count,
1322        net_lift_pp,
1323    })
1324}
1325
1326fn build_rollups(runs: &[RunReport]) -> EvalRollups {
1327    EvalRollups {
1328        by_fixture: rollup_by(runs, |run| run.fixture_id.clone()),
1329        by_provider: rollup_by(runs, |run| run.selector.provider.clone()),
1330        by_model: rollup_by(runs, |run| run.selector.model.clone()),
1331        by_tool_format: rollup_by(runs, |run| run.tool_format.clone()),
1332        by_tool_sequence: rollup_by(runs, |run| run.fixture_tool_sequence.clone()),
1333    }
1334}
1335
1336fn rollup_by<F>(runs: &[RunReport], key_for: F) -> Vec<RollupReport>
1337where
1338    F: Fn(&RunReport) -> String,
1339{
1340    let mut grouped: BTreeMap<String, RollupReport> = BTreeMap::new();
1341    for run in runs {
1342        let key = key_for(run);
1343        let entry = grouped.entry(key.clone()).or_insert_with(|| RollupReport {
1344            key,
1345            total_runs: 0,
1346            passed_runs: 0,
1347            failed_runs: 0,
1348            skipped_runs: 0,
1349            total_cost_usd: 0.0,
1350        });
1351        entry.total_runs += 1;
1352        if run.passed {
1353            entry.passed_runs += 1;
1354        } else if run.skipped {
1355            entry.skipped_runs += 1;
1356        } else {
1357            entry.failed_runs += 1;
1358        }
1359        entry.total_cost_usd += run.cost_usd;
1360    }
1361    grouped.into_values().collect()
1362}
1363
1364fn compare_formats(runs: &[RunReport]) -> Vec<FormatComparison> {
1365    let mut grouped: BTreeMap<String, Vec<&RunReport>> = BTreeMap::new();
1366    for run in runs {
1367        grouped
1368            .entry(format!(
1369                "{}\0{}",
1370                run.fixture_id,
1371                selector_label(&run.selector)
1372            ))
1373            .or_default()
1374            .push(run);
1375    }
1376    let mut out = Vec::new();
1377    for group in grouped.values() {
1378        let Some(first) = group.first() else {
1379            continue;
1380        };
1381        let native = group
1382            .iter()
1383            .find(|run| run.tool_format == "native")
1384            .copied();
1385        let text = group.iter().find(|run| run.tool_format == "text").copied();
1386        if native.is_none() && text.is_none() {
1387            continue;
1388        }
1389        let pair = native.zip(text);
1390        let mut divergence_reasons = Vec::new();
1391        if let Some((native, text)) = pair {
1392            if native.status != text.status {
1393                divergence_reasons.push(format!(
1394                    "status differs: native={} text={}",
1395                    native.status, text.status
1396                ));
1397            }
1398            if native.passed != text.passed {
1399                divergence_reasons.push(format!(
1400                    "pass result differs: native={} text={}",
1401                    native.passed, text.passed
1402                ));
1403            }
1404            if native.verification_success != text.verification_success {
1405                divergence_reasons.push(format!(
1406                    "verifier result differs: native={} text={}",
1407                    native.verification_success, text.verification_success
1408                ));
1409            }
1410            if native.tool_sequence != text.tool_sequence {
1411                divergence_reasons.push(format!(
1412                    "tool sequence differs: native=[{}] text=[{}]",
1413                    native.tool_sequence.join(", "),
1414                    text.tool_sequence.join(", ")
1415                ));
1416            }
1417            if native.rejected_tool_calls != text.rejected_tool_calls {
1418                divergence_reasons.push(format!(
1419                    "rejected tool-call recovery differs: native={} text={}",
1420                    native.rejected_tool_calls, text.rejected_tool_calls
1421                ));
1422            }
1423        }
1424        let evidence_paths = [native, text]
1425            .into_iter()
1426            .flatten()
1427            .map(|run| run.transcript_events_path.clone())
1428            .collect::<Vec<_>>();
1429        out.push(FormatComparison {
1430            fixture_id: first.fixture_id.clone(),
1431            selector: first.selector.clone(),
1432            native_run_id: native.map(|run| run.run_id.clone()),
1433            text_run_id: text.map(|run| run.run_id.clone()),
1434            native_evidence_path: native.map(|run| run.transcript_events_path.clone()),
1435            text_evidence_path: text.map(|run| run.transcript_events_path.clone()),
1436            native_status: native.map(|run| run.status.clone()),
1437            text_status: text.map(|run| run.status.clone()),
1438            native_passed: native.map(|run| run.passed),
1439            text_passed: text.map(|run| run.passed),
1440            verifier_match: pair
1441                .map(|(native, text)| native.verification_success == text.verification_success),
1442            tool_sequence_match: pair
1443                .map(|(native, text)| native.tool_sequence == text.tool_sequence),
1444            rejected_tool_call_delta_text_minus_native: pair.map(|(native, text)| {
1445                text.rejected_tool_calls as i64 - native.rejected_tool_calls as i64
1446            }),
1447            token_delta_text_minus_native: pair.map(|(native, text)| {
1448                (text.input_tokens + text.output_tokens)
1449                    - (native.input_tokens + native.output_tokens)
1450            }),
1451            iteration_delta_text_minus_native: pair
1452                .map(|(native, text)| text.iterations - native.iterations),
1453            equivalent: pair.map(|(native, text)| {
1454                native.status == text.status
1455                    && native.passed == text.passed
1456                    && native.skipped == text.skipped
1457                    && native.verification_success == text.verification_success
1458                    && native.tool_sequence == text.tool_sequence
1459                    && native.rejected_tool_calls == text.rejected_tool_calls
1460            }),
1461            divergence_reasons,
1462            evidence_paths,
1463        });
1464    }
1465    out
1466}
1467
1468fn suggest_followups(
1469    runs: &[RunReport],
1470    comparisons: &[FormatComparison],
1471) -> Vec<FollowupSuggestion> {
1472    let mut out = Vec::new();
1473    let failed = runs
1474        .iter()
1475        .filter(|run| !run.passed && !run.skipped)
1476        .map(|run| run.run_id.clone())
1477        .collect::<Vec<_>>();
1478    if !failed.is_empty() {
1479        out.push(FollowupSuggestion {
1480            title: "Normalize coding-agent fixture failures across provider presets".to_string(),
1481            body: "One or more fixture/provider/tool-format runs failed. Inspect the run directories and decide whether the gap belongs in provider adapters, preset prompting, transcript handling, or host-tool ergonomics.".to_string(),
1482            labels: vec!["eval".to_string(), "providers".to_string()],
1483            run_ids: failed,
1484        });
1485    }
1486
1487    let rejected = runs
1488        .iter()
1489        .filter(|run| run.rejected_tool_calls > 0)
1490        .map(|run| run.run_id.clone())
1491        .collect::<Vec<_>>();
1492    if !rejected.is_empty() {
1493        out.push(FollowupSuggestion {
1494            title: "Abstract rejected tool-call recovery in agent transcripts".to_string(),
1495            body: "Some runs recovered after rejected tool calls. Add runtime support or preset guidance so harness authors can distinguish recoverable provider/tool-shape noise from user-relevant transcript events.".to_string(),
1496            labels: vec!["agents".to_string(), "transcripts".to_string()],
1497            run_ids: rejected,
1498        });
1499    }
1500
1501    let mismatched = comparisons
1502        .iter()
1503        .filter(|comparison| !comparison.divergence_reasons.is_empty())
1504        .map(|comparison| {
1505            format!(
1506                "{}:{} ({})",
1507                comparison.fixture_id,
1508                selector_label(&comparison.selector),
1509                comparison.divergence_reasons.join("; ")
1510            )
1511        })
1512        .collect::<Vec<_>>();
1513    if !mismatched.is_empty() {
1514        let run_ids = comparisons
1515            .iter()
1516            .filter(|comparison| !comparison.divergence_reasons.is_empty())
1517            .flat_map(|comparison| {
1518                [
1519                    comparison.native_run_id.clone(),
1520                    comparison.text_run_id.clone(),
1521                ]
1522            })
1523            .flatten()
1524            .collect::<Vec<_>>();
1525        out.push(FollowupSuggestion {
1526            title: "Make native/text tool modes behaviorally interchangeable for preset harnesses"
1527                .to_string(),
1528            body: format!(
1529                "Native and text tool modes diverged for: {}. The preset/runtime boundary should hide provider tool-channel differences where possible.",
1530                mismatched.join(", ")
1531            ),
1532            labels: vec!["agents".to_string(), "tools".to_string()],
1533            run_ids,
1534        });
1535    }
1536
1537    let unknown_pricing = runs
1538        .iter()
1539        .filter(|run| {
1540            !run.skipped
1541                && !run.pricing_known
1542                && !matches!(run.selector.provider.as_str(), "mock" | "fake")
1543                && !selector_is_local(&run.selector)
1544        })
1545        .map(|run| run.run_id.clone())
1546        .collect::<Vec<_>>();
1547    if !unknown_pricing.is_empty() {
1548        out.push(FollowupSuggestion {
1549            title: "Fill provider pricing metadata for benchmarked models".to_string(),
1550            body: "At least one live provider/model produced usage metrics but had no pricing entry, which weakens cost comparisons in eval reports.".to_string(),
1551            labels: vec!["providers".to_string(), "docs".to_string()],
1552            run_ids: unknown_pricing,
1553        });
1554    }
1555    out
1556}
1557
1558fn write_json_artifacts(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1559    write_json_pretty(&output_dir.join("summary.json"), summary)?;
1560    write_jsonl(&output_dir.join("per_run.jsonl"), &summary.runs)?;
1561    let summary_value = serde_json::to_value(summary).map_err(|error| error.to_string())?;
1562    let readiness = local_readiness::report_from_summary_json(
1563        &summary_value,
1564        output_dir.display().to_string(),
1565    )?;
1566    write_json_pretty(&output_dir.join("local_readiness.json"), &readiness)?;
1567    Ok(())
1568}
1569
/// Prints to stderr the paths of the five artifacts expected in `output_dir`.
///
/// This only formats the paths; it does not check that the files exist.
fn announce_output_paths(output_dir: &Path) {
    let [summary_json, per_run, readiness, summary_md, followups_md] = [
        "summary.json",
        "per_run.jsonl",
        "local_readiness.json",
        "summary.md",
        "followups.md",
    ]
    .map(|file_name| output_dir.join(file_name));
    eprintln!(
        "wrote {}, {}, {}, {}, and {}",
        summary_json.display(),
        per_run.display(),
        readiness.display(),
        summary_md.display(),
        followups_md.display()
    );
}
1580
1581// ─── Legacy direct-render path (gated by HARN_CLI_IMPL=rust) ────────────
1582
1583fn write_markdown_artifacts_legacy(output_dir: &Path, summary: &EvalSummary) -> Result<(), String> {
1584    fs::write(output_dir.join("summary.md"), render_markdown(summary))
1585        .map_err(|error| format!("failed to write summary.md: {error}"))?;
1586    fs::write(output_dir.join("followups.md"), render_followups(summary))
1587        .map_err(|error| format!("failed to write followups.md: {error}"))?;
1588    Ok(())
1589}
1590
1591fn print_summary_legacy(summary: &EvalSummary) {
1592    println!(
1593        "coding-agent eval: {}/{} passed, {} skipped, total_cost_usd={:.6}",
1594        summary.passed_runs, summary.total_runs, summary.skipped_runs, summary.total_cost_usd
1595    );
1596}
1597
1598fn print_json_legacy(summary: &EvalSummary) {
1599    match serde_json::to_string_pretty(summary) {
1600        Ok(payload) => println!("{payload}"),
1601        Err(error) => eprintln!("warning: failed to render summary JSON: {error}"),
1602    }
1603}
1604
1605// ─── Dispatch (.harn) render path ────────────────────────────────────────
1606
1607async fn write_markdown_artifacts_dispatch(
1608    output_dir: &Path,
1609    summary: &EvalSummary,
1610) -> Result<(), i32> {
1611    let markdown = render_via_dispatch(summary, "markdown").await?;
1612    if let Err(error) = fs::write(output_dir.join("summary.md"), markdown) {
1613        eprintln!("error: failed to write summary.md: {error}");
1614        return Err(1);
1615    }
1616    let followups = render_via_dispatch(summary, "followups").await?;
1617    if let Err(error) = fs::write(output_dir.join("followups.md"), followups) {
1618        eprintln!("error: failed to write followups.md: {error}");
1619        return Err(1);
1620    }
1621    Ok(())
1622}
1623
1624async fn print_summary_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1625    let payload = render_via_dispatch(summary, "summary").await?;
1626    print!("{payload}");
1627    // The script emits exactly the legacy summary line (no trailing
1628    // newline); add one to match the legacy `println!` semantics.
1629    if !payload.ends_with('\n') {
1630        println!();
1631    }
1632    Ok(())
1633}
1634
1635async fn print_json_dispatch(summary: &EvalSummary) -> Result<(), i32> {
1636    let payload = render_via_dispatch(summary, "json").await?;
1637    print!("{payload}");
1638    if !payload.ends_with('\n') {
1639        println!();
1640    }
1641    Ok(())
1642}
1643
1644/// Dispatch to the embedded `cli/eval/coding_agent.harn` script for one
1645/// of the four rendering modes (markdown / followups / summary / json).
1646/// Returns the captured stdout on success, or a propagated exit code
1647/// on failure.
1648///
1649/// **Concurrency.** Held under [`DISPATCH_RENDER_LOCK`] so concurrent
1650/// in-process callers don't race on the global env vars the Rust shim
1651/// sets to hand the report to the script. See the lock's docstring
1652/// for the trade-off rationale.
1653async fn render_via_dispatch(summary: &EvalSummary, mode: &str) -> Result<String, i32> {
1654    let summary_json = match serde_json::to_string(summary) {
1655        Ok(json) => json,
1656        Err(error) => {
1657            eprintln!("error: failed to serialise EvalSummary for dispatch: {error}");
1658            return Err(1);
1659        }
1660    };
1661    let _guard = DISPATCH_RENDER_LOCK.lock().await;
1662    let _summary = ScopedEnvVar::set(CODING_AGENT_SUMMARY_ENV, &summary_json);
1663    let _mode = ScopedEnvVar::set(CODING_AGENT_MODE_ENV, mode);
1664
1665    let outcome = dispatch::run_embedded_script("eval/coding_agent", Vec::new(), false).await;
1666    if !outcome.stderr.is_empty() {
1667        let _ = std::io::stderr().write_all(outcome.stderr.as_bytes());
1668    }
1669    if outcome.exit_code != 0 {
1670        return Err(outcome.exit_code);
1671    }
1672    Ok(outcome.stdout)
1673}
1674
1675fn write_json_pretty<T: Serialize>(path: &Path, value: &T) -> Result<(), String> {
1676    let body = serde_json::to_string_pretty(value).map_err(|error| error.to_string())?;
1677    fs::write(path, format!("{body}\n")).map_err(|error| error.to_string())
1678}
1679
1680fn write_jsonl<T: Serialize>(path: &Path, items: &[T]) -> Result<(), String> {
1681    let mut body = String::new();
1682    for item in items {
1683        let line = serde_json::to_string(item).map_err(|error| error.to_string())?;
1684        body.push_str(&line);
1685        body.push('\n');
1686    }
1687    fs::write(path, body).map_err(|error| error.to_string())
1688}
1689
1690fn render_markdown(summary: &EvalSummary) -> String {
1691    let mut out = String::new();
1692    out.push_str("# Coding Agent Harness Quality Suite\n\n");
1693    out.push_str(&format!(
1694        "- fixtures: `{}`\n- passed: {}/{}\n- skipped: {}\n- total_cost_usd: {:.6}\n\n",
1695        summary.fixture_ids.join("`, `"),
1696        summary.passed_runs,
1697        summary.total_runs,
1698        summary.skipped_runs,
1699        summary.total_cost_usd
1700    ));
1701    render_rollup_table(&mut out, "By Fixture", &summary.rollups.by_fixture);
1702    render_rollup_table(&mut out, "By Provider", &summary.rollups.by_provider);
1703    render_rollup_table(&mut out, "By Model", &summary.rollups.by_model);
1704    render_rollup_table(&mut out, "By Tool Format", &summary.rollups.by_tool_format);
1705    render_rollup_table(
1706        &mut out,
1707        "By Tool Sequence",
1708        &summary.rollups.by_tool_sequence,
1709    );
1710
1711    out.push_str("\n## Runs\n\n");
1712    out.push_str("| fixture | run | provider | model | tool format | fixture sequence | tool calls | status | iterations | tokens | cost | transcript | output |\n");
1713    out.push_str("|---|---|---|---|---|---|---|---|---:|---:|---:|---|---|\n");
1714    for run in &summary.runs {
1715        let tool_sequence = if run.tool_sequence.is_empty() {
1716            "-".to_string()
1717        } else {
1718            run.tool_sequence.join(", ").replace('|', "\\|")
1719        };
1720        out.push_str(&format!(
1721            "| `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {:.6} | {} | `{}` |\n",
1722            run.fixture_id,
1723            run.run_id,
1724            run.selector.provider,
1725            run.selector.model.replace('|', "\\|"),
1726            run.tool_format,
1727            run.fixture_tool_sequence,
1728            tool_sequence,
1729            run.status,
1730            run.iterations,
1731            run.input_tokens + run.output_tokens,
1732            run.cost_usd,
1733            markdown_link(
1734                &run.transcript_event_count.to_string(),
1735                &run.transcript_events_path
1736            ),
1737            run.output_dir
1738        ));
1739    }
1740    if let Some(comparison) = &summary.baseline_comparison {
1741        out.push_str("\n## Baseline Comparison\n\n");
1742        out.push_str(&format!(
1743            "Compared against `{}`{}.\n\n",
1744            comparison.baseline_path,
1745            if comparison.baseline_label.is_empty() {
1746                String::new()
1747            } else {
1748                format!(" (label: `{}`)", comparison.baseline_label)
1749            },
1750        ));
1751        out.push_str(&format!(
1752            "- regressions: **{}** (baseline passed, this cell failed)\n- recoveries: **{}** (baseline failed, this cell passed)\n- net lift: **{:+.1}pp**\n\n",
1753            comparison.regressions_count,
1754            comparison.recoveries_count,
1755            comparison.net_lift_pp,
1756        ));
1757        if !comparison.regressions.is_empty() {
1758            out.push_str("### Regressions\n\n");
1759            for delta in &comparison.regressions {
1760                out.push_str(&format!(
1761                    "- `{}`: `{}` → `{}`\n",
1762                    delta.fixture_id, delta.baseline_status, delta.cell_status,
1763                ));
1764            }
1765            out.push('\n');
1766        }
1767        if !comparison.recoveries.is_empty() {
1768            out.push_str("### Recoveries\n\n");
1769            for delta in &comparison.recoveries {
1770                out.push_str(&format!(
1771                    "- `{}`: `{}` → `{}`\n",
1772                    delta.fixture_id, delta.baseline_status, delta.cell_status,
1773                ));
1774            }
1775            out.push('\n');
1776        }
1777    }
1778    if !summary.comparisons.is_empty() {
1779        out.push_str("\n## Native/Text Comparison\n\n");
1780        out.push_str("| fixture | selector | native | text | equivalent | verifier | tools | rejected delta | token delta | iteration delta | evidence |\n");
1781        out.push_str("|---|---|---|---|---|---|---|---:|---:|---:|---|\n");
1782        for comparison in &summary.comparisons {
1783            out.push_str(&format!(
1784                "| `{}` | `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |\n",
1785                comparison.fixture_id,
1786                selector_label(&comparison.selector),
1787                comparison
1788                    .native_status
1789                    .clone()
1790                    .unwrap_or_else(|| "-".to_string()),
1791                comparison
1792                    .text_status
1793                    .clone()
1794                    .unwrap_or_else(|| "-".to_string()),
1795                optional_bool_mark(comparison.equivalent),
1796                optional_bool_mark(comparison.verifier_match),
1797                optional_bool_mark(comparison.tool_sequence_match),
1798                comparison
1799                    .rejected_tool_call_delta_text_minus_native
1800                    .map(|v| v.to_string())
1801                    .unwrap_or_else(|| "-".to_string()),
1802                comparison
1803                    .token_delta_text_minus_native
1804                    .map(|v| v.to_string())
1805                    .unwrap_or_else(|| "-".to_string()),
1806                comparison
1807                    .iteration_delta_text_minus_native
1808                    .map(|v| v.to_string())
1809                    .unwrap_or_else(|| "-".to_string()),
1810                comparison_evidence_links(comparison)
1811            ));
1812        }
1813    }
1814    let diverged = summary
1815        .comparisons
1816        .iter()
1817        .filter(|comparison| !comparison.divergence_reasons.is_empty())
1818        .collect::<Vec<_>>();
1819    if !diverged.is_empty() {
1820        out.push_str("\n## Native/Text Divergence Evidence\n\n");
1821        for comparison in diverged {
1822            out.push_str(&format!(
1823                "- `{}` `{}`: {}\n",
1824                comparison.fixture_id,
1825                selector_label(&comparison.selector),
1826                comparison.divergence_reasons.join("; ")
1827            ));
1828            if !comparison.evidence_paths.is_empty() {
1829                out.push_str(&format!(
1830                    "  Evidence: {}\n",
1831                    comparison_evidence_links(comparison)
1832                ));
1833            }
1834        }
1835    }
1836    out
1837}
1838
1839fn render_rollup_table(out: &mut String, title: &str, rollups: &[RollupReport]) {
1840    out.push_str(&format!("## {title}\n\n"));
1841    out.push_str("| key | passed | failed | skipped | total | cost |\n");
1842    out.push_str("|---|---:|---:|---:|---:|---:|\n");
1843    for rollup in rollups {
1844        out.push_str(&format!(
1845            "| `{}` | {} | {} | {} | {} | {:.6} |\n",
1846            rollup.key.replace('|', "\\|"),
1847            rollup.passed_runs,
1848            rollup.failed_runs,
1849            rollup.skipped_runs,
1850            rollup.total_runs,
1851            rollup.total_cost_usd
1852        ));
1853    }
1854    out.push('\n');
1855}
1856
1857fn render_followups(summary: &EvalSummary) -> String {
1858    let mut out = String::new();
1859    out.push_str("# Follow-up Issue Candidates\n\n");
1860    if summary.followups.is_empty() {
1861        out.push_str("No follow-up issue candidates were generated from this run.\n");
1862        return out;
1863    }
1864    for followup in &summary.followups {
1865        out.push_str(&format!("## {}\n\n{}\n\n", followup.title, followup.body));
1866        if !followup.run_ids.is_empty() {
1867            out.push_str(&format!("- run_ids: `{}`\n", followup.run_ids.join("`, `")));
1868        }
1869        if !followup.labels.is_empty() {
1870            out.push_str(&format!("- labels: `{}`\n", followup.labels.join("`, `")));
1871        }
1872        out.push('\n');
1873    }
1874    out
1875}
1876
1877fn read_run_summary(run_dir: &Path) -> Option<JsonValue> {
1878    let raw = fs::read_to_string(run_dir.join("summary.json")).ok()?;
1879    serde_json::from_str(&raw).ok()
1880}
1881
1882fn parse_last_json_line(stdout: &str) -> Option<JsonValue> {
1883    stdout
1884        .lines()
1885        .rev()
1886        .map(str::trim)
1887        .filter(|line| !line.is_empty())
1888        .find_map(|line| serde_json::from_str::<JsonValue>(line).ok())
1889}
1890
1891fn string_array(value: Option<&JsonValue>) -> Vec<String> {
1892    value
1893        .and_then(JsonValue::as_array)
1894        .map(|values| {
1895            values
1896                .iter()
1897                .filter_map(JsonValue::as_str)
1898                .map(str::to_string)
1899                .collect()
1900        })
1901        .unwrap_or_default()
1902}
1903
1904fn non_empty_string_array(value: Option<&JsonValue>) -> Option<Vec<String>> {
1905    let values = string_array(value);
1906    (!values.is_empty()).then_some(values)
1907}
1908
1909fn tool_call_sequence(value: Option<&JsonValue>) -> Option<Vec<String>> {
1910    let calls = value.and_then(JsonValue::as_array)?;
1911    let mut sequence = Vec::new();
1912    for call in calls {
1913        if let Some(name) = call
1914            .get("name")
1915            .or_else(|| call.get("tool_name"))
1916            .and_then(JsonValue::as_str)
1917        {
1918            sequence.push(name.to_string());
1919        }
1920    }
1921    (!sequence.is_empty()).then_some(sequence)
1922}
1923
/// Maps a tri-state flag to its table cell text: `yes`, `no`, or `-`
/// when the value is absent.
fn optional_bool_mark(value: Option<bool>) -> &'static str {
    value.map_or("-", |flag| if flag { "yes" } else { "no" })
}
1931
1932fn comparison_evidence_links(comparison: &FormatComparison) -> String {
1933    let mut links = Vec::new();
1934    if let Some(native) = comparison.native_evidence_path.as_deref() {
1935        links.push(markdown_link("native", native));
1936    }
1937    if let Some(text) = comparison.text_evidence_path.as_deref() {
1938        links.push(markdown_link("text", text));
1939    }
1940    if links.is_empty() {
1941        "-".to_string()
1942    } else {
1943        links.join("<br>")
1944    }
1945}
1946
/// Formats a Markdown inline link `[label](target)`.
///
/// `|` in the label is backslash-escaped. Spaces and parentheses in the
/// target are percent-encoded (`%20`, `%28`, `%29`). No other characters
/// are touched.
fn markdown_link(label: &str, target: &str) -> String {
    let escaped_label = label.replace('|', "\\|");
    let mut encoded_target = String::with_capacity(target.len());
    for ch in target.chars() {
        match ch {
            ' ' => encoded_target.push_str("%20"),
            '(' => encoded_target.push_str("%28"),
            ')' => encoded_target.push_str("%29"),
            other => encoded_target.push(other),
        }
    }
    format!("[{escaped_label}]({encoded_target})")
}
1957
/// Leaves `path` as an existing, empty directory.
///
/// Anything already at `path` is removed recursively first, then the
/// directory (and any missing parents) is created. I/O errors are
/// reported as their `Display` text.
fn reset_dir(path: &Path) -> Result<(), String> {
    let describe = |error: std::io::Error| error.to_string();
    if path.exists() {
        fs::remove_dir_all(path).map_err(describe)?;
    }
    fs::create_dir_all(path).map_err(describe)
}
1964
1965fn run_id_for(fixture: FixtureDefinition, selector: &ModelSelector, tool_format: &str) -> String {
1966    sanitize_id(&format!(
1967        "{}__{}__{}",
1968        fixture.id,
1969        selector_label(selector),
1970        tool_format
1971    ))
1972}
1973
/// Reduces `raw` to an identifier made of ASCII alphanumerics, `-` and
/// `_`.
///
/// Every other character becomes `_`, after which leading and trailing
/// underscores are stripped.
fn sanitize_id(raw: &str) -> String {
    let mapped: String = raw
        .chars()
        .map(|ch| {
            if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') {
                ch
            } else {
                '_'
            }
        })
        .collect();
    mapped.trim_matches('_').to_string()
}
1985
/// Returns the relative default output directory,
/// `.harn-runs/coding-agent-bench/latest`.
fn default_output_dir() -> PathBuf {
    [".harn-runs", "coding-agent-bench", "latest"]
        .iter()
        .collect()
}
1991
/// Returns a trimmed excerpt of `text`, capped at 4000 characters.
///
/// * Blank input (empty after trimming) yields `None`.
/// * Text of at most 4000 characters is returned unchanged (trimmed).
/// * Longer text is cut to its first 4000 characters with `...` appended.
///
/// The limit is measured in characters, not bytes. The previous length
/// guard compared bytes, so multi-byte text that fit within the character
/// limit was returned whole with a misleading `...` suffix.
fn excerpt(text: &str) -> Option<String> {
    const MAX_CHARS: usize = 4000;

    let trimmed = text.trim();
    if trimmed.is_empty() {
        return None;
    }
    // The byte offset of character number `MAX_CHARS` (if there is one)
    // is exactly where the excerpt must be cut; `None` means it fits.
    match trimmed.char_indices().nth(MAX_CHARS) {
        None => Some(trimmed.to_string()),
        Some((cut, _)) => {
            let mut truncated = String::with_capacity(cut + 3);
            truncated.push_str(&trimmed[..cut]);
            truncated.push_str("...");
            Some(truncated)
        }
    }
}
2008
2009fn load_env_files(paths: &[PathBuf]) -> Result<(EnvOverlay, Vec<LoadedEnvKey>), String> {
2010    let mut previous = Vec::new();
2011    let mut loaded = Vec::new();
2012    let mut touched = BTreeSet::new();
2013    for path in paths {
2014        let path = expand_home(path);
2015        let raw = fs::read_to_string(&path)
2016            .map_err(|error| format!("failed to read env file {}: {error}", path.display()))?;
2017        for (line_no, line) in raw.lines().enumerate() {
2018            let Some((key, value)) = parse_env_line(line).map_err(|error| {
2019                format!("{}:{}: {error}", path.display(), line_no.saturating_add(1))
2020            })?
2021            else {
2022                continue;
2023            };
2024            if touched.insert(key.clone()) {
2025                previous.push((OsString::from(&key), std::env::var_os(&key)));
2026            }
2027            std::env::set_var(&key, value);
2028            loaded.push(LoadedEnvKey {
2029                key,
2030                source: path.display().to_string(),
2031            });
2032        }
2033    }
2034    Ok((EnvOverlay { previous }, loaded))
2035}
2036
2037fn parse_env_line(line: &str) -> Result<Option<(String, String)>, String> {
2038    let trimmed = line.trim();
2039    if trimmed.is_empty() || trimmed.starts_with('#') {
2040        return Ok(None);
2041    }
2042    let trimmed = trimmed.strip_prefix("export ").unwrap_or(trimmed).trim();
2043    let Some((key, value)) = trimmed.split_once('=') else {
2044        return Err("expected KEY=VALUE".to_string());
2045    };
2046    let key = key.trim();
2047    if key.is_empty() {
2048        return Err("empty key".to_string());
2049    }
2050    if !key
2051        .chars()
2052        .all(|ch| ch.is_ascii_alphanumeric() || ch == '_')
2053    {
2054        return Err(format!("invalid key `{key}`"));
2055    }
2056    Ok(Some((key.to_string(), unquote_env_value(value.trim()))))
2057}
2058
/// Strips one pair of matching surrounding quotes (`"…"` or `'…'`) from
/// `value`.
///
/// Values that are unquoted, shorter than two bytes, or wrapped in
/// mismatched quotes are returned unchanged. No escape sequences are
/// interpreted.
fn unquote_env_value(value: &str) -> String {
    let inner = ['"', '\'']
        .iter()
        .find_map(|&quote| value.strip_prefix(quote)?.strip_suffix(quote));
    inner.unwrap_or(value).to_string()
}
2070
/// Expands a leading `~` or `~/` in `path` using the `HOME` environment
/// variable.
///
/// Any other path — including `~user/...` — is returned unchanged, as is
/// a tilde path when `HOME` is not set. The path is inspected through its
/// lossy UTF-8 rendering.
fn expand_home(path: &Path) -> PathBuf {
    let text = path.to_string_lossy();
    let home = || std::env::var_os("HOME").map(PathBuf::from);
    let expanded = match text.as_ref() {
        "~" => home(),
        other => other
            .strip_prefix("~/")
            .and_then(|rest| home().map(|dir| dir.join(rest))),
    };
    expanded.unwrap_or_else(|| path.to_path_buf())
}
2085
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the `mock:mock` selector shared by tests that do not care
    /// which provider is exercised.
    fn mock_selector() -> ModelSelector {
        ModelSelector {
            selector: "mock:mock".to_string(),
            provider: "mock".to_string(),
            model: "mock".to_string(),
        }
    }

    /// Builds a minimal native-format `RunReport` with zeroed metrics.
    ///
    /// `passed` drives `status`, `passed`, `verification_success` and
    /// `harn_exit_code` together. Tests needing other values override
    /// individual fields with struct-update syntax, so a new `RunReport`
    /// field only has to be added here.
    fn run_report(
        run_id: &str,
        fixture_id: &str,
        fixture_name: &str,
        selector: ModelSelector,
        passed: bool,
    ) -> RunReport {
        RunReport {
            run_id: run_id.to_string(),
            fixture_id: fixture_id.to_string(),
            fixture_name: fixture_name.to_string(),
            fixture_tool_sequence: "multi-tool".to_string(),
            selector,
            tool_format: "native".to_string(),
            status: if passed { "passed" } else { "failed" }.to_string(),
            passed,
            skipped: false,
            skipped_reason: None,
            output_dir: format!("out/{run_id}"),
            transcript_events_path: format!("out/{run_id}/t.jsonl"),
            workspace_root: None,
            elapsed_ms: 0,
            duration_ms: 0,
            iterations: 0,
            input_tokens: 0,
            output_tokens: 0,
            cost_usd: 0.0,
            pricing_known: false,
            tool_calls: 0,
            rejected_tool_calls: 0,
            tool_sequence: Vec::new(),
            successful_tools: Vec::new(),
            transcript_event_count: 0,
            verification_success: passed,
            harn_exit_code: if passed { 0 } else { 1 },
            error: None,
            stderr_excerpt: None,
            local_cleanup: None,
        }
    }

    #[test]
    fn dotenv_parser_strips_export_and_quotes_without_leaking_values() {
        let parsed = parse_env_line("export TOGETHER_API_KEY=\"secret\"")
            .unwrap()
            .unwrap();
        assert_eq!(parsed.0, "TOGETHER_API_KEY");
        assert_eq!(parsed.1, "secret");
        assert!(parse_env_line("# comment").unwrap().is_none());
    }

    #[test]
    fn model_selector_args_rejoin_provider_model_kv_after_clap_delimiter_split() {
        let normalized = normalize_model_selector_args(&[
            "mock:mock".to_string(),
            "provider=openrouter".to_string(),
            "model=qwen/qwen3-coder-flash".to_string(),
            "provider=together".to_string(),
            "model=Qwen/Qwen3-Coder-Next-FP8".to_string(),
        ]);
        assert_eq!(
            normalized,
            vec![
                "mock:mock",
                "provider=openrouter,model=qwen/qwen3-coder-flash",
                "provider=together,model=Qwen/Qwen3-Coder-Next-FP8",
            ]
        );
    }

    #[test]
    fn markdown_escapes_model_table_pipes() {
        // The model name contains a `|`, which must not split a table cell.
        let selector = ModelSelector {
            selector: "provider:a|b".to_string(),
            provider: "provider".to_string(),
            model: "a|b".to_string(),
        };
        let run = RunReport {
            transcript_events_path: "out/r/transcript_events.jsonl".to_string(),
            elapsed_ms: 1,
            duration_ms: 1,
            iterations: 1,
            input_tokens: 1,
            output_tokens: 1,
            ..run_report("r", "python-add", "Python add repair", selector.clone(), true)
        };
        let summary = EvalSummary {
            schema_version: 2,
            fixture_ids: vec!["python-add".to_string()],
            fixtures: vec![FixtureReport {
                id: "python-add".to_string(),
                name: "Python add repair".to_string(),
                tool_sequence: "multi-tool".to_string(),
                description: "One-file Python bug fix verified by unittest output.".to_string(),
            }],
            output_dir: "out".to_string(),
            models: vec![selector],
            tool_formats: vec!["native".to_string()],
            env_keys_loaded: Vec::new(),
            total_runs: 1,
            passed_runs: 1,
            failed_runs: 0,
            skipped_runs: 0,
            diverged_comparisons: 0,
            total_cost_usd: 0.0,
            rollups: EvalRollups {
                by_fixture: vec![RollupReport {
                    key: "python-add".to_string(),
                    total_runs: 1,
                    passed_runs: 1,
                    failed_runs: 0,
                    skipped_runs: 0,
                    total_cost_usd: 0.0,
                }],
                by_provider: Vec::new(),
                by_model: Vec::new(),
                by_tool_format: Vec::new(),
                by_tool_sequence: Vec::new(),
            },
            runs: vec![run],
            comparisons: Vec::new(),
            followups: Vec::new(),
            step_judge_preset: None,
            run_label: String::new(),
            baseline_comparison: None,
        };
        let md = render_markdown(&summary);
        assert!(md.contains("a\\|b"));
    }

    #[test]
    fn baseline_comparison_reports_regressions_and_recoveries() {
        // Synthetic baseline summary.json — three fixtures: two passed,
        // one (`test-output-first`) failed.
        let tmp = tempfile::tempdir().expect("tempdir");
        let baseline_path = tmp.path().join("baseline_summary.json");
        let baseline = serde_json::json!({
            "schema_version": 2,
            "runs": [
                {"fixture_id": "python-add", "passed": true, "skipped": false},
                {"fixture_id": "cli-help-flag", "passed": true, "skipped": false},
                {"fixture_id": "test-output-first", "passed": false, "skipped": false},
            ],
        });
        std::fs::write(&baseline_path, serde_json::to_string(&baseline).unwrap())
            .expect("write baseline");

        // Cell run: cli-help-flag REGRESSED (was passing), test-output-first RECOVERED.
        let selector = mock_selector();
        let runs = vec![
            run_report("r1", "python-add", "Python add", selector.clone(), true),
            run_report("r2", "cli-help-flag", "CLI help flag", selector.clone(), false),
            run_report("r3", "test-output-first", "Test output first", selector, true),
        ];
        let comparison = load_baseline_comparison(&baseline_path, &runs).expect("compare");
        assert_eq!(comparison.regressions_count, 1);
        assert_eq!(comparison.regressions[0].fixture_id, "cli-help-flag");
        assert_eq!(comparison.recoveries_count, 1);
        assert_eq!(comparison.recoveries[0].fixture_id, "test-output-first");
        assert_eq!(comparison.unchanged_passes, vec!["python-add".to_string()]);
        assert_eq!(
            comparison.net_lift_pp, 0.0,
            "+1 recovery and -1 regression should net to 0pp lift across 3 compared fixtures"
        );
    }

    #[test]
    fn fixture_selection_supports_all_and_specific_ids() {
        let all = resolve_fixtures(&["all".to_string()]).expect("all fixtures resolve");
        assert_eq!(all.len(), FIXTURE_DEFINITIONS.len());

        // Duplicate ids collapse to a single entry, in first-seen order.
        let selected = resolve_fixtures(&[
            "python-add".to_string(),
            "python-add".to_string(),
            "read-only-audit".to_string(),
        ])
        .expect("specific fixtures resolve");
        assert_eq!(
            selected
                .iter()
                .map(|fixture| fixture.id)
                .collect::<Vec<_>>(),
            vec!["python-add", "read-only-audit"],
        );

        let error = resolve_fixtures(&["missing".to_string()]).expect_err("unknown fixture fails");
        assert!(error.contains("unsupported --fixture `missing`"));
    }

    #[test]
    fn matrix_max_runs_bounds_fixture_model_tool_product() {
        let fixtures = resolve_fixtures(&["all".to_string()]).expect("fixtures");
        let selectors = vec![mock_selector()];
        let tool_formats = vec!["native".to_string(), "text".to_string()];
        let matrix = build_matrix(&fixtures, &selectors, &tool_formats, Some(3));
        assert_eq!(matrix.len(), 3);
        assert_eq!(
            matrix
                .iter()
                .map(|(fixture, _selector, tool_format)| (fixture.id, tool_format.as_str()))
                .collect::<Vec<_>>(),
            vec![
                ("python-add", "native"),
                ("python-add", "text"),
                ("cli-help-flag", "native"),
            ],
        );

        let empty = build_matrix(&fixtures, &selectors, &tool_formats, Some(0));
        assert!(empty.is_empty());
    }
}
harn_cli/commands/eval_coding_agent.rs

harn_cli/commands/
eval_coding_agent.rs