harn-cli 0.8.91

mod fixtures;

use std::collections::{BTreeMap, HashSet};
use std::fs;
use std::path::{Path, PathBuf};

use harn_vm::clock::{Clock, RealClock};
use harn_vm::orchestration::{
    eval_pack_case_fingerprint, evaluate_eval_pack_manifest_resumable_with_live_executor,
    EvalPackCase, EvalPackCommandObject, EvalPackCommandSpec, EvalPackLiveExecutor,
    EvalPackLiveExecutorRequest, EvalPackLiveVerifyOutcome, EvalPackManifest,
};
use serde_json::Value as JsonValue;

use crate::cli::EvalCodingAgentArgs;
use crate::commands::eval_model_selector::{selector_is_local, selector_label, ModelSelector};
use crate::commands::local::runtime::{
    ollama_unload_model, snapshot_provider, LocalProviderSnapshot,
};
use crate::commands::run::{
    execute_run_with_sandbox_options, CliLlmMockMode, RunProfileOptions, RunSandboxOptions,
};

use super::{
    excerpt, non_empty_string_array, parse_last_json_line, read_run_summary, script_argv,
    string_array, tool_call_sequence, LocalCleanupReport, RunReport, CODING_AGENT_EVAL_PACK_ID,
    CODING_AGENT_SUITE_HARN, TOOL_FORMAT_OVERRIDE_WARNING_PREFIX,
};
#[cfg(test)]
pub(super) use fixtures::coding_agent_live_verify_cases;
pub(super) use fixtures::{
    fixture_description, fixture_id, fixture_name, fixture_tool_sequence, resolve_fixtures,
};

/// State captured before a run against a local provider, consumed afterwards by
/// `cleanup` to decide what (if anything) to do with the loaded model.
struct LocalRunGuard {
    /// Selector the run targets; its provider and model are copied into the cleanup report.
    selector: ModelSelector,
    /// When `false`, `cleanup` leaves the model alone and reports `left_running`.
    stop_after: bool,
    /// Provider snapshot taken in `before`; `None` when taking the snapshot failed,
    /// in which case `cleanup` produces no report.
    snapshot: Option<LocalProviderSnapshot>,
}

/// Run-level values that `report_from_summary` combines with the parsed summary
/// JSON to build a `RunReport`.
struct RunSummaryContext {
    /// Identifier of the run, copied into the report unchanged.
    run_id: String,
    /// Fixture the run exercised; supplies the fixture id, name, and expected tool sequence.
    fixture: EvalPackCase,
    /// Model selector of the run; also used to look up per-token pricing.
    selector: ModelSelector,
    /// Tool format label of the run, copied into the report unchanged.
    tool_format: String,
    /// Directory of the run; the report's output dir and transcript path derive from it.
    run_dir: PathBuf,
    /// Elapsed milliseconds measured by the caller; also the fallback for the
    /// report's `duration_ms` when the summary has no `duration_ms` value.
    elapsed_ms: u64,
    /// Exit code of the run; any non-zero value forces the report to `passed = false`.
    exit_code: i32,
    /// Captured stderr; only an excerpt of it is stored in the report.
    stderr: String,
    /// Local-provider cleanup result, if any, copied into the report unchanged.
    local_cleanup: Option<LocalCleanupReport>,
}

/// `EvalPackLiveExecutor` implementation that runs one coding-agent trial and
/// keeps the resulting `RunReport` so the caller can retrieve it afterwards.
struct CodingAgentLiveExecutor<'a> {
    /// CLI arguments forwarded to every trial.
    args: &'a EvalCodingAgentArgs,
    /// Fixture to run; cloned for each `execute` call.
    fixture: EvalPackCase,
    /// Model selector to run against; cloned for each `execute` call.
    selector: ModelSelector,
    /// Tool format label for the run; cloned for each `execute` call.
    tool_format: String,
    /// Identifier of the run; cloned for each `execute` call.
    run_id: String,
    /// Directory the trial works in; cloned for each `execute` call.
    run_dir: PathBuf,
    /// Report of the most recent `execute` call; stays `None` until `execute` runs.
    report: Option<RunReport>,
}

/// Runs a single (fixture, selector, tool format) combination and returns its report.
///
/// The run directory is `output_dir/<run_id>`. The function never fails: every
/// problem is folded into the returned `RunReport`:
/// - the run directory cannot be created, the manifest cannot be built, or the
///   eval pack evaluation returns an error → an `infra_error` report;
/// - the provider is not available → a `skipped` report;
/// - the evaluation succeeds without invoking the executor → a report rebuilt
///   from the `summary.json` already present in the run directory.
///
/// The active event log is reset for the duration of the evaluation and the
/// previously active log, if there was one, is re-installed afterwards.
pub(super) async fn run_matrix_entry(
    args: &EvalCodingAgentArgs,
    output_dir: &Path,
    fixture: EvalPackCase,
    selector: ModelSelector,
    tool_format: String,
) -> RunReport {
    let run_id = super::run_id_for(&fixture, &selector, &tool_format);
    let run_dir = output_dir.join(&run_id);

    match fs::create_dir_all(&run_dir) {
        Ok(()) => {}
        Err(error) => {
            let message = format!("failed to create run directory: {error}");
            return error_report(run_id, fixture, selector, tool_format, run_dir, message);
        }
    }

    if !provider_available(&selector) {
        let reason = format!(
            "provider `{}` has no configured credentials",
            selector.provider
        );
        return skipped_report(run_id, fixture, selector, tool_format, run_dir, reason);
    }

    let built = coding_agent_eval_pack_manifest(&run_dir, &fixture, &selector, &tool_format);
    let manifest = match built {
        Ok(manifest) => manifest,
        Err(message) => {
            return error_report(run_id, fixture, selector, tool_format, run_dir, message);
        }
    };

    // The executor gets its own copies; the originals are still needed below to
    // build the fallback and error reports.
    let mut executor = CodingAgentLiveExecutor {
        args,
        fixture: fixture.clone(),
        selector: selector.clone(),
        tool_format: tool_format.clone(),
        run_id: run_id.clone(),
        run_dir: run_dir.clone(),
        report: None,
    };

    // Evaluate with a freshly reset event log, then put the caller's log back.
    let saved_event_log = harn_vm::event_log::active_event_log();
    harn_vm::event_log::reset_active_event_log();
    let evaluation =
        evaluate_eval_pack_manifest_resumable_with_live_executor(&manifest, None, &mut executor);
    harn_vm::event_log::reset_active_event_log();
    if let Some(log) = saved_event_log {
        harn_vm::event_log::install_active_event_log(log);
    }

    if let Err(error) = evaluation {
        let message = format!("eval pack live-verify execution failed: {error}");
        return error_report(run_id, fixture, selector, tool_format, run_dir, message);
    }

    match executor.report {
        Some(report) => report,
        // The executor was never invoked, so fall back to the summary on disk.
        None => report_from_existing_summary(run_id, fixture, selector, tool_format, run_dir),
    }
}

async fn execute_coding_agent_trial(
    args: &EvalCodingAgentArgs,
    fixture: EvalPackCase,
    selector: ModelSelector,
    tool_format: String,
    run_id: String,
    run_dir: PathBuf,
) -> RunReport {
    if let Err(error) = prepare_run_dir_for_trial(&run_dir) {
        return error_report(run_id, fixture, selector, tool_format, run_dir, error);
    }

    let script_path = run_dir.join("coding_agent_suite.harn");
    if let Err(error) = fs::write(&script_path, CODING_AGENT_SUITE_HARN) {
        return error_report(
            run_id,
            fixture,
            selector,
            tool_format,
            run_dir,
            format!("failed to write benchmark harness: {error}"),
        );
    }

    let local_guard = LocalRunGuard::before(&selector, !args.keep_local_after_run).await;
    let argv = script_argv(args, &fixture, &selector, &tool_format, &run_dir);
    let clock = RealClock::new();
    let started_ms = clock.monotonic_ms();
    let outcome = execute_run_with_sandbox_options(
        &script_path.to_string_lossy(),
        false,
        HashSet::new(),
        argv,
        Vec::new(),
        CliLlmMockMode::Off,
        None,
        RunProfileOptions::default(),
        RunSandboxOptions::default().with_workspace_root(run_dir.clone()),
    )
    .await;
    if let Some(line) = tool_format_override_warning_line(&outcome.stderr) {
        eprintln!("{line}");
    }
    let elapsed_ms = clock
        .monotonic_ms()
        .saturating_sub(started_ms)
        .try_into()
        .unwrap_or(0);
    let local_cleanup = if let Some(guard) = local_guard {
        guard.cleanup().await
    } else {
        None
    };

    let summary_value =
        read_run_summary(&run_dir).or_else(|| parse_last_json_line(&outcome.stdout));
    let Some(summary) = summary_value else {
        return empty_run_report(
            run_id,
            fixture,
            selector,
            tool_format,
            run_dir,
            "infra_error",
            false,
            None,
            outcome.exit_code,
            Some("benchmark harness produced no summary JSON".to_string()),
            elapsed_ms,
            outcome.stderr,
            local_cleanup,
        );
    };

    report_from_summary(
        RunSummaryContext {
            run_id,
            fixture,
            selector,
            tool_format,
            run_dir,
            elapsed_ms,
            exit_code: outcome.exit_code,
            stderr: outcome.stderr,
            local_cleanup,
        },
        summary,
    )
}

impl EvalPackLiveExecutor for CodingAgentLiveExecutor<'_> {
    /// Runs one coding-agent trial synchronously and stores its report in `self.report`.
    ///
    /// The incoming request is ignored; the trial is fully described by the
    /// executor's own fields. The async trial is driven to completion on the
    /// current Tokio runtime from inside `block_in_place`.
    ///
    /// NOTE(review): Tokio documents `block_in_place` as panicking on a
    /// current-thread runtime — confirm callers always use a multi-threaded one.
    ///
    /// This implementation always returns `Ok`; trial failures are expressed in
    /// the returned outcome instead.
    fn execute(
        &mut self,
        _request: EvalPackLiveExecutorRequest,
    ) -> Result<EvalPackLiveVerifyOutcome, harn_vm::value::VmError> {
        let args = self.args;
        let fixture = self.fixture.clone();
        let selector = self.selector.clone();
        let tool_format = self.tool_format.clone();
        let run_id = self.run_id.clone();
        let run_dir = self.run_dir.clone();

        let report = tokio::task::block_in_place(|| {
            let handle = tokio::runtime::Handle::current();
            handle.block_on(execute_coding_agent_trial(
                args,
                fixture,
                selector,
                tool_format,
                run_id,
                run_dir,
            ))
        });

        let outcome = live_verify_outcome_from_run_report(&report);
        self.report = Some(report);
        Ok(outcome)
    }
}

/// Translates a `RunReport` into the outcome structure expected by the eval pack runner.
///
/// The verification label is `"skip"` for skipped runs, `"PASS"` for passed
/// runs, and `"FAIL"` otherwise. A run that neither passed nor was skipped
/// contributes exactly one failure message: the report's error, or a generic
/// message when no error was recorded. `produced_paths` lists only those of the
/// known artifact files that currently exist in the report's output directory.
fn live_verify_outcome_from_run_report(report: &RunReport) -> EvalPackLiveVerifyOutcome {
    let verdict = match (report.skipped, report.passed) {
        (true, _) => "skip",
        (false, true) => "PASS",
        (false, false) => "FAIL",
    };

    let failures = if report.passed || report.skipped {
        Vec::new()
    } else {
        let reason = match &report.error {
            Some(message) => message.clone(),
            None => "coding-agent fixture failed".to_string(),
        };
        vec![reason]
    };

    let artifact_dir = Path::new(&report.output_dir);
    let produced_paths = ["summary.json", "result.json", "transcript_events.jsonl"]
        .into_iter()
        .filter(|name| artifact_dir.join(name).exists())
        .map(str::to_string)
        .collect();

    let tool_call_summary = serde_json::json!({
        "total": report.tool_calls,
        "rejected": report.rejected_tool_calls,
        "sequence": report.tool_sequence,
        "successful": report.successful_tools,
    });

    EvalPackLiveVerifyOutcome {
        verification: Some(verdict.to_string()),
        verification_exit_code: Some(i64::from(report.harn_exit_code)),
        passed: Some(report.passed),
        timed_out: false,
        wall_time_seconds: report.elapsed_ms as f64 / 1000.0,
        cost_usd: report.cost_usd,
        produced_paths,
        tool_call_summary,
        failures,
        run_id: Some(report.run_id.clone()),
        workflow_id: Some(CODING_AGENT_EVAL_PACK_ID.to_string()),
        source_path: Some(report.transcript_events_path.clone()),
        stage_count: Some(report.transcript_event_count),
        ..EvalPackLiveVerifyOutcome::default()
    }
}

fn coding_agent_eval_pack_manifest(
    run_dir: &Path,
    fixture: &EvalPackCase,
    selector: &ModelSelector,
    tool_format: &str,
) -> Result<EvalPackManifest, String> {
    let mut case = fixture.clone();
    case.workspace = Some(".".to_string());
    case.case_fingerprint = eval_pack_case_fingerprint(&case).map_err(|error| error.to_string())?;
    let mut metadata = BTreeMap::new();
    metadata.insert(
        "harness".to_string(),
        JsonValue::String(CODING_AGENT_EVAL_PACK_ID.to_string()),
    );
    metadata.insert(
        "model".to_string(),
        JsonValue::String(format!("{}:{tool_format}", selector_label(selector))),
    );
    metadata.insert(
        "provider".to_string(),
        JsonValue::String(selector.provider.clone()),
    );
    metadata.insert(
        "provider_model".to_string(),
        JsonValue::String(selector.model.clone()),
    );
    metadata.insert(
        "tool_format".to_string(),
        JsonValue::String(tool_format.to_string()),
    );
    Ok(EvalPackManifest {
        version: 1,
        id: CODING_AGENT_EVAL_PACK_ID.to_string(),
        name: Some("Coding Agent Harness Quality Suite".to_string()),
        base_dir: Some(run_dir.display().to_string()),
        executor: Some(coding_agent_executor_spec()),
        trials: 1,
        cases: vec![case],
        metadata,
        ..EvalPackManifest::default()
    })
}

/// Returns the executor command spec placed in the coding-agent manifest: an
/// object-form spec whose only non-default field is the command name.
fn coding_agent_executor_spec() -> EvalPackCommandSpec {
    let command = String::from("harn-coding-agent-suite-in-process");
    let object = EvalPackCommandObject {
        command: Some(command),
        ..Default::default()
    };
    EvalPackCommandSpec::Object(object)
}

/// Resets `run_dir` so that a trial starts without leftovers from an earlier one.
///
/// Creates the directory (and any missing parents), then removes the harness
/// script, `summary.json`, `result.json`, `transcript_events.jsonl`, and the
/// `workspace` subdirectory if they are present. Other entries are left alone.
///
/// # Errors
///
/// Returns a message when the directory cannot be created or when an existing
/// artifact cannot be removed. An artifact that is already absent is not an error.
fn prepare_run_dir_for_trial(run_dir: &Path) -> Result<(), String> {
    fs::create_dir_all(run_dir).map_err(|error| error.to_string())?;
    for file in [
        "coding_agent_suite.harn",
        "summary.json",
        "result.json",
        "transcript_events.jsonl",
    ] {
        let path = run_dir.join(file);
        // Remove unconditionally and tolerate `NotFound` rather than probing with
        // `exists()` first: `exists()` follows symlinks, so a dangling symlink
        // would look absent, survive the reset, and later be written through.
        match fs::remove_file(&path) {
            Ok(()) => {}
            Err(error) if error.kind() == std::io::ErrorKind::NotFound => {}
            Err(error) => {
                return Err(format!("failed to remove {}: {error}", path.display()));
            }
        }
    }
    let workspace = run_dir.join("workspace");
    match fs::remove_dir_all(&workspace) {
        Ok(()) => {}
        Err(error) if error.kind() == std::io::ErrorKind::NotFound => {}
        Err(error) => {
            return Err(format!(
                "failed to remove {}: {error}",
                workspace.display()
            ));
        }
    }
    Ok(())
}

/// Returns the first line of `stderr` that, once surrounding whitespace is
/// trimmed, begins with the tool-format override warning prefix.
///
/// The returned slice is the trimmed line; `None` means no line matched.
pub(super) fn tool_format_override_warning_line(stderr: &str) -> Option<&str> {
    for raw_line in stderr.lines() {
        let line = raw_line.trim();
        if line.starts_with(TOOL_FORMAT_OVERRIDE_WARNING_PREFIX) {
            return Some(line);
        }
    }
    None
}

/// Rebuilds a `RunReport` from the `summary.json` already present in `run_dir`.
///
/// Used when the eval pack evaluation finished without invoking the executor.
/// No process ran here, so the exit code is synthesized from the summary's
/// `passed` flag (0 when true, 1 otherwise), elapsed time comes from the
/// summary's `duration_ms` (0 when absent), and stderr is empty.
///
/// A missing summary yields an `infra_error` report.
fn report_from_existing_summary(
    run_id: String,
    fixture: EvalPackCase,
    selector: ModelSelector,
    tool_format: String,
    run_dir: PathBuf,
) -> RunReport {
    let summary = match read_run_summary(&run_dir) {
        Some(summary) => summary,
        None => {
            let message =
                "eval pack skipped a completed ledger cell, but summary.json was missing"
                    .to_string();
            return error_report(run_id, fixture, selector, tool_format, run_dir, message);
        }
    };

    let summary_passed = summary
        .get("passed")
        .and_then(JsonValue::as_bool)
        .unwrap_or(false);
    let exit_code = if summary_passed { 0 } else { 1 };
    let elapsed_ms = summary
        .get("duration_ms")
        .and_then(JsonValue::as_u64)
        .unwrap_or(0);

    let ctx = RunSummaryContext {
        run_id,
        fixture,
        selector,
        tool_format,
        run_dir,
        elapsed_ms,
        exit_code,
        stderr: String::new(),
        local_cleanup: None,
    };
    report_from_summary(ctx, summary)
}

fn report_from_summary(ctx: RunSummaryContext, summary: JsonValue) -> RunReport {
    let passed = summary
        .get("passed")
        .and_then(JsonValue::as_bool)
        .unwrap_or(false)
        && ctx.exit_code == 0;
    let input_tokens = summary
        .pointer("/llm/input_tokens")
        .and_then(JsonValue::as_i64)
        .unwrap_or(0);
    let output_tokens = summary
        .pointer("/llm/output_tokens")
        .and_then(JsonValue::as_i64)
        .unwrap_or(0);
    let pricing = harn_vm::llm::llm_pricing_per_1k(&ctx.selector.provider, &ctx.selector.model);
    let cost_usd = pricing
        .map(|(input, output)| {
            (input_tokens.max(0) as f64 * input + output_tokens.max(0) as f64 * output) / 1000.0
        })
        .unwrap_or(0.0);
    let status = if passed {
        "passed".to_string()
    } else if ctx.exit_code == 0 {
        "failed".to_string()
    } else {
        summary
            .get("status")
            .and_then(JsonValue::as_str)
            .unwrap_or("failed")
            .to_string()
    };
    RunReport {
        run_id: ctx.run_id,
        fixture_id: fixture_id(&ctx.fixture).to_string(),
        fixture_name: fixture_name(&ctx.fixture),
        fixture_tool_sequence: fixture_tool_sequence(&ctx.fixture),
        selector: ctx.selector,
        tool_format: ctx.tool_format,
        status,
        passed,
        skipped: false,
        skipped_reason: None,
        output_dir: ctx.run_dir.display().to_string(),
        transcript_events_path: ctx
            .run_dir
            .join("transcript_events.jsonl")
            .display()
            .to_string(),
        workspace_root: summary
            .get("workspace_root")
            .and_then(JsonValue::as_str)
            .map(str::to_string),
        elapsed_ms: ctx.elapsed_ms,
        duration_ms: summary
            .get("duration_ms")
            .and_then(JsonValue::as_u64)
            .unwrap_or(ctx.elapsed_ms),
        iterations: summary
            .pointer("/llm/iterations")
            .and_then(JsonValue::as_i64)
            .unwrap_or(0),
        input_tokens,
        output_tokens,
        cost_usd,
        pricing_known: pricing.is_some(),
        tool_calls: summary
            .pointer("/tools/calls")
            .and_then(JsonValue::as_array)
            .map(Vec::len)
            .unwrap_or(0),
        rejected_tool_calls: summary
            .pointer("/tools/rejected")
            .and_then(JsonValue::as_array)
            .map(Vec::len)
            .unwrap_or(0),
        tool_sequence: tool_call_sequence(summary.pointer("/tools/calls"))
            .or_else(|| non_empty_string_array(summary.pointer("/tools/successful")))
            .unwrap_or_default(),
        successful_tools: string_array(summary.pointer("/tools/successful")),
        transcript_event_count: summary
            .get("transcript_event_count")
            .and_then(JsonValue::as_u64)
            .unwrap_or(0) as usize,
        verification_success: summary
            .pointer("/verification/success")
            .and_then(JsonValue::as_bool)
            .unwrap_or(false),
        harn_exit_code: ctx.exit_code,
        error: (!passed).then(|| {
            summary
                .get("status")
                .and_then(JsonValue::as_str)
                .unwrap_or("benchmark failed")
                .to_string()
        }),
        stderr_excerpt: excerpt(&ctx.stderr),
        local_cleanup: ctx.local_cleanup,
    }
}

impl LocalRunGuard {
    /// Creates a guard for `selector` when it targets a local provider.
    ///
    /// Returns `None` for non-local selectors. For local ones a provider
    /// snapshot is attempted; a failed snapshot is stored as `None` rather than
    /// reported as an error.
    async fn before(selector: &ModelSelector, stop_after: bool) -> Option<Self> {
        if selector_is_local(selector) {
            let probe = snapshot_provider(&selector.provider, Path::new(".")).await;
            Some(Self {
                selector: selector.clone(),
                stop_after,
                snapshot: probe.ok(),
            })
        } else {
            None
        }
    }

    /// Decides what to do with the model after the run and reports the action taken.
    ///
    /// Returns `None` when no snapshot was captured. Otherwise the report's
    /// action is one of:
    /// - `not_applicable` — the provider is not `ollama`;
    /// - `left_running` — `stop_after` is `false`;
    /// - `left_preexisting` — the model was already loaded before the run;
    /// - `unloaded` / `unload_failed` — an unload was attempted.
    async fn cleanup(self) -> Option<LocalCleanupReport> {
        let Self {
            selector,
            stop_after,
            snapshot,
        } = self;
        let snapshot = snapshot?;

        let (initially_loaded, action, detail) = if selector.provider != "ollama" {
            (
                false,
                "not_applicable",
                Some(
                    "non-Ollama local providers are only stopped when Harn launched a managed server"
                        .to_string(),
                ),
            )
        } else {
            let initially_loaded = snapshot
                .loaded_models
                .iter()
                .any(|loaded| loaded.name == selector.model);
            if !stop_after {
                (
                    initially_loaded,
                    "left_running",
                    Some("--keep-local-after-run".to_string()),
                )
            } else if initially_loaded {
                (initially_loaded, "left_preexisting", None)
            } else {
                match ollama_unload_model(&snapshot.base_url, &selector.model).await {
                    Ok(()) => (initially_loaded, "unloaded", None),
                    Err(error) => (initially_loaded, "unload_failed", Some(error)),
                }
            }
        };

        Some(LocalCleanupReport {
            provider: selector.provider,
            model: selector.model,
            initially_loaded,
            action: action.to_string(),
            detail,
        })
    }
}

/// Builds an `infra_error` report: not skipped, exit code 1, zero elapsed time,
/// empty stderr, no local cleanup, and `error` as the failure message.
fn error_report(
    run_id: String,
    fixture: EvalPackCase,
    selector: ModelSelector,
    tool_format: String,
    run_dir: PathBuf,
    error: String,
) -> RunReport {
    let status = "infra_error";
    let skipped = false;
    let exit_code = 1;
    let elapsed_ms = 0;
    empty_run_report(
        run_id,
        fixture,
        selector,
        tool_format,
        run_dir,
        status,
        skipped,
        None,
        exit_code,
        Some(error),
        elapsed_ms,
        String::new(),
        None,
    )
}

/// Builds a `skipped` report: exit code 0, no error, zero elapsed time, empty
/// stderr, no local cleanup, and `reason` recorded as the skip reason.
fn skipped_report(
    run_id: String,
    fixture: EvalPackCase,
    selector: ModelSelector,
    tool_format: String,
    run_dir: PathBuf,
    reason: String,
) -> RunReport {
    let status = "skipped";
    let skipped = true;
    let exit_code = 0;
    let elapsed_ms = 0;
    empty_run_report(
        run_id,
        fixture,
        selector,
        tool_format,
        run_dir,
        status,
        skipped,
        Some(reason),
        exit_code,
        None,
        elapsed_ms,
        String::new(),
        None,
    )
}

/// Builds a `RunReport` for a run that produced no usable summary.
///
/// Identity fields, status, skip information, exit code, error, elapsed time,
/// a stderr excerpt, and local cleanup come from the arguments. Every metric
/// that would normally come from the summary is zeroed or empty, and `passed`
/// is always `false`.
// One argument per report field that varies between the callers in this file.
#[allow(clippy::too_many_arguments)]
fn empty_run_report(
    run_id: String,
    fixture: EvalPackCase,
    selector: ModelSelector,
    tool_format: String,
    run_dir: PathBuf,
    status: &str,
    skipped: bool,
    skipped_reason: Option<String>,
    harn_exit_code: i32,
    error: Option<String>,
    elapsed_ms: u64,
    stderr: String,
    local_cleanup: Option<LocalCleanupReport>,
) -> RunReport {
    let output_dir = run_dir.display().to_string();
    let transcript_events_path = run_dir
        .join("transcript_events.jsonl")
        .display()
        .to_string();
    let stderr_excerpt = excerpt(&stderr);

    RunReport {
        // Identity of the run.
        run_id,
        fixture_id: fixture_id(&fixture).to_string(),
        fixture_name: fixture_name(&fixture),
        fixture_tool_sequence: fixture_tool_sequence(&fixture),
        selector,
        tool_format,
        // Outcome as supplied by the caller.
        status: status.to_string(),
        passed: false,
        skipped,
        skipped_reason,
        output_dir,
        transcript_events_path,
        workspace_root: None,
        elapsed_ms,
        // Metrics that only a summary could provide.
        duration_ms: 0,
        iterations: 0,
        input_tokens: 0,
        output_tokens: 0,
        cost_usd: 0.0,
        pricing_known: false,
        tool_calls: 0,
        rejected_tool_calls: 0,
        tool_sequence: Vec::new(),
        successful_tools: Vec::new(),
        transcript_event_count: 0,
        verification_success: false,
        harn_exit_code,
        error,
        stderr_excerpt,
        local_cleanup,
    }
}

/// Reports whether a run against `selector` can be attempted.
///
/// The `mock` and `fake` providers and any local selector are always treated
/// as available; every other provider is available only when
/// `provider_key_available` says so.
fn provider_available(selector: &ModelSelector) -> bool {
    let provider = selector.provider.as_str();
    let always_available =
        provider == "mock" || provider == "fake" || selector_is_local(selector);
    always_available || harn_vm::llm_config::provider_key_available(&selector.provider)
}