agentcarousel 0.5.2

use agentcarousel_core::{
    judge_key_candidates, judge_provider_from_model, CaseStatus, CertificationContext,
    JudgeProvider,
};
use agentcarousel_fixtures::load_fixture;
use agentcarousel_reporters::{persist_run, print_json, print_junit, print_terminal};
use agentcarousel_runner::{run_eval, EvalConfig, GenerationMode, GeneratorProvider, RunnerConfig};
use clap::{ArgAction, Parser, ValueEnum};
use console::style;
use std::io::{stderr, IsTerminal};
use std::path::PathBuf;

use super::config::{config_hash, ResolvedConfig};
use super::exit_codes::ExitCode;
use super::fixture_utils::{
    apply_case_filter, apply_tag_filter, collect_fixture_paths, default_concurrency,
};
use super::GlobalOptions;

// How `agc eval` obtains model output; chosen with `--execution-mode` / `-x`.
// Plain `//` comments are used here on purpose: clap's `ValueEnum` derive would
// surface `///` variant docs in `--help` output.
#[derive(Debug, Clone, ValueEnum)]
enum EvalExecutionMode {
    // Mapped to `GenerationMode::MockOnly` by `run_eval_command`.
    Mock,
    // Mapped to `GenerationMode::Live`; `run_eval_command` requires a generator
    // API key in the environment before starting a live run.
    Live,
}

/// Run evaluation with mock or live generation; optionally score with an LLM judge.
// NOTE: every `///` comment in this struct is rendered by clap as `--help` text,
// so maintainer-only remarks below use plain `//` comments.
#[derive(Debug, Parser)]
#[command(
    after_help = "Examples:\n  agc eval fixtures/skills/customer-support.yaml          # mock, rules evaluator\n  agc eval fixtures/ --execution-mode live               # live generation\n  agc eval fixtures/ --evaluator all --judge             # mixed rules+judge fixtures\n  agc eval fixtures/ --evaluator judge --judge           # force judge scoring on all cases\n  agc eval fixtures/ --filter-tags smoke --format json   # CI-friendly output"
)]
pub struct EvalArgs {
    /// Fixture files or dirs (default: fixtures).
    #[arg(value_name = "PATHS", default_value = "fixtures")]
    paths: Vec<PathBuf>,
    /// Config file path (default: agentcarousel.toml in the current directory).
    #[arg(long)]
    pub config: Option<PathBuf>,
    /// Override the run id stored in the history DB for this run.
    #[arg(long)]
    pub run_id: Option<String>,
    /// Number of times to run each case (use >1 for flakiness detection).
    #[arg(short = 'n', long, default_value_t = 1)]
    runs: u32,
    /// Random seed for mock generation (0 = deterministic default).
    #[arg(short = 's', long, default_value_t = 0)]
    seed: u64,
    /// `rules` | `golden` | `process` | `judge` | `all` (default `rules` uses config; `all` uses each case’s `evaluator_config` in YAML).
    ///
    /// - **`judge`** — every run case is scored with the judge (ignores per-case evaluator choice).
    /// - **`all`** — each case uses its fixture’s evaluator; **only `judge` cases call the judge API** (use with `--judge` and keys). Required for mixed rules/golden/judge fixtures.
    // Kept as a free-form string: `run_eval_command` only compares it against
    // "rules" (replaced by the config default), "judge" and "all"; any other
    // value is forwarded unchanged in `EvalConfig::evaluator`.
    #[arg(short = 'e', long, default_value = "rules")]
    evaluator: String,
    /// Enable the LLM judge for judge-scored cases (requires API keys; errors list env vars if missing).
    ///
    /// Useless unless the active mode can select judge: **`--evaluator judge`** (all cases judged) or **`--evaluator all`** (only cases with `evaluator: judge` in YAML).
    #[arg(short = 'j', long)]
    judge: bool,
    /// Model to use for judge scoring (overrides config `judge.model`).
    #[arg(short = 'J', long)]
    judge_model: Option<String>,
    /// `mock` (default) or `live` — whether to call a real generator API.
    #[arg(short = 'x', long, value_enum, default_value_t = EvalExecutionMode::Mock)]
    execution_mode: EvalExecutionMode,
    /// Generator model name (overrides config `generator.model`).
    #[arg(short = 'm', long)]
    model: Option<String>,
    /// Omit `max_tokens` from generator and judge requests (rejected when the generator or the selected judge is an Anthropic model).
    #[arg(short = 'M', long)]
    disable_max_tokens: bool,
    /// Maximum number of cases to run in parallel.
    // When neither this flag nor the config sets a value, live runs use 1 and
    // mock runs fall back to `default_concurrency()` (see `run_eval_command`).
    #[arg(short = 'c', long)]
    concurrency: Option<usize>,
    /// Per-case timeout in seconds.
    #[arg(short = 't', long)]
    timeout: Option<u64>,
    /// Output format: `human` (default), `json`, or `junit`.
    // Free-form string: only "json" and "junit" are special-cased by
    // `run_eval_command`; every other value renders the terminal report. When
    // the flag is absent the value comes from `config.output.format`.
    #[arg(short = 'f', long)]
    format: Option<String>,
    /// Glob matched against full case ids (`skill/case-id`). Example: `my-skill/judge-*` to run only judge-named cases; combine with `--evaluator all --judge`.
    #[arg(short = 'F', long)]
    filter: Option<String>,
    /// Comma-separated tags; keep only cases having any listed tag. Tag judge-only rows (e.g. `judge`) and pass `--filter-tags judge` to skip rules/golden cases.
    #[arg(long = "filter-tags", value_name = "TAG", value_delimiter = ',')]
    filter_tags: Option<Vec<String>>,
    /// Certification context for audit metadata: `local`, `msp`, or `ci`.
    #[arg(short = 'C', long)]
    certification_context: Option<CliCertificationContext>,
    /// Carousel iteration number stamped into the run record for multi-iteration sweeps.
    #[arg(short = 'i', long)]
    carousel_iteration: Option<u32>,
    /// Policy version string stamped into the run record (e.g. `v1.2`).
    #[arg(short = 'p', long)]
    policy_version: Option<String>,
    /// Show a case-level progress bar on stderr (default: on for non-JSON/JUnit output when stderr is a TTY; use with `--format json` so only stderr shows progress).
    #[arg(short = 'P', long, action = ArgAction::SetTrue)]
    progress: bool,
    /// Never show the eval case progress bar.
    // Takes precedence over `--progress` when both are given (and the global
    // quiet flag also disables the bar); see `show_progress` in `run_eval_command`.
    #[arg(short = 'N', long, action = ArgAction::SetTrue)]
    no_progress: bool,
    /// Cancel the entire run after N seconds (per-case --timeout still applies per case).
    #[arg(long)]
    timeout_run: Option<u64>,
    /// When set with --evaluator golden, write actual outputs to golden files instead of failing.
    #[arg(long)]
    update_golden: bool,
    /// Base URL for a custom agent endpoint (required when --model is 'custom').
    #[arg(long)]
    generator_endpoint: Option<String>,
}

// CLI-facing mirror of `agentcarousel_core::CertificationContext`, parsed from
// `--certification-context` and converted through the `From` impl below.
// Plain `//` comments are used because clap's `ValueEnum` derive would surface
// `///` variant docs in `--help` output.
#[derive(Debug, Clone, ValueEnum)]
enum CliCertificationContext {
    Local,
    Msp,
    Ci,
}

/// Converts the clap-parsed certification context into the core type.
///
/// The mapping is one-to-one by variant name.
impl From<CliCertificationContext> for CertificationContext {
    fn from(cli_context: CliCertificationContext) -> Self {
        match cli_context {
            CliCertificationContext::Local => Self::Local,
            CliCertificationContext::Msp => Self::Msp,
            CliCertificationContext::Ci => Self::Ci,
        }
    }
}

/// Runs the `eval` subcommand end to end and returns the process exit code.
///
/// Steps, in order:
/// 1. Load every fixture found under `args.paths`, applying `--filter` and
///    `--filter-tags` to each one.
/// 2. Preflight the judge and generator settings (required flags, API keys in
///    the environment, `--disable-max-tokens` compatibility).
/// 3. Build the `RunnerConfig` / `EvalConfig`; CLI values override `config`.
/// 4. Execute the run on a multi-threaded Tokio runtime, persist it, and print
///    it in the requested format.
///
/// # Returns
/// * `ExitCode::ConfigError` if a fixture fails to load or a preflight check fails.
/// * `ExitCode::Failed` if `has_eval_failures` reports a failing case.
/// * `ExitCode::Ok` otherwise.
///
/// A failure to persist the run is reported as a warning on stderr and does not
/// change the exit code; in that case the "run id / next: report show" hint is
/// skipped.
///
/// # Panics
/// Panics if the Tokio runtime cannot be built.
pub fn run_eval_command(args: EvalArgs, config: &ResolvedConfig, globals: &GlobalOptions) -> i32 {
    // `-vv` (or more) switches on judge debugging through an environment variable.
    if globals.verbose >= 2 {
        std::env::set_var("AGENTCAROUSEL_DEBUG_JUDGE", "1");
    }

    // --- 1. Load and filter fixtures; the first load error aborts the command. ---
    let fixture_paths = collect_fixture_paths(&args.paths);
    let mut fixtures = Vec::new();
    for path in fixture_paths {
        match load_fixture(&path) {
            Ok(fixture) => {
                let fixture = apply_case_filter(fixture, args.filter.as_deref());
                let fixture = apply_tag_filter(fixture, args.filter_tags.as_deref());
                fixtures.push(fixture);
            }
            Err(err) => {
                eprintln!("error: failed to load fixture {}: {err}", path.display());
                return ExitCode::ConfigError.as_i32();
            }
        }
    }

    // --- 2. Preflight checks. ---
    // NOTE(review): judge selection looks at the raw `--evaluator` value, while the
    // evaluator actually handed to `EvalConfig` below may come from
    // `config.eval.default_evaluator` (when the flag is left at "rules"). A config
    // default of "judge"/"all" therefore bypasses these checks — confirm intended.
    let judge_selected = is_judge_selected(&args, &fixtures);
    if judge_selected && !args.judge {
        eprintln!("error: judge evaluator selected; rerun with --judge");
        return ExitCode::ConfigError.as_i32();
    }
    let judge_model = args
        .judge_model
        .clone()
        .unwrap_or_else(|| config.judge.model.clone());
    let judge_provider = judge_provider_from_model(&judge_model);
    if judge_selected && resolve_judge_key(judge_provider).is_none() {
        eprintln!(
            "error: set one of {} to run --judge for model '{}'\n  tip: {}",
            judge_key_candidates(judge_provider).join(", "),
            judge_model,
            key_example(judge_key_candidates(judge_provider))
        );
        return ExitCode::ConfigError.as_i32();
    }
    // Past the check above `judge_selected` implies `args.judge`, so this is
    // effectively `judge_selected`; the conjunction is kept for clarity.
    let judge_enabled = args.judge && judge_selected;
    let generator_model = args
        .model
        .clone()
        .unwrap_or_else(|| config.generator.model.clone());
    let generator_provider = GeneratorProvider::from_model(&generator_model);
    if args.disable_max_tokens
        && (matches!(generator_provider, GeneratorProvider::Anthropic)
            || (judge_selected && matches!(judge_provider, JudgeProvider::Anthropic)))
    {
        eprintln!("error: --disable-max-tokens is not supported with Anthropic models");
        return ExitCode::ConfigError.as_i32();
    }
    if matches!(args.execution_mode, EvalExecutionMode::Live)
        && resolve_generator_key(generator_provider).is_none()
    {
        eprintln!(
            "error: set one of {} to run live generation for model '{}'\n  tip: {}",
            generator_key_candidates(generator_provider).join(", "),
            generator_model,
            key_example(generator_key_candidates(generator_provider))
        );
        return ExitCode::ConfigError.as_i32();
    }
    let generation_mode = match args.execution_mode {
        EvalExecutionMode::Mock => GenerationMode::MockOnly,
        EvalExecutionMode::Live => GenerationMode::Live,
    };

    if globals.verbose > 0 {
        eprintln!(
            "debug: eval setup mode={:?} generator_model={} judge_model={} judge_enabled={} fixtures={}",
            generation_mode,
            generator_model,
            judge_model,
            judge_enabled,
            fixtures.len()
        );
    }

    // --- 3. Resolve settings: CLI flag, then config, then built-in default. ---
    // Live runs default to a single worker unless the user or the config asked
    // for more; mock runs fall back to `default_concurrency()`.
    let concurrency = if matches!(generation_mode, GenerationMode::Live)
        && args.concurrency.is_none()
        && config.runner.concurrency.is_none()
    {
        1
    } else {
        args.concurrency
            .or(config.runner.concurrency)
            .or_else(default_concurrency)
            .unwrap_or(1)
    };
    let format = args
        .format
        .clone()
        .unwrap_or_else(|| config.output.format.clone());
    // json/junit own stdout, so human-oriented extras are suppressed or sent to stderr.
    let machine_format = matches!(format.as_str(), "json" | "junit");
    let show_progress = !args.no_progress
        && !globals.quiet
        && (args.progress || (!machine_format && stderr().is_terminal()));
    if !globals.quiet && !machine_format && args.judge && !judge_enabled {
        eprintln!(
            "{} --judge is set but the judge evaluator is not active (--evaluator is {:?}). \
For fixtures like customer-support that set judge per case, use --evaluator all (and keep --judge).",
            style("hint:").yellow().bold(),
            args.evaluator
        );
    }
    let runner = RunnerConfig {
        concurrency,
        timeout_secs: args.timeout.unwrap_or(config.runner.timeout_secs),
        run_timeout_secs: args.timeout_run,
        // Live generation always runs with `offline` off, whatever the config says.
        offline: if matches!(generation_mode, GenerationMode::Live) {
            false
        } else {
            config.runner.offline
        },
        mock_dir: config.runner.mock_dir.clone(),
        generation_mode,
        generator_model: Some(generator_model),
        generator_max_tokens: if args.disable_max_tokens {
            None
        } else {
            config.generator.max_tokens
        },
        generator_endpoint: args.generator_endpoint.clone(),
        fail_fast: false,
        // NOTE(review): this variable name is mixed-case, unlike
        // `AGENTCAROUSEL_DEBUG_JUDGE` above; env var names are case-sensitive on
        // Unix — confirm the spelling is intentional before changing it.
        mock_strict: std::env::var("agentcarousel_MOCK_STRICT").ok().as_deref() == Some("1"),
        command: "eval".to_string(),
        agentcarousel_version: env!("CARGO_PKG_VERSION").to_string(),
        config_hash: config_hash(config),
        run_id: args.run_id.clone(),
    };

    let eval_config = EvalConfig {
        runner,
        runs: args.runs,
        seed: args.seed,
        // The CLI default "rules" means "use whatever the config says".
        evaluator: if args.evaluator == "rules" {
            config.eval.default_evaluator.clone()
        } else {
            args.evaluator
        },
        judge: judge_enabled,
        judge_model: Some(judge_model),
        effectiveness_threshold: config.eval.effectiveness_threshold,
        judge_max_tokens: if args.disable_max_tokens {
            None
        } else {
            config.judge.max_tokens
        },
        certification_context: args.certification_context.map(Into::into),
        carousel_iteration: args.carousel_iteration,
        policy_version: args.policy_version,
        progress: show_progress,
        update_golden: args.update_golden,
    };

    // --- 4. Execute, persist, report. ---
    let runtime = tokio::runtime::Builder::new_multi_thread()
        .enable_io()
        .enable_time()
        .build()
        .expect("tokio runtime");
    let run = runtime.block_on(run_eval(fixtures, eval_config));

    // Persistence is best-effort, but a failure must be visible: otherwise the
    // "report show <id>" hint below would point at a run that was never saved.
    let persisted = match persist_run(&run) {
        Ok(_) => true,
        Err(err) => {
            eprintln!(
                "warning: failed to persist run {}: {err}",
                run.id.0.as_str()
            );
            false
        }
    };

    match format.as_str() {
        "json" => print_json(&run),
        "junit" => print_junit(&run),
        _ => {
            if globals.quiet {
                agentcarousel_reporters::print_terminal_summary(&run);
            } else {
                print_terminal(&run);
            }
        }
    }

    if globals.quiet || machine_format {
        // Always stderr here so json/junit on stdout stays machine-readable.
        if persisted {
            print_eval_saved_run_hint(&run, true);
        }
    } else {
        print_postflight_hints(&run);
    }

    if has_eval_failures(&run, config.eval.effectiveness_threshold) {
        ExitCode::Failed.as_i32()
    } else {
        ExitCode::Ok.as_i32()
    }
}

/// Reports whether any case in `run` should make the command exit non-zero.
///
/// A case counts as a failure when any of the following holds:
/// * its status is `Failed`, `TimedOut`, `Error` or `Flaky`;
/// * its effectiveness score is strictly below `threshold`;
/// * it carries no eval scores at all (treated as a failure, i.e. fail-closed).
fn has_eval_failures(run: &agentcarousel_core::Run, threshold: f32) -> bool {
    for case in run.cases.iter() {
        let bad_status = matches!(
            case.status,
            CaseStatus::Failed | CaseStatus::TimedOut | CaseStatus::Error | CaseStatus::Flaky
        );
        if bad_status {
            return true;
        }

        // Missing scores default to "failing".
        let below_threshold = case
            .eval_scores
            .as_ref()
            .map_or(true, |scores| scores.effectiveness_score < threshold);
        if below_threshold {
            return true;
        }
    }
    false
}

/// Returns the file stem of the running executable, for use in printed hints.
///
/// Falls back to `"agentcarousel"` when the executable path cannot be
/// determined or has no file stem. Non-UTF-8 stems are converted lossily.
fn cli_invocation_name() -> String {
    if let Ok(exe_path) = std::env::current_exe() {
        if let Some(stem) = exe_path.file_stem() {
            return stem.to_string_lossy().into_owned();
        }
    }
    "agentcarousel".to_string()
}

/// Prints the run id and a follow-up `report show` command for `run`.
///
/// Two lines are written: `run id: <id>` and `next: <bin> report show <id>`,
/// where `<bin>` comes from `cli_invocation_name()`. They go to stderr when
/// `to_stderr` is true and to stdout otherwise.
fn print_eval_saved_run_hint(run: &agentcarousel_core::Run, to_stderr: bool) {
    let program = cli_invocation_name();
    let run_id = run.id.0.as_str();
    let hint_lines = [
        format!("run id: {run_id}"),
        format!("next: {program} report show {run_id}"),
    ];
    for line in &hint_lines {
        if to_stderr {
            eprintln!("{line}");
        } else {
            println!("{line}");
        }
    }
}

/// Prints follow-up hints to stdout based on the run summary.
///
/// * If the combined count of provider 429/500/503/504 errors is non-zero,
///   suggests rerunning or lowering concurrency.
/// * If any case errored or timed out, suggests `--verbose` or `--format json`.
fn print_postflight_hints(run: &agentcarousel_core::Run) {
    let summary = &run.summary;

    let counts = &summary.provider_errors;
    let provider_error_total =
        counts.status_429 + counts.status_500 + counts.status_503 + counts.status_504;
    if provider_error_total > 0 {
        let label = style("hint:").yellow();
        println!(
            "{} provider errors detected; consider rerunning or lowering concurrency",
            label
        );
    }

    let needs_diagnostics = summary.errored > 0 || summary.timed_out > 0;
    if needs_diagnostics {
        let label = style("hint:").yellow();
        println!(
            "{} use --verbose for diagnostics or --format json to inspect outputs",
            label
        );
    }
}

/// Builds a shell `export` example for the first candidate key name.
///
/// Returns `export <first>=your_key_here`, or a generic
/// `export YOUR_API_KEY=your_key_here` when `keys` is empty.
fn key_example(keys: &[&str]) -> String {
    match keys {
        [primary, ..] => format!("export {primary}=your_key_here"),
        [] => "export YOUR_API_KEY=your_key_here".to_string(),
    }
}

/// Decides whether this invocation needs the LLM judge.
///
/// * `--evaluator judge`: always true.
/// * `--evaluator all`: true only if at least one loaded case has an
///   `evaluator_config` whose `evaluator` equals `"judge"`.
/// * Any other value: false.
fn is_judge_selected(args: &EvalArgs, fixtures: &[agentcarousel_core::FixtureFile]) -> bool {
    match args.evaluator.as_str() {
        "judge" => true,
        "all" => fixtures.iter().any(|fixture| {
            fixture.cases.iter().any(|case| {
                case.evaluator_config
                    .as_ref()
                    .map_or(false, |cfg| cfg.evaluator == "judge")
            })
        }),
        _ => false,
    }
}

/// Returns the value of the first judge API-key environment variable that is set.
///
/// Candidates come from `judge_key_candidates(provider)` and are tried in order.
/// Variables that are unset or not valid Unicode are skipped; `None` means none
/// of them yielded a value.
fn resolve_judge_key(provider: JudgeProvider) -> Option<String> {
    for name in judge_key_candidates(provider).iter() {
        if let Ok(value) = std::env::var(name) {
            return Some(value);
        }
    }
    None
}

/// Returns the environment variable names that may hold the API key for
/// `provider`, as reported by `GeneratorProvider::key_candidates`.
///
/// Thin free-function wrapper so generator call sites read like the
/// `judge_key_candidates` ones.
fn generator_key_candidates(provider: GeneratorProvider) -> &'static [&'static str] {
    provider.key_candidates()
}

/// Returns the value of the first generator API-key environment variable that is set.
///
/// Candidates come from `provider.key_candidates()` and are tried in order.
/// Variables that are unset or not valid Unicode are skipped; `None` means none
/// of them yielded a value.
fn resolve_generator_key(provider: GeneratorProvider) -> Option<String> {
    for name in provider.key_candidates().iter() {
        if let Ok(value) = std::env::var(name) {
            return Some(value);
        }
    }
    None
}