Skip to main content

kaizen/eval/
engine.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2use crate::core::config::EvalConfig;
3use crate::core::event::SessionRecord;
4use crate::eval::judge::judge_session;
5use crate::eval::rubric;
6use crate::eval::types::EvalRow;
7use crate::store::sqlite::Store;
8use anyhow::{Context, Result, bail};
9use std::path::Path;
10
11pub fn run_evals(
12    store: &Store,
13    cfg: &EvalConfig,
14    workspace: &Path,
15    since_ms: u64,
16    dry_run: bool,
17) -> Result<Vec<EvalRow>> {
18    if !cfg.enabled {
19        return Ok(vec![]);
20    }
21    let rubric =
22        rubric::by_id(&cfg.rubric).with_context(|| format!("unknown rubric: {}", cfg.rubric))?;
23    let api_key = resolve_api_key(cfg);
24    if api_key.is_empty() {
25        bail!("eval.api_key not set and ANTHROPIC_API_KEY env var is empty");
26    }
27    let client = reqwest::blocking::Client::new();
28    let candidates = store
29        .list_sessions_for_eval(since_ms, cfg.min_cost_usd)
30        .context("list sessions for eval")?;
31    let results = candidates
32        .iter()
33        .take(cfg.batch_size)
34        .filter_map(|s| eval_one(store, &client, cfg, rubric, s, dry_run))
35        .collect();
36    let _ = workspace;
37    Ok(results)
38}
39
40pub fn dry_run_candidates(
41    store: &Store,
42    cfg: &EvalConfig,
43    since_ms: u64,
44) -> Result<Vec<SessionRecord>> {
45    if !cfg.enabled {
46        return Ok(vec![]);
47    }
48    store
49        .list_sessions_for_eval(since_ms, cfg.min_cost_usd)
50        .map(|rows| rows.into_iter().take(cfg.batch_size).collect())
51}
52
53fn eval_one(
54    store: &Store,
55    client: &reqwest::blocking::Client,
56    cfg: &EvalConfig,
57    rubric: &crate::eval::rubric::Rubric,
58    session: &SessionRecord,
59    dry_run: bool,
60) -> Option<EvalRow> {
61    if dry_run {
62        eprintln!("[dry-run] would eval session {}", session.id);
63        return None;
64    }
65    let events = store.list_events_for_session(&session.id).ok()?;
66    let row = judge_session(
67        client,
68        &cfg.endpoint,
69        &resolve_api_key(cfg),
70        &cfg.model,
71        rubric,
72        session,
73        &events,
74    )
75    .ok()?;
76    store.upsert_eval(&row).ok()?;
77    Some(row)
78}
79
80fn resolve_api_key(cfg: &EvalConfig) -> String {
81    if !cfg.api_key.is_empty() {
82        return cfg.api_key.clone();
83    }
84    std::env::var("ANTHROPIC_API_KEY").unwrap_or_default()
85}