1use crate::core::event::{Event, EventKind, SessionRecord};
3use crate::eval::rubric::Rubric;
4use crate::eval::types::{EvalRow, JudgeResponse};
5use anyhow::{Context, Result, bail};
6use serde_json::json;
7
8fn summarise(session: &SessionRecord, events: &[Event]) -> String {
9 let tool_seq: Vec<_> = events
10 .iter()
11 .filter(|e| e.kind == EventKind::ToolCall)
12 .filter_map(|e| e.tool.as_deref())
13 .collect();
14 let errors = events.iter().filter(|e| e.kind == EventKind::Error).count();
15 let cost_usd: f64 = events
16 .iter()
17 .filter_map(|e| e.cost_usd_e6)
18 .map(|c| c as f64 / 1_000_000.0)
19 .sum();
20 let duration_s = session
21 .ended_at_ms
22 .map(|end| end.saturating_sub(session.started_at_ms) / 1000)
23 .unwrap_or(0);
24 format!(
25 "tools: {}\nerrors: {}\ncost_usd: {:.4}\nduration_s: {}",
26 tool_seq.join(", "),
27 errors,
28 cost_usd,
29 duration_s,
30 )
31}
32
33pub fn build_prompt(rubric: &Rubric, session: &SessionRecord, events: &[Event]) -> String {
35 let summary = summarise(session, events);
36 rubric.prompt_template.replace("{summary}", &summary)
37}
38
39pub fn judge_session(
40 client: &reqwest::blocking::Client,
41 endpoint: &str,
42 api_key: &str,
43 model: &str,
44 rubric: &Rubric,
45 session: &SessionRecord,
46 events: &[Event],
47) -> Result<EvalRow> {
48 let prompt = build_prompt(rubric, session, events);
49 let body = json!({
50 "model": model,
51 "max_tokens": 256,
52 "messages": [{"role": "user", "content": prompt}],
53 });
54 let url = format!("{}/v1/messages", endpoint.trim_end_matches('/'));
55 let resp = client
56 .post(&url)
57 .header("x-api-key", api_key)
58 .header("anthropic-version", "2023-06-01")
59 .json(&body)
60 .send()
61 .context("judge HTTP request failed")?;
62 if !resp.status().is_success() {
63 bail!("judge returned {}", resp.status());
64 }
65 parse_judge_response(resp, session, model, rubric)
66}
67
68fn parse_judge_response(
69 resp: reqwest::blocking::Response,
70 session: &SessionRecord,
71 model: &str,
72 rubric: &Rubric,
73) -> Result<EvalRow> {
74 let raw: serde_json::Value = resp.json().context("judge response parse")?;
75 let text = raw["content"][0]["text"]
76 .as_str()
77 .context("missing text in judge response")?;
78 let jr: JudgeResponse = serde_json::from_str(text).context("judge JSON parse")?;
79 let score = jr.score.clamp(0.0, 1.0);
80 Ok(EvalRow {
81 id: format!("{}:{}", session.id, rubric.id),
82 session_id: session.id.clone(),
83 judge_model: model.to_owned(),
84 rubric_id: rubric.id.to_owned(),
85 score,
86 rationale: jr.rationale,
87 flagged: score < 0.4,
88 created_at_ms: now_ms(),
89 })
90}
91
/// Returns the current wall-clock time in milliseconds since the Unix epoch.
///
/// If the system clock reads earlier than the epoch, the result is `0`
/// rather than an error.
fn now_ms() -> u64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_millis() as u64,
        // Clock is set before the epoch: report zero elapsed time.
        Err(_) => 0,
    }
}