use std::collections::BTreeMap;
use std::sync::Arc;
use anyhow::{anyhow, Result};
use arrow::array::{
Array, BooleanArray, Float64Array, Int64Array, RecordBatch, StringArray,
TimestampMicrosecondArray,
};
use chrono::{TimeZone, Utc};
use futures::TryStreamExt;
use iceberg::Catalog;
use iceberg::arrow::schema_to_arrow_schema;
use uuid::Uuid;
use super::iceberg::{IcebergWarehouse, TABLE_AGENT_MODEL_RUNS, append_batch, ensure_table_schema};
// Positional indices of the `agent_model_runs` columns as they are read back
// from each `RecordBatch` in `query_agent_model_runs`. The order mirrors the
// column vector assembled in `append_agent_model_runs`. Indices 13..=15 are
// treated as optional by the reader: a batch with fewer columns yields the
// per-field defaults instead of an error.
const COL_RUN_ID: usize = 0;
const COL_TS_MICROS: usize = 1;
const COL_MODEL: usize = 2;
const COL_PROMPT_ID: usize = 3;
const COL_PROMPT: usize = 4;
const COL_OUTPUT: usize = 5;
const COL_LATENCY_MS: usize = 6;
const COL_TOKENS_IN: usize = 7;
const COL_TOKENS_OUT: usize = 8;
const COL_TOKENS_PER_S: usize = 9;
const COL_SCORE: usize = 10;
const COL_OK: usize = 11;
const COL_ERROR: usize = 12;
const COL_AGENT: usize = 13;
const COL_COST_USD: usize = 14;
const COL_MCP_TOOL_CALLS: usize = 15;
/// Placeholder agent name for rows that carry no agent: it is the serde
/// default for `AgentModelRunRow::agent`, the value substituted when the
/// stored agent column is absent or null, and the single agent used by
/// `run_bakeoff`.
pub const AGENT_UNSET: &str = "-";
/// One recorded answer of an (agent, model) cell to a prompt within a bake-off run.
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct AgentModelRunRow {
/// Identifier of the bake-off run this row belongs to.
pub run_id: String,
/// Timestamp in microseconds; rendered as a UTC instant by `rows_to_json`.
pub ts_micros: i64,
/// Agent name; deserializes to `AGENT_UNSET` when the field is missing.
#[serde(default = "default_agent")]
pub agent: String,
/// Model name.
pub model: String,
/// Identifier of the prompt/task.
pub prompt_id: String,
/// Prompt text sent to the caller.
pub prompt: String,
/// Answer text; empty for failed calls built by `run_bakeoff_matrix`.
pub output: String,
/// Latency of the call in milliseconds.
pub latency_ms: f64,
/// Prompt token count as reported by the caller.
pub tokens_in: i64,
/// Generated token count as reported by the caller.
pub tokens_out: i64,
/// Generation throughput in tokens per second.
pub tokens_per_s: f64,
/// Quality score; higher ranks better in `leaderboard` / `matrix_grid`.
pub score: f64,
/// `false` when the caller returned an error for this cell.
pub ok: bool,
/// Error text for failed calls, `None` otherwise.
pub error: Option<String>,
/// Cost of the call in USD; deserializes to 0.0 when the field is missing.
#[serde(default)]
pub cost_usd: f64,
/// Number of MCP tool calls made; deserializes to 0 when the field is missing.
#[serde(default)]
pub mcp_tool_calls: i64,
}
/// Serde default for the `agent` field: the `AGENT_UNSET` placeholder.
fn default_agent() -> String {
    String::from(AGENT_UNSET)
}
impl AgentModelRunRow {
    /// Sort key used when returning query results: timestamp first, then
    /// run id, agent, model and prompt id.
    fn key(&self) -> (i64, String, String, String, String) {
        let Self { ts_micros, run_id, agent, model, prompt_id, .. } = self;
        (*ts_micros, run_id.clone(), agent.clone(), model.clone(), prompt_id.clone())
    }

    /// The `(agent, model)` pair identifying the matrix cell this row belongs to.
    pub fn cell(&self) -> (String, String) {
        let agent = self.agent.to_owned();
        let model = self.model.to_owned();
        (agent, model)
    }
}
pub async fn append_agent_model_runs(
wh: &IcebergWarehouse,
rows: &[AgentModelRunRow],
) -> Result<()> {
if rows.is_empty() {
return Ok(());
}
let ident = wh.table_ident(TABLE_AGENT_MODEL_RUNS);
let table = wh.catalog().load_table(&ident).await?;
let table = ensure_table_schema(
wh.catalog(),
&ident,
table,
&super::iceberg_schema::agent_model_runs()?,
)
.await?;
let arrow_schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?);
let cols: Vec<Arc<dyn Array>> = vec![
Arc::new(StringArray::from(rows.iter().map(|r| r.run_id.clone()).collect::<Vec<_>>())),
Arc::new(
TimestampMicrosecondArray::from(rows.iter().map(|r| r.ts_micros).collect::<Vec<_>>())
.with_timezone("+00:00"),
),
Arc::new(StringArray::from(rows.iter().map(|r| r.model.clone()).collect::<Vec<_>>())),
Arc::new(StringArray::from(rows.iter().map(|r| r.prompt_id.clone()).collect::<Vec<_>>())),
Arc::new(StringArray::from(rows.iter().map(|r| r.prompt.clone()).collect::<Vec<_>>())),
Arc::new(StringArray::from(rows.iter().map(|r| r.output.clone()).collect::<Vec<_>>())),
Arc::new(Float64Array::from(rows.iter().map(|r| r.latency_ms).collect::<Vec<_>>())),
Arc::new(Int64Array::from(rows.iter().map(|r| r.tokens_in).collect::<Vec<_>>())),
Arc::new(Int64Array::from(rows.iter().map(|r| r.tokens_out).collect::<Vec<_>>())),
Arc::new(Float64Array::from(rows.iter().map(|r| r.tokens_per_s).collect::<Vec<_>>())),
Arc::new(Float64Array::from(rows.iter().map(|r| r.score).collect::<Vec<_>>())),
Arc::new(BooleanArray::from(rows.iter().map(|r| r.ok).collect::<Vec<_>>())),
Arc::new(StringArray::from(
rows.iter().map(|r| r.error.clone()).collect::<Vec<Option<String>>>(),
)),
Arc::new(StringArray::from(
rows.iter().map(|r| Some(r.agent.clone())).collect::<Vec<Option<String>>>(),
)),
Arc::new(Float64Array::from(
rows.iter().map(|r| Some(r.cost_usd)).collect::<Vec<Option<f64>>>(),
)),
Arc::new(Int64Array::from(
rows.iter().map(|r| Some(r.mcp_tool_calls)).collect::<Vec<Option<i64>>>(),
)),
];
let batch = RecordBatch::try_new(arrow_schema, cols)?;
append_batch(wh.catalog(), table, batch).await?;
Ok(())
}
/// Row filter applied by `query_agent_model_runs`.
#[derive(Debug, Clone)]
pub enum BakeoffSelector {
/// Keep rows whose `run_id` equals the given string.
Run(String),
/// Keep rows whose `model` equals the given string.
Model(String),
/// Keep rows whose `agent` equals the given string.
Agent(String),
/// Keep every row.
All,
}
pub async fn query_agent_model_runs(
wh: &IcebergWarehouse,
sel: &BakeoffSelector,
) -> Result<Vec<AgentModelRunRow>> {
let table = wh.catalog().load_table(&wh.table_ident(TABLE_AGENT_MODEL_RUNS)).await?;
let scan = table.scan().build()?;
let stream = scan.to_arrow().await?;
let batches: Vec<RecordBatch> = stream.try_collect().await?;
let mut out: Vec<AgentModelRunRow> = Vec::new();
for b in &batches {
let run_id = col_str(b, COL_RUN_ID)?;
let ts = col_ts(b, COL_TS_MICROS)?;
let model = col_str(b, COL_MODEL)?;
let prompt_id = col_str(b, COL_PROMPT_ID)?;
let prompt = col_str(b, COL_PROMPT)?;
let output = col_str(b, COL_OUTPUT)?;
let latency = col_f64(b, COL_LATENCY_MS)?;
let tin = col_i64(b, COL_TOKENS_IN)?;
let tout = col_i64(b, COL_TOKENS_OUT)?;
let tps = col_f64(b, COL_TOKENS_PER_S)?;
let score = col_f64(b, COL_SCORE)?;
let ok = col_bool(b, COL_OK)?;
let error = col_str(b, COL_ERROR)?;
let agent = if b.num_columns() > COL_AGENT { Some(col_str(b, COL_AGENT)?) } else { None };
let cost = if b.num_columns() > COL_COST_USD {
Some(col_f64(b, COL_COST_USD)?)
} else {
None
};
let mcp = if b.num_columns() > COL_MCP_TOOL_CALLS {
Some(col_i64(b, COL_MCP_TOOL_CALLS)?)
} else {
None
};
for i in 0..b.num_rows() {
let row = AgentModelRunRow {
run_id: run_id.value(i).to_string(),
ts_micros: ts.value(i),
agent: match agent {
Some(a) if !a.is_null(i) => a.value(i).to_string(),
_ => AGENT_UNSET.to_string(),
},
model: model.value(i).to_string(),
prompt_id: prompt_id.value(i).to_string(),
prompt: prompt.value(i).to_string(),
output: output.value(i).to_string(),
latency_ms: latency.value(i),
tokens_in: tin.value(i),
tokens_out: tout.value(i),
tokens_per_s: tps.value(i),
score: score.value(i),
ok: ok.value(i),
error: if error.is_null(i) { None } else { Some(error.value(i).to_string()) },
cost_usd: match cost {
Some(c) if !c.is_null(i) => c.value(i),
_ => 0.0,
},
mcp_tool_calls: match mcp {
Some(m) if !m.is_null(i) => m.value(i),
_ => 0,
},
};
let keep = match sel {
BakeoffSelector::Run(id) => &row.run_id == id,
BakeoffSelector::Model(m) => &row.model == m,
BakeoffSelector::Agent(a) => &row.agent == a,
BakeoffSelector::All => true,
};
if keep {
out.push(row);
}
}
}
out.sort_by_key(|r| r.key());
Ok(out)
}
/// One ranked line of the per-row leaderboard produced by `leaderboard`.
#[derive(Debug, Clone, PartialEq)]
pub struct LeaderboardEntry {
/// 1-based position after sorting.
pub rank: usize,
/// Agent name copied from the source row.
pub agent: String,
/// Model name copied from the source row.
pub model: String,
/// Tokens per second copied from the source row.
pub tokens_per_s: f64,
/// Score copied from the source row.
pub score: f64,
/// Latency in milliseconds copied from the source row.
pub latency_ms: f64,
/// Output token count copied from the source row.
pub tokens_out: i64,
/// Cost in USD copied from the source row.
pub cost_usd: f64,
/// MCP tool-call count copied from the source row.
pub mcp_tool_calls: i64,
/// Whether the source row was a successful call.
pub ok: bool,
/// Answer text copied from the source row.
pub output: String,
}
impl LeaderboardEntry {
    /// Display label of the cell, formatted as `agent/model`.
    pub fn cell_label(&self) -> String {
        [self.agent.as_str(), self.model.as_str()].join("/")
    }
}
/// Ranks every row individually and returns one [`LeaderboardEntry`] per row.
///
/// Ordering, best first: successful rows before failed ones, then higher
/// score, higher tokens/s, lower latency, lower cost, and finally agent and
/// model name. Ranks are 1-based.
///
/// NaN metrics are ranked after every real number for that metric. The
/// comparator is therefore a total order, which `sort_by` needs in order to
/// be well behaved (it may panic or produce an arbitrary order otherwise).
pub fn leaderboard(rows: &[AgentModelRunRow]) -> Vec<LeaderboardEntry> {
    use std::cmp::Ordering;

    /// Orders two metric values best-first; NaN always sorts last.
    fn best_first(a: f64, b: f64, higher_is_better: bool) -> Ordering {
        match (a.is_nan(), b.is_nan()) {
            (true, true) => Ordering::Equal,
            (true, false) => Ordering::Greater,
            (false, true) => Ordering::Less,
            (false, false) => {
                // Both values are real numbers, so `partial_cmp` cannot fail here.
                let ord = a.partial_cmp(&b).unwrap_or(Ordering::Equal);
                if higher_is_better { ord.reverse() } else { ord }
            }
        }
    }

    let mut ranked: Vec<&AgentModelRunRow> = rows.iter().collect();
    ranked.sort_by(|a, b| {
        b.ok.cmp(&a.ok)
            .then_with(|| best_first(a.score, b.score, true))
            .then_with(|| best_first(a.tokens_per_s, b.tokens_per_s, true))
            .then_with(|| best_first(a.latency_ms, b.latency_ms, false))
            .then_with(|| best_first(a.cost_usd, b.cost_usd, false))
            .then_with(|| a.agent.cmp(&b.agent))
            .then_with(|| a.model.cmp(&b.model))
    });

    ranked
        .into_iter()
        .enumerate()
        .map(|(i, r)| LeaderboardEntry {
            rank: i + 1,
            agent: r.agent.clone(),
            model: r.model.clone(),
            tokens_per_s: r.tokens_per_s,
            score: r.score,
            latency_ms: r.latency_ms,
            tokens_out: r.tokens_out,
            cost_usd: r.cost_usd,
            mcp_tool_calls: r.mcp_tool_calls,
            ok: r.ok,
            output: r.output.clone(),
        })
        .collect()
}
/// Aggregate of all rows sharing one (agent, model) pair, as built by `matrix_grid`.
#[derive(Debug, Clone, PartialEq)]
pub struct MatrixCell {
/// Agent name of the cell.
pub agent: String,
/// Model name of the cell.
pub model: String,
/// 1-based position of the cell after sorting all cells.
pub rank: usize,
/// Mean score over the cell's rows.
pub score: f64,
/// Mean tokens per second over the cell's rows.
pub tokens_per_s: f64,
/// Mean latency in milliseconds over the cell's rows.
pub latency_ms: f64,
/// Mean cost in USD over the cell's rows.
pub cost_usd: f64,
/// Sum of MCP tool calls over the cell's rows.
pub mcp_tool_calls: i64,
/// Number of rows aggregated into the cell.
pub task_count: usize,
/// `true` only when every row in the cell succeeded.
pub ok: bool,
}
/// Agent × model grid produced by `matrix_grid`.
#[derive(Debug, Clone, PartialEq)]
pub struct MatrixGrid {
/// Distinct agent names, sorted.
pub agents: Vec<String>,
/// Distinct model names, sorted.
pub models: Vec<String>,
/// Populated cells in rank order (rank 1 first).
pub cells: Vec<MatrixCell>,
}
impl MatrixGrid {
    /// Returns the first cell whose rank is 1, if any.
    pub fn winner(&self) -> Option<&MatrixCell> {
        for cell in &self.cells {
            if cell.rank == 1 {
                return Some(cell);
            }
        }
        None
    }

    /// Looks up the cell for the given agent and model names.
    pub fn cell(&self, agent: &str, model: &str) -> Option<&MatrixCell> {
        let idx = self
            .cells
            .iter()
            .position(|candidate| candidate.agent == agent && candidate.model == model)?;
        self.cells.get(idx)
    }
}
/// Groups `rows` by (agent, model) and ranks the resulting cells.
///
/// Each cell carries the mean score, tokens/s, latency and cost of its rows,
/// the summed MCP tool calls, the row count, and `ok` only if every row
/// succeeded. Cells are ordered best first: ok before failed, then higher
/// score, higher tokens/s, lower latency, lower cost, then agent and model
/// name. Ranks are 1-based.
///
/// NaN means are ranked after every real number for that metric. The
/// comparator is therefore a total order, which `sort_by` needs in order to
/// be well behaved (it may panic or produce an arbitrary order otherwise).
pub fn matrix_grid(rows: &[AgentModelRunRow]) -> MatrixGrid {
    use std::cmp::Ordering;
    use std::collections::BTreeSet;

    /// Orders two metric values best-first; NaN always sorts last.
    fn best_first(a: f64, b: f64, higher_is_better: bool) -> Ordering {
        match (a.is_nan(), b.is_nan()) {
            (true, true) => Ordering::Equal,
            (true, false) => Ordering::Greater,
            (false, true) => Ordering::Less,
            (false, false) => {
                // Both values are real numbers, so `partial_cmp` cannot fail here.
                let ord = a.partial_cmp(&b).unwrap_or(Ordering::Equal);
                if higher_is_better { ord.reverse() } else { ord }
            }
        }
    }

    let mut by_cell: BTreeMap<(String, String), Vec<&AgentModelRunRow>> = BTreeMap::new();
    for r in rows {
        by_cell.entry(r.cell()).or_default().push(r);
    }

    let mut cells: Vec<MatrixCell> = by_cell
        .into_iter()
        .map(|((agent, model), group)| {
            // A group always holds at least one row; `max(1)` only guards the division.
            let n = group.len().max(1) as f64;
            let mean = |f: &dyn Fn(&AgentModelRunRow) -> f64| -> f64 {
                group.iter().map(|r| f(r)).sum::<f64>() / n
            };
            MatrixCell {
                agent,
                model,
                // Placeholder; real ranks are assigned after sorting.
                rank: 0,
                score: mean(&|r| r.score),
                tokens_per_s: mean(&|r| r.tokens_per_s),
                latency_ms: mean(&|r| r.latency_ms),
                cost_usd: mean(&|r| r.cost_usd),
                mcp_tool_calls: group.iter().map(|r| r.mcp_tool_calls).sum(),
                task_count: group.len(),
                ok: group.iter().all(|r| r.ok),
            }
        })
        .collect();

    cells.sort_by(|a, b| {
        b.ok.cmp(&a.ok)
            .then_with(|| best_first(a.score, b.score, true))
            .then_with(|| best_first(a.tokens_per_s, b.tokens_per_s, true))
            .then_with(|| best_first(a.latency_ms, b.latency_ms, false))
            .then_with(|| best_first(a.cost_usd, b.cost_usd, false))
            .then_with(|| a.agent.cmp(&b.agent))
            .then_with(|| a.model.cmp(&b.model))
    });
    for (i, c) in cells.iter_mut().enumerate() {
        c.rank = i + 1;
    }

    let agents: BTreeSet<String> = cells.iter().map(|c| c.agent.clone()).collect();
    let models: BTreeSet<String> = cells.iter().map(|c| c.model.clone()).collect();
    MatrixGrid {
        agents: agents.into_iter().collect(),
        models: models.into_iter().collect(),
        cells,
    }
}
/// Escapes `s` for embedding inside a double-quoted JSON string.
///
/// Quotes and backslashes are backslash-escaped, newline, tab and carriage
/// return use their short escapes, and every other control character below
/// U+0020 is written as `\u00XX` (JSON forbids raw control characters in
/// strings). All other characters are copied unchanged.
fn esc(s: &str) -> String {
    let mut o = String::with_capacity(s.len() + 2);
    for c in s.chars() {
        match c {
            '"' => o.push_str("\\\""),
            '\\' => o.push_str("\\\\"),
            '\n' => o.push_str("\\n"),
            '\t' => o.push_str("\\t"),
            '\r' => o.push_str("\\r"),
            // Remaining C0 controls have no short form; use the generic escape.
            c if u32::from(c) < 0x20 => o.push_str(&format!("\\u{:04x}", u32::from(c))),
            c => o.push(c),
        }
    }
    o
}
/// Renders `rows` as a JSON array with one object per line.
///
/// String fields are escaped with `esc`. `ts` is the RFC 3339 form of
/// `ts_micros`, or an empty string when the timestamp cannot be represented.
/// Floats use fixed precision (3 decimals, 4 for `cost_usd`). NaN and
/// infinite values are written as `null`, because JSON has no literal for
/// them and the array would otherwise fail to parse. `error` is `null` when
/// absent.
pub fn rows_to_json(rows: &[AgentModelRunRow]) -> String {
    /// Fixed-precision JSON number, or `null` for NaN / infinity.
    fn num(v: f64, decimals: usize) -> String {
        if v.is_finite() {
            format!("{:.*}", decimals, v)
        } else {
            "null".to_string()
        }
    }

    let mut s = String::from("[\n");
    for (i, r) in rows.iter().enumerate() {
        let ts_rfc = Utc
            .timestamp_micros(r.ts_micros)
            .single()
            .map(|d| d.to_rfc3339())
            .unwrap_or_default();
        let error = match &r.error {
            None => "null".to_string(),
            Some(e) => format!("\"{}\"", esc(e)),
        };
        s.push_str(&format!(
            " {{\"run_id\": \"{}\", \"ts\": \"{}\", \"agent\": \"{}\", \"model\": \"{}\", \
             \"prompt_id\": \"{}\", \"prompt\": \"{}\", \"output\": \"{}\", \
             \"latency_ms\": {}, \"tokens_in\": {}, \"tokens_out\": {}, \
             \"tokens_per_s\": {}, \"score\": {}, \"cost_usd\": {}, \
             \"mcp_tool_calls\": {}, \"ok\": {}, \"error\": {}}}{}\n",
            esc(&r.run_id),
            esc(&ts_rfc),
            esc(&r.agent),
            esc(&r.model),
            esc(&r.prompt_id),
            esc(&r.prompt),
            esc(&r.output),
            num(r.latency_ms, 3),
            r.tokens_in,
            r.tokens_out,
            num(r.tokens_per_s, 3),
            num(r.score, 3),
            num(r.cost_usd, 4),
            r.mcp_tool_calls,
            r.ok,
            error,
            // Comma after every object except the last.
            if i + 1 < rows.len() { "," } else { "" },
        ));
    }
    s.push(']');
    s
}
/// Renders the per-row leaderboard as plain text: a header line followed by
/// one line per ranked entry. Returns a fixed placeholder line when `rows`
/// is empty.
pub fn render_leaderboard(rows: &[AgentModelRunRow]) -> String {
    if rows.is_empty() {
        return "(no bake-off runs recorded)\n".to_string();
    }

    let header = " # agent/model score tok/s latency cost mcp status\n";
    let body: String = leaderboard(rows)
        .iter()
        .map(|entry| {
            let mark = if entry.ok { "✓" } else { "✗" };
            format!(
                " {:<2} {:<28} {:>6.3} {:>8.1} {:>8.0}ms ${:>6.4} {:>4} {}\n",
                entry.rank,
                truncate(&entry.cell_label(), 28),
                entry.score,
                entry.tokens_per_s,
                entry.latency_ms,
                entry.cost_usd,
                entry.mcp_tool_calls,
                mark,
            )
        })
        .collect();

    let mut out = String::with_capacity(header.len() + body.len());
    out.push_str(header);
    out.push_str(&body);
    out
}
/// Renders the agent × model grid as plain text: a header row of model
/// names, one row per agent, and a trailing `winner:` line when a rank-1
/// cell exists. The rank-1 cell is marked with `*`, failed cells with `✗`,
/// and coordinates without data with `·`. Returns a fixed placeholder line
/// when `rows` is empty.
pub fn render_matrix(rows: &[AgentModelRunRow]) -> String {
    if rows.is_empty() {
        return "(no bake-off runs recorded)\n".to_string();
    }
    let grid = matrix_grid(rows);

    // Header: corner label followed by one column per model.
    let mut out = format!(" {:<16}", "agent \\ model");
    let model_headers: String =
        grid.models.iter().map(|m| format!(" {:>14}", truncate(m, 14))).collect();
    out.push_str(&model_headers);
    out.push('\n');

    for agent in &grid.agents {
        out.push_str(&format!(" {:<16}", truncate(agent, 16)));
        for model in &grid.models {
            let rendered = if let Some(c) = grid.cell(agent, model) {
                let win = if c.rank == 1 { "*" } else { " " };
                let status = if c.ok { "" } else { "✗" };
                format!(" {win}{:>5.3}#{:<2}{status:>3}", c.score, c.rank)
            } else {
                format!(" {:>14}", "·")
            };
            out.push_str(&rendered);
        }
        out.push('\n');
    }

    if let Some(w) = grid.winner() {
        out.push_str(&format!(
            " winner: {}/{} (score {:.3}, {:.1} tok/s)\n",
            w.agent, w.model, w.score, w.tokens_per_s
        ));
    }
    out
}
/// Shortens `s` to at most `n` characters (not bytes).
///
/// Strings that already fit are returned unchanged. Longer strings keep
/// their first `n - 1` characters followed by `…`. With `n == 0` there is no
/// room even for the ellipsis, so the result is empty.
fn truncate(s: &str, n: usize) -> String {
    if s.chars().count() <= n {
        return s.to_string();
    }
    if n == 0 {
        // Emitting "…" here would exceed the requested width.
        return String::new();
    }
    let mut head: String = s.chars().take(n - 1).collect();
    head.push('…');
    head
}
/// Abbreviates a run id for display.
///
/// Ids of at most 12 characters are returned unchanged. Longer ids are cut to
/// their first 8 characters followed by `…`. Length and cut position are
/// measured in characters, so ids containing multi-byte UTF-8 never hit a
/// slice that splits a character (which would panic).
pub fn short_run(run_id: &str) -> String {
    if run_id.chars().count() <= 12 {
        return run_id.to_string();
    }
    let head: String = run_id.chars().take(8).collect();
    format!("{head}…")
}
/// Generates a fresh run id: a version-4 UUID rendered via its `to_string()`.
pub fn new_run_id() -> String {
Uuid::new_v4().to_string()
}
/// What a `ModelCaller` returns for one successful call.
#[derive(Debug, Clone, PartialEq)]
pub struct ModelAnswer {
/// Answer text.
pub output: String,
/// Latency of the call in milliseconds.
pub latency_ms: f64,
/// Prompt token count.
pub tokens_in: i64,
/// Generated token count.
pub tokens_out: i64,
/// Tokens per second; when not positive, `run_bakeoff_matrix` derives a
/// value from `tokens_out` and `latency_ms` instead.
pub tokens_per_s: f64,
/// Score reported by the caller (may be overwritten by `score_rows_with_judge`).
pub score: f64,
/// Cost of the call in USD.
pub cost_usd: f64,
/// Number of MCP tool calls made.
pub mcp_tool_calls: i64,
}
impl ModelAnswer {
pub fn basic(
output: impl Into<String>,
latency_ms: f64,
tokens_in: i64,
tokens_out: i64,
tokens_per_s: f64,
score: f64,
) -> Self {
ModelAnswer {
output: output.into(),
latency_ms,
tokens_in,
tokens_out,
tokens_per_s,
score,
cost_usd: 0.0,
mcp_tool_calls: 0,
}
}
}
/// Something that can answer a prompt on behalf of an (agent, model) cell.
pub trait ModelCaller {
/// Runs `prompt` for the given `agent` and `model`. An `Err` is recorded by
/// `run_bakeoff_matrix` as a failed row rather than aborting the run.
fn call(&self, agent: &str, model: &str, prompt: &str) -> Result<ModelAnswer>;
}
/// `ModelCaller` that replays canned answers keyed by `(agent, model)`.
#[derive(Debug, Clone, Default)]
pub struct MockCaller {
// Canned answers; a lookup miss is reported as an error by `call`.
answers: BTreeMap<(String, String), ModelAnswer>,
}
impl MockCaller {
    /// Creates a caller with no canned answers.
    pub fn new() -> Self {
        Default::default()
    }

    /// Registers `answer` for `model` under the `AGENT_UNSET` agent.
    pub fn with(self, model: impl Into<String>, answer: ModelAnswer) -> Self {
        self.with_cell(AGENT_UNSET, model, answer)
    }

    /// Registers `answer` for the `(agent, model)` cell, replacing any
    /// previous answer for that cell, and returns the caller for chaining.
    pub fn with_cell(
        mut self,
        agent: impl Into<String>,
        model: impl Into<String>,
        answer: ModelAnswer,
    ) -> Self {
        let cell = (agent.into(), model.into());
        self.answers.insert(cell, answer);
        self
    }
}
impl ModelCaller for MockCaller {
    /// Returns a clone of the canned answer for `(agent, model)`; the prompt
    /// is ignored. Fails when no answer was registered for the cell.
    fn call(&self, agent: &str, model: &str, _prompt: &str) -> Result<ModelAnswer> {
        let cell = (agent.to_string(), model.to_string());
        match self.answers.get(&cell) {
            Some(answer) => Ok(answer.clone()),
            None => Err(anyhow!("mock: no canned answer for cell `{agent}/{model}`")),
        }
    }
}
/// `ModelCaller` that posts prompts to an Ollama server's `/api/generate` endpoint.
pub struct OllamaCaller {
// Base URL of the server, stored without trailing slashes.
host: String,
}
impl OllamaCaller {
    /// Creates a caller for `host`. When `host` is `None` the `OLLAMA_HOST`
    /// environment variable is used, and failing that
    /// `http://localhost:11434`. Trailing slashes are stripped.
    pub fn new(host: Option<String>) -> Self {
        let chosen = match host {
            Some(explicit) => explicit,
            None => match std::env::var("OLLAMA_HOST") {
                Ok(from_env) => from_env,
                Err(_) => "http://localhost:11434".to_string(),
            },
        };
        Self { host: chosen.trim_end_matches('/').to_string() }
    }
}
impl ModelCaller for OllamaCaller {
    /// Sends a non-streaming generate request for `model` and converts the
    /// JSON reply into a `ModelAnswer`. The agent name is ignored.
    ///
    /// Latency is wall-clock time from just before the POST until the reply
    /// has been parsed. Missing reply fields fall back to an empty output and
    /// zero counts; tokens/s is `eval_count / eval_duration` (duration taken
    /// as nanoseconds) and 0.0 when the duration is missing or not positive.
    /// The score is always 0.0.
    ///
    /// # Errors
    /// Fails when the request fails, the body cannot be read, or it is not JSON.
    fn call(&self, _agent: &str, model: &str, prompt: &str) -> Result<ModelAnswer> {
        let url = format!("{}/api/generate", self.host);
        let payload = serde_json::json!({
            "model": model,
            "prompt": prompt,
            "stream": false,
        });
        let body_str = serde_json::to_string(&payload)?;

        let started = std::time::Instant::now();
        let txt = ureq::post(&url)
            .set("Content-Type", "application/json")
            .send_string(&body_str)
            .map_err(|e| anyhow!("ollama POST {url} failed: {e}"))?
            .into_string()
            .map_err(|e| anyhow!("ollama response not readable: {e}"))?;
        let v: serde_json::Value =
            serde_json::from_str(&txt).map_err(|e| anyhow!("ollama response not JSON: {e}"))?;
        let latency_ms = started.elapsed().as_secs_f64() * 1000.0;

        // Integer field of the reply, if present and representable as i64.
        let int_field = |name: &str| v.get(name).and_then(|x| x.as_i64());

        let output = match v.get("response").and_then(|x| x.as_str()) {
            Some(text) => text.to_string(),
            None => String::new(),
        };
        let tokens_in = int_field("prompt_eval_count").unwrap_or(0);
        let tokens_out = int_field("eval_count").unwrap_or(0);
        let tokens_per_s = match int_field("eval_duration") {
            Some(ns) if ns > 0 => tokens_out as f64 / (ns as f64 / 1e9),
            _ => 0.0,
        };

        Ok(ModelAnswer::basic(output, latency_ms, tokens_in, tokens_out, tokens_per_s, 0.0))
    }
}
/// Single-axis bake-off: runs `prompt` against every model under the
/// `AGENT_UNSET` agent by delegating to `run_bakeoff_matrix`.
pub fn run_bakeoff(
    caller: &dyn ModelCaller,
    run_id: &str,
    ts_micros: i64,
    prompt_id: &str,
    prompt: &str,
    models: &[String],
) -> Vec<AgentModelRunRow> {
    let only_agent = [AGENT_UNSET.to_string()];
    run_bakeoff_matrix(caller, run_id, ts_micros, prompt_id, prompt, &only_agent, models)
}
/// Runs `prompt` for every (agent, model) combination, agents in the outer
/// loop, and returns one row per combination.
///
/// A failing call does not abort the run: it yields a row with `ok = false`,
/// zeroed metrics, an empty output and the error text. For successful calls
/// a non-positive reported tokens/s is replaced by `tokens_out / latency`
/// when the latency is positive, and by 0.0 otherwise.
pub fn run_bakeoff_matrix(
    caller: &dyn ModelCaller,
    run_id: &str,
    ts_micros: i64,
    prompt_id: &str,
    prompt: &str,
    agents: &[String],
    models: &[String],
) -> Vec<AgentModelRunRow> {
    let mut rows = Vec::with_capacity(agents.len() * models.len());
    for agent in agents {
        for model in models {
            let outcome = caller.call(agent, model, prompt);

            // Start from the failed-cell shape and fill it in on success.
            let mut row = AgentModelRunRow {
                run_id: run_id.to_string(),
                ts_micros,
                agent: agent.clone(),
                model: model.clone(),
                prompt_id: prompt_id.to_string(),
                prompt: prompt.to_string(),
                output: String::new(),
                latency_ms: 0.0,
                tokens_in: 0,
                tokens_out: 0,
                tokens_per_s: 0.0,
                score: 0.0,
                ok: false,
                error: None,
                cost_usd: 0.0,
                mcp_tool_calls: 0,
            };

            match outcome {
                Ok(ans) => {
                    row.tokens_per_s = if ans.tokens_per_s > 0.0 {
                        ans.tokens_per_s
                    } else if ans.latency_ms > 0.0 {
                        ans.tokens_out as f64 / (ans.latency_ms / 1000.0)
                    } else {
                        0.0
                    };
                    row.output = ans.output;
                    row.latency_ms = ans.latency_ms;
                    row.tokens_in = ans.tokens_in;
                    row.tokens_out = ans.tokens_out;
                    row.score = ans.score;
                    row.cost_usd = ans.cost_usd;
                    row.mcp_tool_calls = ans.mcp_tool_calls;
                    row.ok = true;
                }
                Err(e) => row.error = Some(format!("{e:#}")),
            }
            rows.push(row);
        }
    }
    rows
}
/// Scores every successful row with `judge` and returns one report per
/// judged row, in row order.
///
/// Failed rows (`ok == false`) are skipped and get no report. When the judge
/// succeeds, the row's score is overwritten with the verdict's score. When
/// it fails, the row keeps its score and the report carries the error text.
pub fn score_rows_with_judge(
    rows: &mut [AgentModelRunRow],
    judge: &dyn super::codegen_judge::AnswerJudge,
) -> Vec<JudgeReport> {
    rows.iter_mut()
        .filter(|row| row.ok)
        .map(|row| {
            let (verdict, error) = match judge.judge(&row.output) {
                Ok(verdict) => {
                    row.score = verdict.score;
                    (Some(verdict), None)
                }
                Err(e) => (None, Some(format!("{e:#}"))),
            };
            JudgeReport {
                agent: row.agent.clone(),
                model: row.model.clone(),
                // Equals the verdict's score on success, the untouched score on failure.
                score: row.score,
                verdict,
                error,
            }
        })
        .collect()
}
/// Outcome of judging one row in `score_rows_with_judge`.
#[derive(Debug, Clone)]
pub struct JudgeReport {
/// Agent of the judged row.
pub agent: String,
/// Model of the judged row.
pub model: String,
/// The verdict's score on success; the row's existing score when judging failed.
pub score: f64,
/// The judge's verdict, `None` when judging failed.
pub verdict: Option<super::codegen_judge::Verdict>,
/// Error text when judging failed, `None` otherwise.
pub error: Option<String>,
}
/// Default agent axis: just the `AGENT_UNSET` placeholder.
pub fn default_agents() -> Vec<String> {
    let mut agents = Vec::with_capacity(1);
    agents.push(String::from(AGENT_UNSET));
    agents
}
/// Default model axis when the `vector` feature is enabled: the `id` of every
/// entry in `crate::vector::embed_registry::MODELS`, in registry order.
#[cfg(feature = "vector")]
pub fn default_models() -> Vec<String> {
    let mut ids = Vec::new();
    for model in crate::vector::embed_registry::MODELS.iter() {
        ids.push(model.id.to_string());
    }
    ids
}
/// Default model axis when the `vector` feature is disabled: a fixed list of
/// three model ids.
#[cfg(not(feature = "vector"))]
pub fn default_models() -> Vec<String> {
    const FALLBACK: [&str; 3] = ["jina-v2-base-code", "minilm-l6-v2", "bge-base-en-v1.5"];
    let mut ids = Vec::with_capacity(FALLBACK.len());
    for id in FALLBACK.iter() {
        ids.push(String::from(*id));
    }
    ids
}
/// Builds a demo 3×3 matrix (agents `claude`, `kilo`, `local-llm` × models
/// `opus`, `sonnet`, `mistral`) by running a fixed task through the canned
/// answers of `demo_caller`.
pub fn demo_matrix_rows(run_id: &str, ts_micros: i64) -> Vec<AgentModelRunRow> {
    let owned = |names: &[&str]| -> Vec<String> { names.iter().map(|s| s.to_string()).collect() };

    let task = "Fix the flaky test in src/foo.rs and explain the race.";
    let prompt_id = "fix-flaky-test";
    let caller = demo_caller();
    let agents = owned(&["claude", "kilo", "local-llm"]);
    let models = owned(&["opus", "sonnet", "mistral"]);
    run_bakeoff_matrix(&caller, run_id, ts_micros, prompt_id, task, &agents, &models)
}
/// Mock caller preloaded with the demo answers. Eight of the nine demo cells
/// are present; `local-llm/opus` is absent, so that call fails.
fn demo_caller() -> MockCaller {
    // (agent, model, output, latency_ms, tokens_in, tokens_out, tokens_per_s, score, cost_usd, mcp_tool_calls)
    type Canned = (&'static str, &'static str, &'static str, f64, i64, i64, f64, f64, f64, i64);
    const CELLS: [Canned; 8] = [
        ("claude", "opus", "Patched the race with a Mutex; root cause: TOCTOU on the cache.", 4200.0, 1800, 520, 124.0, 0.97, 0.0840, 6),
        ("claude", "sonnet", "Added a lock around the shared map; the test was racing on init.", 2100.0, 1800, 480, 228.0, 0.91, 0.0210, 5),
        ("claude", "mistral", "Wrapped the counter in an atomic.", 1400.0, 1800, 300, 214.0, 0.74, 0.0000, 3),
        ("kilo", "opus", "Serialized the two writers; documented the ordering.", 4600.0, 1900, 540, 117.0, 0.93, 0.0880, 4),
        ("kilo", "sonnet", "Guarded the lazy-init with Once.", 2400.0, 1900, 460, 191.0, 0.88, 0.0220, 4),
        ("kilo", "mistral", "Added a sleep to dodge the race.", 1600.0, 1900, 280, 175.0, 0.55, 0.0000, 2),
        ("local-llm", "sonnet", "Used a channel to sync the workers.", 900.0, 1700, 240, 266.0, 0.69, 0.0000, 0),
        ("local-llm", "mistral", "Made the field volatile-ish via Arc.", 700.0, 1700, 200, 285.0, 0.58, 0.0000, 0),
    ];

    CELLS.iter().fold(
        MockCaller::new(),
        |caller, &(agent, model, out, lat, tin, tout, tps, score, cost, mcp)| {
            let answer = ModelAnswer {
                output: out.to_string(),
                latency_ms: lat,
                tokens_in: tin,
                tokens_out: tout,
                tokens_per_s: tps,
                score,
                cost_usd: cost,
                mcp_tool_calls: mcp,
            };
            caller.with_cell(agent, model, answer)
        },
    )
}
/// Borrows column `idx` of `b` as a `StringArray`, failing if it has another type.
fn col_str<'a>(b: &'a RecordBatch, idx: usize) -> Result<&'a StringArray> {
    match b.column(idx).as_any().downcast_ref::<StringArray>() {
        Some(typed) => Ok(typed),
        None => Err(anyhow!("agent_model_runs col {idx} is not StringArray")),
    }
}
/// Borrows column `idx` of `b` as a `Float64Array`, failing if it has another type.
fn col_f64<'a>(b: &'a RecordBatch, idx: usize) -> Result<&'a Float64Array> {
    match b.column(idx).as_any().downcast_ref::<Float64Array>() {
        Some(typed) => Ok(typed),
        None => Err(anyhow!("agent_model_runs col {idx} is not Float64Array")),
    }
}
/// Borrows column `idx` of `b` as an `Int64Array`, failing if it has another type.
fn col_i64<'a>(b: &'a RecordBatch, idx: usize) -> Result<&'a Int64Array> {
    match b.column(idx).as_any().downcast_ref::<Int64Array>() {
        Some(typed) => Ok(typed),
        None => Err(anyhow!("agent_model_runs col {idx} is not Int64Array")),
    }
}
/// Borrows column `idx` of `b` as a `BooleanArray`, failing if it has another type.
fn col_bool<'a>(b: &'a RecordBatch, idx: usize) -> Result<&'a BooleanArray> {
    match b.column(idx).as_any().downcast_ref::<BooleanArray>() {
        Some(typed) => Ok(typed),
        None => Err(anyhow!("agent_model_runs col {idx} is not BooleanArray")),
    }
}
/// Borrows column `idx` of `b` as a `TimestampMicrosecondArray`, failing if
/// it has another type.
fn col_ts<'a>(b: &'a RecordBatch, idx: usize) -> Result<&'a TimestampMicrosecondArray> {
    match b.column(idx).as_any().downcast_ref::<TimestampMicrosecondArray>() {
        Some(typed) => Ok(typed),
        None => Err(anyhow!("agent_model_runs col {idx} is not TimestampMicrosecondArray")),
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Test helper: shorthand for `ModelAnswer::basic` (no cost, no MCP tool calls).
fn ans(out: &str, lat: f64, tin: i64, tout: i64, tps: f64, score: f64) -> ModelAnswer {
ModelAnswer::basic(out, lat, tin, tout, tps, score)
}
// Test helper: builds a `ModelAnswer` with explicit cost and MCP tool-call
// count; `tokens_in` is fixed at 10.
fn ans_priced(
out: &str,
lat: f64,
tout: i64,
tps: f64,
score: f64,
cost: f64,
mcp: i64,
) -> ModelAnswer {
ModelAnswer {
output: out.to_string(),
latency_ms: lat,
tokens_in: 10,
tokens_out: tout,
tokens_per_s: tps,
score,
cost_usd: cost,
mcp_tool_calls: mcp,
}
}
// `run_bakeoff` yields one row per model: reported tokens/s is kept, a zero
// tokens/s is derived from tokens_out and latency, and a model without a
// canned answer becomes a failed row carrying the error text.
#[test]
fn run_bakeoff_builds_rows_and_records_failures() {
let caller = MockCaller::new()
.with("fast-model", ans("4", 100.0, 10, 20, 200.0, 1.0))
.with("slow-model", ans("four", 500.0, 10, 30, 0.0, 0.5));
let models = vec![
"fast-model".to_string(),
"slow-model".to_string(),
"missing-model".to_string(),
];
let rows = run_bakeoff(&caller, "runX", 1_000, "p1", "2+2?", &models);
assert_eq!(rows.len(), 3);
let fast = rows.iter().find(|r| r.model == "fast-model").unwrap();
assert_eq!(fast.output, "4");
assert!(fast.ok);
assert_eq!(fast.tokens_per_s, 200.0); assert_eq!(fast.tokens_out, 20);
let slow = rows.iter().find(|r| r.model == "slow-model").unwrap();
assert!(slow.ok);
// 30 tokens over 500 ms => 60 tok/s.
assert!((slow.tokens_per_s - 60.0).abs() < 1e-6, "derived tok/s: {}", slow.tokens_per_s);
let missing = rows.iter().find(|r| r.model == "missing-model").unwrap();
assert!(!missing.ok);
assert!(missing.error.as_ref().unwrap().contains("missing-model"));
assert_eq!(missing.output, "");
}
// Ranking order: ok rows first, then score, then tokens/s. Model "z" has no
// canned answer, so it fails and must rank last.
#[test]
fn leaderboard_ranks_ok_then_score_then_throughput() {
let rows = run_bakeoff(
&MockCaller::new()
.with("a", ans("x", 100.0, 5, 10, 100.0, 0.9))
.with("b", ans("x", 100.0, 5, 10, 300.0, 0.9)) .with("c", ans("x", 100.0, 5, 10, 500.0, 0.5)), "runR",
1,
"p",
"q",
&["a".into(), "b".into(), "c".into(), "z".into()],
);
let board = leaderboard(&rows);
assert_eq!(board.len(), 4);
// "b" ties "a" on score and wins on throughput; "c" is fastest but scores lower.
assert_eq!(board[0].model, "b");
assert_eq!(board[1].model, "a");
assert_eq!(board[2].model, "c");
assert_eq!(board[3].model, "z");
assert!(!board[3].ok);
assert_eq!(board[0].rank, 1);
assert_eq!(board[3].rank, 4);
}
// Rows appended to a fresh warehouse read back intact, can be filtered by run
// and by model, rank as expected, and render to parseable JSON and a
// leaderboard that mentions the model.
#[test]
fn append_query_round_trip_groups_and_ranks() {
let dir = tempfile::tempdir().unwrap();
let wh = IcebergWarehouse::open(dir.path()).unwrap();
let caller = MockCaller::new()
.with("mistral", ans("Paris", 80.0, 12, 4, 50.0, 1.0))
.with("llama3", ans("paris", 200.0, 12, 4, 20.0, 0.8));
let run_id = "bake-1";
let rows = run_bakeoff(
&caller,
run_id,
12_345,
"capital-fr",
"Capital of France?",
&["mistral".into(), "llama3".into()],
);
wh.block_on(append_agent_model_runs(&wh, &rows)).unwrap();
let got = wh
.block_on(query_agent_model_runs(&wh, &BakeoffSelector::Run(run_id.into())))
.unwrap();
assert_eq!(got.len(), 2);
let mistral = got.iter().find(|r| r.model == "mistral").unwrap();
assert_eq!(mistral.output, "Paris");
assert_eq!(mistral.tokens_in, 12);
assert_eq!(mistral.tokens_out, 4);
assert!((mistral.tokens_per_s - 50.0).abs() < 1e-6);
assert_eq!(mistral.score, 1.0);
assert!(mistral.ok);
assert_eq!(mistral.error, None);
let board = leaderboard(&got);
assert_eq!(board[0].model, "mistral");
assert_eq!(board[0].rank, 1);
assert_eq!(board[1].model, "llama3");
let only_mistral = wh
.block_on(query_agent_model_runs(&wh, &BakeoffSelector::Model("mistral".into())))
.unwrap();
assert_eq!(only_mistral.len(), 1);
assert_eq!(only_mistral[0].model, "mistral");
// The hand-built JSON must be accepted by a real parser.
let json = rows_to_json(&got);
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
assert_eq!(parsed.as_array().unwrap().len(), 2);
let board_txt = render_leaderboard(&got);
assert!(board_txt.contains("mistral"), "leaderboard: {board_txt}");
}
// A 2x2 matrix yields four rows; the one cell without a canned answer
// (local/opus) is recorded as a failed row naming the cell.
#[test]
fn matrix_crosses_agents_and_models_and_records_a_failed_cell() {
let caller = MockCaller::new()
.with_cell("claude", "opus", ans_priced("A", 4000.0, 500, 125.0, 0.95, 0.08, 6))
.with_cell("claude", "mistral", ans_priced("B", 1200.0, 300, 250.0, 0.70, 0.0, 2))
.with_cell("local", "mistral", ans_priced("C", 700.0, 200, 285.0, 0.60, 0.0, 0));
let agents = vec!["claude".to_string(), "local".to_string()];
let models = vec!["opus".to_string(), "mistral".to_string()];
let rows = run_bakeoff_matrix(&caller, "m1", 10, "task-x", "do x", &agents, &models);
assert_eq!(rows.len(), 4, "full 2x2 cross-product");
assert!(rows.iter().all(|r| r.agent == "claude" || r.agent == "local"));
let claude_opus = rows.iter().find(|r| r.agent == "claude" && r.model == "opus").unwrap();
assert!(claude_opus.ok);
assert_eq!(claude_opus.score, 0.95);
assert!((claude_opus.cost_usd - 0.08).abs() < 1e-9);
assert_eq!(claude_opus.mcp_tool_calls, 6);
let local_opus = rows.iter().find(|r| r.agent == "local" && r.model == "opus").unwrap();
assert!(!local_opus.ok, "local/opus must be the failed cell");
assert!(local_opus.error.as_ref().unwrap().contains("local/opus"));
}
// Only the three diagonal cells of the 3x3 matrix have answers. The grid must
// still hold nine cells, rank the three ok cells by score ahead of every
// failed cell, and agree with the leaderboard and rendered matrix on the winner.
#[test]
fn matrix_grid_ranks_cells_and_picks_winner() {
let caller = MockCaller::new()
.with_cell("claude", "opus", ans_priced("A", 4000.0, 500, 125.0, 0.95, 0.08, 6))
.with_cell("kilo", "sonnet", ans_priced("B", 2000.0, 400, 200.0, 0.88, 0.02, 4))
.with_cell("local", "mistral", ans_priced("C", 700.0, 200, 285.0, 0.58, 0.0, 0));
let agents = vec!["claude".into(), "kilo".into(), "local".into()];
let models = vec!["opus".into(), "sonnet".into(), "mistral".into()];
let rows = run_bakeoff_matrix(&caller, "m2", 20, "task-y", "do y", &agents, &models);
assert_eq!(rows.len(), 9, "3x3 cross-product");
let grid = matrix_grid(&rows);
// Axis labels come back sorted, not in input order.
assert_eq!(grid.agents, vec!["claude", "kilo", "local"]);
assert_eq!(grid.models, vec!["mistral", "opus", "sonnet"]);
assert_eq!(grid.cells.len(), 9, "one cell per coordinate");
let w = grid.winner().expect("a winner");
assert_eq!((w.agent.as_str(), w.model.as_str()), ("claude", "opus"));
assert_eq!(w.rank, 1);
let co = grid.cell("claude", "opus").unwrap();
let ks = grid.cell("kilo", "sonnet").unwrap();
let lm = grid.cell("local", "mistral").unwrap();
assert_eq!(co.rank, 1);
assert_eq!(ks.rank, 2);
assert_eq!(lm.rank, 3);
assert!(co.ok && ks.ok && lm.ok);
let failed = grid.cell("local", "opus").unwrap();
assert!(!failed.ok);
assert!(failed.rank > 3, "failed cell ranks after every ok cell");
let board = leaderboard(&rows);
assert_eq!(board[0].agent, "claude");
assert_eq!(board[0].model, "opus");
assert_eq!(board[0].cell_label(), "claude/opus");
let txt = render_matrix(&rows);
assert!(txt.contains("winner: claude/opus"), "matrix:\n{txt}");
}
// The demo matrix has nine rows with exactly one failure, survives an
// append/query round trip, keeps cost and MCP counts, and can be filtered by agent.
#[test]
fn demo_matrix_is_populated_and_round_trips() {
let dir = tempfile::tempdir().unwrap();
let wh = IcebergWarehouse::open(dir.path()).unwrap();
let rows = demo_matrix_rows("demo-run", 99);
assert_eq!(rows.len(), 9);
assert_eq!(rows.iter().filter(|r| !r.ok).count(), 1, "exactly one red cell");
wh.block_on(append_agent_model_runs(&wh, &rows)).unwrap();
let got = wh
.block_on(query_agent_model_runs(&wh, &BakeoffSelector::Run("demo-run".into())))
.unwrap();
assert_eq!(got.len(), 9);
let grid = matrix_grid(&got);
assert_eq!(grid.agents, vec!["claude", "kilo", "local-llm"]);
assert_eq!(grid.models, vec!["mistral", "opus", "sonnet"]);
let w = grid.winner().unwrap();
assert_eq!((w.agent.as_str(), w.model.as_str()), ("claude", "opus"));
let co = grid.cell("claude", "opus").unwrap();
assert!(co.cost_usd > 0.0, "hosted cell carries a cost");
assert!(co.mcp_tool_calls > 0, "hosted cell made mcp tool calls");
let lm = grid.cell("local-llm", "mistral").unwrap();
assert_eq!(lm.cost_usd, 0.0);
let only_claude = wh
.block_on(query_agent_model_runs(&wh, &BakeoffSelector::Agent("claude".into())))
.unwrap();
assert_eq!(only_claude.len(), 3, "claude × 3 models");
assert!(only_claude.iter().all(|r| r.agent == "claude"));
}
// Test helper: a 13-column schema (`run_id` .. `error`) that lacks the
// `agent`, `cost_usd` and `mcp_tool_calls` columns, used to simulate a stale table.
fn legacy_13_col_schema() -> iceberg::spec::Schema {
use iceberg::spec::{NestedField, PrimitiveType, Schema, Type};
Schema::builder()
.with_schema_id(0)
.with_fields(vec![
Arc::new(NestedField::required(1, "run_id", Type::Primitive(PrimitiveType::String))),
Arc::new(NestedField::required(2, "ts_micros", Type::Primitive(PrimitiveType::Timestamptz))),
Arc::new(NestedField::required(3, "model", Type::Primitive(PrimitiveType::String))),
Arc::new(NestedField::required(4, "prompt_id", Type::Primitive(PrimitiveType::String))),
Arc::new(NestedField::required(5, "prompt", Type::Primitive(PrimitiveType::String))),
Arc::new(NestedField::required(6, "output", Type::Primitive(PrimitiveType::String))),
Arc::new(NestedField::required(7, "latency_ms", Type::Primitive(PrimitiveType::Double))),
Arc::new(NestedField::required(8, "tokens_in", Type::Primitive(PrimitiveType::Long))),
Arc::new(NestedField::required(9, "tokens_out", Type::Primitive(PrimitiveType::Long))),
Arc::new(NestedField::required(10, "tokens_per_s", Type::Primitive(PrimitiveType::Double))),
Arc::new(NestedField::required(11, "score", Type::Primitive(PrimitiveType::Double))),
Arc::new(NestedField::required(12, "ok", Type::Primitive(PrimitiveType::Boolean))),
Arc::new(NestedField::optional(13, "error", Type::Primitive(PrimitiveType::String))),
])
.build()
.unwrap()
}
// Replaces the table with a 13-column one, then checks that appending evolves
// the schema to 16 columns, that the new columns read back, and that a second
// append to the evolved table still works.
#[test]
fn stale_13col_table_evolves_to_16_on_append() {
use iceberg::Catalog;
use iceberg::spec::{PartitionSpec, Transform};
use iceberg::TableCreation;
let dir = tempfile::tempdir().unwrap();
let wh = IcebergWarehouse::open(dir.path()).unwrap();
let ident = wh.table_ident(TABLE_AGENT_MODEL_RUNS);
// Drop the existing table and recreate it with the legacy schema.
wh.block_on(async {
let cat = wh.catalog();
cat.drop_table(&ident).await.unwrap();
let schema = legacy_13_col_schema();
let spec = PartitionSpec::builder(Arc::new(schema.clone()))
.add_partition_field("run_id", "run_id", Transform::Identity)
.unwrap()
.build()
.unwrap()
.into_unbound();
let creation = TableCreation::builder()
.name(ident.name().to_string())
.schema(schema)
.partition_spec(spec)
.build();
cat.create_table(ident.namespace(), creation).await.unwrap();
let t = cat.load_table(&ident).await.unwrap();
assert_eq!(t.metadata().current_schema().as_struct().fields().len(), 13);
});
let caller = MockCaller::new()
.with_cell("claude", "opus", ans_priced("Fixed it.", 4000.0, 500, 125.0, 0.95, 0.08, 6));
let rows = run_bakeoff_matrix(
&caller,
"evolve-run",
777,
"task-z",
"do z",
&["claude".to_string()],
&["opus".to_string()],
);
assert_eq!(rows.len(), 1);
// The first append against the stale table must evolve its schema.
wh.block_on(append_agent_model_runs(&wh, &rows)).unwrap();
wh.block_on(async {
let t = wh.catalog().load_table(&ident).await.unwrap();
let names: Vec<&str> = t
.metadata()
.current_schema()
.as_struct()
.fields()
.iter()
.map(|f| f.name.as_str())
.collect();
assert_eq!(names.len(), 16, "schema evolved 13 → 16 columns");
assert!(names.contains(&"agent"));
assert!(names.contains(&"cost_usd"));
assert!(names.contains(&"mcp_tool_calls"));
});
let got = wh
.block_on(query_agent_model_runs(&wh, &BakeoffSelector::Run("evolve-run".into())))
.unwrap();
assert_eq!(got.len(), 1);
let r = &got[0];
assert_eq!(r.agent, "claude");
assert_eq!(r.model, "opus");
assert!((r.cost_usd - 0.08).abs() < 1e-9, "evolved cost_usd reads back");
assert_eq!(r.mcp_tool_calls, 6, "evolved mcp_tool_calls reads back");
assert!(r.ok);
// A second append goes against the already-evolved table.
let rows2 = run_bakeoff_matrix(
&caller, "evolve-run-2", 888, "task-z", "do z",
&["claude".to_string()], &["opus".to_string()],
);
wh.block_on(append_agent_model_runs(&wh, &rows2)).unwrap();
let all = wh.block_on(query_agent_model_runs(&wh, &BakeoffSelector::All)).unwrap();
assert_eq!(all.len(), 2);
}
// Single-axis `run_bakeoff` rows carry the `AGENT_UNSET` agent and zero
// cost / MCP counts.
#[test]
fn legacy_single_axis_rows_default_agent_unset() {
let caller = MockCaller::new().with("m", ans("x", 100.0, 5, 10, 100.0, 0.9));
let rows = run_bakeoff(&caller, "r", 1, "p", "q", &["m".into()]);
assert_eq!(rows.len(), 1);
assert_eq!(rows[0].agent, AGENT_UNSET);
assert_eq!(rows[0].cost_usd, 0.0);
assert_eq!(rows[0].mcp_tool_calls, 0);
}
// End-to-end scoring: the caller leaves every score at 0.0, then the codegen
// judge scores the two answered cells against a scratch crate. The good
// answer gets 1.0, the non-compiling one 0.0, and the unanswered cell is not judged.
#[test]
fn matrix_scored_by_codegen_judge_lands_real_scores() {
use super::super::codegen_judge::{CodegenJudge, CodegenSpec};
let dir = tempfile::tempdir().unwrap();
let root = dir.path();
// Scratch crate whose `add` stub returns 0.
std::fs::write(
root.join("Cargo.toml"),
"[package]\nname=\"jcrate\"\nversion=\"0.0.0\"\nedition=\"2021\"\n\n[lib]\npath=\"src/lib.rs\"\n",
)
.unwrap();
std::fs::create_dir_all(root.join("src")).unwrap();
std::fs::write(root.join("src/lib.rs"), "pub fn add(_a:i32,_b:i32)->i32{0}\n").unwrap();
// Test module appended to both candidate answers.
let accept = "\n#[cfg(test)]\nmod t{use super::*;#[test]fn ok(){assert!(true);}#[test]fn tgt(){assert_eq!(add(2,2),4);}}\n";
let good = format!("pub fn add(a:i32,b:i32)->i32{{a+b}}{accept}");
// Returns a string literal from an `i32` function.
let bad = format!("pub fn add(_a:i32,_b:i32)->i32{{\"x\"}}{accept}");
let caller = MockCaller::new()
.with_cell("claude", "m", ans(&good, 1000.0, 10, 50, 50.0, 0.0))
.with_cell("local", "m", ans(&bad, 800.0, 10, 40, 50.0, 0.0));
// "kilo" has no canned answer, so its cell fails.
let agents = vec!["claude".to_string(), "kilo".to_string(), "local".to_string()];
let models = vec!["m".to_string()];
let mut rows = run_bakeoff_matrix(&caller, "jrun", 1, "task", "do it", &agents, &models);
assert_eq!(rows.len(), 3);
assert!(rows.iter().all(|r| r.score == 0.0), "caller left scores at 0.0");
let judge = CodegenJudge::new(CodegenSpec::write_file(root.to_path_buf(), "src/lib.rs"));
let reports = score_rows_with_judge(&mut rows, &judge);
assert_eq!(reports.len(), 2, "only the two OK cells are judged");
let claude = rows.iter().find(|r| r.agent == "claude").unwrap();
assert!(claude.ok);
assert_eq!(claude.score, 1.0, "GOOD answer → real 1.0 score (was 0.0)");
let local = rows.iter().find(|r| r.agent == "local").unwrap();
assert!(local.ok, "the cell answered (ok); it just doesn't compile");
assert_eq!(local.score, 0.0, "NON-COMPILING answer → 0.0");
let kilo = rows.iter().find(|r| r.agent == "kilo").unwrap();
assert!(!kilo.ok, "kilo never answered → red");
assert_eq!(kilo.score, 0.0);
let board = leaderboard(&rows);
assert_eq!((board[0].agent.as_str(), board[0].model.as_str()), ("claude", "m"));
assert_eq!(board[0].score, 1.0);
}
}