use std::collections::{BTreeMap, BTreeSet};
use chrono::{DateTime, NaiveDate, Utc};
use rust_decimal::Decimal;
use serde::{Deserialize, Serialize};
use crate::{
decimal_to_u64, CoreError, CostLane, EngineSnapshot, PricingCatalog, ProviderStatus,
TokenTotals,
};
/// Schema version this module accepts; `BenchmarkTable::validate` rejects any other value.
const BENCH_SCHEMA_VERSION: &str = "1";
/// Hedge text copied into the `BenchDisclaimer` of every view built by `bench_view`.
const DISCLAIMER_NOTE: &str = "~ cost-only comparison at equal token volume; not a quality claim.";
/// Meter names that `repriced_total` walks when re-pricing a token mix.
const METERS: [&str; 4] = ["input", "output", "cache_read", "cache_write"];
/// Returns the benchmark table JSON embedded at compile time from
/// `../bench/benchmarks.v1.json` (resolved relative to this source file).
fn bundled_benchmarks_json() -> &'static str {
include_str!("../bench/benchmarks.v1.json")
}
/// Deserialized root of the benchmark table JSON.
#[derive(Debug, Deserialize)]
struct BenchmarkTable {
/// Must equal `BENCH_SCHEMA_VERSION` for the table to pass `validate`.
schema_version: String,
/// Benchmarks in file order. Defaults to empty when the key is absent,
/// which `validate` then rejects.
#[serde(default)]
benchmarks: Vec<Benchmark>,
}
/// One benchmark entry as it appears in the table JSON.
#[derive(Debug, Deserialize)]
struct Benchmark {
/// Benchmark name; used in validation error messages and copied to `BenchFrontier::name`.
name: String,
/// Copied verbatim to `BenchFrontier::role`; not validated in this module.
role: String,
/// Provenance string; `validate` rejects it when blank after trimming.
source: String,
/// Publication date; `validate` requires it to parse as `%Y-%m-%d`.
as_of: String,
/// Optional harness description; absent key deserializes to `None`.
#[serde(default)]
harness: Option<String>,
/// Copied verbatim to `BenchFrontier::cost_note`.
cost_note: String,
/// Scored models; `validate` requires at least one and unique `model_id`s.
#[serde(default)]
points: Vec<BenchmarkPoint>,
}
/// One model's result on a benchmark as it appears in the table JSON.
#[derive(Debug, Deserialize)]
struct BenchmarkPoint {
/// Model identifier; must be unique within its benchmark.
model_id: String,
/// Display label copied to `FrontierPoint::label`.
label: String,
/// Score; `validate` requires it to lie in `0..=100`.
score_pct: Decimal,
/// Cost per task in USD when known; `validate` rejects negative values.
/// `None` makes the point `FrontierStanding::CostUnknown`.
#[serde(default)]
cost_per_task_usd: Option<Decimal>,
/// Optional free-form note copied to `FrontierPoint::note`.
#[serde(default)]
note: Option<String>,
}
impl BenchmarkTable {
fn bundled() -> Result<Self, CoreError> {
Self::from_json(bundled_benchmarks_json())
}
fn from_json(value: &str) -> Result<Self, CoreError> {
let table: BenchmarkTable = serde_json::from_str(value).map_err(|err| {
CoreError::BenchValidation(format!("benchmark JSON parse error: {err}"))
})?;
table.validate()?;
Ok(table)
}
fn validate(&self) -> Result<(), CoreError> {
if self.schema_version != BENCH_SCHEMA_VERSION {
return Err(CoreError::BenchValidation(format!(
"unsupported schema_version {}; expected {}",
self.schema_version, BENCH_SCHEMA_VERSION
)));
}
if self.benchmarks.is_empty() {
return Err(CoreError::BenchValidation(
"bundled benchmark table has no benchmarks".to_string(),
));
}
for benchmark in &self.benchmarks {
if benchmark.source.trim().is_empty() {
return Err(CoreError::BenchValidation(format!(
"benchmark {} has an empty source",
benchmark.name
)));
}
if NaiveDate::parse_from_str(benchmark.as_of.trim(), "%Y-%m-%d").is_err() {
return Err(CoreError::BenchValidation(format!(
"benchmark {} has an invalid as_of {:?}; expected YYYY-MM-DD",
benchmark.name, benchmark.as_of
)));
}
if benchmark.points.is_empty() {
return Err(CoreError::BenchValidation(format!(
"benchmark {} has no points",
benchmark.name
)));
}
let mut seen = BTreeSet::new();
for point in &benchmark.points {
if !seen.insert(point.model_id.as_str()) {
return Err(CoreError::BenchValidation(format!(
"benchmark {} has duplicate model_id {}",
benchmark.name, point.model_id
)));
}
if point.score_pct < Decimal::ZERO || point.score_pct > Decimal::from(100) {
return Err(CoreError::BenchValidation(format!(
"benchmark {} model {} score_pct {} is outside 0..=100",
benchmark.name, point.model_id, point.score_pct
)));
}
if matches!(point.cost_per_task_usd, Some(cost) if cost < Decimal::ZERO) {
return Err(CoreError::BenchValidation(format!(
"benchmark {} model {} has a negative cost",
benchmark.name, point.model_id
)));
}
}
}
Ok(())
}
}
/// One benchmark with each of its points annotated by frontier standing.
///
/// Every field except `points` is copied verbatim from the bundled table entry.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct BenchFrontier {
/// Benchmark name.
pub name: String,
/// Role string from the table.
pub role: String,
/// Provenance string from the table.
pub source: String,
/// Publication date string from the table.
pub as_of: String,
/// Optional harness description from the table.
pub harness: Option<String>,
/// Cost caveat from the table.
pub cost_note: String,
/// Points in table order, each with its computed standing.
pub points: Vec<FrontierPoint>,
}
/// One model's benchmark result plus its computed frontier standing.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct FrontierPoint {
/// Model identifier from the table.
pub model_id: String,
/// Display label from the table.
pub label: String,
/// Benchmark score from the table.
pub score_pct: Decimal,
/// Cost per task in USD, when the table provides one.
pub cost_per_task_usd: Option<Decimal>,
/// Standing relative to the other points of the same benchmark.
pub standing: FrontierStanding,
/// True when `PricingCatalog::model` finds an entry for `model_id`.
pub priced_in_catalog: bool,
/// Optional note from the table.
pub note: Option<String>,
}
/// Where a point sits on its benchmark's cost/score frontier.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum FrontierStanding {
/// No other costed point is at least as cheap and at least as good while
/// being strictly better on one axis.
OnFrontier,
/// Another costed point dominates this one.
Dominated {
/// `model_id` of the first dominating point in table order.
by: String,
},
/// The point has no `cost_per_task_usd`, so it is not compared on cost.
CostUnknown,
}
/// API-lane usage for one model, overlaid on the benchmark frontiers.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct OverlayModel {
/// Key from `PricingCatalog::resolve_key`, or the raw model string when
/// it does not resolve.
pub model_id: String,
/// Model string of the first API row folded into this entry.
pub raw_model: String,
/// Sum of `billed_cost` over the folded rows.
pub billed_cost: Decimal,
/// Token totals accumulated over the folded rows.
pub tokens: TokenTotals,
/// Every benchmark point whose `model_id` equals this model's.
pub appearances: Vec<OverlayAppearance>,
/// One delta per re-pricing target.
pub repricing: Vec<RepricingDelta>,
}
/// One benchmark point that matches an overlay model.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct OverlayAppearance {
/// Name of the benchmark the point belongs to.
pub benchmark_name: String,
/// The point's score.
pub score_pct: Decimal,
/// The point's frontier standing.
pub standing: FrontierStanding,
}
/// Cost difference from running the same token volume on a target model.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct RepricingDelta {
/// Model the usage is re-priced against.
pub target_model_id: String,
/// Display label of the target.
pub target_label: String,
/// Re-priced total minus billed cost when `status` is `Computed`;
/// zero for every other status.
pub delta_usd: Decimal,
/// How `delta_usd` was obtained; consult this before trusting a zero.
pub status: RepricingStatus,
/// Names of the benchmarks in which the target is on the frontier.
pub on_frontier_in: Vec<String>,
}
/// Outcome of one re-pricing attempt.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum RepricingStatus {
/// Every meter with non-zero volume had a target price; the delta is real.
Computed,
/// The target lacks a price for at least one meter with non-zero volume.
TargetRateGap,
/// The target is the model the usage was billed on.
SameModel,
}
/// Hedge text and pricing date attached to every benchmark view.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct BenchDisclaimer {
/// Fixed disclaimer text (`DISCLAIMER_NOTE`).
pub note: &'static str,
/// `as_of` value of the bundled pricing catalog.
pub pricing_as_of: String,
}
/// Full result of `bench_view`: frontiers plus the API-usage overlay.
#[derive(Debug, Clone, PartialEq, Serialize)]
pub struct BenchView {
/// Copied from the snapshot's `generated_at`.
pub generated_at: DateTime<Utc>,
/// One entry per bundled benchmark, in table order.
pub frontiers: Vec<BenchFrontier>,
/// One entry per model with API-lane usage, ordered by model key.
pub overlay: Vec<OverlayModel>,
/// True when the snapshot has no API-lane rows; `overlay` is then empty.
pub no_api_usage: bool,
/// Hedge text and pricing date.
pub disclaimer: BenchDisclaimer,
/// Cloned from the snapshot's `providers`.
pub providers: Vec<ProviderStatus>,
}
pub fn bench_view(snapshot: &EngineSnapshot) -> Result<BenchView, CoreError> {
let table = BenchmarkTable::bundled()?;
let pricing = PricingCatalog::bundled()?;
let frontiers = build_frontiers(&table, &pricing);
let disclaimer = BenchDisclaimer {
note: DISCLAIMER_NOTE,
pricing_as_of: pricing.as_of.clone(),
};
let mut accum: BTreeMap<String, OverlayAccum> = BTreeMap::new();
for row in &snapshot.focus_rows {
if CostLane::from_access_path(&row.x_access_path) != CostLane::Api {
continue;
}
let key = pricing
.resolve_key(&row.x_model)
.map(str::to_string)
.unwrap_or_else(|| row.x_model.clone());
let entry = accum.entry(key).or_insert_with(|| OverlayAccum {
raw_model: row.x_model.clone(),
billed_cost: Decimal::ZERO,
tokens: TokenTotals::default(),
});
entry.billed_cost += row.billed_cost;
entry
.tokens
.add(&row.x_token_type, decimal_to_u64(row.x_consumed_tokens));
}
if accum.is_empty() {
return Ok(BenchView {
generated_at: snapshot.generated_at,
frontiers,
overlay: Vec::new(),
no_api_usage: true,
disclaimer,
providers: snapshot.providers.clone(),
});
}
let targets = repricing_targets(&frontiers);
let overlay = accum
.into_iter()
.map(|(model_id, acc)| {
let appearances = frontier_appearances(&frontiers, &model_id);
let repricing =
repricing_for(&model_id, &acc.tokens, acc.billed_cost, &targets, &pricing);
OverlayModel {
model_id,
raw_model: acc.raw_model,
billed_cost: acc.billed_cost,
tokens: acc.tokens,
appearances,
repricing,
}
})
.collect();
Ok(BenchView {
generated_at: snapshot.generated_at,
frontiers,
overlay,
no_api_usage: false,
disclaimer,
providers: snapshot.providers.clone(),
})
}
/// Running totals for one model while `bench_view` folds API-lane rows.
struct OverlayAccum {
/// Model string of the first row that created this accumulator.
raw_model: String,
/// Sum of billed cost so far.
billed_cost: Decimal,
/// Token totals so far.
tokens: TokenTotals,
}
/// Converts every benchmark in `table` into a `BenchFrontier`, preserving
/// table order and annotating each point with its standing and whether
/// `pricing` has an entry for its model.
fn build_frontiers(table: &BenchmarkTable, pricing: &PricingCatalog) -> Vec<BenchFrontier> {
    let mut frontiers = Vec::with_capacity(table.benchmarks.len());
    for benchmark in &table.benchmarks {
        let mut annotated = Vec::with_capacity(benchmark.points.len());
        for (position, raw) in benchmark.points.iter().enumerate() {
            annotated.push(FrontierPoint {
                model_id: raw.model_id.clone(),
                label: raw.label.clone(),
                score_pct: raw.score_pct,
                cost_per_task_usd: raw.cost_per_task_usd,
                // Standing is judged against the sibling points of this benchmark only.
                standing: standing_for(raw, &benchmark.points, position),
                priced_in_catalog: pricing.model(&raw.model_id).is_some(),
                note: raw.note.clone(),
            });
        }
        frontiers.push(BenchFrontier {
            name: benchmark.name.clone(),
            role: benchmark.role.clone(),
            source: benchmark.source.clone(),
            as_of: benchmark.as_of.clone(),
            harness: benchmark.harness.clone(),
            cost_note: benchmark.cost_note.clone(),
            points: annotated,
        });
    }
    frontiers
}
/// Classifies `point` (located at `idx` in `points`) on the cost/score frontier.
///
/// A point without a cost is `CostUnknown`. Otherwise it is `Dominated` by
/// the first other costed point that is no more expensive, scores no lower,
/// and is strictly better on at least one of the two axes. Exact ties
/// therefore leave both points `OnFrontier`.
fn standing_for(point: &BenchmarkPoint, points: &[BenchmarkPoint], idx: usize) -> FrontierStanding {
    let Some(cost) = point.cost_per_task_usd else {
        return FrontierStanding::CostUnknown;
    };
    points
        .iter()
        .enumerate()
        .filter(|(other_idx, _)| *other_idx != idx)
        .find_map(|(_, rival)| {
            // Rivals without a cost can never dominate.
            let rival_cost = rival.cost_per_task_usd?;
            let no_worse = rival_cost <= cost && rival.score_pct >= point.score_pct;
            let better_somewhere = rival_cost < cost || rival.score_pct > point.score_pct;
            (no_worse && better_somewhere).then(|| FrontierStanding::Dominated {
                by: rival.model_id.clone(),
            })
        })
        .unwrap_or(FrontierStanding::OnFrontier)
}
/// A catalog-priced model that is on the frontier of at least one benchmark.
struct RepricingTarget {
/// Model identifier of the target.
model_id: String,
/// Label taken from the first frontier point seen for this model.
label: String,
/// Names of the benchmarks where the model is on the frontier.
on_frontier_in: Vec<String>,
}
/// Collects the re-pricing targets: every point that is both priced in the
/// catalog and on its benchmark's frontier, merged per `model_id`.
///
/// Targets come back ordered by `model_id`; each lists the benchmarks, in
/// frontier order, where it is on the frontier.
fn repricing_targets(frontiers: &[BenchFrontier]) -> Vec<RepricingTarget> {
    let eligible = frontiers.iter().flat_map(|frontier| {
        frontier
            .points
            .iter()
            .filter(|point| {
                point.priced_in_catalog && point.standing == FrontierStanding::OnFrontier
            })
            .map(move |point| (frontier, point))
    });

    let mut merged: BTreeMap<String, RepricingTarget> = BTreeMap::new();
    for (frontier, point) in eligible {
        let target = merged
            .entry(point.model_id.clone())
            .or_insert_with(|| RepricingTarget {
                model_id: point.model_id.clone(),
                label: point.label.clone(),
                on_frontier_in: Vec::new(),
            });
        target.on_frontier_in.push(frontier.name.clone());
    }
    merged.into_values().collect()
}
/// Lists every benchmark point whose `model_id` equals `model_id`, in
/// frontier order and then point order.
fn frontier_appearances(frontiers: &[BenchFrontier], model_id: &str) -> Vec<OverlayAppearance> {
    let mut appearances = Vec::new();
    for frontier in frontiers {
        for point in &frontier.points {
            if point.model_id != model_id {
                continue;
            }
            appearances.push(OverlayAppearance {
                benchmark_name: frontier.name.clone(),
                score_pct: point.score_pct,
                standing: point.standing.clone(),
            });
        }
    }
    appearances
}
/// Produces one `RepricingDelta` per target for a model's token mix.
///
/// The model itself yields `SameModel` with a zero delta. A target missing a
/// price for a used meter yields `TargetRateGap` with a zero delta. Otherwise
/// the delta is the re-priced total minus `billed_cost`.
fn repricing_for(
    model_id: &str,
    tokens: &TokenTotals,
    billed_cost: Decimal,
    targets: &[RepricingTarget],
    pricing: &PricingCatalog,
) -> Vec<RepricingDelta> {
    let mut deltas = Vec::with_capacity(targets.len());
    for target in targets {
        let (delta_usd, status) = if target.model_id == model_id {
            (Decimal::ZERO, RepricingStatus::SameModel)
        } else if let Some(repriced) = repriced_total(tokens, &target.model_id, pricing) {
            (repriced - billed_cost, RepricingStatus::Computed)
        } else {
            (Decimal::ZERO, RepricingStatus::TargetRateGap)
        };
        deltas.push(RepricingDelta {
            target_model_id: target.model_id.clone(),
            target_label: target.label.clone(),
            delta_usd,
            status,
            on_frontier_in: target.on_frontier_in.clone(),
        });
    }
    deltas
}
/// Prices `tokens` at `target`'s catalog rates, treating each price as a
/// per-million-token rate.
///
/// Meters with zero volume are skipped without a price lookup. Returns
/// `None` as soon as a meter with non-zero volume has no price for `target`.
fn repriced_total(tokens: &TokenTotals, target: &str, pricing: &PricingCatalog) -> Option<Decimal> {
    let million = Decimal::from(1_000_000_u64);
    METERS
        .iter()
        .copied()
        .try_fold(Decimal::ZERO, |running, meter| {
            let volume = meter_volume(tokens, meter);
            if volume == 0 {
                return Some(running);
            }
            let price = pricing.meter_price(target, meter)?;
            Some(running + Decimal::from(volume) * price / million)
        })
}
/// Returns the token count recorded for `meter`, or 0 for a meter name this
/// module does not know.
fn meter_volume(tokens: &TokenTotals, meter: &str) -> u64 {
    let known = [
        ("input", tokens.input),
        ("output", tokens.output),
        ("cache_read", tokens.cache_read),
        ("cache_write", tokens.cache_write),
    ];
    known
        .iter()
        .find(|(name, _)| *name == meter)
        .map_or(0, |(_, volume)| *volume)
}
// Unit tests for table validation, frontier standing, and the API-usage overlay.
// Several assertions are pinned to the contents of the bundled benchmark and
// pricing data, so they need updating whenever that data changes.
#[cfg(test)]
mod tests {
use super::*;
use crate::focus_records_from_usage;
use chrono::TimeZone;
use costroid_providers::{AccessPath, ProviderId, UsageEvent};
// Fixed timestamp (2026-01-07 12:00:00 UTC) shared by every fixture.
fn ts() -> DateTime<Utc> {
match Utc.with_ymd_and_hms(2026, 1, 7, 12, 0, 0) {
chrono::LocalResult::Single(value) => value,
_ => panic!("fixed test timestamp should be valid"),
}
}
// Builds a Codex usage event with the given model, access path, and
// input/output token counts; both cache counters start at zero.
fn event(model: &str, access: AccessPath, input: u64, output: u64) -> UsageEvent {
UsageEvent {
tool: ProviderId::Codex,
model: model.to_string(),
timestamp: ts(),
input_tokens: input,
output_tokens: output,
cache_read_tokens: 0,
cache_write_tokens: 0,
project: Some("/work/proj".to_string()),
access_path: access,
}
}
// Converts `events` with `focus_records_from_usage` (panicking on error) and
// wraps the rows in a snapshot whose other collections are empty.
fn snapshot(events: &[UsageEvent]) -> EngineSnapshot {
let focus_rows = match focus_records_from_usage(events) {
Ok(rows) => rows,
Err(err) => panic!("events should price: {err}"),
};
EngineSnapshot {
generated_at: ts(),
usage_events: Vec::new(),
focus_rows,
limit_windows: Vec::new(),
providers: Vec::new(),
}
}
// Looks up a frontier by benchmark name, panicking when it is missing.
fn frontier<'a>(view: &'a BenchView, name: &str) -> &'a BenchFrontier {
match view.frontiers.iter().find(|f| f.name == name) {
Some(f) => f,
None => panic!("benchmark {name} should be present"),
}
}
// Looks up a point by model id within a frontier, panicking when it is missing.
fn point<'a>(frontier: &'a BenchFrontier, model_id: &str) -> &'a FrontierPoint {
match frontier.points.iter().find(|p| p.model_id == model_id) {
Some(p) => p,
None => panic!("point {model_id} should be present on {}", frontier.name),
}
}
// Builds a table point with integer score/cost, label equal to the model id,
// and no note.
fn benchmark_point(model_id: &str, score_pct: i64, cost: Option<i64>) -> BenchmarkPoint {
BenchmarkPoint {
model_id: model_id.to_string(),
label: model_id.to_string(),
score_pct: Decimal::from(score_pct),
cost_per_task_usd: cost.map(Decimal::from),
note: None,
}
}
// The bundled table must validate and contain exactly the two expected
// benchmarks with their expected as_of dates.
#[test]
fn bundled_benchmarks_parse_and_validate() {
let table = match BenchmarkTable::bundled() {
Ok(table) => table,
Err(err) => panic!("bundled benchmarks should validate: {err}"),
};
assert_eq!(table.benchmarks.len(), 2);
assert_eq!(table.benchmarks[0].name, "DeepSWE");
assert_eq!(table.benchmarks[0].as_of, "2026-05-30");
assert_eq!(table.benchmarks[1].name, "CursorBench v3.1");
assert_eq!(table.benchmarks[1].as_of, "2026-05-18");
}
// Placeholder, empty, out-of-range, and free-form as_of values must all be
// rejected with BenchValidation; a well-formed date must be accepted.
#[test]
fn as_of_guard_is_fail_closed() {
let body = |as_of: &str| {
format!(
r#"{{"schema_version":"1","benchmarks":[{{"name":"X","role":"primary","source":"https://x","as_of":"{as_of}","cost_note":"n","points":[{{"model_id":"gpt-5.5","label":"g","score_pct":"70.0","cost_per_task_usd":"1.0"}}]}}]}}"#
)
};
for bad in ["FILL_ME", "", "2026-13-99", "May 30 2026"] {
match BenchmarkTable::from_json(&body(bad)) {
Err(CoreError::BenchValidation(_)) => {}
other => panic!("as_of {bad:?} should be rejected, got {other:?}"),
}
}
assert!(BenchmarkTable::from_json(&body("2026-05-30")).is_ok());
}
// Pins the DeepSWE standings in the bundled data: claude-opus-4-7 is
// dominated by gpt-5.5, while gpt-5.5 and claude-sonnet-4-6 are on the frontier.
#[test]
fn deepswe_opus47_is_dominated() {
let view = match bench_view(&snapshot(&[])) {
Ok(view) => view,
Err(err) => panic!("bench_view should build: {err}"),
};
let deepswe = frontier(&view, "DeepSWE");
assert_eq!(
point(deepswe, "claude-opus-4-7").standing,
FrontierStanding::Dominated {
by: "gpt-5.5".to_string()
}
);
assert_eq!(
point(deepswe, "gpt-5.5").standing,
FrontierStanding::OnFrontier
);
assert_eq!(
point(deepswe, "claude-sonnet-4-6").standing,
FrontierStanding::OnFrontier
);
assert!(point(deepswe, "claude-sonnet-4-6")
.cost_per_task_usd
.is_some());
}
// A point without a cost is CostUnknown and does not affect the standing of
// costed points.
#[test]
fn cost_unknown_point_is_score_only() {
let points = vec![
benchmark_point("gpt-5.5", 70, Some(6)),
benchmark_point("mystery", 40, None),
];
assert_eq!(
standing_for(&points[1], &points, 1),
FrontierStanding::CostUnknown
);
assert_eq!(
standing_for(&points[0], &points, 0),
FrontierStanding::OnFrontier
);
}
// Exact ties leave both points on the frontier; equal cost with a lower
// score, or equal score with a higher cost, is dominated.
#[test]
fn dominance_tie_handling() {
let tied = vec![
benchmark_point("a", 50, Some(5)),
benchmark_point("b", 50, Some(5)),
];
assert_eq!(
standing_for(&tied[0], &tied, 0),
FrontierStanding::OnFrontier
);
assert_eq!(
standing_for(&tied[1], &tied, 1),
FrontierStanding::OnFrontier
);
let same_cost = vec![
benchmark_point("hi", 60, Some(5)),
benchmark_point("lo", 50, Some(5)),
];
assert_eq!(
standing_for(&same_cost[1], &same_cost, 1),
FrontierStanding::Dominated {
by: "hi".to_string()
}
);
let same_score = vec![
benchmark_point("cheap", 50, Some(3)),
benchmark_point("dear", 50, Some(8)),
];
assert_eq!(
standing_for(&same_score[1], &same_score, 1),
FrontierStanding::Dominated {
by: "cheap".to_string()
}
);
}
// Only the API event contributes to the overlay; the subscription event with
// the same model is ignored. Expects 1M input tokens to bill 5.00.
#[test]
fn api_rows_only_excludes_subscription() {
let view = match bench_view(&snapshot(&[
event("gpt-5.5", AccessPath::Api, 1_000_000, 0),
event("gpt-5.5", AccessPath::Subscription, 1_000_000, 0),
])) {
Ok(view) => view,
Err(err) => panic!("bench_view should build: {err}"),
};
assert!(!view.no_api_usage);
assert_eq!(view.overlay.len(), 1);
assert_eq!(view.overlay[0].tokens.input, 1_000_000);
assert_eq!(view.overlay[0].billed_cost, Decimal::new(500, 2));
}
// The summed overlay billed cost must equal the API-lane total reported by
// `now_summary` for the same snapshot.
#[test]
fn overlay_api_total_reconciles_with_now_summary() {
let snap = snapshot(&[
event("gpt-5.5", AccessPath::Api, 1_000_000, 500_000),
event("claude-opus-4-7", AccessPath::Api, 200_000, 0),
event("gpt-5.5", AccessPath::Subscription, 999_999, 0),
]);
let view = match bench_view(&snap) {
Ok(view) => view,
Err(err) => panic!("bench_view should build: {err}"),
};
let overlay_total: Decimal = view.overlay.iter().map(|m| m.billed_cost).sum();
let now = crate::now_summary(&snap, crate::NowOptions::default());
let now_api_total: Decimal = now
.current_costs
.iter()
.filter(|c| c.lane == CostLane::Api)
.map(|c| c.totals.billed_cost)
.sum();
assert_eq!(overlay_total, now_api_total);
}
// Subscription-only usage yields no overlay and sets no_api_usage, while the
// frontiers are still returned.
#[test]
fn no_api_usage_zero_delta_reference() {
let view = match bench_view(&snapshot(&[event(
"gpt-5.5",
AccessPath::Subscription,
1_000_000,
0,
)])) {
Ok(view) => view,
Err(err) => panic!("bench_view should build: {err}"),
};
assert!(view.no_api_usage);
assert!(view.overlay.is_empty());
assert_eq!(view.frontiers.len(), 2);
}
// delta + billed equals the re-priced total, so this pins the gpt-5.5 price of
// 1M input + 500k output tokens at 20.00; the model's own entry is SameModel.
#[test]
fn repricing_delta_on_known_volume() {
let view = match bench_view(&snapshot(&[event(
"claude-opus-4-7",
AccessPath::Api,
1_000_000,
500_000,
)])) {
Ok(view) => view,
Err(err) => panic!("bench_view should build: {err}"),
};
let opus = &view.overlay[0];
let gpt = match opus
.repricing
.iter()
.find(|d| d.target_model_id == "gpt-5.5")
{
Some(delta) => delta,
None => panic!("gpt-5.5 should be a re-pricing target"),
};
assert_eq!(gpt.status, RepricingStatus::Computed);
assert_eq!(gpt.delta_usd + opus.billed_cost, Decimal::new(2000, 2));
let self_delta = opus
.repricing
.iter()
.find(|d| d.target_model_id == "claude-opus-4-7");
assert_eq!(
self_delta.map(|d| d.status),
Some(RepricingStatus::SameModel)
);
}
// composer-2.5 is listed on CursorBench but is not priced in the catalog, so
// it must never show up as a re-pricing target.
#[test]
fn composer_is_a_gap_not_a_target() {
let view = match bench_view(&snapshot(&[event(
"claude-opus-4-7",
AccessPath::Api,
10,
0,
)])) {
Ok(view) => view,
Err(err) => panic!("bench_view should build: {err}"),
};
let cursorbench = frontier(&view, "CursorBench v3.1");
let composer = point(cursorbench, "composer-2.5");
assert!(!composer.priced_in_catalog);
assert_eq!(
composer.note.as_deref(),
Some("Cursor subscription only - no API access")
);
for overlay in &view.overlay {
assert!(
overlay
.repricing
.iter()
.all(|d| d.target_model_id != "composer-2.5"),
"composer-2.5 must never be a re-pricing target"
);
}
}
// A model with API usage that is absent from every benchmark still gets an
// overlay entry, with no appearances.
#[test]
fn missing_model_is_a_gap() {
let view = match bench_view(&snapshot(&[event(
"claude-haiku-4-5",
AccessPath::Api,
10,
0,
)])) {
Ok(view) => view,
Err(err) => panic!("bench_view should build: {err}"),
};
let haiku = match view
.overlay
.iter()
.find(|m| m.model_id == "claude-haiku-4-5")
{
Some(model) => model,
None => panic!("haiku should be in the overlay"),
};
assert!(haiku.appearances.is_empty());
}
// Cache-write-only usage re-priced on gpt-5.5 must report TargetRateGap
// (presumably the catalog has no cache_write price for gpt-5.5 - verify
// against the bundled pricing data).
#[test]
fn repricing_skips_target_rate_gap() {
let mut cache_write_event = event("claude-opus-4-7", AccessPath::Api, 0, 0);
cache_write_event.cache_write_tokens = 1_000_000;
let view = match bench_view(&snapshot(&[cache_write_event])) {
Ok(view) => view,
Err(err) => panic!("bench_view should build: {err}"),
};
let opus = &view.overlay[0];
let gpt = match opus
.repricing
.iter()
.find(|d| d.target_model_id == "gpt-5.5")
{
Some(delta) => delta,
None => panic!("gpt-5.5 should appear as a target"),
};
assert_eq!(gpt.status, RepricingStatus::TargetRateGap);
}
// The disclaimer must start with the '~' hedge, state that it is not a quality
// claim, and carry a non-empty pricing date.
#[test]
fn disclaimer_carries_hedge_and_pricing_date() {
let view = match bench_view(&snapshot(&[])) {
Ok(view) => view,
Err(err) => panic!("bench_view should build: {err}"),
};
assert!(view.disclaimer.note.starts_with('~'));
assert!(view.disclaimer.note.contains("not a quality claim"));
assert!(!view.disclaimer.pricing_as_of.is_empty());
}
}