use std::collections::HashMap;
use std::fmt;
/// Capability areas in which a model can be given a benchmark score.
///
/// The type is `Copy`, comparable for equality, and hashable, so it can be
/// used directly as a `HashMap` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BenchmarkCategory {
    Reasoning,
    Coding,
    Knowledge,
    Instruction,
    Multilingual,
    Safety,
    /// Rendered by `Display` as `"Long Context"` (with a space).
    LongContext,
    Vision,
    Agentic,
}

impl fmt::Display for BenchmarkCategory {
    /// Writes the human-readable label of the category.
    ///
    /// Every label equals the variant name except `LongContext`, which is
    /// written as `"Long Context"`. The label is written as-is; width and
    /// alignment flags on the formatter are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Self::Reasoning => "Reasoning",
            Self::Coding => "Coding",
            Self::Knowledge => "Knowledge",
            Self::Instruction => "Instruction",
            Self::Multilingual => "Multilingual",
            Self::Safety => "Safety",
            Self::LongContext => "Long Context",
            Self::Vision => "Vision",
            Self::Agentic => "Agentic",
        };
        f.write_str(label)
    }
}
/// Letter grade assigned to a model.
///
/// The derived ordering follows declaration order, so `S` is the *smallest*
/// value and `D` the largest: sorting ascending lists `S` first.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum ModelTier {
    S,
    A,
    B,
    C,
    D,
}

impl fmt::Display for ModelTier {
    /// Writes the tier as `"<letter>-Tier"`, e.g. `"S-Tier"`.
    ///
    /// The label is written as-is; width and alignment flags on the
    /// formatter are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match self {
            Self::S => "S-Tier",
            Self::A => "A-Tier",
            Self::B => "B-Tier",
            Self::C => "C-Tier",
            Self::D => "D-Tier",
        })
    }
}
/// A model description: identity, optional context window, per-category
/// benchmark scores, and optional token pricing.
#[derive(Debug, Clone)]
pub struct Model {
    /// Model name; printed first by the `Display` implementation.
    pub name: String,
    /// Provider name; printed in parentheses by the `Display` implementation.
    pub provider: String,
    /// Context window size in tokens, or `None` when not set.
    pub context_window: Option<u64>,
    /// Score per benchmark category.
    ///
    /// Values added through [`Model::with_score`] are clamped to
    /// `0.0..=100.0`; writes made directly to this public field are not
    /// validated.
    pub scores: HashMap<BenchmarkCategory, f64>,
    /// Price for one million input tokens, or `None` when not set.
    pub input_price_per_mtok: Option<f64>,
    /// Price for one million output tokens, or `None` when not set.
    pub output_price_per_mtok: Option<f64>,
}
impl Model {
    /// Creates a model with the given name and provider, no scores, no
    /// pricing, and no context window.
    pub fn new(name: &str, provider: &str) -> Self {
        Self {
            name: name.to_owned(),
            provider: provider.to_owned(),
            context_window: None,
            scores: HashMap::new(),
            input_price_per_mtok: None,
            output_price_per_mtok: None,
        }
    }

    /// Sets the context window size, in tokens.
    pub fn with_context_window(mut self, tokens: u64) -> Self {
        self.context_window = Some(tokens);
        self
    }

    /// Records `score` for `category`, clamped to `0.0..=100.0`.
    ///
    /// A later call for the same category replaces the earlier score. A NaN
    /// `score` is ignored and leaves any existing score untouched.
    pub fn with_score(mut self, category: BenchmarkCategory, score: f64) -> Self {
        // `f64::clamp` returns NaN for a NaN input, which would then poison the
        // average, the tier, and every comparison. Treat NaN as "no score".
        if !score.is_nan() {
            self.scores.insert(category, score.clamp(0.0, 100.0));
        }
        self
    }

    /// Sets the input and output prices, each per one million tokens.
    pub fn with_pricing(mut self, input_per_mtok: f64, output_per_mtok: f64) -> Self {
        self.input_price_per_mtok = Some(input_per_mtok);
        self.output_price_per_mtok = Some(output_per_mtok);
        self
    }

    /// Returns the arithmetic mean of all recorded scores, or `None` when the
    /// model has no scores.
    pub fn average_score(&self) -> Option<f64> {
        if self.scores.is_empty() {
            return None;
        }
        // `HashMap` iteration order is unspecified and can differ between maps
        // that hold the same entries, and floating-point addition is not
        // associative. Summing in a fixed (sorted) order makes the average, and
        // therefore the tier at a threshold, identical for equal score sets.
        let mut values: Vec<f64> = self.scores.values().copied().collect();
        values.sort_by(f64::total_cmp);
        let total: f64 = values.iter().sum();
        Some(total / values.len() as f64)
    }

    /// Classifies the model by its average score.
    ///
    /// Thresholds: `>= 90` is `S`, `>= 80` is `A`, `>= 70` is `B`, `>= 60` is
    /// `C`, anything else (including a NaN average) is `D`. Returns `None`
    /// when the model has no scores.
    pub fn tier(&self) -> Option<ModelTier> {
        self.average_score().map(|avg| match avg {
            a if a >= 90.0 => ModelTier::S,
            a if a >= 80.0 => ModelTier::A,
            a if a >= 70.0 => ModelTier::B,
            a if a >= 60.0 => ModelTier::C,
            _ => ModelTier::D,
        })
    }

    /// Returns the recorded score for `category`, if any.
    pub fn score(&self, category: BenchmarkCategory) -> Option<f64> {
        self.scores.get(&category).copied()
    }

    /// Returns `true` only when both the input and the output price are set.
    pub fn has_pricing(&self) -> bool {
        self.input_price_per_mtok.is_some() && self.output_price_per_mtok.is_some()
    }

    /// Returns the average score divided by the sum of the input and output
    /// prices (higher means more score per unit of price).
    ///
    /// Returns `None` when the model has no scores, when either price is
    /// missing, when the summed price is not a finite positive number, or
    /// when the resulting ratio is not finite.
    pub fn value_score(&self) -> Option<f64> {
        let avg = self.average_score()?;
        let combined_price = self.input_price_per_mtok? + self.output_price_per_mtok?;
        // A NaN price would slip past a plain `<= 0.0` check, so finiteness is
        // tested explicitly.
        if !combined_price.is_finite() || combined_price <= 0.0 {
            return None;
        }
        let value = avg / combined_price;
        if value.is_finite() {
            Some(value)
        } else {
            None
        }
    }
}
impl fmt::Display for Model {
    /// Writes `"<name> (<provider>)"`, followed by `" [<tier>]"` when the
    /// model has a tier (i.e. at least one score).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{} ({})", self.name, self.provider)?;
        match self.tier() {
            Some(tier) => write!(f, " [{tier}]"),
            None => Ok(()),
        }
    }
}
/// Outcome of comparing two models category by category.
///
/// The field descriptions below reflect how `compare_models` fills them in;
/// all fields are public and can also be set directly.
#[derive(Debug, Clone)]
pub struct ComparisonResult {
    /// Copy of the first model of the comparison.
    pub model_a: Model,
    /// Copy of the second model of the comparison.
    pub model_b: Model,
    /// Score of `model_a` minus score of `model_b`, for each category that
    /// both models have a score for.
    pub deltas: HashMap<BenchmarkCategory, f64>,
    /// Categories in which `model_a` scored higher.
    pub a_wins: Vec<BenchmarkCategory>,
    /// Categories in which `model_b` scored higher.
    pub b_wins: Vec<BenchmarkCategory>,
    /// Categories in which the two scores were considered equal.
    pub ties: Vec<BenchmarkCategory>,
}
impl ComparisonResult {
    /// Returns the model with the higher overall average score.
    ///
    /// The average is taken over *all* of a model's scores, not only the
    /// categories both models share. A model without scores counts as `0.0`.
    /// `model_b` is returned only when its average is strictly greater;
    /// otherwise (including equal averages) `model_a` is returned.
    pub fn winner(&self) -> &Model {
        let mean_of = |model: &Model| model.average_score().unwrap_or(0.0);
        let mean_a = mean_of(&self.model_a);
        let mean_b = mean_of(&self.model_b);
        if mean_b > mean_a {
            return &self.model_b;
        }
        &self.model_a
    }

    /// Returns the number of categories that have a recorded delta.
    pub fn categories_compared(&self) -> usize {
        self.deltas.len()
    }

    /// Returns the recorded delta for `category`, or `None` when there is
    /// no entry for it.
    pub fn delta(&self, category: BenchmarkCategory) -> Option<f64> {
        self.deltas.get(&category).copied()
    }
}
impl fmt::Display for ComparisonResult {
    /// Writes a five-line summary: the matchup, each model's number of
    /// category wins, the number of ties, and the overall winner's name.
    /// The last line has no trailing newline.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "{} vs {}", self.model_a.name, self.model_b.name)?;
        let sides = [
            (&self.model_a, &self.a_wins),
            (&self.model_b, &self.b_wins),
        ];
        for (model, wins) in sides {
            writeln!(f, " {} wins: {} categories", model.name, wins.len())?;
        }
        writeln!(f, " Ties: {}", self.ties.len())?;
        write!(f, " Overall winner: {}", self.winner().name)
    }
}
pub fn compare_models(a: &Model, b: &Model) -> ComparisonResult {
let mut deltas = HashMap::new();
let mut a_wins = Vec::new();
let mut b_wins = Vec::new();
let mut ties = Vec::new();
for (category, &score_a) in &a.scores {
if let Some(&score_b) = b.scores.get(category) {
let delta = score_a - score_b;
deltas.insert(*category, delta);
if (delta).abs() < f64::EPSILON {
ties.push(*category);
} else if delta > 0.0 {
a_wins.push(*category);
} else {
b_wins.push(*category);
}
}
}
ComparisonResult {
model_a: a.clone(),
model_b: b.clone(),
deltas,
a_wins,
b_wins,
ties,
}
}
/// Estimates the cost of a request with the given token counts.
///
/// Each token count is converted to millions of tokens and multiplied by
/// the matching per-million price; the two products are added. Returns
/// `None` when the model lacks either the input or the output price.
pub fn estimate_cost(model: &Model, input_tokens: u64, output_tokens: u64) -> Option<f64> {
    match (model.input_price_per_mtok, model.output_price_per_mtok) {
        (Some(input_price), Some(output_price)) => {
            let millions = |tokens: u64| tokens as f64 / 1_000_000.0;
            let input_cost = millions(input_tokens) * input_price;
            let output_cost = millions(output_tokens) * output_price;
            Some(input_cost + output_cost)
        }
        _ => None,
    }
}
/// Ranks models by their score in `category`, highest score first.
///
/// Models without a score for `category` are omitted. Models with equal
/// scores keep their relative input order (the sort is stable). NaN scores
/// are placed after all numeric scores.
pub fn rank_by_category(models: &[Model], category: BenchmarkCategory) -> Vec<(&Model, f64)> {
    use std::cmp::Ordering;

    let mut ranked: Vec<(&Model, f64)> = models
        .iter()
        .filter_map(|m| m.score(category).map(|s| (m, s)))
        .collect();
    // `partial_cmp(..).unwrap_or(Equal)` alone is not a total order once a NaN
    // is present (NaN would equal everything), and `sort_by` may panic or
    // return an unspecified order for such comparators. Ordering NaN
    // explicitly keeps the comparator total.
    ranked.sort_by(|a, b| match (a.1.is_nan(), b.1.is_nan()) {
        (false, false) => b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal),
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
    });
    ranked
}
/// Returns references to the models whose tier equals `tier`, in input
/// order. Models without a tier (no scores) never match.
pub fn filter_by_tier(models: &[Model], tier: ModelTier) -> Vec<&Model> {
    let wanted = Some(tier);
    let mut matching = Vec::new();
    for model in models {
        if model.tier() == wanted {
            matching.push(model);
        }
    }
    matching
}
/// Returns the model with the highest value score.
///
/// Models whose `value_score()` is `None` or NaN are skipped. When several
/// models share the highest value, the last of them in `models` is returned.
/// Returns `None` when no model has a usable value score.
pub fn best_value(models: &[Model]) -> Option<&Model> {
    models
        .iter()
        .filter_map(|m| {
            m.value_score()
                // A NaN would compare as equal to every other value below and
                // could displace the real maximum, so drop it up front.
                .filter(|v| !v.is_nan())
                .map(|v| (m, v))
        })
        .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
        .map(|(m, _)| m)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a model from "TestProvider" with Reasoning and Coding scores,
    /// 3.0 / 15.0 pricing, and a 128 000-token context window.
    fn sample_model(name: &str, reasoning: f64, coding: f64) -> Model {
        let priced = Model::new(name, "TestProvider")
            .with_context_window(128_000)
            .with_pricing(3.0, 15.0);
        priced
            .with_score(BenchmarkCategory::Reasoning, reasoning)
            .with_score(BenchmarkCategory::Coding, coding)
    }

    // The mean of 90 and 80 is 85.
    #[test]
    fn test_average_score() {
        let average = sample_model("test", 90.0, 80.0)
            .average_score()
            .expect("sample model has scores");
        assert!((average - 85.0).abs() < f64::EPSILON);
    }

    // One representative model per tier band.
    #[test]
    fn test_tier_classification() {
        let cases = [
            ("s", 95.0, 92.0, ModelTier::S),
            ("a", 85.0, 82.0, ModelTier::A),
            ("b", 75.0, 72.0, ModelTier::B),
            ("c", 65.0, 62.0, ModelTier::C),
            ("d", 55.0, 52.0, ModelTier::D),
        ];
        for (name, reasoning, coding, expected) in cases {
            assert_eq!(sample_model(name, reasoning, coding).tier(), Some(expected));
        }
    }

    // A model without scores has neither an average nor a tier.
    #[test]
    fn test_empty_scores() {
        let unscored = Model::new("empty", "None");
        assert!(unscored.average_score().is_none());
        assert!(unscored.tier().is_none());
    }

    // Each model wins one category; beta has the higher average overall.
    #[test]
    fn test_compare_models() {
        let alpha = sample_model("alpha", 90.0, 80.0);
        let beta = sample_model("beta", 85.0, 95.0);
        let outcome = compare_models(&alpha, &beta);
        assert_eq!(outcome.winner().name, "beta");
        assert_eq!(outcome.a_wins.len(), 1);
        assert_eq!(outcome.b_wins.len(), 1);
    }

    // 1M input tokens at 2.50 plus 0.5M output tokens at 10.00 costs 7.50.
    #[test]
    fn test_estimate_cost() {
        let priced = Model::new("test", "Provider").with_pricing(2.50, 10.00);
        let cost = estimate_cost(&priced, 1_000_000, 500_000).unwrap();
        assert!((cost - 7.5).abs() < 0.01);
    }

    // Without pricing there is no estimate.
    #[test]
    fn test_estimate_cost_no_pricing() {
        let unpriced = Model::new("test", "Provider");
        assert!(estimate_cost(&unpriced, 1000, 500).is_none());
    }

    // Ranking by Coding puts the highest score first and the lowest last.
    #[test]
    fn test_rank_by_category() {
        let models = vec![
            sample_model("low", 70.0, 65.0),
            sample_model("high", 95.0, 98.0),
            sample_model("mid", 82.0, 80.0),
        ];
        let ranked = rank_by_category(&models, BenchmarkCategory::Coding);
        let names: Vec<&str> = ranked.iter().map(|(m, _)| m.name.as_str()).collect();
        assert_eq!(names[0], "high");
        assert_eq!(names[2], "low");
    }

    // Two of the three sample models average 90 or more.
    #[test]
    fn test_filter_by_tier() {
        let models = vec![
            sample_model("s1", 95.0, 92.0),
            sample_model("a1", 85.0, 82.0),
            sample_model("s2", 91.0, 93.0),
        ];
        let s_tier = filter_by_tier(&models, ModelTier::S);
        assert_eq!(s_tier.len(), 2);
    }

    // A slightly weaker but far cheaper model offers the better value.
    #[test]
    fn test_best_value() {
        let cheap = Model::new("cheap", "Budget")
            .with_score(BenchmarkCategory::Reasoning, 80.0)
            .with_pricing(0.10, 0.30);
        let expensive = Model::new("expensive", "Premium")
            .with_score(BenchmarkCategory::Reasoning, 95.0)
            .with_pricing(15.0, 60.0);
        let models = vec![cheap, expensive];
        let best = best_value(&models).unwrap();
        assert_eq!(best.name, "cheap");
    }

    // 80 / (5 + 15) = 4.
    #[test]
    fn test_value_score() {
        let value = Model::new("test", "P")
            .with_score(BenchmarkCategory::Reasoning, 80.0)
            .with_pricing(5.0, 15.0)
            .value_score()
            .unwrap();
        assert!((value - 4.0).abs() < 0.01);
    }

    // Out-of-range scores are clamped to 0..=100.
    #[test]
    fn test_score_clamping() {
        let clamped = Model::new("test", "P")
            .with_score(BenchmarkCategory::Reasoning, 150.0)
            .with_score(BenchmarkCategory::Coding, -10.0);
        assert_eq!(clamped.score(BenchmarkCategory::Reasoning), Some(100.0));
        assert_eq!(clamped.score(BenchmarkCategory::Coding), Some(0.0));
    }

    // The rendered model contains its name and its tier label.
    #[test]
    fn test_model_display() {
        let rendered = sample_model("gpt-4o", 92.0, 91.0).to_string();
        assert!(rendered.contains("gpt-4o"));
        assert!(rendered.contains("S-Tier"));
    }
}