pub mod pareto;
pub mod py2rs;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::Duration;
/// Describes an evaluation task: an identifier, a description, a set of
/// examples, and the turn limits to apply while running it.
///
/// Implementors must be `Send + Sync`.
pub trait EvalTask: Send + Sync {
/// Returns the identifier of this task.
fn id(&self) -> &str;
/// Returns the description of this task.
fn description(&self) -> &str;
/// Returns the examples belonging to this task.
fn examples(&self) -> &[Example];
/// Turn limit for the task; the default implementation returns 5.
// NOTE(review): presumably the limit applies per example — confirm against the runner.
fn max_turns(&self) -> u32 {
5
}
/// Timeout associated with a turn; the default implementation returns 60 seconds.
fn turn_timeout(&self) -> Duration {
Duration::from_secs(60)
}
}
/// A single evaluation example: an input, the expected output, and metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Example {
/// Identifier of this example.
pub id: String,
/// Input text of the example.
pub input: String,
/// Expected output text.
pub expected: String,
/// Difficulty rating; [`Example::new`] sets it to [`Difficulty::Medium`].
pub difficulty: Difficulty,
/// Tags attached to the example; empty when built with [`Example::new`].
pub tags: Vec<String>,
}
impl Example {
    /// Builds an example from its id, input and expected output.
    ///
    /// The difficulty starts as [`Difficulty::Medium`] and the tag list starts
    /// empty; use [`Example::with_difficulty`] and [`Example::with_tags`] to
    /// override them.
    #[must_use]
    pub fn new(
        id: impl Into<String>,
        input: impl Into<String>,
        expected: impl Into<String>,
    ) -> Self {
        // Convert in declaration order, then assemble with field shorthand.
        let id = id.into();
        let input = input.into();
        let expected = expected.into();
        Self {
            id,
            input,
            expected,
            difficulty: Difficulty::Medium,
            tags: Vec::new(),
        }
    }

    /// Returns this example with its difficulty replaced by `difficulty`.
    #[must_use]
    pub fn with_difficulty(self, difficulty: Difficulty) -> Self {
        Self { difficulty, ..self }
    }

    /// Returns this example with its tag list replaced by `tags`.
    ///
    /// Any previously set tags are discarded, not merged.
    #[must_use]
    pub fn with_tags(self, tags: Vec<String>) -> Self {
        Self { tags, ..self }
    }
}
/// Difficulty rating of an example, listed from easiest to hardest.
///
/// `Difficulty::level` maps the variants to the numbers 1 through 5 in this
/// same order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum Difficulty {
/// Level 1.
Trivial,
/// Level 2.
Easy,
/// Level 3; the value `Example::new` assigns.
Medium,
/// Level 4.
Hard,
/// Level 5.
Expert,
}
impl Difficulty {
#[must_use]
pub const fn level(&self) -> u8 {
match self {
Self::Trivial => 1,
Self::Easy => 2,
Self::Medium => 3,
Self::Hard => 4,
Self::Expert => 5,
}
}
#[must_use]
pub const fn name(&self) -> &'static str {
match self {
Self::Trivial => "Trivial",
Self::Easy => "Easy",
Self::Medium => "Medium",
Self::Hard => "Hard",
Self::Expert => "Expert",
}
}
#[must_use]
pub const fn all() -> [Self; 5] {
[
Self::Trivial,
Self::Easy,
Self::Medium,
Self::Hard,
Self::Expert,
]
}
}
/// Aggregated outcome of evaluating one model on one task.
///
/// Per-example results are appended with `add_example`, which also maintains
/// the token and latency totals; the rate fields are computed by `finalize`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EvalResult {
/// Identifier of the evaluated model.
pub model_id: String,
/// Size of the model in bytes.
pub model_size_bytes: u64,
/// Optional value; `EvalResult::new` initialises it to `None`.
// NOTE(review): presumably the model's parameter count — confirm with callers.
pub model_params: Option<u64>,
/// Identifier of the task the model was evaluated on.
pub task_id: String,
/// Results of the individual examples, in insertion order.
pub example_results: Vec<ExampleResult>,
/// Fraction of examples solved at or before each turn; index 0 holds turn 1.
/// Empty until `finalize` runs on a non-empty result set.
pub success_by_turn: Vec<f64>,
/// Mean solving turn over the solved examples; starts at 0.0.
pub avg_turns_to_success: f64,
/// Fraction of all recorded examples that were solved; starts at 0.0.
pub overall_success_rate: f64,
/// Sum of `tokens_per_turn` over every example added so far.
pub total_tokens: u64,
/// Sum of `latency_per_turn` over every example added so far.
pub total_latency: Duration,
}
impl EvalResult {
#[must_use]
pub fn new(
model_id: impl Into<String>,
task_id: impl Into<String>,
model_size_bytes: u64,
) -> Self {
Self {
model_id: model_id.into(),
model_size_bytes,
model_params: None,
task_id: task_id.into(),
example_results: Vec::new(),
success_by_turn: Vec::new(),
avg_turns_to_success: 0.0,
overall_success_rate: 0.0,
total_tokens: 0,
total_latency: Duration::ZERO,
}
}
pub fn add_example(&mut self, result: ExampleResult) {
self.total_tokens += result.tokens_per_turn.iter().sum::<u64>();
self.total_latency += result.latency_per_turn.iter().sum::<Duration>();
self.example_results.push(result);
}
pub fn finalize(&mut self, max_turns: u32) {
let total = self.example_results.len();
if total == 0 {
return;
}
self.success_by_turn = Vec::with_capacity(max_turns as usize);
for turn in 1..=max_turns {
let solved = self
.example_results
.iter()
.filter(|r| matches!(r.status, ExampleStatus::Solved { turn: t } if t <= turn))
.count();
self.success_by_turn.push(solved as f64 / total as f64);
}
let solved_examples: Vec<_> = self
.example_results
.iter()
.filter_map(|r| match r.status {
ExampleStatus::Solved { turn } => Some(turn),
_ => None,
})
.collect();
if !solved_examples.is_empty() {
self.avg_turns_to_success =
f64::from(solved_examples.iter().sum::<u32>()) / solved_examples.len() as f64;
}
self.overall_success_rate = solved_examples.len() as f64 / total as f64;
}
#[must_use]
pub fn success_at_turn(&self, turn: u32) -> f64 {
self.success_by_turn
.get((turn - 1) as usize)
.copied()
.unwrap_or(0.0)
}
}
/// Outcome of running a single example, with per-turn token and latency data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExampleResult {
/// Identifier of the example this result belongs to.
pub example_id: String,
/// Difficulty of the example.
pub difficulty: Difficulty,
/// `Some(turn)` when built by `ExampleResult::solved`, `None` when built by
/// `ExampleResult::failed`; mirrors the turn stored in `status`.
pub solved_at_turn: Option<u32>,
/// Token counts, one entry per turn; summed into `EvalResult::total_tokens`.
pub tokens_per_turn: Vec<u64>,
/// Latencies, one entry per turn; summed into `EvalResult::total_latency`.
pub latency_per_turn: Vec<Duration>,
/// Final status of the example; this is what `EvalResult::finalize` reads.
pub status: ExampleStatus,
}
impl ExampleResult {
    /// Builds the result of an example that was solved at `turn`.
    ///
    /// `turn` is stored both in `solved_at_turn` and in the
    /// [`ExampleStatus::Solved`] status; `tokens` and `latencies` become the
    /// per-turn measurements.
    #[must_use]
    pub fn solved(
        example_id: impl Into<String>,
        difficulty: Difficulty,
        turn: u32,
        tokens: Vec<u64>,
        latencies: Vec<Duration>,
    ) -> Self {
        let example_id = example_id.into();
        let status = ExampleStatus::Solved { turn };
        Self {
            example_id,
            difficulty,
            solved_at_turn: Some(turn),
            tokens_per_turn: tokens,
            latency_per_turn: latencies,
            status,
        }
    }

    /// Builds the result of an example that was not solved.
    ///
    /// `attempts` and `last_error` are recorded in an
    /// [`ExampleStatus::Failed`] status and `solved_at_turn` is `None`;
    /// `tokens` and `latencies` become the per-turn measurements.
    #[must_use]
    pub fn failed(
        example_id: impl Into<String>,
        difficulty: Difficulty,
        attempts: u32,
        last_error: impl Into<String>,
        tokens: Vec<u64>,
        latencies: Vec<Duration>,
    ) -> Self {
        // Convert the id before the error text, matching field order.
        let example_id = example_id.into();
        let last_error = last_error.into();
        let status = ExampleStatus::Failed {
            attempts,
            last_error,
        };
        Self {
            example_id,
            difficulty,
            solved_at_turn: None,
            tokens_per_turn: tokens,
            latency_per_turn: latencies,
            status,
        }
    }
}
/// Final status of a single example run.
///
/// Only `Solved` counts towards the success metrics computed by
/// `EvalResult::finalize`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ExampleStatus {
/// The example was solved.
Solved {
/// Turn at which it was solved; `EvalResult::finalize` compares this
/// against turn numbers starting at 1.
turn: u32,
},
/// The example was not solved.
Failed {
/// Number of attempts recorded for the example.
attempts: u32,
/// Error text recorded for the example.
// NOTE(review): presumably the error from the final attempt — confirm with the runner.
last_error: String,
},
/// The example timed out.
Timeout {
// NOTE(review): presumably the turn during which the timeout occurred — confirm.
turn: u32,
},
/// The example was skipped.
Skipped {
/// Why the example was skipped.
reason: String,
},
}
/// Results of several evaluations of the same task, gathered for comparison.
// NOTE(review): `ParetoPoint` and `Recommendation` are declared outside this
// view (possibly in the `pareto` module or the included files) — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelComparison {
/// Identifier of the task the compared results refer to.
pub task_id: String,
/// The evaluation results being compared.
pub results: Vec<EvalResult>,
/// Points of the Pareto frontier.
// NOTE(review): presumably derived from `results` — confirm how it is populated.
pub pareto_frontier: Vec<ParetoPoint>,
/// Recommendations attached to this comparison.
pub recommendations: Vec<Recommendation>,
}
// The files below are spliced textually into this module by `include!`, so
// their items share this module's namespace and its `use` declarations.
// NOTE(review): `HashMap` and `PathBuf` are imported above but unused in this
// file; presumably the included files rely on them — confirm before removing.
include!("comparison.rs");
include!("test_task.rs");