mod metrics;
mod runner;
mod task;
pub use metrics::{AgentMetrics, BenchmarkMetrics, MetricsCollector};
pub use runner::{BenchmarkResult, BenchmarkRunner, RunnerConfig};
pub use task::{BenchmarkTask, TaskDifficulty, TaskType};
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSuite {
pub id: String,
pub name: String,
pub description: Option<String>,
pub version: String,
pub tasks: Vec<BenchmarkTask>,
pub metadata: HashMap<String, serde_json::Value>,
pub created_at: DateTime<Utc>,
}
impl BenchmarkSuite {
pub fn new(id: impl Into<String>, name: impl Into<String>) -> Self {
Self {
id: id.into(),
name: name.into(),
description: None,
version: "1.0.0".to_string(),
tasks: Vec::new(),
metadata: HashMap::new(),
created_at: Utc::now(),
}
}
pub fn with_task(mut self, task: BenchmarkTask) -> Self {
self.tasks.push(task);
self
}
pub fn with_description(mut self, desc: impl Into<String>) -> Self {
self.description = Some(desc.into());
self
}
pub fn with_version(mut self, version: impl Into<String>) -> Self {
self.version = version.into();
self
}
pub fn task_count(&self) -> usize {
self.tasks.len()
}
pub fn tasks_by_difficulty(&self, difficulty: TaskDifficulty) -> Vec<&BenchmarkTask> {
self.tasks
.iter()
.filter(|t| t.difficulty == difficulty)
.collect()
}
pub fn tasks_by_type(&self, task_type: TaskType) -> Vec<&BenchmarkTask> {
self.tasks
.iter()
.filter(|t| t.task_type == task_type)
.collect()
}
pub fn total_points(&self) -> u32 {
self.tasks.iter().map(|t| t.points).sum()
}
}
pub struct BenchmarkRegistry {
suites: Arc<RwLock<HashMap<String, BenchmarkSuite>>>,
runs: Arc<RwLock<Vec<BenchmarkRun>>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkRun {
pub id: String,
pub suite_id: String,
pub agent_id: String,
pub started_at: DateTime<Utc>,
pub completed_at: Option<DateTime<Utc>>,
pub results: Vec<BenchmarkResult>,
pub score: f64,
pub pass_rate: f64,
pub duration_ms: u64,
}
impl BenchmarkRun {
pub fn summary(&self) -> BenchmarkSummary {
let total = self.results.len();
let passed = self.results.iter().filter(|r| r.passed).count();
let failed = total - passed;
let total_duration: u64 = self.results.iter().map(|r| r.duration_ms).sum();
let avg_duration = if total > 0 {
total_duration / total as u64
} else {
0
};
let earned_points: u32 = self
.results
.iter()
.filter(|r| r.passed)
.map(|r| r.points_earned)
.sum();
let max_points: u32 = self.results.iter().map(|r| r.max_points).sum();
BenchmarkSummary {
total_tasks: total,
passed,
failed,
pass_rate: self.pass_rate,
score: self.score,
earned_points,
max_points,
avg_duration_ms: avg_duration,
total_duration_ms: total_duration,
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSummary {
pub total_tasks: usize,
pub passed: usize,
pub failed: usize,
pub pass_rate: f64,
pub score: f64,
pub earned_points: u32,
pub max_points: u32,
pub avg_duration_ms: u64,
pub total_duration_ms: u64,
}
impl BenchmarkRegistry {
pub fn new() -> Self {
Self {
suites: Arc::new(RwLock::new(HashMap::new())),
runs: Arc::new(RwLock::new(Vec::new())),
}
}
pub async fn register(&self, suite: BenchmarkSuite) {
let mut suites = self.suites.write().await;
suites.insert(suite.id.clone(), suite);
}
pub async fn get_suite(&self, id: &str) -> Option<BenchmarkSuite> {
let suites = self.suites.read().await;
suites.get(id).cloned()
}
pub async fn list_suites(&self) -> Vec<String> {
let suites = self.suites.read().await;
suites.keys().cloned().collect()
}
pub async fn record_run(&self, run: BenchmarkRun) {
let mut runs = self.runs.write().await;
runs.push(run);
}
pub async fn get_runs_for_suite(&self, suite_id: &str) -> Vec<BenchmarkRun> {
let runs = self.runs.read().await;
runs.iter()
.filter(|r| r.suite_id == suite_id)
.cloned()
.collect()
}
pub async fn get_runs_for_agent(&self, agent_id: &str) -> Vec<BenchmarkRun> {
let runs = self.runs.read().await;
runs.iter()
.filter(|r| r.agent_id == agent_id)
.cloned()
.collect()
}
pub async fn get_best_run(&self, suite_id: &str, agent_id: &str) -> Option<BenchmarkRun> {
let runs = self.runs.read().await;
runs.iter()
.filter(|r| r.suite_id == suite_id && r.agent_id == agent_id)
.max_by(|a, b| {
a.score
.partial_cmp(&b.score)
.unwrap_or(std::cmp::Ordering::Equal)
})
.cloned()
}
pub async fn suite_count(&self) -> usize {
let suites = self.suites.read().await;
suites.len()
}
pub async fn run_count(&self) -> usize {
let runs = self.runs.read().await;
runs.len()
}
}
impl Default for BenchmarkRegistry {
fn default() -> Self {
Self::new()
}
}
pub struct PredefinedSuites;
impl PredefinedSuites {
pub fn basic_coding() -> BenchmarkSuite {
BenchmarkSuite::new("basic-coding", "Basic Coding Tasks")
.with_description("Simple programming tasks for initial evaluation")
.with_task(
BenchmarkTask::new("fizzbuzz", "Implement FizzBuzz")
.with_type(TaskType::Implementation)
.with_difficulty(TaskDifficulty::Easy)
.with_points(10)
.with_expected_output(
"1, 2, Fizz, 4, Buzz, Fizz, 7, 8, Fizz, Buzz, 11, Fizz, 13, 14, FizzBuzz",
),
)
.with_task(
BenchmarkTask::new("palindrome", "Check if string is palindrome")
.with_type(TaskType::Implementation)
.with_difficulty(TaskDifficulty::Easy)
.with_points(10),
)
.with_task(
BenchmarkTask::new("fibonacci", "Generate Fibonacci sequence")
.with_type(TaskType::Implementation)
.with_difficulty(TaskDifficulty::Easy)
.with_points(10),
)
}
pub fn bug_fixes() -> BenchmarkSuite {
BenchmarkSuite::new("bug-fixes", "Bug Fix Tasks")
.with_description("Tasks requiring identifying and fixing bugs")
.with_task(
BenchmarkTask::new("off-by-one", "Fix off-by-one error in loop")
.with_type(TaskType::BugFix)
.with_difficulty(TaskDifficulty::Easy)
.with_points(15),
)
.with_task(
BenchmarkTask::new("null-check", "Add missing null checks")
.with_type(TaskType::BugFix)
.with_difficulty(TaskDifficulty::Medium)
.with_points(20),
)
.with_task(
BenchmarkTask::new("race-condition", "Fix race condition")
.with_type(TaskType::BugFix)
.with_difficulty(TaskDifficulty::Hard)
.with_points(30),
)
}
pub fn refactoring() -> BenchmarkSuite {
BenchmarkSuite::new("refactoring", "Refactoring Tasks")
.with_description("Code improvement and refactoring tasks")
.with_task(
BenchmarkTask::new("extract-function", "Extract repeated code to function")
.with_type(TaskType::Refactoring)
.with_difficulty(TaskDifficulty::Medium)
.with_points(20),
)
.with_task(
BenchmarkTask::new("apply-pattern", "Apply design pattern")
.with_type(TaskType::Refactoring)
.with_difficulty(TaskDifficulty::Hard)
.with_points(30),
)
}
pub fn all() -> Vec<BenchmarkSuite> {
vec![Self::basic_coding(), Self::bug_fixes(), Self::refactoring()]
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_benchmark_suite_creation() {
let suite = BenchmarkSuite::new("test-suite", "Test Suite");
assert_eq!(suite.id, "test-suite");
assert_eq!(suite.name, "Test Suite");
assert_eq!(suite.task_count(), 0);
}
#[test]
fn test_suite_with_tasks() {
let suite = BenchmarkSuite::new("tasks", "Tasks Suite")
.with_task(BenchmarkTask::new("task-1", "Task 1"))
.with_task(BenchmarkTask::new("task-2", "Task 2"));
assert_eq!(suite.task_count(), 2);
}
#[test]
fn test_suite_total_points() {
let suite = BenchmarkSuite::new("points", "Points Test")
.with_task(BenchmarkTask::new("t1", "Task 1").with_points(10))
.with_task(BenchmarkTask::new("t2", "Task 2").with_points(20));
assert_eq!(suite.total_points(), 30);
}
#[test]
fn test_tasks_by_difficulty() {
let suite = BenchmarkSuite::new("diff", "Difficulty Test")
.with_task(BenchmarkTask::new("e1", "Easy 1").with_difficulty(TaskDifficulty::Easy))
.with_task(BenchmarkTask::new("h1", "Hard 1").with_difficulty(TaskDifficulty::Hard))
.with_task(BenchmarkTask::new("e2", "Easy 2").with_difficulty(TaskDifficulty::Easy));
let easy = suite.tasks_by_difficulty(TaskDifficulty::Easy);
assert_eq!(easy.len(), 2);
let hard = suite.tasks_by_difficulty(TaskDifficulty::Hard);
assert_eq!(hard.len(), 1);
}
#[test]
fn test_tasks_by_type() {
let suite = BenchmarkSuite::new("type", "Type Test")
.with_task(BenchmarkTask::new("impl", "Impl").with_type(TaskType::Implementation))
.with_task(BenchmarkTask::new("fix", "Fix").with_type(TaskType::BugFix));
let impl_tasks = suite.tasks_by_type(TaskType::Implementation);
assert_eq!(impl_tasks.len(), 1);
}
#[tokio::test]
async fn test_registry_creation() {
let registry = BenchmarkRegistry::new();
assert_eq!(registry.suite_count().await, 0);
}
#[tokio::test]
async fn test_registry_register_suite() {
let registry = BenchmarkRegistry::new();
let suite = BenchmarkSuite::new("test", "Test");
registry.register(suite).await;
assert_eq!(registry.suite_count().await, 1);
}
#[tokio::test]
async fn test_registry_get_suite() {
let registry = BenchmarkRegistry::new();
let suite = BenchmarkSuite::new("my-suite", "My Suite");
registry.register(suite).await;
let retrieved = registry.get_suite("my-suite").await;
assert!(retrieved.is_some());
assert_eq!(retrieved.unwrap().name, "My Suite");
}
#[tokio::test]
async fn test_registry_record_run() {
let registry = BenchmarkRegistry::new();
let run = BenchmarkRun {
id: "run-1".to_string(),
suite_id: "suite-1".to_string(),
agent_id: "agent-1".to_string(),
started_at: Utc::now(),
completed_at: Some(Utc::now()),
results: vec![],
score: 85.0,
pass_rate: 0.9,
duration_ms: 5000,
};
registry.record_run(run).await;
assert_eq!(registry.run_count().await, 1);
}
#[tokio::test]
async fn test_registry_get_runs_for_agent() {
let registry = BenchmarkRegistry::new();
let run1 = BenchmarkRun {
id: "run-1".to_string(),
suite_id: "suite-1".to_string(),
agent_id: "agent-1".to_string(),
started_at: Utc::now(),
completed_at: Some(Utc::now()),
results: vec![],
score: 80.0,
pass_rate: 0.8,
duration_ms: 4000,
};
let run2 = BenchmarkRun {
id: "run-2".to_string(),
suite_id: "suite-2".to_string(),
agent_id: "agent-1".to_string(),
started_at: Utc::now(),
completed_at: Some(Utc::now()),
results: vec![],
score: 90.0,
pass_rate: 0.95,
duration_ms: 3000,
};
registry.record_run(run1).await;
registry.record_run(run2).await;
let runs = registry.get_runs_for_agent("agent-1").await;
assert_eq!(runs.len(), 2);
}
#[test]
fn test_benchmark_run_summary() {
let run = BenchmarkRun {
id: "run".to_string(),
suite_id: "suite".to_string(),
agent_id: "agent".to_string(),
started_at: Utc::now(),
completed_at: Some(Utc::now()),
results: vec![
BenchmarkResult {
task_id: "t1".to_string(),
passed: true,
points_earned: 10,
max_points: 10,
duration_ms: 1000,
output: None,
error: None,
},
BenchmarkResult {
task_id: "t2".to_string(),
passed: false,
points_earned: 0,
max_points: 20,
duration_ms: 2000,
output: None,
error: Some("Failed".to_string()),
},
],
score: 33.3,
pass_rate: 0.5,
duration_ms: 3000,
};
let summary = run.summary();
assert_eq!(summary.total_tasks, 2);
assert_eq!(summary.passed, 1);
assert_eq!(summary.failed, 1);
assert_eq!(summary.earned_points, 10);
assert_eq!(summary.max_points, 30);
}
#[test]
fn test_predefined_suites() {
let suites = PredefinedSuites::all();
assert_eq!(suites.len(), 3);
let basic = &suites[0];
assert_eq!(basic.id, "basic-coding");
assert!(basic.task_count() > 0);
}
#[test]
fn test_predefined_basic_coding() {
let suite = PredefinedSuites::basic_coding();
assert_eq!(suite.id, "basic-coding");
assert!(suite.task_count() >= 3);
}
#[test]
fn test_predefined_bug_fixes() {
let suite = PredefinedSuites::bug_fixes();
assert_eq!(suite.id, "bug-fixes");
assert!(!suite.tasks_by_type(TaskType::BugFix).is_empty());
}
#[test]
fn test_predefined_refactoring() {
let suite = PredefinedSuites::refactoring();
assert_eq!(suite.id, "refactoring");
assert!(!suite.tasks_by_type(TaskType::Refactoring).is_empty());
}
}