use serde::{Deserialize, Serialize};
/// Settings for a benchmark run.
///
/// Derives serde `Serialize`/`Deserialize`, so the field names and their order
/// are part of the serialized representation.
///
/// NOTE(review): how each field is consumed is not visible in this part of the
/// file; the per-field notes are inferred from the field names and from the
/// values assigned in the `Default` impl. Confirm against the runner.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkConfig {
/// Path of the PRD directory (per the name); `Default` uses `"benchmarks"`.
pub prd_dir: String,
/// Model identifiers to benchmark; empty in `Default`.
pub models: Vec<String>,
/// Optional tier selector; `None` in `Default`.
/// NOTE(review): presumably restricts the run to one PRD tier — confirm.
pub tier: Option<u8>,
/// Parallel-execution flag; `false` in `Default`.
pub parallel: bool,
/// Iteration limit; `10` in `Default`.
/// NOTE(review): scope of the limit (per story? per PRD?) is not shown here.
pub max_iterations: usize,
/// Story timeout, in seconds per the field name; `300` in `Default`.
pub story_timeout_secs: u64,
/// Output location; `Default` uses `"benchmark_results.json"`.
pub output: String,
/// Optional cost ceiling, in USD per the field name; `Some(50.0)` in `Default`.
pub cost_ceiling_usd: Option<f64>,
/// Optional URL of a submission API; `None` in `Default`.
pub submit_api_url: Option<String>,
/// Optional key for the submission API; `None` in `Default`.
/// NOTE(review): this value is included in the derived `Debug` and `Serialize`
/// output — check that it is not logged or written to disk unintentionally.
pub submit_api_key: Option<String>,
}
impl Default for BenchmarkConfig {
fn default() -> Self {
Self {
prd_dir: "benchmarks".to_string(),
models: Vec::new(),
tier: None,
parallel: false,
max_iterations: 10,
story_timeout_secs: 300,
output: "benchmark_results.json".to_string(),
cost_ceiling_usd: Some(50.0),
submit_api_url: None,
submit_api_key: None,
}
}
}
/// Top-level result record for a benchmark suite: run metadata, a list of
/// per-model results, and a cross-model summary.
///
/// Serializable via serde; field names are part of the serialized form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSuiteResult {
/// Run date as a string; the format is not fixed by this type.
pub run_date: String,
/// Name of the agent.
pub agent: String,
/// Version string of the agent.
pub agent_version: String,
/// Per-model results.
pub model_results: Vec<ModelBenchmarkResult>,
/// Summary across the models in `model_results` (presumably — confirm).
pub summary: BenchmarkSummary,
}
/// Results for a single model: its per-PRD results plus aggregate metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelBenchmarkResult {
/// Model identifier.
pub model: String,
/// Per-PRD results for this model.
pub prd_results: Vec<PrdBenchmarkResult>,
/// Aggregate metrics; presumably computed over `prd_results` — confirm.
pub aggregate: AggregateMetrics,
}
/// Result record for one PRD: identification, story counts, timing, token and
/// cost figures, quality-check results, and a per-story breakdown.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrdBenchmarkResult {
/// PRD identifier.
pub prd_id: String,
/// Tier of the PRD.
/// NOTE(review): presumably the value produced by `detect_tier` — confirm.
pub prd_tier: u8,
/// Feature name or description associated with the PRD.
pub prd_feature: String,
/// Total number of stories.
pub stories_total: usize,
/// Number of stories that passed.
pub stories_passed: usize,
/// Pass rate.
/// NOTE(review): fraction vs. percentage is not visible here — confirm.
pub pass_rate: f64,
/// Duration, in seconds per the field name.
pub duration_seconds: f64,
/// Tokens used.
pub tokens_used: u64,
/// Cost, in USD per the field name.
pub cost_usd: f64,
/// Quality-check results.
pub quality_checks: Vec<QualityCheckResult>,
/// Per-story results.
pub per_story: Vec<StoryBenchmarkResult>,
}
/// Result record for a single story.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StoryBenchmarkResult {
/// Story identifier.
pub story_id: String,
/// Story title.
pub title: String,
/// Whether the story passed.
pub passed: bool,
/// Iteration count recorded for the story.
pub iterations: usize,
/// Duration, in seconds per the field name.
pub duration_seconds: f64,
/// Tokens used.
pub tokens_used: u64,
/// Changed files, as strings (presumably paths — confirm).
pub files_changed: Vec<String>,
}
/// Outcome of one named quality check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityCheckResult {
/// Name of the check.
pub name: String,
/// Whether the check passed.
pub passed: bool,
/// Optional output text associated with the check.
pub output: Option<String>,
}
/// Aggregate counts, averages and totals.
///
/// NOTE(review): the formulas that populate these fields are not in this part
/// of the file; units below are taken from the field names only.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AggregateMetrics {
/// Number of PRDs attempted.
pub prds_attempted: usize,
/// Number of PRDs that fully passed.
pub prds_fully_passed: usize,
/// Overall pass rate (fraction vs. percentage not visible here — confirm).
pub overall_pass_rate: f64,
/// Total number of stories.
pub total_stories: usize,
/// Total number of stories that passed.
pub total_stories_passed: usize,
/// Average seconds per story.
pub avg_seconds_per_story: f64,
/// Average tokens per story.
pub avg_tokens_per_story: f64,
/// Total cost in USD.
pub total_cost_usd: f64,
/// Average cost per story.
pub avg_cost_per_story: f64,
/// Total duration in seconds.
pub total_duration_seconds: f64,
/// Stories per hour.
pub stories_per_hour: f64,
}
/// Cross-model summary: the model named best in each category, plus a list of
/// per-model rankings.
///
/// NOTE(review): the selection criteria are not visible in this part of the
/// file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSummary {
/// Model with the best pass rate.
pub best_pass_rate_model: String,
/// Fastest model.
pub fastest_model: String,
/// Cheapest model.
pub cheapest_model: String,
/// Best model overall.
pub best_overall_model: String,
/// Per-model rankings (ordering not established here — confirm).
pub rankings: Vec<ModelRanking>,
}
/// Scores for one model across pass rate, speed and cost, plus an overall score.
///
/// NOTE(review): score ranges and the weighting behind `overall_score` are not
/// visible in this part of the file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelRanking {
/// Model identifier.
pub model: String,
/// Pass-rate score.
pub pass_rate_score: f64,
/// Speed score.
pub speed_score: f64,
/// Cost score.
pub cost_score: f64,
/// Overall score.
pub overall_score: f64,
}
/// Submission record carrying a model name, an agent name and a result string.
///
/// NOTE(review): presumably the payload sent to the submission API configured
/// via `BenchmarkConfig::submit_api_url` — confirm against the submitting code.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSubmission {
/// Model identifier.
pub model: String,
/// Agent name.
pub agent: String,
/// Result as a string (encoding/format not visible here — confirm).
pub result: String,
}
/// Maps a filename to a tier number based on its leading `tN-` prefix.
///
/// Returns `1`, `2` or `3` when `filename` starts with `"t1-"`, `"t2-"` or
/// `"t3-"` respectively. Matching is case-sensitive and looks only at the
/// start of the string. Any other input, including the empty string, falls
/// back to tier `2`.
pub fn detect_tier(filename: &str) -> u8 {
    // Tier assigned when no known prefix matches.
    const FALLBACK_TIER: u8 = 2;
    // Recognized prefixes paired with the tier each one denotes.
    const PREFIX_TIERS: [(&str, u8); 3] = [("t1-", 1), ("t2-", 2), ("t3-", 3)];

    PREFIX_TIERS
        .iter()
        .find(|(prefix, _)| filename.starts_with(*prefix))
        .map_or(FALLBACK_TIER, |&(_, tier)| tier)
}