// codetether_agent/benchmark/types.rs
use serde::{Deserialize, Serialize};
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct BenchmarkConfig {
8 pub prd_dir: String,
10
11 pub models: Vec<String>,
13
14 pub tier: Option<u8>,
16
17 pub parallel: bool,
19
20 pub max_iterations: usize,
22
23 pub story_timeout_secs: u64,
25
26 pub output: String,
28
29 pub cost_ceiling_usd: Option<f64>,
31
32 pub submit_api_url: Option<String>,
34
35 pub submit_api_key: Option<String>,
37}
38
39impl Default for BenchmarkConfig {
40 fn default() -> Self {
41 Self {
42 prd_dir: "benchmarks".to_string(),
43 models: Vec::new(),
44 tier: None,
45 parallel: false,
46 max_iterations: 10,
47 story_timeout_secs: 300,
48 output: "benchmark_results.json".to_string(),
49 cost_ceiling_usd: Some(50.0),
50 submit_api_url: None,
51 submit_api_key: None,
52 }
53 }
54}
55
56#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct BenchmarkSuiteResult {
59 pub run_date: String,
61
62 pub agent: String,
64
65 pub agent_version: String,
67
68 pub model_results: Vec<ModelBenchmarkResult>,
70
71 pub summary: BenchmarkSummary,
73}
74
75#[derive(Debug, Clone, Serialize, Deserialize)]
77pub struct ModelBenchmarkResult {
78 pub model: String,
80
81 pub prd_results: Vec<PrdBenchmarkResult>,
83
84 pub aggregate: AggregateMetrics,
86}
87
88#[derive(Debug, Clone, Serialize, Deserialize)]
90pub struct PrdBenchmarkResult {
91 pub prd_id: String,
93
94 pub prd_tier: u8,
96
97 pub prd_feature: String,
99
100 pub stories_total: usize,
102
103 pub stories_passed: usize,
105
106 pub pass_rate: f64,
108
109 pub duration_seconds: f64,
111
112 pub tokens_used: u64,
114
115 pub cost_usd: f64,
117
118 pub quality_checks: Vec<QualityCheckResult>,
120
121 pub per_story: Vec<StoryBenchmarkResult>,
123}
124
125#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct StoryBenchmarkResult {
128 pub story_id: String,
130
131 pub title: String,
133
134 pub passed: bool,
136
137 pub iterations: usize,
139
140 pub duration_seconds: f64,
142
143 pub tokens_used: u64,
145
146 pub files_changed: Vec<String>,
148}
149
150#[derive(Debug, Clone, Serialize, Deserialize)]
152pub struct QualityCheckResult {
153 pub name: String,
155
156 pub passed: bool,
158
159 pub output: Option<String>,
161}
162
163#[derive(Debug, Clone, Serialize, Deserialize)]
165pub struct AggregateMetrics {
166 pub prds_attempted: usize,
168
169 pub prds_fully_passed: usize,
171
172 pub overall_pass_rate: f64,
174
175 pub total_stories: usize,
177
178 pub total_stories_passed: usize,
180
181 pub avg_seconds_per_story: f64,
183
184 pub avg_tokens_per_story: f64,
186
187 pub total_cost_usd: f64,
189
190 pub avg_cost_per_story: f64,
192
193 pub total_duration_seconds: f64,
195
196 pub stories_per_hour: f64,
198}
199
200#[derive(Debug, Clone, Serialize, Deserialize)]
202pub struct BenchmarkSummary {
203 pub best_pass_rate_model: String,
205
206 pub fastest_model: String,
208
209 pub cheapest_model: String,
211
212 pub best_overall_model: String,
214
215 pub rankings: Vec<ModelRanking>,
217}
218
219#[derive(Debug, Clone, Serialize, Deserialize)]
221pub struct ModelRanking {
222 pub model: String,
224
225 pub pass_rate_score: f64,
227
228 pub speed_score: f64,
230
231 pub cost_score: f64,
233
234 pub overall_score: f64,
236}
237
238#[derive(Debug, Clone, Serialize, Deserialize)]
240pub struct BenchmarkSubmission {
241 pub model: String,
242 pub agent: String,
243 pub result: String,
244}
245
/// Infers a tier from a file name's prefix.
///
/// Names starting with `t1-`, `t2-` or `t3-` map to tiers 1, 2 and 3
/// respectively. Matching is case-sensitive and anchored at the start of
/// `filename`. Any other name (including the empty string) falls back to
/// tier 2.
pub fn detect_tier(filename: &str) -> u8 {
    /// Recognised file-name prefixes and the tier each one selects.
    const TIER_PREFIXES: [(&str, u8); 3] = [("t1-", 1), ("t2-", 2), ("t3-", 3)];
    /// Tier used when no prefix matches.
    const FALLBACK_TIER: u8 = 2;

    TIER_PREFIXES
        .iter()
        .find(|(prefix, _)| filename.starts_with(*prefix))
        .map_or(FALLBACK_TIER, |&(_, tier)| tier)
}