Skip to main content

codetether_agent/benchmark/
types.rs

1//! Benchmark types - result structures and configuration
2
3use serde::{Deserialize, Serialize};
4
/// Configuration for a benchmark run.
///
/// Serialized/deserialized with serde; see the `Default` impl in this
/// module for the baseline values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkConfig {
    /// Directory containing benchmark PRD files
    pub prd_dir: String,

    /// Models to benchmark (format: "provider:model")
    pub models: Vec<String>,

    /// Only run PRDs matching this tier (1, 2, or 3). None = all tiers.
    pub tier: Option<u8>,

    /// Run model×PRD combos in parallel
    pub parallel: bool,

    /// Maximum iterations per story
    pub max_iterations: usize,

    /// Timeout per story in seconds
    pub story_timeout_secs: u64,

    /// Output file path for the results JSON
    pub output: String,

    /// Cost ceiling per benchmark run in USD (prevents runaway spending).
    /// None = no ceiling.
    pub cost_ceiling_usd: Option<f64>,

    /// Optional API URL to submit results to
    pub submit_api_url: Option<String>,

    /// Optional API key for result submission
    pub submit_api_key: Option<String>,
}
38
39impl Default for BenchmarkConfig {
40    fn default() -> Self {
41        Self {
42            prd_dir: "benchmarks".to_string(),
43            models: Vec::new(),
44            tier: None,
45            parallel: false,
46            max_iterations: 10,
47            story_timeout_secs: 300,
48            output: "benchmark_results.json".to_string(),
49            cost_ceiling_usd: Some(50.0),
50            submit_api_url: None,
51            submit_api_key: None,
52        }
53    }
54}
55
/// Complete results from a benchmark suite run.
///
/// Top-level output structure: one entry per model, plus a cross-model
/// summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSuiteResult {
    /// When the benchmark was run
    pub run_date: String,

    /// Agent being benchmarked
    pub agent: String,

    /// Agent version
    pub agent_version: String,

    /// Per-model results
    pub model_results: Vec<ModelBenchmarkResult>,

    /// Summary across all models
    pub summary: BenchmarkSummary,
}
74
/// Results for a single model across all PRDs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelBenchmarkResult {
    /// Model identifier (e.g., "anthropic:claude-sonnet-4-20250514")
    pub model: String,

    /// Per-PRD results for this model
    pub prd_results: Vec<PrdBenchmarkResult>,

    /// Aggregate metrics for this model, rolled up over `prd_results`
    pub aggregate: AggregateMetrics,
}
87
/// Results for a single PRD run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PrdBenchmarkResult {
    /// PRD identifier (derived from filename)
    pub prd_id: String,

    /// Tier classification (1, 2, or 3; see `detect_tier`)
    pub prd_tier: u8,

    /// PRD title/feature name
    pub prd_feature: String,

    /// Total stories in the PRD
    pub stories_total: usize,

    /// Stories that passed all quality gates
    pub stories_passed: usize,

    /// Pass rate (0.0 to 1.0)
    pub pass_rate: f64,

    /// Total duration in seconds
    pub duration_seconds: f64,

    /// Total LLM tokens consumed
    pub tokens_used: u64,

    /// Estimated cost in USD
    pub cost_usd: f64,

    /// Quality check results (typecheck, test, lint, build)
    pub quality_checks: Vec<QualityCheckResult>,

    /// Per-story results
    pub per_story: Vec<StoryBenchmarkResult>,
}
124
/// Result for a single story within a PRD benchmark.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StoryBenchmarkResult {
    /// Story ID
    pub story_id: String,

    /// Story title
    pub title: String,

    /// Whether the story passed
    pub passed: bool,

    /// Number of iterations needed
    pub iterations: usize,

    /// Duration in seconds
    pub duration_seconds: f64,

    /// Tokens used for this story
    pub tokens_used: u64,

    /// Files changed while implementing this story
    pub files_changed: Vec<String>,
}
149
/// Result of a single quality gate check.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityCheckResult {
    /// Check name (typecheck, test, lint, build)
    pub name: String,

    /// Whether it passed
    pub passed: bool,

    /// Output/error message if failed; None when the check passed
    pub output: Option<String>,
}
162
/// Aggregate metrics across multiple PRDs for one model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AggregateMetrics {
    /// Total PRDs attempted
    pub prds_attempted: usize,

    /// PRDs with 100% pass rate
    pub prds_fully_passed: usize,

    /// Overall story pass rate (0.0 to 1.0)
    pub overall_pass_rate: f64,

    /// Total stories across all PRDs
    pub total_stories: usize,

    /// Total stories passed
    pub total_stories_passed: usize,

    /// Average time per story (seconds)
    pub avg_seconds_per_story: f64,

    /// Average tokens per story
    pub avg_tokens_per_story: f64,

    /// Total cost in USD
    pub total_cost_usd: f64,

    /// Average cost per story in USD
    pub avg_cost_per_story: f64,

    /// Total duration in seconds
    pub total_duration_seconds: f64,

    /// Stories per hour throughput
    pub stories_per_hour: f64,
}
199
/// Summary across all models: per-category winners plus full rankings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSummary {
    /// Best model by pass rate
    pub best_pass_rate_model: String,

    /// Best model by speed
    pub fastest_model: String,

    /// Best model by cost efficiency
    pub cheapest_model: String,

    /// Best overall (weighted score)
    pub best_overall_model: String,

    /// Model rankings
    pub rankings: Vec<ModelRanking>,
}
218
/// Ranking for a single model.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelRanking {
    /// Model identifier
    pub model: String,

    /// Pass rate score (0-100)
    pub pass_rate_score: f64,

    /// Speed score (0-100, higher = faster)
    pub speed_score: f64,

    /// Cost score (0-100, higher = cheaper)
    pub cost_score: f64,

    /// Overall weighted score
    pub overall_score: f64,
}
237
/// Submission payload for the benchmark API.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkSubmission {
    /// Model identifier the results belong to
    pub model: String,
    /// Agent that produced the results
    pub agent: String,
    // NOTE(review): stored as a String rather than a nested structure —
    // presumably pre-serialized JSON of the result; confirm with the API.
    pub result: String,
}
245
/// Detect tier from PRD filename (e.g., "t1-rest-api.json" -> 1).
///
/// Filenames beginning with "t1-", "t2-", or "t3-" map to tiers 1-3;
/// anything else defaults to tier 2 (medium).
pub fn detect_tier(filename: &str) -> u8 {
    // `get(..3)` returns the first three bytes as a &str when the name is
    // long enough; shorter names yield None and take the default branch,
    // exactly as a failed prefix check would.
    match filename.get(..3) {
        Some("t1-") => 1,
        Some("t2-") => 2,
        Some("t3-") => 3,
        _ => 2, // default to medium
    }
}