1use crate::mission::MissionRunner;
2use crate::model_config::{ModelConfig, Preset};
3use anyhow::Result;
7use std::time::Instant;
8
// Fixed mission suite shared by every benchmark phase. Each entry pairs a
// mission prompt with a rough complexity rating (presumably on a 1-10 scale —
// TODO confirm; the runners below currently bind it as `_complexity` and
// ignore it).
const BENCHMARK_MISSIONS: &[(&str, u32)] = &[
    ("Build a Python CLI that converts CSV files to JSON with column filtering and pretty-print options. Single file `tasks/csv_to_json.py`. Include argparse, read/write, error handling for missing files. Validate: python3 tasks/csv_to_json.py --help", 4),
    ("Build a Python sensor data fusion system. Single file `tasks/sensor_fusion.py` with: a 1D KalmanFilter class (predict/update), a TrackManager class that maintains multiple tracks, correlates detections (nearest-neighbor within gate distance), creates new tracks for uncorrelated detections, prunes stale tracks. Test with 2 simulated targets over 20 time steps.", 7),
    ("Build a Python URL shortener with FastAPI. Files: `tasks/main.py`, `tasks/models.py`, `tasks/database.py`. SQLite backend, base62 encoding, redirect endpoint, stats endpoint showing click count. Include proper error handling for duplicate URLs and not-found cases.", 5),
    ("Build a Python config parser library. Single file `tasks/config_parser.py`. Support TOML, JSON, and YAML formats. Merge multiple config files with priority ordering. Environment variable interpolation (${VAR} syntax). Type coercion for boolean, int, float values. Validate: python3 -c \"from tasks.config_parser import ConfigParser; c=ConfigParser(); print('PASS')\"", 6),
    ("Build a C++ coordinate transform library. Single file `tasks/coord_transforms.cpp` with: WGS84 constants, geodetic to ECEF conversion, ECEF to ENU given reference point, haversine great-circle distance. All angles in radians. Test: London to Paris must be 340-345 km. Compile: c++ -std=c++17 -Wall -o /tmp/bc_coords tasks/coord_transforms.cpp && /tmp/bc_coords", 8),
];
16
/// Outcome of one benchmark mission run, as recorded by the runners below.
struct BenchmarkResult {
    // Mission prompt, truncated for display (see `truncate`).
    mission: String,
    // Preset label used for the run ("premium" or "fast").
    model: String,
    // Wall-clock duration of the run in seconds.
    time_seconds: f64,
    // Best score reported by the runner; `None` when the run failed.
    score: Option<f64>,
    // Error message when the run failed; `None` on success.
    error: Option<String>,
}
24
25pub async fn run_benchmark(phase: &str, tasks: usize) -> Result<()> {
26 println!("BattleCommand Forge Benchmark");
27 println!("=============================\n");
28
29 let missions = &BENCHMARK_MISSIONS[..tasks.min(BENCHMARK_MISSIONS.len())];
30
31 match phase {
32 "full" => run_full_benchmark(missions).await?,
33 "quick" => run_quick_benchmark(missions).await?,
34 _ => {
35 return Err(anyhow::anyhow!(
36 "Unknown phase '{}'. Use: full, quick",
37 phase
38 ))
39 }
40 }
41
42 Ok(())
43}
44
45async fn run_full_benchmark(missions: &[(&str, u32)]) -> Result<()> {
46 println!("Phase: Full Pipeline ({} missions)\n", missions.len());
47
48 let mut results: Vec<BenchmarkResult> = Vec::new();
49
50 for (i, (mission, _complexity)) in missions.iter().enumerate() {
51 let start = Instant::now();
52 println!("[{}/{}] {}", i + 1, missions.len(), truncate(mission, 60));
53
54 let config = ModelConfig::resolve(Preset::Premium, ".", None, None, None, None);
55 let mut runner = MissionRunner::new(config);
56 runner.auto_mode = true;
57
58 let result = match runner.run(mission).await {
59 Ok(()) => {
60 let score = runner.best_score();
61 BenchmarkResult {
62 mission: truncate(mission, 50),
63 model: "premium".to_string(),
64 time_seconds: start.elapsed().as_secs_f64(),
65 score: Some(score),
66 error: None,
67 }
68 }
69 Err(e) => BenchmarkResult {
70 mission: truncate(mission, 50),
71 model: "premium".to_string(),
72 time_seconds: start.elapsed().as_secs_f64(),
73 score: None,
74 error: Some(format!("{}", e)),
75 },
76 };
77
78 if let Some(ref err) = result.error {
79 println!(" [FAIL] {:.1}s | {}", result.time_seconds, err);
80 } else {
81 println!(
82 " [OK] {:.1}s | score: {:.1}/10",
83 result.time_seconds,
84 result.score.unwrap_or(0.0)
85 );
86 }
87
88 results.push(result);
89 }
90
91 print_summary("Full Pipeline", &results);
92 save_results("full_benchmark.md", "Full Pipeline", &results).await?;
93 Ok(())
94}
95
96async fn run_quick_benchmark(missions: &[(&str, u32)]) -> Result<()> {
97 println!("Phase: Quick (fast preset, {} missions)\n", missions.len());
98
99 let mut results: Vec<BenchmarkResult> = Vec::new();
100
101 for (i, (mission, _complexity)) in missions.iter().enumerate() {
102 let start = Instant::now();
103 println!("[{}/{}] {}", i + 1, missions.len(), truncate(mission, 60));
104
105 let config = ModelConfig::resolve(Preset::Fast, ".", None, None, None, None);
106 let mut runner = MissionRunner::new(config);
107 runner.auto_mode = true;
108
109 let result = match runner.run(mission).await {
110 Ok(()) => BenchmarkResult {
111 mission: truncate(mission, 50),
112 model: "fast".to_string(),
113 time_seconds: start.elapsed().as_secs_f64(),
114 score: Some(runner.best_score()),
115 error: None,
116 },
117 Err(e) => BenchmarkResult {
118 mission: truncate(mission, 50),
119 model: "fast".to_string(),
120 time_seconds: start.elapsed().as_secs_f64(),
121 score: None,
122 error: Some(format!("{}", e)),
123 },
124 };
125
126 if let Some(ref err) = result.error {
127 println!(" [FAIL] {:.1}s | {}", result.time_seconds, err);
128 } else {
129 println!(
130 " [OK] {:.1}s | score: {:.1}/10",
131 result.time_seconds,
132 result.score.unwrap_or(0.0)
133 );
134 }
135
136 results.push(result);
137 }
138
139 print_summary("Quick", &results);
140 save_results("quick_benchmark.md", "Quick", &results).await?;
141 Ok(())
142}
143
/// Truncates `s` to at most `max` bytes for display, appending "..." when
/// content was cut off.
///
/// Bug fix: the previous version sliced at a raw byte offset
/// (`&s[..max - 3]`), which panics when the cut lands inside a multi-byte
/// UTF-8 character — and this function is applied to arbitrary error
/// messages. The cut point is now snapped back to the nearest char boundary.
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_string();
    }
    // Reserve up to 3 bytes for the ellipsis, then walk back to a valid
    // char boundary (index 0 is always a boundary, so this terminates).
    let mut end = max.saturating_sub(3);
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    format!("{}...", &s[..end])
}
151
152fn print_summary(phase: &str, results: &[BenchmarkResult]) {
153 println!("\n{} Benchmark Summary", phase);
154 println!("{}", "=".repeat(90));
155 println!(
156 "{:<55} {:>8} {:>8} {:>6}",
157 "Mission", "Time", "Score", "Status"
158 );
159 println!("{}", "-".repeat(90));
160
161 for r in results {
162 let status = if r.error.is_some() { "FAIL" } else { "OK" };
163 let score = r
164 .score
165 .map(|s| format!("{:.1}", s))
166 .unwrap_or_else(|| "-".into());
167 println!(
168 "{:<55} {:>7.1}s {:>8} {:>6}",
169 r.mission, r.time_seconds, score, status
170 );
171 }
172 println!("{}", "=".repeat(90));
173
174 let total = results.len();
175 let passed = results.iter().filter(|r| r.error.is_none()).count();
176 let avg_score: f64 = results.iter().filter_map(|r| r.score).sum::<f64>() / passed.max(1) as f64;
177 let avg_time: f64 = results.iter().map(|r| r.time_seconds).sum::<f64>() / total as f64;
178 println!(
179 "\n Pass: {}/{} | Avg score: {:.1} | Avg time: {:.0}s",
180 passed, total, avg_score, avg_time
181 );
182}
183
184async fn save_results(filename: &str, phase: &str, results: &[BenchmarkResult]) -> Result<()> {
185 let dir = ".battlecommand/benchmarks";
186 tokio::fs::create_dir_all(dir).await?;
187
188 let mut md = format!(
189 "# {} Benchmark Results\n\nGenerated: {}\n\n",
190 phase,
191 chrono::Utc::now().format("%Y-%m-%d %H:%M UTC")
192 );
193 md.push_str("| Mission | Model | Time | Score | Status |\n|---------|-------|------|-------|--------|\n");
194 for r in results {
195 let status = match &r.error {
196 Some(e) => format!("ERR: {}", truncate(e, 30)),
197 None => "OK".into(),
198 };
199 let score = r
200 .score
201 .map(|s| format!("{:.1}", s))
202 .unwrap_or_else(|| "-".into());
203 md.push_str(&format!(
204 "| {} | {} | {:.1}s | {} | {} |\n",
205 r.mission, r.model, r.time_seconds, score, status
206 ));
207 }
208
209 let path = format!("{}/{}", dir, filename);
210 tokio::fs::write(&path, &md).await?;
211 println!("\nResults saved to {}", path);
212 Ok(())
213}