Skip to main content

battlecommand_forge/
benchmark.rs

1use crate::mission::MissionRunner;
2use crate::model_config::{ModelConfig, Preset};
// Multi-model benchmark framework.
// Runs N missions across different model configurations to compare
// quality, speed, and cost.
6use anyhow::Result;
7use std::time::Instant;
8
/// Benchmark mission suite: `(mission prompt, complexity rating)` pairs.
/// `run_benchmark` selects a prefix of this list via its `tasks` argument.
/// The complexity rating is currently destructured but unused by both
/// benchmark runners (bound as `_complexity`).
const BENCHMARK_MISSIONS: &[(&str, u32)] = &[
    // Single-file Python CLI tool.
    ("Build a Python CLI that converts CSV files to JSON with column filtering and pretty-print options. Single file `tasks/csv_to_json.py`. Include argparse, read/write, error handling for missing files. Validate: python3 tasks/csv_to_json.py --help", 4),
    // Multi-class Python algorithm (Kalman filtering + track management).
    ("Build a Python sensor data fusion system. Single file `tasks/sensor_fusion.py` with: a 1D KalmanFilter class (predict/update), a TrackManager class that maintains multiple tracks, correlates detections (nearest-neighbor within gate distance), creates new tracks for uncorrelated detections, prunes stale tracks. Test with 2 simulated targets over 20 time steps.", 7),
    // Multi-file Python web service with a database backend.
    ("Build a Python URL shortener with FastAPI. Files: `tasks/main.py`, `tasks/models.py`, `tasks/database.py`. SQLite backend, base62 encoding, redirect endpoint, stats endpoint showing click count. Include proper error handling for duplicate URLs and not-found cases.", 5),
    // Python library with an import-based validation command.
    ("Build a Python config parser library. Single file `tasks/config_parser.py`. Support TOML, JSON, and YAML formats. Merge multiple config files with priority ordering. Environment variable interpolation (${VAR} syntax). Type coercion for boolean, int, float values. Validate: python3 -c \"from tasks.config_parser import ConfigParser; c=ConfigParser(); print('PASS')\"", 6),
    // C++ numerical library with a compile-and-run validation command.
    ("Build a C++ coordinate transform library. Single file `tasks/coord_transforms.cpp` with: WGS84 constants, geodetic to ECEF conversion, ECEF to ENU given reference point, haversine great-circle distance. All angles in radians. Test: London to Paris must be 340-345 km. Compile: c++ -std=c++17 -Wall -o /tmp/bc_coords tasks/coord_transforms.cpp && /tmp/bc_coords", 8),
];
16
/// Outcome of one benchmarked mission run.
struct BenchmarkResult {
    /// Mission prompt, truncated to 50 chars for display.
    mission: String,
    /// Preset label used for the run ("premium" or "fast").
    model: String,
    /// Wall-clock duration of the run in seconds.
    time_seconds: f64,
    /// Best score reported by the runner; `None` when the run failed.
    score: Option<f64>,
    /// Error message when the run failed; `None` on success.
    error: Option<String>,
}
24
25pub async fn run_benchmark(phase: &str, tasks: usize) -> Result<()> {
26    println!("BattleCommand Forge Benchmark");
27    println!("=============================\n");
28
29    let missions = &BENCHMARK_MISSIONS[..tasks.min(BENCHMARK_MISSIONS.len())];
30
31    match phase {
32        "full" => run_full_benchmark(missions).await?,
33        "quick" => run_quick_benchmark(missions).await?,
34        _ => {
35            return Err(anyhow::anyhow!(
36                "Unknown phase '{}'. Use: full, quick",
37                phase
38            ))
39        }
40    }
41
42    Ok(())
43}
44
45async fn run_full_benchmark(missions: &[(&str, u32)]) -> Result<()> {
46    println!("Phase: Full Pipeline ({} missions)\n", missions.len());
47
48    let mut results: Vec<BenchmarkResult> = Vec::new();
49
50    for (i, (mission, _complexity)) in missions.iter().enumerate() {
51        let start = Instant::now();
52        println!("[{}/{}] {}", i + 1, missions.len(), truncate(mission, 60));
53
54        let config = ModelConfig::resolve(Preset::Premium, ".", None, None, None, None);
55        let mut runner = MissionRunner::new(config);
56        runner.auto_mode = true;
57
58        let result = match runner.run(mission).await {
59            Ok(()) => {
60                let score = runner.best_score();
61                BenchmarkResult {
62                    mission: truncate(mission, 50),
63                    model: "premium".to_string(),
64                    time_seconds: start.elapsed().as_secs_f64(),
65                    score: Some(score),
66                    error: None,
67                }
68            }
69            Err(e) => BenchmarkResult {
70                mission: truncate(mission, 50),
71                model: "premium".to_string(),
72                time_seconds: start.elapsed().as_secs_f64(),
73                score: None,
74                error: Some(format!("{}", e)),
75            },
76        };
77
78        if let Some(ref err) = result.error {
79            println!("  [FAIL] {:.1}s | {}", result.time_seconds, err);
80        } else {
81            println!(
82                "  [OK]   {:.1}s | score: {:.1}/10",
83                result.time_seconds,
84                result.score.unwrap_or(0.0)
85            );
86        }
87
88        results.push(result);
89    }
90
91    print_summary("Full Pipeline", &results);
92    save_results("full_benchmark.md", "Full Pipeline", &results).await?;
93    Ok(())
94}
95
96async fn run_quick_benchmark(missions: &[(&str, u32)]) -> Result<()> {
97    println!("Phase: Quick (fast preset, {} missions)\n", missions.len());
98
99    let mut results: Vec<BenchmarkResult> = Vec::new();
100
101    for (i, (mission, _complexity)) in missions.iter().enumerate() {
102        let start = Instant::now();
103        println!("[{}/{}] {}", i + 1, missions.len(), truncate(mission, 60));
104
105        let config = ModelConfig::resolve(Preset::Fast, ".", None, None, None, None);
106        let mut runner = MissionRunner::new(config);
107        runner.auto_mode = true;
108
109        let result = match runner.run(mission).await {
110            Ok(()) => BenchmarkResult {
111                mission: truncate(mission, 50),
112                model: "fast".to_string(),
113                time_seconds: start.elapsed().as_secs_f64(),
114                score: Some(runner.best_score()),
115                error: None,
116            },
117            Err(e) => BenchmarkResult {
118                mission: truncate(mission, 50),
119                model: "fast".to_string(),
120                time_seconds: start.elapsed().as_secs_f64(),
121                score: None,
122                error: Some(format!("{}", e)),
123            },
124        };
125
126        if let Some(ref err) = result.error {
127            println!("  [FAIL] {:.1}s | {}", result.time_seconds, err);
128        } else {
129            println!(
130                "  [OK]   {:.1}s | score: {:.1}/10",
131                result.time_seconds,
132                result.score.unwrap_or(0.0)
133            );
134        }
135
136        results.push(result);
137    }
138
139    print_summary("Quick", &results);
140    save_results("quick_benchmark.md", "Quick", &results).await?;
141    Ok(())
142}
143
/// Shortens `s` for display: returned unchanged when it fits within `max`
/// bytes, otherwise cut down and suffixed with "...".
///
/// The cut point is walked back to the nearest UTF-8 character boundary so
/// slicing can never panic on multi-byte text — the original
/// `&s[..max - 3]` byte slice panicked whenever that index fell inside a
/// multi-byte character (this function is fed arbitrary error strings via
/// `truncate(e, 30)` in `save_results`). Note that `max` is a byte budget,
/// not a character count, matching the original semantics for ASCII input.
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_string();
    }
    // Reserve 3 bytes for the ellipsis, then back up to a char boundary.
    let mut end = max.saturating_sub(3);
    while end > 0 && !s.is_char_boundary(end) {
        end -= 1;
    }
    format!("{}...", &s[..end])
}
151
152fn print_summary(phase: &str, results: &[BenchmarkResult]) {
153    println!("\n{} Benchmark Summary", phase);
154    println!("{}", "=".repeat(90));
155    println!(
156        "{:<55} {:>8} {:>8} {:>6}",
157        "Mission", "Time", "Score", "Status"
158    );
159    println!("{}", "-".repeat(90));
160
161    for r in results {
162        let status = if r.error.is_some() { "FAIL" } else { "OK" };
163        let score = r
164            .score
165            .map(|s| format!("{:.1}", s))
166            .unwrap_or_else(|| "-".into());
167        println!(
168            "{:<55} {:>7.1}s {:>8} {:>6}",
169            r.mission, r.time_seconds, score, status
170        );
171    }
172    println!("{}", "=".repeat(90));
173
174    let total = results.len();
175    let passed = results.iter().filter(|r| r.error.is_none()).count();
176    let avg_score: f64 = results.iter().filter_map(|r| r.score).sum::<f64>() / passed.max(1) as f64;
177    let avg_time: f64 = results.iter().map(|r| r.time_seconds).sum::<f64>() / total as f64;
178    println!(
179        "\n  Pass: {}/{} | Avg score: {:.1} | Avg time: {:.0}s",
180        passed, total, avg_score, avg_time
181    );
182}
183
184async fn save_results(filename: &str, phase: &str, results: &[BenchmarkResult]) -> Result<()> {
185    let dir = ".battlecommand/benchmarks";
186    tokio::fs::create_dir_all(dir).await?;
187
188    let mut md = format!(
189        "# {} Benchmark Results\n\nGenerated: {}\n\n",
190        phase,
191        chrono::Utc::now().format("%Y-%m-%d %H:%M UTC")
192    );
193    md.push_str("| Mission | Model | Time | Score | Status |\n|---------|-------|------|-------|--------|\n");
194    for r in results {
195        let status = match &r.error {
196            Some(e) => format!("ERR: {}", truncate(e, 30)),
197            None => "OK".into(),
198        };
199        let score = r
200            .score
201            .map(|s| format!("{:.1}", s))
202            .unwrap_or_else(|| "-".into());
203        md.push_str(&format!(
204            "| {} | {} | {:.1}s | {} | {} |\n",
205            r.mission, r.model, r.time_seconds, score, status
206        ));
207    }
208
209    let path = format!("{}/{}", dir, filename);
210    tokio::fs::write(&path, &md).await?;
211    println!("\nResults saved to {}", path);
212    Ok(())
213}