Skip to main content

battlecommand_forge/
stress.rs

//! Stress test suite — 20 graded tasks (C4-C9) ported from battleclaw-v2.
//!
//! Tests the system's code generation quality across increasing complexity.
//! Responses are judged by a lightweight heuristic; generated code is not executed here.
4
5use crate::llm::LlmClient;
6use anyhow::Result;
7use std::time::Instant;
8
/// A single stress-test task: a prompt handed to the model plus its metadata.
///
/// All text fields are `&'static str`, so tasks are cheap to clone and compare.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StressTask {
    /// Short human-readable task name, shown in the progress output.
    pub name: &'static str,
    /// Complexity grade; the built-in tasks use 4 through 9.
    pub complexity: u32,
    /// Target language name, interpolated into the system prompt.
    pub language: &'static str,
    /// The user prompt sent to the model.
    pub prompt: &'static str,
}
15
/// Outcome of running one stress task.
#[derive(Debug, Clone, PartialEq)]
pub struct StressResult {
    /// Name of the task this result belongs to.
    pub name: String,
    /// Complexity grade copied from the task.
    pub complexity: u32,
    /// Whether the response passed the output check; `false` when generation failed.
    pub passed: bool,
    /// Number of lines in the model's response (0 when generation failed).
    pub lines: usize,
    /// Wall-clock seconds spent waiting on the generation call.
    pub duration_secs: f64,
}
23
24/// Run the stress test suite.
25pub async fn run_stress(llm: &LlmClient, max_tasks: usize) -> Result<Vec<StressResult>> {
26    let tasks = get_tasks();
27    let tasks: Vec<&StressTask> = tasks.iter().take(max_tasks).collect();
28
29    println!("BattleCommand Forge — Stress Test");
30    println!("==================================");
31    println!("Tasks: {}  Complexity: C4-C9\n", tasks.len());
32
33    let mut results = Vec::new();
34    let total_start = Instant::now();
35
36    for (i, task) in tasks.iter().enumerate() {
37        print!(
38            "  [{:>2}/{}] C{} {:<30} ",
39            i + 1,
40            tasks.len(),
41            task.complexity,
42            task.name
43        );
44
45        let start = Instant::now();
46        let system = format!(
47            "You are a senior engineer. Write ONLY the code requested. \
48             No explanations. The code must compile/run and print 'PASS' if all tests pass. \
49             Language: {}",
50            task.language
51        );
52
53        let response = llm
54            .generate(
55                &format!("STRESS-C{}", task.complexity),
56                &system,
57                task.prompt,
58            )
59            .await;
60        let duration = start.elapsed().as_secs_f64();
61
62        match response {
63            Ok(code) => {
64                let lines = code.lines().count();
65                // For stress tests, we check if the LLM produced reasonable output
66                let passed = lines > 5 && !code.contains("TODO") && !code.contains("FIXME");
67                results.push(StressResult {
68                    name: task.name.to_string(),
69                    complexity: task.complexity,
70                    passed,
71                    lines,
72                    duration_secs: duration,
73                });
74                println!(
75                    "{}  {:.1}s  {} lines",
76                    if passed { "PASS" } else { "FAIL" },
77                    duration,
78                    lines
79                );
80            }
81            Err(e) => {
82                results.push(StressResult {
83                    name: task.name.to_string(),
84                    complexity: task.complexity,
85                    passed: false,
86                    lines: 0,
87                    duration_secs: duration,
88                });
89                println!(
90                    "ERR   {:.1}s  {}",
91                    duration,
92                    e.to_string().chars().take(50).collect::<String>()
93                );
94            }
95        }
96    }
97
98    let wall_time = total_start.elapsed().as_secs_f64();
99    let passed = results.iter().filter(|r| r.passed).count();
100    let total = results.len();
101    let pass_rate = if total > 0 {
102        (passed as f64 / total as f64) * 100.0
103    } else {
104        0.0
105    };
106
107    println!("\n=== Stress Test Results ===");
108    println!("Pass Rate: {}/{} ({:.0}%)", passed, total, pass_rate);
109    println!("Wall Time: {:.1}s", wall_time);
110
111    if pass_rate >= 98.0 {
112        println!("RESULT: EXCELLENT (>= 98%)");
113    } else if pass_rate >= 90.0 {
114        println!("RESULT: GOOD (>= 90%)");
115    } else {
116        println!("RESULT: NEEDS IMPROVEMENT (< 90%)");
117    }
118
119    Ok(results)
120}
121
122fn get_tasks() -> Vec<StressTask> {
123    vec![
124        // C4 — Protocol structures and coordinate math
125        StressTask { name: "Coord Distance", complexity: 4, language: "python",
126            prompt: "Write a Python function `haversine(lat1, lon1, lat2, lon2) -> float` computing great-circle distance in km. Earth radius 6371. Test: London(51.5074,-0.1278) to Paris(48.8566,2.3522) should be 340-345 km. Print PASS if correct." },
127        StressTask { name: "Bearing Calc", complexity: 4, language: "python",
128            prompt: "Write `initial_bearing(lat1, lon1, lat2, lon2) -> float` returning initial bearing in degrees 0-360. Test: (0,0) to (0,1) ~= 90 degrees. Print PASS." },
129        StressTask { name: "Military Timestamp", complexity: 4, language: "python",
130            prompt: "Write functions to convert between Unix epoch and DTG format (DDHHMMZmmmYY). Test round-trip. Print PASS." },
131
132        // C5 — State machines, ring buffers
133        StressTask { name: "CRC-32", complexity: 5, language: "python",
134            prompt: "Implement CRC-32 with polynomial 0xEDB88320. CRC of '123456789' must equal 0xCBF43926. Print PASS." },
135        StressTask { name: "Ring Buffer", complexity: 5, language: "python",
136            prompt: "Implement a fixed-size circular buffer class with push, pop, peek, full, empty. Test FIFO order and wrap-around. Print PASS." },
137        StressTask { name: "Heading Normalize", complexity: 5, language: "python",
138            prompt: "Write normalize(deg)->float wrapping to [0,360), shortest_turn(from,to)->float signed, in_arc(heading,center,width)->bool. Test all. Print PASS." },
139        StressTask { name: "Priority Queue", complexity: 5, language: "python",
140            prompt: "Max-heap priority queue (no heapq). push(priority, data), pop() returns highest priority. Test order 5,4,3,1,1 for inputs 3,1,4,1,5. Print PASS." },
141
142        // C6 — Complex protocols, matrix math
143        StressTask { name: "Matrix 3x3", complexity: 6, language: "python",
144            prompt: "3x3 matrix class: multiply, transpose, determinant. det of [[1,2,3],[0,1,4],[5,6,0]] = 1.0. Print PASS." },
145        StressTask { name: "ENU Coordinates", complexity: 6, language: "python",
146            prompt: "WGS84 geodetic_to_ecef and ecef_to_enu functions. (0,0,0)->ECEF ~= (6378137,0,0). Point 1km north has ENU north ~1000m. Print PASS." },
147        StressTask { name: "Hamming Code", complexity: 6, language: "python",
148            prompt: "Hamming(7,4) encode/decode with single-error correction. Encode 0b1011, flip one bit, decode back correctly. Print PASS." },
149        StressTask { name: "Link-16 Word", complexity: 6, language: "python",
150            prompt: "Pack/unpack 32-bit Link-16 J-word: label(5b), sublabel(3b), data(24b). Round-trip (31,7,0xFFFFFF). Print PASS." },
151
152        // C7 — Scheduling, IFF, classification
153        StressTask { name: "Threat Assessment", complexity: 7, language: "python",
154            prompt: "ThreatAssessor scoring targets by closing_rate, distance, altitude, rcs, iff. Foe close fast low = CRITICAL, friend far slow high = LOW. Print PASS." },
155        StressTask { name: "Track Correlator", complexity: 7, language: "python",
156            prompt: "TrackCorrelator: nearest-neighbor association within gate distance. 2 sensors, 2 targets. Verify 2 tracks created with fused positions. Print PASS." },
157
158        // C8 — Kalman, ballistics
159        StressTask { name: "Kalman 1D", complexity: 8, language: "python",
160            prompt: "KalmanFilter1D with predict(dt) and update(z). Constant velocity model. Init at 0, feed 10 measurements of object at 100. Estimate within 10. Print PASS." },
161        StressTask { name: "Ballistic Trajectory", complexity: 8, language: "python",
162            prompt: "simulate_trajectory(v0, angle_deg, drag_coeff=0, dt=0.01) -> dict with max_height, range, flight_time. v0=100, 45deg, no drag: range~1019m. With drag: shorter. Print PASS." },
163        StressTask { name: "Engagement Zone", complexity: 8, language: "python",
164            prompt: "EngagementZone(min_range,max_range,min_alt,max_alt,max_speed). in_zone(), pk_estimate(). Zone(5,50,100,15000,800). (25,5000,200)=in, (100,5000,200)=out. Print PASS." },
165
166        // C9 — Full systems
167        StressTask { name: "Extended Kalman", complexity: 9, language: "python",
168            prompt: "EKF for 2D tracking: state=[x,y,vx,vy]. predict + update_position + update_bearing_range. Track target at (100,100) moving (10,0). Within 20m after 5 steps. Print PASS." },
169        StressTask { name: "A* Pathfinding", complexity: 9, language: "python",
170            prompt: "A* on 2D grid. Grid class with set_blocked, find_path. 10x10 grid with wall (one gap). Path from (0,0) to (9,9) exists and avoids blocked. Fully blocked -> empty. Print PASS." },
171        StressTask { name: "Track Manager", complexity: 9, language: "python",
172            prompt: "TrackManager: predict_all, correlate_and_update, prune. States: TENTATIVE->CONFIRMED->DELETED. 3 targets, 10 steps. >=2 confirmed tracks. Print PASS." },
173        StressTask { name: "Radar Processor", complexity: 9, language: "python",
174            prompt: "matched_filter (cross-correlation) and cfar_detect (Cell-Averaging CFAR). Signal with peak at index 20, template [1,1,1]. CFAR detects 2 targets above noise. Print PASS." },
175    ]
176}
177
#[cfg(test)]
mod tests {
    use super::*;

    /// The built-in suite must provide at least 20 tasks.
    #[test]
    fn test_task_count() {
        let count = get_tasks().len();
        assert!(count >= 20);
    }

    /// Every task's complexity grade must lie in 4..=9.
    #[test]
    fn test_complexity_range() {
        for task in get_tasks() {
            assert!((4..=9).contains(&task.complexity));
        }
    }
}