1use std::fmt;
8use std::path::{Path, PathBuf};
9
10use anyhow::{Context, Result};
11use serde::Serialize;
12
/// Directory, relative to the project root, where reports are written when no
/// explicit output path is supplied.
const REPORTS_DIR: &str = ".batty/reports/stress";
/// Length of the virtual timeline used in compact mode, in seconds (ten minutes).
const COMPACT_DURATION_SECS: u64 = 10 * 60;
15
/// Caller-supplied configuration for a stress-test run.
#[derive(Clone, Debug)]
pub struct StressTestOptions {
    /// When `true`, the run uses the fixed compact timeline and `duration_hours`
    /// is ignored.
    pub compact: bool,
    /// Length of the virtual timeline in hours for non-compact runs.
    pub duration_hours: u64,
    /// Seed for the deterministic pseudo-random schedule and synthetic outcomes.
    pub seed: u64,
    /// Explicit destination for the JSON report; a timestamped default is used
    /// when `None`.
    pub json_out: Option<PathBuf>,
    /// Explicit destination for the Markdown report; a timestamped default is
    /// used when `None`.
    pub markdown_out: Option<PathBuf>,
}
24
25#[derive(Debug, Clone)]
26pub struct StressRunArtifacts {
27 pub summary: StressSummary,
28 pub json_report_path: PathBuf,
29 pub markdown_report_path: PathBuf,
30}
31
32#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq, Hash)]
33#[serde(rename_all = "snake_case")]
34pub enum FaultKind {
35 AgentCrash,
36 ContextExhaustion,
37 MergeConflict,
38 BoardStarvation,
39 WorktreeCorruption,
40 ShimEof,
41}
42
43impl FaultKind {
44 const ALL: [Self; 6] = [
45 Self::AgentCrash,
46 Self::ContextExhaustion,
47 Self::MergeConflict,
48 Self::BoardStarvation,
49 Self::WorktreeCorruption,
50 Self::ShimEof,
51 ];
52
53 fn label(self) -> &'static str {
54 match self {
55 Self::AgentCrash => "agent_crash",
56 Self::ContextExhaustion => "context_exhaustion",
57 Self::MergeConflict => "merge_conflict",
58 Self::BoardStarvation => "board_starvation",
59 Self::WorktreeCorruption => "worktree_corruption",
60 Self::ShimEof => "shim_eof",
61 }
62 }
63
64 fn description(self) -> &'static str {
65 match self {
66 Self::AgentCrash => "Shim-backed agent process exits unexpectedly during active work.",
67 Self::ContextExhaustion => {
68 "Agent exceeds context budget and must be restarted with handoff state."
69 }
70 Self::MergeConflict => "Engineer worktree is left in unresolved merge-conflict state.",
71 Self::BoardStarvation => {
72 "Idle engineers outnumber dispatchable tasks and planning must replenish work."
73 }
74 Self::WorktreeCorruption => {
75 "Engineer worktree becomes unusable and must be rebuilt or reset to base."
76 }
77 Self::ShimEof => "Shim command channel closes and daemon must detect the dead runtime.",
78 }
79 }
80
81 fn roadmap_anchor(self) -> &'static str {
82 match self {
83 Self::AgentCrash => "Agent process dies inside shim",
84 Self::ContextExhaustion => "Codex agents exhaust context on meta-conversations",
85 Self::MergeConflict => "Merge conflict permanent stall",
86 Self::BoardStarvation => "Board empties when agents don't create tasks",
87 Self::WorktreeCorruption => "Worktree stuck on old branch",
88 Self::ShimEof => "Agent process dies inside shim",
89 }
90 }
91
92 fn sla_secs(self) -> u64 {
93 match self {
94 Self::AgentCrash => 60,
95 Self::ContextExhaustion => 90,
96 Self::MergeConflict => 90,
97 Self::BoardStarvation => 120,
98 Self::WorktreeCorruption => 120,
99 Self::ShimEof => 60,
100 }
101 }
102
103 fn ordinal(self) -> u64 {
104 match self {
105 Self::AgentCrash => 0,
106 Self::ContextExhaustion => 1,
107 Self::MergeConflict => 2,
108 Self::BoardStarvation => 3,
109 Self::WorktreeCorruption => 4,
110 Self::ShimEof => 5,
111 }
112 }
113}
114
115impl fmt::Display for FaultKind {
116 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
117 f.write_str(self.label())
118 }
119}
120
121#[derive(Debug, Clone, Serialize)]
122pub struct StressSummary {
123 pub compact: bool,
124 pub seed: u64,
125 pub virtual_duration_secs: u64,
126 pub total_faults: usize,
127 pub passed_faults: usize,
128 pub failed_faults: usize,
129 pub max_recovery_secs: u64,
130 pub avg_recovery_secs: f64,
131}
132
133#[derive(Debug, Clone, Serialize)]
134pub struct StressReport {
135 pub generated_at: String,
136 pub compact: bool,
137 pub seed: u64,
138 pub virtual_duration_secs: u64,
139 pub summary: StressSummary,
140 pub faults: Vec<FaultRecord>,
141}
142
143#[derive(Debug, Clone, Serialize)]
144pub struct FaultRecord {
145 pub sequence: usize,
146 pub kind: FaultKind,
147 pub description: String,
148 pub roadmap_anchor: String,
149 pub injected_at_secs: u64,
150 pub detected_at_secs: u64,
151 pub recovered_at_secs: u64,
152 pub recovery_time_secs: u64,
153 pub sla_secs: u64,
154 pub passed_sla: bool,
155 pub notes: String,
156}
157
158#[derive(Debug, Clone)]
159struct ScheduledFault {
160 sequence: usize,
161 kind: FaultKind,
162 injected_at_secs: u64,
163}
164
/// What an injector reports back after injecting one fault.
///
/// Both durations are offsets from the injection time, not absolute times.
#[derive(Clone, Debug)]
struct InjectedFault {
    /// Seconds between injection and detection.
    detected_after_secs: u64,
    /// Seconds between injection and completed recovery.
    recovered_after_secs: u64,
    /// Free-form notes carried into the final record.
    notes: String,
}
171
/// Strategy for turning a scheduled fault into an observed outcome.
///
/// Abstracted as a trait so tests can substitute scripted outcomes for the
/// synthetic generator.
trait FaultInjector {
    /// Injects `fault` and reports how long detection and recovery took,
    /// measured from the injection time.
    fn inject(&self, fault: &ScheduledFault) -> InjectedFault;
}
175
/// Injector that fabricates detection and recovery times from a seed instead of
/// touching a real runtime.
struct SyntheticFaultInjector {
    /// Base seed mixed with each fault's sequence and kind to derive its outcome.
    seed: u64,
}
179
180impl FaultInjector for SyntheticFaultInjector {
181 fn inject(&self, fault: &ScheduledFault) -> InjectedFault {
182 let sequence_mix = (fault.sequence as u64 + 1).wrapping_mul(0x9E37_79B9_7F4A_7C15);
183 let kind_mix = fault.kind.ordinal().wrapping_mul(0xA24B_AED4_963E_E407);
184 let mut rng = Lcg::new(self.seed ^ sequence_mix ^ kind_mix);
185 let sla = fault.kind.sla_secs();
186 let detect_cap = (sla / 5).max(2);
187 let detected_after_secs = 1 + rng.next_bounded(detect_cap);
188 let failure_roll = rng.next_bounded(12);
189 let recovered_after_secs = if failure_roll == 0 {
190 sla + 5 + rng.next_bounded((sla / 3).max(5))
191 } else {
192 let floor = sla.saturating_sub((sla / 3).max(5));
193 floor + rng.next_bounded((sla - floor).max(1))
194 };
195
196 InjectedFault {
197 detected_after_secs,
198 recovered_after_secs,
199 notes: format!(
200 "Synthetic {} injection on virtual timeline; detection {}s, recovery {}s.",
201 fault.kind, detected_after_secs, recovered_after_secs
202 ),
203 }
204 }
205}
206
/// Minimal 64-bit linear congruential generator used for reproducible runs.
///
/// Not suitable for anything beyond deterministic test-data generation.
#[derive(Clone, Debug)]
struct Lcg {
    /// Current generator state; also the most recently returned value.
    state: u64,
}

impl Lcg {
    /// Creates a generator whose initial state is `seed` offset by a fixed
    /// constant (with wrap-around).
    fn new(seed: u64) -> Self {
        let state = seed.wrapping_add(0xD1B5_4A32_D192_ED03);
        Self { state }
    }

    /// Advances the generator one step and returns the new state.
    fn next_u64(&mut self) -> u64 {
        let advanced = self
            .state
            .wrapping_mul(6_364_136_223_846_793_005)
            .wrapping_add(1_442_695_040_888_963_407);
        self.state = advanced;
        advanced
    }

    /// Returns a value in `[0, upper_exclusive)` by reducing the next output
    /// modulo the bound.
    ///
    /// A bound of `0` yields `0` and does not advance the generator.
    fn next_bounded(&mut self, upper_exclusive: u64) -> u64 {
        match upper_exclusive {
            0 => 0,
            bound => self.next_u64() % bound,
        }
    }
}
235
236pub fn run(project_root: &Path, options: StressTestOptions) -> Result<StressRunArtifacts> {
237 let injector = SyntheticFaultInjector { seed: options.seed };
238 let report = run_with_injector(&options, &injector);
239
240 let report_dir = project_root.join(REPORTS_DIR);
241 std::fs::create_dir_all(&report_dir)
242 .with_context(|| format!("failed to create {}", report_dir.display()))?;
243
244 let timestamp = chrono::Utc::now().format("%Y%m%d-%H%M%S").to_string();
245 let json_path = options
246 .json_out
247 .clone()
248 .unwrap_or_else(|| report_dir.join(format!("stress-test-{timestamp}.json")));
249 let markdown_path = options
250 .markdown_out
251 .clone()
252 .unwrap_or_else(|| report_dir.join(format!("stress-test-{timestamp}.md")));
253
254 let json = serde_json::to_vec_pretty(&report).context("failed to serialize stress report")?;
255 std::fs::write(&json_path, json)
256 .with_context(|| format!("failed to write {}", json_path.display()))?;
257
258 let markdown = render_markdown(&report);
259 std::fs::write(&markdown_path, markdown)
260 .with_context(|| format!("failed to write {}", markdown_path.display()))?;
261
262 Ok(StressRunArtifacts {
263 summary: report.summary,
264 json_report_path: json_path,
265 markdown_report_path: markdown_path,
266 })
267}
268
269fn run_with_injector(options: &StressTestOptions, injector: &dyn FaultInjector) -> StressReport {
270 let virtual_duration_secs = if options.compact {
271 COMPACT_DURATION_SECS
272 } else {
273 options.duration_hours.max(1).saturating_mul(3600)
274 };
275 let faults = build_schedule(options.compact, virtual_duration_secs, options.seed)
276 .into_iter()
277 .map(|fault| evaluate_fault(fault, injector))
278 .collect::<Vec<_>>();
279
280 let total_faults = faults.len();
281 let passed_faults = faults.iter().filter(|fault| fault.passed_sla).count();
282 let failed_faults = total_faults.saturating_sub(passed_faults);
283 let max_recovery_secs = faults
284 .iter()
285 .map(|fault| fault.recovery_time_secs)
286 .max()
287 .unwrap_or(0);
288 let avg_recovery_secs = if total_faults == 0 {
289 0.0
290 } else {
291 faults
292 .iter()
293 .map(|fault| fault.recovery_time_secs as f64)
294 .sum::<f64>()
295 / total_faults as f64
296 };
297
298 let summary = StressSummary {
299 compact: options.compact,
300 seed: options.seed,
301 virtual_duration_secs,
302 total_faults,
303 passed_faults,
304 failed_faults,
305 max_recovery_secs,
306 avg_recovery_secs,
307 };
308
309 StressReport {
310 generated_at: chrono::Utc::now().to_rfc3339(),
311 compact: options.compact,
312 seed: options.seed,
313 virtual_duration_secs,
314 summary,
315 faults,
316 }
317}
318
319fn build_schedule(compact: bool, virtual_duration_secs: u64, seed: u64) -> Vec<ScheduledFault> {
320 if compact {
321 let spacing = (virtual_duration_secs / (FaultKind::ALL.len() as u64 + 1)).max(1);
322 return FaultKind::ALL
323 .into_iter()
324 .enumerate()
325 .map(|(idx, kind)| ScheduledFault {
326 sequence: idx + 1,
327 kind,
328 injected_at_secs: spacing * (idx as u64 + 1),
329 })
330 .collect();
331 }
332
333 let mut rng = Lcg::new(seed);
334 let mut scheduled = Vec::new();
335 let baseline_count = FaultKind::ALL.len();
336 let extra_count = ((virtual_duration_secs / 3600) as usize).max(2);
337 let total = baseline_count + extra_count;
338 let base_spacing = (virtual_duration_secs / (total as u64 + 1)).max(1);
339
340 for (idx, kind) in FaultKind::ALL.into_iter().enumerate() {
341 let jitter = rng.next_bounded((base_spacing / 3).max(1));
342 scheduled.push(ScheduledFault {
343 sequence: idx + 1,
344 kind,
345 injected_at_secs: (base_spacing * (idx as u64 + 1) + jitter)
346 .min(virtual_duration_secs.saturating_sub(1)),
347 });
348 }
349
350 for idx in baseline_count..total {
351 let kind = FaultKind::ALL[rng.next_bounded(FaultKind::ALL.len() as u64) as usize];
352 let jitter = rng.next_bounded((base_spacing / 2).max(1));
353 scheduled.push(ScheduledFault {
354 sequence: idx + 1,
355 kind,
356 injected_at_secs: (base_spacing * (idx as u64 + 1) + jitter)
357 .min(virtual_duration_secs.saturating_sub(1)),
358 });
359 }
360
361 scheduled.sort_by_key(|fault| (fault.injected_at_secs, fault.sequence));
362 for (idx, fault) in scheduled.iter_mut().enumerate() {
363 fault.sequence = idx + 1;
364 }
365 scheduled
366}
367
368fn evaluate_fault(fault: ScheduledFault, injector: &dyn FaultInjector) -> FaultRecord {
369 let injected = injector.inject(&fault);
370 let detected_at_secs = fault.injected_at_secs + injected.detected_after_secs;
371 let recovered_at_secs = fault.injected_at_secs + injected.recovered_after_secs;
372 let sla_secs = fault.kind.sla_secs();
373 let recovery_time_secs = injected.recovered_after_secs;
374
375 FaultRecord {
376 sequence: fault.sequence,
377 kind: fault.kind,
378 description: fault.kind.description().to_string(),
379 roadmap_anchor: fault.kind.roadmap_anchor().to_string(),
380 injected_at_secs: fault.injected_at_secs,
381 detected_at_secs,
382 recovered_at_secs,
383 recovery_time_secs,
384 sla_secs,
385 passed_sla: recovery_time_secs <= sla_secs,
386 notes: injected.notes,
387 }
388}
389
390fn render_markdown(report: &StressReport) -> String {
391 let mut out = String::new();
392 out.push_str("# Batty Stress Test Report\n\n");
393 out.push_str("## Summary\n\n");
394 out.push_str(&format!(
395 "- Mode: {}\n- Seed: {}\n- Virtual duration: {}s\n- Faults injected: {}\n- SLA passed: {}\n- SLA failed: {}\n- Max recovery: {}s\n- Avg recovery: {:.1}s\n\n",
396 if report.compact { "compact" } else { "standard" },
397 report.seed,
398 report.virtual_duration_secs,
399 report.summary.total_faults,
400 report.summary.passed_faults,
401 report.summary.failed_faults,
402 report.summary.max_recovery_secs,
403 report.summary.avg_recovery_secs,
404 ));
405 out.push_str("## Faults\n\n");
406 out.push_str("| # | Fault | Injected | Recovered | Recovery | SLA | Status |\n");
407 out.push_str("|---|---|---:|---:|---:|---:|---|\n");
408 for fault in &report.faults {
409 out.push_str(&format!(
410 "| {} | {} | {}s | {}s | {}s | {}s | {} |\n",
411 fault.sequence,
412 fault.kind,
413 fault.injected_at_secs,
414 fault.recovered_at_secs,
415 fault.recovery_time_secs,
416 fault.sla_secs,
417 if fault.passed_sla { "pass" } else { "fail" }
418 ));
419 }
420 out.push_str("\n## Notes\n\n");
421 for fault in &report.faults {
422 out.push_str(&format!(
423 "- `{}` mapped to roadmap item \"{}\": {}\n",
424 fault.kind, fault.roadmap_anchor, fault.notes
425 ));
426 }
427 out
428}
429
#[cfg(test)]
mod tests {
    use super::*;

    /// Injector that replays scripted `(detected_after, recovered_after)` pairs,
    /// looked up by the fault's 1-based sequence number.
    struct FixedInjector {
        recoveries: Vec<(u64, u64)>,
    }

    impl FaultInjector for FixedInjector {
        fn inject(&self, fault: &ScheduledFault) -> InjectedFault {
            let (detected_after_secs, recovered_after_secs) = self.recoveries[fault.sequence - 1];
            let notes = format!("fixed outcome for {}", fault.kind);
            InjectedFault {
                detected_after_secs,
                recovered_after_secs,
                notes,
            }
        }
    }

    /// Baseline options with no explicit output paths.
    fn options(compact: bool) -> StressTestOptions {
        StressTestOptions {
            compact,
            duration_hours: 8,
            seed: 7,
            json_out: None,
            markdown_out: None,
        }
    }

    /// True when `schedule` contains at least one fault of every kind.
    fn covers_every_kind(schedule: &[ScheduledFault]) -> bool {
        FaultKind::ALL
            .iter()
            .all(|kind| schedule.iter().any(|fault| fault.kind == *kind))
    }

    #[test]
    fn compact_schedule_covers_full_fault_matrix() {
        let schedule = build_schedule(true, COMPACT_DURATION_SECS, 7);

        assert_eq!(schedule.len(), FaultKind::ALL.len());
        assert!(covers_every_kind(&schedule));

        // Injection times must be strictly increasing.
        let strictly_increasing = schedule
            .windows(2)
            .all(|pair| pair[0].injected_at_secs < pair[1].injected_at_secs);
        assert!(strictly_increasing);
    }

    #[test]
    fn standard_schedule_extends_matrix_with_additional_faults() {
        let duration_secs = 8 * 3600;
        let schedule = build_schedule(false, duration_secs, 9);

        assert!(schedule.len() > FaultKind::ALL.len());
        assert!(covers_every_kind(&schedule));

        let within_timeline = schedule
            .iter()
            .all(|fault| fault.injected_at_secs < duration_secs);
        assert!(within_timeline);
    }

    #[test]
    fn sla_failure_is_reported_when_recovery_exceeds_threshold() {
        // Only the first outcome (61s against a 60s SLA) breaches its SLA.
        let injector = FixedInjector {
            recoveries: vec![(2, 61), (2, 89), (2, 88), (2, 100), (2, 115), (2, 59)],
        };

        let report = run_with_injector(&options(true), &injector);

        assert_eq!(report.summary.total_faults, 6);
        assert_eq!(report.summary.failed_faults, 1);
        assert!(!report.faults[0].passed_sla);
        assert!(report.faults[1].passed_sla);
    }

    #[test]
    fn run_writes_json_and_markdown_reports() {
        let tmp = tempfile::tempdir().unwrap();
        let json_path = tmp.path().join("stress.json");
        let markdown_path = tmp.path().join("stress.md");

        let requested = StressTestOptions {
            compact: true,
            duration_hours: 8,
            seed: 3,
            json_out: Some(json_path.clone()),
            markdown_out: Some(markdown_path.clone()),
        };
        let artifacts = run(tmp.path(), requested).unwrap();

        assert_eq!(artifacts.json_report_path, json_path);
        assert_eq!(artifacts.markdown_report_path, markdown_path);

        let json = std::fs::read_to_string(&artifacts.json_report_path).unwrap();
        let markdown = std::fs::read_to_string(&artifacts.markdown_report_path).unwrap();

        assert!(json.contains("\"faults\""));
        assert!(markdown.contains("# Batty Stress Test Report"));
        assert!(markdown.contains("| # | Fault |"));
    }
}