// batty_cli/team/stress.rs

//! Synthetic long-session stress harness with deterministic fault injection.
//!
//! The harness runs on a virtual clock so CI can exercise the full recovery
//! matrix quickly while still producing reports that look like a compressed
//! unattended session.
6
7use std::fmt;
8use std::path::{Path, PathBuf};
9
10use anyhow::{Context, Result};
11use serde::Serialize;
12
/// Directory, relative to the project root, that receives generated stress reports.
const REPORTS_DIR: &str = ".batty/reports/stress";
/// Length of the virtual timeline, in seconds, used by compact runs (ten minutes).
const COMPACT_DURATION_SECS: u64 = 10 * 60;
15
/// Caller-supplied configuration for a single stress run.
#[derive(Clone, Debug)]
pub struct StressTestOptions {
    /// Selects the fixed-length compact timeline; `duration_hours` is not consulted when set.
    pub compact: bool,
    /// Virtual length of a standard run, in hours. Values below one are raised to one.
    pub duration_hours: u64,
    /// Seed driving schedule jitter and the synthetic recovery timings.
    pub seed: u64,
    /// Destination of the JSON report; a timestamped file in the reports directory when `None`.
    pub json_out: Option<PathBuf>,
    /// Destination of the Markdown report; a timestamped file in the reports directory when `None`.
    pub markdown_out: Option<PathBuf>,
}
24
25#[derive(Debug, Clone)]
26pub struct StressRunArtifacts {
27    pub summary: StressSummary,
28    pub json_report_path: PathBuf,
29    pub markdown_report_path: PathBuf,
30}
31
32#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq, Hash)]
33#[serde(rename_all = "snake_case")]
34pub enum FaultKind {
35    AgentCrash,
36    ContextExhaustion,
37    MergeConflict,
38    BoardStarvation,
39    WorktreeCorruption,
40    ShimEof,
41}
42
43impl FaultKind {
44    const ALL: [Self; 6] = [
45        Self::AgentCrash,
46        Self::ContextExhaustion,
47        Self::MergeConflict,
48        Self::BoardStarvation,
49        Self::WorktreeCorruption,
50        Self::ShimEof,
51    ];
52
53    fn label(self) -> &'static str {
54        match self {
55            Self::AgentCrash => "agent_crash",
56            Self::ContextExhaustion => "context_exhaustion",
57            Self::MergeConflict => "merge_conflict",
58            Self::BoardStarvation => "board_starvation",
59            Self::WorktreeCorruption => "worktree_corruption",
60            Self::ShimEof => "shim_eof",
61        }
62    }
63
64    fn description(self) -> &'static str {
65        match self {
66            Self::AgentCrash => "Shim-backed agent process exits unexpectedly during active work.",
67            Self::ContextExhaustion => {
68                "Agent exceeds context budget and must be restarted with handoff state."
69            }
70            Self::MergeConflict => "Engineer worktree is left in unresolved merge-conflict state.",
71            Self::BoardStarvation => {
72                "Idle engineers outnumber dispatchable tasks and planning must replenish work."
73            }
74            Self::WorktreeCorruption => {
75                "Engineer worktree becomes unusable and must be rebuilt or reset to base."
76            }
77            Self::ShimEof => "Shim command channel closes and daemon must detect the dead runtime.",
78        }
79    }
80
81    fn roadmap_anchor(self) -> &'static str {
82        match self {
83            Self::AgentCrash => "Agent process dies inside shim",
84            Self::ContextExhaustion => "Codex agents exhaust context on meta-conversations",
85            Self::MergeConflict => "Merge conflict permanent stall",
86            Self::BoardStarvation => "Board empties when agents don't create tasks",
87            Self::WorktreeCorruption => "Worktree stuck on old branch",
88            Self::ShimEof => "Agent process dies inside shim",
89        }
90    }
91
92    fn sla_secs(self) -> u64 {
93        match self {
94            Self::AgentCrash => 60,
95            Self::ContextExhaustion => 90,
96            Self::MergeConflict => 90,
97            Self::BoardStarvation => 120,
98            Self::WorktreeCorruption => 120,
99            Self::ShimEof => 60,
100        }
101    }
102
103    fn ordinal(self) -> u64 {
104        match self {
105            Self::AgentCrash => 0,
106            Self::ContextExhaustion => 1,
107            Self::MergeConflict => 2,
108            Self::BoardStarvation => 3,
109            Self::WorktreeCorruption => 4,
110            Self::ShimEof => 5,
111        }
112    }
113}
114
115impl fmt::Display for FaultKind {
116    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
117        f.write_str(self.label())
118    }
119}
120
121#[derive(Debug, Clone, Serialize)]
122pub struct StressSummary {
123    pub compact: bool,
124    pub seed: u64,
125    pub virtual_duration_secs: u64,
126    pub total_faults: usize,
127    pub passed_faults: usize,
128    pub failed_faults: usize,
129    pub max_recovery_secs: u64,
130    pub avg_recovery_secs: f64,
131}
132
133#[derive(Debug, Clone, Serialize)]
134pub struct StressReport {
135    pub generated_at: String,
136    pub compact: bool,
137    pub seed: u64,
138    pub virtual_duration_secs: u64,
139    pub summary: StressSummary,
140    pub faults: Vec<FaultRecord>,
141}
142
143#[derive(Debug, Clone, Serialize)]
144pub struct FaultRecord {
145    pub sequence: usize,
146    pub kind: FaultKind,
147    pub description: String,
148    pub roadmap_anchor: String,
149    pub injected_at_secs: u64,
150    pub detected_at_secs: u64,
151    pub recovered_at_secs: u64,
152    pub recovery_time_secs: u64,
153    pub sla_secs: u64,
154    pub passed_sla: bool,
155    pub notes: String,
156}
157
158#[derive(Debug, Clone)]
159struct ScheduledFault {
160    sequence: usize,
161    kind: FaultKind,
162    injected_at_secs: u64,
163}
164
/// Result an injector reports for one fault; both offsets are relative to the injection time.
#[derive(Clone, Debug)]
struct InjectedFault {
    /// Seconds between injection and detection.
    detected_after_secs: u64,
    /// Seconds between injection and completed recovery.
    recovered_after_secs: u64,
    /// Free-form description copied into the report.
    notes: String,
}
171
172trait FaultInjector {
173    fn inject(&self, fault: &ScheduledFault) -> InjectedFault;
174}
175
/// Injector that derives pseudo-random but reproducible outcomes from a seed.
struct SyntheticFaultInjector {
    /// Base seed combined with each fault's sequence and kind.
    seed: u64,
}
179
180impl FaultInjector for SyntheticFaultInjector {
181    fn inject(&self, fault: &ScheduledFault) -> InjectedFault {
182        let sequence_mix = (fault.sequence as u64 + 1).wrapping_mul(0x9E37_79B9_7F4A_7C15);
183        let kind_mix = fault.kind.ordinal().wrapping_mul(0xA24B_AED4_963E_E407);
184        let mut rng = Lcg::new(self.seed ^ sequence_mix ^ kind_mix);
185        let sla = fault.kind.sla_secs();
186        let detect_cap = (sla / 5).max(2);
187        let detected_after_secs = 1 + rng.next_bounded(detect_cap);
188        let failure_roll = rng.next_bounded(12);
189        let recovered_after_secs = if failure_roll == 0 {
190            sla + 5 + rng.next_bounded((sla / 3).max(5))
191        } else {
192            let floor = sla.saturating_sub((sla / 3).max(5));
193            floor + rng.next_bounded((sla - floor).max(1))
194        };
195
196        InjectedFault {
197            detected_after_secs,
198            recovered_after_secs,
199            notes: format!(
200                "Synthetic {} injection on virtual timeline; detection {}s, recovery {}s.",
201                fault.kind, detected_after_secs, recovered_after_secs
202            ),
203        }
204    }
205}
206
/// Minimal 64-bit linear congruential generator used for reproducible randomness.
#[derive(Clone, Debug)]
struct Lcg {
    state: u64,
}

impl Lcg {
    /// Multiplier of the recurrence `state = state * MULTIPLIER + INCREMENT` (mod 2^64).
    const MULTIPLIER: u64 = 6_364_136_223_846_793_005;
    /// Additive term of the recurrence.
    const INCREMENT: u64 = 1_442_695_040_888_963_407;
    /// Offset added to the caller's seed to form the initial state.
    const SEED_OFFSET: u64 = 0xD1B5_4A32_D192_ED03;

    /// Creates a generator whose output sequence is fully determined by `seed`.
    fn new(seed: u64) -> Self {
        let state = seed.wrapping_add(Self::SEED_OFFSET);
        Self { state }
    }

    /// Advances the generator one step and returns the new state.
    fn next_u64(&mut self) -> u64 {
        let advanced = self
            .state
            .wrapping_mul(Self::MULTIPLIER)
            .wrapping_add(Self::INCREMENT);
        self.state = advanced;
        advanced
    }

    /// Returns a value in `0..upper_exclusive`.
    ///
    /// A bound of zero yields 0 and leaves the generator state untouched.
    fn next_bounded(&mut self, upper_exclusive: u64) -> u64 {
        match upper_exclusive {
            0 => 0,
            bound => self.next_u64() % bound,
        }
    }
}
235
236pub fn run(project_root: &Path, options: StressTestOptions) -> Result<StressRunArtifacts> {
237    let injector = SyntheticFaultInjector { seed: options.seed };
238    let report = run_with_injector(&options, &injector);
239
240    let report_dir = project_root.join(REPORTS_DIR);
241    std::fs::create_dir_all(&report_dir)
242        .with_context(|| format!("failed to create {}", report_dir.display()))?;
243
244    let timestamp = chrono::Utc::now().format("%Y%m%d-%H%M%S").to_string();
245    let json_path = options
246        .json_out
247        .clone()
248        .unwrap_or_else(|| report_dir.join(format!("stress-test-{timestamp}.json")));
249    let markdown_path = options
250        .markdown_out
251        .clone()
252        .unwrap_or_else(|| report_dir.join(format!("stress-test-{timestamp}.md")));
253
254    let json = serde_json::to_vec_pretty(&report).context("failed to serialize stress report")?;
255    std::fs::write(&json_path, json)
256        .with_context(|| format!("failed to write {}", json_path.display()))?;
257
258    let markdown = render_markdown(&report);
259    std::fs::write(&markdown_path, markdown)
260        .with_context(|| format!("failed to write {}", markdown_path.display()))?;
261
262    Ok(StressRunArtifacts {
263        summary: report.summary,
264        json_report_path: json_path,
265        markdown_report_path: markdown_path,
266    })
267}
268
269fn run_with_injector(options: &StressTestOptions, injector: &dyn FaultInjector) -> StressReport {
270    let virtual_duration_secs = if options.compact {
271        COMPACT_DURATION_SECS
272    } else {
273        options.duration_hours.max(1).saturating_mul(3600)
274    };
275    let faults = build_schedule(options.compact, virtual_duration_secs, options.seed)
276        .into_iter()
277        .map(|fault| evaluate_fault(fault, injector))
278        .collect::<Vec<_>>();
279
280    let total_faults = faults.len();
281    let passed_faults = faults.iter().filter(|fault| fault.passed_sla).count();
282    let failed_faults = total_faults.saturating_sub(passed_faults);
283    let max_recovery_secs = faults
284        .iter()
285        .map(|fault| fault.recovery_time_secs)
286        .max()
287        .unwrap_or(0);
288    let avg_recovery_secs = if total_faults == 0 {
289        0.0
290    } else {
291        faults
292            .iter()
293            .map(|fault| fault.recovery_time_secs as f64)
294            .sum::<f64>()
295            / total_faults as f64
296    };
297
298    let summary = StressSummary {
299        compact: options.compact,
300        seed: options.seed,
301        virtual_duration_secs,
302        total_faults,
303        passed_faults,
304        failed_faults,
305        max_recovery_secs,
306        avg_recovery_secs,
307    };
308
309    StressReport {
310        generated_at: chrono::Utc::now().to_rfc3339(),
311        compact: options.compact,
312        seed: options.seed,
313        virtual_duration_secs,
314        summary,
315        faults,
316    }
317}
318
319fn build_schedule(compact: bool, virtual_duration_secs: u64, seed: u64) -> Vec<ScheduledFault> {
320    if compact {
321        let spacing = (virtual_duration_secs / (FaultKind::ALL.len() as u64 + 1)).max(1);
322        return FaultKind::ALL
323            .into_iter()
324            .enumerate()
325            .map(|(idx, kind)| ScheduledFault {
326                sequence: idx + 1,
327                kind,
328                injected_at_secs: spacing * (idx as u64 + 1),
329            })
330            .collect();
331    }
332
333    let mut rng = Lcg::new(seed);
334    let mut scheduled = Vec::new();
335    let baseline_count = FaultKind::ALL.len();
336    let extra_count = ((virtual_duration_secs / 3600) as usize).max(2);
337    let total = baseline_count + extra_count;
338    let base_spacing = (virtual_duration_secs / (total as u64 + 1)).max(1);
339
340    for (idx, kind) in FaultKind::ALL.into_iter().enumerate() {
341        let jitter = rng.next_bounded((base_spacing / 3).max(1));
342        scheduled.push(ScheduledFault {
343            sequence: idx + 1,
344            kind,
345            injected_at_secs: (base_spacing * (idx as u64 + 1) + jitter)
346                .min(virtual_duration_secs.saturating_sub(1)),
347        });
348    }
349
350    for idx in baseline_count..total {
351        let kind = FaultKind::ALL[rng.next_bounded(FaultKind::ALL.len() as u64) as usize];
352        let jitter = rng.next_bounded((base_spacing / 2).max(1));
353        scheduled.push(ScheduledFault {
354            sequence: idx + 1,
355            kind,
356            injected_at_secs: (base_spacing * (idx as u64 + 1) + jitter)
357                .min(virtual_duration_secs.saturating_sub(1)),
358        });
359    }
360
361    scheduled.sort_by_key(|fault| (fault.injected_at_secs, fault.sequence));
362    for (idx, fault) in scheduled.iter_mut().enumerate() {
363        fault.sequence = idx + 1;
364    }
365    scheduled
366}
367
368fn evaluate_fault(fault: ScheduledFault, injector: &dyn FaultInjector) -> FaultRecord {
369    let injected = injector.inject(&fault);
370    let detected_at_secs = fault.injected_at_secs + injected.detected_after_secs;
371    let recovered_at_secs = fault.injected_at_secs + injected.recovered_after_secs;
372    let sla_secs = fault.kind.sla_secs();
373    let recovery_time_secs = injected.recovered_after_secs;
374
375    FaultRecord {
376        sequence: fault.sequence,
377        kind: fault.kind,
378        description: fault.kind.description().to_string(),
379        roadmap_anchor: fault.kind.roadmap_anchor().to_string(),
380        injected_at_secs: fault.injected_at_secs,
381        detected_at_secs,
382        recovered_at_secs,
383        recovery_time_secs,
384        sla_secs,
385        passed_sla: recovery_time_secs <= sla_secs,
386        notes: injected.notes,
387    }
388}
389
390fn render_markdown(report: &StressReport) -> String {
391    let mut out = String::new();
392    out.push_str("# Batty Stress Test Report\n\n");
393    out.push_str("## Summary\n\n");
394    out.push_str(&format!(
395        "- Mode: {}\n- Seed: {}\n- Virtual duration: {}s\n- Faults injected: {}\n- SLA passed: {}\n- SLA failed: {}\n- Max recovery: {}s\n- Avg recovery: {:.1}s\n\n",
396        if report.compact { "compact" } else { "standard" },
397        report.seed,
398        report.virtual_duration_secs,
399        report.summary.total_faults,
400        report.summary.passed_faults,
401        report.summary.failed_faults,
402        report.summary.max_recovery_secs,
403        report.summary.avg_recovery_secs,
404    ));
405    out.push_str("## Faults\n\n");
406    out.push_str("| # | Fault | Injected | Recovered | Recovery | SLA | Status |\n");
407    out.push_str("|---|---|---:|---:|---:|---:|---|\n");
408    for fault in &report.faults {
409        out.push_str(&format!(
410            "| {} | {} | {}s | {}s | {}s | {}s | {} |\n",
411            fault.sequence,
412            fault.kind,
413            fault.injected_at_secs,
414            fault.recovered_at_secs,
415            fault.recovery_time_secs,
416            fault.sla_secs,
417            if fault.passed_sla { "pass" } else { "fail" }
418        ));
419    }
420    out.push_str("\n## Notes\n\n");
421    for fault in &report.faults {
422        out.push_str(&format!(
423            "- `{}` mapped to roadmap item \"{}\": {}\n",
424            fault.kind, fault.roadmap_anchor, fault.notes
425        ));
426    }
427    out
428}
429
#[cfg(test)]
mod tests {
    use super::*;

    /// Injector that replays pre-set `(detection, recovery)` offsets, looked up
    /// by the fault's 1-based sequence number.
    struct FixedInjector {
        recoveries: Vec<(u64, u64)>,
    }

    impl FaultInjector for FixedInjector {
        fn inject(&self, fault: &ScheduledFault) -> InjectedFault {
            let slot = fault.sequence - 1;
            let (detected_after_secs, recovered_after_secs) = self.recoveries[slot];
            let notes = format!("fixed outcome for {}", fault.kind);
            InjectedFault {
                detected_after_secs,
                recovered_after_secs,
                notes,
            }
        }
    }

    /// Options shared by the tests: eight hours, seed 7, default report paths.
    fn options(compact: bool) -> StressTestOptions {
        StressTestOptions {
            compact,
            duration_hours: 8,
            seed: 7,
            json_out: None,
            markdown_out: None,
        }
    }

    #[test]
    fn compact_schedule_covers_full_fault_matrix() {
        let schedule = build_schedule(true, COMPACT_DURATION_SECS, 7);

        assert_eq!(schedule.len(), FaultKind::ALL.len());
        // Every kind appears at least once.
        assert!(
            FaultKind::ALL
                .iter()
                .all(|kind| schedule.iter().any(|fault| fault.kind == *kind))
        );
        // Injection times are strictly increasing.
        assert!(
            schedule
                .windows(2)
                .all(|pair| pair[0].injected_at_secs < pair[1].injected_at_secs)
        );
    }

    #[test]
    fn standard_schedule_extends_matrix_with_additional_faults() {
        let duration_secs = 8 * 3600;
        let schedule = build_schedule(false, duration_secs, 9);

        assert!(schedule.len() > FaultKind::ALL.len());
        assert!(
            FaultKind::ALL
                .iter()
                .all(|kind| schedule.iter().any(|fault| fault.kind == *kind))
        );
        // Every injection lands inside the timeline.
        assert!(
            schedule
                .iter()
                .all(|fault| fault.injected_at_secs < duration_secs)
        );
    }

    #[test]
    fn sla_failure_is_reported_when_recovery_exceeds_threshold() {
        // Only the first outcome (61s against a 60s SLA) breaches its budget.
        let recoveries = vec![(2, 61), (2, 89), (2, 88), (2, 100), (2, 115), (2, 59)];
        let injector = FixedInjector { recoveries };

        let report = run_with_injector(&options(true), &injector);

        assert_eq!(report.summary.total_faults, 6);
        assert_eq!(report.summary.failed_faults, 1);
        assert!(!report.faults[0].passed_sla);
        assert!(report.faults[1].passed_sla);
    }

    #[test]
    fn run_writes_json_and_markdown_reports() {
        let tmp = tempfile::tempdir().unwrap();
        let json_path = tmp.path().join("stress.json");
        let markdown_path = tmp.path().join("stress.md");
        let run_options = StressTestOptions {
            compact: true,
            duration_hours: 8,
            seed: 3,
            json_out: Some(json_path.clone()),
            markdown_out: Some(markdown_path.clone()),
        };

        let report = run(tmp.path(), run_options).unwrap();

        assert_eq!(report.json_report_path, json_path);
        assert_eq!(report.markdown_report_path, markdown_path);

        let json = std::fs::read_to_string(&report.json_report_path).unwrap();
        let markdown = std::fs::read_to_string(&report.markdown_report_path).unwrap();

        assert!(json.contains("\"faults\""));
        assert!(markdown.contains("# Batty Stress Test Report"));
        assert!(markdown.contains("| # | Fault |"));
    }
}