// dev_chaos/lib.rs

//! # dev-chaos
//!
//! Failure injection and recovery testing for Rust. Part of the
//! `dev-*` verification suite.
//!
//! Most code is tested only on the happy path. Real systems fail
//! through partial writes, crashes, timeouts, corrupt data, and
//! broken connections. `dev-chaos` provides primitives for injecting
//! those failures on purpose, then verifying that recovery logic does
//! its job.
//!
//! ## Quick example
//!
//! ```no_run
//! use dev_chaos::{FailureSchedule, FailureMode};
//!
//! // Fail on the 3rd, 7th, and 10th attempt.
//! let schedule = FailureSchedule::on_attempts(&[3, 7, 10], FailureMode::IoError);
//!
//! for attempt in 1..=12 {
//!     match schedule.maybe_fail(attempt) {
//!         Ok(()) => { /* operation proceeds */ }
//!         Err(e) => { /* recovery path */ }
//!     }
//! }
//! ```

// Turn on the unstable `doc_cfg` rustdoc feature, but only when the crate is
// compiled with the `docsrs` cfg set; ordinary builds never see it.
#![cfg_attr(docsrs, feature(doc_cfg))]
// Every public item must carry documentation.
#![warn(missing_docs)]
// Flag pre-2018 idioms (for example elided lifetimes hidden in paths).
#![warn(rust_2018_idioms)]

use std::collections::HashSet;
use std::num::NonZeroUsize;
use std::sync::atomic::{AtomicUsize, Ordering};

use dev_report::{CheckResult, Severity};

/// A type of failure that can be injected.
///
/// A mode is a small, copyable label. It does not perform the failure
/// itself; it records *which kind* of fault is being simulated so the code
/// under test can pick the matching recovery path.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FailureMode {
    /// Generic I/O error.
    IoError,
    /// Partial write: returns an error after writing some bytes.
    PartialWrite,
    /// Connection reset.
    ConnectionReset,
    /// Operation timeout.
    Timeout,
    /// Corrupted data: returns success but with corrupted bytes.
    Corruption,
    /// Permission denied.
    PermissionDenied,
}

impl FailureMode {
    /// Human-readable name for this failure mode.
    ///
    /// The name is a fixed lowercase `snake_case` string (for example
    /// `"io_error"`), one per variant.
    #[must_use]
    pub fn as_str(&self) -> &'static str {
        // Exhaustive on purpose: adding a variant must force a name here.
        match self {
            FailureMode::IoError => "io_error",
            FailureMode::PartialWrite => "partial_write",
            FailureMode::ConnectionReset => "connection_reset",
            FailureMode::Timeout => "timeout",
            FailureMode::Corruption => "corruption",
            FailureMode::PermissionDenied => "permission_denied",
        }
    }
}

impl std::fmt::Display for FailureMode {
    /// Writes exactly the string returned by [`FailureMode::as_str`].
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}

68/// An error returned by injected failures.
69#[derive(Debug, Clone)]
70pub struct InjectedFailure {
71    /// The mode of failure that was injected.
72    pub mode: FailureMode,
73    /// The attempt number at which the failure was injected.
74    pub attempt: usize,
75}
76
77impl std::fmt::Display for InjectedFailure {
78    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
79        write!(
80            f,
81            "injected failure {} at attempt {}",
82            self.mode.as_str(),
83            self.attempt
84        )
85    }
86}
87
88impl std::error::Error for InjectedFailure {}
89
90/// A schedule that decides whether a given attempt fails.
91pub struct FailureSchedule {
92    failing_attempts: HashSet<usize>,
93    mode: FailureMode,
94    invocations: AtomicUsize,
95}
96
97impl FailureSchedule {
98    /// Build a schedule that fails on specific attempt numbers
99    /// (1-indexed).
100    pub fn on_attempts(attempts: &[usize], mode: FailureMode) -> Self {
101        Self {
102            failing_attempts: attempts.iter().copied().collect(),
103            mode,
104            invocations: AtomicUsize::new(0),
105        }
106    }
107
108    /// Build a schedule that fails on every Nth attempt (1-indexed).
109    pub fn every_n(n: usize, mode: FailureMode) -> Self {
110        let mut s = HashSet::new();
111        // We don't know how many attempts there will be in advance,
112        // so we record the modulus instead and check at maybe_fail time.
113        // Implemented via a sentinel: attempts == [n] and a flag.
114        // Simpler: just expand for up to 1024 attempts.
115        for k in 1..=1024 {
116            if k % n == 0 {
117                s.insert(k);
118            }
119        }
120        Self {
121            failing_attempts: s,
122            mode,
123            invocations: AtomicUsize::new(0),
124        }
125    }
126
127    /// Check whether the given attempt should fail. Returns `Ok(())`
128    /// if it should proceed, `Err(InjectedFailure)` otherwise.
129    pub fn maybe_fail(&self, attempt: usize) -> Result<(), InjectedFailure> {
130        self.invocations.fetch_add(1, Ordering::Relaxed);
131        if self.failing_attempts.contains(&attempt) {
132            Err(InjectedFailure {
133                mode: self.mode,
134                attempt,
135            })
136        } else {
137            Ok(())
138        }
139    }
140
141    /// Total invocations of `maybe_fail` since this schedule was built.
142    pub fn invocation_count(&self) -> usize {
143        self.invocations.load(Ordering::Relaxed)
144    }
145}
146
147/// Verify that recovery logic succeeded after a failure schedule.
148///
149/// `expected_failures` is the number of times the recovery path was
150/// expected to be triggered. `actual_failures` is what was observed.
151/// Returns a `CheckResult` describing whether recovery worked.
152pub fn assert_recovered(
153    name: impl Into<String>,
154    expected_failures: usize,
155    actual_failures: usize,
156    final_state_ok: bool,
157) -> CheckResult {
158    let name = format!("chaos::{}", name.into());
159    if !final_state_ok {
160        return CheckResult::fail(name, Severity::Critical).with_detail(format!(
161            "system did not recover. expected {expected_failures} injected failures, observed {actual_failures}, final state failed validation"
162        ));
163    }
164    if actual_failures < expected_failures {
165        return CheckResult::warn(name, Severity::Warning).with_detail(format!(
166            "fewer failures observed than scheduled (expected {expected_failures}, observed {actual_failures})"
167        ));
168    }
169    CheckResult::pass(name).with_detail(format!(
170        "recovered after {actual_failures} injected failure(s)"
171    ))
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Listed attempts fail, unlisted ones pass, and every call is counted.
    #[test]
    fn schedule_fails_on_specified_attempts() {
        let s = FailureSchedule::on_attempts(&[2, 4], FailureMode::IoError);
        assert!(s.maybe_fail(1).is_ok());
        assert!(s.maybe_fail(2).is_err());
        assert!(s.maybe_fail(3).is_ok());
        assert!(s.maybe_fail(4).is_err());
        assert_eq!(s.invocation_count(), 4);
    }

    /// Only multiples of the period fail.
    #[test]
    fn every_n_pattern() {
        let s = FailureSchedule::every_n(3, FailureMode::Timeout);
        assert!(s.maybe_fail(1).is_ok());
        assert!(s.maybe_fail(2).is_ok());
        assert!(s.maybe_fail(3).is_err());
        assert!(s.maybe_fail(6).is_err());
        assert!(s.maybe_fail(9).is_err());
    }

    /// The injected error carries the schedule's mode and the failing attempt.
    #[test]
    fn injected_failure_reports_mode_and_attempt() {
        let s = FailureSchedule::on_attempts(&[2], FailureMode::PartialWrite);
        let failure = s.maybe_fail(2).unwrap_err();
        assert_eq!(failure.mode, FailureMode::PartialWrite);
        assert_eq!(failure.attempt, 2);
        assert_eq!(
            failure.to_string(),
            "injected failure partial_write at attempt 2"
        );
    }

    /// Failing calls are counted just like passing ones.
    #[test]
    fn invocation_count_includes_failed_attempts() {
        let s = FailureSchedule::every_n(2, FailureMode::IoError);
        for attempt in 1..=4 {
            let _ = s.maybe_fail(attempt);
        }
        assert_eq!(s.invocation_count(), 4);
    }

    /// Valid final state with all scheduled failures observed is a pass.
    #[test]
    fn recovery_check_pass() {
        let c = assert_recovered("write_log", 2, 2, true);
        assert!(matches!(c.verdict, dev_report::Verdict::Pass));
    }

    /// An invalid final state fails even when the failure counts match.
    #[test]
    fn recovery_check_fail_when_state_invalid() {
        let c = assert_recovered("write_log", 2, 2, false);
        assert!(matches!(c.verdict, dev_report::Verdict::Fail));
    }
}