Skip to main content

dev_chaos/
lib.rs

1//! # dev-chaos
2//!
3//! Failure injection and recovery testing for Rust. Part of the
4//! `dev-*` verification suite.
5//!
6//! Most code is tested only on the happy path. Real systems fail
7//! through partial writes, crashes, timeouts, corrupt data, and
8//! broken connections. `dev-chaos` provides primitives for injecting
9//! those failures on purpose, then verifying that recovery logic does
10//! its job.
11//!
12//! ## Quick example
13//!
14//! ```no_run
15//! use dev_chaos::{FailureSchedule, FailureMode};
16//!
17//! // Fail on the 3rd, 7th, and 10th attempt.
18//! let schedule = FailureSchedule::on_attempts(&[3, 7, 10], FailureMode::IoError);
19//!
20//! for attempt in 1..=12 {
21//!     match schedule.maybe_fail(attempt) {
22//!         Ok(()) => { /* operation proceeds */ }
23//!         Err(_e) => { /* recovery path */ }
24//!     }
25//! }
26//! ```
27//!
28//! ## Modules
29//!
30//! - [`io`] — sync IO wrappers (`ChaosReader`, `ChaosWriter`, `ChaosFile`).
31//! - [`latency`] — non-failing slowdowns via `LatencyInjector`.
32//! - [`crash`] — write-truncation via `CrashPoint`.
33//! - [`async_io`] (feature `async-io`) — `tokio::io` equivalents.
34//!
35//! ## Determinism
36//!
37//! All schedules are deterministic by default: the same sequence of
38//! attempts MUST produce the same sequence of failures across runs
39//! and machines. Probabilistic schedules
40//! ([`FailureSchedule::seeded_random`]) are opt-in, seeded, and
41//! reproducible from the seed.
42
43#![cfg_attr(docsrs, feature(doc_cfg))]
44#![warn(missing_docs)]
45#![warn(rust_2018_idioms)]
46
47use std::collections::HashSet;
48use std::sync::atomic::{AtomicUsize, Ordering};
49
50use dev_report::{CheckResult, Evidence, Producer, Report, Severity};
51
52pub mod crash;
53pub mod io;
54pub mod latency;
55
56#[cfg(feature = "async-io")]
57#[cfg_attr(docsrs, doc(cfg(feature = "async-io")))]
58pub mod async_io;
59
/// The kinds of failure that can be injected.
///
/// Every variant has a stable snake_case name (see [`FailureMode::as_str`])
/// and a matching [`std::io::ErrorKind`] (see [`FailureMode::to_io_kind`]).
///
/// # Example
///
/// ```
/// use dev_chaos::FailureMode;
/// assert_eq!(FailureMode::IoError.as_str(), "io_error");
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FailureMode {
    /// A generic I/O error.
    IoError,
    /// A partial write: an error is returned after some bytes were written.
    PartialWrite,
    /// The connection was reset.
    ConnectionReset,
    /// The operation timed out.
    Timeout,
    /// Corrupted data: the operation reports success but the bytes are corrupted.
    Corruption,
    /// Permission was denied.
    PermissionDenied,
}

impl FailureMode {
    /// Returns the human-readable snake_case name of this failure mode.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::IoError => "io_error",
            Self::PartialWrite => "partial_write",
            Self::ConnectionReset => "connection_reset",
            Self::Timeout => "timeout",
            Self::Corruption => "corruption",
            Self::PermissionDenied => "permission_denied",
        }
    }

    /// Returns the [`std::io::ErrorKind`] that corresponds to this mode.
    pub fn to_io_kind(&self) -> std::io::ErrorKind {
        use std::io::ErrorKind;

        match *self {
            Self::IoError => ErrorKind::Other,
            Self::PartialWrite => ErrorKind::WriteZero,
            Self::ConnectionReset => ErrorKind::ConnectionReset,
            Self::Timeout => ErrorKind::TimedOut,
            Self::Corruption => ErrorKind::InvalidData,
            Self::PermissionDenied => ErrorKind::PermissionDenied,
        }
    }
}
109
110/// An error returned by injected failures.
111///
112/// # Example
113///
114/// ```
115/// use dev_chaos::{FailureMode, InjectedFailure};
116/// let f = InjectedFailure { mode: FailureMode::Timeout, attempt: 3 };
117/// assert_eq!(f.mode.as_str(), "timeout");
118/// ```
119#[derive(Debug, Clone)]
120pub struct InjectedFailure {
121    /// The mode of failure that was injected.
122    pub mode: FailureMode,
123    /// The attempt number at which the failure was injected.
124    pub attempt: usize,
125}
126
127impl std::fmt::Display for InjectedFailure {
128    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
129        write!(
130            f,
131            "injected failure {} at attempt {}",
132            self.mode.as_str(),
133            self.attempt
134        )
135    }
136}
137
138impl std::error::Error for InjectedFailure {}
139
140impl From<InjectedFailure> for std::io::Error {
141    fn from(f: InjectedFailure) -> Self {
142        std::io::Error::new(f.mode.to_io_kind(), f.to_string())
143    }
144}
145
146/// A schedule that decides whether a given attempt fails.
147///
148/// Schedules are deterministic by default. The same `(schedule, attempt)`
149/// pair produces the same outcome across runs and machines.
150///
151/// # Example
152///
153/// ```
154/// use dev_chaos::{FailureMode, FailureSchedule};
155///
156/// let s = FailureSchedule::on_attempts(&[2, 4], FailureMode::IoError);
157/// assert!(s.maybe_fail(1).is_ok());
158/// assert!(s.maybe_fail(2).is_err());
159/// ```
160pub struct FailureSchedule {
161    inner: ScheduleKind,
162    mode: FailureMode,
163    invocations: AtomicUsize,
164}
165
166enum ScheduleKind {
167    Explicit(HashSet<usize>),
168    EveryN(usize),
169    SeededRandom { seed: u64, prob_thousandths: u32 },
170}
171
172impl FailureSchedule {
173    /// Build a schedule that fails on specific attempt numbers
174    /// (1-indexed).
175    ///
176    /// # Example
177    ///
178    /// ```
179    /// use dev_chaos::{FailureMode, FailureSchedule};
180    /// let s = FailureSchedule::on_attempts(&[3, 7], FailureMode::Timeout);
181    /// assert!(s.maybe_fail(3).is_err());
182    /// assert!(s.maybe_fail(4).is_ok());
183    /// ```
184    pub fn on_attempts(attempts: &[usize], mode: FailureMode) -> Self {
185        Self {
186            inner: ScheduleKind::Explicit(attempts.iter().copied().collect()),
187            mode,
188            invocations: AtomicUsize::new(0),
189        }
190    }
191
192    /// Build a schedule that fails on every Nth attempt (1-indexed).
193    ///
194    /// # Example
195    ///
196    /// ```
197    /// use dev_chaos::{FailureMode, FailureSchedule};
198    /// let s = FailureSchedule::every_n(3, FailureMode::Timeout);
199    /// assert!(s.maybe_fail(3).is_err());
200    /// assert!(s.maybe_fail(6).is_err());
201    /// ```
202    pub fn every_n(n: usize, mode: FailureMode) -> Self {
203        let n = n.max(1);
204        Self {
205            inner: ScheduleKind::EveryN(n),
206            mode,
207            invocations: AtomicUsize::new(0),
208        }
209    }
210
211    /// Build a deterministic, seeded "random" schedule.
212    ///
213    /// Each attempt is hashed (with the seed) into a value in `[0, 1000)`
214    /// and fails when that value is below `probability * 1000`. The
215    /// schedule is fully reproducible from the seed.
216    ///
217    /// `probability` is clamped to `[0.0, 1.0]`.
218    ///
219    /// **This is the only non-explicit schedule.** Even so, it is
220    /// strictly reproducible; no real RNG state, no clock, no thread.
221    ///
222    /// # Example
223    ///
224    /// ```
225    /// use dev_chaos::{FailureMode, FailureSchedule};
226    ///
227    /// let a = FailureSchedule::seeded_random(42, 0.10, FailureMode::IoError);
228    /// let b = FailureSchedule::seeded_random(42, 0.10, FailureMode::IoError);
229    /// // Same seed => same outcome at every attempt.
230    /// for attempt in 1..=100 {
231    ///     assert_eq!(a.maybe_fail(attempt).is_err(), b.maybe_fail(attempt).is_err());
232    /// }
233    /// ```
234    pub fn seeded_random(seed: u64, probability: f64, mode: FailureMode) -> Self {
235        let p = probability.clamp(0.0, 1.0);
236        let prob_thousandths = (p * 1000.0).round() as u32;
237        Self {
238            inner: ScheduleKind::SeededRandom {
239                seed,
240                prob_thousandths,
241            },
242            mode,
243            invocations: AtomicUsize::new(0),
244        }
245    }
246
247    /// Check whether the given attempt should fail.
248    ///
249    /// Returns `Ok(())` if the operation should proceed, or
250    /// `Err(InjectedFailure)` if the schedule fires on this attempt.
251    pub fn maybe_fail(&self, attempt: usize) -> Result<(), InjectedFailure> {
252        self.invocations.fetch_add(1, Ordering::Relaxed);
253        if self.fires(attempt) {
254            Err(InjectedFailure {
255                mode: self.mode,
256                attempt,
257            })
258        } else {
259            Ok(())
260        }
261    }
262
263    fn fires(&self, attempt: usize) -> bool {
264        match &self.inner {
265            ScheduleKind::Explicit(set) => set.contains(&attempt),
266            ScheduleKind::EveryN(n) => attempt % *n == 0,
267            ScheduleKind::SeededRandom {
268                seed,
269                prob_thousandths,
270            } => {
271                // Deterministic mix: combine attempt + seed via splitmix64.
272                let mut x =
273                    (*seed).wrapping_add((attempt as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15));
274                x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
275                x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
276                x ^= x >> 31;
277                let bucket = (x % 1000) as u32;
278                bucket < *prob_thousandths
279            }
280        }
281    }
282
283    /// Total invocations of `maybe_fail` since this schedule was built.
284    pub fn invocation_count(&self) -> usize {
285        self.invocations.load(Ordering::Relaxed)
286    }
287
288    /// Mode this schedule injects.
289    pub fn mode(&self) -> FailureMode {
290        self.mode
291    }
292}
293
294/// Verify that recovery logic succeeded after a failure schedule.
295///
296/// Returns a [`CheckResult`] tagged `chaos`. The verdict follows REPS
297/// section 4:
298///
299/// - `final_state_ok = false` -> `Fail (Critical)`, `regression` tag.
300/// - `actual_failures < expected_failures` AND `final_state_ok` ->
301///   `Warn (Warning)`, indicating under-injection.
302/// - Otherwise -> `Pass`.
303///
304/// Always carries numeric `Evidence` for `expected_failures`,
305/// `actual_failures`, `final_state_ok`.
306///
307/// # Example
308///
309/// ```
310/// use dev_chaos::assert_recovered;
311/// let c = assert_recovered("write_log", 2, 2, true);
312/// assert!(matches!(c.verdict, dev_report::Verdict::Pass));
313/// ```
314pub fn assert_recovered(
315    name: impl Into<String>,
316    expected_failures: usize,
317    actual_failures: usize,
318    final_state_ok: bool,
319) -> CheckResult {
320    let check_name = format!("chaos::{}", name.into());
321    let evidence = vec![
322        Evidence::numeric("expected_failures", expected_failures as f64),
323        Evidence::numeric("actual_failures", actual_failures as f64),
324        Evidence::numeric("final_state_ok", if final_state_ok { 1.0 } else { 0.0 }),
325    ];
326
327    if !final_state_ok {
328        let mut tags = vec![
329            "chaos".to_string(),
330            "recovery".to_string(),
331            "regression".to_string(),
332        ];
333        tags.sort();
334        let mut c = CheckResult::fail(check_name, Severity::Critical).with_detail(format!(
335            "system did not recover. expected {expected_failures} injected failures, observed {actual_failures}, final state failed validation"
336        ));
337        c.tags = tags;
338        c.evidence = evidence;
339        return c;
340    }
341
342    if actual_failures < expected_failures {
343        let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
344        tags.sort();
345        let mut c = CheckResult::warn(check_name, Severity::Warning).with_detail(format!(
346            "fewer failures observed than scheduled (expected {expected_failures}, observed {actual_failures})"
347        ));
348        c.tags = tags;
349        c.evidence = evidence;
350        return c;
351    }
352
353    let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
354    tags.sort();
355    let mut c = CheckResult::pass(check_name).with_detail(format!(
356        "recovered after {actual_failures} injected failure(s)"
357    ));
358    c.tags = tags;
359    c.evidence = evidence;
360    c
361}
362
363/// Producer wrapper that runs a chaos suite and emits a Report with
364/// each scenario's `CheckResult`.
365///
366/// # Example
367///
368/// ```no_run
369/// use dev_chaos::{assert_recovered, ChaosProducer};
370/// use dev_report::Producer;
371///
372/// fn run() -> Vec<dev_report::CheckResult> {
373///     vec![
374///         assert_recovered("write_log", 2, 2, true),
375///         assert_recovered("rename", 1, 1, true),
376///     ]
377/// }
378///
379/// let producer = ChaosProducer::new(run, "my-crate", "0.1.0");
380/// let report = producer.produce();
381/// assert_eq!(report.checks.len(), 2);
382/// ```
383pub struct ChaosProducer<F>
384where
385    F: Fn() -> Vec<CheckResult>,
386{
387    run: F,
388    subject: String,
389    subject_version: String,
390}
391
392impl<F> ChaosProducer<F>
393where
394    F: Fn() -> Vec<CheckResult>,
395{
396    /// Build a new producer.
397    pub fn new(run: F, subject: impl Into<String>, subject_version: impl Into<String>) -> Self {
398        Self {
399            run,
400            subject: subject.into(),
401            subject_version: subject_version.into(),
402        }
403    }
404}
405
406impl<F> Producer for ChaosProducer<F>
407where
408    F: Fn() -> Vec<CheckResult>,
409{
410    fn produce(&self) -> Report {
411        let checks = (self.run)();
412        let mut r = Report::new(self.subject.clone(), self.subject_version.clone())
413            .with_producer("dev-chaos");
414        for c in checks {
415            r.push(c);
416        }
417        r.finish();
418        r
419    }
420}
421
#[cfg(test)]
mod tests {
    use super::*;
    use dev_report::Verdict;

    /// An explicit schedule fires only on the listed attempts and counts
    /// every call.
    #[test]
    fn schedule_fails_on_specified_attempts() {
        let schedule = FailureSchedule::on_attempts(&[2, 4], FailureMode::IoError);
        let expectations = [(1, false), (2, true), (3, false), (4, true)];
        for (attempt, should_fail) in expectations {
            assert_eq!(
                schedule.maybe_fail(attempt).is_err(),
                should_fail,
                "attempt {attempt}"
            );
        }
        assert_eq!(schedule.invocation_count(), 4);
    }

    /// `every_n(3)` fires on multiples of 3 and nowhere else.
    #[test]
    fn every_n_fires_on_multiples() {
        let schedule = FailureSchedule::every_n(3, FailureMode::Timeout);
        for quiet in [1, 2] {
            assert!(schedule.maybe_fail(quiet).is_ok());
        }
        // The check is a plain modulo, so there is no upper bound: a large
        // multiple of 3 fires just like a small one.
        for firing in [3, 6, 9, 3_000] {
            assert!(schedule.maybe_fail(firing).is_err());
        }
    }

    /// Two schedules built from the same seed and probability agree on
    /// every attempt.
    #[test]
    fn seeded_random_is_deterministic() {
        let first = FailureSchedule::seeded_random(7, 0.5, FailureMode::IoError);
        let second = FailureSchedule::seeded_random(7, 0.5, FailureMode::IoError);
        (1..=200).for_each(|attempt| {
            assert_eq!(
                first.fires(attempt),
                second.fires(attempt),
                "differs at attempt {}",
                attempt
            );
        });
    }

    /// Probability 0.0 means no attempt ever fails.
    #[test]
    fn seeded_random_zero_probability_never_fires() {
        let schedule = FailureSchedule::seeded_random(7, 0.0, FailureMode::IoError);
        assert!((1..=1000).all(|attempt| schedule.maybe_fail(attempt).is_ok()));
    }

    /// Probability 1.0 means every attempt fails.
    #[test]
    fn seeded_random_full_probability_always_fires() {
        let schedule = FailureSchedule::seeded_random(7, 1.0, FailureMode::IoError);
        assert!((1..=200).all(|attempt| schedule.maybe_fail(attempt).is_err()));
    }

    /// Converting to `io::Error` keeps the mode's `ErrorKind`.
    #[test]
    fn injected_failure_converts_to_io_error() {
        let failure = InjectedFailure {
            mode: FailureMode::Timeout,
            attempt: 5,
        };
        let converted = std::io::Error::from(failure);
        assert_eq!(converted.kind(), std::io::ErrorKind::TimedOut);
    }

    /// Matching counts and a good final state pass, without a regression tag.
    #[test]
    fn recovery_check_pass() {
        let check = assert_recovered("write_log", 2, 2, true);
        assert_eq!(check.verdict, Verdict::Pass);
        for tag in ["chaos", "recovery"] {
            assert!(check.has_tag(tag));
        }
        assert!(!check.has_tag("regression"));
    }

    /// A bad final state is a critical failure tagged as a regression.
    #[test]
    fn recovery_check_fail_when_state_invalid() {
        let check = assert_recovered("write_log", 2, 2, false);
        assert_eq!(check.verdict, Verdict::Fail);
        assert_eq!(check.severity, Some(Severity::Critical));
        assert!(check.has_tag("regression"));
    }

    /// Fewer observed failures than scheduled yields a warning.
    #[test]
    fn recovery_check_warns_on_under_injection() {
        let check = assert_recovered("write_log", 5, 2, true);
        assert_eq!(check.verdict, Verdict::Warn);
    }

    /// All three evidence labels are present on the result.
    #[test]
    fn recovery_check_carries_numeric_evidence() {
        let check = assert_recovered("op", 3, 3, true);
        for wanted in ["expected_failures", "actual_failures", "final_state_ok"] {
            assert!(
                check.evidence.iter().any(|e| e.label.as_str() == wanted),
                "missing evidence {wanted}"
            );
        }
    }

    /// The producer forwards every check into a report labelled `dev-chaos`.
    #[test]
    fn chaos_producer_emits_report() {
        fn scenarios() -> Vec<CheckResult> {
            vec![
                assert_recovered("a", 1, 1, true),
                assert_recovered("b", 2, 2, true),
            ]
        }

        let producer = ChaosProducer::new(scenarios, "my-crate", "0.1.0");
        let report = producer.produce();
        assert_eq!(report.checks.len(), 2);
        assert_eq!(report.producer.as_deref(), Some("dev-chaos"));
        assert_eq!(report.overall_verdict(), Verdict::Pass);
    }
}