Skip to main content

dev_chaos/
lib.rs

//! # dev-chaos
//!
//! Failure injection and recovery testing for Rust. Part of the
//! `dev-*` verification suite.
//!
//! Most code is tested only on the happy path. Real systems fail
//! through partial writes, crashes, timeouts, corrupt data, and
//! broken connections. `dev-chaos` provides primitives for injecting
//! those failures on purpose, then verifying that recovery logic does
//! its job.
//!
//! ## Quick example
//!
//! ```no_run
//! use dev_chaos::{FailureSchedule, FailureMode};
//!
//! // Fail on the 3rd, 7th, and 10th attempt.
//! let schedule = FailureSchedule::on_attempts(&[3, 7, 10], FailureMode::IoError);
//!
//! for attempt in 1..=12 {
//!     match schedule.maybe_fail(attempt) {
//!         Ok(()) => { /* operation proceeds */ }
//!         Err(_e) => { /* recovery path */ }
//!     }
//! }
//! ```
//!
//! ## Modules
//!
//! - [`io`] — sync IO wrappers (`ChaosReader`, `ChaosWriter`, `ChaosFile`).
//! - [`latency`] — non-failing slowdowns via `LatencyInjector`,
//!   composable with `FailureSchedule` via `LatencyAndFailure`.
//! - [`crash`] — write-truncation via `CrashPoint`.
//! - [`clock`] — deterministic `Clock` for time-skew injection.
//! - [`memory_pressure`] — `MemoryPressure` guards for memory-bound chaos.
//! - `async_io` (feature `async-io`) — `tokio::io` equivalents
//!   (visible in rustdoc when the feature is enabled).
//!
//! ## Determinism
//!
//! All schedules are deterministic by default: the same sequence of
//! attempts MUST produce the same sequence of failures across runs
//! and machines. Probabilistic schedules
//! ([`FailureSchedule::seeded_random`]) are opt-in, seeded, and
//! reproducible from the seed.
46
47#![cfg_attr(docsrs, feature(doc_cfg))]
48#![warn(missing_docs)]
49#![warn(rust_2018_idioms)]
50
51use std::collections::HashSet;
52use std::sync::atomic::{AtomicUsize, Ordering};
53
54use dev_report::{CheckResult, Evidence, Producer, Report, Severity};
55
56pub mod clock;
57pub mod crash;
58pub mod io;
59pub mod latency;
60pub mod memory_pressure;
61
62#[cfg(feature = "async-io")]
63#[cfg_attr(docsrs, doc(cfg(feature = "async-io")))]
64pub mod async_io;
65
/// A kind of failure that can be injected.
///
/// Every variant has a fixed snake_case name (see [`FailureMode::as_str`])
/// and a matching [`std::io::ErrorKind`] (see [`FailureMode::to_io_kind`]).
///
/// # Example
///
/// ```
/// use dev_chaos::FailureMode;
/// assert_eq!(FailureMode::IoError.as_str(), "io_error");
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FailureMode {
    /// Generic I/O error.
    IoError,
    /// Partial write: returns an error after writing some bytes.
    PartialWrite,
    /// Connection reset.
    ConnectionReset,
    /// Operation timeout.
    Timeout,
    /// Corrupted data: returns success but with corrupted bytes.
    Corruption,
    /// Permission denied.
    PermissionDenied,
}

impl FailureMode {
    /// Snake_case name of this failure mode, e.g. `"io_error"`.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::IoError => "io_error",
            Self::PartialWrite => "partial_write",
            Self::ConnectionReset => "connection_reset",
            Self::Timeout => "timeout",
            Self::Corruption => "corruption",
            Self::PermissionDenied => "permission_denied",
        }
    }

    /// The `std::io::ErrorKind` used when this mode is surfaced as an
    /// `std::io::Error`.
    pub fn to_io_kind(&self) -> std::io::ErrorKind {
        use std::io::ErrorKind;

        match *self {
            Self::IoError => ErrorKind::Other,
            Self::PartialWrite => ErrorKind::WriteZero,
            Self::ConnectionReset => ErrorKind::ConnectionReset,
            Self::Timeout => ErrorKind::TimedOut,
            Self::Corruption => ErrorKind::InvalidData,
            Self::PermissionDenied => ErrorKind::PermissionDenied,
        }
    }
}
115
116/// An error returned by injected failures.
117///
118/// # Example
119///
120/// ```
121/// use dev_chaos::{FailureMode, InjectedFailure};
122/// let f = InjectedFailure { mode: FailureMode::Timeout, attempt: 3 };
123/// assert_eq!(f.mode.as_str(), "timeout");
124/// ```
125#[derive(Debug, Clone)]
126pub struct InjectedFailure {
127    /// The mode of failure that was injected.
128    pub mode: FailureMode,
129    /// The attempt number at which the failure was injected.
130    pub attempt: usize,
131}
132
133impl std::fmt::Display for InjectedFailure {
134    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
135        write!(
136            f,
137            "injected failure {} at attempt {}",
138            self.mode.as_str(),
139            self.attempt
140        )
141    }
142}
143
144impl std::error::Error for InjectedFailure {}
145
146impl From<InjectedFailure> for std::io::Error {
147    fn from(f: InjectedFailure) -> Self {
148        std::io::Error::new(f.mode.to_io_kind(), f.to_string())
149    }
150}
151
152/// A schedule that decides whether a given attempt fails.
153///
154/// Schedules are deterministic by default. The same `(schedule, attempt)`
155/// pair produces the same outcome across runs and machines.
156///
157/// # Example
158///
159/// ```
160/// use dev_chaos::{FailureMode, FailureSchedule};
161///
162/// let s = FailureSchedule::on_attempts(&[2, 4], FailureMode::IoError);
163/// assert!(s.maybe_fail(1).is_ok());
164/// assert!(s.maybe_fail(2).is_err());
165/// ```
166pub struct FailureSchedule {
167    inner: ScheduleKind,
168    mode: FailureMode,
169    invocations: AtomicUsize,
170    failures: AtomicUsize,
171    /// `None` = unbounded; `Some(n)` = stop firing after `n` failures.
172    failure_limit: Option<usize>,
173}
174
175enum ScheduleKind {
176    Explicit(HashSet<usize>),
177    EveryN(usize),
178    SeededRandom { seed: u64, prob_thousandths: u32 },
179}
180
181impl FailureSchedule {
182    /// Build a schedule that fails on specific attempt numbers
183    /// (1-indexed).
184    ///
185    /// # Example
186    ///
187    /// ```
188    /// use dev_chaos::{FailureMode, FailureSchedule};
189    /// let s = FailureSchedule::on_attempts(&[3, 7], FailureMode::Timeout);
190    /// assert!(s.maybe_fail(3).is_err());
191    /// assert!(s.maybe_fail(4).is_ok());
192    /// ```
193    pub fn on_attempts(attempts: &[usize], mode: FailureMode) -> Self {
194        Self {
195            inner: ScheduleKind::Explicit(attempts.iter().copied().collect()),
196            mode,
197            invocations: AtomicUsize::new(0),
198            failures: AtomicUsize::new(0),
199            failure_limit: None,
200        }
201    }
202
203    /// Build a schedule that fails on every Nth attempt (1-indexed).
204    ///
205    /// # Example
206    ///
207    /// ```
208    /// use dev_chaos::{FailureMode, FailureSchedule};
209    /// let s = FailureSchedule::every_n(3, FailureMode::Timeout);
210    /// assert!(s.maybe_fail(3).is_err());
211    /// assert!(s.maybe_fail(6).is_err());
212    /// ```
213    pub fn every_n(n: usize, mode: FailureMode) -> Self {
214        let n = n.max(1);
215        Self {
216            inner: ScheduleKind::EveryN(n),
217            mode,
218            invocations: AtomicUsize::new(0),
219            failures: AtomicUsize::new(0),
220            failure_limit: None,
221        }
222    }
223
224    /// Build a deterministic, seeded "random" schedule.
225    ///
226    /// Each attempt is hashed (with the seed) into a value in `[0, 1000)`
227    /// and fails when that value is below `probability * 1000`. The
228    /// schedule is fully reproducible from the seed.
229    ///
230    /// `probability` is clamped to `[0.0, 1.0]`.
231    ///
232    /// **This is the only non-explicit schedule.** Even so, it is
233    /// strictly reproducible; no real RNG state, no clock, no thread.
234    ///
235    /// # Example
236    ///
237    /// ```
238    /// use dev_chaos::{FailureMode, FailureSchedule};
239    ///
240    /// let a = FailureSchedule::seeded_random(42, 0.10, FailureMode::IoError);
241    /// let b = FailureSchedule::seeded_random(42, 0.10, FailureMode::IoError);
242    /// // Same seed => same outcome at every attempt.
243    /// for attempt in 1..=100 {
244    ///     assert_eq!(a.maybe_fail(attempt).is_err(), b.maybe_fail(attempt).is_err());
245    /// }
246    /// ```
247    pub fn seeded_random(seed: u64, probability: f64, mode: FailureMode) -> Self {
248        let p = probability.clamp(0.0, 1.0);
249        let prob_thousandths = (p * 1000.0).round() as u32;
250        Self {
251            inner: ScheduleKind::SeededRandom {
252                seed,
253                prob_thousandths,
254            },
255            mode,
256            invocations: AtomicUsize::new(0),
257            failures: AtomicUsize::new(0),
258            failure_limit: None,
259        }
260    }
261
262    /// Cap the total number of failures this schedule will emit.
263    ///
264    /// After `n` failures have been emitted via [`maybe_fail`], the
265    /// schedule stops firing — every subsequent call returns `Ok(())`,
266    /// regardless of attempt number.
267    ///
268    /// Useful for bounded chaos: you want a few failures to verify
269    /// recovery, not an indefinite stream.
270    ///
271    /// # Example
272    ///
273    /// ```
274    /// use dev_chaos::{FailureMode, FailureSchedule};
275    ///
276    /// // Fail every attempt, but stop after 3.
277    /// let s = FailureSchedule::every_n(1, FailureMode::IoError).limit(3);
278    /// let mut failures = 0;
279    /// for attempt in 1..=20 {
280    ///     if s.maybe_fail(attempt).is_err() {
281    ///         failures += 1;
282    ///     }
283    /// }
284    /// assert_eq!(failures, 3);
285    /// ```
286    ///
287    /// [`maybe_fail`]: Self::maybe_fail
288    pub fn limit(mut self, n: usize) -> Self {
289        self.failure_limit = Some(n);
290        self
291    }
292
293    /// Check whether the given attempt should fail.
294    ///
295    /// Returns `Ok(())` if the operation should proceed, or
296    /// `Err(InjectedFailure)` if the schedule fires on this attempt.
297    ///
298    /// If a [`limit`](Self::limit) has been applied and the failure
299    /// count has reached it, this returns `Ok(())` regardless of
300    /// whether the schedule would otherwise fire.
301    pub fn maybe_fail(&self, attempt: usize) -> Result<(), InjectedFailure> {
302        self.invocations.fetch_add(1, Ordering::Relaxed);
303        if !self.fires(attempt) {
304            return Ok(());
305        }
306        if let Some(limit) = self.failure_limit {
307            // fetch_update would be cleanest, but a fetch_add + check
308            // is sufficient: we accept that under contention we may
309            // emit at most `limit + (concurrency - 1)` failures, which
310            // is documented and acceptable for a fixture.
311            let prior = self.failures.fetch_add(1, Ordering::Relaxed);
312            if prior >= limit {
313                return Ok(());
314            }
315        } else {
316            self.failures.fetch_add(1, Ordering::Relaxed);
317        }
318        Err(InjectedFailure {
319            mode: self.mode,
320            attempt,
321        })
322    }
323
324    /// Total failures emitted by this schedule so far.
325    pub fn failure_count(&self) -> usize {
326        // When a limit is in effect, internal counter may exceed
327        // the limit by one due to fetch_add ordering; clamp on read.
328        let raw = self.failures.load(Ordering::Relaxed);
329        match self.failure_limit {
330            Some(limit) => raw.min(limit),
331            None => raw,
332        }
333    }
334
335    fn fires(&self, attempt: usize) -> bool {
336        match &self.inner {
337            ScheduleKind::Explicit(set) => set.contains(&attempt),
338            ScheduleKind::EveryN(n) => attempt % *n == 0,
339            ScheduleKind::SeededRandom {
340                seed,
341                prob_thousandths,
342            } => {
343                // Deterministic mix: combine attempt + seed via splitmix64.
344                let mut x =
345                    (*seed).wrapping_add((attempt as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15));
346                x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
347                x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
348                x ^= x >> 31;
349                let bucket = (x % 1000) as u32;
350                bucket < *prob_thousandths
351            }
352        }
353    }
354
355    /// Total invocations of `maybe_fail` since this schedule was built.
356    pub fn invocation_count(&self) -> usize {
357        self.invocations.load(Ordering::Relaxed)
358    }
359
360    /// Mode this schedule injects.
361    pub fn mode(&self) -> FailureMode {
362        self.mode
363    }
364}
365
366/// Verify that recovery logic succeeded after a failure schedule.
367///
368/// Returns a [`CheckResult`] tagged `chaos`. The verdict follows REPS
369/// section 4:
370///
371/// - `final_state_ok = false` -> `Fail (Critical)`, `regression` tag.
372/// - `actual_failures < expected_failures` AND `final_state_ok` ->
373///   `Warn (Warning)`, indicating under-injection.
374/// - Otherwise -> `Pass`.
375///
376/// Always carries numeric `Evidence` for `expected_failures`,
377/// `actual_failures`, `final_state_ok`.
378///
379/// # Example
380///
381/// ```
382/// use dev_chaos::assert_recovered;
383/// let c = assert_recovered("write_log", 2, 2, true);
384/// assert!(matches!(c.verdict, dev_report::Verdict::Pass));
385/// ```
386pub fn assert_recovered(
387    name: impl Into<String>,
388    expected_failures: usize,
389    actual_failures: usize,
390    final_state_ok: bool,
391) -> CheckResult {
392    let check_name = format!("chaos::{}", name.into());
393    let evidence = vec![
394        Evidence::numeric("expected_failures", expected_failures as f64),
395        Evidence::numeric("actual_failures", actual_failures as f64),
396        Evidence::numeric("final_state_ok", if final_state_ok { 1.0 } else { 0.0 }),
397    ];
398
399    if !final_state_ok {
400        let mut tags = vec![
401            "chaos".to_string(),
402            "recovery".to_string(),
403            "regression".to_string(),
404        ];
405        tags.sort();
406        let mut c = CheckResult::fail(check_name, Severity::Critical).with_detail(format!(
407            "system did not recover. expected {expected_failures} injected failures, observed {actual_failures}, final state failed validation"
408        ));
409        c.tags = tags;
410        c.evidence = evidence;
411        return c;
412    }
413
414    if actual_failures < expected_failures {
415        let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
416        tags.sort();
417        let mut c = CheckResult::warn(check_name, Severity::Warning).with_detail(format!(
418            "fewer failures observed than scheduled (expected {expected_failures}, observed {actual_failures})"
419        ));
420        c.tags = tags;
421        c.evidence = evidence;
422        return c;
423    }
424
425    let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
426    tags.sort();
427    let mut c = CheckResult::pass(check_name).with_detail(format!(
428        "recovered after {actual_failures} injected failure(s)"
429    ));
430    c.tags = tags;
431    c.evidence = evidence;
432    c
433}
434
435/// Producer wrapper that runs a chaos suite and emits a Report with
436/// each scenario's `CheckResult`.
437///
438/// # Example
439///
440/// ```no_run
441/// use dev_chaos::{assert_recovered, ChaosProducer};
442/// use dev_report::Producer;
443///
444/// fn run() -> Vec<dev_report::CheckResult> {
445///     vec![
446///         assert_recovered("write_log", 2, 2, true),
447///         assert_recovered("rename", 1, 1, true),
448///     ]
449/// }
450///
451/// let producer = ChaosProducer::new(run, "my-crate", "0.1.0");
452/// let report = producer.produce();
453/// assert_eq!(report.checks.len(), 2);
454/// ```
455pub struct ChaosProducer<F>
456where
457    F: Fn() -> Vec<CheckResult>,
458{
459    run: F,
460    subject: String,
461    subject_version: String,
462}
463
464impl<F> ChaosProducer<F>
465where
466    F: Fn() -> Vec<CheckResult>,
467{
468    /// Build a new producer.
469    pub fn new(run: F, subject: impl Into<String>, subject_version: impl Into<String>) -> Self {
470        Self {
471            run,
472            subject: subject.into(),
473            subject_version: subject_version.into(),
474        }
475    }
476}
477
478impl<F> Producer for ChaosProducer<F>
479where
480    F: Fn() -> Vec<CheckResult>,
481{
482    fn produce(&self) -> Report {
483        let checks = (self.run)();
484        let mut r = Report::new(self.subject.clone(), self.subject_version.clone())
485            .with_producer("dev-chaos");
486        for c in checks {
487            r.push(c);
488        }
489        r.finish();
490        r
491    }
492}
493
#[cfg(test)]
mod tests {
    use super::*;
    use dev_report::Verdict;

    /// Feeds attempts `1..=last` to `schedule` and counts the rejections.
    fn count_failures(schedule: &FailureSchedule, last: usize) -> usize {
        (1..=last)
            .filter(|&attempt| schedule.maybe_fail(attempt).is_err())
            .count()
    }

    #[test]
    fn schedule_fails_on_specified_attempts() {
        let s = FailureSchedule::on_attempts(&[2, 4], FailureMode::IoError);
        let failed: Vec<bool> = (1..=4).map(|a| s.maybe_fail(a).is_err()).collect();
        assert_eq!(failed, vec![false, true, false, true]);
        assert_eq!(s.invocation_count(), 4);
    }

    #[test]
    fn every_n_fires_on_multiples() {
        let s = FailureSchedule::every_n(3, FailureMode::Timeout);
        for quiet in [1, 2] {
            assert!(s.maybe_fail(quiet).is_ok());
        }
        // The rule is a plain modulo, so large multiples fire as well.
        for firing in [3, 6, 9, 3_000] {
            assert!(s.maybe_fail(firing).is_err());
        }
    }

    #[test]
    fn limit_caps_total_failures() {
        let s = FailureSchedule::every_n(1, FailureMode::IoError).limit(3);
        assert_eq!(count_failures(&s, 20), 3);
        assert_eq!(s.failure_count(), 3);
    }

    #[test]
    fn limit_zero_disables_failures() {
        let s = FailureSchedule::every_n(1, FailureMode::IoError).limit(0);
        for attempt in 1..=10 {
            assert!(s.maybe_fail(attempt).is_ok());
        }
        assert_eq!(s.failure_count(), 0);
    }

    #[test]
    fn unlimited_schedule_still_increments_failure_count() {
        let s = FailureSchedule::every_n(1, FailureMode::IoError);
        (1..=5).for_each(|attempt| {
            let _ = s.maybe_fail(attempt);
        });
        assert_eq!(s.failure_count(), 5);
    }

    #[test]
    fn limit_works_with_seeded_random() {
        let s = FailureSchedule::seeded_random(42, 1.0, FailureMode::IoError).limit(2);
        assert_eq!(count_failures(&s, 20), 2);
    }

    #[test]
    fn seeded_random_is_deterministic() {
        let a = FailureSchedule::seeded_random(7, 0.5, FailureMode::IoError);
        let b = FailureSchedule::seeded_random(7, 0.5, FailureMode::IoError);
        for attempt in 1..=200 {
            let (left, right) = (a.fires(attempt), b.fires(attempt));
            assert_eq!(left, right, "differs at attempt {}", attempt);
        }
    }

    #[test]
    fn seeded_random_zero_probability_never_fires() {
        let s = FailureSchedule::seeded_random(7, 0.0, FailureMode::IoError);
        for attempt in 1..=1000 {
            assert!(s.maybe_fail(attempt).is_ok());
        }
    }

    #[test]
    fn seeded_random_full_probability_always_fires() {
        let s = FailureSchedule::seeded_random(7, 1.0, FailureMode::IoError);
        for attempt in 1..=200 {
            assert!(s.maybe_fail(attempt).is_err());
        }
    }

    #[test]
    fn injected_failure_converts_to_io_error() {
        let failure = InjectedFailure {
            mode: FailureMode::Timeout,
            attempt: 5,
        };
        let e = std::io::Error::from(failure);
        assert_eq!(e.kind(), std::io::ErrorKind::TimedOut);
    }

    #[test]
    fn recovery_check_pass() {
        let c = assert_recovered("write_log", 2, 2, true);
        assert_eq!(c.verdict, Verdict::Pass);
        for expected_tag in ["chaos", "recovery"] {
            assert!(c.has_tag(expected_tag));
        }
        assert!(!c.has_tag("regression"));
    }

    #[test]
    fn recovery_check_fail_when_state_invalid() {
        let c = assert_recovered("write_log", 2, 2, false);
        assert_eq!(c.verdict, Verdict::Fail);
        assert_eq!(c.severity, Some(Severity::Critical));
        assert!(c.has_tag("regression"));
    }

    #[test]
    fn recovery_check_warns_on_under_injection() {
        let c = assert_recovered("write_log", 5, 2, true);
        assert_eq!(c.verdict, Verdict::Warn);
    }

    #[test]
    fn recovery_check_carries_numeric_evidence() {
        let c = assert_recovered("op", 3, 3, true);
        let labels: Vec<&str> = c.evidence.iter().map(|e| e.label.as_str()).collect();
        for wanted in ["expected_failures", "actual_failures", "final_state_ok"] {
            assert!(labels.contains(&wanted));
        }
    }

    #[test]
    fn chaos_producer_emits_report() {
        let suite = || {
            vec![
                assert_recovered("a", 1, 1, true),
                assert_recovered("b", 2, 2, true),
            ]
        };
        let producer = ChaosProducer::new(suite, "my-crate", "0.1.0");
        let report = producer.produce();
        assert_eq!(report.checks.len(), 2);
        assert_eq!(report.producer.as_deref(), Some("dev-chaos"));
        assert_eq!(report.overall_verdict(), Verdict::Pass);
    }
}
653}