Skip to main content

dev_chaos/
lib.rs

1//! # dev-chaos
2//!
3//! Failure injection and recovery testing for Rust. Part of the
4//! `dev-*` verification suite.
5//!
6//! Most code is tested only on the happy path. Real systems fail
7//! through partial writes, crashes, timeouts, corrupt data, and
8//! broken connections. `dev-chaos` provides primitives for injecting
9//! those failures on purpose, then verifying that recovery logic does
10//! its job.
11//!
12//! ## Quick example
13//!
14//! ```no_run
15//! use dev_chaos::{FailureSchedule, FailureMode};
16//!
17//! // Fail on the 3rd, 7th, and 10th attempt.
18//! let schedule = FailureSchedule::on_attempts(&[3, 7, 10], FailureMode::IoError);
19//!
20//! for attempt in 1..=12 {
21//!     match schedule.maybe_fail(attempt) {
22//!         Ok(()) => { /* operation proceeds */ }
23//!         Err(_e) => { /* recovery path */ }
24//!     }
25//! }
26//! ```
27//!
28//! ## Modules
29//!
30//! - [`io`] — sync IO wrappers (`ChaosReader`, `ChaosWriter`, `ChaosFile`).
31//! - [`latency`] — non-failing slowdowns via `LatencyInjector`.
32//! - [`crash`] — write-truncation via `CrashPoint`.
33//! - `async_io` (feature `async-io`) — `tokio::io` equivalents
34//!   (visible in rustdoc when the feature is enabled).
35//!
36//! ## Determinism
37//!
38//! All schedules are deterministic by default: the same sequence of
39//! attempts MUST produce the same sequence of failures across runs
40//! and machines. Probabilistic schedules
41//! ([`FailureSchedule::seeded_random`]) are opt-in, seeded, and
42//! reproducible from the seed.
43
44#![cfg_attr(docsrs, feature(doc_cfg))]
45#![warn(missing_docs)]
46#![warn(rust_2018_idioms)]
47
48use std::collections::HashSet;
49use std::sync::atomic::{AtomicUsize, Ordering};
50
51use dev_report::{CheckResult, Evidence, Producer, Report, Severity};
52
53pub mod crash;
54pub mod io;
55pub mod latency;
56
57#[cfg(feature = "async-io")]
58#[cfg_attr(docsrs, doc(cfg(feature = "async-io")))]
59pub mod async_io;
60
/// The category of fault that a schedule injects when it fires.
///
/// Every variant has a stable snake_case name (see
/// [`FailureMode::as_str`]) and a matching [`std::io::ErrorKind`] (see
/// [`FailureMode::to_io_kind`]).
///
/// # Example
///
/// ```
/// use dev_chaos::FailureMode;
/// assert_eq!(FailureMode::IoError.as_str(), "io_error");
/// ```
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FailureMode {
    /// A generic, otherwise unclassified I/O error.
    IoError,
    /// A partial write: an error is returned after some bytes were written.
    PartialWrite,
    /// The connection was reset.
    ConnectionReset,
    /// The operation timed out.
    Timeout,
    /// Data corruption: the operation reports success but the bytes are
    /// corrupted.
    Corruption,
    /// Permission was denied.
    PermissionDenied,
}
84
85impl FailureMode {
86    /// Human-readable name for this failure mode.
87    pub fn as_str(&self) -> &'static str {
88        match self {
89            FailureMode::IoError => "io_error",
90            FailureMode::PartialWrite => "partial_write",
91            FailureMode::ConnectionReset => "connection_reset",
92            FailureMode::Timeout => "timeout",
93            FailureMode::Corruption => "corruption",
94            FailureMode::PermissionDenied => "permission_denied",
95        }
96    }
97
98    /// Map this mode to an `std::io::ErrorKind`.
99    pub fn to_io_kind(&self) -> std::io::ErrorKind {
100        match self {
101            FailureMode::IoError => std::io::ErrorKind::Other,
102            FailureMode::PartialWrite => std::io::ErrorKind::WriteZero,
103            FailureMode::ConnectionReset => std::io::ErrorKind::ConnectionReset,
104            FailureMode::Timeout => std::io::ErrorKind::TimedOut,
105            FailureMode::Corruption => std::io::ErrorKind::InvalidData,
106            FailureMode::PermissionDenied => std::io::ErrorKind::PermissionDenied,
107        }
108    }
109}
110
111/// An error returned by injected failures.
112///
113/// # Example
114///
115/// ```
116/// use dev_chaos::{FailureMode, InjectedFailure};
117/// let f = InjectedFailure { mode: FailureMode::Timeout, attempt: 3 };
118/// assert_eq!(f.mode.as_str(), "timeout");
119/// ```
120#[derive(Debug, Clone)]
121pub struct InjectedFailure {
122    /// The mode of failure that was injected.
123    pub mode: FailureMode,
124    /// The attempt number at which the failure was injected.
125    pub attempt: usize,
126}
127
128impl std::fmt::Display for InjectedFailure {
129    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
130        write!(
131            f,
132            "injected failure {} at attempt {}",
133            self.mode.as_str(),
134            self.attempt
135        )
136    }
137}
138
139impl std::error::Error for InjectedFailure {}
140
141impl From<InjectedFailure> for std::io::Error {
142    fn from(f: InjectedFailure) -> Self {
143        std::io::Error::new(f.mode.to_io_kind(), f.to_string())
144    }
145}
146
147/// A schedule that decides whether a given attempt fails.
148///
149/// Schedules are deterministic by default. The same `(schedule, attempt)`
150/// pair produces the same outcome across runs and machines.
151///
152/// # Example
153///
154/// ```
155/// use dev_chaos::{FailureMode, FailureSchedule};
156///
157/// let s = FailureSchedule::on_attempts(&[2, 4], FailureMode::IoError);
158/// assert!(s.maybe_fail(1).is_ok());
159/// assert!(s.maybe_fail(2).is_err());
160/// ```
161pub struct FailureSchedule {
162    inner: ScheduleKind,
163    mode: FailureMode,
164    invocations: AtomicUsize,
165    failures: AtomicUsize,
166    /// `None` = unbounded; `Some(n)` = stop firing after `n` failures.
167    failure_limit: Option<usize>,
168}
169
170enum ScheduleKind {
171    Explicit(HashSet<usize>),
172    EveryN(usize),
173    SeededRandom { seed: u64, prob_thousandths: u32 },
174}
175
176impl FailureSchedule {
177    /// Build a schedule that fails on specific attempt numbers
178    /// (1-indexed).
179    ///
180    /// # Example
181    ///
182    /// ```
183    /// use dev_chaos::{FailureMode, FailureSchedule};
184    /// let s = FailureSchedule::on_attempts(&[3, 7], FailureMode::Timeout);
185    /// assert!(s.maybe_fail(3).is_err());
186    /// assert!(s.maybe_fail(4).is_ok());
187    /// ```
188    pub fn on_attempts(attempts: &[usize], mode: FailureMode) -> Self {
189        Self {
190            inner: ScheduleKind::Explicit(attempts.iter().copied().collect()),
191            mode,
192            invocations: AtomicUsize::new(0),
193            failures: AtomicUsize::new(0),
194            failure_limit: None,
195        }
196    }
197
198    /// Build a schedule that fails on every Nth attempt (1-indexed).
199    ///
200    /// # Example
201    ///
202    /// ```
203    /// use dev_chaos::{FailureMode, FailureSchedule};
204    /// let s = FailureSchedule::every_n(3, FailureMode::Timeout);
205    /// assert!(s.maybe_fail(3).is_err());
206    /// assert!(s.maybe_fail(6).is_err());
207    /// ```
208    pub fn every_n(n: usize, mode: FailureMode) -> Self {
209        let n = n.max(1);
210        Self {
211            inner: ScheduleKind::EveryN(n),
212            mode,
213            invocations: AtomicUsize::new(0),
214            failures: AtomicUsize::new(0),
215            failure_limit: None,
216        }
217    }
218
219    /// Build a deterministic, seeded "random" schedule.
220    ///
221    /// Each attempt is hashed (with the seed) into a value in `[0, 1000)`
222    /// and fails when that value is below `probability * 1000`. The
223    /// schedule is fully reproducible from the seed.
224    ///
225    /// `probability` is clamped to `[0.0, 1.0]`.
226    ///
227    /// **This is the only non-explicit schedule.** Even so, it is
228    /// strictly reproducible; no real RNG state, no clock, no thread.
229    ///
230    /// # Example
231    ///
232    /// ```
233    /// use dev_chaos::{FailureMode, FailureSchedule};
234    ///
235    /// let a = FailureSchedule::seeded_random(42, 0.10, FailureMode::IoError);
236    /// let b = FailureSchedule::seeded_random(42, 0.10, FailureMode::IoError);
237    /// // Same seed => same outcome at every attempt.
238    /// for attempt in 1..=100 {
239    ///     assert_eq!(a.maybe_fail(attempt).is_err(), b.maybe_fail(attempt).is_err());
240    /// }
241    /// ```
242    pub fn seeded_random(seed: u64, probability: f64, mode: FailureMode) -> Self {
243        let p = probability.clamp(0.0, 1.0);
244        let prob_thousandths = (p * 1000.0).round() as u32;
245        Self {
246            inner: ScheduleKind::SeededRandom {
247                seed,
248                prob_thousandths,
249            },
250            mode,
251            invocations: AtomicUsize::new(0),
252            failures: AtomicUsize::new(0),
253            failure_limit: None,
254        }
255    }
256
257    /// Cap the total number of failures this schedule will emit.
258    ///
259    /// After `n` failures have been emitted via [`maybe_fail`], the
260    /// schedule stops firing — every subsequent call returns `Ok(())`,
261    /// regardless of attempt number.
262    ///
263    /// Useful for bounded chaos: you want a few failures to verify
264    /// recovery, not an indefinite stream.
265    ///
266    /// # Example
267    ///
268    /// ```
269    /// use dev_chaos::{FailureMode, FailureSchedule};
270    ///
271    /// // Fail every attempt, but stop after 3.
272    /// let s = FailureSchedule::every_n(1, FailureMode::IoError).limit(3);
273    /// let mut failures = 0;
274    /// for attempt in 1..=20 {
275    ///     if s.maybe_fail(attempt).is_err() {
276    ///         failures += 1;
277    ///     }
278    /// }
279    /// assert_eq!(failures, 3);
280    /// ```
281    ///
282    /// [`maybe_fail`]: Self::maybe_fail
283    pub fn limit(mut self, n: usize) -> Self {
284        self.failure_limit = Some(n);
285        self
286    }
287
288    /// Check whether the given attempt should fail.
289    ///
290    /// Returns `Ok(())` if the operation should proceed, or
291    /// `Err(InjectedFailure)` if the schedule fires on this attempt.
292    ///
293    /// If a [`limit`](Self::limit) has been applied and the failure
294    /// count has reached it, this returns `Ok(())` regardless of
295    /// whether the schedule would otherwise fire.
296    pub fn maybe_fail(&self, attempt: usize) -> Result<(), InjectedFailure> {
297        self.invocations.fetch_add(1, Ordering::Relaxed);
298        if !self.fires(attempt) {
299            return Ok(());
300        }
301        if let Some(limit) = self.failure_limit {
302            // fetch_update would be cleanest, but a fetch_add + check
303            // is sufficient: we accept that under contention we may
304            // emit at most `limit + (concurrency - 1)` failures, which
305            // is documented and acceptable for a fixture.
306            let prior = self.failures.fetch_add(1, Ordering::Relaxed);
307            if prior >= limit {
308                return Ok(());
309            }
310        } else {
311            self.failures.fetch_add(1, Ordering::Relaxed);
312        }
313        Err(InjectedFailure {
314            mode: self.mode,
315            attempt,
316        })
317    }
318
319    /// Total failures emitted by this schedule so far.
320    pub fn failure_count(&self) -> usize {
321        // When a limit is in effect, internal counter may exceed
322        // the limit by one due to fetch_add ordering; clamp on read.
323        let raw = self.failures.load(Ordering::Relaxed);
324        match self.failure_limit {
325            Some(limit) => raw.min(limit),
326            None => raw,
327        }
328    }
329
330    fn fires(&self, attempt: usize) -> bool {
331        match &self.inner {
332            ScheduleKind::Explicit(set) => set.contains(&attempt),
333            ScheduleKind::EveryN(n) => attempt % *n == 0,
334            ScheduleKind::SeededRandom {
335                seed,
336                prob_thousandths,
337            } => {
338                // Deterministic mix: combine attempt + seed via splitmix64.
339                let mut x =
340                    (*seed).wrapping_add((attempt as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15));
341                x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
342                x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
343                x ^= x >> 31;
344                let bucket = (x % 1000) as u32;
345                bucket < *prob_thousandths
346            }
347        }
348    }
349
350    /// Total invocations of `maybe_fail` since this schedule was built.
351    pub fn invocation_count(&self) -> usize {
352        self.invocations.load(Ordering::Relaxed)
353    }
354
355    /// Mode this schedule injects.
356    pub fn mode(&self) -> FailureMode {
357        self.mode
358    }
359}
360
361/// Verify that recovery logic succeeded after a failure schedule.
362///
363/// Returns a [`CheckResult`] tagged `chaos`. The verdict follows REPS
364/// section 4:
365///
366/// - `final_state_ok = false` -> `Fail (Critical)`, `regression` tag.
367/// - `actual_failures < expected_failures` AND `final_state_ok` ->
368///   `Warn (Warning)`, indicating under-injection.
369/// - Otherwise -> `Pass`.
370///
371/// Always carries numeric `Evidence` for `expected_failures`,
372/// `actual_failures`, `final_state_ok`.
373///
374/// # Example
375///
376/// ```
377/// use dev_chaos::assert_recovered;
378/// let c = assert_recovered("write_log", 2, 2, true);
379/// assert!(matches!(c.verdict, dev_report::Verdict::Pass));
380/// ```
381pub fn assert_recovered(
382    name: impl Into<String>,
383    expected_failures: usize,
384    actual_failures: usize,
385    final_state_ok: bool,
386) -> CheckResult {
387    let check_name = format!("chaos::{}", name.into());
388    let evidence = vec![
389        Evidence::numeric("expected_failures", expected_failures as f64),
390        Evidence::numeric("actual_failures", actual_failures as f64),
391        Evidence::numeric("final_state_ok", if final_state_ok { 1.0 } else { 0.0 }),
392    ];
393
394    if !final_state_ok {
395        let mut tags = vec![
396            "chaos".to_string(),
397            "recovery".to_string(),
398            "regression".to_string(),
399        ];
400        tags.sort();
401        let mut c = CheckResult::fail(check_name, Severity::Critical).with_detail(format!(
402            "system did not recover. expected {expected_failures} injected failures, observed {actual_failures}, final state failed validation"
403        ));
404        c.tags = tags;
405        c.evidence = evidence;
406        return c;
407    }
408
409    if actual_failures < expected_failures {
410        let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
411        tags.sort();
412        let mut c = CheckResult::warn(check_name, Severity::Warning).with_detail(format!(
413            "fewer failures observed than scheduled (expected {expected_failures}, observed {actual_failures})"
414        ));
415        c.tags = tags;
416        c.evidence = evidence;
417        return c;
418    }
419
420    let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
421    tags.sort();
422    let mut c = CheckResult::pass(check_name).with_detail(format!(
423        "recovered after {actual_failures} injected failure(s)"
424    ));
425    c.tags = tags;
426    c.evidence = evidence;
427    c
428}
429
430/// Producer wrapper that runs a chaos suite and emits a Report with
431/// each scenario's `CheckResult`.
432///
433/// # Example
434///
435/// ```no_run
436/// use dev_chaos::{assert_recovered, ChaosProducer};
437/// use dev_report::Producer;
438///
439/// fn run() -> Vec<dev_report::CheckResult> {
440///     vec![
441///         assert_recovered("write_log", 2, 2, true),
442///         assert_recovered("rename", 1, 1, true),
443///     ]
444/// }
445///
446/// let producer = ChaosProducer::new(run, "my-crate", "0.1.0");
447/// let report = producer.produce();
448/// assert_eq!(report.checks.len(), 2);
449/// ```
450pub struct ChaosProducer<F>
451where
452    F: Fn() -> Vec<CheckResult>,
453{
454    run: F,
455    subject: String,
456    subject_version: String,
457}
458
459impl<F> ChaosProducer<F>
460where
461    F: Fn() -> Vec<CheckResult>,
462{
463    /// Build a new producer.
464    pub fn new(run: F, subject: impl Into<String>, subject_version: impl Into<String>) -> Self {
465        Self {
466            run,
467            subject: subject.into(),
468            subject_version: subject_version.into(),
469        }
470    }
471}
472
473impl<F> Producer for ChaosProducer<F>
474where
475    F: Fn() -> Vec<CheckResult>,
476{
477    fn produce(&self) -> Report {
478        let checks = (self.run)();
479        let mut r = Report::new(self.subject.clone(), self.subject_version.clone())
480            .with_producer("dev-chaos");
481        for c in checks {
482            r.push(c);
483        }
484        r.finish();
485        r
486    }
487}
488
#[cfg(test)]
mod tests {
    use super::*;
    use dev_report::Verdict;

    /// Feed attempts `1..=upto` through `schedule` and count how many are
    /// reported as failures.
    fn count_failures(schedule: &FailureSchedule, upto: usize) -> usize {
        (1..=upto)
            .filter(|&attempt| schedule.maybe_fail(attempt).is_err())
            .count()
    }

    #[test]
    fn schedule_fails_on_specified_attempts() {
        let schedule = FailureSchedule::on_attempts(&[2, 4], FailureMode::IoError);
        let failed: Vec<bool> = (1..=4)
            .map(|attempt| schedule.maybe_fail(attempt).is_err())
            .collect();
        assert_eq!(failed, vec![false, true, false, true]);
        assert_eq!(schedule.invocation_count(), 4);
    }

    #[test]
    fn every_n_fires_on_multiples() {
        let schedule = FailureSchedule::every_n(3, FailureMode::Timeout);
        for &attempt in &[1usize, 2] {
            assert!(schedule.maybe_fail(attempt).is_ok());
        }
        // The rule is a plain modulo, so arbitrarily large multiples of 3
        // fire as well.
        for &attempt in &[3usize, 6, 9, 3_000] {
            assert!(schedule.maybe_fail(attempt).is_err());
        }
    }

    #[test]
    fn limit_caps_total_failures() {
        let schedule = FailureSchedule::every_n(1, FailureMode::IoError).limit(3);
        assert_eq!(count_failures(&schedule, 20), 3);
        assert_eq!(schedule.failure_count(), 3);
    }

    #[test]
    fn limit_zero_disables_failures() {
        let schedule = FailureSchedule::every_n(1, FailureMode::IoError).limit(0);
        assert_eq!(count_failures(&schedule, 10), 0);
        assert_eq!(schedule.failure_count(), 0);
    }

    #[test]
    fn unlimited_schedule_still_increments_failure_count() {
        let schedule = FailureSchedule::every_n(1, FailureMode::IoError);
        (1..=5).for_each(|attempt| {
            let _ = schedule.maybe_fail(attempt);
        });
        assert_eq!(schedule.failure_count(), 5);
    }

    #[test]
    fn limit_works_with_seeded_random() {
        let schedule = FailureSchedule::seeded_random(42, 1.0, FailureMode::IoError).limit(2);
        assert_eq!(count_failures(&schedule, 20), 2);
    }

    #[test]
    fn seeded_random_is_deterministic() {
        let first = FailureSchedule::seeded_random(7, 0.5, FailureMode::IoError);
        let second = FailureSchedule::seeded_random(7, 0.5, FailureMode::IoError);
        for attempt in 1..=200 {
            assert_eq!(
                first.fires(attempt),
                second.fires(attempt),
                "differs at attempt {}",
                attempt
            );
        }
    }

    #[test]
    fn seeded_random_zero_probability_never_fires() {
        let schedule = FailureSchedule::seeded_random(7, 0.0, FailureMode::IoError);
        assert_eq!(count_failures(&schedule, 1000), 0);
    }

    #[test]
    fn seeded_random_full_probability_always_fires() {
        let schedule = FailureSchedule::seeded_random(7, 1.0, FailureMode::IoError);
        assert_eq!(count_failures(&schedule, 200), 200);
    }

    #[test]
    fn injected_failure_converts_to_io_error() {
        let failure = InjectedFailure {
            mode: FailureMode::Timeout,
            attempt: 5,
        };
        let converted = std::io::Error::from(failure);
        assert_eq!(converted.kind(), std::io::ErrorKind::TimedOut);
    }

    #[test]
    fn recovery_check_pass() {
        let check = assert_recovered("write_log", 2, 2, true);
        assert_eq!(check.verdict, Verdict::Pass);
        assert!(check.has_tag("chaos"));
        assert!(check.has_tag("recovery"));
        assert!(!check.has_tag("regression"));
    }

    #[test]
    fn recovery_check_fail_when_state_invalid() {
        let check = assert_recovered("write_log", 2, 2, false);
        assert_eq!(check.verdict, Verdict::Fail);
        assert_eq!(check.severity, Some(Severity::Critical));
        assert!(check.has_tag("regression"));
    }

    #[test]
    fn recovery_check_warns_on_under_injection() {
        let check = assert_recovered("write_log", 5, 2, true);
        assert_eq!(check.verdict, Verdict::Warn);
    }

    #[test]
    fn recovery_check_carries_numeric_evidence() {
        let check = assert_recovered("op", 3, 3, true);
        let labels: Vec<&str> = check.evidence.iter().map(|e| e.label.as_str()).collect();
        for expected in &["expected_failures", "actual_failures", "final_state_ok"] {
            assert!(labels.contains(expected));
        }
    }

    #[test]
    fn chaos_producer_emits_report() {
        let suite = || {
            vec![
                assert_recovered("a", 1, 1, true),
                assert_recovered("b", 2, 2, true),
            ]
        };
        let report = ChaosProducer::new(suite, "my-crate", "0.1.0").produce();
        assert_eq!(report.checks.len(), 2);
        assert_eq!(report.producer.as_deref(), Some("dev-chaos"));
        assert_eq!(report.overall_verdict(), Verdict::Pass);
    }
}