1#![cfg_attr(docsrs, feature(doc_cfg))]
48#![warn(missing_docs)]
49#![warn(rust_2018_idioms)]
50
51use std::collections::HashSet;
52use std::sync::atomic::{AtomicUsize, Ordering};
53
54use dev_report::{CheckResult, Evidence, Producer, Report, Severity};
55
56pub mod clock;
57pub mod crash;
58pub mod io;
59pub mod latency;
60pub mod memory_pressure;
61
62#[cfg(feature = "async-io")]
63#[cfg_attr(docsrs, doc(cfg(feature = "async-io")))]
64pub mod async_io;
65
/// The category of failure a schedule injects.
///
/// Every variant has a stable snake_case label (see [`FailureMode::as_str`])
/// and a matching [`std::io::ErrorKind`] (see [`FailureMode::to_io_kind`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FailureMode {
    /// Labelled `"io_error"`; maps to [`std::io::ErrorKind::Other`].
    IoError,
    /// Labelled `"partial_write"`; maps to [`std::io::ErrorKind::WriteZero`].
    PartialWrite,
    /// Labelled `"connection_reset"`; maps to
    /// [`std::io::ErrorKind::ConnectionReset`].
    ConnectionReset,
    /// Labelled `"timeout"`; maps to [`std::io::ErrorKind::TimedOut`].
    Timeout,
    /// Labelled `"corruption"`; maps to [`std::io::ErrorKind::InvalidData`].
    Corruption,
    /// Labelled `"permission_denied"`; maps to
    /// [`std::io::ErrorKind::PermissionDenied`].
    PermissionDenied,
}

impl FailureMode {
    /// Returns the snake_case label of this mode, e.g. `"partial_write"`.
    ///
    /// The label is what [`InjectedFailure`]'s `Display` output embeds.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::IoError => "io_error",
            Self::PartialWrite => "partial_write",
            Self::ConnectionReset => "connection_reset",
            Self::Timeout => "timeout",
            Self::Corruption => "corruption",
            Self::PermissionDenied => "permission_denied",
        }
    }

    /// Returns the [`std::io::ErrorKind`] this mode is reported as when an
    /// injected failure is converted into a [`std::io::Error`].
    pub fn to_io_kind(&self) -> std::io::ErrorKind {
        use std::io::ErrorKind;

        match *self {
            Self::IoError => ErrorKind::Other,
            Self::PartialWrite => ErrorKind::WriteZero,
            Self::ConnectionReset => ErrorKind::ConnectionReset,
            Self::Timeout => ErrorKind::TimedOut,
            Self::Corruption => ErrorKind::InvalidData,
            Self::PermissionDenied => ErrorKind::PermissionDenied,
        }
    }
}
115
116#[derive(Debug, Clone)]
126pub struct InjectedFailure {
127 pub mode: FailureMode,
129 pub attempt: usize,
131}
132
133impl std::fmt::Display for InjectedFailure {
134 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
135 write!(
136 f,
137 "injected failure {} at attempt {}",
138 self.mode.as_str(),
139 self.attempt
140 )
141 }
142}
143
144impl std::error::Error for InjectedFailure {}
145
146impl From<InjectedFailure> for std::io::Error {
147 fn from(f: InjectedFailure) -> Self {
148 std::io::Error::new(f.mode.to_io_kind(), f.to_string())
149 }
150}
151
152pub struct FailureSchedule {
167 inner: ScheduleKind,
168 mode: FailureMode,
169 invocations: AtomicUsize,
170 failures: AtomicUsize,
171 failure_limit: Option<usize>,
173}
174
175enum ScheduleKind {
176 Explicit(HashSet<usize>),
177 EveryN(usize),
178 SeededRandom { seed: u64, prob_thousandths: u32 },
179}
180
181impl FailureSchedule {
182 pub fn on_attempts(attempts: &[usize], mode: FailureMode) -> Self {
194 Self {
195 inner: ScheduleKind::Explicit(attempts.iter().copied().collect()),
196 mode,
197 invocations: AtomicUsize::new(0),
198 failures: AtomicUsize::new(0),
199 failure_limit: None,
200 }
201 }
202
203 pub fn every_n(n: usize, mode: FailureMode) -> Self {
214 let n = n.max(1);
215 Self {
216 inner: ScheduleKind::EveryN(n),
217 mode,
218 invocations: AtomicUsize::new(0),
219 failures: AtomicUsize::new(0),
220 failure_limit: None,
221 }
222 }
223
224 pub fn seeded_random(seed: u64, probability: f64, mode: FailureMode) -> Self {
248 let p = probability.clamp(0.0, 1.0);
249 let prob_thousandths = (p * 1000.0).round() as u32;
250 Self {
251 inner: ScheduleKind::SeededRandom {
252 seed,
253 prob_thousandths,
254 },
255 mode,
256 invocations: AtomicUsize::new(0),
257 failures: AtomicUsize::new(0),
258 failure_limit: None,
259 }
260 }
261
262 pub fn limit(mut self, n: usize) -> Self {
289 self.failure_limit = Some(n);
290 self
291 }
292
293 pub fn maybe_fail(&self, attempt: usize) -> Result<(), InjectedFailure> {
302 self.invocations.fetch_add(1, Ordering::Relaxed);
303 if !self.fires(attempt) {
304 return Ok(());
305 }
306 if let Some(limit) = self.failure_limit {
307 let prior = self.failures.fetch_add(1, Ordering::Relaxed);
312 if prior >= limit {
313 return Ok(());
314 }
315 } else {
316 self.failures.fetch_add(1, Ordering::Relaxed);
317 }
318 Err(InjectedFailure {
319 mode: self.mode,
320 attempt,
321 })
322 }
323
324 pub fn failure_count(&self) -> usize {
326 let raw = self.failures.load(Ordering::Relaxed);
329 match self.failure_limit {
330 Some(limit) => raw.min(limit),
331 None => raw,
332 }
333 }
334
335 fn fires(&self, attempt: usize) -> bool {
336 match &self.inner {
337 ScheduleKind::Explicit(set) => set.contains(&attempt),
338 ScheduleKind::EveryN(n) => attempt % *n == 0,
339 ScheduleKind::SeededRandom {
340 seed,
341 prob_thousandths,
342 } => {
343 let mut x =
345 (*seed).wrapping_add((attempt as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15));
346 x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
347 x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
348 x ^= x >> 31;
349 let bucket = (x % 1000) as u32;
350 bucket < *prob_thousandths
351 }
352 }
353 }
354
355 pub fn invocation_count(&self) -> usize {
357 self.invocations.load(Ordering::Relaxed)
358 }
359
360 pub fn mode(&self) -> FailureMode {
362 self.mode
363 }
364}
365
366pub fn assert_recovered(
387 name: impl Into<String>,
388 expected_failures: usize,
389 actual_failures: usize,
390 final_state_ok: bool,
391) -> CheckResult {
392 let check_name = format!("chaos::{}", name.into());
393 let evidence = vec![
394 Evidence::numeric("expected_failures", expected_failures as f64),
395 Evidence::numeric("actual_failures", actual_failures as f64),
396 Evidence::numeric("final_state_ok", if final_state_ok { 1.0 } else { 0.0 }),
397 ];
398
399 if !final_state_ok {
400 let mut tags = vec![
401 "chaos".to_string(),
402 "recovery".to_string(),
403 "regression".to_string(),
404 ];
405 tags.sort();
406 let mut c = CheckResult::fail(check_name, Severity::Critical).with_detail(format!(
407 "system did not recover. expected {expected_failures} injected failures, observed {actual_failures}, final state failed validation"
408 ));
409 c.tags = tags;
410 c.evidence = evidence;
411 return c;
412 }
413
414 if actual_failures < expected_failures {
415 let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
416 tags.sort();
417 let mut c = CheckResult::warn(check_name, Severity::Warning).with_detail(format!(
418 "fewer failures observed than scheduled (expected {expected_failures}, observed {actual_failures})"
419 ));
420 c.tags = tags;
421 c.evidence = evidence;
422 return c;
423 }
424
425 let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
426 tags.sort();
427 let mut c = CheckResult::pass(check_name).with_detail(format!(
428 "recovered after {actual_failures} injected failure(s)"
429 ));
430 c.tags = tags;
431 c.evidence = evidence;
432 c
433}
434
435pub struct ChaosProducer<F>
456where
457 F: Fn() -> Vec<CheckResult>,
458{
459 run: F,
460 subject: String,
461 subject_version: String,
462}
463
464impl<F> ChaosProducer<F>
465where
466 F: Fn() -> Vec<CheckResult>,
467{
468 pub fn new(run: F, subject: impl Into<String>, subject_version: impl Into<String>) -> Self {
470 Self {
471 run,
472 subject: subject.into(),
473 subject_version: subject_version.into(),
474 }
475 }
476}
477
478impl<F> Producer for ChaosProducer<F>
479where
480 F: Fn() -> Vec<CheckResult>,
481{
482 fn produce(&self) -> Report {
483 let checks = (self.run)();
484 let mut r = Report::new(self.subject.clone(), self.subject_version.clone())
485 .with_producer("dev-chaos");
486 for c in checks {
487 r.push(c);
488 }
489 r.finish();
490 r
491 }
492}
493
#[cfg(test)]
mod tests {
    use super::*;
    use dev_report::Verdict;

    // Feeds every attempt in `attempts` to `schedule` and returns how many
    // of them were rejected with an injected failure.
    fn count_injected(
        schedule: &FailureSchedule,
        attempts: std::ops::RangeInclusive<usize>,
    ) -> usize {
        attempts
            .filter(|&attempt| schedule.maybe_fail(attempt).is_err())
            .count()
    }

    #[test]
    fn schedule_fails_on_specified_attempts() {
        let schedule = FailureSchedule::on_attempts(&[2, 4], FailureMode::IoError);
        let failed: Vec<bool> = (1..=4)
            .map(|attempt| schedule.maybe_fail(attempt).is_err())
            .collect();
        assert_eq!(failed, [false, true, false, true]);
        assert_eq!(schedule.invocation_count(), 4);
    }

    #[test]
    fn every_n_fires_on_multiples() {
        let schedule = FailureSchedule::every_n(3, FailureMode::Timeout);
        for quiet in [1, 2] {
            assert!(schedule.maybe_fail(quiet).is_ok());
        }
        for noisy in [3, 6, 9, 3_000] {
            assert!(schedule.maybe_fail(noisy).is_err());
        }
    }

    #[test]
    fn limit_caps_total_failures() {
        let schedule = FailureSchedule::every_n(1, FailureMode::IoError).limit(3);
        assert_eq!(count_injected(&schedule, 1..=20), 3);
        assert_eq!(schedule.failure_count(), 3);
    }

    #[test]
    fn limit_zero_disables_failures() {
        let schedule = FailureSchedule::every_n(1, FailureMode::IoError).limit(0);
        assert_eq!(count_injected(&schedule, 1..=10), 0);
        assert_eq!(schedule.failure_count(), 0);
    }

    #[test]
    fn unlimited_schedule_still_increments_failure_count() {
        let schedule = FailureSchedule::every_n(1, FailureMode::IoError);
        (1..=5).for_each(|attempt| {
            let _ = schedule.maybe_fail(attempt);
        });
        assert_eq!(schedule.failure_count(), 5);
    }

    #[test]
    fn limit_works_with_seeded_random() {
        let schedule = FailureSchedule::seeded_random(42, 1.0, FailureMode::IoError).limit(2);
        assert_eq!(count_injected(&schedule, 1..=20), 2);
    }

    #[test]
    fn seeded_random_is_deterministic() {
        let build = || FailureSchedule::seeded_random(7, 0.5, FailureMode::IoError);
        let (first, second) = (build(), build());
        for attempt in 1..=200 {
            assert_eq!(
                first.fires(attempt),
                second.fires(attempt),
                "differs at attempt {}",
                attempt
            );
        }
    }

    #[test]
    fn seeded_random_zero_probability_never_fires() {
        let schedule = FailureSchedule::seeded_random(7, 0.0, FailureMode::IoError);
        assert_eq!(count_injected(&schedule, 1..=1000), 0);
    }

    #[test]
    fn seeded_random_full_probability_always_fires() {
        let schedule = FailureSchedule::seeded_random(7, 1.0, FailureMode::IoError);
        assert_eq!(count_injected(&schedule, 1..=200), 200);
    }

    #[test]
    fn injected_failure_converts_to_io_error() {
        let converted = std::io::Error::from(InjectedFailure {
            mode: FailureMode::Timeout,
            attempt: 5,
        });
        assert_eq!(converted.kind(), std::io::ErrorKind::TimedOut);
    }

    #[test]
    fn recovery_check_pass() {
        let check = assert_recovered("write_log", 2, 2, true);
        assert_eq!(check.verdict, Verdict::Pass);
        for present in ["chaos", "recovery"] {
            assert!(check.has_tag(present));
        }
        assert!(!check.has_tag("regression"));
    }

    #[test]
    fn recovery_check_fail_when_state_invalid() {
        let check = assert_recovered("write_log", 2, 2, false);
        assert_eq!(check.verdict, Verdict::Fail);
        assert_eq!(check.severity, Some(Severity::Critical));
        assert!(check.has_tag("regression"));
    }

    #[test]
    fn recovery_check_warns_on_under_injection() {
        let check = assert_recovered("write_log", 5, 2, true);
        assert_eq!(check.verdict, Verdict::Warn);
    }

    #[test]
    fn recovery_check_carries_numeric_evidence() {
        let check = assert_recovered("op", 3, 3, true);
        for wanted in ["expected_failures", "actual_failures", "final_state_ok"] {
            assert!(check.evidence.iter().any(|e| e.label.as_str() == wanted));
        }
    }

    #[test]
    fn chaos_producer_emits_report() {
        let run = || {
            vec![
                assert_recovered("a", 1, 1, true),
                assert_recovered("b", 2, 2, true),
            ]
        };
        let report = ChaosProducer::new(run, "my-crate", "0.1.0").produce();
        assert_eq!(report.checks.len(), 2);
        assert_eq!(report.producer.as_deref(), Some("dev-chaos"));
        assert_eq!(report.overall_verdict(), Verdict::Pass);
    }
}