1#![cfg_attr(docsrs, feature(doc_cfg))]
45#![warn(missing_docs)]
46#![warn(rust_2018_idioms)]
47
48use std::collections::HashSet;
49use std::sync::atomic::{AtomicUsize, Ordering};
50
51use dev_report::{CheckResult, Evidence, Producer, Report, Severity};
52
53pub mod crash;
54pub mod io;
55pub mod latency;
56
57#[cfg(feature = "async-io")]
58#[cfg_attr(docsrs, doc(cfg(feature = "async-io")))]
59pub mod async_io;
60
/// Kind of fault that a [`FailureSchedule`] injects.
///
/// Every variant has a stable snake_case label (see [`FailureMode::as_str`])
/// and a matching [`std::io::ErrorKind`] (see [`FailureMode::to_io_kind`]).
/// The type is `Copy` and hashable, so it can be used directly as a map or
/// set key, e.g. to count injected failures per mode.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FailureMode {
    /// Generic I/O failure; reported as `ErrorKind::Other`.
    IoError,
    /// Partial write; reported as `ErrorKind::WriteZero`.
    PartialWrite,
    /// Connection reset; reported as `ErrorKind::ConnectionReset`.
    ConnectionReset,
    /// Timeout; reported as `ErrorKind::TimedOut`.
    Timeout,
    /// Data corruption; reported as `ErrorKind::InvalidData`.
    Corruption,
    /// Permission denied; reported as `ErrorKind::PermissionDenied`.
    PermissionDenied,
}
84
85impl FailureMode {
86 pub fn as_str(&self) -> &'static str {
88 match self {
89 FailureMode::IoError => "io_error",
90 FailureMode::PartialWrite => "partial_write",
91 FailureMode::ConnectionReset => "connection_reset",
92 FailureMode::Timeout => "timeout",
93 FailureMode::Corruption => "corruption",
94 FailureMode::PermissionDenied => "permission_denied",
95 }
96 }
97
98 pub fn to_io_kind(&self) -> std::io::ErrorKind {
100 match self {
101 FailureMode::IoError => std::io::ErrorKind::Other,
102 FailureMode::PartialWrite => std::io::ErrorKind::WriteZero,
103 FailureMode::ConnectionReset => std::io::ErrorKind::ConnectionReset,
104 FailureMode::Timeout => std::io::ErrorKind::TimedOut,
105 FailureMode::Corruption => std::io::ErrorKind::InvalidData,
106 FailureMode::PermissionDenied => std::io::ErrorKind::PermissionDenied,
107 }
108 }
109}
110
111#[derive(Debug, Clone)]
121pub struct InjectedFailure {
122 pub mode: FailureMode,
124 pub attempt: usize,
126}
127
128impl std::fmt::Display for InjectedFailure {
129 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
130 write!(
131 f,
132 "injected failure {} at attempt {}",
133 self.mode.as_str(),
134 self.attempt
135 )
136 }
137}
138
// Empty impl: no trait methods are overridden, so `InjectedFailure` uses the
// `Error` trait's defaults and its message comes from the `Display` impl.
impl std::error::Error for InjectedFailure {}
140
141impl From<InjectedFailure> for std::io::Error {
142 fn from(f: InjectedFailure) -> Self {
143 std::io::Error::new(f.mode.to_io_kind(), f.to_string())
144 }
145}
146
147pub struct FailureSchedule {
162 inner: ScheduleKind,
163 mode: FailureMode,
164 invocations: AtomicUsize,
165 failures: AtomicUsize,
166 failure_limit: Option<usize>,
168}
169
170enum ScheduleKind {
171 Explicit(HashSet<usize>),
172 EveryN(usize),
173 SeededRandom { seed: u64, prob_thousandths: u32 },
174}
175
176impl FailureSchedule {
177 pub fn on_attempts(attempts: &[usize], mode: FailureMode) -> Self {
189 Self {
190 inner: ScheduleKind::Explicit(attempts.iter().copied().collect()),
191 mode,
192 invocations: AtomicUsize::new(0),
193 failures: AtomicUsize::new(0),
194 failure_limit: None,
195 }
196 }
197
198 pub fn every_n(n: usize, mode: FailureMode) -> Self {
209 let n = n.max(1);
210 Self {
211 inner: ScheduleKind::EveryN(n),
212 mode,
213 invocations: AtomicUsize::new(0),
214 failures: AtomicUsize::new(0),
215 failure_limit: None,
216 }
217 }
218
219 pub fn seeded_random(seed: u64, probability: f64, mode: FailureMode) -> Self {
243 let p = probability.clamp(0.0, 1.0);
244 let prob_thousandths = (p * 1000.0).round() as u32;
245 Self {
246 inner: ScheduleKind::SeededRandom {
247 seed,
248 prob_thousandths,
249 },
250 mode,
251 invocations: AtomicUsize::new(0),
252 failures: AtomicUsize::new(0),
253 failure_limit: None,
254 }
255 }
256
257 pub fn limit(mut self, n: usize) -> Self {
284 self.failure_limit = Some(n);
285 self
286 }
287
288 pub fn maybe_fail(&self, attempt: usize) -> Result<(), InjectedFailure> {
297 self.invocations.fetch_add(1, Ordering::Relaxed);
298 if !self.fires(attempt) {
299 return Ok(());
300 }
301 if let Some(limit) = self.failure_limit {
302 let prior = self.failures.fetch_add(1, Ordering::Relaxed);
307 if prior >= limit {
308 return Ok(());
309 }
310 } else {
311 self.failures.fetch_add(1, Ordering::Relaxed);
312 }
313 Err(InjectedFailure {
314 mode: self.mode,
315 attempt,
316 })
317 }
318
319 pub fn failure_count(&self) -> usize {
321 let raw = self.failures.load(Ordering::Relaxed);
324 match self.failure_limit {
325 Some(limit) => raw.min(limit),
326 None => raw,
327 }
328 }
329
330 fn fires(&self, attempt: usize) -> bool {
331 match &self.inner {
332 ScheduleKind::Explicit(set) => set.contains(&attempt),
333 ScheduleKind::EveryN(n) => attempt % *n == 0,
334 ScheduleKind::SeededRandom {
335 seed,
336 prob_thousandths,
337 } => {
338 let mut x =
340 (*seed).wrapping_add((attempt as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15));
341 x = (x ^ (x >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
342 x = (x ^ (x >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
343 x ^= x >> 31;
344 let bucket = (x % 1000) as u32;
345 bucket < *prob_thousandths
346 }
347 }
348 }
349
350 pub fn invocation_count(&self) -> usize {
352 self.invocations.load(Ordering::Relaxed)
353 }
354
355 pub fn mode(&self) -> FailureMode {
357 self.mode
358 }
359}
360
361pub fn assert_recovered(
382 name: impl Into<String>,
383 expected_failures: usize,
384 actual_failures: usize,
385 final_state_ok: bool,
386) -> CheckResult {
387 let check_name = format!("chaos::{}", name.into());
388 let evidence = vec![
389 Evidence::numeric("expected_failures", expected_failures as f64),
390 Evidence::numeric("actual_failures", actual_failures as f64),
391 Evidence::numeric("final_state_ok", if final_state_ok { 1.0 } else { 0.0 }),
392 ];
393
394 if !final_state_ok {
395 let mut tags = vec![
396 "chaos".to_string(),
397 "recovery".to_string(),
398 "regression".to_string(),
399 ];
400 tags.sort();
401 let mut c = CheckResult::fail(check_name, Severity::Critical).with_detail(format!(
402 "system did not recover. expected {expected_failures} injected failures, observed {actual_failures}, final state failed validation"
403 ));
404 c.tags = tags;
405 c.evidence = evidence;
406 return c;
407 }
408
409 if actual_failures < expected_failures {
410 let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
411 tags.sort();
412 let mut c = CheckResult::warn(check_name, Severity::Warning).with_detail(format!(
413 "fewer failures observed than scheduled (expected {expected_failures}, observed {actual_failures})"
414 ));
415 c.tags = tags;
416 c.evidence = evidence;
417 return c;
418 }
419
420 let mut tags = vec!["chaos".to_string(), "recovery".to_string()];
421 tags.sort();
422 let mut c = CheckResult::pass(check_name).with_detail(format!(
423 "recovered after {actual_failures} injected failure(s)"
424 ));
425 c.tags = tags;
426 c.evidence = evidence;
427 c
428}
429
430pub struct ChaosProducer<F>
451where
452 F: Fn() -> Vec<CheckResult>,
453{
454 run: F,
455 subject: String,
456 subject_version: String,
457}
458
459impl<F> ChaosProducer<F>
460where
461 F: Fn() -> Vec<CheckResult>,
462{
463 pub fn new(run: F, subject: impl Into<String>, subject_version: impl Into<String>) -> Self {
465 Self {
466 run,
467 subject: subject.into(),
468 subject_version: subject_version.into(),
469 }
470 }
471}
472
473impl<F> Producer for ChaosProducer<F>
474where
475 F: Fn() -> Vec<CheckResult>,
476{
477 fn produce(&self) -> Report {
478 let checks = (self.run)();
479 let mut r = Report::new(self.subject.clone(), self.subject_version.clone())
480 .with_producer("dev-chaos");
481 for c in checks {
482 r.push(c);
483 }
484 r.finish();
485 r
486 }
487}
488
// Unit tests for the schedules, the injected-failure error type, the
// recovery check and the report producer.
#[cfg(test)]
mod tests {
    use super::*;
    use dev_report::Verdict;

    // ---- FailureSchedule: explicit attempts ----

    #[test]
    fn schedule_fails_on_specified_attempts() {
        let s = FailureSchedule::on_attempts(&[2, 4], FailureMode::IoError);
        assert!(s.maybe_fail(1).is_ok());
        assert!(s.maybe_fail(2).is_err());
        assert!(s.maybe_fail(3).is_ok());
        assert!(s.maybe_fail(4).is_err());
        assert_eq!(s.invocation_count(), 4);
        // Only the two listed attempts count as failures.
        assert_eq!(s.failure_count(), 2);
    }

    #[test]
    fn injected_failure_reports_mode_and_attempt() {
        let s = FailureSchedule::on_attempts(&[2], FailureMode::Corruption);
        let err = s.maybe_fail(2).unwrap_err();
        assert_eq!(err.mode, FailureMode::Corruption);
        assert_eq!(err.attempt, 2);
    }

    // ---- FailureSchedule: every_n ----

    #[test]
    fn every_n_fires_on_multiples() {
        let s = FailureSchedule::every_n(3, FailureMode::Timeout);
        assert!(s.maybe_fail(1).is_ok());
        assert!(s.maybe_fail(2).is_ok());
        assert!(s.maybe_fail(3).is_err());
        assert!(s.maybe_fail(6).is_err());
        assert!(s.maybe_fail(9).is_err());
        assert!(s.maybe_fail(3_000).is_err());
        assert_eq!(s.mode(), FailureMode::Timeout);
    }

    #[test]
    fn every_n_zero_is_treated_as_one() {
        // `every_n` clamps `n` to at least 1, so every attempt fires and the
        // modulo never divides by zero.
        let s = FailureSchedule::every_n(0, FailureMode::IoError);
        for attempt in 1..=5 {
            assert!(s.maybe_fail(attempt).is_err());
        }
        assert_eq!(s.failure_count(), 5);
    }

    // ---- FailureSchedule: limit ----

    #[test]
    fn limit_caps_total_failures() {
        let s = FailureSchedule::every_n(1, FailureMode::IoError).limit(3);
        let mut failures = 0;
        for attempt in 1..=20 {
            if s.maybe_fail(attempt).is_err() {
                failures += 1;
            }
        }
        assert_eq!(failures, 3);
        assert_eq!(s.failure_count(), 3);
    }

    #[test]
    fn limit_zero_disables_failures() {
        let s = FailureSchedule::every_n(1, FailureMode::IoError).limit(0);
        for attempt in 1..=10 {
            assert!(s.maybe_fail(attempt).is_ok());
        }
        assert_eq!(s.failure_count(), 0);
    }

    #[test]
    fn unlimited_schedule_still_increments_failure_count() {
        let s = FailureSchedule::every_n(1, FailureMode::IoError);
        for attempt in 1..=5 {
            let _ = s.maybe_fail(attempt);
        }
        assert_eq!(s.failure_count(), 5);
    }

    #[test]
    fn limit_works_with_seeded_random() {
        let s = FailureSchedule::seeded_random(42, 1.0, FailureMode::IoError).limit(2);
        let mut failures = 0;
        for attempt in 1..=20 {
            if s.maybe_fail(attempt).is_err() {
                failures += 1;
            }
        }
        assert_eq!(failures, 2);
        // The reported count is clamped to the limit even though the rule
        // fired on all 20 attempts.
        assert_eq!(s.failure_count(), 2);
    }

    // ---- FailureSchedule: seeded_random ----

    #[test]
    fn seeded_random_is_deterministic() {
        let a = FailureSchedule::seeded_random(7, 0.5, FailureMode::IoError);
        let b = FailureSchedule::seeded_random(7, 0.5, FailureMode::IoError);
        for attempt in 1..=200 {
            assert_eq!(
                a.fires(attempt),
                b.fires(attempt),
                "differs at attempt {}",
                attempt
            );
        }
    }

    #[test]
    fn seeded_random_zero_probability_never_fires() {
        let s = FailureSchedule::seeded_random(7, 0.0, FailureMode::IoError);
        for attempt in 1..=1000 {
            assert!(s.maybe_fail(attempt).is_ok());
        }
    }

    #[test]
    fn seeded_random_full_probability_always_fires() {
        let s = FailureSchedule::seeded_random(7, 1.0, FailureMode::IoError);
        for attempt in 1..=200 {
            assert!(s.maybe_fail(attempt).is_err());
        }
    }

    // ---- FailureMode / InjectedFailure ----

    #[test]
    fn failure_mode_labels_and_io_kinds() {
        use std::io::ErrorKind;

        let table = [
            (FailureMode::IoError, "io_error", ErrorKind::Other),
            (FailureMode::PartialWrite, "partial_write", ErrorKind::WriteZero),
            (
                FailureMode::ConnectionReset,
                "connection_reset",
                ErrorKind::ConnectionReset,
            ),
            (FailureMode::Timeout, "timeout", ErrorKind::TimedOut),
            (FailureMode::Corruption, "corruption", ErrorKind::InvalidData),
            (
                FailureMode::PermissionDenied,
                "permission_denied",
                ErrorKind::PermissionDenied,
            ),
        ];
        for (mode, label, kind) in table {
            assert_eq!(mode.as_str(), label);
            assert_eq!(mode.to_io_kind(), kind);
        }
    }

    #[test]
    fn injected_failure_display_names_mode_and_attempt() {
        let f = InjectedFailure {
            mode: FailureMode::Timeout,
            attempt: 5,
        };
        assert_eq!(f.to_string(), "injected failure timeout at attempt 5");
    }

    #[test]
    fn injected_failure_converts_to_io_error() {
        let f = InjectedFailure {
            mode: FailureMode::Timeout,
            attempt: 5,
        };
        let e: std::io::Error = f.into();
        assert_eq!(e.kind(), std::io::ErrorKind::TimedOut);
    }

    // ---- assert_recovered ----

    #[test]
    fn recovery_check_pass() {
        let c = assert_recovered("write_log", 2, 2, true);
        assert_eq!(c.verdict, Verdict::Pass);
        assert!(c.has_tag("chaos"));
        assert!(c.has_tag("recovery"));
        assert!(!c.has_tag("regression"));
    }

    #[test]
    fn recovery_check_fail_when_state_invalid() {
        let c = assert_recovered("write_log", 2, 2, false);
        assert_eq!(c.verdict, Verdict::Fail);
        assert_eq!(c.severity, Some(Severity::Critical));
        assert!(c.has_tag("regression"));
    }

    #[test]
    fn recovery_check_warns_on_under_injection() {
        let c = assert_recovered("write_log", 5, 2, true);
        assert_eq!(c.verdict, Verdict::Warn);
    }

    #[test]
    fn recovery_check_carries_numeric_evidence() {
        let c = assert_recovered("op", 3, 3, true);
        let labels: Vec<&str> = c.evidence.iter().map(|e| e.label.as_str()).collect();
        assert!(labels.contains(&"expected_failures"));
        assert!(labels.contains(&"actual_failures"));
        assert!(labels.contains(&"final_state_ok"));
    }

    // ---- ChaosProducer ----

    #[test]
    fn chaos_producer_emits_report() {
        let producer = ChaosProducer::new(
            || {
                vec![
                    assert_recovered("a", 1, 1, true),
                    assert_recovered("b", 2, 2, true),
                ]
            },
            "my-crate",
            "0.1.0",
        );
        let report = producer.produce();
        assert_eq!(report.checks.len(), 2);
        assert_eq!(report.producer.as_deref(), Some("dev-chaos"));
        assert_eq!(report.overall_verdict(), Verdict::Pass);
    }
}