1use std::sync::atomic::{AtomicU64, Ordering};
6use std::time::Duration;
7
8use rand::prelude::*;
9use serde::{Deserialize, Serialize};
10
11#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct FailureConfig {
14 pub enabled: bool,
16
17 pub failure_rate: f64,
19
20 pub failure_types: Vec<FailureType>,
22
23 pub schedule: FailureSchedule,
25
26 pub cascade_probability: f64,
28
29 pub recovery_time: Duration,
31
32 pub max_concurrent_failures: usize,
34
35 #[serde(skip_serializing_if = "Option::is_none")]
37 pub seed: Option<u64>,
38}
39
40impl Default for FailureConfig {
41 fn default() -> Self {
42 Self {
43 enabled: true,
44 failure_rate: 0.01, failure_types: vec![
46 FailureType::Timeout,
47 FailureType::ConnectionReset,
48 FailureType::ProtocolError,
49 ],
50 schedule: FailureSchedule::Random {
51 min_interval: Duration::from_secs(10),
52 max_interval: Duration::from_secs(60),
53 },
54 cascade_probability: 0.0,
55 recovery_time: Duration::from_secs(30),
56 max_concurrent_failures: 5,
57 seed: None,
58 }
59 }
60}
61
62impl FailureConfig {
63 pub fn new(rate: f64) -> Self {
65 Self {
66 failure_rate: rate.clamp(0.0, 1.0),
67 ..Default::default()
68 }
69 }
70
71 pub fn none() -> Self {
73 Self {
74 enabled: false,
75 failure_rate: 0.0,
76 failure_types: vec![],
77 ..Default::default()
78 }
79 }
80
81 pub fn low() -> Self {
83 Self {
84 failure_rate: 0.001,
85 ..Default::default()
86 }
87 }
88
89 pub fn medium() -> Self {
91 Self {
92 failure_rate: 0.01,
93 ..Default::default()
94 }
95 }
96
97 pub fn high() -> Self {
99 Self {
100 failure_rate: 0.05,
101 ..Default::default()
102 }
103 }
104
105 pub fn chaos() -> Self {
107 Self {
108 failure_rate: 0.10,
109 failure_types: FailureType::all().to_vec(),
110 ..Default::default()
111 }
112 }
113
114 pub fn with_types(mut self, types: Vec<FailureType>) -> Self {
116 self.failure_types = types;
117 self
118 }
119
120 pub fn with_schedule(mut self, schedule: FailureSchedule) -> Self {
122 self.schedule = schedule;
123 self
124 }
125
126 pub fn with_seed(mut self, seed: u64) -> Self {
128 self.seed = Some(seed);
129 self
130 }
131
132 pub fn should_inject(&self) -> bool {
134 if !self.enabled || self.failure_rate <= 0.0 || self.failure_types.is_empty() {
135 return false;
136 }
137
138 let mut rng = match self.seed {
139 Some(seed) => StdRng::seed_from_u64(seed),
140 None => StdRng::from_entropy(),
141 };
142
143 rng.gen::<f64>() < self.failure_rate
144 }
145
146 pub fn next_failure_type(&self) -> FailureType {
148 if self.failure_types.is_empty() {
149 return FailureType::Generic("No failures configured".into());
150 }
151
152 let mut rng = match self.seed {
153 Some(seed) => StdRng::seed_from_u64(seed),
154 None => StdRng::from_entropy(),
155 };
156 let idx = rng.gen_range(0..self.failure_types.len());
157 self.failure_types[idx].clone()
158 }
159}
160
161#[derive(Debug, Clone, Serialize, Deserialize)]
163pub enum FailureSchedule {
164 Random {
166 min_interval: Duration,
168 max_interval: Duration,
170 },
171
172 Periodic {
174 interval: Duration,
176 duration: Duration,
178 },
179
180 Scheduled {
182 entries: Vec<ScheduledFailure>,
184 },
185
186 Burst {
188 count: usize,
190 burst_interval: Duration,
192 burst_pause: Duration,
194 },
195}
196
197impl Default for FailureSchedule {
198 fn default() -> Self {
199 Self::Random {
200 min_interval: Duration::from_secs(10),
201 max_interval: Duration::from_secs(60),
202 }
203 }
204}
205
206#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
208pub enum FailureType {
209 Timeout,
211
212 ConnectionReset,
214
215 ConnectionRefused,
217
218 ProtocolError,
220
221 InvalidResponse,
223
224 PartialResponse,
226
227 SlowResponse {
229 delay: Duration,
231 },
232
233 OutOfMemory,
235
236 DiskFull,
238
239 DeviceOffline,
241
242 NetworkPartition,
244
245 CorruptedData,
247
248 RateLimited,
250
251 AuthFailure,
253
254 Generic(String),
256}
257
258impl FailureType {
259 pub fn all() -> &'static [FailureType] {
261 &[
262 FailureType::Timeout,
263 FailureType::ConnectionReset,
264 FailureType::ConnectionRefused,
265 FailureType::ProtocolError,
266 FailureType::InvalidResponse,
267 FailureType::PartialResponse,
268 FailureType::OutOfMemory,
269 FailureType::DiskFull,
270 FailureType::DeviceOffline,
271 FailureType::NetworkPartition,
272 FailureType::CorruptedData,
273 FailureType::RateLimited,
274 FailureType::AuthFailure,
275 ]
276 }
277
278 pub fn network() -> Vec<FailureType> {
280 vec![
281 FailureType::Timeout,
282 FailureType::ConnectionReset,
283 FailureType::ConnectionRefused,
284 FailureType::NetworkPartition,
285 ]
286 }
287
288 pub fn protocol() -> Vec<FailureType> {
290 vec![
291 FailureType::ProtocolError,
292 FailureType::InvalidResponse,
293 FailureType::PartialResponse,
294 FailureType::CorruptedData,
295 ]
296 }
297
298 pub fn resource() -> Vec<FailureType> {
300 vec![
301 FailureType::OutOfMemory,
302 FailureType::DiskFull,
303 FailureType::RateLimited,
304 ]
305 }
306
307 pub fn description(&self) -> &str {
309 match self {
310 Self::Timeout => "Request timed out",
311 Self::ConnectionReset => "Connection reset by peer",
312 Self::ConnectionRefused => "Connection refused",
313 Self::ProtocolError => "Protocol violation",
314 Self::InvalidResponse => "Invalid response received",
315 Self::PartialResponse => "Incomplete response",
316 Self::SlowResponse { .. } => "Response delayed significantly",
317 Self::OutOfMemory => "Out of memory",
318 Self::DiskFull => "Disk full",
319 Self::DeviceOffline => "Device went offline",
320 Self::NetworkPartition => "Network partition detected",
321 Self::CorruptedData => "Data corruption detected",
322 Self::RateLimited => "Rate limit exceeded",
323 Self::AuthFailure => "Authentication failed",
324 Self::Generic(msg) => msg,
325 }
326 }
327
328 pub fn is_recoverable(&self) -> bool {
330 match self {
331 Self::Timeout
332 | Self::ConnectionReset
333 | Self::SlowResponse { .. }
334 | Self::RateLimited
335 | Self::DeviceOffline
336 | Self::NetworkPartition => true,
337
338 Self::ConnectionRefused
339 | Self::ProtocolError
340 | Self::InvalidResponse
341 | Self::PartialResponse
342 | Self::OutOfMemory
343 | Self::DiskFull
344 | Self::CorruptedData
345 | Self::AuthFailure
346 | Self::Generic(_) => false,
347 }
348 }
349
350 pub fn suggested_retry_delay(&self) -> Option<Duration> {
352 if !self.is_recoverable() {
353 return None;
354 }
355
356 Some(match self {
357 Self::Timeout => Duration::from_secs(5),
358 Self::ConnectionReset => Duration::from_millis(500),
359 Self::SlowResponse { .. } => Duration::from_secs(1),
360 Self::RateLimited => Duration::from_secs(30),
361 Self::DeviceOffline => Duration::from_secs(10),
362 Self::NetworkPartition => Duration::from_secs(60),
363 _ => Duration::from_secs(1),
364 })
365 }
366}
367
368impl std::fmt::Display for FailureType {
369 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
370 write!(f, "{}", self.description())
371 }
372}
373
374#[derive(Debug, Clone, Serialize, Deserialize)]
376pub struct ScheduledFailure {
377 pub at: Duration,
379 pub failure_type: FailureType,
381 pub duration: Option<Duration>,
383 pub repeat: usize,
385}
386
387impl ScheduledFailure {
388 pub fn new(at: Duration, failure_type: FailureType) -> Self {
390 Self {
391 at,
392 failure_type,
393 duration: None,
394 repeat: 1,
395 }
396 }
397
398 pub fn for_duration(mut self, duration: Duration) -> Self {
400 self.duration = Some(duration);
401 self
402 }
403
404 pub fn repeat(mut self, times: usize) -> Self {
406 self.repeat = times;
407 self
408 }
409}
410
411#[derive(Debug)]
413pub struct FailureInjector {
414 config: FailureConfig,
415 current_index: AtomicU64,
416 active_failures: AtomicU64,
417 total_injected: AtomicU64,
418 rng: parking_lot::Mutex<StdRng>,
419}
420
421impl FailureInjector {
422 pub fn new(config: FailureConfig) -> Self {
424 let rng = match config.seed {
425 Some(seed) => StdRng::seed_from_u64(seed),
426 None => StdRng::from_entropy(),
427 };
428
429 Self {
430 config,
431 current_index: AtomicU64::new(0),
432 active_failures: AtomicU64::new(0),
433 total_injected: AtomicU64::new(0),
434 rng: parking_lot::Mutex::new(rng),
435 }
436 }
437
438 pub fn should_inject(&self) -> bool {
440 if !self.config.enabled
441 || self.config.failure_rate <= 0.0
442 || self.config.failure_types.is_empty()
443 {
444 return false;
445 }
446
447 let active = self.active_failures.load(Ordering::SeqCst);
449 if active >= self.config.max_concurrent_failures as u64 {
450 return false;
451 }
452
453 let mut rng = self.rng.lock();
454 rng.gen::<f64>() < self.config.failure_rate
455 }
456
457 pub fn next_failure_type(&self) -> FailureType {
459 if self.config.failure_types.is_empty() {
460 return FailureType::Generic("No failures configured".into());
461 }
462
463 let mut rng = self.rng.lock();
464 let idx = rng.gen_range(0..self.config.failure_types.len());
465 self.config.failure_types[idx].clone()
466 }
467
468 pub fn inject(&self) -> Option<FailureType> {
470 if !self.should_inject() {
471 return None;
472 }
473
474 self.active_failures.fetch_add(1, Ordering::SeqCst);
475 self.total_injected.fetch_add(1, Ordering::SeqCst);
476
477 Some(self.next_failure_type())
478 }
479
480 pub fn recover(&self) {
482 let current = self.active_failures.load(Ordering::SeqCst);
483 if current > 0 {
484 self.active_failures.fetch_sub(1, Ordering::SeqCst);
485 }
486 }
487
488 pub fn active_count(&self) -> u64 {
490 self.active_failures.load(Ordering::SeqCst)
491 }
492
493 pub fn total_count(&self) -> u64 {
495 self.total_injected.load(Ordering::SeqCst)
496 }
497
498 pub fn reset(&self) {
500 self.current_index.store(0, Ordering::SeqCst);
501 self.active_failures.store(0, Ordering::SeqCst);
502 self.total_injected.store(0, Ordering::SeqCst);
503 }
504}
505
#[cfg(test)]
mod tests {
    //! Unit tests for the failure configuration, failure kinds, and the injector.

    use super::*;

    #[test]
    fn test_failure_config_default() {
        let config = FailureConfig::default();
        assert!(config.enabled);
        assert_eq!(config.failure_rate, 0.01);
        assert!(!config.failure_types.is_empty());
    }

    #[test]
    fn test_failure_config_presets() {
        let none = FailureConfig::none();
        assert!(!none.enabled);
        assert_eq!(none.failure_rate, 0.0);

        let low = FailureConfig::low();
        assert_eq!(low.failure_rate, 0.001);

        let chaos = FailureConfig::chaos();
        assert_eq!(chaos.failure_rate, 0.10);
    }

    #[test]
    fn test_should_inject_disabled() {
        let config = FailureConfig::none();
        assert!(!config.should_inject());
    }

    #[test]
    fn test_failure_type_all() {
        let all = FailureType::all();
        assert!(all.len() >= 10);
    }

    #[test]
    fn test_failure_type_categories() {
        let network = FailureType::network();
        assert!(network.contains(&FailureType::Timeout));

        let protocol = FailureType::protocol();
        assert!(protocol.contains(&FailureType::ProtocolError));

        let resource = FailureType::resource();
        assert!(resource.contains(&FailureType::OutOfMemory));
    }

    #[test]
    fn test_failure_type_recoverable() {
        assert!(FailureType::Timeout.is_recoverable());
        assert!(!FailureType::ProtocolError.is_recoverable());
    }

    #[test]
    fn test_retry_delay() {
        let timeout = FailureType::Timeout;
        assert!(timeout.suggested_retry_delay().is_some());

        let protocol = FailureType::ProtocolError;
        assert!(protocol.suggested_retry_delay().is_none());
    }

    #[test]
    fn test_failure_schedule_random() {
        let schedule = FailureSchedule::Random {
            min_interval: Duration::from_secs(5),
            max_interval: Duration::from_secs(30),
        };
        // `matches!` only yields a bool; it has to be asserted to check anything.
        assert!(matches!(schedule, FailureSchedule::Random { .. }));
    }

    #[test]
    fn test_failure_schedule_periodic() {
        let schedule = FailureSchedule::Periodic {
            interval: Duration::from_secs(60),
            duration: Duration::from_secs(10),
        };
        assert!(matches!(schedule, FailureSchedule::Periodic { .. }));
    }

    #[test]
    fn test_failure_injector() {
        let config = FailureConfig::new(1.0).with_types(vec![FailureType::Timeout]);

        let injector = FailureInjector::new(config);

        let failure = injector.inject();
        assert!(failure.is_some());
        assert_eq!(injector.active_count(), 1);
        assert_eq!(injector.total_count(), 1);

        injector.recover();
        assert_eq!(injector.active_count(), 0);

        injector.reset();
        assert_eq!(injector.total_count(), 0);
    }

    #[test]
    fn test_failure_injector_max_concurrent() {
        let mut config = FailureConfig::new(1.0);
        config.max_concurrent_failures = 2;

        let injector = FailureInjector::new(config);

        // With a rate of 1.0 every attempt below the cap must succeed.
        assert!(injector.inject().is_some());
        assert!(injector.inject().is_some());
        assert_eq!(injector.active_count(), 2);

        // At the cap, both the probe and the injection itself must decline.
        assert!(!injector.should_inject());
        assert!(injector.inject().is_none());
        assert_eq!(injector.active_count(), 2);
        assert_eq!(injector.total_count(), 2);

        // Recovering one failure frees a slot again.
        injector.recover();
        assert!(injector.inject().is_some());
        assert_eq!(injector.active_count(), 2);
    }

    #[test]
    fn test_failure_type_display() {
        let failure = FailureType::Timeout;
        assert_eq!(failure.to_string(), "Request timed out");

        let slow = FailureType::SlowResponse {
            delay: Duration::from_secs(5),
        };
        assert_eq!(slow.to_string(), "Response delayed significantly");
    }

    #[test]
    fn test_scheduled_failure() {
        let failure =
            ScheduledFailure::new(Duration::from_secs(10), FailureType::NetworkPartition)
                .for_duration(Duration::from_secs(30))
                .repeat(3);

        assert_eq!(failure.at, Duration::from_secs(10));
        assert_eq!(failure.duration, Some(Duration::from_secs(30)));
        assert_eq!(failure.repeat, 3);
    }
}