Skip to main content

mabi_core/simulation/
failure.rs

1//! Failure injection for chaos engineering.
2//!
3//! Simulates various failure conditions to test system resilience.
4
5use std::sync::atomic::{AtomicU64, Ordering};
6use std::time::Duration;
7
8use rand::prelude::*;
9use serde::{Deserialize, Serialize};
10
11/// Failure injection configuration.
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct FailureConfig {
14    /// Whether failure injection is enabled.
15    pub enabled: bool,
16
17    /// Failure injection rate (0.0 - 1.0).
18    pub failure_rate: f64,
19
20    /// Types of failures to inject.
21    pub failure_types: Vec<FailureType>,
22
23    /// Failure schedule type.
24    pub schedule: FailureSchedule,
25
26    /// Probability of cascading failures.
27    pub cascade_probability: f64,
28
29    /// Time to recover from a failure.
30    pub recovery_time: Duration,
31
32    /// Maximum concurrent failures allowed.
33    pub max_concurrent_failures: usize,
34
35    /// Random seed for reproducibility.
36    #[serde(skip_serializing_if = "Option::is_none")]
37    pub seed: Option<u64>,
38}
39
40impl Default for FailureConfig {
41    fn default() -> Self {
42        Self {
43            enabled: true,
44            failure_rate: 0.01, // 1% failure rate
45            failure_types: vec![
46                FailureType::Timeout,
47                FailureType::ConnectionReset,
48                FailureType::ProtocolError,
49            ],
50            schedule: FailureSchedule::Random {
51                min_interval: Duration::from_secs(10),
52                max_interval: Duration::from_secs(60),
53            },
54            cascade_probability: 0.0,
55            recovery_time: Duration::from_secs(30),
56            max_concurrent_failures: 5,
57            seed: None,
58        }
59    }
60}
61
62impl FailureConfig {
63    /// Create a new failure config.
64    pub fn new(rate: f64) -> Self {
65        Self {
66            failure_rate: rate.clamp(0.0, 1.0),
67            ..Default::default()
68        }
69    }
70
71    /// No failures (disabled).
72    pub fn none() -> Self {
73        Self {
74            enabled: false,
75            failure_rate: 0.0,
76            failure_types: vec![],
77            ..Default::default()
78        }
79    }
80
81    /// Low failure rate (0.1%).
82    pub fn low() -> Self {
83        Self {
84            failure_rate: 0.001,
85            ..Default::default()
86        }
87    }
88
89    /// Medium failure rate (1%).
90    pub fn medium() -> Self {
91        Self {
92            failure_rate: 0.01,
93            ..Default::default()
94        }
95    }
96
97    /// High failure rate (5%).
98    pub fn high() -> Self {
99        Self {
100            failure_rate: 0.05,
101            ..Default::default()
102        }
103    }
104
105    /// Chaos mode (10%).
106    pub fn chaos() -> Self {
107        Self {
108            failure_rate: 0.10,
109            failure_types: FailureType::all().to_vec(),
110            ..Default::default()
111        }
112    }
113
114    /// Set failure types.
115    pub fn with_types(mut self, types: Vec<FailureType>) -> Self {
116        self.failure_types = types;
117        self
118    }
119
120    /// Set schedule.
121    pub fn with_schedule(mut self, schedule: FailureSchedule) -> Self {
122        self.schedule = schedule;
123        self
124    }
125
126    /// Set random seed.
127    pub fn with_seed(mut self, seed: u64) -> Self {
128        self.seed = Some(seed);
129        self
130    }
131
132    /// Check if a failure should be injected.
133    pub fn should_inject(&self) -> bool {
134        if !self.enabled || self.failure_rate <= 0.0 || self.failure_types.is_empty() {
135            return false;
136        }
137
138        let mut rng = match self.seed {
139            Some(seed) => StdRng::seed_from_u64(seed),
140            None => StdRng::from_entropy(),
141        };
142
143        rng.gen::<f64>() < self.failure_rate
144    }
145
146    /// Get the next failure type to inject.
147    pub fn next_failure_type(&self) -> FailureType {
148        if self.failure_types.is_empty() {
149            return FailureType::Generic("No failures configured".into());
150        }
151
152        let mut rng = match self.seed {
153            Some(seed) => StdRng::seed_from_u64(seed),
154            None => StdRng::from_entropy(),
155        };
156        let idx = rng.gen_range(0..self.failure_types.len());
157        self.failure_types[idx].clone()
158    }
159}
160
161/// Failure schedule types.
162#[derive(Debug, Clone, Serialize, Deserialize)]
163pub enum FailureSchedule {
164    /// Random intervals between failures.
165    Random {
166        /// Minimum time between failures.
167        min_interval: Duration,
168        /// Maximum time between failures.
169        max_interval: Duration,
170    },
171
172    /// Fixed periodic failures.
173    Periodic {
174        /// Interval between failures.
175        interval: Duration,
176        /// Duration of each failure.
177        duration: Duration,
178    },
179
180    /// Scheduled at specific times.
181    Scheduled {
182        /// List of scheduled failures.
183        entries: Vec<ScheduledFailure>,
184    },
185
186    /// Burst of failures.
187    Burst {
188        /// Number of failures in burst.
189        count: usize,
190        /// Time between failures in burst.
191        burst_interval: Duration,
192        /// Time between bursts.
193        burst_pause: Duration,
194    },
195}
196
197impl Default for FailureSchedule {
198    fn default() -> Self {
199        Self::Random {
200            min_interval: Duration::from_secs(10),
201            max_interval: Duration::from_secs(60),
202        }
203    }
204}
205
206/// Types of failures that can be injected.
207#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
208pub enum FailureType {
209    /// Request timeout.
210    Timeout,
211
212    /// Connection reset by peer.
213    ConnectionReset,
214
215    /// Connection refused.
216    ConnectionRefused,
217
218    /// Protocol error.
219    ProtocolError,
220
221    /// Invalid response.
222    InvalidResponse,
223
224    /// Partial response.
225    PartialResponse,
226
227    /// Slow response (latency spike).
228    SlowResponse {
229        /// Delay to add.
230        delay: Duration,
231    },
232
233    /// Out of memory simulation.
234    OutOfMemory,
235
236    /// Disk full simulation.
237    DiskFull,
238
239    /// Device offline.
240    DeviceOffline,
241
242    /// Network partition.
243    NetworkPartition,
244
245    /// Corrupted data.
246    CorruptedData,
247
248    /// Rate limited.
249    RateLimited,
250
251    /// Authentication failure.
252    AuthFailure,
253
254    /// Generic failure with custom message.
255    Generic(String),
256}
257
258impl FailureType {
259    /// Get all failure types (except parameterized ones).
260    pub fn all() -> &'static [FailureType] {
261        &[
262            FailureType::Timeout,
263            FailureType::ConnectionReset,
264            FailureType::ConnectionRefused,
265            FailureType::ProtocolError,
266            FailureType::InvalidResponse,
267            FailureType::PartialResponse,
268            FailureType::OutOfMemory,
269            FailureType::DiskFull,
270            FailureType::DeviceOffline,
271            FailureType::NetworkPartition,
272            FailureType::CorruptedData,
273            FailureType::RateLimited,
274            FailureType::AuthFailure,
275        ]
276    }
277
278    /// Get network-related failures.
279    pub fn network() -> Vec<FailureType> {
280        vec![
281            FailureType::Timeout,
282            FailureType::ConnectionReset,
283            FailureType::ConnectionRefused,
284            FailureType::NetworkPartition,
285        ]
286    }
287
288    /// Get protocol-related failures.
289    pub fn protocol() -> Vec<FailureType> {
290        vec![
291            FailureType::ProtocolError,
292            FailureType::InvalidResponse,
293            FailureType::PartialResponse,
294            FailureType::CorruptedData,
295        ]
296    }
297
298    /// Get resource-related failures.
299    pub fn resource() -> Vec<FailureType> {
300        vec![
301            FailureType::OutOfMemory,
302            FailureType::DiskFull,
303            FailureType::RateLimited,
304        ]
305    }
306
307    /// Get a description of this failure type.
308    pub fn description(&self) -> &str {
309        match self {
310            Self::Timeout => "Request timed out",
311            Self::ConnectionReset => "Connection reset by peer",
312            Self::ConnectionRefused => "Connection refused",
313            Self::ProtocolError => "Protocol violation",
314            Self::InvalidResponse => "Invalid response received",
315            Self::PartialResponse => "Incomplete response",
316            Self::SlowResponse { .. } => "Response delayed significantly",
317            Self::OutOfMemory => "Out of memory",
318            Self::DiskFull => "Disk full",
319            Self::DeviceOffline => "Device went offline",
320            Self::NetworkPartition => "Network partition detected",
321            Self::CorruptedData => "Data corruption detected",
322            Self::RateLimited => "Rate limit exceeded",
323            Self::AuthFailure => "Authentication failed",
324            Self::Generic(msg) => msg,
325        }
326    }
327
328    /// Check if this failure is recoverable.
329    pub fn is_recoverable(&self) -> bool {
330        match self {
331            Self::Timeout
332            | Self::ConnectionReset
333            | Self::SlowResponse { .. }
334            | Self::RateLimited
335            | Self::DeviceOffline
336            | Self::NetworkPartition => true,
337
338            Self::ConnectionRefused
339            | Self::ProtocolError
340            | Self::InvalidResponse
341            | Self::PartialResponse
342            | Self::OutOfMemory
343            | Self::DiskFull
344            | Self::CorruptedData
345            | Self::AuthFailure
346            | Self::Generic(_) => false,
347        }
348    }
349
350    /// Get suggested retry delay for recoverable failures.
351    pub fn suggested_retry_delay(&self) -> Option<Duration> {
352        if !self.is_recoverable() {
353            return None;
354        }
355
356        Some(match self {
357            Self::Timeout => Duration::from_secs(5),
358            Self::ConnectionReset => Duration::from_millis(500),
359            Self::SlowResponse { .. } => Duration::from_secs(1),
360            Self::RateLimited => Duration::from_secs(30),
361            Self::DeviceOffline => Duration::from_secs(10),
362            Self::NetworkPartition => Duration::from_secs(60),
363            _ => Duration::from_secs(1),
364        })
365    }
366}
367
368impl std::fmt::Display for FailureType {
369    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
370        write!(f, "{}", self.description())
371    }
372}
373
374/// A scheduled failure.
375#[derive(Debug, Clone, Serialize, Deserialize)]
376pub struct ScheduledFailure {
377    /// When to inject (offset from start).
378    pub at: Duration,
379    /// Type of failure.
380    pub failure_type: FailureType,
381    /// Duration of the failure (for continuous failures).
382    pub duration: Option<Duration>,
383    /// Number of times to repeat.
384    pub repeat: usize,
385}
386
387impl ScheduledFailure {
388    /// Create a new scheduled failure.
389    pub fn new(at: Duration, failure_type: FailureType) -> Self {
390        Self {
391            at,
392            failure_type,
393            duration: None,
394            repeat: 1,
395        }
396    }
397
398    /// Set the duration.
399    pub fn for_duration(mut self, duration: Duration) -> Self {
400        self.duration = Some(duration);
401        self
402    }
403
404    /// Set repeat count.
405    pub fn repeat(mut self, times: usize) -> Self {
406        self.repeat = times;
407        self
408    }
409}
410
411/// Stateful failure injector for tracking injection state.
412#[derive(Debug)]
413pub struct FailureInjector {
414    config: FailureConfig,
415    current_index: AtomicU64,
416    active_failures: AtomicU64,
417    total_injected: AtomicU64,
418    rng: parking_lot::Mutex<StdRng>,
419}
420
421impl FailureInjector {
422    /// Create a new failure injector.
423    pub fn new(config: FailureConfig) -> Self {
424        let rng = match config.seed {
425            Some(seed) => StdRng::seed_from_u64(seed),
426            None => StdRng::from_entropy(),
427        };
428
429        Self {
430            config,
431            current_index: AtomicU64::new(0),
432            active_failures: AtomicU64::new(0),
433            total_injected: AtomicU64::new(0),
434            rng: parking_lot::Mutex::new(rng),
435        }
436    }
437
438    /// Check if a failure should be injected.
439    pub fn should_inject(&self) -> bool {
440        if !self.config.enabled
441            || self.config.failure_rate <= 0.0
442            || self.config.failure_types.is_empty()
443        {
444            return false;
445        }
446
447        // Check max concurrent failures
448        let active = self.active_failures.load(Ordering::SeqCst);
449        if active >= self.config.max_concurrent_failures as u64 {
450            return false;
451        }
452
453        let mut rng = self.rng.lock();
454        rng.gen::<f64>() < self.config.failure_rate
455    }
456
457    /// Get the next failure type.
458    pub fn next_failure_type(&self) -> FailureType {
459        if self.config.failure_types.is_empty() {
460            return FailureType::Generic("No failures configured".into());
461        }
462
463        let mut rng = self.rng.lock();
464        let idx = rng.gen_range(0..self.config.failure_types.len());
465        self.config.failure_types[idx].clone()
466    }
467
468    /// Inject a failure and track it.
469    pub fn inject(&self) -> Option<FailureType> {
470        if !self.should_inject() {
471            return None;
472        }
473
474        self.active_failures.fetch_add(1, Ordering::SeqCst);
475        self.total_injected.fetch_add(1, Ordering::SeqCst);
476
477        Some(self.next_failure_type())
478    }
479
480    /// Mark a failure as recovered.
481    pub fn recover(&self) {
482        let current = self.active_failures.load(Ordering::SeqCst);
483        if current > 0 {
484            self.active_failures.fetch_sub(1, Ordering::SeqCst);
485        }
486    }
487
488    /// Get the number of active failures.
489    pub fn active_count(&self) -> u64 {
490        self.active_failures.load(Ordering::SeqCst)
491    }
492
493    /// Get the total number of injected failures.
494    pub fn total_count(&self) -> u64 {
495        self.total_injected.load(Ordering::SeqCst)
496    }
497
498    /// Reset the injector state.
499    pub fn reset(&self) {
500        self.current_index.store(0, Ordering::SeqCst);
501        self.active_failures.store(0, Ordering::SeqCst);
502        self.total_injected.store(0, Ordering::SeqCst);
503    }
504}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// The default config is enabled at 1% with a non-empty type pool.
    #[test]
    fn test_failure_config_default() {
        let config = FailureConfig::default();
        assert!(config.enabled);
        assert_eq!(config.failure_rate, 0.01);
        assert!(!config.failure_types.is_empty());
    }

    /// Each preset carries the rate its name promises.
    #[test]
    fn test_failure_config_presets() {
        let none = FailureConfig::none();
        assert!(!none.enabled);
        assert_eq!(none.failure_rate, 0.0);

        let low = FailureConfig::low();
        assert_eq!(low.failure_rate, 0.001);

        let chaos = FailureConfig::chaos();
        assert_eq!(chaos.failure_rate, 0.10);
    }

    /// A disabled config never asks for an injection.
    #[test]
    fn test_should_inject_disabled() {
        let config = FailureConfig::none();
        assert!(!config.should_inject());
    }

    #[test]
    fn test_failure_type_all() {
        let all = FailureType::all();
        assert!(all.len() >= 10);
    }

    #[test]
    fn test_failure_type_categories() {
        let network = FailureType::network();
        assert!(network.contains(&FailureType::Timeout));

        let protocol = FailureType::protocol();
        assert!(protocol.contains(&FailureType::ProtocolError));

        let resource = FailureType::resource();
        assert!(resource.contains(&FailureType::OutOfMemory));
    }

    #[test]
    fn test_failure_type_recoverable() {
        assert!(FailureType::Timeout.is_recoverable());
        assert!(!FailureType::ProtocolError.is_recoverable());
    }

    #[test]
    fn test_retry_delay() {
        let timeout = FailureType::Timeout;
        assert!(timeout.suggested_retry_delay().is_some());

        let protocol = FailureType::ProtocolError;
        assert!(protocol.suggested_retry_delay().is_none());
    }

    #[test]
    fn test_failure_schedule_random() {
        let schedule = FailureSchedule::Random {
            min_interval: Duration::from_secs(5),
            max_interval: Duration::from_secs(30),
        };
        // `matches!` only yields a bool; without `assert!` the test checks nothing.
        assert!(matches!(schedule, FailureSchedule::Random { .. }));
    }

    #[test]
    fn test_failure_schedule_periodic() {
        let schedule = FailureSchedule::Periodic {
            interval: Duration::from_secs(60),
            duration: Duration::from_secs(10),
        };
        // `matches!` only yields a bool; without `assert!` the test checks nothing.
        assert!(matches!(schedule, FailureSchedule::Periodic { .. }));
    }

    #[test]
    fn test_failure_injector() {
        let config = FailureConfig::new(1.0) // 100% rate for testing
            .with_types(vec![FailureType::Timeout]);

        let injector = FailureInjector::new(config);

        // Should be able to inject
        let failure = injector.inject();
        assert_eq!(failure, Some(FailureType::Timeout));
        assert_eq!(injector.active_count(), 1);
        assert_eq!(injector.total_count(), 1);

        // Recover
        injector.recover();
        assert_eq!(injector.active_count(), 0);

        // Recovering with nothing active must leave the count at zero
        injector.recover();
        assert_eq!(injector.active_count(), 0);

        // Reset
        injector.reset();
        assert_eq!(injector.total_count(), 0);
    }

    #[test]
    fn test_failure_injector_max_concurrent() {
        let mut config = FailureConfig::new(1.0);
        config.max_concurrent_failures = 2;

        let injector = FailureInjector::new(config);

        // Inject up to max
        assert!(injector.inject().is_some());
        assert!(injector.inject().is_some());
        assert_eq!(injector.active_count(), 2);

        // At the limit, further injection must be refused outright and must
        // leave both counters untouched.
        assert!(!injector.should_inject());
        assert!(injector.inject().is_none());
        assert_eq!(injector.active_count(), 2);
        assert_eq!(injector.total_count(), 2);
    }

    #[test]
    fn test_failure_type_display() {
        let failure = FailureType::Timeout;
        assert_eq!(failure.to_string(), "Request timed out");

        let slow = FailureType::SlowResponse {
            delay: Duration::from_secs(5),
        };
        assert_eq!(slow.to_string(), "Response delayed significantly");
    }

    #[test]
    fn test_scheduled_failure() {
        let failure =
            ScheduledFailure::new(Duration::from_secs(10), FailureType::NetworkPartition)
                .for_duration(Duration::from_secs(30))
                .repeat(3);

        assert_eq!(failure.at, Duration::from_secs(10));
        assert_eq!(failure.duration, Some(Duration::from_secs(30)));
        assert_eq!(failure.repeat, 3);
    }
}
652}