// vortex_core/fault_config.rs

//! Fault configuration DSL — declarative fault injection rules.
//!
//! Users describe fault injection scenarios in a structured config that can
//! be serialized to/from YAML, TOML, or JSON. The [`FaultConfig`] is used by
//! all layers (SDK traits, syscall interceptor, WASM sandbox) to decide when
//! and how to inject faults.
7
8use serde::{Deserialize, Serialize};
9
10/// Top-level fault configuration.
11///
12/// ```
13/// # use vortex_core::FaultConfig;
14/// let yaml = r#"
15///   fs:
16///     rules:
17///       - path: "*.wal"
18///         op: write
19///         error: enospc
20///         after_bytes: 4096
21///         probability: 0.05
22///   clock:
23///     drift_us_per_sec: 200
24///     jitter_us: 50
25///   alloc:
26///     fail_probability: 0.001
27///     hard_limit_bytes: 67108864
28///   process:
29///     rules:
30///       - command: "gcc"
31///         fault: hang
32///         after_us: 5000000
33///   threading:
34///     strategy: random
35///     preempt_probability: 0.1
36/// "#;
37/// ```
38#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
39pub struct FaultConfig {
40    /// Filesystem fault rules.
41    #[serde(default)]
42    pub fs: FsFaultConfig,
43    /// Clock fault configuration.
44    #[serde(default)]
45    pub clock: ClockFaultConfig,
46    /// Memory allocator fault configuration.
47    #[serde(default)]
48    pub alloc: AllocFaultConfig,
49    /// Subprocess fault rules.
50    #[serde(default)]
51    pub process: ProcessFaultConfig,
52    /// Network fault rules (for local loopback interception).
53    #[serde(default)]
54    pub network: NetworkFaultConfig,
55    /// Thread scheduling configuration (Layer 2 ptrace only).
56    #[serde(default)]
57    pub threading: ThreadSchedulingConfig,
58}
59
60impl FaultConfig {
61    /// Create an empty configuration (no faults injected).
62    pub fn none() -> Self {
63        Self {
64            fs: FsFaultConfig::default(),
65            clock: ClockFaultConfig::default(),
66            alloc: AllocFaultConfig::default(),
67            process: ProcessFaultConfig::default(),
68            network: NetworkFaultConfig::default(),
69            threading: ThreadSchedulingConfig::default(),
70        }
71    }
72
73    /// Merge two configs: `overlay` values take precedence over `base`.
74    /// Rules are concatenated (not replaced).
75    pub fn merge(base: &FaultConfig, overlay: &FaultConfig) -> FaultConfig {
76        FaultConfig {
77            fs: FsFaultConfig {
78                rules: [base.fs.rules.clone(), overlay.fs.rules.clone()].concat(),
79            },
80            clock: if overlay.clock != ClockFaultConfig::default() {
81                overlay.clock.clone()
82            } else {
83                base.clock.clone()
84            },
85            alloc: if overlay.alloc != AllocFaultConfig::default() {
86                overlay.alloc.clone()
87            } else {
88                base.alloc.clone()
89            },
90            process: ProcessFaultConfig {
91                rules: [base.process.rules.clone(), overlay.process.rules.clone()].concat(),
92            },
93            network: NetworkFaultConfig {
94                rules: [base.network.rules.clone(), overlay.network.rules.clone()].concat(),
95            },
96            threading: if overlay.threading != ThreadSchedulingConfig::default() {
97                overlay.threading.clone()
98            } else {
99                base.threading.clone()
100            },
101        }
102    }
103
104    /// Validate the configuration, returning a list of errors.
105    pub fn validate(&self) -> Vec<ConfigError> {
106        let mut errors = Vec::new();
107        for (i, rule) in self.fs.rules.iter().enumerate() {
108            if rule.probability < 0.0 || rule.probability > 1.0 {
109                errors.push(ConfigError {
110                    path: format!("fs.rules[{i}].probability"),
111                    message: format!(
112                        "probability must be in [0.0, 1.0], got {}",
113                        rule.probability
114                    ),
115                });
116            }
117            if rule.path.is_empty() {
118                errors.push(ConfigError {
119                    path: format!("fs.rules[{i}].path"),
120                    message: "path pattern must not be empty".into(),
121                });
122            }
123        }
124        if self.alloc.fail_probability < 0.0 || self.alloc.fail_probability > 1.0 {
125            errors.push(ConfigError {
126                path: "alloc.fail_probability".into(),
127                message: format!("must be in [0.0, 1.0], got {}", self.alloc.fail_probability),
128            });
129        }
130        if self.clock.jitter_us < 0 {
131            errors.push(ConfigError {
132                path: "clock.jitter_us".into(),
133                message: format!("jitter must be non-negative, got {}", self.clock.jitter_us),
134            });
135        }
136        for (i, rule) in self.process.rules.iter().enumerate() {
137            if rule.command.is_empty() {
138                errors.push(ConfigError {
139                    path: format!("process.rules[{i}].command"),
140                    message: "command pattern must not be empty".into(),
141                });
142            }
143        }
144        if self.threading.preempt_probability < 0.0 || self.threading.preempt_probability > 1.0 {
145            errors.push(ConfigError {
146                path: "threading.preempt_probability".into(),
147                message: format!(
148                    "must be in [0.0, 1.0], got {}",
149                    self.threading.preempt_probability
150                ),
151            });
152        }
153        errors
154    }
155
156    /// Returns `true` if any fault injection is configured.
157    pub fn has_faults(&self) -> bool {
158        !self.fs.rules.is_empty()
159            || self.clock != ClockFaultConfig::default()
160            || self.alloc != AllocFaultConfig::default()
161            || !self.process.rules.is_empty()
162            || !self.network.rules.is_empty()
163            || self.threading.is_enabled()
164    }
165}
166
167impl Default for FaultConfig {
168    fn default() -> Self {
169        Self::none()
170    }
171}
172
// ---------------------------------------------------------------------------
// Filesystem faults
// ---------------------------------------------------------------------------
176
177/// Filesystem fault injection configuration.
178#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
179pub struct FsFaultConfig {
180    /// Ordered list of fault rules (first matching rule wins).
181    #[serde(default)]
182    pub rules: Vec<FsFaultRule>,
183}
184
185/// A single filesystem fault rule.
186#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
187pub struct FsFaultRule {
188    /// Glob pattern for matching file paths (e.g. `"*.wal"`, `"/data/**"`).
189    pub path: String,
190    /// Which operation to fault.
191    #[serde(default)]
192    pub op: FsOp,
193    /// What error to inject.
194    #[serde(default)]
195    pub error: FsError,
196    /// Inject error after this many bytes have been written (0 = immediately).
197    #[serde(default)]
198    pub after_bytes: u64,
199    /// Probability of fault injection per operation [0.0, 1.0].
200    #[serde(default = "default_probability")]
201    pub probability: f64,
202}
203
/// Serde default for the `probability` field of fault rules: `1.0`.
fn default_probability() -> f64 {
    const DEFAULT: f64 = 1.0;
    DEFAULT
}
207
208/// Filesystem operation to target.
209#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
210#[serde(rename_all = "snake_case")]
211pub enum FsOp {
212    Read,
213    Write,
214    Open,
215    Fsync,
216    Rename,
217    Delete,
218    /// Match any operation.
219    #[default]
220    Any,
221}
222
223/// Filesystem error to inject.
224#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
225#[serde(rename_all = "snake_case")]
226pub enum FsError {
227    /// Disk full (ENOSPC).
228    Enospc,
229    /// I/O error (EIO).
230    #[default]
231    Eio,
232    /// Permission denied (EACCES).
233    Eacces,
234    /// Torn write: partial write then error.
235    TornWrite,
236    /// Byte corruption in written data.
237    Corrupt,
238    /// Delayed fsync: fsync becomes a no-op, simulating data loss on crash.
239    /// Writes succeed but are never durably committed.
240    DelayedFsync,
241}
242
// ---------------------------------------------------------------------------
// Clock faults
// ---------------------------------------------------------------------------
246
247/// Clock fault injection configuration.
248#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
249pub struct ClockFaultConfig {
250    /// Clock drift in microseconds per second of logical time.
251    #[serde(default)]
252    pub drift_us_per_sec: i64,
253    /// Random jitter added to each clock read, in microseconds.
254    #[serde(default)]
255    pub jitter_us: i64,
256}
257
// ---------------------------------------------------------------------------
// Allocator faults
// ---------------------------------------------------------------------------
261
262/// Memory allocator fault injection configuration.
263#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
264pub struct AllocFaultConfig {
265    /// Probability of any single allocation failing [0.0, 1.0].
266    #[serde(default)]
267    pub fail_probability: f64,
268    /// Hard limit on total heap bytes. All allocations after this are rejected.
269    /// 0 = no limit.
270    #[serde(default)]
271    pub hard_limit_bytes: u64,
272    /// Only fail allocations larger than this size (bytes). 0 = all sizes.
273    #[serde(default)]
274    pub min_fail_size: u64,
275}
276
// ---------------------------------------------------------------------------
// Process faults
// ---------------------------------------------------------------------------
280
281/// Subprocess fault injection configuration.
282#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
283pub struct ProcessFaultConfig {
284    /// Ordered list of subprocess fault rules.
285    #[serde(default)]
286    pub rules: Vec<ProcessFaultRule>,
287}
288
289/// A single subprocess fault rule.
290#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
291pub struct ProcessFaultRule {
292    /// Glob pattern for matching command names (e.g. `"gcc"`, `"npm*"`).
293    pub command: String,
294    /// What kind of fault to inject.
295    #[serde(default)]
296    pub fault: ProcessFault,
297    /// Inject the fault after this many microseconds of simulated runtime.
298    /// 0 = immediately.
299    #[serde(default)]
300    pub after_us: u64,
301}
302
303/// Types of subprocess faults.
304#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
305#[serde(rename_all = "snake_case")]
306pub enum ProcessFault {
307    /// Process hangs indefinitely (never exits).
308    Hang,
309    /// Process is killed by SIGKILL (exit code 137).
310    Sigkill,
311    /// Process produces corrupted stdout.
312    CorruptStdout,
313    /// Process exits with a non-zero exit code.
314    #[default]
315    ExitError,
316}
317
// ---------------------------------------------------------------------------
// Network faults
// ---------------------------------------------------------------------------
321
322/// Network fault injection configuration (for local loopback interception).
323#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
324pub struct NetworkFaultConfig {
325    /// Ordered list of network fault rules.
326    #[serde(default)]
327    pub rules: Vec<NetworkFaultRule>,
328}
329
330/// A single network fault rule.
331#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
332pub struct NetworkFaultRule {
333    /// Target address pattern (e.g. `"127.0.0.1:5432"`, `"*:8080"`).
334    pub target: String,
335    /// What kind of fault to inject.
336    #[serde(default)]
337    pub fault: NetworkFault,
338    /// Probability of this fault being injected per connection/packet.
339    #[serde(default = "default_probability")]
340    pub probability: f64,
341    /// Bandwidth limit in bytes per second (only applies to BandwidthLimit fault).
342    /// 0 means unbounded.
343    #[serde(default)]
344    pub limit_bytes_per_sec: u64,
345}
346
347/// Types of network faults.
348#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
349#[serde(rename_all = "snake_case")]
350pub enum NetworkFault {
351    /// Connection refused.
352    #[default]
353    ConnRefused,
354    /// Connection timeout.
355    ConnTimeout,
356    /// Packet dropped silently.
357    PacketDrop,
358    /// Packet delay.
359    PacketDelay,
360    /// Packet data corrupted.
361    PacketCorrupt,
362    /// Bandwidth throttling based on bytes per second.
363    BandwidthLimit,
364}
365
// ---------------------------------------------------------------------------
// Thread scheduling
// ---------------------------------------------------------------------------
369
370/// Thread scheduling configuration for deterministic thread interleaving.
371///
372/// Controls how the ptrace supervisor schedules threads when multiple are
373/// runnable. Only effective in Layer 2 (ptrace interceptor).
374#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
375pub struct ThreadSchedulingConfig {
376    /// Scheduling strategy for choosing which thread to run next.
377    #[serde(default)]
378    pub strategy: SchedulingStrategy,
379    /// Probability of preempting a thread at each syscall boundary [0.0, 1.0].
380    /// Higher values explore more interleavings but increase overhead.
381    #[serde(default)]
382    pub preempt_probability: f64,
383}
384
385impl Default for ThreadSchedulingConfig {
386    fn default() -> Self {
387        Self {
388            strategy: SchedulingStrategy::default(),
389            preempt_probability: 0.0,
390        }
391    }
392}
393
394impl ThreadSchedulingConfig {
395    /// Returns `true` if deterministic thread scheduling is enabled.
396    pub fn is_enabled(&self) -> bool {
397        self.preempt_probability > 0.0
398    }
399}
400
401/// Strategy for choosing which thread to schedule next.
402#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
403#[serde(rename_all = "snake_case")]
404pub enum SchedulingStrategy {
405    /// Seed-driven random scheduling: use DetRng to pick the next thread.
406    #[default]
407    Random,
408    /// Fixed round-robin: threads execute in a deterministic rotation.
409    RoundRobin,
410    /// Adversarial: always pick the thread most likely to cause races
411    /// (e.g., the thread that last accessed shared state).
412    Adversarial,
413}
414
// ---------------------------------------------------------------------------
// Validation errors
// ---------------------------------------------------------------------------
418
/// A validation error in a fault configuration.
///
/// Implements [`std::error::Error`], so it can be boxed as
/// `Box<dyn std::error::Error>` and propagated with `?`. Its `Display`
/// output has the form `"<path>: <message>"`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ConfigError {
    /// Dot-separated path to the offending field (e.g. `"fs.rules[0].probability"`).
    pub path: String,
    /// Human-readable error message.
    pub message: String,
}

impl std::fmt::Display for ConfigError {
    /// Writes the error as `path: message`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}: {}", self.path, self.message)
    }
}

// No source error to chain, so the trait's default methods are sufficient.
impl std::error::Error for ConfigError {}
433
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a filesystem rule from its five fields.
    fn fs_rule(
        path: &str,
        op: FsOp,
        error: FsError,
        after_bytes: u64,
        probability: f64,
    ) -> FsFaultRule {
        FsFaultRule {
            path: path.into(),
            op,
            error,
            after_bytes,
            probability,
        }
    }

    /// Config that holds only the given filesystem rules.
    fn fs_only(rules: Vec<FsFaultRule>) -> FaultConfig {
        FaultConfig {
            fs: FsFaultConfig { rules },
            ..FaultConfig::none()
        }
    }

    /// Config that holds only the given clock settings.
    fn clock_only(drift_us_per_sec: i64, jitter_us: i64) -> FaultConfig {
        FaultConfig {
            clock: ClockFaultConfig {
                drift_us_per_sec,
                jitter_us,
            },
            ..FaultConfig::none()
        }
    }

    /// Config that holds only the given thread scheduling settings.
    fn threading_only(strategy: SchedulingStrategy, preempt_probability: f64) -> FaultConfig {
        FaultConfig {
            threading: ThreadSchedulingConfig {
                strategy,
                preempt_probability,
            },
            ..FaultConfig::none()
        }
    }

    #[test]
    fn test_default_config_has_no_faults() {
        let empty = FaultConfig::none();
        assert!(!empty.has_faults());
        assert!(empty.validate().is_empty());
    }

    #[test]
    fn test_config_with_fs_fault() {
        let rule = fs_rule("*.wal", FsOp::Write, FsError::Enospc, 4096, 0.05);
        let cfg = fs_only(vec![rule]);
        assert!(cfg.has_faults());
        assert!(cfg.validate().is_empty());
    }

    #[test]
    fn test_validate_bad_probability() {
        // A probability of 1.5 is invalid.
        let rule = fs_rule("*.log", FsOp::Write, FsError::Eio, 0, 1.5);
        let errors = fs_only(vec![rule]).validate();
        assert_eq!(errors.len(), 1);
        assert!(errors[0].path.contains("probability"));
    }

    #[test]
    fn test_validate_empty_path() {
        // An empty path pattern is invalid.
        let rule = fs_rule("", FsOp::Read, FsError::Eio, 0, 0.5);
        let errors = fs_only(vec![rule]).validate();
        assert_eq!(errors.len(), 1);
        assert!(errors[0].path.contains("path"));
    }

    #[test]
    fn test_merge_concatenates_rules() {
        let base = fs_only(vec![fs_rule(
            "*.wal",
            FsOp::Write,
            FsError::Enospc,
            4096,
            0.05,
        )]);
        let overlay = fs_only(vec![fs_rule("*.log", FsOp::Read, FsError::Eio, 0, 0.01)]);

        let merged = FaultConfig::merge(&base, &overlay);
        assert_eq!(merged.fs.rules.len(), 2);
        assert_eq!(merged.fs.rules[0].path, "*.wal");
        assert_eq!(merged.fs.rules[1].path, "*.log");
    }

    #[test]
    fn test_merge_overlay_clock_takes_precedence() {
        let base = clock_only(100, 50);
        let overlay = clock_only(200, 10);

        let merged = FaultConfig::merge(&base, &overlay);
        assert_eq!(merged.clock.drift_us_per_sec, 200);
        assert_eq!(merged.clock.jitter_us, 10);
    }

    #[test]
    fn test_serde_roundtrip() {
        let fs = FsFaultConfig {
            rules: vec![fs_rule(
                "/data/*.wal",
                FsOp::Write,
                FsError::TornWrite,
                1024,
                0.02,
            )],
        };
        let clock = ClockFaultConfig {
            drift_us_per_sec: 200,
            jitter_us: 50,
        };
        let alloc = AllocFaultConfig {
            fail_probability: 0.001,
            hard_limit_bytes: 67_108_864,
            min_fail_size: 0,
        };
        let process = ProcessFaultConfig {
            rules: vec![ProcessFaultRule {
                command: "gcc".into(),
                fault: ProcessFault::Hang,
                after_us: 5_000_000,
            }],
        };
        let network = NetworkFaultConfig {
            rules: vec![NetworkFaultRule {
                target: "127.0.0.1:5432".into(),
                fault: NetworkFault::ConnTimeout,
                probability: 0.1,
                limit_bytes_per_sec: 0,
            }],
        };
        let threading = ThreadSchedulingConfig {
            strategy: SchedulingStrategy::Random,
            preempt_probability: 0.1,
        };
        let cfg = FaultConfig {
            fs,
            clock,
            alloc,
            process,
            network,
            threading,
        };

        let json = serde_json::to_string_pretty(&cfg).unwrap();
        let parsed: FaultConfig = serde_json::from_str(&json).unwrap();
        assert_eq!(cfg, parsed);
    }

    #[test]
    fn test_threading_config_default_disabled() {
        let threading = FaultConfig::none().threading;
        assert!(!threading.is_enabled());
        assert_eq!(threading.strategy, SchedulingStrategy::Random);
        assert_eq!(threading.preempt_probability, 0.0);
    }

    #[test]
    fn test_threading_config_enabled() {
        let cfg = threading_only(SchedulingStrategy::Adversarial, 0.5);
        assert!(cfg.threading.is_enabled());
        assert!(cfg.has_faults());
    }

    #[test]
    fn test_validate_bad_preempt_probability() {
        let errors = threading_only(SchedulingStrategy::Random, 1.5).validate();
        assert_eq!(errors.len(), 1);
        assert!(errors[0].path.contains("threading"));
    }

    #[test]
    fn test_merge_threading_overlay() {
        let base = FaultConfig::none();
        let overlay = threading_only(SchedulingStrategy::RoundRobin, 0.3);

        let merged = FaultConfig::merge(&base, &overlay);
        assert_eq!(merged.threading.strategy, SchedulingStrategy::RoundRobin);
        assert_eq!(merged.threading.preempt_probability, 0.3);
    }

    #[test]
    fn test_serde_threading_roundtrip_json() {
        let json = r#"{
            "threading": {
                "strategy": "adversarial",
                "preempt_probability": 0.25
            }
        }"#;
        let cfg: FaultConfig = serde_json::from_str(json).unwrap();
        assert_eq!(cfg.threading.strategy, SchedulingStrategy::Adversarial);
        assert!((cfg.threading.preempt_probability - 0.25).abs() < f64::EPSILON);
    }
}