moonpool_sim/runner/process.rs
//! Process trait and reboot types for simulation testing.
//!
//! Processes represent the **system under test** — server nodes that can be
//! killed and restarted (rebooted). Each process gets fresh in-memory state
//! on every boot; persistence is only through storage.
//!
//! This is separate from [`Workload`](super::workload::Workload), which
//! represents the **test driver** that survives server reboots.
//!
//! # Usage
//!
//! ```ignore
//! use moonpool_sim::{Process, SimContext, SimulationResult};
//!
//! struct PaxosNode;
//!
//! #[async_trait(?Send)]
//! impl Process for PaxosNode {
//!     fn name(&self) -> &str { "paxos" }
//!     async fn run(&mut self, ctx: &SimContext) -> SimulationResult<()> {
//!         let role = ctx.topology().my_tags().get("role")
//!             .ok_or_else(|| moonpool_sim::SimulationError::InvalidState("missing role tag".into()))?;
//!         // Run based on assigned role from tags...
//!         Ok(())
//!     }
//! }
//! ```
28
29use std::ops::Range;
30
31use async_trait::async_trait;
32
33use crate::SimulationResult;
34
35use super::context::SimContext;
36
37/// A process that participates in simulation as part of the system under test.
38///
39/// Processes are the primary unit of server behavior. A fresh instance is created
40/// from the factory on every boot (first boot and every reboot). State only
41/// persists through storage, not in-memory fields.
42///
43/// The process reads its tags and index from [`SimContext`] to determine its role.
44#[async_trait(?Send)]
45pub trait Process: 'static {
46 /// Name of this process type for reporting.
47 fn name(&self) -> &str;
48
49 /// Run the process. Called on each boot (first boot and every reboot).
50 ///
51 /// The [`SimContext`] has fresh providers each boot. The process should
52 /// bind listeners, establish connections, and run its main loop.
53 ///
54 /// Returns when the process exits voluntarily, or gets cancelled on reboot.
55 async fn run(&mut self, ctx: &SimContext) -> SimulationResult<()>;
56}
57
/// How a process is taken down before it is restarted.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RebootKind {
    /// Cooperative restart: fire the shutdown token, allow a grace period,
    /// drain send buffers, then restart.
    ///
    /// The process observes its `ctx.shutdown()` token firing and gets a grace
    /// period to wind down. A process that is still running when the grace
    /// period ends has its task force-cancelled. Send buffers keep draining
    /// during the grace period (FIN delivery).
    Graceful,

    /// Immediate kill: the task is cancelled and every connection aborts at once.
    ///
    /// Buffers are not drained, so peers see connection reset errors. Storage
    /// data that was not synced may be lost (when per-IP storage scoping is
    /// implemented).
    Crash,

    /// Immediate kill that additionally wipes all storage of the process.
    ///
    /// Behaves like [`Crash`](RebootKind::Crash) and also deletes all
    /// persistent storage owned by this process's IP. Models total data loss,
    /// or a new node joining the cluster.
    CrashAndWipe,
}
81
/// Configuration of the built-in attrition mechanism that reboots processes
/// automatically.
///
/// This is the default source of chaos: during the chaos phase it randomly
/// kills and restarts server processes. Implement
/// [`FaultInjector`](super::fault_injector::FaultInjector) instead when a
/// custom fault injection strategy is needed.
///
/// # Probabilities
///
/// The `prob_*` fields are relative weights and are normalized internally.
/// They do not have to add up to 1.0, but every one of them must be
/// non-negative.
///
/// # Example
///
/// ```ignore
/// Attrition {
///     max_dead: 1,
///     prob_graceful: 0.3,
///     prob_crash: 0.5,
///     prob_wipe: 0.2,
///     recovery_delay_ms: None,
///     grace_period_ms: None,
/// }
/// ```
#[derive(Clone, Debug, PartialEq)]
pub struct Attrition {
    /// Upper bound on how many processes may be dead at the same time.
    ///
    /// While the count of dead (killed but not yet restarted) processes is at
    /// this limit, the attrition injector does not kill another one.
    pub max_dead: usize,

    /// Relative weight of [`RebootKind::Graceful`] reboots.
    pub prob_graceful: f64,

    /// Relative weight of [`RebootKind::Crash`] reboots.
    pub prob_crash: f64,

    /// Relative weight of [`RebootKind::CrashAndWipe`] reboots.
    pub prob_wipe: f64,

    /// Range, in milliseconds, of the delay before a killed process restarts.
    ///
    /// Once a process has been killed (by a crash, or by a force-kill after
    /// the grace period), it is restarted after a seeded random delay drawn
    /// from this range.
    ///
    /// When `None`, the default is `1000..10000` (1-10 seconds).
    pub recovery_delay_ms: Option<Range<usize>>,

    /// Range, in milliseconds, of the grace period used by graceful reboots.
    ///
    /// This is how long a process gets to clean up between the cancellation of
    /// its per-process shutdown token and being force-killed. The actual
    /// duration is a seeded random value from this range.
    ///
    /// When `None`, the default is `2000..5000` (2-5 seconds).
    pub grace_period_ms: Option<Range<usize>>,
}
139
140impl Attrition {
141 /// Choose a [`RebootKind`] based on the configured probabilities using the
142 /// given random value in `[0.0, 1.0)`.
143 pub(crate) fn choose_kind(&self, rand_val: f64) -> RebootKind {
144 let total = self.prob_graceful + self.prob_crash + self.prob_wipe;
145 if total <= 0.0 {
146 return RebootKind::Crash;
147 }
148
149 let normalized = rand_val * total;
150 if normalized < self.prob_graceful {
151 RebootKind::Graceful
152 } else if normalized < self.prob_graceful + self.prob_crash {
153 RebootKind::Crash
154 } else {
155 RebootKind::CrashAndWipe
156 }
157 }
158}