rust_supervisor/runtime/lifecycle.rs
1//! Runtime control plane lifecycle state.
2//!
3//! This module stores health state and final exit reports that a
4//! `SupervisorHandle` can read repeatedly. It does not execute runtime control
5//! loop commands.
6
7use crate::error::types::SupervisorError;
8use serde::{Deserialize, Serialize};
9use std::sync::{Arc, Mutex};
10use std::time::{SystemTime, UNIX_EPOCH};
11use tokio::sync::Notify;
12
/// Stable runtime control plane state.
///
/// `Completed` and `Failed` are the terminal states reported by
/// [`RuntimeControlPlaneState::is_terminal`]; `Starting`, `Alive`, and
/// `ShuttingDown` are non-terminal.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RuntimeControlPlaneState {
    /// Control loop task has been created but is not yet accepting commands.
    Starting,
    /// Control loop can still accept commands.
    Alive,
    /// Control plane has received an explicit shutdown request.
    ShuttingDown,
    /// Control loop completed normally (terminal).
    Completed,
    /// Control loop failed (terminal).
    Failed,
}
27
28impl RuntimeControlPlaneState {
29 /// Returns a low-cardinality state label.
30 ///
31 /// # Arguments
32 ///
33 /// This function has no arguments.
34 ///
35 /// # Returns
36 ///
37 /// Returns a stable state label.
38 ///
39 /// # Examples
40 ///
41 /// ```
42 /// let state = rust_supervisor::runtime::lifecycle::RuntimeControlPlaneState::Alive;
43 /// assert_eq!(state.as_str(), "alive");
44 /// ```
45 pub fn as_str(&self) -> &'static str {
46 match self {
47 Self::Starting => "starting",
48 Self::Alive => "alive",
49 Self::ShuttingDown => "shutting_down",
50 Self::Completed => "completed",
51 Self::Failed => "failed",
52 }
53 }
54
55 /// Returns whether this state is terminal.
56 ///
57 /// # Arguments
58 ///
59 /// This function has no arguments.
60 ///
61 /// # Returns
62 ///
63 /// Returns `true` when the state is terminal.
64 pub fn is_terminal(&self) -> bool {
65 matches!(self, Self::Completed | Self::Failed)
66 }
67}
68
/// Runtime control loop failure reason.
///
/// Carries the same phase, reason, panic, and recoverable values as a failed
/// [`RuntimeExitReport`]; see [`RuntimeExitReport::failure_reason`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RuntimeFailureReason {
    /// Failure phase.
    pub phase: String,
    /// Human-readable failure reason.
    pub reason: String,
    /// Whether the failure came from panic.
    pub panic: bool,
    /// Whether callers can recover by creating a new supervisor.
    pub recoverable: bool,
}
81
82impl RuntimeFailureReason {
83 /// Creates a failure reason.
84 ///
85 /// # Arguments
86 ///
87 /// - `phase`: Failure phase.
88 /// - `reason`: Human-readable reason.
89 /// - `panic`: Whether the failure came from panic.
90 /// - `recoverable`: Whether a new supervisor can recover.
91 ///
92 /// # Returns
93 ///
94 /// Returns a [`RuntimeFailureReason`].
95 ///
96 /// # Examples
97 ///
98 /// ```
99 /// let failure = rust_supervisor::runtime::lifecycle::RuntimeFailureReason::new(
100 /// "watchdog",
101 /// "runtime control loop panic",
102 /// true,
103 /// true,
104 /// );
105 /// assert!(failure.panic);
106 /// ```
107 pub fn new(
108 phase: impl Into<String>,
109 reason: impl Into<String>,
110 panic: bool,
111 recoverable: bool,
112 ) -> Self {
113 Self {
114 phase: phase.into(),
115 reason: reason.into(),
116 panic,
117 recoverable,
118 }
119 }
120}
121
/// Final runtime control loop exit report.
///
/// The [`RuntimeExitReport::completed`] and [`RuntimeExitReport::failed`]
/// constructors set `state` and stamp `completed_at_unix_nanos` at creation
/// time. All fields are public, so the type itself does not enforce that
/// `state` is terminal.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RuntimeExitReport {
    /// Final state, which must be completed or failed.
    pub state: RuntimeControlPlaneState,
    /// Exit phase.
    pub phase: String,
    /// Human-readable exit reason.
    pub reason: String,
    /// Whether callers can recover by creating a new supervisor.
    pub recoverable: bool,
    /// Final report timestamp in Unix epoch nanoseconds.
    pub completed_at_unix_nanos: u128,
    /// Whether the report came from panic.
    pub panic: bool,
}
138
139impl RuntimeExitReport {
140 /// Creates a completed exit report.
141 ///
142 /// # Arguments
143 ///
144 /// - `phase`: Completion phase.
145 /// - `reason`: Human-readable reason.
146 ///
147 /// # Returns
148 ///
149 /// Returns a completed [`RuntimeExitReport`].
150 ///
151 /// # Examples
152 ///
153 /// ```
154 /// let report = rust_supervisor::runtime::lifecycle::RuntimeExitReport::completed(
155 /// "shutdown",
156 /// "operator requested shutdown",
157 /// );
158 /// assert_eq!(report.state.as_str(), "completed");
159 /// ```
160 pub fn completed(phase: impl Into<String>, reason: impl Into<String>) -> Self {
161 Self {
162 state: RuntimeControlPlaneState::Completed,
163 phase: phase.into(),
164 reason: reason.into(),
165 recoverable: false,
166 completed_at_unix_nanos: unix_nanos_now(),
167 panic: false,
168 }
169 }
170
171 /// Creates a failed exit report.
172 ///
173 /// # Arguments
174 ///
175 /// - `phase`: Failure phase.
176 /// - `reason`: Human-readable reason.
177 /// - `panic`: Whether the failure came from panic.
178 /// - `recoverable`: Whether a new supervisor can recover.
179 ///
180 /// # Returns
181 ///
182 /// Returns a failed [`RuntimeExitReport`].
183 pub fn failed(
184 phase: impl Into<String>,
185 reason: impl Into<String>,
186 panic: bool,
187 recoverable: bool,
188 ) -> Self {
189 Self {
190 state: RuntimeControlPlaneState::Failed,
191 phase: phase.into(),
192 reason: reason.into(),
193 recoverable,
194 completed_at_unix_nanos: unix_nanos_now(),
195 panic,
196 }
197 }
198
199 /// Converts this report into a health failure reason.
200 ///
201 /// # Arguments
202 ///
203 /// This function has no arguments.
204 ///
205 /// # Returns
206 ///
207 /// Returns a failure reason when this report represents failure.
208 pub fn failure_reason(&self) -> Option<RuntimeFailureReason> {
209 (self.state == RuntimeControlPlaneState::Failed).then(|| {
210 RuntimeFailureReason::new(
211 self.phase.clone(),
212 self.reason.clone(),
213 self.panic,
214 self.recoverable,
215 )
216 })
217 }
218}
219
/// Health report read by runtime callers.
///
/// A value snapshot produced by [`RuntimeControlPlane::health`]; it does not
/// change after it has been returned.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RuntimeHealthReport {
    /// Whether the control loop can still accept commands.
    ///
    /// Set to `true` only when `state` is `Alive`.
    pub alive: bool,
    /// Current control plane state.
    pub state: RuntimeControlPlaneState,
    /// Control plane startup timestamp in Unix epoch nanoseconds.
    pub started_at_unix_nanos: u128,
    /// Last observation timestamp in Unix epoch nanoseconds.
    pub last_observed_at_unix_nanos: u128,
    /// Structured reason for failed state.
    pub failure: Option<RuntimeFailureReason>,
    /// Final report for completed or failed state.
    pub exit_report: Option<RuntimeExitReport>,
}
236
/// Runtime control plane with repeatable reads.
///
/// Both fields are `Arc`s, so cloning yields another handle onto the same
/// shared state and notifier rather than an independent copy.
#[derive(Debug, Clone)]
pub struct RuntimeControlPlane {
    /// Shared inner state.
    inner: Arc<Mutex<RuntimeControlPlaneInner>>,
    /// Terminal state notifier, signalled when the final exit report is stored.
    notify: Arc<Notify>,
}
245
246impl RuntimeControlPlane {
247 /// Creates new control plane lifecycle state.
248 ///
249 /// # Arguments
250 ///
251 /// This function has no arguments.
252 ///
253 /// # Returns
254 ///
255 /// Returns a [`RuntimeControlPlane`] in starting state.
256 ///
257 /// # Examples
258 ///
259 /// ```
260 /// let control_plane = rust_supervisor::runtime::lifecycle::RuntimeControlPlane::new();
261 /// assert!(!control_plane.is_alive());
262 /// ```
263 pub fn new() -> Self {
264 let now = unix_nanos_now();
265 Self {
266 inner: Arc::new(Mutex::new(RuntimeControlPlaneInner {
267 state: RuntimeControlPlaneState::Starting,
268 started_at_unix_nanos: now,
269 last_observed_at_unix_nanos: now,
270 exit_report: None,
271 failure: None,
272 shutdown_requested_by: None,
273 shutdown_reason: None,
274 })),
275 notify: Arc::new(Notify::new()),
276 }
277 }
278
279 /// Marks the control loop as accepting commands.
280 ///
281 /// # Arguments
282 ///
283 /// This function has no arguments.
284 ///
285 /// # Returns
286 ///
287 /// This function does not return a value.
288 pub fn mark_alive(&self) {
289 let mut inner = self.lock_inner();
290 if !inner.state.is_terminal() {
291 inner.state = RuntimeControlPlaneState::Alive;
292 inner.last_observed_at_unix_nanos = unix_nanos_now();
293 }
294 }
295
296 /// Returns whether the control loop is alive.
297 ///
298 /// # Arguments
299 ///
300 /// This function has no arguments.
301 ///
302 /// # Returns
303 ///
304 /// Returns `true` when ordinary control commands may be sent.
305 pub fn is_alive(&self) -> bool {
306 let mut inner = self.lock_inner();
307 inner.last_observed_at_unix_nanos = unix_nanos_now();
308 inner.state == RuntimeControlPlaneState::Alive
309 }
310
311 /// Reads a health report.
312 ///
313 /// # Arguments
314 ///
315 /// This function has no arguments.
316 ///
317 /// # Returns
318 ///
319 /// Returns a [`RuntimeHealthReport`] value for the current observation.
320 pub fn health(&self) -> RuntimeHealthReport {
321 let mut inner = self.lock_inner();
322 inner.last_observed_at_unix_nanos = unix_nanos_now();
323 RuntimeHealthReport {
324 alive: inner.state == RuntimeControlPlaneState::Alive,
325 state: inner.state,
326 started_at_unix_nanos: inner.started_at_unix_nanos,
327 last_observed_at_unix_nanos: inner.last_observed_at_unix_nanos,
328 failure: inner.failure.clone(),
329 exit_report: inner.exit_report.clone(),
330 }
331 }
332
333 /// Marks that shutdown has been requested.
334 ///
335 /// # Arguments
336 ///
337 /// - `requested_by`: Actor that requested shutdown.
338 /// - `reason`: Human-readable shutdown reason.
339 ///
340 /// # Returns
341 ///
342 /// Returns an existing final report when the control plane already ended.
343 pub fn mark_shutdown_requested(
344 &self,
345 requested_by: impl Into<String>,
346 reason: impl Into<String>,
347 ) -> Result<Option<RuntimeExitReport>, SupervisorError> {
348 let requested_by = requested_by.into();
349 let reason = reason.into();
350 validate_required_text(&requested_by, "requested_by")?;
351 validate_required_text(&reason, "reason")?;
352
353 let mut inner = self.lock_inner();
354 if let Some(report) = &inner.exit_report {
355 return Ok(Some(report.clone()));
356 }
357 inner.state = RuntimeControlPlaneState::ShuttingDown;
358 inner.shutdown_requested_by = Some(requested_by);
359 inner.shutdown_reason = Some(reason);
360 inner.last_observed_at_unix_nanos = unix_nanos_now();
361 Ok(None)
362 }
363
364 /// Writes the final exit report.
365 ///
366 /// # Arguments
367 ///
368 /// - `report`: Final runtime exit report.
369 ///
370 /// # Returns
371 ///
372 /// Returns the cached final report.
373 pub fn complete(&self, report: RuntimeExitReport) -> RuntimeExitReport {
374 let mut inner = self.lock_inner();
375 if let Some(existing) = &inner.exit_report {
376 return existing.clone();
377 }
378 inner.state = report.state;
379 inner.failure = report.failure_reason();
380 inner.exit_report = Some(report.clone());
381 inner.last_observed_at_unix_nanos = report.completed_at_unix_nanos;
382 self.notify.notify_waiters();
383 report
384 }
385
386 /// Returns the cached final exit report.
387 ///
388 /// # Arguments
389 ///
390 /// This function has no arguments.
391 ///
392 /// # Returns
393 ///
394 /// Returns the final report when the control plane has ended.
395 pub fn final_report(&self) -> Option<RuntimeExitReport> {
396 self.lock_inner().exit_report.clone()
397 }
398
399 /// Waits for the control plane to reach a terminal state.
400 ///
401 /// # Arguments
402 ///
403 /// This function has no arguments.
404 ///
405 /// # Returns
406 ///
407 /// Returns the cached final [`RuntimeExitReport`].
408 pub async fn join(&self) -> RuntimeExitReport {
409 loop {
410 let notified = self.notify.notified();
411 if let Some(report) = self.final_report() {
412 return report;
413 }
414 notified.await;
415 }
416 }
417
418 /// Acquires the inner state lock.
419 fn lock_inner(&self) -> std::sync::MutexGuard<'_, RuntimeControlPlaneInner> {
420 self.inner
421 .lock()
422 .expect("runtime control plane lock poisoned")
423 }
424}
425
426impl Default for RuntimeControlPlane {
427 /// Creates the default runtime control plane.
428 fn default() -> Self {
429 Self::new()
430 }
431}
432
/// Runtime control plane inner state.
///
/// Held behind the mutex inside [`RuntimeControlPlane`].
#[derive(Debug)]
struct RuntimeControlPlaneInner {
    /// Current state.
    state: RuntimeControlPlaneState,
    /// Startup timestamp in Unix epoch nanoseconds.
    started_at_unix_nanos: u128,
    /// Last observation timestamp in Unix epoch nanoseconds.
    last_observed_at_unix_nanos: u128,
    /// Final exit report; `None` until `complete` stores one.
    exit_report: Option<RuntimeExitReport>,
    /// Failure reason derived from the final exit report.
    failure: Option<RuntimeFailureReason>,
    /// Shutdown requester, recorded by `mark_shutdown_requested`.
    shutdown_requested_by: Option<String>,
    /// Shutdown reason, recorded by `mark_shutdown_requested`.
    shutdown_reason: Option<String>,
}
451
452/// Validates required text.
453fn validate_required_text(value: &str, field: &str) -> Result<(), SupervisorError> {
454 if value.trim().is_empty() {
455 return Err(SupervisorError::InvalidTransition {
456 message: format!("runtime control plane {field} must not be empty"),
457 });
458 }
459 Ok(())
460}
461
/// Reads the system clock as nanoseconds since the Unix epoch.
///
/// Yields `0` when the system clock reports a time before the epoch.
fn unix_nanos_now() -> u128 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos(),
        Err(_) => 0,
    }
}