Skip to main content

rust_supervisor/runtime/
lifecycle.rs

1//! Runtime control plane lifecycle state.
2//!
3//! This module stores health state and final exit reports that a
4//! `SupervisorHandle` can read repeatedly. It does not execute runtime control
5//! loop commands.
6
7use crate::error::types::SupervisorError;
8use serde::{Deserialize, Serialize};
9use std::sync::{Arc, Mutex};
10use std::time::{SystemTime, UNIX_EPOCH};
11use tokio::sync::Notify;
12
13/// Stable runtime control plane state.
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
15pub enum RuntimeControlPlaneState {
16    /// Control loop task has been created but is not yet accepting commands.
17    Starting,
18    /// Control loop can still accept commands.
19    Alive,
20    /// Control plane has received an explicit shutdown request.
21    ShuttingDown,
22    /// Control loop completed normally.
23    Completed,
24    /// Control loop failed.
25    Failed,
26}
27
28impl RuntimeControlPlaneState {
29    /// Returns a low-cardinality state label.
30    ///
31    /// # Arguments
32    ///
33    /// This function has no arguments.
34    ///
35    /// # Returns
36    ///
37    /// Returns a stable state label.
38    ///
39    /// # Examples
40    ///
41    /// ```
42    /// let state = rust_supervisor::runtime::lifecycle::RuntimeControlPlaneState::Alive;
43    /// assert_eq!(state.as_str(), "alive");
44    /// ```
45    pub fn as_str(&self) -> &'static str {
46        match self {
47            Self::Starting => "starting",
48            Self::Alive => "alive",
49            Self::ShuttingDown => "shutting_down",
50            Self::Completed => "completed",
51            Self::Failed => "failed",
52        }
53    }
54
55    /// Returns whether this state is terminal.
56    ///
57    /// # Arguments
58    ///
59    /// This function has no arguments.
60    ///
61    /// # Returns
62    ///
63    /// Returns `true` when the state is terminal.
64    pub fn is_terminal(&self) -> bool {
65        matches!(self, Self::Completed | Self::Failed)
66    }
67}
68
69/// Runtime control loop failure reason.
70#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
71pub struct RuntimeFailureReason {
72    /// Failure phase.
73    pub phase: String,
74    /// Human-readable failure reason.
75    pub reason: String,
76    /// Whether the failure came from panic.
77    pub panic: bool,
78    /// Whether callers can recover by creating a new supervisor.
79    pub recoverable: bool,
80}
81
82impl RuntimeFailureReason {
83    /// Creates a failure reason.
84    ///
85    /// # Arguments
86    ///
87    /// - `phase`: Failure phase.
88    /// - `reason`: Human-readable reason.
89    /// - `panic`: Whether the failure came from panic.
90    /// - `recoverable`: Whether a new supervisor can recover.
91    ///
92    /// # Returns
93    ///
94    /// Returns a [`RuntimeFailureReason`].
95    ///
96    /// # Examples
97    ///
98    /// ```
99    /// let failure = rust_supervisor::runtime::lifecycle::RuntimeFailureReason::new(
100    ///     "watchdog",
101    ///     "runtime control loop panic",
102    ///     true,
103    ///     true,
104    /// );
105    /// assert!(failure.panic);
106    /// ```
107    pub fn new(
108        phase: impl Into<String>,
109        reason: impl Into<String>,
110        panic: bool,
111        recoverable: bool,
112    ) -> Self {
113        Self {
114            phase: phase.into(),
115            reason: reason.into(),
116            panic,
117            recoverable,
118        }
119    }
120}
121
122/// Final runtime control loop exit report.
123#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
124pub struct RuntimeExitReport {
125    /// Final state, which must be completed or failed.
126    pub state: RuntimeControlPlaneState,
127    /// Exit phase.
128    pub phase: String,
129    /// Human-readable exit reason.
130    pub reason: String,
131    /// Whether callers can recover by creating a new supervisor.
132    pub recoverable: bool,
133    /// Final report timestamp in Unix epoch nanoseconds.
134    pub completed_at_unix_nanos: u128,
135    /// Whether the report came from panic.
136    pub panic: bool,
137}
138
139impl RuntimeExitReport {
140    /// Creates a completed exit report.
141    ///
142    /// # Arguments
143    ///
144    /// - `phase`: Completion phase.
145    /// - `reason`: Human-readable reason.
146    ///
147    /// # Returns
148    ///
149    /// Returns a completed [`RuntimeExitReport`].
150    ///
151    /// # Examples
152    ///
153    /// ```
154    /// let report = rust_supervisor::runtime::lifecycle::RuntimeExitReport::completed(
155    ///     "shutdown",
156    ///     "operator requested shutdown",
157    /// );
158    /// assert_eq!(report.state.as_str(), "completed");
159    /// ```
160    pub fn completed(phase: impl Into<String>, reason: impl Into<String>) -> Self {
161        Self {
162            state: RuntimeControlPlaneState::Completed,
163            phase: phase.into(),
164            reason: reason.into(),
165            recoverable: false,
166            completed_at_unix_nanos: unix_nanos_now(),
167            panic: false,
168        }
169    }
170
171    /// Creates a failed exit report.
172    ///
173    /// # Arguments
174    ///
175    /// - `phase`: Failure phase.
176    /// - `reason`: Human-readable reason.
177    /// - `panic`: Whether the failure came from panic.
178    /// - `recoverable`: Whether a new supervisor can recover.
179    ///
180    /// # Returns
181    ///
182    /// Returns a failed [`RuntimeExitReport`].
183    pub fn failed(
184        phase: impl Into<String>,
185        reason: impl Into<String>,
186        panic: bool,
187        recoverable: bool,
188    ) -> Self {
189        Self {
190            state: RuntimeControlPlaneState::Failed,
191            phase: phase.into(),
192            reason: reason.into(),
193            recoverable,
194            completed_at_unix_nanos: unix_nanos_now(),
195            panic,
196        }
197    }
198
199    /// Converts this report into a health failure reason.
200    ///
201    /// # Arguments
202    ///
203    /// This function has no arguments.
204    ///
205    /// # Returns
206    ///
207    /// Returns a failure reason when this report represents failure.
208    pub fn failure_reason(&self) -> Option<RuntimeFailureReason> {
209        (self.state == RuntimeControlPlaneState::Failed).then(|| {
210            RuntimeFailureReason::new(
211                self.phase.clone(),
212                self.reason.clone(),
213                self.panic,
214                self.recoverable,
215            )
216        })
217    }
218}
219
220/// Health report read by runtime callers.
221#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
222pub struct RuntimeHealthReport {
223    /// Whether the control loop can still accept commands.
224    pub alive: bool,
225    /// Current control plane state.
226    pub state: RuntimeControlPlaneState,
227    /// Control plane startup timestamp in Unix epoch nanoseconds.
228    pub started_at_unix_nanos: u128,
229    /// Last observation timestamp in Unix epoch nanoseconds.
230    pub last_observed_at_unix_nanos: u128,
231    /// Structured reason for failed state.
232    pub failure: Option<RuntimeFailureReason>,
233    /// Final report for completed or failed state.
234    pub exit_report: Option<RuntimeExitReport>,
235}
236
237/// Runtime control plane with repeatable reads.
238#[derive(Debug, Clone)]
239pub struct RuntimeControlPlane {
240    /// Shared inner state.
241    inner: Arc<Mutex<RuntimeControlPlaneInner>>,
242    /// Terminal state notifier.
243    notify: Arc<Notify>,
244}
245
246impl RuntimeControlPlane {
247    /// Creates new control plane lifecycle state.
248    ///
249    /// # Arguments
250    ///
251    /// This function has no arguments.
252    ///
253    /// # Returns
254    ///
255    /// Returns a [`RuntimeControlPlane`] in starting state.
256    ///
257    /// # Examples
258    ///
259    /// ```
260    /// let control_plane = rust_supervisor::runtime::lifecycle::RuntimeControlPlane::new();
261    /// assert!(!control_plane.is_alive());
262    /// ```
263    pub fn new() -> Self {
264        let now = unix_nanos_now();
265        Self {
266            inner: Arc::new(Mutex::new(RuntimeControlPlaneInner {
267                state: RuntimeControlPlaneState::Starting,
268                started_at_unix_nanos: now,
269                last_observed_at_unix_nanos: now,
270                exit_report: None,
271                failure: None,
272                shutdown_requested_by: None,
273                shutdown_reason: None,
274            })),
275            notify: Arc::new(Notify::new()),
276        }
277    }
278
279    /// Marks the control loop as accepting commands.
280    ///
281    /// # Arguments
282    ///
283    /// This function has no arguments.
284    ///
285    /// # Returns
286    ///
287    /// This function does not return a value.
288    pub fn mark_alive(&self) {
289        let mut inner = self.lock_inner();
290        if !inner.state.is_terminal() {
291            inner.state = RuntimeControlPlaneState::Alive;
292            inner.last_observed_at_unix_nanos = unix_nanos_now();
293        }
294    }
295
296    /// Returns whether the control loop is alive.
297    ///
298    /// # Arguments
299    ///
300    /// This function has no arguments.
301    ///
302    /// # Returns
303    ///
304    /// Returns `true` when ordinary control commands may be sent.
305    pub fn is_alive(&self) -> bool {
306        let mut inner = self.lock_inner();
307        inner.last_observed_at_unix_nanos = unix_nanos_now();
308        inner.state == RuntimeControlPlaneState::Alive
309    }
310
311    /// Reads a health report.
312    ///
313    /// # Arguments
314    ///
315    /// This function has no arguments.
316    ///
317    /// # Returns
318    ///
319    /// Returns a [`RuntimeHealthReport`] value for the current observation.
320    pub fn health(&self) -> RuntimeHealthReport {
321        let mut inner = self.lock_inner();
322        inner.last_observed_at_unix_nanos = unix_nanos_now();
323        RuntimeHealthReport {
324            alive: inner.state == RuntimeControlPlaneState::Alive,
325            state: inner.state,
326            started_at_unix_nanos: inner.started_at_unix_nanos,
327            last_observed_at_unix_nanos: inner.last_observed_at_unix_nanos,
328            failure: inner.failure.clone(),
329            exit_report: inner.exit_report.clone(),
330        }
331    }
332
333    /// Marks that shutdown has been requested.
334    ///
335    /// # Arguments
336    ///
337    /// - `requested_by`: Actor that requested shutdown.
338    /// - `reason`: Human-readable shutdown reason.
339    ///
340    /// # Returns
341    ///
342    /// Returns an existing final report when the control plane already ended.
343    pub fn mark_shutdown_requested(
344        &self,
345        requested_by: impl Into<String>,
346        reason: impl Into<String>,
347    ) -> Result<Option<RuntimeExitReport>, SupervisorError> {
348        let requested_by = requested_by.into();
349        let reason = reason.into();
350        validate_required_text(&requested_by, "requested_by")?;
351        validate_required_text(&reason, "reason")?;
352
353        let mut inner = self.lock_inner();
354        if let Some(report) = &inner.exit_report {
355            return Ok(Some(report.clone()));
356        }
357        inner.state = RuntimeControlPlaneState::ShuttingDown;
358        inner.shutdown_requested_by = Some(requested_by);
359        inner.shutdown_reason = Some(reason);
360        inner.last_observed_at_unix_nanos = unix_nanos_now();
361        Ok(None)
362    }
363
364    /// Writes the final exit report.
365    ///
366    /// # Arguments
367    ///
368    /// - `report`: Final runtime exit report.
369    ///
370    /// # Returns
371    ///
372    /// Returns the cached final report.
373    pub fn complete(&self, report: RuntimeExitReport) -> RuntimeExitReport {
374        let mut inner = self.lock_inner();
375        if let Some(existing) = &inner.exit_report {
376            return existing.clone();
377        }
378        inner.state = report.state;
379        inner.failure = report.failure_reason();
380        inner.exit_report = Some(report.clone());
381        inner.last_observed_at_unix_nanos = report.completed_at_unix_nanos;
382        self.notify.notify_waiters();
383        report
384    }
385
386    /// Returns the cached final exit report.
387    ///
388    /// # Arguments
389    ///
390    /// This function has no arguments.
391    ///
392    /// # Returns
393    ///
394    /// Returns the final report when the control plane has ended.
395    pub fn final_report(&self) -> Option<RuntimeExitReport> {
396        self.lock_inner().exit_report.clone()
397    }
398
399    /// Waits for the control plane to reach a terminal state.
400    ///
401    /// # Arguments
402    ///
403    /// This function has no arguments.
404    ///
405    /// # Returns
406    ///
407    /// Returns the cached final [`RuntimeExitReport`].
408    pub async fn join(&self) -> RuntimeExitReport {
409        loop {
410            let notified = self.notify.notified();
411            if let Some(report) = self.final_report() {
412                return report;
413            }
414            notified.await;
415        }
416    }
417
418    /// Acquires the inner state lock.
419    fn lock_inner(&self) -> std::sync::MutexGuard<'_, RuntimeControlPlaneInner> {
420        self.inner
421            .lock()
422            .expect("runtime control plane lock poisoned")
423    }
424}
425
426impl Default for RuntimeControlPlane {
427    /// Creates the default runtime control plane.
428    fn default() -> Self {
429        Self::new()
430    }
431}
432
433/// Runtime control plane inner state.
434#[derive(Debug)]
435struct RuntimeControlPlaneInner {
436    /// Current state.
437    state: RuntimeControlPlaneState,
438    /// Startup timestamp.
439    started_at_unix_nanos: u128,
440    /// Last observation timestamp.
441    last_observed_at_unix_nanos: u128,
442    /// Final exit report.
443    exit_report: Option<RuntimeExitReport>,
444    /// Failure reason.
445    failure: Option<RuntimeFailureReason>,
446    /// Shutdown requester.
447    shutdown_requested_by: Option<String>,
448    /// Shutdown reason.
449    shutdown_reason: Option<String>,
450}
451
452/// Validates required text.
453fn validate_required_text(value: &str, field: &str) -> Result<(), SupervisorError> {
454    if value.trim().is_empty() {
455        return Err(SupervisorError::InvalidTransition {
456            message: format!("runtime control plane {field} must not be empty"),
457        });
458    }
459    Ok(())
460}
461
/// Reads the system clock as nanoseconds since the Unix epoch.
///
/// Yields `0` when the system clock reports a time before the epoch.
fn unix_nanos_now() -> u128 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_nanos(),
        Err(_) => 0,
    }
}
468}