Skip to main content

rustrade_supervisor/
lifecycle.rs

1//! Service lifecycle state machine.
2//!
3//! Each service managed by the [`Supervisor`](super::Supervisor) progresses
4//! through a well-defined set of states:
5//!
6//! ```text
7//!   ┌──────────┐
8//!   │ Starting │──────────────────────────┐
9//!   └────┬─────┘                          │
10//!        │ run() entered                  │ init error
11//!        ▼                                ▼
12//!   ┌──────────┐                    ┌────────────┐
13//!   │ Running  │───── error ──────▶│ BackingOff │
14//!   └────┬─────┘                    └──────┬─────┘
15//!        │                                 │
16//!        │ cancel / Ok(())                 │ retry
17//!        │                                 │
18//!        │    ┌────────────────────────────┘
19//!        ▼    ▼
20//!   ┌──────────┐         ┌────────────┐
21//!   │ Stopping │────────▶│ Terminated │
22//!   └──────────┘         └────────────┘
23//! ```
24//!
25//! - **Starting**: the service is initializing.
26//! - **Running**: the service's `run()` loop is active.
//! - **BackingOff**: the service failed and is waiting for the backoff
//!   timer to expire; the supervisor then retries by moving the service
//!   back to `Starting`.
29//! - **Stopping**: a cancellation signal was received; the service is
30//!   finalizing.
31//! - **Terminated**: terminal state — the service has exited (or the
32//!   circuit breaker tripped and the supervisor gave up).
33//!
34//! The `BackingOff` state prevents the supervisor from tight-looping on a
35//! persistent failure, which would burn CPU and flood logs.
36
37use std::fmt;
38use std::time::{Duration, Instant};
39
40use serde::{Deserialize, Serialize};
41
42// ---------------------------------------------------------------------------
43// ServicePhase
44// ---------------------------------------------------------------------------
45
46/// Lifecycle phase of a supervised service.
47///
48/// Plain enum without associated data; richer context (timing, error info,
49/// attempt counts) lives in [`ServiceLifecycle`].
50#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
51#[serde(rename_all = "snake_case")]
52pub enum ServicePhase {
53    /// The service is initializing.
54    Starting,
55    /// The service's main `run()` loop is executing.
56    Running,
57    /// The service failed and is waiting for the backoff timer to expire.
58    BackingOff,
59    /// A shutdown signal was received; the service is performing cleanup.
60    Stopping,
61    /// Terminal state — the service has exited.
62    Terminated,
63}
64
65impl fmt::Display for ServicePhase {
66    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
67        match self {
68            Self::Starting => write!(f, "starting"),
69            Self::Running => write!(f, "running"),
70            Self::BackingOff => write!(f, "backing_off"),
71            Self::Stopping => write!(f, "stopping"),
72            Self::Terminated => write!(f, "terminated"),
73        }
74    }
75}
76
77impl ServicePhase {
78    /// True if the service is in a terminal state.
79    pub fn is_terminal(&self) -> bool {
80        matches!(self, Self::Terminated)
81    }
82
83    /// True if the service is "alive" (starting, running, or backing off).
84    pub fn is_alive(&self) -> bool {
85        matches!(self, Self::Starting | Self::Running | Self::BackingOff)
86    }
87}
88
89// ---------------------------------------------------------------------------
90// TerminationReason
91// ---------------------------------------------------------------------------
92
/// The cause recorded when a service enters the `Terminated` phase.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TerminationReason {
    /// `run()` returned `Ok(())`, i.e. the service finished on its own.
    Completed,
    /// The supervisor's cancellation token fired (graceful shutdown).
    Cancelled,
    /// Too many failures were observed and the circuit breaker tripped.
    CircuitBreakerOpen {
        /// Failures counted inside the circuit-breaker window.
        failures: u32,
        /// Configured failure limit at which the breaker trips.
        max_retries: u32,
    },
    /// The service hit an unrecoverable error while its restart policy is
    /// [`RestartPolicy::Never`](super::RestartPolicy::Never); the payload
    /// is the error message.
    Unrecoverable(String),
}
111
112impl fmt::Display for TerminationReason {
113    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
114        match self {
115            Self::Completed => write!(f, "completed"),
116            Self::Cancelled => write!(f, "cancelled"),
117            Self::CircuitBreakerOpen {
118                failures,
119                max_retries,
120            } => write!(
121                f,
122                "circuit breaker open ({failures}/{max_retries} failures)"
123            ),
124            Self::Unrecoverable(msg) => write!(f, "unrecoverable: {msg}"),
125        }
126    }
127}
128
129// ---------------------------------------------------------------------------
130// TransitionError
131// ---------------------------------------------------------------------------
132
133/// Error returned when an invalid state transition is attempted.
134#[derive(Debug, Clone, thiserror::Error)]
135#[error("invalid lifecycle transition: {from} → {to}")]
136pub struct TransitionError {
137    /// Phase the service was in.
138    pub from: ServicePhase,
139    /// Phase the caller tried to move to.
140    pub to: ServicePhase,
141}
142
143// ---------------------------------------------------------------------------
144// ServiceLifecycle
145// ---------------------------------------------------------------------------
146
147/// Full lifecycle tracker for a single supervised service.
148///
149/// Wraps the [`ServicePhase`] enum with timing data, counters, and
150/// transition validation logic. The supervisor holds one of these per
151/// managed service.
152#[derive(Debug, Clone)]
153pub struct ServiceLifecycle {
154    phase: ServicePhase,
155    service_name: String,
156    created_at: Instant,
157    phase_entered_at: Instant,
158    start_count: u32,
159    total_failures: u32,
160    last_error: Option<String>,
161    termination_reason: Option<TerminationReason>,
162    cumulative_running: Duration,
163    running_since: Option<Instant>,
164}
165
166impl ServiceLifecycle {
167    /// Create a new lifecycle tracker in the `Starting` phase.
168    pub fn new(service_name: impl Into<String>) -> Self {
169        let now = Instant::now();
170        Self {
171            phase: ServicePhase::Starting,
172            service_name: service_name.into(),
173            created_at: now,
174            phase_entered_at: now,
175            start_count: 1,
176            total_failures: 0,
177            last_error: None,
178            termination_reason: None,
179            cumulative_running: Duration::ZERO,
180            running_since: None,
181        }
182    }
183
184    // ── Accessors ─────────────────────────────────────────────────────
185
186    /// Current lifecycle phase.
187    pub fn phase(&self) -> ServicePhase {
188        self.phase
189    }
190
191    /// Service name as configured on construction.
192    pub fn service_name(&self) -> &str {
193        &self.service_name
194    }
195
196    /// How long the service has existed (since first `Starting`).
197    pub fn age(&self) -> Duration {
198        self.created_at.elapsed()
199    }
200
201    /// How long the service has been in its current phase.
202    pub fn time_in_current_phase(&self) -> Duration {
203        self.phase_entered_at.elapsed()
204    }
205
206    /// Total number of times the service has been started.
207    pub fn start_count(&self) -> u32 {
208        self.start_count
209    }
210
211    /// Total failures over the service's lifetime.
212    pub fn total_failures(&self) -> u32 {
213        self.total_failures
214    }
215
216    /// The last error message recorded on a failed transition, if any.
217    pub fn last_error(&self) -> Option<&str> {
218        self.last_error.as_deref()
219    }
220
221    /// Why the service terminated (only `Some` when phase is `Terminated`).
222    pub fn termination_reason(&self) -> Option<&TerminationReason> {
223        self.termination_reason.as_ref()
224    }
225
226    /// Cumulative wall-clock time spent in the `Running` phase.
227    ///
228    /// If the service is currently running, includes time up to *now*.
229    pub fn cumulative_running_time(&self) -> Duration {
230        let extra = self
231            .running_since
232            .map(|since| since.elapsed())
233            .unwrap_or(Duration::ZERO);
234        self.cumulative_running + extra
235    }
236
237    // ── Transitions ───────────────────────────────────────────────────
238
239    /// Move from `Starting` to `Running`.
240    pub fn transition_to_running(&mut self) -> Result<(), TransitionError> {
241        self.validate_transition(ServicePhase::Running)?;
242        self.set_phase(ServicePhase::Running);
243        self.running_since = Some(Instant::now());
244        tracing::info!(
245            service = %self.service_name,
246            start_count = self.start_count,
247            "service entered Running phase"
248        );
249        Ok(())
250    }
251
252    /// Move from `Running` (or `Starting`) to `BackingOff` after a failure.
253    pub fn transition_to_backing_off(
254        &mut self,
255        error: &str,
256        backoff_duration: Duration,
257    ) -> Result<(), TransitionError> {
258        self.validate_transition(ServicePhase::BackingOff)?;
259        self.accumulate_running_time();
260        self.total_failures += 1;
261        self.last_error = Some(error.to_string());
262        self.set_phase(ServicePhase::BackingOff);
263        tracing::warn!(
264            service = %self.service_name,
265            error = %error,
266            attempt = self.total_failures,
267            backoff_ms = backoff_duration.as_millis() as u64,
268            "service failed, entering BackingOff phase"
269        );
270        Ok(())
271    }
272
273    /// Transition from `BackingOff` → `Starting` (retry).
274    pub fn transition_to_restarting(&mut self) -> Result<(), TransitionError> {
275        self.validate_transition(ServicePhase::Starting)?;
276        self.start_count += 1;
277        self.set_phase(ServicePhase::Starting);
278        tracing::info!(
279            service = %self.service_name,
280            start_count = self.start_count,
281            "service restarting (entering Starting phase)"
282        );
283        Ok(())
284    }
285
286    /// Move to `Stopping` on cancellation — services drain after this.
287    pub fn transition_to_stopping(&mut self) -> Result<(), TransitionError> {
288        self.validate_transition(ServicePhase::Stopping)?;
289        self.accumulate_running_time();
290        self.set_phase(ServicePhase::Stopping);
291        tracing::info!(
292            service = %self.service_name,
293            "service entering Stopping phase"
294        );
295        Ok(())
296    }
297
298    /// Transition to `Terminated`. Terminal — no further transitions allowed.
299    pub fn transition_to_terminated(
300        &mut self,
301        reason: TerminationReason,
302    ) -> Result<(), TransitionError> {
303        self.validate_transition(ServicePhase::Terminated)?;
304        self.accumulate_running_time();
305        self.termination_reason = Some(reason.clone());
306        self.set_phase(ServicePhase::Terminated);
307        tracing::info!(
308            service = %self.service_name,
309            reason = %reason,
310            total_starts = self.start_count,
311            total_failures = self.total_failures,
312            cumulative_running_secs = self.cumulative_running.as_secs_f64(),
313            "service terminated"
314        );
315        Ok(())
316    }
317
318    // ── Internal helpers ──────────────────────────────────────────────
319
320    fn validate_transition(&self, target: ServicePhase) -> Result<(), TransitionError> {
321        let valid = match (self.phase, target) {
322            (ServicePhase::Starting, ServicePhase::Running) => true,
323            (ServicePhase::Starting, ServicePhase::Terminated) => true,
324            (ServicePhase::Starting, ServicePhase::Stopping) => true,
325            (ServicePhase::Starting, ServicePhase::BackingOff) => true,
326
327            (ServicePhase::Running, ServicePhase::BackingOff) => true,
328            (ServicePhase::Running, ServicePhase::Stopping) => true,
329            (ServicePhase::Running, ServicePhase::Terminated) => true,
330
331            (ServicePhase::BackingOff, ServicePhase::Starting) => true,
332            (ServicePhase::BackingOff, ServicePhase::Stopping) => true,
333            (ServicePhase::BackingOff, ServicePhase::Terminated) => true,
334
335            (ServicePhase::Stopping, ServicePhase::Terminated) => true,
336
337            (ServicePhase::Terminated, _) => false,
338
339            _ => false,
340        };
341
342        if valid {
343            Ok(())
344        } else {
345            Err(TransitionError {
346                from: self.phase,
347                to: target,
348            })
349        }
350    }
351
352    fn set_phase(&mut self, phase: ServicePhase) {
353        self.phase = phase;
354        self.phase_entered_at = Instant::now();
355    }
356
357    fn accumulate_running_time(&mut self) {
358        if let Some(since) = self.running_since.take() {
359            self.cumulative_running += since.elapsed();
360        }
361    }
362}
363
364impl fmt::Display for ServiceLifecycle {
365    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
366        write!(
367            f,
368            "{}[{}] starts={} failures={} running={:.1}s",
369            self.service_name,
370            self.phase,
371            self.start_count,
372            self.total_failures,
373            self.cumulative_running_time().as_secs_f64(),
374        )
375    }
376}
377
378// ---------------------------------------------------------------------------
379// Serializable snapshot for health / metrics
380// ---------------------------------------------------------------------------
381
382/// Point-in-time snapshot of a service's lifecycle, suitable for
383/// serialization (e.g., for a `/health` JSON endpoint).
384#[derive(Debug, Clone, Serialize, Deserialize)]
385pub struct ServiceLifecycleSnapshot {
386    /// Service name.
387    pub service_name: String,
388    /// Lifecycle phase at snapshot time.
389    pub phase: ServicePhase,
390    /// Total start attempts to date.
391    pub start_count: u32,
392    /// Total failures to date.
393    pub total_failures: u32,
394    /// Last recorded error message, if any.
395    pub last_error: Option<String>,
396    /// Cumulative wall-clock time in `Running`.
397    pub cumulative_running_secs: f64,
398    /// How long since the service was first created.
399    pub age_secs: f64,
400    /// How long since entering the current phase.
401    pub time_in_phase_secs: f64,
402    /// Termination reason as a human string, if `phase == Terminated`.
403    pub termination_reason: Option<String>,
404}
405
406impl From<&ServiceLifecycle> for ServiceLifecycleSnapshot {
407    fn from(lc: &ServiceLifecycle) -> Self {
408        Self {
409            service_name: lc.service_name.clone(),
410            phase: lc.phase,
411            start_count: lc.start_count,
412            total_failures: lc.total_failures,
413            last_error: lc.last_error.clone(),
414            cumulative_running_secs: lc.cumulative_running_time().as_secs_f64(),
415            age_secs: lc.age().as_secs_f64(),
416            time_in_phase_secs: lc.time_in_current_phase().as_secs_f64(),
417            termination_reason: lc.termination_reason.as_ref().map(|r| r.to_string()),
418        }
419    }
420}
421
#[cfg(test)]
mod tests {
    // Unit tests for the lifecycle state machine: every legal path through
    // the phases, the rejected transitions, and the bookkeeping (counters,
    // last error, termination reason, running time) each transition does.
    use super::*;

    #[test]
    fn test_new_lifecycle_starts_in_starting() {
        let lc = ServiceLifecycle::new("test-svc");
        assert_eq!(lc.phase(), ServicePhase::Starting);
        assert_eq!(lc.start_count(), 1);
        assert_eq!(lc.total_failures(), 0);
        assert!(lc.last_error().is_none());
        assert!(lc.termination_reason().is_none());
    }

    #[test]
    fn test_service_name() {
        let lc = ServiceLifecycle::new("data-service");
        assert_eq!(lc.service_name(), "data-service");
    }

    #[test]
    fn test_happy_path_starting_to_running_to_stopping_to_terminated() {
        let mut lc = ServiceLifecycle::new("happy");
        lc.transition_to_running().unwrap();
        assert_eq!(lc.phase(), ServicePhase::Running);
        lc.transition_to_stopping().unwrap();
        assert_eq!(lc.phase(), ServicePhase::Stopping);
        lc.transition_to_terminated(TerminationReason::Cancelled)
            .unwrap();
        assert_eq!(lc.phase(), ServicePhase::Terminated);
        assert_eq!(lc.termination_reason(), Some(&TerminationReason::Cancelled));
    }

    #[test]
    fn test_failure_and_restart_cycle() {
        let mut lc = ServiceLifecycle::new("flaky");
        lc.transition_to_running().unwrap();
        assert_eq!(lc.start_count(), 1);

        lc.transition_to_backing_off("connection refused", Duration::from_millis(200))
            .unwrap();
        assert_eq!(lc.phase(), ServicePhase::BackingOff);
        assert_eq!(lc.total_failures(), 1);
        assert_eq!(lc.last_error(), Some("connection refused"));

        lc.transition_to_restarting().unwrap();
        assert_eq!(lc.phase(), ServicePhase::Starting);
        assert_eq!(lc.start_count(), 2);

        lc.transition_to_running().unwrap();
        assert_eq!(lc.phase(), ServicePhase::Running);
    }

    #[test]
    fn test_circuit_breaker_termination() {
        let mut lc = ServiceLifecycle::new("breaker");
        lc.transition_to_running().unwrap();
        lc.transition_to_backing_off("error 1", Duration::from_millis(100))
            .unwrap();

        lc.transition_to_terminated(TerminationReason::CircuitBreakerOpen {
            failures: 10,
            max_retries: 10,
        })
        .unwrap();

        assert_eq!(lc.phase(), ServicePhase::Terminated);
        assert!(matches!(
            lc.termination_reason(),
            Some(TerminationReason::CircuitBreakerOpen { .. })
        ));
    }

    #[test]
    fn test_completed_termination_from_running() {
        let mut lc = ServiceLifecycle::new("one-shot");
        lc.transition_to_running().unwrap();
        lc.transition_to_terminated(TerminationReason::Completed)
            .unwrap();
        assert_eq!(lc.phase(), ServicePhase::Terminated);
        assert_eq!(lc.termination_reason(), Some(&TerminationReason::Completed));
    }

    #[test]
    fn test_invalid_transition_terminated_to_anything() {
        let mut lc = ServiceLifecycle::new("dead");
        lc.transition_to_running().unwrap();
        lc.transition_to_terminated(TerminationReason::Completed)
            .unwrap();

        assert!(lc.transition_to_running().is_err());
        assert!(lc.transition_to_stopping().is_err());
        assert!(
            lc.transition_to_terminated(TerminationReason::Cancelled)
                .is_err()
        );
        assert!(lc.transition_to_restarting().is_err());
        assert!(
            lc.transition_to_backing_off("late failure", Duration::from_millis(10))
                .is_err()
        );

        // Rejected transitions must not have touched any bookkeeping.
        assert_eq!(lc.phase(), ServicePhase::Terminated);
        assert_eq!(lc.total_failures(), 0);
        assert!(lc.last_error().is_none());
        assert_eq!(lc.termination_reason(), Some(&TerminationReason::Completed));
    }

    #[test]
    fn test_invalid_transition_running_to_starting() {
        let mut lc = ServiceLifecycle::new("bad");
        lc.transition_to_running().unwrap();

        let err = lc.transition_to_restarting().unwrap_err();
        assert_eq!(err.from, ServicePhase::Running);
        assert_eq!(err.to, ServicePhase::Starting);
    }

    #[test]
    fn test_invalid_transitions_out_of_stopping() {
        // Stopping may only be followed by Terminated.
        let mut lc = ServiceLifecycle::new("draining");
        lc.transition_to_running().unwrap();
        lc.transition_to_stopping().unwrap();

        assert!(lc.transition_to_running().is_err());
        assert!(lc.transition_to_restarting().is_err());
        assert!(lc.transition_to_stopping().is_err());
        let err = lc
            .transition_to_backing_off("too late", Duration::from_millis(10))
            .unwrap_err();
        assert_eq!(err.from, ServicePhase::Stopping);
        assert_eq!(err.to, ServicePhase::BackingOff);

        assert_eq!(lc.phase(), ServicePhase::Stopping);
        assert_eq!(lc.total_failures(), 0);
        assert_eq!(lc.start_count(), 1);
    }

    #[test]
    fn test_rejected_transition_leaves_state_untouched() {
        let mut lc = ServiceLifecycle::new("steady");
        lc.transition_to_running().unwrap();

        // Running → Running is not a legal move.
        let err = lc.transition_to_running().unwrap_err();
        assert_eq!(err.from, ServicePhase::Running);
        assert_eq!(err.to, ServicePhase::Running);
        assert_eq!(lc.phase(), ServicePhase::Running);
        assert_eq!(lc.start_count(), 1);
    }

    #[test]
    fn test_stopping_from_backing_off() {
        let mut lc = ServiceLifecycle::new("interrupted");
        lc.transition_to_running().unwrap();
        lc.transition_to_backing_off("timeout", Duration::from_secs(5))
            .unwrap();

        lc.transition_to_stopping().unwrap();
        assert_eq!(lc.phase(), ServicePhase::Stopping);

        lc.transition_to_terminated(TerminationReason::Cancelled)
            .unwrap();
        assert_eq!(lc.phase(), ServicePhase::Terminated);
    }

    #[test]
    fn test_starting_directly_to_terminated() {
        let mut lc = ServiceLifecycle::new("init-fail");
        lc.transition_to_terminated(TerminationReason::Unrecoverable(
            "missing config".to_string(),
        ))
        .unwrap();
        assert_eq!(lc.phase(), ServicePhase::Terminated);
    }

    #[test]
    fn test_starting_to_backing_off() {
        let mut lc = ServiceLifecycle::new("init-retry");
        lc.transition_to_backing_off("db connect timeout", Duration::from_millis(500))
            .unwrap();
        assert_eq!(lc.phase(), ServicePhase::BackingOff);
        assert_eq!(lc.total_failures(), 1);
    }

    #[test]
    fn test_phase_display() {
        assert_eq!(ServicePhase::Starting.to_string(), "starting");
        assert_eq!(ServicePhase::Running.to_string(), "running");
        assert_eq!(ServicePhase::BackingOff.to_string(), "backing_off");
        assert_eq!(ServicePhase::Stopping.to_string(), "stopping");
        assert_eq!(ServicePhase::Terminated.to_string(), "terminated");
    }

    #[test]
    fn test_phase_is_terminal() {
        assert!(!ServicePhase::Starting.is_terminal());
        assert!(!ServicePhase::Running.is_terminal());
        assert!(!ServicePhase::BackingOff.is_terminal());
        assert!(!ServicePhase::Stopping.is_terminal());
        assert!(ServicePhase::Terminated.is_terminal());
    }

    #[test]
    fn test_phase_is_alive() {
        assert!(ServicePhase::Starting.is_alive());
        assert!(ServicePhase::Running.is_alive());
        assert!(ServicePhase::BackingOff.is_alive());
        assert!(!ServicePhase::Stopping.is_alive());
        assert!(!ServicePhase::Terminated.is_alive());
    }

    #[test]
    fn test_lifecycle_display() {
        let lc = ServiceLifecycle::new("display-test");
        let display = format!("{lc}");
        assert!(display.contains("display-test"));
        assert!(display.contains("starting"));
        assert!(display.contains("starts=1"));
        assert!(display.contains("failures=0"));
    }

    #[test]
    fn test_snapshot_from_lifecycle() {
        let mut lc = ServiceLifecycle::new("snapshot-svc");
        lc.transition_to_running().unwrap();
        lc.transition_to_backing_off("oops", Duration::from_millis(100))
            .unwrap();

        let snap = ServiceLifecycleSnapshot::from(&lc);
        assert_eq!(snap.service_name, "snapshot-svc");
        assert_eq!(snap.phase, ServicePhase::BackingOff);
        assert_eq!(snap.start_count, 1);
        assert_eq!(snap.total_failures, 1);
        assert_eq!(snap.last_error.as_deref(), Some("oops"));
        assert!(snap.termination_reason.is_none());
        assert!(snap.age_secs >= 0.0);
    }

    #[test]
    fn test_termination_reason_display() {
        assert_eq!(TerminationReason::Completed.to_string(), "completed");
        assert_eq!(TerminationReason::Cancelled.to_string(), "cancelled");
        assert_eq!(
            TerminationReason::CircuitBreakerOpen {
                failures: 5,
                max_retries: 5
            }
            .to_string(),
            "circuit breaker open (5/5 failures)"
        );
        assert_eq!(
            TerminationReason::Unrecoverable("bad config".into()).to_string(),
            "unrecoverable: bad config"
        );
    }

    #[test]
    fn test_transition_error_display() {
        let err = TransitionError {
            from: ServicePhase::Terminated,
            to: ServicePhase::Running,
        };
        assert_eq!(
            err.to_string(),
            "invalid lifecycle transition: terminated → running"
        );
    }

    #[test]
    fn test_multiple_failure_cycles_accumulate() {
        let mut lc = ServiceLifecycle::new("multi-fail");

        for i in 1..=5 {
            lc.transition_to_running().unwrap();
            lc.transition_to_backing_off(
                &format!("error {i}"),
                Duration::from_millis(100 * i as u64),
            )
            .unwrap();
            if i < 5 {
                lc.transition_to_restarting().unwrap();
            }
        }

        assert_eq!(lc.total_failures(), 5);
        assert_eq!(lc.start_count(), 5);
        assert_eq!(lc.last_error(), Some("error 5"));
    }

    #[test]
    fn test_running_time_is_frozen_outside_running() {
        let mut lc = ServiceLifecycle::new("timed");
        // Nothing has run yet, so no running time can have accrued.
        assert_eq!(lc.cumulative_running_time(), Duration::ZERO);

        lc.transition_to_running().unwrap();
        lc.transition_to_backing_off("boom", Duration::from_millis(10))
            .unwrap();

        // Once the running span is closed the total must stop growing.
        let frozen = lc.cumulative_running_time();
        assert_eq!(lc.cumulative_running_time(), frozen);

        lc.transition_to_restarting().unwrap();
        assert_eq!(lc.cumulative_running_time(), frozen);
    }

    #[test]
    fn test_stopping_from_starting() {
        let mut lc = ServiceLifecycle::new("early-stop");
        lc.transition_to_stopping().unwrap();
        assert_eq!(lc.phase(), ServicePhase::Stopping);
        lc.transition_to_terminated(TerminationReason::Cancelled)
            .unwrap();
    }
}