Skip to main content

nodedb_cluster/
lifecycle_state.rs

1//! Cluster lifecycle state tracking for observability.
2//!
3//! `ClusterLifecycleTracker` is the single source of truth for "what
4//! phase is this node's cluster init in right now". It is owned by
5//! the main binary, passed by reference into `start_cluster` and
6//! friends, and read by:
7//!
8//! - The `/cluster/status` HTTP endpoint.
9//! - The `nodedb_cluster_state` Prometheus gauge.
10//! - systemd via `readiness::notify_status` so `systemctl status`
11//!   surfaces a live phase string during the seconds-to-minutes
12//!   window when the cluster is still forming.
13//! - INFO-level structured logs — every transition calls `info!` with
14//!   the previous state, the new state, and the reason so a post-
15//!   mortem on a flaky deploy can be done from `journalctl` alone.
16//!
17//! Transitions are validated only in the loose sense that every
18//! transition goes through a typed method on the tracker. There is no
19//! strict state machine — a cluster can legitimately go
20//! `Joining{3} → Failed{"timeout"} → Joining{0} → Ready{3}` if the
21//! operator restarts with `force_bootstrap`, so we allow any → any.
22
23use std::sync::{Arc, RwLock};
24
25use serde::{Deserialize, Serialize};
26use tracing::info;
27
28use crate::readiness;
29
/// Discrete phase of this node's cluster init.
///
/// Serialized with an internal `phase` tag in `snake_case`, so the
/// `/cluster/status` endpoint emits e.g. `{"phase":"joining","attempt":2}`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "phase", rename_all = "snake_case")]
pub enum ClusterLifecycleState {
    /// Tracker just created, no phase decision yet.
    Starting,
    /// Catalog already marked as bootstrapped — loading from disk.
    Restarting,
    /// This node is the elected bootstrapper — creating a fresh cluster.
    Bootstrapping,
    /// Joining an existing cluster. `attempt` counts from 0.
    Joining {
        /// Current join attempt (0-indexed). See
        /// `bootstrap::config::JoinRetryPolicy` for the backoff schedule.
        attempt: u32,
    },
    /// Cluster init finished successfully. `nodes` is the number of
    /// members the node observed at the moment of transition.
    Ready {
        /// Number of peers in topology when the transition occurred.
        nodes: usize,
    },
    /// Cluster init failed terminally. `reason` is a short
    /// human-readable description; see `journalctl` for the full
    /// context. Note the module contract allows recovery out of this
    /// state (any → any transitions), so "terminal" means "this init
    /// attempt", not "this process".
    Failed {
        /// One-line reason for the failure.
        reason: String,
    },
}
60
61impl ClusterLifecycleState {
62    /// Short label used in the `state=` dimension of the
63    /// `nodedb_cluster_state` Prometheus gauge. Stable across restarts
64    /// so dashboards don't break.
65    pub fn label(&self) -> &'static str {
66        match self {
67            Self::Starting => "starting",
68            Self::Restarting => "restarting",
69            Self::Bootstrapping => "bootstrapping",
70            Self::Joining { .. } => "joining",
71            Self::Ready { .. } => "ready",
72            Self::Failed { .. } => "failed",
73        }
74    }
75
76    /// `true` only in the `Ready` variant. Used by readiness probes.
77    pub fn is_ready(&self) -> bool {
78        matches!(self, Self::Ready { .. })
79    }
80
81    /// Every label this enum can produce. Used by the metrics
82    /// endpoint to emit a one-hot gauge over the full state space.
83    pub fn all_labels() -> &'static [&'static str] {
84        &[
85            "starting",
86            "restarting",
87            "bootstrapping",
88            "joining",
89            "ready",
90            "failed",
91        ]
92    }
93}
94
/// Thread-safe container for the current `ClusterLifecycleState`.
///
/// Wraps `Arc<RwLock<...>>` so a single tracker can be shared
/// between `start_cluster`, the main binary, and HTTP / metrics
/// readers without any cloning beyond the cheap `Arc` bump —
/// `Clone` here duplicates the handle, not the state.
///
/// Every transition method:
///
/// 1. Takes the write lock.
/// 2. Swaps the stored state.
/// 3. Drops the lock.
/// 4. Emits an `info!` event with `prev`, `new`, and any relevant
///    context fields.
/// 5. Calls `readiness::notify_status(...)` so `systemctl status`
///    shows the new phase without any polling.
#[derive(Debug, Clone)]
pub struct ClusterLifecycleTracker {
    inner: Arc<RwLock<ClusterLifecycleState>>,
}
114
115impl ClusterLifecycleTracker {
116    /// Create a fresh tracker in `Starting` state.
117    pub fn new() -> Self {
118        Self {
119            inner: Arc::new(RwLock::new(ClusterLifecycleState::Starting)),
120        }
121    }
122
123    /// Read the current state. Returns a clone — callers never hold
124    /// the lock across an await or a long loop.
125    pub fn current(&self) -> ClusterLifecycleState {
126        self.inner.read().unwrap_or_else(|p| p.into_inner()).clone()
127    }
128
129    /// `true` iff the tracker has reached `Ready`.
130    pub fn is_ready(&self) -> bool {
131        self.current().is_ready()
132    }
133
134    pub fn to_restarting(&self) {
135        self.transition(ClusterLifecycleState::Restarting, "restart");
136    }
137
138    pub fn to_bootstrapping(&self) {
139        self.transition(
140            ClusterLifecycleState::Bootstrapping,
141            "bootstrapping new cluster",
142        );
143    }
144
145    pub fn to_joining(&self, attempt: u32) {
146        let detail = format!("joining cluster (attempt {attempt})");
147        self.transition(ClusterLifecycleState::Joining { attempt }, &detail);
148    }
149
150    pub fn to_ready(&self, nodes: usize) {
151        let detail = format!("ready ({nodes} nodes)");
152        self.transition(ClusterLifecycleState::Ready { nodes }, &detail);
153    }
154
155    pub fn to_failed(&self, reason: impl Into<String>) {
156        let reason = reason.into();
157        let detail = format!("failed: {reason}");
158        self.transition(ClusterLifecycleState::Failed { reason }, &detail);
159    }
160
161    /// Shared implementation: swap the state, log at INFO, push the
162    /// status string to systemd.
163    fn transition(&self, new: ClusterLifecycleState, human: &str) {
164        let prev = {
165            let mut guard = self.inner.write().unwrap_or_else(|p| p.into_inner());
166            std::mem::replace(&mut *guard, new.clone())
167        };
168        info!(
169            prev = prev.label(),
170            new = new.label(),
171            detail = human,
172            "cluster lifecycle transition"
173        );
174        readiness::notify_status(human);
175    }
176}
177
178impl Default for ClusterLifecycleTracker {
179    fn default() -> Self {
180        Self::new()
181    }
182}
183
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn initial_state_is_starting() {
        let tracker = ClusterLifecycleTracker::new();
        assert!(!tracker.is_ready());
        assert_eq!(tracker.current(), ClusterLifecycleState::Starting);
    }

    #[test]
    fn transition_sequence_logs_and_updates() {
        let tracker = ClusterLifecycleTracker::new();
        for attempt in 0..2 {
            tracker.to_joining(attempt);
            assert_eq!(
                tracker.current(),
                ClusterLifecycleState::Joining { attempt }
            );
        }
        tracker.to_ready(3);
        assert_eq!(tracker.current(), ClusterLifecycleState::Ready { nodes: 3 });
        assert!(tracker.is_ready());
    }

    #[test]
    fn bootstrapping_then_ready() {
        let tracker = ClusterLifecycleTracker::new();
        tracker.to_bootstrapping();
        assert_eq!(tracker.current(), ClusterLifecycleState::Bootstrapping);
        tracker.to_ready(1);
        assert!(tracker.is_ready());
    }

    #[test]
    fn restarting_path() {
        let tracker = ClusterLifecycleTracker::new();
        tracker.to_restarting();
        assert_eq!(tracker.current(), ClusterLifecycleState::Restarting);
        tracker.to_ready(3);
        assert!(tracker.is_ready());
    }

    #[test]
    fn failed_is_not_terminal_by_contract() {
        // Operator recovery (e.g. `force_bootstrap` after a failed
        // join) is a real scenario, so the tracker allows any → any
        // transitions: `Failed → Ready` is legal and is the correct
        // behaviour here.
        let tracker = ClusterLifecycleTracker::new();
        tracker.to_joining(5);
        tracker.to_failed("timeout");
        assert!(matches!(
            tracker.current(),
            ClusterLifecycleState::Failed { .. }
        ));
        tracker.to_ready(3);
        assert_eq!(tracker.current(), ClusterLifecycleState::Ready { nodes: 3 });
    }

    #[test]
    fn labels_are_stable() {
        // Table-driven: each variant pinned to its dashboard label.
        let cases = [
            (ClusterLifecycleState::Starting, "starting"),
            (ClusterLifecycleState::Restarting, "restarting"),
            (ClusterLifecycleState::Bootstrapping, "bootstrapping"),
            (ClusterLifecycleState::Joining { attempt: 0 }, "joining"),
            (ClusterLifecycleState::Ready { nodes: 3 }, "ready"),
            (ClusterLifecycleState::Failed { reason: "x".into() }, "failed"),
        ];
        for (state, expected) in cases {
            assert_eq!(state.label(), expected);
        }
    }

    #[test]
    fn all_labels_matches_variants() {
        // Every variant's label must be present in all_labels, so the
        // Prometheus one-hot gauge covers every state.
        let variants = [
            ClusterLifecycleState::Starting,
            ClusterLifecycleState::Restarting,
            ClusterLifecycleState::Bootstrapping,
            ClusterLifecycleState::Joining { attempt: 0 },
            ClusterLifecycleState::Ready { nodes: 0 },
            ClusterLifecycleState::Failed { reason: "x".into() },
        ];
        for variant in variants {
            let label = variant.label();
            assert!(
                ClusterLifecycleState::all_labels().contains(&label),
                "label {label} missing from all_labels()"
            );
        }
    }

    #[test]
    fn tracker_is_cheap_to_clone() {
        let original = ClusterLifecycleTracker::new();
        let handle = original.clone();
        original.to_bootstrapping();
        // Both handles see the same state because they share an Arc.
        assert_eq!(handle.current(), ClusterLifecycleState::Bootstrapping);
    }
}