Skip to main content

nodedb_cluster/
lifecycle_state.rs

// SPDX-License-Identifier: BUSL-1.1

//! Cluster lifecycle state tracking for observability.
//!
//! `ClusterLifecycleTracker` is the single source of truth for "what
//! phase is this node's cluster init in right now". It is owned by
//! the main binary, passed by reference into `start_cluster` and
//! friends, and read by:
//!
//! - The `/cluster/status` HTTP endpoint.
//! - The `nodedb_cluster_state` Prometheus gauge.
//! - systemd via `readiness::notify_status` so `systemctl status`
//!   surfaces a live phase string during the seconds-to-minutes
//!   window when the cluster is still forming.
//! - INFO-level structured logs — every transition calls `info!` with
//!   the previous state, the new state, and the reason, so a post-
//!   mortem on a flaky deploy can be done from `journalctl` alone.
//!
//! Transitions are validated only in the loose sense that every
//! transition goes through a typed method on the tracker. There is no
//! strict state machine — a cluster can legitimately go
//! `Joining{3} → Failed{"timeout"} → Joining{0} → Ready{3}` if the
//! operator restarts with `force_bootstrap`, so we allow any → any.
25use std::sync::{Arc, RwLock};
26
27use serde::{Deserialize, Serialize};
28use tracing::info;
29
30use crate::readiness;
31
/// Discrete phase of this node's cluster init.
///
/// Internally tagged for serde: serializes as a JSON object whose
/// `"phase"` field is the `snake_case` variant name, with any variant
/// fields inlined alongside it (e.g. `{"phase": "joining", "attempt": 2}`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "phase", rename_all = "snake_case")]
pub enum ClusterLifecycleState {
    /// Tracker just created, no phase decision yet.
    Starting,
    /// Catalog already marked as bootstrapped — loading from disk.
    Restarting,
    /// This node is the elected bootstrapper — creating a fresh cluster.
    Bootstrapping,
    /// Joining an existing cluster. `attempt` counts from 0.
    Joining {
        /// Current join attempt (0-indexed). See
        /// `bootstrap::config::JoinRetryPolicy` for the backoff schedule.
        attempt: u32,
    },
    /// Cluster init finished successfully. `nodes` is the number of
    /// members the node observed at the moment of transition.
    Ready {
        /// Number of peers in topology when the transition occurred.
        nodes: usize,
    },
    /// Cluster init failed terminally. `reason` is a short
    /// human-readable description; see `journalctl` for the full
    /// context.
    Failed {
        /// One-line reason for the failure.
        reason: String,
    },
}
62
63impl ClusterLifecycleState {
64    /// Short label used in the `state=` dimension of the
65    /// `nodedb_cluster_state` Prometheus gauge. Stable across restarts
66    /// so dashboards don't break.
67    pub fn label(&self) -> &'static str {
68        match self {
69            Self::Starting => "starting",
70            Self::Restarting => "restarting",
71            Self::Bootstrapping => "bootstrapping",
72            Self::Joining { .. } => "joining",
73            Self::Ready { .. } => "ready",
74            Self::Failed { .. } => "failed",
75        }
76    }
77
78    /// `true` only in the `Ready` variant. Used by readiness probes.
79    pub fn is_ready(&self) -> bool {
80        matches!(self, Self::Ready { .. })
81    }
82
83    /// Every label this enum can produce. Used by the metrics
84    /// endpoint to emit a one-hot gauge over the full state space.
85    pub fn all_labels() -> &'static [&'static str] {
86        &[
87            "starting",
88            "restarting",
89            "bootstrapping",
90            "joining",
91            "ready",
92            "failed",
93        ]
94    }
95}
96
/// Thread-safe container for the current `ClusterLifecycleState`.
///
/// Wraps `Arc<RwLock<...>>` so a single tracker can be shared
/// between `start_cluster`, the main binary, and HTTP / metrics
/// readers without any cloning beyond the cheap `Arc` bump.
/// (`Clone` on the tracker clones only the `Arc`, so every handle
/// observes the same state.)
///
/// Every transition method:
///
/// 1. Takes the write lock.
/// 2. Swaps the stored state.
/// 3. Drops the lock.
/// 4. Emits an `info!` event with `prev`, `new`, and any relevant
///    context fields.
/// 5. Calls `readiness::notify_status(...)` so `systemctl status`
///    shows the new phase without any polling.
#[derive(Debug, Clone)]
pub struct ClusterLifecycleTracker {
    // Sole source of truth for the phase; accessors recover from lock
    // poisoning because the stored enum has no cross-field invariants.
    inner: Arc<RwLock<ClusterLifecycleState>>,
}
116
117impl ClusterLifecycleTracker {
118    /// Create a fresh tracker in `Starting` state.
119    pub fn new() -> Self {
120        Self {
121            inner: Arc::new(RwLock::new(ClusterLifecycleState::Starting)),
122        }
123    }
124
125    /// Read the current state. Returns a clone — callers never hold
126    /// the lock across an await or a long loop.
127    pub fn current(&self) -> ClusterLifecycleState {
128        self.inner.read().unwrap_or_else(|p| p.into_inner()).clone()
129    }
130
131    /// `true` iff the tracker has reached `Ready`.
132    pub fn is_ready(&self) -> bool {
133        self.current().is_ready()
134    }
135
136    pub fn to_restarting(&self) {
137        self.transition(ClusterLifecycleState::Restarting, "restart");
138    }
139
140    pub fn to_bootstrapping(&self) {
141        self.transition(
142            ClusterLifecycleState::Bootstrapping,
143            "bootstrapping new cluster",
144        );
145    }
146
147    pub fn to_joining(&self, attempt: u32) {
148        let detail = format!("joining cluster (attempt {attempt})");
149        self.transition(ClusterLifecycleState::Joining { attempt }, &detail);
150    }
151
152    pub fn to_ready(&self, nodes: usize) {
153        let detail = format!("ready ({nodes} nodes)");
154        self.transition(ClusterLifecycleState::Ready { nodes }, &detail);
155    }
156
157    pub fn to_failed(&self, reason: impl Into<String>) {
158        let reason = reason.into();
159        let detail = format!("failed: {reason}");
160        self.transition(ClusterLifecycleState::Failed { reason }, &detail);
161    }
162
163    /// Shared implementation: swap the state, log at INFO, push the
164    /// status string to systemd.
165    fn transition(&self, new: ClusterLifecycleState, human: &str) {
166        let prev = {
167            let mut guard = self.inner.write().unwrap_or_else(|p| p.into_inner());
168            std::mem::replace(&mut *guard, new.clone())
169        };
170        info!(
171            prev = prev.label(),
172            new = new.label(),
173            detail = human,
174            "cluster lifecycle transition"
175        );
176        readiness::notify_status(human);
177    }
178}
179
180impl Default for ClusterLifecycleTracker {
181    fn default() -> Self {
182        Self::new()
183    }
184}
185
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn initial_state_is_starting() {
        let tracker = ClusterLifecycleTracker::new();
        assert_eq!(tracker.current(), ClusterLifecycleState::Starting);
        assert!(!tracker.is_ready());
    }

    #[test]
    fn transition_sequence_logs_and_updates() {
        let tracker = ClusterLifecycleTracker::new();
        // Two consecutive join attempts, then success.
        for attempt in 0..2 {
            tracker.to_joining(attempt);
            assert_eq!(
                tracker.current(),
                ClusterLifecycleState::Joining { attempt }
            );
        }
        tracker.to_ready(3);
        assert_eq!(tracker.current(), ClusterLifecycleState::Ready { nodes: 3 });
        assert!(tracker.is_ready());
    }

    #[test]
    fn bootstrapping_then_ready() {
        let tracker = ClusterLifecycleTracker::new();
        tracker.to_bootstrapping();
        assert_eq!(tracker.current(), ClusterLifecycleState::Bootstrapping);
        tracker.to_ready(1);
        assert!(tracker.is_ready());
    }

    #[test]
    fn restarting_path() {
        let tracker = ClusterLifecycleTracker::new();
        tracker.to_restarting();
        assert_eq!(tracker.current(), ClusterLifecycleState::Restarting);
        tracker.to_ready(3);
        assert!(tracker.is_ready());
    }

    #[test]
    fn failed_is_not_terminal_by_contract() {
        // Operator recovery (e.g. `force_bootstrap` after a failed
        // join) is a real scenario, so the tracker allows any → any
        // transitions: `Failed → Ready` is legal and is the correct
        // behaviour here.
        let tracker = ClusterLifecycleTracker::new();
        tracker.to_joining(5);
        tracker.to_failed("timeout");
        assert!(matches!(
            tracker.current(),
            ClusterLifecycleState::Failed { .. }
        ));
        tracker.to_ready(3);
        assert_eq!(tracker.current(), ClusterLifecycleState::Ready { nodes: 3 });
    }

    #[test]
    fn labels_are_stable() {
        // Table-driven: each variant paired with its expected label.
        let cases = [
            (ClusterLifecycleState::Starting, "starting"),
            (ClusterLifecycleState::Restarting, "restarting"),
            (ClusterLifecycleState::Bootstrapping, "bootstrapping"),
            (ClusterLifecycleState::Joining { attempt: 0 }, "joining"),
            (ClusterLifecycleState::Ready { nodes: 3 }, "ready"),
            (ClusterLifecycleState::Failed { reason: "x".into() }, "failed"),
        ];
        for (state, expected) in cases {
            assert_eq!(state.label(), expected);
        }
    }

    #[test]
    fn all_labels_matches_variants() {
        // Every variant's label must be present in all_labels, so the
        // Prometheus one-hot gauge covers every state.
        let variants = [
            ClusterLifecycleState::Starting,
            ClusterLifecycleState::Restarting,
            ClusterLifecycleState::Bootstrapping,
            ClusterLifecycleState::Joining { attempt: 0 },
            ClusterLifecycleState::Ready { nodes: 0 },
            ClusterLifecycleState::Failed { reason: "x".into() },
        ];
        for variant in variants {
            assert!(
                ClusterLifecycleState::all_labels().contains(&variant.label()),
                "label {} missing from all_labels()",
                variant.label()
            );
        }
    }

    #[test]
    fn tracker_is_cheap_to_clone() {
        let original = ClusterLifecycleTracker::new();
        let handle = original.clone();
        original.to_bootstrapping();
        // Both handles see the same state because they share an Arc.
        assert_eq!(handle.current(), ClusterLifecycleState::Bootstrapping);
    }
}