Skip to main content

dig_service/
shutdown.rs

1//! Shutdown coordination.
2//!
3//! [`ShutdownToken`] is a thin wrapper over [`CancellationToken`](tokio_util::sync::CancellationToken)
4//! that adds a typed reason. Cancellation is broadcast: every clone sees
5//! the same flip when `.cancel(reason)` is called on any one of them.
6//!
7//! # Reasons
8//!
9//! [`ShutdownReason`] distinguishes *why* the service is shutting down so
10//! the exiting binary can set the right process exit code (0 for graceful,
11//! non-zero for fatal).
12//!
13//! # Idempotency
14//!
15//! `.cancel(reason)` records only the **first** reason; subsequent calls are
16//! no-ops. This matches the "stop is stop" invariant — you can't un-shutdown.
17
18use std::sync::Arc;
19
20use parking_lot::RwLock;
21use tokio_util::sync::CancellationToken;
22
23/// A cancellation token with a typed [`ShutdownReason`].
24///
25/// Clone-to-share; every clone observes the same cancellation.
26#[derive(Clone, Debug)]
27pub struct ShutdownToken {
28    inner: CancellationToken,
29    reason: Arc<RwLock<Option<ShutdownReason>>>,
30}
31
32impl ShutdownToken {
33    /// Construct a fresh, uncancelled token.
34    pub fn new() -> Self {
35        Self {
36            inner: CancellationToken::new(),
37            reason: Arc::new(RwLock::new(None)),
38        }
39    }
40
41    /// Non-blocking check.
42    pub fn is_cancelled(&self) -> bool {
43        self.inner.is_cancelled()
44    }
45
46    /// Awaitable trigger; completes as soon as any clone calls `.cancel`.
47    pub async fn cancelled(&self) {
48        self.inner.cancelled().await
49    }
50
51    /// Trigger shutdown. Idempotent — only the **first** call records its
52    /// reason; later calls are no-ops.
53    pub fn cancel(&self, reason: ShutdownReason) {
54        // Record reason atomically before flipping the token so anyone
55        // waking up from `cancelled().await` sees a populated `reason()`.
56        {
57            let mut w = self.reason.write();
58            if w.is_none() {
59                *w = Some(reason);
60            }
61        }
62        self.inner.cancel();
63    }
64
65    /// Read the first-recorded reason, if any.
66    pub fn reason(&self) -> Option<ShutdownReason> {
67        self.reason.read().clone()
68    }
69
70    /// Construct a child token that inherits cancellation from `self`.
71    ///
72    /// Cancelling the child does NOT cancel the parent. Useful when a
73    /// subsystem wants to shut its own subtasks down without affecting the
74    /// outer service.
75    pub fn child_token(&self) -> ShutdownToken {
76        Self {
77            inner: self.inner.child_token(),
78            reason: self.reason.clone(),
79        }
80    }
81}
82
83impl Default for ShutdownToken {
84    fn default() -> Self {
85        Self::new()
86    }
87}
88
/// Why the service is shutting down.
///
/// Binaries map these to process exit codes:
/// - `UserRequested` / `RequestedByRun` / `ReloadRequested` → 0.
/// - `RpcRequested` → 0.
/// - `Fatal(_)` → non-zero.
///
/// Reasons compare with `==`: two values are equal when they are the same
/// variant and, for `Fatal`, carry the same message.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ShutdownReason {
    /// SIGINT / SIGTERM / Ctrl-C.
    UserRequested,
    /// An admin RPC method (`stop_node`) requested shutdown.
    RpcRequested,
    /// A reload was requested; the binary should exit and re-spawn.
    ReloadRequested,
    /// The node's `run` method returned without any outside signal; the
    /// service treats that as a graceful exit.
    RequestedByRun,
    /// An unrecoverable internal error. The string is human-readable.
    Fatal(String),
}
109
110/// Why the service exited.
111///
112/// Richer than [`ShutdownReason`] because it also captures the "ran to
113/// completion normally" and "run method returned an error" cases.
114#[derive(Clone, Debug)]
115pub enum ExitReason {
116    /// Shutdown was requested externally; the carried reason explains how.
117    RequestedShutdown(ShutdownReason),
118    /// The node's `run` method returned `Ok(())` without any outside signal.
119    RunCompleted,
120    /// The node's `run` method returned an error. The `Arc<anyhow::Error>`
121    /// carries the original.
122    RunError(Arc<anyhow::Error>),
123}
124
125/// The final status returned by `Service::start`.
126#[derive(Clone, Debug)]
127pub struct ExitStatus {
128    /// Why the service exited.
129    pub reason: ExitReason,
130}
131
132impl ExitStatus {
133    /// Whether this exit was graceful (i.e., not `RunError` and not `Fatal`).
134    ///
135    /// Binaries can use this to decide on exit code 0 vs 1.
136    pub fn is_graceful(&self) -> bool {
137        !matches!(
138            self.reason,
139            ExitReason::RunError(_) | ExitReason::RequestedShutdown(ShutdownReason::Fatal(_))
140        )
141    }
142}
143
#[cfg(test)]
mod tests {
    use super::*;

    /// **Proves:** a fresh `ShutdownToken` is not cancelled and has no
    /// recorded reason.
    ///
    /// **Why it matters:** This is the state every token starts in. A token
    /// that is born cancelled would make its owner believe shutdown was
    /// already requested.
    ///
    /// **Catches:** accidentally swapping `CancellationToken::new()` for a
    /// pre-cancelled token, or seeding the reason with `Some(_)`.
    #[test]
    fn fresh_token_is_uncancelled() {
        let token = ShutdownToken::new();
        assert!(!token.is_cancelled());
        assert!(token.reason().is_none());
    }

    /// **Proves:** calling `.cancel(reason)` flips `is_cancelled` and
    /// records the reason.
    ///
    /// **Why it matters:** The recorded reason is how binaries decide their
    /// exit code. If `cancel` failed to record it, every shutdown would
    /// look like "unknown".
    ///
    /// **Catches:** `cancel` forgetting to store the reason or to cancel the
    /// inner token. This test is single-threaded, so it does NOT detect a
    /// reordering of those two steps.
    #[test]
    fn cancel_sets_state() {
        let token = ShutdownToken::new();
        token.cancel(ShutdownReason::UserRequested);
        assert!(token.is_cancelled());
        assert!(matches!(
            token.reason(),
            Some(ShutdownReason::UserRequested)
        ));
    }

    /// **Proves:** only the first `.cancel` call records its reason;
    /// subsequent calls do not overwrite it.
    ///
    /// **Why it matters:** If a user-requested shutdown is already in flight
    /// and an anomaly then reports `Fatal("...")`, the benign first reason
    /// must stay — the later failure is a consequence of the teardown.
    ///
    /// **Catches:** a regression where `.cancel` unconditionally overwrites
    /// the reason.
    #[test]
    fn cancel_reason_is_first_wins() {
        let token = ShutdownToken::new();
        token.cancel(ShutdownReason::UserRequested);
        token.cancel(ShutdownReason::Fatal("too late".to_string()));
        assert!(token.is_cancelled());
        assert!(matches!(
            token.reason(),
            Some(ShutdownReason::UserRequested)
        ));
    }

    /// **Proves:** cloning a `ShutdownToken` produces handles that share
    /// cancellation state and the recorded reason.
    ///
    /// **Why it matters:** Shutdown is requested through clones of the
    /// token. If a clone had independent state, cancelling it would flip a
    /// token nobody awaits.
    ///
    /// **Catches:** a `Clone` impl that builds a new inner token or a new
    /// reason cell instead of sharing the existing ones.
    #[test]
    fn clone_shares_state() {
        let original = ShutdownToken::new();
        let copy = original.clone();
        assert!(!original.is_cancelled());
        assert!(!copy.is_cancelled());

        copy.cancel(ShutdownReason::RpcRequested);

        assert!(original.is_cancelled());
        assert!(copy.is_cancelled());
        assert!(matches!(
            original.reason(),
            Some(ShutdownReason::RpcRequested)
        ));
    }

    /// **Proves:** a token made with `child_token` becomes cancelled when
    /// its parent is cancelled, and reports the parent's reason.
    ///
    /// **Why it matters:** Subtasks that hold a child token must stop when
    /// the outer service stops, and must be able to tell why.
    ///
    /// **Catches:** `child_token` returning an unrelated token, or a child
    /// that loses sight of the reason recorded on its parent.
    #[test]
    fn child_token_inherits_parent_cancellation() {
        let parent = ShutdownToken::new();
        let child = parent.child_token();
        assert!(!child.is_cancelled());
        assert!(child.reason().is_none());

        parent.cancel(ShutdownReason::ReloadRequested);

        assert!(child.is_cancelled());
        assert!(matches!(
            child.reason(),
            Some(ShutdownReason::ReloadRequested)
        ));
    }

    /// **Proves:** cancelling a child token cancels the child but leaves the
    /// parent uncancelled.
    ///
    /// **Why it matters:** `child_token` exists so a subsystem can stop its
    /// own subtasks without taking the outer service down.
    ///
    /// **Catches:** `child_token` handing out a plain clone of the parent.
    #[test]
    fn cancelling_child_leaves_parent_running() {
        let parent = ShutdownToken::new();
        let child = parent.child_token();

        child.cancel(ShutdownReason::RpcRequested);

        assert!(child.is_cancelled());
        assert!(!parent.is_cancelled());
    }

    /// **Proves:** `ExitStatus::is_graceful` is `true` for `RunCompleted`
    /// and for every non-`Fatal` requested shutdown, and `false` for
    /// `RunError` and `Fatal`.
    ///
    /// **Why it matters:** This method drives the process exit code. A
    /// mis-classification could make an orchestrator (systemd, k8s) restart
    /// a binary that exited cleanly, or ignore one that failed.
    ///
    /// **Catches:** swapped or inverted classification for any variant that
    /// exists today. Variants added later need their own case here.
    #[test]
    fn exit_status_is_graceful_classifies_correctly() {
        let completed = ExitStatus {
            reason: ExitReason::RunCompleted,
        };
        assert!(completed.is_graceful());

        let benign = [
            ShutdownReason::UserRequested,
            ShutdownReason::RpcRequested,
            ShutdownReason::ReloadRequested,
            ShutdownReason::RequestedByRun,
        ];
        for why in benign {
            let requested = ExitStatus {
                reason: ExitReason::RequestedShutdown(why),
            };
            assert!(requested.is_graceful());
        }

        let fatal = ExitStatus {
            reason: ExitReason::RequestedShutdown(ShutdownReason::Fatal("x".into())),
        };
        assert!(!fatal.is_graceful());

        let run_err = ExitStatus {
            reason: ExitReason::RunError(Arc::new(anyhow::anyhow!("boom"))),
        };
        assert!(!run_err.is_graceful());
    }
}