dig_service/shutdown.rs
1//! Shutdown coordination.
2//!
3//! [`ShutdownToken`] is a thin wrapper over [`CancellationToken`](tokio_util::sync::CancellationToken)
4//! that adds a typed reason. Cancellation is broadcast: every clone sees
5//! the same flip when `.cancel(reason)` is called on any one of them.
6//!
7//! # Reasons
8//!
9//! [`ShutdownReason`] distinguishes *why* the service is shutting down so
10//! the exiting binary can set the right process exit code (0 for graceful,
11//! non-zero for fatal).
12//!
13//! # Idempotency
14//!
15//! `.cancel(reason)` records only the **first** reason; subsequent calls are
16//! no-ops. This matches the "stop is stop" invariant — you can't un-shutdown.
17
18use std::sync::Arc;
19
20use parking_lot::RwLock;
21use tokio_util::sync::CancellationToken;
22
23/// A cancellation token with a typed [`ShutdownReason`].
24///
25/// Clone-to-share; every clone observes the same cancellation.
26#[derive(Clone, Debug)]
27pub struct ShutdownToken {
28 inner: CancellationToken,
29 reason: Arc<RwLock<Option<ShutdownReason>>>,
30}
31
32impl ShutdownToken {
33 /// Construct a fresh, uncancelled token.
34 pub fn new() -> Self {
35 Self {
36 inner: CancellationToken::new(),
37 reason: Arc::new(RwLock::new(None)),
38 }
39 }
40
41 /// Non-blocking check.
42 pub fn is_cancelled(&self) -> bool {
43 self.inner.is_cancelled()
44 }
45
46 /// Awaitable trigger; completes as soon as any clone calls `.cancel`.
47 pub async fn cancelled(&self) {
48 self.inner.cancelled().await
49 }
50
51 /// Trigger shutdown. Idempotent — only the **first** call records its
52 /// reason; later calls are no-ops.
53 pub fn cancel(&self, reason: ShutdownReason) {
54 // Record reason atomically before flipping the token so anyone
55 // waking up from `cancelled().await` sees a populated `reason()`.
56 {
57 let mut w = self.reason.write();
58 if w.is_none() {
59 *w = Some(reason);
60 }
61 }
62 self.inner.cancel();
63 }
64
65 /// Read the first-recorded reason, if any.
66 pub fn reason(&self) -> Option<ShutdownReason> {
67 self.reason.read().clone()
68 }
69
70 /// Construct a child token that inherits cancellation from `self`.
71 ///
72 /// Cancelling the child does NOT cancel the parent. Useful when a
73 /// subsystem wants to shut its own subtasks down without affecting the
74 /// outer service.
75 pub fn child_token(&self) -> ShutdownToken {
76 Self {
77 inner: self.inner.child_token(),
78 reason: self.reason.clone(),
79 }
80 }
81}
82
83impl Default for ShutdownToken {
84 fn default() -> Self {
85 Self::new()
86 }
87}
88
/// Why the service is shutting down.
///
/// Binaries map these to process exit codes:
/// - `UserRequested` / `RpcRequested` / `ReloadRequested` / `RequestedByRun` → 0.
/// - `Fatal(_)` → non-zero.
///
/// Reasons compare by value (`Fatal` compares its message), so callers and
/// tests can use `==` / `assert_eq!` instead of pattern matching.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ShutdownReason {
    /// SIGINT / SIGTERM / Ctrl-C.
    UserRequested,
    /// An admin RPC method (`stop_node`) requested shutdown.
    RpcRequested,
    /// A reload was requested; the binary should exit and re-spawn.
    ReloadRequested,
    /// The node's `run` method returned without any outside signal; the
    /// service treats that as a graceful exit.
    RequestedByRun,
    /// An unrecoverable internal error. The string is human-readable.
    Fatal(String),
}
109
110/// Why the service exited.
111///
112/// Richer than [`ShutdownReason`] because it also captures the "ran to
113/// completion normally" and "run method returned an error" cases.
114#[derive(Clone, Debug)]
115pub enum ExitReason {
116 /// Shutdown was requested externally; the carried reason explains how.
117 RequestedShutdown(ShutdownReason),
118 /// The node's `run` method returned `Ok(())` without any outside signal.
119 RunCompleted,
120 /// The node's `run` method returned an error. The `Arc<anyhow::Error>`
121 /// carries the original.
122 RunError(Arc<anyhow::Error>),
123}
124
125/// The final status returned by `Service::start`.
126#[derive(Clone, Debug)]
127pub struct ExitStatus {
128 /// Why the service exited.
129 pub reason: ExitReason,
130}
131
132impl ExitStatus {
133 /// Whether this exit was graceful (i.e., not `RunError` and not `Fatal`).
134 ///
135 /// Binaries can use this to decide on exit code 0 vs 1.
136 pub fn is_graceful(&self) -> bool {
137 !matches!(
138 self.reason,
139 ExitReason::RunError(_) | ExitReason::RequestedShutdown(ShutdownReason::Fatal(_))
140 )
141 }
142}
143
#[cfg(test)]
mod tests {
    use super::*;

    /// **Proves:** a fresh `ShutdownToken` is not cancelled and has no
    /// recorded reason.
    ///
    /// **Why it matters:** This is the default state every service starts
    /// in. A regression that pre-sets the cancel flag would cause every
    /// newly-constructed `Service` to believe shutdown was already
    /// requested and skip `run` entirely.
    ///
    /// **Catches:** accidentally swapping `CancellationToken::new()` for
    /// an already-cancelled token, or seeding the reason slot with a value.
    #[test]
    fn fresh_token_is_uncancelled() {
        let t = ShutdownToken::new();
        assert!(!t.is_cancelled());
        assert!(t.reason().is_none());
    }

    /// **Proves:** `ShutdownToken::default()` starts in the same state as
    /// `ShutdownToken::new()`: not cancelled, no reason.
    ///
    /// **Catches:** a `Default` impl that drifts away from `new`.
    #[test]
    fn default_token_is_uncancelled() {
        let t = ShutdownToken::default();
        assert!(!t.is_cancelled());
        assert!(t.reason().is_none());
    }

    /// **Proves:** calling `.cancel(reason)` flips `is_cancelled` and
    /// records the reason.
    ///
    /// **Why it matters:** The "which reason did we shut down for" field
    /// is how binaries decide their exit code. If `cancel` ever failed to
    /// record the reason, every shutdown would look like "unknown".
    ///
    /// **Catches:** a regression where `cancel` flips the token without
    /// storing the reason, or stores the reason without flipping the token.
    /// (The ordering race between the two steps is not observable from this
    /// single-threaded test.)
    #[test]
    fn cancel_sets_state() {
        let t = ShutdownToken::new();
        t.cancel(ShutdownReason::UserRequested);
        assert!(t.is_cancelled());
        assert!(matches!(t.reason(), Some(ShutdownReason::UserRequested)));
    }

    /// **Proves:** only the first `.cancel` call records its reason;
    /// subsequent calls do not overwrite it.
    ///
    /// **Why it matters:** If someone calls `Service::request_shutdown(User)`
    /// and then an anomaly triggers `Service::request_shutdown(Fatal("..."))`,
    /// we want the **first** reason recorded — the shutdown was already in
    /// flight for a benign reason, and the Fatal is a consequence of being
    /// torn down mid-flight.
    ///
    /// **Catches:** a regression where `.cancel` unconditionally overwrites
    /// the reason.
    #[test]
    fn cancel_reason_is_first_wins() {
        let t = ShutdownToken::new();
        t.cancel(ShutdownReason::UserRequested);
        t.cancel(ShutdownReason::Fatal("too late".to_string()));
        assert!(t.is_cancelled());
        assert!(matches!(t.reason(), Some(ShutdownReason::UserRequested)));
    }

    /// **Proves:** cloning a `ShutdownToken` produces handles that share
    /// cancellation state and the recorded reason.
    ///
    /// **Why it matters:** `ServiceHandle::request_shutdown` works by holding
    /// a clone of the token. If the clone had independent state, calling
    /// `handle.request_shutdown()` would flip a token nobody awaits.
    ///
    /// **Catches:** a regression where `Clone` for `ShutdownToken` produces
    /// an independent token or an independent reason slot.
    #[test]
    fn clone_shares_state() {
        let a = ShutdownToken::new();
        let b = a.clone();
        assert!(!a.is_cancelled() && !b.is_cancelled());
        b.cancel(ShutdownReason::RpcRequested);
        assert!(a.is_cancelled() && b.is_cancelled());
        assert!(matches!(a.reason(), Some(ShutdownReason::RpcRequested)));
        assert!(matches!(b.reason(), Some(ShutdownReason::RpcRequested)));
    }

    /// **Proves:** cancelling a parent token cancels its child tokens, and
    /// the child reports the parent's reason.
    ///
    /// **Why it matters:** subsystems that hand child tokens to their
    /// subtasks rely on a service-wide shutdown reaching those subtasks.
    ///
    /// **Catches:** `child_token` returning an unrelated token.
    #[test]
    fn parent_cancel_reaches_child() {
        let parent = ShutdownToken::new();
        let child = parent.child_token();
        assert!(!child.is_cancelled());
        parent.cancel(ShutdownReason::UserRequested);
        assert!(parent.is_cancelled());
        assert!(child.is_cancelled());
        assert!(matches!(child.reason(), Some(ShutdownReason::UserRequested)));
    }

    /// **Proves:** cancelling a child token leaves the parent uncancelled.
    ///
    /// **Why it matters:** `child_token` is documented as the way for a
    /// subsystem to stop its own subtasks without stopping the outer
    /// service.
    ///
    /// **Catches:** `child_token` handing out a plain clone of the parent.
    #[test]
    fn child_cancel_does_not_cancel_parent() {
        let parent = ShutdownToken::new();
        let child = parent.child_token();
        child.cancel(ShutdownReason::RpcRequested);
        assert!(child.is_cancelled());
        assert!(!parent.is_cancelled());
        assert!(matches!(child.reason(), Some(ShutdownReason::RpcRequested)));
    }

    /// **Proves:** `ExitStatus::is_graceful` is `true` for `RunCompleted`
    /// and for a requested shutdown with any of the exit-0 reasons
    /// (`UserRequested`, `RpcRequested`, `ReloadRequested`,
    /// `RequestedByRun`), and `false` for `RunError` and `Fatal`.
    ///
    /// **Why it matters:** This method drives the process exit code. Any
    /// mis-classification could cause orchestrators (systemd, k8s) to
    /// infinitely restart a binary that exited cleanly.
    ///
    /// **Catches:** swapping the match arms, or classifying one of the
    /// documented exit-0 reasons as non-graceful.
    #[test]
    fn exit_status_is_graceful_classifies_correctly() {
        let completed = ExitStatus {
            reason: ExitReason::RunCompleted,
        };
        assert!(completed.is_graceful());

        let exit_zero_reasons = [
            ShutdownReason::UserRequested,
            ShutdownReason::RpcRequested,
            ShutdownReason::ReloadRequested,
            ShutdownReason::RequestedByRun,
        ];
        for why in exit_zero_reasons {
            let status = ExitStatus {
                reason: ExitReason::RequestedShutdown(why.clone()),
            };
            assert!(status.is_graceful(), "{why:?} should be graceful");
        }

        let fatal = ExitStatus {
            reason: ExitReason::RequestedShutdown(ShutdownReason::Fatal("x".into())),
        };
        assert!(!fatal.is_graceful());

        let run_err = ExitStatus {
            reason: ExitReason::RunError(Arc::new(anyhow::anyhow!("boom"))),
        };
        assert!(!run_err.is_graceful());
    }
}