task_killswitch/
lib.rs

1// Copyright (C) 2025, Cloudflare, Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright notice,
9//       this list of conditions and the following disclaimer.
10//
11//     * Redistributions in binary form must reproduce the above copyright
12//       notice, this list of conditions and the following disclaimer in the
13//       documentation and/or other materials provided with the distribution.
14//
15// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
16// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27use dashmap::DashMap;
28use parking_lot::Mutex;
29use tokio::sync::watch;
30use tokio::task;
31use tokio::task::AbortHandle;
32use tokio::task::Id;
33
34use std::future::Future;
35use std::sync::atomic::AtomicBool;
36use std::sync::atomic::Ordering;
37use std::sync::LazyLock;
38
39/// Drop guard for task removal. If a task panics, this makes sure
40/// it is removed from [`ActiveTasks`] properly.
41struct RemoveOnDrop {
42    id: task::Id,
43    storage: &'static ActiveTasks,
44}
45impl Drop for RemoveOnDrop {
46    fn drop(&mut self) {
47        self.storage.remove_task(self.id);
48    }
49}
50
51/// A task killswitch that allows aborting all the tasks spawned with it at
52/// once. The implementation strives to minimize in-band locking. Spawning a
53/// future requires a single sharded lock from an internal [`DashMap`].
54/// Conflicts are expected to be very rare (dashmap defaults to `4 * nproc`
55/// shards, while each thread can only spawn one task at a time.)
56struct TaskKillswitch {
57    // Invariant: If `activated` is true, we don't add new tasks anymore.
58    activated: AtomicBool,
59    storage: &'static ActiveTasks,
60
61    /// Watcher that is triggered after all kill signals have been sent (by
62    /// dropping `signal_killed`.) Currently-running tasks are killed after
63    /// their next yield, which may be after this triggers.
64    all_killed: watch::Receiver<()>,
65    // NOTE: All we want here is to take ownership of `signal_killed` when
66    // activating the killswitch. That code path only runs once per instance, but
67    // requires interior mutability. Using `Mutex` is easier than bothering with
68    // an `UnsafeCell`. The mutex is guaranteed to be unlocked.
69    signal_killed: Mutex<Option<watch::Sender<()>>>,
70}
71
72impl TaskKillswitch {
73    fn new(storage: &'static ActiveTasks) -> Self {
74        let (signal_killed, all_killed) = watch::channel(());
75        let signal_killed = Mutex::new(Some(signal_killed));
76
77        Self {
78            activated: AtomicBool::new(false),
79            storage,
80            signal_killed,
81            all_killed,
82        }
83    }
84
85    /// Creates a killswitch by allocating and leaking the task storage.
86    ///
87    /// **NOTE:** This is intended for use in `static`s and tests. It should not
88    /// be exposed publicly!
89    fn with_leaked_storage() -> Self {
90        let storage = Box::leak(Box::new(ActiveTasks::default()));
91        Self::new(storage)
92    }
93
94    fn was_activated(&self) -> bool {
95        // All synchronization is done using locks,
96        // so we can use relaxed for our atomics.
97        self.activated.load(Ordering::Relaxed)
98    }
99
100    fn spawn_task(
101        &self, fut: impl Future<Output = ()> + Send + 'static,
102    ) -> Option<Id> {
103        if self.was_activated() {
104            return None;
105        }
106
107        let storage = self.storage;
108        let handle = tokio::spawn(async move {
109            let id = task::id();
110            let _guard = RemoveOnDrop { id, storage };
111            fut.await;
112        })
113        .abort_handle();
114
115        let id = handle.id();
116
117        let res = self.storage.add_task_if(handle, || !self.was_activated());
118        if let Err(handle) = res {
119            // Killswitch was activated by the time we got a lock on the map shard
120            handle.abort();
121            return None;
122        }
123        Some(id)
124    }
125
126    fn activate(&self) {
127        // We check `activated` after locking the map shard and before inserting
128        // an element. This ensures in-progress spawns either complete before
129        // `tasks.kill_all()` obtains the lock for that shard, or they abort
130        // afterwards.
131        assert!(
132            !self.activated.swap(true, Ordering::Relaxed),
133            "killswitch can't be used twice"
134        );
135
136        let tasks = self.storage;
137        let signal_killed = self.signal_killed.lock().take();
138        std::thread::spawn(move || {
139            tasks.kill_all();
140            drop(signal_killed);
141        });
142    }
143
144    fn killed(&self) -> impl Future<Output = ()> + Send + 'static {
145        let mut signal = self.all_killed.clone();
146        async move {
147            let _ = signal.changed().await;
148        }
149    }
150}
151
152enum TaskEntry {
153    /// Task was added and not yet removed.
154    Handle(AbortHandle),
155    /// Task was removed before it was added. This can happen if a spawned
156    /// future completes before the spawning thread can add it to the map.
157    Tombstone,
158}
159
160#[derive(Default)]
161struct ActiveTasks {
162    tasks: DashMap<task::Id, TaskEntry>,
163}
164
165impl ActiveTasks {
166    fn kill_all(&self) {
167        self.tasks.retain(|_, entry| {
168            if let TaskEntry::Handle(task) = entry {
169                task.abort();
170            }
171            false // remove all elements
172        });
173    }
174
175    fn add_task_if(
176        &self, handle: AbortHandle, cond: impl FnOnce() -> bool,
177    ) -> Result<(), AbortHandle> {
178        use dashmap::Entry::*;
179        let id = handle.id();
180
181        match self.tasks.entry(id) {
182            Vacant(e) => {
183                if !cond() {
184                    return Err(handle);
185                }
186                e.insert(TaskEntry::Handle(handle));
187            },
188            Occupied(e) if matches!(e.get(), TaskEntry::Tombstone) => {
189                // Task was removed before it was added. Clear the map entry and
190                // drop the handle.
191                e.remove();
192            },
193            Occupied(_) => panic!("tokio task ID already in use: {id}"),
194        }
195
196        Ok(())
197    }
198
199    fn remove_task(&self, id: task::Id) {
200        use dashmap::Entry::*;
201        match self.tasks.entry(id) {
202            Vacant(e) => {
203                // Task was not added yet, set a tombstone instead.
204                e.insert(TaskEntry::Tombstone);
205            },
206            Occupied(e) if matches!(e.get(), TaskEntry::Tombstone) => {},
207            Occupied(e) => {
208                e.remove();
209            },
210        }
211    }
212}
213
214/// The global [`TaskKillswitch`] exposed publicly from the crate.
215static TASK_KILLSWITCH: LazyLock<TaskKillswitch> =
216    LazyLock::new(TaskKillswitch::with_leaked_storage);
217
218/// Spawns a new asynchronous task and registers it in the crate's global
219/// killswitch.
220///
221/// Under the hood, [`tokio::spawn`] schedules the actual execution.
222#[inline]
223pub fn spawn_with_killswitch(
224    fut: impl Future<Output = ()> + Send + 'static,
225) -> Option<Id> {
226    TASK_KILLSWITCH.spawn_task(fut)
227}
228
229#[deprecated = "activate() was unnecessarily declared async. Use activate_now() instead."]
230pub async fn activate() {
231    TASK_KILLSWITCH.activate()
232}
233
234/// Triggers the killswitch, thereby scheduling all registered tasks to be
235/// killed.
236///
237/// Note: tasks are not killed synchronously in this function. This means
238/// `activate_now()` will return before all tasks have been stopped.
239#[inline]
240pub fn activate_now() {
241    TASK_KILLSWITCH.activate();
242}
243
244/// Returns a future that resolves when all registered tasks have been killed,
245/// after [`activate_now`] has been called.
246///
247/// Note: tokio does not kill a task until the next time it yields to the
248/// runtime. This means some killed tasks may still be running by the time this
249/// Future resolves.
250#[inline]
251pub fn killed_signal() -> impl Future<Output = ()> + Send + 'static {
252    TASK_KILLSWITCH.killed()
253}
254
#[cfg(test)]
mod tests {
    use super::*;
    use futures_util::future;
    use std::time::Duration;
    use tokio::sync::oneshot;

    /// Sends `()` on its channel when dropped. Moving it into a task lets a
    /// test observe the moment the task's future is destroyed.
    struct TaskAbortSignal(Option<oneshot::Sender<()>>);

    impl TaskAbortSignal {
        /// Creates the signal together with the receiver that observes it.
        fn new() -> (Self, oneshot::Receiver<()>) {
            let (sender, receiver) = oneshot::channel();
            let signal = Self(Some(sender));

            (signal, receiver)
        }
    }

    impl Drop for TaskAbortSignal {
        fn drop(&mut self) {
            let sender = self.0.take().unwrap();
            // The receiver may be gone already; that is not an error here.
            let _ = sender.send(());
        }
    }

    /// Spawns 1000 tasks on `killswitch` that each sleep for an hour, and
    /// returns one receiver per task that fires when the task is dropped.
    fn start_test_tasks(
        killswitch: &TaskKillswitch,
    ) -> Vec<oneshot::Receiver<()>> {
        let mut receivers = Vec::with_capacity(1000);

        for _ in 0..1000 {
            let (signal, receiver) = TaskAbortSignal::new();

            killswitch.spawn_task(async move {
                tokio::time::sleep(Duration::from_secs(3600)).await;
                drop(signal);
            });

            receivers.push(receiver);
        }

        receivers
    }

    /// Activating right after spawning must still kill every task.
    #[tokio::test]
    async fn activate_killswitch_early() {
        let killswitch = TaskKillswitch::with_leaked_storage();
        let abort_signals = start_test_tasks(&killswitch);

        killswitch.activate();

        let all_aborted = future::join_all(abort_signals);
        tokio::time::timeout(Duration::from_secs(1), all_aborted)
            .await
            .expect("tasks should be killed within given timeframe");
    }

    /// Activating after the tasks had time to start must kill every task and
    /// resolve the `killed()` future.
    #[tokio::test]
    async fn activate_killswitch_with_delay() {
        let killswitch = TaskKillswitch::with_leaked_storage();
        let abort_signals = start_test_tasks(&killswitch);
        let signal_handle = tokio::spawn(killswitch.killed());

        // NOTE: give tasks time to start executing.
        tokio::time::sleep(Duration::from_millis(200)).await;

        // Nothing was activated yet, so the signal must still be pending.
        assert!(!signal_handle.is_finished());
        killswitch.activate();

        let all_aborted = future::join_all(abort_signals);
        tokio::time::timeout(Duration::from_secs(1), all_aborted)
            .await
            .expect("tasks should be killed within given timeframe");

        tokio::time::timeout(Duration::from_secs(1), signal_handle)
            .await
            .expect("killed() signal should have resolved")
            .expect("signal task should join successfully");
    }
}
335}