triton_distributed/
worker.rs

1// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
//! The [Worker] class is a convenience wrapper around the construction of the [Runtime]
//! and execution of the user's application.
18//!
19//! In the future, the [Worker] should probably be moved to a procedural macro similar
20//! to the `#[tokio::main]` attribute, where we might annotate an async main function with
21//! #[triton::main] or similar.
22//!
//! The [Worker::execute] method is designed to be called once from main and will block
//! the calling thread until the application completes or is canceled. The method initializes
//! the signal handler used to trap `SIGINT` and `SIGTERM` signals and trigger a graceful shutdown.
26//!
//! On termination, the user application is given a graceful shutdown period controlled by
//! the [TRITON_WORKER_GRACEFUL_SHUTDOWN_TIMEOUT] environment variable. If the application does not
//! shut down in time, the worker will terminate the application with an exit code of 911.
30//!
31//! The default values of `TRITON_WORKER_GRACEFUL_SHUTDOWN_TIMEOUT` differ between the development
32//! and release builds. In development, the default is [DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_DEBUG] and
33//! in release, the default is [DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_RELEASE].
34
35use super::{error, log, CancellationToken, Result, Runtime, RuntimeConfig};
36
37use futures::Future;
38use once_cell::sync::OnceCell;
39use std::{sync::Mutex, time::Duration};
40use tokio::{signal, task::JoinHandle};
41
42static RT: OnceCell<tokio::runtime::Runtime> = OnceCell::new();
43static INIT: OnceCell<Mutex<Option<tokio::task::JoinHandle<Result<()>>>>> = OnceCell::new();
44
45const SHUTDOWN_MESSAGE: &str =
46    "Application received shutdown signal; attempting to gracefully shutdown";
47const SHUTDOWN_TIMEOUT_MESSAGE: &str =
48    "Use TRITON_WORKER_GRACEFUL_SHUTDOWN_TIMEOUT to control the graceful shutdown timeout";
49
50/// Environment variable to control the graceful shutdown timeout
51pub const TRITON_WORKER_GRACEFUL_SHUTDOWN_TIMEOUT: &str = "TRITON_WORKER_GRACEFUL_SHUTDOWN_TIMEOUT";
52
53/// Default graceful shutdown timeout in seconds in debug mode
54pub const DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_DEBUG: u64 = 5;
55
56/// Default graceful shutdown timeout in seconds in release mode
57pub const DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_RELEASE: u64 = 30;
58
59pub struct Worker {
60    runtime: Runtime,
61}
62
63impl Worker {
64    /// Create a new [`Worker`] instance from [`RuntimeConfig`] settings which is sourced from the environment
65    pub fn from_settings() -> Result<Worker> {
66        let config = RuntimeConfig::from_settings()?;
67        Worker::from_config(config)
68    }
69
70    /// Create a new [`Worker`] instance from a provided [`RuntimeConfig`]
71    pub fn from_config(config: RuntimeConfig) -> Result<Worker> {
72        // if the runtime is already initialized, return an error
73        if RT.get().is_some() {
74            return Err(error!("Worker already initialized"));
75        }
76
77        // create a new runtime and insert it into the OnceCell
78        // there is still a potential race-condition here, two threads cou have passed the first check
79        // but only one will succeed in inserting the runtime
80        let rt = RT.try_insert(config.create_runtime()?).map_err(|_| {
81            error!("Failed to create worker; Only a single Worker should ever be created")
82        })?;
83
84        let runtime = Runtime::from_handle(rt.handle().clone())?;
85        Ok(Worker { runtime })
86    }
87
88    pub fn tokio_runtime(&self) -> Result<&'static tokio::runtime::Runtime> {
89        RT.get().ok_or_else(|| error!("Worker not initialized"))
90    }
91
92    pub fn runtime(&self) -> &Runtime {
93        &self.runtime
94    }
95
96    /// Executes the provided application/closure on the [`Runtime`].
97    /// This is designed to be called once from main and will block the calling thread until the application completes.
98    pub fn execute<F, Fut>(self, f: F) -> Result<()>
99    where
100        F: FnOnce(Runtime) -> Fut + Send + 'static,
101        Fut: Future<Output = Result<()>> + Send + 'static,
102    {
103        let runtime = self.runtime;
104        let primary = runtime.primary();
105        let secondary = runtime.secondary.clone();
106
107        let timeout = std::env::var(TRITON_WORKER_GRACEFUL_SHUTDOWN_TIMEOUT)
108            .ok()
109            .and_then(|s| s.parse::<u64>().ok())
110            .unwrap_or({
111                if cfg!(debug_assertions) {
112                    DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_DEBUG
113                } else {
114                    DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_RELEASE
115                }
116            });
117
118        INIT.set(Mutex::new(Some(secondary.spawn(async move {
119            // start signal handler
120            tokio::spawn(signal_handler(runtime.cancellation_token.clone()));
121
122            let cancel_token = runtime.child_token();
123            let (mut app_tx, app_rx) = tokio::sync::oneshot::channel::<()>();
124
125            // spawn a task to run the application
126            let task: JoinHandle<Result<()>> = primary.spawn(async move {
127                let _rx = app_rx;
128                f(runtime).await
129            });
130
131            tokio::select! {
132                _ = cancel_token.cancelled() => {
133                    eprintln!("{}", SHUTDOWN_MESSAGE);
134                    eprintln!("{} {} seconds", SHUTDOWN_TIMEOUT_MESSAGE, timeout);
135                }
136
137                _ = app_tx.closed() => {
138                }
139            };
140
141            let result = tokio::select! {
142                result = task => {
143                    result
144                }
145
146                _ = tokio::time::sleep(tokio::time::Duration::from_secs(timeout)) => {
147                    eprintln!("Application did not shutdown in time; terminating");
148                    std::process::exit(911);
149                }
150            }?;
151
152            match &result {
153                Ok(_) => {
154                    log::info!("Application shutdown successfully");
155                }
156                Err(e) => {
157                    log::error!("Application shutdown with error: {:?}", e);
158                }
159            }
160
161            result
162        }))))
163        .map_err(|e| error!("Failed to spawn application task: {:?}", e))?;
164
165        let task = INIT
166            .get()
167            .expect("Application task not initialized")
168            .lock()
169            .unwrap()
170            .take()
171            .expect("Application initialized; but another thread is awaiting it; Worker.execute() can only be called once");
172
173        secondary.block_on(task)?
174    }
175}
176
177/// Catch signals and trigger a shutdown
178async fn signal_handler(cancel_token: CancellationToken) -> Result<()> {
179    let ctrl_c = async {
180        signal::ctrl_c().await?;
181        anyhow::Ok(())
182    };
183
184    let sigterm = async {
185        signal::unix::signal(signal::unix::SignalKind::terminate())?
186            .recv()
187            .await;
188        anyhow::Ok(())
189    };
190
191    tokio::select! {
192        _ = ctrl_c => {
193            tracing::info!("Ctrl+C received, starting graceful shutdown");
194        },
195        _ = sigterm => {
196            tracing::info!("SIGTERM received, starting graceful shutdown");
197        },
198        _ = cancel_token.cancelled() => {
199            tracing::info!("CancellationToken triggered; shutting down");
200        },
201    }
202
203    // trigger a shutdown
204    cancel_token.cancel();
205
206    Ok(())
207}