Skip to main content

tokio_process_tools/process_handle/termination/
mod.rs

1use super::ProcessHandle;
2use crate::error::{TerminationAction, TerminationError};
3use crate::output_stream::OutputStream;
4use std::io;
5use std::process::ExitStatus;
6use std::time::Duration;
7
8mod diagnostics;
9#[cfg(any(unix, windows))]
10mod shutdown;
11
12pub(in crate::process_handle) use diagnostics::TerminationDiagnostics;
13#[cfg(any(unix, windows))]
14pub use shutdown::{
15    GracefulShutdown, GracefulShutdownBuilder, UnixGracefulPhase, UnixGracefulShutdown,
16    UnixGracefulSignal, WindowsGracefulShutdown,
17};
18
/// Maximum time to wait for process termination after forceful kill.
///
/// This is a safety timeout since forceful kill should terminate processes immediately,
/// but there are rare cases where even forceful kill may not work.
///
/// Applied by `attempt_forceful_kill` to the wait that follows a successfully sent kill signal.
/// When it elapses, a `TimedOut` error is recorded in the diagnostics and termination fails.
/// Gated like the graceful-termination `impl` block, the only user visible in this file.
#[cfg(any(unix, windows))]
const FORCE_KILL_WAIT_TIMEOUT: Duration = Duration::from_secs(3);

/// Grace window granted to Tokio's SIGCHLD reaper after a signal-send failure so a freshly-exited
/// child is observed as exited rather than as still running. Covers the brief race where the OS
/// rejects signals to a not-yet-reaped process group (`EPERM` on macOS, `ESRCH` on Linux).
///
/// Used in place of the regular post-signal wait both when a graceful phase's signal cannot be
/// sent (`attempt_graceful_phase`) and when the forceful kill signal cannot be sent
/// (`attempt_forceful_kill`).
#[cfg(any(unix, windows))]
const REAP_AFTER_SIGNAL_FAILURE_GRACE: Duration = Duration::from_millis(100);
31
/// Human-readable name of the forceful kill mechanism, recorded in diagnostics and log fields.
///
/// Defined for every target because `kill()` is available on every platform Tokio supports:
/// `"SIGKILL"` on Unix, `"TerminateProcess"` on Windows, and a generic `"kill"` elsewhere (the
/// underlying `Child::start_kill()` is what runs on targets where graceful escalation is
/// unavailable).
const KILL_LABEL: &str = if cfg!(unix) {
    "SIGKILL"
} else if cfg!(windows) {
    "TerminateProcess"
} else {
    "kill"
};
41
/// One step of the cross-platform graceful-termination loop. Pre-computed by the per-platform
/// wrapper so the shared loop body has no platform branches.
///
/// The Unix wrapper builds one step per configured phase; the Windows wrapper always builds a
/// single `CTRL_BREAK_EVENT` step.
#[cfg(any(unix, windows))]
#[derive(Debug, Clone, Copy)]
struct TerminationStep {
    /// Name of the signal sent in this step; used only for log fields and diagnostics.
    signal_label: &'static str,
    /// Upper bound on the post-signal wait before the loop escalates to the next step.
    timeout: Duration,
}
50
51// Functionality available on all Tokio-supported platforms.
52impl<Stdout, Stderr> ProcessHandle<Stdout, Stderr>
53where
54    Stdout: OutputStream,
55    Stderr: OutputStream,
56{
57    /// Forces the process to exit. Most users should call [`ProcessHandle::terminate`] instead.
58    ///
59    /// This is equivalent to sending `SIGKILL` on Unix or calling `TerminateProcess` on Windows,
60    /// followed by a wait for the process to be reaped. On other Tokio-supported platforms it
61    /// forwards to [`tokio::process::Child::start_kill`].
62    ///
63    /// Any still-open stdin handle is closed before Tokio performs that kill-and-wait sequence,
64    /// matching [`tokio::process::Child::kill`] semantics.
65    /// A successful call waits for the child to exit and disarms the drop cleanup and panic guards,
66    /// so the handle can be dropped safely afterward.
67    ///
68    /// `kill` is a reasonable next step when [`terminate`](Self::terminate) returns `Err` and the
69    /// caller is not interested in further graceful escalation.
70    ///
71    /// # Errors
72    ///
73    /// Returns [`TerminationError`] if Tokio cannot kill or wait for the child process.
74    pub async fn kill(&mut self) -> Result<(), TerminationError> {
75        self.stdin().close();
76        let mut diagnostics = TerminationDiagnostics::default();
77
78        if let Err(err) = self.send_kill_signal() {
79            diagnostics.record(
80                TerminationAction::SendSignal {
81                    signal_name: KILL_LABEL,
82                },
83                err,
84            );
85            return Err(diagnostics.into_termination_failed(self.name.clone()));
86        }
87
88        if let Err(err) = self.wait_for_completion_unbounded().await {
89            diagnostics.record(TerminationAction::WaitForExit, err);
90            return Err(diagnostics.into_termination_failed(self.name.clone()));
91        }
92
93        Ok(())
94    }
95}
96
97// Graceful-termination methods. Only available on Unix and Windows because they rely on platform
98// signal primitives that have no cross-platform analogue.
99#[cfg(any(unix, windows))]
100impl<Stdout, Stderr> ProcessHandle<Stdout, Stderr>
101where
102    Stdout: OutputStream,
103    Stderr: OutputStream,
104{
105    /// Terminates this process by dispatching the configured graceful-shutdown sequence first,
106    /// then forcefully killing the process if it has not exited after the sequence completes.
107    ///
108    /// The signature is the same on every supported platform; the shape of `shutdown` is
109    /// platform-conditional. See [`GracefulShutdown`] for how to construct one and
110    /// [`UnixGracefulShutdown`] for guidance on choosing the Unix sequence.
111    ///
112    /// - On Unix the configured [`UnixGracefulShutdown`] dispatches one or more graceful signals
113    ///   in order; each phase's `timeout` bounds how long to wait for the child to exit before
114    ///   escalating to the next phase. After the last configured phase, `SIGKILL` runs as the
115    ///   implicit forceful fallback.
116    /// - On Windows this is a 2-phase termination: `CTRL_BREAK_EVENT` -> wait
117    ///   `shutdown.windows.timeout` -> `TerminateProcess`. **Only one `CTRL_BREAK_EVENT`
118    ///   is ever sent.** `GenerateConsoleCtrlEvent` can only target a child's process group with
119    ///   `CTRL_BREAK_EVENT` (sending `CTRL_C_EVENT` would require `dwProcessGroupId = 0` and
120    ///   broadcast to the parent), so a second graceful send would be the same event and cannot
121    ///   do more than the first send already did.
122    ///
123    /// The forceful kill fallback adds one fixed 3-second wait on top of the graceful timeouts.
124    ///
125    /// # Timeouts are upper bounds, not delays
126    ///
127    /// Each per-phase timeout bounds the post-signal wait of its phase. The wait future resolves
128    /// the instant Tokio's `SIGCHLD` reaper observes the child exit, so handler-less children
129    /// (children that have no handler installed for the signal we send) typically die in
130    /// microseconds via the kernel's default disposition (`Term`) and the configured timeout
131    /// never fires for them. The timeout only matters when the child has installed a handler
132    /// that takes time to complete.
133    ///
134    /// # What signal should I send?
135    ///
136    /// See [`UnixGracefulShutdown`] for the recommended single-signal sequences and a discussion
137    /// of why mixing `SIGINT` and `SIGTERM` does not cover children with unknown signal handlers.
138    ///
139    /// # Windows interop note
140    ///
141    /// `tokio::signal::ctrl_c()` on Windows registers only for `CTRL_C_EVENT`; it does not catch
142    /// `CTRL_BREAK_EVENT`. A child Rust binary that listens only on the cross-platform
143    /// `tokio::signal::ctrl_c()` will not respond to this graceful step on Windows and will be
144    /// terminated forcefully after `timeout`. To interoperate, such a child should
145    /// additionally listen on `tokio::signal::windows::ctrl_break()`, or expose another
146    /// shutdown channel (stdin sentinel, IPC, or a command protocol).
147    ///
148    /// # Per-phase timeout semantics
149    ///
150    /// Each per-phase timeout in `shutdown` bounds the post-signal wait of its phase:
151    ///
152    /// - Signal send succeeds: wait up to the user-supplied timeout, then escalate.
153    /// - Signal send fails: replace the user timeout with a fixed 100 ms grace so Tokio's
154    ///   reaper can catch up to a child that just exited (the OS rejects signals to a not-yet-
155    ///   reaped process group with `EPERM` on macOS or `ESRCH` on Linux). Real permission
156    ///   denials still surface as an error after the grace elapses.
157    ///
158    /// `Duration::from_secs(0)` disables the post-signal wait entirely and effectively forces
159    /// the call into the forceful kill (`SIGKILL` on Unix, `TerminateProcess` on Windows).
160    /// Prefer small but non-zero values (e.g. 100 ms to a few seconds).
161    ///
162    /// # Drop guards on `Ok` vs `Err`
163    ///
164    /// On `Ok`, the drop cleanup and panic guards are disarmed and the handle can be dropped
165    /// safely. On `Err` (or if the future is canceled), the guards stay armed: the library cannot
166    /// verify cleanup from the outside, so dropping would leak a process. Recover by retrying
167    /// `terminate`, escalating to [`kill`](Self::kill), calling
168    /// [`must_not_be_terminated`](Self::must_not_be_terminated) to accept the failure, or
169    /// propagating the error and letting the panic-on-drop surface the leak.
170    ///
171    /// # Errors
172    ///
173    /// Returns [`TerminationError`] if signalling or waiting for process termination fails.
174    pub async fn terminate(
175        &mut self,
176        shutdown: GracefulShutdown,
177    ) -> Result<ExitStatus, TerminationError> {
178        #[cfg(unix)]
179        {
180            self.terminate_with_hooks(
181                &shutdown.unix,
182                Self::try_reap_exit_status,
183                |this, signal| match signal {
184                    UnixGracefulSignal::Interrupt => this.group.send_interrupt(),
185                    UnixGracefulSignal::Terminate => this.group.send_terminate(),
186                },
187            )
188            .await
189        }
190        #[cfg(windows)]
191        {
192            self.terminate_with_hooks(&shutdown.windows, Self::try_reap_exit_status, |this| {
193                this.group.send_ctrl_break()
194            })
195            .await
196        }
197    }
198
199    /// Test-only fault-injection seam underneath [`terminate`](Self::terminate). Drives the
200    /// termination state machine with caller-supplied hooks for the preflight exit-status poll and
201    /// the per-phase signal send. Production code should call [`terminate`](Self::terminate)
202    /// instead, which wires `try_reap_exit_status` to the real `Child::try_wait` path and
203    /// `send_signal` to the platform-appropriate process-group signaler.
204    ///
205    /// `try_reap_exit_status` is consulted once before the first signal send so an already-exited
206    /// child is observed without sending. `send_signal` is called once per phase of `sequence`.
207    ///
208    /// Drop-guard semantics, escalation to the forceful kill fallback, and the post-success disarm,
209    /// all described through the public `terminate`, are handled here.
210    ///
211    /// # Errors
212    ///
213    /// Returns [`TerminationError`] if signaling or waiting for process termination fails.
214    #[doc(hidden)]
215    #[cfg(unix)]
216    pub async fn terminate_with_hooks<ExitStatusReaper, SignalSender>(
217        &mut self,
218        sequence: &UnixGracefulShutdown,
219        try_reap_exit_status: ExitStatusReaper,
220        mut send_signal: SignalSender,
221    ) -> Result<ExitStatus, TerminationError>
222    where
223        ExitStatusReaper: FnMut(&mut Self) -> Result<Option<ExitStatus>, io::Error>,
224        SignalSender: FnMut(&mut Self, UnixGracefulSignal) -> Result<(), io::Error>,
225    {
226        let phases = sequence.phases();
227        let steps: Vec<TerminationStep> = phases
228            .iter()
229            .map(|phase| TerminationStep {
230                signal_label: phase.signal.label(),
231                timeout: phase.timeout,
232            })
233            .collect();
234
235        self.run_termination_loop(&steps, try_reap_exit_status, |this, index, _step| {
236            send_signal(this, phases[index].signal)
237        })
238        .await
239    }
240
241    /// Test-only fault-injection seam underneath [`terminate`](Self::terminate). Drives the
242    /// termination state machine with caller-supplied hooks for the preflight exit-status poll
243    /// and the signal send. Production code should call [`terminate`](Self::terminate)
244    /// instead, which wires `try_reap_exit_status` to the real `Child::try_wait` path and
245    /// `send_signal` to the platform-appropriate process-group signaller.
246    ///
247    /// `try_reap_exit_status` is consulted once before the signal send so an already-exited
248    /// child is observed without sending. `send_signal` is called once (Windows always sends a
249    /// single `CTRL_BREAK_EVENT`). Drop-guard semantics, escalation to the forceful kill
250    /// fallback, and the post-success disarm all behave identically to `terminate`.
251    ///
252    /// # Errors
253    ///
254    /// Returns [`TerminationError`] if signalling or waiting for process termination fails.
255    #[doc(hidden)]
256    #[cfg(windows)]
257    pub async fn terminate_with_hooks<PreflightReaper, SignalSender>(
258        &mut self,
259        sequence: &WindowsGracefulShutdown,
260        try_reap_exit_status: PreflightReaper,
261        mut send_signal: SignalSender,
262    ) -> Result<ExitStatus, TerminationError>
263    where
264        PreflightReaper: FnMut(&mut Self) -> Result<Option<ExitStatus>, io::Error>,
265        SignalSender: FnMut(&mut Self) -> Result<(), io::Error>,
266    {
267        let steps = [TerminationStep {
268            signal_label: "CTRL_BREAK_EVENT",
269            timeout: sequence.timeout,
270        }];
271
272        self.run_termination_loop(&steps, try_reap_exit_status, |this, _index, _step| {
273            send_signal(this)
274        })
275        .await
276    }
277
278    /// Cross-platform termination driver. Iterates `steps` in order, sending each phase's
279    /// signal via `send_signal` and waiting up to its `timeout` for the child to exit before
280    /// escalating. Falls back to the implicit forceful kill (`SIGKILL` on Unix,
281    /// `TerminateProcess` on Windows) after the last step. `try_reap_exit_status` is consulted
282    /// once before the first signal send so an already-exited child is observed without sending.
283    ///
284    /// `steps` must be non-empty. Per-platform wrappers enforce this: `UnixGracefulShutdown`
285    /// rejects empty input at construction, and the Windows path always builds a single-element
286    /// slice.
287    #[cfg(any(unix, windows))]
288    async fn run_termination_loop<PreflightReaper, SignalSender>(
289        &mut self,
290        steps: &[TerminationStep],
291        mut try_reap_exit_status: PreflightReaper,
292        mut send_signal: SignalSender,
293    ) -> Result<ExitStatus, TerminationError>
294    where
295        PreflightReaper: FnMut(&mut Self) -> Result<Option<ExitStatus>, io::Error>,
296        SignalSender: FnMut(&mut Self, usize, &TerminationStep) -> Result<(), io::Error>,
297    {
298        debug_assert!(
299            !steps.is_empty(),
300            "run_termination_loop requires at least one graceful step",
301        );
302
303        let result = 'termination: {
304            let mut diagnostics = TerminationDiagnostics::default();
305            let first_phase_label = steps.first().map_or(KILL_LABEL, |step| step.signal_label);
306
307            match try_reap_exit_status(self) {
308                Ok(Some(exit_status)) => {
309                    break 'termination Ok(exit_status);
310                }
311                Ok(None) => {}
312                Err(err) => {
313                    tracing::warn!(
314                        process = %self.name,
315                        signal = first_phase_label,
316                        error = %err,
317                        "Could not determine process state before termination. Attempting first graceful phase."
318                    );
319                    diagnostics.record(TerminationAction::CheckStatus, err);
320                }
321            }
322
323            for (index, step) in steps.iter().enumerate() {
324                let next_label = steps
325                    .get(index + 1)
326                    .map_or(KILL_LABEL, |next| next.signal_label);
327                let send = &mut send_signal;
328
329                let outcome = self
330                    .attempt_graceful_phase(
331                        step.signal_label,
332                        next_label,
333                        step.timeout,
334                        &mut diagnostics,
335                        &mut |this: &mut Self| send(this, index, step),
336                    )
337                    .await;
338
339                if let Some(exit_status) = outcome {
340                    break 'termination Ok(exit_status);
341                }
342            }
343
344            self.attempt_forceful_kill(diagnostics).await
345        };
346
347        self.disarm_after_successful_termination(result)
348    }
349
350    /// Test-only helper for verifying that an `Err` termination result leaves the drop guards
351    /// armed. Disarms only on `Ok`; production code uses [`terminate`](Self::terminate) which
352    /// applies this internally. Exposed so integration tests can drive the disarm-on-success
353    /// contract with synthetic results without going through a real signal-injection sequence.
354    #[doc(hidden)]
355    pub fn disarm_after_successful_termination<T>(
356        &mut self,
357        result: Result<T, TerminationError>,
358    ) -> Result<T, TerminationError> {
359        if result.is_ok() {
360            self.must_not_be_terminated();
361        }
362        result
363    }
364
365    /// Send the graceful signal for one phase and wait up to `timeout` for the child to exit.
366    ///
367    /// Returns `Some(exit_status)` if the child exits during the phase. Returns `None` to escalate
368    /// to the next phase, recording a diagnostic for whatever went wrong (signal-send failure,
369    /// post-signal wait timeout, or wait error). When the signal send itself fails, also probes
370    /// briefly with [`REAP_AFTER_SIGNAL_FAILURE_GRACE`] so a freshly-exited child is observed as
371    /// exited rather than as still running.
372    async fn attempt_graceful_phase<SignalSender>(
373        &mut self,
374        signal_name: &'static str,
375        next_signal_name: &'static str,
376        timeout: Duration,
377        diagnostics: &mut TerminationDiagnostics,
378        send_signal: &mut SignalSender,
379    ) -> Option<ExitStatus>
380    where
381        SignalSender: FnMut(&mut Self) -> Result<(), io::Error>,
382    {
383        match send_signal(self) {
384            Ok(()) => match self.wait_for_exit_after_signal(timeout).await {
385                Ok(Some(exit_status)) => Some(exit_status),
386                Ok(None) => {
387                    let not_terminated = wait_timeout_error(timeout);
388                    tracing::warn!(
389                        process = %self.name,
390                        signal = signal_name,
391                        next_signal = next_signal_name,
392                        error = %not_terminated,
393                        "Graceful shutdown signal timed out. Attempting next shutdown phase."
394                    );
395                    diagnostics.record(TerminationAction::WaitForExit, not_terminated);
396                    None
397                }
398                Err(wait_error) => {
399                    tracing::warn!(
400                        process = %self.name,
401                        signal = signal_name,
402                        next_signal = next_signal_name,
403                        error = %wait_error,
404                        "Wait for graceful shutdown failed. Attempting next shutdown phase."
405                    );
406                    diagnostics.record(TerminationAction::WaitForExit, wait_error);
407                    None
408                }
409            },
410            Err(send_error) => {
411                tracing::warn!(
412                    process = %self.name,
413                    signal = signal_name,
414                    next_signal = next_signal_name,
415                    error = %send_error,
416                    "Graceful shutdown signal could not be sent. Attempting next shutdown phase."
417                );
418                diagnostics.record(TerminationAction::SendSignal { signal_name }, send_error);
419
420                match self
421                    .wait_for_exit_after_signal(REAP_AFTER_SIGNAL_FAILURE_GRACE)
422                    .await
423                {
424                    Ok(Some(exit_status)) => Some(exit_status),
425                    Ok(None) => None,
426                    Err(reap_error) => {
427                        tracing::warn!(
428                            process = %self.name,
429                            signal = signal_name,
430                            error = %reap_error,
431                            "Could not determine process state after graceful signal send failed."
432                        );
433                        diagnostics.record(TerminationAction::CheckStatus, reap_error);
434                        None
435                    }
436                }
437            }
438        }
439    }
440
441    async fn attempt_forceful_kill(
442        &mut self,
443        mut diagnostics: TerminationDiagnostics,
444    ) -> Result<ExitStatus, TerminationError> {
445        match self.send_kill_signal() {
446            Ok(()) => {
447                // Note: A forceful kill should typically (somewhat) immediately lead to
448                // termination of the process. But there are cases in which even a forceful kill
449                // does not / cannot / will not kill a process. We do not want to wait indefinitely
450                // in case this happens and therefore wait (at max) for a fixed duration after any
451                // kill.
452                match self
453                    .wait_for_exit_after_signal(FORCE_KILL_WAIT_TIMEOUT)
454                    .await
455                {
456                    Ok(Some(exit_status)) => Ok(exit_status),
457                    Ok(None) => {
458                        let not_terminated_after_kill = wait_timeout_error(FORCE_KILL_WAIT_TIMEOUT);
459                        // Unlikely. See the note above.
460                        tracing::error!(
461                            process = %self.name,
462                            kill_signal = KILL_LABEL,
463                            "Process did not terminate after all termination attempts. Process may still be running. Manual intervention and investigation required!"
464                        );
465                        diagnostics
466                            .record(TerminationAction::WaitForExit, not_terminated_after_kill);
467                        Err(diagnostics.into_termination_failed(self.name.clone()))
468                    }
469                    Err(not_terminated_after_kill) => {
470                        // Unlikely. See the note above.
471                        tracing::error!(
472                            process = %self.name,
473                            kill_signal = KILL_LABEL,
474                            "Process did not terminate after all termination attempts. Process may still be running. Manual intervention and investigation required!"
475                        );
476                        diagnostics
477                            .record(TerminationAction::WaitForExit, not_terminated_after_kill);
478                        Err(diagnostics.into_termination_failed(self.name.clone()))
479                    }
480                }
481            }
482            Err(kill_error) => {
483                tracing::error!(
484                    process = %self.name,
485                    error = %kill_error,
486                    signal = KILL_LABEL,
487                    "Forceful shutdown failed. Process may still be running. Manual intervention required!"
488                );
489                diagnostics.record(
490                    TerminationAction::SendSignal {
491                        signal_name: KILL_LABEL,
492                    },
493                    kill_error,
494                );
495
496                // Brief grace for Tokio's SIGCHLD reaper to catch up - see
497                // `REAP_AFTER_SIGNAL_FAILURE_GRACE`.
498                match self
499                    .wait_for_exit_after_signal(REAP_AFTER_SIGNAL_FAILURE_GRACE)
500                    .await
501                {
502                    Ok(Some(exit_status)) => {
503                        return Ok(exit_status);
504                    }
505                    Ok(None) => {}
506                    Err(reap_error) => {
507                        tracing::warn!(
508                            process = %self.name,
509                            signal = KILL_LABEL,
510                            error = %reap_error,
511                            "Could not determine process state after forceful shutdown failed."
512                        );
513                        diagnostics.record(TerminationAction::CheckStatus, reap_error);
514                    }
515                }
516
517                Err(diagnostics.into_termination_failed(self.name.clone()))
518            }
519        }
520    }
521}
522
/// Builds the `TimedOut` I/O error recorded when a post-signal wait elapses without the process
/// exiting. `timeout` is the bound that was exceeded and is embedded in the message using its
/// `Debug` formatting (e.g. `3s`, `100ms`).
fn wait_timeout_error(timeout: Duration) -> io::Error {
    let message = format!("process did not complete within {timeout:?}");
    io::Error::new(io::ErrorKind::TimedOut, message)
}