tokio_process_tools/process_handle/termination/mod.rs
1use super::ProcessHandle;
2use crate::error::{TerminationAction, TerminationError};
3use crate::output_stream::OutputStream;
4use std::io;
5use std::process::ExitStatus;
6use std::time::Duration;
7
8mod diagnostics;
9#[cfg(any(unix, windows))]
10mod shutdown;
11
12pub(in crate::process_handle) use diagnostics::TerminationDiagnostics;
13#[cfg(any(unix, windows))]
14pub use shutdown::{
15 GracefulShutdown, GracefulShutdownBuilder, UnixGracefulPhase, UnixGracefulShutdown,
16 UnixGracefulSignal, WindowsGracefulShutdown,
17};
18
/// Upper bound on how long to wait for the child to exit once the forceful kill has been sent.
///
/// A forceful kill normally ends the process right away, so this only acts as a safety net for
/// the rare cases in which even a forceful kill does not work; without it the wait could block
/// indefinitely.
#[cfg(any(unix, windows))]
const FORCE_KILL_WAIT_TIMEOUT: Duration = Duration::from_secs(3);
25
/// How long to keep waiting for an exit after a signal could not be sent.
///
/// A child that has just exited but has not been reaped yet makes the OS reject signals aimed at
/// its process group (`EPERM` on macOS, `ESRCH` on Linux). Granting Tokio's SIGCHLD reaper this
/// short window lets such a child be observed as exited rather than as still running.
#[cfg(any(unix, windows))]
const REAP_AFTER_SIGNAL_FAILURE_GRACE: Duration = Duration::from_millis(100);
31
/// Name under which the forceful kill phase is recorded in diagnostics and log events.
///
/// Defined for every target, not only Unix and Windows, because `kill()` is available on every
/// platform Tokio supports. On targets where graceful escalation is unavailable the underlying
/// `Child::start_kill()` is what runs, and it is reported with the generic `"kill"` label.
const KILL_LABEL: &str = if cfg!(unix) {
    "SIGKILL"
} else if cfg!(windows) {
    "TerminateProcess"
} else {
    "kill"
};
41
/// A single graceful phase as seen by the shared termination loop.
///
/// The per-platform wrappers compute these up front so that the loop body itself contains no
/// platform-specific branches.
#[cfg(any(unix, windows))]
#[derive(Clone, Copy, Debug)]
struct TerminationStep {
    /// Name of the signal sent in this phase, as used in log events and diagnostics.
    signal_label: &'static str,
    /// Upper bound on the wait for the child to exit after the signal has been sent.
    timeout: Duration,
}
50
51// Functionality available on all Tokio-supported platforms.
52impl<Stdout, Stderr> ProcessHandle<Stdout, Stderr>
53where
54 Stdout: OutputStream,
55 Stderr: OutputStream,
56{
57 /// Forces the process to exit. Most users should call [`ProcessHandle::terminate`] instead.
58 ///
59 /// This is equivalent to sending `SIGKILL` on Unix or calling `TerminateProcess` on Windows,
60 /// followed by a wait for the process to be reaped. On other Tokio-supported platforms it
61 /// forwards to [`tokio::process::Child::start_kill`].
62 ///
63 /// Any still-open stdin handle is closed before Tokio performs that kill-and-wait sequence,
64 /// matching [`tokio::process::Child::kill`] semantics.
65 /// A successful call waits for the child to exit and disarms the drop cleanup and panic guards,
66 /// so the handle can be dropped safely afterward.
67 ///
68 /// `kill` is a reasonable next step when [`terminate`](Self::terminate) returns `Err` and the
69 /// caller is not interested in further graceful escalation.
70 ///
71 /// # Errors
72 ///
73 /// Returns [`TerminationError`] if Tokio cannot kill or wait for the child process.
74 pub async fn kill(&mut self) -> Result<(), TerminationError> {
75 self.stdin().close();
76 let mut diagnostics = TerminationDiagnostics::default();
77
78 if let Err(err) = self.send_kill_signal() {
79 diagnostics.record(
80 TerminationAction::SendSignal {
81 signal_name: KILL_LABEL,
82 },
83 err,
84 );
85 return Err(diagnostics.into_termination_failed(self.name.clone()));
86 }
87
88 if let Err(err) = self.wait_for_completion_unbounded().await {
89 diagnostics.record(TerminationAction::WaitForExit, err);
90 return Err(diagnostics.into_termination_failed(self.name.clone()));
91 }
92
93 Ok(())
94 }
95}
96
97// Graceful-termination methods. Only available on Unix and Windows because they rely on platform
98// signal primitives that have no cross-platform analogue.
99#[cfg(any(unix, windows))]
100impl<Stdout, Stderr> ProcessHandle<Stdout, Stderr>
101where
102 Stdout: OutputStream,
103 Stderr: OutputStream,
104{
105 /// Terminates this process by dispatching the configured graceful-shutdown sequence first,
106 /// then forcefully killing the process if it has not exited after the sequence completes.
107 ///
108 /// The signature is the same on every supported platform; the shape of `shutdown` is
109 /// platform-conditional. See [`GracefulShutdown`] for how to construct one and
110 /// [`UnixGracefulShutdown`] for guidance on choosing the Unix sequence.
111 ///
112 /// - On Unix the configured [`UnixGracefulShutdown`] dispatches one or more graceful signals
113 /// in order; each phase's `timeout` bounds how long to wait for the child to exit before
114 /// escalating to the next phase. After the last configured phase, `SIGKILL` runs as the
115 /// implicit forceful fallback.
116 /// - On Windows this is a 2-phase termination: `CTRL_BREAK_EVENT` -> wait
117 /// `shutdown.windows.timeout` -> `TerminateProcess`. **Only one `CTRL_BREAK_EVENT`
118 /// is ever sent.** `GenerateConsoleCtrlEvent` can only target a child's process group with
119 /// `CTRL_BREAK_EVENT` (sending `CTRL_C_EVENT` would require `dwProcessGroupId = 0` and
120 /// broadcast to the parent), so a second graceful send would be the same event and cannot
121 /// do more than the first send already did.
122 ///
123 /// The forceful kill fallback adds one fixed 3-second wait on top of the graceful timeouts.
124 ///
125 /// # Timeouts are upper bounds, not delays
126 ///
127 /// Each per-phase timeout bounds the post-signal wait of its phase. The wait future resolves
128 /// the instant Tokio's `SIGCHLD` reaper observes the child exit, so handler-less children
129 /// (children that have no handler installed for the signal we send) typically die in
130 /// microseconds via the kernel's default disposition (`Term`) and the configured timeout
131 /// never fires for them. The timeout only matters when the child has installed a handler
132 /// that takes time to complete.
133 ///
134 /// # What signal should I send?
135 ///
136 /// See [`UnixGracefulShutdown`] for the recommended single-signal sequences and a discussion
137 /// of why mixing `SIGINT` and `SIGTERM` does not cover children with unknown signal handlers.
138 ///
139 /// # Windows interop note
140 ///
141 /// `tokio::signal::ctrl_c()` on Windows registers only for `CTRL_C_EVENT`; it does not catch
142 /// `CTRL_BREAK_EVENT`. A child Rust binary that listens only on the cross-platform
143 /// `tokio::signal::ctrl_c()` will not respond to this graceful step on Windows and will be
144 /// terminated forcefully after `timeout`. To interoperate, such a child should
145 /// additionally listen on `tokio::signal::windows::ctrl_break()`, or expose another
146 /// shutdown channel (stdin sentinel, IPC, or a command protocol).
147 ///
148 /// # Per-phase timeout semantics
149 ///
150 /// Each per-phase timeout in `shutdown` bounds the post-signal wait of its phase:
151 ///
152 /// - Signal send succeeds: wait up to the user-supplied timeout, then escalate.
153 /// - Signal send fails: replace the user timeout with a fixed 100 ms grace so Tokio's
154 /// reaper can catch up to a child that just exited (the OS rejects signals to a not-yet-
155 /// reaped process group with `EPERM` on macOS or `ESRCH` on Linux). Real permission
156 /// denials still surface as an error after the grace elapses.
157 ///
158 /// `Duration::from_secs(0)` disables the post-signal wait entirely and effectively forces
159 /// the call into the forceful kill (`SIGKILL` on Unix, `TerminateProcess` on Windows).
160 /// Prefer small but non-zero values (e.g. 100 ms to a few seconds).
161 ///
162 /// # Drop guards on `Ok` vs `Err`
163 ///
164 /// On `Ok`, the drop cleanup and panic guards are disarmed and the handle can be dropped
165 /// safely. On `Err` (or if the future is canceled), the guards stay armed: the library cannot
166 /// verify cleanup from the outside, so dropping would leak a process. Recover by retrying
167 /// `terminate`, escalating to [`kill`](Self::kill), calling
168 /// [`must_not_be_terminated`](Self::must_not_be_terminated) to accept the failure, or
169 /// propagating the error and letting the panic-on-drop surface the leak.
170 ///
171 /// # Errors
172 ///
173 /// Returns [`TerminationError`] if signalling or waiting for process termination fails.
174 pub async fn terminate(
175 &mut self,
176 shutdown: GracefulShutdown,
177 ) -> Result<ExitStatus, TerminationError> {
178 #[cfg(unix)]
179 {
180 self.terminate_with_hooks(
181 &shutdown.unix,
182 Self::try_reap_exit_status,
183 |this, signal| match signal {
184 UnixGracefulSignal::Interrupt => this.group.send_interrupt(),
185 UnixGracefulSignal::Terminate => this.group.send_terminate(),
186 },
187 )
188 .await
189 }
190 #[cfg(windows)]
191 {
192 self.terminate_with_hooks(&shutdown.windows, Self::try_reap_exit_status, |this| {
193 this.group.send_ctrl_break()
194 })
195 .await
196 }
197 }
198
199 /// Test-only fault-injection seam underneath [`terminate`](Self::terminate). Drives the
200 /// termination state machine with caller-supplied hooks for the preflight exit-status poll and
201 /// the per-phase signal send. Production code should call [`terminate`](Self::terminate)
202 /// instead, which wires `try_reap_exit_status` to the real `Child::try_wait` path and
203 /// `send_signal` to the platform-appropriate process-group signaler.
204 ///
205 /// `try_reap_exit_status` is consulted once before the first signal send so an already-exited
206 /// child is observed without sending. `send_signal` is called once per phase of `sequence`.
207 ///
208 /// Drop-guard semantics, escalation to the forceful kill fallback, and the post-success disarm,
209 /// all described through the public `terminate`, are handled here.
210 ///
211 /// # Errors
212 ///
213 /// Returns [`TerminationError`] if signaling or waiting for process termination fails.
214 #[doc(hidden)]
215 #[cfg(unix)]
216 pub async fn terminate_with_hooks<ExitStatusReaper, SignalSender>(
217 &mut self,
218 sequence: &UnixGracefulShutdown,
219 try_reap_exit_status: ExitStatusReaper,
220 mut send_signal: SignalSender,
221 ) -> Result<ExitStatus, TerminationError>
222 where
223 ExitStatusReaper: FnMut(&mut Self) -> Result<Option<ExitStatus>, io::Error>,
224 SignalSender: FnMut(&mut Self, UnixGracefulSignal) -> Result<(), io::Error>,
225 {
226 let phases = sequence.phases();
227 let steps: Vec<TerminationStep> = phases
228 .iter()
229 .map(|phase| TerminationStep {
230 signal_label: phase.signal.label(),
231 timeout: phase.timeout,
232 })
233 .collect();
234
235 self.run_termination_loop(&steps, try_reap_exit_status, |this, index, _step| {
236 send_signal(this, phases[index].signal)
237 })
238 .await
239 }
240
241 /// Test-only fault-injection seam underneath [`terminate`](Self::terminate). Drives the
242 /// termination state machine with caller-supplied hooks for the preflight exit-status poll
243 /// and the signal send. Production code should call [`terminate`](Self::terminate)
244 /// instead, which wires `try_reap_exit_status` to the real `Child::try_wait` path and
245 /// `send_signal` to the platform-appropriate process-group signaller.
246 ///
247 /// `try_reap_exit_status` is consulted once before the signal send so an already-exited
248 /// child is observed without sending. `send_signal` is called once (Windows always sends a
249 /// single `CTRL_BREAK_EVENT`). Drop-guard semantics, escalation to the forceful kill
250 /// fallback, and the post-success disarm all behave identically to `terminate`.
251 ///
252 /// # Errors
253 ///
254 /// Returns [`TerminationError`] if signalling or waiting for process termination fails.
255 #[doc(hidden)]
256 #[cfg(windows)]
257 pub async fn terminate_with_hooks<PreflightReaper, SignalSender>(
258 &mut self,
259 sequence: &WindowsGracefulShutdown,
260 try_reap_exit_status: PreflightReaper,
261 mut send_signal: SignalSender,
262 ) -> Result<ExitStatus, TerminationError>
263 where
264 PreflightReaper: FnMut(&mut Self) -> Result<Option<ExitStatus>, io::Error>,
265 SignalSender: FnMut(&mut Self) -> Result<(), io::Error>,
266 {
267 let steps = [TerminationStep {
268 signal_label: "CTRL_BREAK_EVENT",
269 timeout: sequence.timeout,
270 }];
271
272 self.run_termination_loop(&steps, try_reap_exit_status, |this, _index, _step| {
273 send_signal(this)
274 })
275 .await
276 }
277
278 /// Cross-platform termination driver. Iterates `steps` in order, sending each phase's
279 /// signal via `send_signal` and waiting up to its `timeout` for the child to exit before
280 /// escalating. Falls back to the implicit forceful kill (`SIGKILL` on Unix,
281 /// `TerminateProcess` on Windows) after the last step. `try_reap_exit_status` is consulted
282 /// once before the first signal send so an already-exited child is observed without sending.
283 ///
284 /// `steps` must be non-empty. Per-platform wrappers enforce this: `UnixGracefulShutdown`
285 /// rejects empty input at construction, and the Windows path always builds a single-element
286 /// slice.
287 #[cfg(any(unix, windows))]
288 async fn run_termination_loop<PreflightReaper, SignalSender>(
289 &mut self,
290 steps: &[TerminationStep],
291 mut try_reap_exit_status: PreflightReaper,
292 mut send_signal: SignalSender,
293 ) -> Result<ExitStatus, TerminationError>
294 where
295 PreflightReaper: FnMut(&mut Self) -> Result<Option<ExitStatus>, io::Error>,
296 SignalSender: FnMut(&mut Self, usize, &TerminationStep) -> Result<(), io::Error>,
297 {
298 debug_assert!(
299 !steps.is_empty(),
300 "run_termination_loop requires at least one graceful step",
301 );
302
303 let result = 'termination: {
304 let mut diagnostics = TerminationDiagnostics::default();
305 let first_phase_label = steps.first().map_or(KILL_LABEL, |step| step.signal_label);
306
307 match try_reap_exit_status(self) {
308 Ok(Some(exit_status)) => {
309 break 'termination Ok(exit_status);
310 }
311 Ok(None) => {}
312 Err(err) => {
313 tracing::warn!(
314 process = %self.name,
315 signal = first_phase_label,
316 error = %err,
317 "Could not determine process state before termination. Attempting first graceful phase."
318 );
319 diagnostics.record(TerminationAction::CheckStatus, err);
320 }
321 }
322
323 for (index, step) in steps.iter().enumerate() {
324 let next_label = steps
325 .get(index + 1)
326 .map_or(KILL_LABEL, |next| next.signal_label);
327 let send = &mut send_signal;
328
329 let outcome = self
330 .attempt_graceful_phase(
331 step.signal_label,
332 next_label,
333 step.timeout,
334 &mut diagnostics,
335 &mut |this: &mut Self| send(this, index, step),
336 )
337 .await;
338
339 if let Some(exit_status) = outcome {
340 break 'termination Ok(exit_status);
341 }
342 }
343
344 self.attempt_forceful_kill(diagnostics).await
345 };
346
347 self.disarm_after_successful_termination(result)
348 }
349
350 /// Test-only helper for verifying that an `Err` termination result leaves the drop guards
351 /// armed. Disarms only on `Ok`; production code uses [`terminate`](Self::terminate) which
352 /// applies this internally. Exposed so integration tests can drive the disarm-on-success
353 /// contract with synthetic results without going through a real signal-injection sequence.
354 #[doc(hidden)]
355 pub fn disarm_after_successful_termination<T>(
356 &mut self,
357 result: Result<T, TerminationError>,
358 ) -> Result<T, TerminationError> {
359 if result.is_ok() {
360 self.must_not_be_terminated();
361 }
362 result
363 }
364
365 /// Send the graceful signal for one phase and wait up to `timeout` for the child to exit.
366 ///
367 /// Returns `Some(exit_status)` if the child exits during the phase. Returns `None` to escalate
368 /// to the next phase, recording a diagnostic for whatever went wrong (signal-send failure,
369 /// post-signal wait timeout, or wait error). When the signal send itself fails, also probes
370 /// briefly with [`REAP_AFTER_SIGNAL_FAILURE_GRACE`] so a freshly-exited child is observed as
371 /// exited rather than as still running.
372 async fn attempt_graceful_phase<SignalSender>(
373 &mut self,
374 signal_name: &'static str,
375 next_signal_name: &'static str,
376 timeout: Duration,
377 diagnostics: &mut TerminationDiagnostics,
378 send_signal: &mut SignalSender,
379 ) -> Option<ExitStatus>
380 where
381 SignalSender: FnMut(&mut Self) -> Result<(), io::Error>,
382 {
383 match send_signal(self) {
384 Ok(()) => match self.wait_for_exit_after_signal(timeout).await {
385 Ok(Some(exit_status)) => Some(exit_status),
386 Ok(None) => {
387 let not_terminated = wait_timeout_error(timeout);
388 tracing::warn!(
389 process = %self.name,
390 signal = signal_name,
391 next_signal = next_signal_name,
392 error = %not_terminated,
393 "Graceful shutdown signal timed out. Attempting next shutdown phase."
394 );
395 diagnostics.record(TerminationAction::WaitForExit, not_terminated);
396 None
397 }
398 Err(wait_error) => {
399 tracing::warn!(
400 process = %self.name,
401 signal = signal_name,
402 next_signal = next_signal_name,
403 error = %wait_error,
404 "Wait for graceful shutdown failed. Attempting next shutdown phase."
405 );
406 diagnostics.record(TerminationAction::WaitForExit, wait_error);
407 None
408 }
409 },
410 Err(send_error) => {
411 tracing::warn!(
412 process = %self.name,
413 signal = signal_name,
414 next_signal = next_signal_name,
415 error = %send_error,
416 "Graceful shutdown signal could not be sent. Attempting next shutdown phase."
417 );
418 diagnostics.record(TerminationAction::SendSignal { signal_name }, send_error);
419
420 match self
421 .wait_for_exit_after_signal(REAP_AFTER_SIGNAL_FAILURE_GRACE)
422 .await
423 {
424 Ok(Some(exit_status)) => Some(exit_status),
425 Ok(None) => None,
426 Err(reap_error) => {
427 tracing::warn!(
428 process = %self.name,
429 signal = signal_name,
430 error = %reap_error,
431 "Could not determine process state after graceful signal send failed."
432 );
433 diagnostics.record(TerminationAction::CheckStatus, reap_error);
434 None
435 }
436 }
437 }
438 }
439 }
440
441 async fn attempt_forceful_kill(
442 &mut self,
443 mut diagnostics: TerminationDiagnostics,
444 ) -> Result<ExitStatus, TerminationError> {
445 match self.send_kill_signal() {
446 Ok(()) => {
447 // Note: A forceful kill should typically (somewhat) immediately lead to
448 // termination of the process. But there are cases in which even a forceful kill
449 // does not / cannot / will not kill a process. We do not want to wait indefinitely
450 // in case this happens and therefore wait (at max) for a fixed duration after any
451 // kill.
452 match self
453 .wait_for_exit_after_signal(FORCE_KILL_WAIT_TIMEOUT)
454 .await
455 {
456 Ok(Some(exit_status)) => Ok(exit_status),
457 Ok(None) => {
458 let not_terminated_after_kill = wait_timeout_error(FORCE_KILL_WAIT_TIMEOUT);
459 // Unlikely. See the note above.
460 tracing::error!(
461 process = %self.name,
462 kill_signal = KILL_LABEL,
463 "Process did not terminate after all termination attempts. Process may still be running. Manual intervention and investigation required!"
464 );
465 diagnostics
466 .record(TerminationAction::WaitForExit, not_terminated_after_kill);
467 Err(diagnostics.into_termination_failed(self.name.clone()))
468 }
469 Err(not_terminated_after_kill) => {
470 // Unlikely. See the note above.
471 tracing::error!(
472 process = %self.name,
473 kill_signal = KILL_LABEL,
474 "Process did not terminate after all termination attempts. Process may still be running. Manual intervention and investigation required!"
475 );
476 diagnostics
477 .record(TerminationAction::WaitForExit, not_terminated_after_kill);
478 Err(diagnostics.into_termination_failed(self.name.clone()))
479 }
480 }
481 }
482 Err(kill_error) => {
483 tracing::error!(
484 process = %self.name,
485 error = %kill_error,
486 signal = KILL_LABEL,
487 "Forceful shutdown failed. Process may still be running. Manual intervention required!"
488 );
489 diagnostics.record(
490 TerminationAction::SendSignal {
491 signal_name: KILL_LABEL,
492 },
493 kill_error,
494 );
495
496 // Brief grace for Tokio's SIGCHLD reaper to catch up - see
497 // `REAP_AFTER_SIGNAL_FAILURE_GRACE`.
498 match self
499 .wait_for_exit_after_signal(REAP_AFTER_SIGNAL_FAILURE_GRACE)
500 .await
501 {
502 Ok(Some(exit_status)) => {
503 return Ok(exit_status);
504 }
505 Ok(None) => {}
506 Err(reap_error) => {
507 tracing::warn!(
508 process = %self.name,
509 signal = KILL_LABEL,
510 error = %reap_error,
511 "Could not determine process state after forceful shutdown failed."
512 );
513 diagnostics.record(TerminationAction::CheckStatus, reap_error);
514 }
515 }
516
517 Err(diagnostics.into_termination_failed(self.name.clone()))
518 }
519 }
520 }
521}
522
/// Builds the `TimedOut` I/O error that is recorded when the process did not exit within
/// `timeout`. The duration is rendered with its `Debug` formatting (e.g. `3s`, `100ms`).
fn wait_timeout_error(timeout: Duration) -> io::Error {
    let message = format!("process did not complete within {timeout:?}");
    io::Error::new(io::ErrorKind::TimedOut, message)
}
528}