Skip to main content

vtcode_bash_runner/
process_group.rs

1//! Process-group helpers for reliable child process cleanup.
2//!
3//! This module centralizes OS-specific pieces that ensure a spawned
4//! command can be cleaned up reliably:
5//! - `set_process_group` is called in `pre_exec` so the child starts its own
6//!   process group.
7//! - `detach_from_tty` starts a new session so non-interactive children do not
8//!   inherit the controlling TTY.
9//! - `kill_process_group_by_pid` targets the whole group (children/grandchildren)
10//!   instead of a single PID.
11//! - `kill_process_group` targets a known process group ID directly.
12//! - `set_parent_death_signal` (Linux only) arranges for the child to receive a
13//!   `SIGTERM` when the parent exits, and re-checks the parent PID to avoid
14//!   races during fork/exec.
15//! - `graceful_kill_process_group` sends SIGTERM, waits for a grace period, then
16//!   SIGKILL if still running.
17//!
18//! On non-Unix platforms these helpers are no-ops or adapted equivalents.
19//!
20//! Inspired by codex-rs/utils/pty process group management patterns.
21
22use std::io;
23
24#[cfg(unix)]
25use nix::errno::Errno;
26#[cfg(target_os = "linux")]
27use nix::sys::prctl;
28#[cfg(unix)]
29use nix::sys::signal::{self, Signal};
30#[cfg(unix)]
31use nix::unistd::{self, Pid};
32#[cfg(unix)]
33use tokio::process::Child;
34
/// Grace period, in milliseconds, applied by the `*_default` graceful
/// termination helper before it escalates to a forced kill.
pub const DEFAULT_GRACEFUL_TIMEOUT_MS: u64 = 500;
37
/// Selects which signal the process-group kill helpers deliver.
///
/// The default variant is [`KillSignal::Kill`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum KillSignal {
    /// `SIGINT`: interrupt, equivalent to pressing Ctrl+C.
    Int,
    /// `SIGTERM`: termination request that lets the target shut down cleanly.
    Term,
    /// `SIGKILL`: immediate, unconditional termination.
    #[default]
    Kill,
}
49
50#[cfg(unix)]
51impl KillSignal {
52    fn as_nix_signal(self) -> Signal {
53        match self {
54            KillSignal::Int => Signal::SIGINT,
55            KillSignal::Term => Signal::SIGTERM,
56            KillSignal::Kill => Signal::SIGKILL,
57        }
58    }
59}
60
61#[cfg(unix)]
62fn nix_err_to_io(err: Errno) -> io::Error {
63    io::Error::from_raw_os_error(err as i32)
64}
65
66/// Ensure the child receives SIGTERM when the original parent dies.
67///
68/// This should run in `pre_exec` and uses `parent_pid` captured before spawn to
69/// avoid a race where the parent exits between fork and exec.
70#[cfg(target_os = "linux")]
71pub fn set_parent_death_signal(parent_pid: libc::pid_t) -> io::Result<()> {
72    prctl::set_pdeathsig(Some(Signal::SIGTERM)).map_err(nix_err_to_io)?;
73
74    // Re-check parent PID to avoid race condition where parent exits between fork and exec.
75    if unistd::getppid() != Pid::from_raw(parent_pid) {
76        signal::kill(unistd::getpid(), Signal::SIGTERM).map_err(nix_err_to_io)?;
77    }
78
79    Ok(())
80}
81
/// Parent-death signalling is only implemented on Linux; on every other
/// platform this does nothing and always returns `Ok(())`.
#[cfg(not(target_os = "linux"))]
pub fn set_parent_death_signal(_parent_pid: i32) -> io::Result<()> {
    Ok(())
}
87
88/// Detach from the controlling TTY by starting a new session.
89///
90/// This is useful for spawning background processes that should not receive
91/// signals from the controlling terminal.
92#[cfg(unix)]
93pub fn detach_from_tty() -> io::Result<()> {
94    match unistd::setsid() {
95        Ok(_) => Ok(()),
96        // EPERM means we're already a session leader, fall back to setpgid.
97        Err(Errno::EPERM) => set_process_group(),
98        Err(err) => Err(nix_err_to_io(err)),
99    }
100}
101
/// Non-Unix stand-in for TTY detachment: does nothing and returns `Ok(())`.
#[cfg(not(unix))]
pub fn detach_from_tty() -> io::Result<()> {
    Ok(())
}
107
108/// Put the calling process into its own process group.
109///
110/// Intended for use in `pre_exec` so the child becomes the group leader.
111#[cfg(unix)]
112pub fn set_process_group() -> io::Result<()> {
113    unistd::setpgid(Pid::from_raw(0), Pid::from_raw(0)).map_err(nix_err_to_io)
114}
115
/// Non-Unix stand-in for process-group creation: does nothing and returns
/// `Ok(())`.
#[cfg(not(unix))]
pub fn set_process_group() -> io::Result<()> {
    Ok(())
}
121
122/// Kill the process group for the given PID (best-effort).
123///
124/// This resolves the PGID for `pid` and sends SIGKILL to the whole group.
125#[cfg(unix)]
126pub fn kill_process_group_by_pid(pid: u32) -> io::Result<()> {
127    kill_process_group_by_pid_with_signal(pid, KillSignal::Kill)
128}
129
130/// Kill the process group for the given PID with a specific signal.
131#[cfg(unix)]
132pub fn kill_process_group_by_pid_with_signal(pid: u32, signal: KillSignal) -> io::Result<()> {
133    use std::io::ErrorKind;
134
135    let target_pid = Pid::from_raw(pid as libc::pid_t);
136    let pgid = unistd::getpgid(Some(target_pid));
137    let mut pgid_err = None;
138
139    match pgid {
140        Ok(group) => {
141            if let Err(err) = signal::killpg(group, signal.as_nix_signal()) {
142                let io_err = nix_err_to_io(err);
143                if io_err.kind() != ErrorKind::NotFound {
144                    pgid_err = Some(io_err);
145                }
146            }
147        }
148        Err(err) => pgid_err = Some(nix_err_to_io(err)),
149    }
150
151    // Always attempt to kill the direct child process handle as a fallback.
152    // This ensures termination even if the cached PGID was stale or
153    // the process group kill had issues.
154    if let Err(err) = signal::kill(target_pid, signal.as_nix_signal()) {
155        let io_err = nix_err_to_io(err);
156        if io_err.kind() == ErrorKind::NotFound {
157            // If direct kill says not found, we're done regardless of pgid result.
158            return Ok(());
159        }
160        // If we have a pgid error and a direct kill error, prefer the pgid one.
161        if let Some(pgid_error) = pgid_err {
162            return Err(pgid_error);
163        }
164        return Err(io_err);
165    }
166
167    Ok(())
168}
169
/// Non-Unix stand-in: there is no process group to signal, so this does
/// nothing and returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_process_group_by_pid(_pid: u32) -> io::Result<()> {
    Ok(())
}
175
/// Non-Unix stand-in: ignores both the PID and the requested signal and
/// returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_process_group_by_pid_with_signal(_pid: u32, _signal: KillSignal) -> io::Result<()> {
    Ok(())
}
181
182/// Kill a specific process group ID (best-effort).
183#[cfg(unix)]
184pub fn kill_process_group(process_group_id: u32) -> io::Result<()> {
185    kill_process_group_with_signal(process_group_id, KillSignal::Kill)
186}
187
188/// Kill a specific process group ID with a specific signal.
189#[cfg(unix)]
190pub fn kill_process_group_with_signal(process_group_id: u32, signal: KillSignal) -> io::Result<()> {
191    use std::io::ErrorKind;
192
193    let pgid = Pid::from_raw(process_group_id as libc::pid_t);
194    if let Err(err) = signal::killpg(pgid, signal.as_nix_signal()) {
195        let io_err = nix_err_to_io(err);
196        if io_err.kind() != ErrorKind::NotFound {
197            return Err(io_err);
198        }
199    }
200
201    Ok(())
202}
203
/// Non-Unix stand-in: ignores the group ID and returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_process_group(_process_group_id: u32) -> io::Result<()> {
    Ok(())
}
209
/// Non-Unix stand-in: ignores the group ID and the requested signal and
/// returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_process_group_with_signal(
    _process_group_id: u32,
    _signal: KillSignal,
) -> io::Result<()> {
    Ok(())
}
218
219/// Kill the process group for a tokio child (best-effort).
220#[cfg(unix)]
221pub fn kill_child_process_group(child: &mut Child) -> io::Result<()> {
222    kill_child_process_group_with_signal(child, KillSignal::Kill)
223}
224
225/// Kill the process group for a tokio child with a specific signal.
226#[cfg(unix)]
227pub fn kill_child_process_group_with_signal(
228    child: &mut Child,
229    signal: KillSignal,
230) -> io::Result<()> {
231    if let Some(pid) = child.id() {
232        return kill_process_group_by_pid_with_signal(pid, signal);
233    }
234
235    Ok(())
236}
237
/// Non-Unix stand-in: leaves the child untouched and returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_child_process_group(_child: &mut tokio::process::Child) -> io::Result<()> {
    Ok(())
}
243
/// Non-Unix stand-in: ignores the child and the requested signal and returns
/// `Ok(())`.
#[cfg(not(unix))]
pub fn kill_child_process_group_with_signal(
    _child: &mut tokio::process::Child,
    _signal: KillSignal,
) -> io::Result<()> {
    Ok(())
}
252
/// Forcefully terminate `pid` and its child processes on Windows.
///
/// Runs `taskkill /PID <pid> /T /F` and waits for it to finish.
///
/// # Errors
/// Returns the spawn error if `taskkill` cannot be started, or a generic
/// `"taskkill failed"` error when it exits unsuccessfully.
#[cfg(windows)]
pub fn kill_process(pid: u32) -> io::Result<()> {
    let pid_arg = pid.to_string();
    let mut taskkill = std::process::Command::new("taskkill");
    taskkill.args(["/PID", pid_arg.as_str(), "/T", "/F"]);

    if !taskkill.status()?.success() {
        return Err(io::Error::other("taskkill failed"));
    }
    Ok(())
}
265
/// Stand-in for the Windows-only `taskkill` helper: on every other platform
/// this ignores `_pid` and returns `Ok(())`.
#[cfg(not(windows))]
pub fn kill_process(_pid: u32) -> io::Result<()> {
    Ok(())
}
271
/// How a graceful termination attempt ended.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GracefulTerminationResult {
    /// The process went away on its own after the initial SIGTERM/SIGINT.
    GracefulExit,
    /// The process outlived the grace period and was killed with SIGKILL.
    ForcefulKill,
    /// The process was not running when termination was attempted.
    AlreadyExited,
    /// The process could not be checked or terminated.
    Error,
}
284
285/// Check if a process (by PID) is still running.
286#[cfg(unix)]
287fn is_process_running(pid: u32) -> bool {
288    let target_pid = Pid::from_raw(pid as libc::pid_t);
289    match signal::kill(target_pid, None::<Signal>) {
290        Ok(()) => true,
291        // EPERM = exists but no permission (still running)
292        Err(Errno::EPERM) => true,
293        Err(_) => false,
294    }
295}
296
/// Non-Unix stand-in: no liveness probe is implemented, so every PID is
/// reported as running and callers are left to handle a failed kill.
#[cfg(not(unix))]
fn is_process_running(_pid: u32) -> bool {
    true
}
302
303/// Gracefully terminate a process group by PID.
304///
305/// This function implements a staged termination strategy:
306/// 1. Send the initial signal (default: SIGTERM, or SIGINT for interactive processes)
307/// 2. Wait up to `grace_period` for the process to exit
308/// 3. If still running, send SIGKILL
309///
310/// Returns information about how the termination completed.
311///
312/// # Arguments
313/// * `pid` - Process ID (will be used to resolve the process group)
314/// * `initial_signal` - Signal to try first (SIGINT, SIGTERM)
315/// * `grace_period` - How long to wait before SIGKILL
316#[cfg(unix)]
317pub fn graceful_kill_process_group(
318    pid: u32,
319    initial_signal: KillSignal,
320    grace_period: std::time::Duration,
321) -> GracefulTerminationResult {
322    // Check if already exited
323    if !is_process_running(pid) {
324        return GracefulTerminationResult::AlreadyExited;
325    }
326
327    // Resolve PGID
328    let target_pid = Pid::from_raw(pid as libc::pid_t);
329    let Ok(pgid) = unistd::getpgid(Some(target_pid)) else {
330        // Can't get PGID - process may have already exited.
331        return GracefulTerminationResult::AlreadyExited;
332    };
333
334    // Send initial signal (SIGTERM or SIGINT)
335    let signal = match initial_signal {
336        KillSignal::Kill => Signal::SIGTERM, // Don't send SIGKILL as initial.
337        other => other.as_nix_signal(),
338    };
339
340    if let Err(err) = signal::killpg(pgid, signal) {
341        if err != Errno::ESRCH {
342            return GracefulTerminationResult::Error;
343        }
344        return GracefulTerminationResult::AlreadyExited;
345    }
346
347    // Wait for graceful exit
348    let deadline = std::time::Instant::now() + grace_period;
349    let poll_interval = std::time::Duration::from_millis(10);
350
351    while std::time::Instant::now() < deadline {
352        if !is_process_running(pid) {
353            return GracefulTerminationResult::GracefulExit;
354        }
355        std::thread::sleep(poll_interval);
356    }
357
358    // Still running - force kill.
359    // Use the robust termination behavior from codex-rs/utils/pty PR 12688
360    // by attempting both a pgid kill and a direct pid kill.
361    let _ = signal::killpg(pgid, Signal::SIGKILL);
362    if let Err(err) = signal::kill(target_pid, Signal::SIGKILL) {
363        if err == Errno::ESRCH {
364            // Exited between check and kill.
365            return GracefulTerminationResult::GracefulExit;
366        }
367        return GracefulTerminationResult::Error;
368    }
369
370    GracefulTerminationResult::ForcefulKill
371}
372
/// Graceful termination on non-Unix platforms (best effort).
///
/// On Windows:
/// 1. Run `taskkill /PID <pid> /T` (no `/F`) to request a normal close.
/// 2. If that request is accepted, wait `grace_period`, then run the forced
///    variant via [`kill_process`]. A successful forced kill means the process
///    outlived the grace period (`ForcefulKill`); a failed one is taken to
///    mean it was already gone (`GracefulExit`).
/// 3. If the polite request is rejected, escalate to [`kill_process`]
///    immediately: `ForcefulKill` on success, `AlreadyExited` otherwise.
///
/// `initial_signal` has no meaning on Windows and is ignored. If `taskkill`
/// cannot be started at all the result is `Error`.
///
/// On platforms that are neither Unix nor Windows nothing is attempted and
/// the result is always `Error`.
#[cfg(not(unix))]
pub fn graceful_kill_process_group(
    pid: u32,
    initial_signal: KillSignal,
    grace_period: std::time::Duration,
) -> GracefulTerminationResult {
    #[cfg(windows)]
    {
        let _ = initial_signal;
        let pid_arg = pid.to_string();
        match std::process::Command::new("taskkill")
            .args(["/PID", &pid_arg, "/T"])
            .status()
        {
            Ok(status) if status.success() => {
                std::thread::sleep(grace_period);
                // The close request was only *sent*; make sure the process is
                // really gone instead of assuming it complied.
                match kill_process(pid) {
                    Ok(()) => GracefulTerminationResult::ForcefulKill,
                    Err(_) => GracefulTerminationResult::GracefulExit,
                }
            }
            Ok(_) => match kill_process(pid) {
                Ok(()) => GracefulTerminationResult::ForcefulKill,
                Err(_) => GracefulTerminationResult::AlreadyExited,
            },
            Err(_) => GracefulTerminationResult::Error,
        }
    }
    #[cfg(not(windows))]
    {
        let _ = (pid, initial_signal, grace_period);
        GracefulTerminationResult::Error
    }
}
408
409/// Gracefully terminate a process group with default settings.
410///
411/// Uses SIGTERM and the default grace period (500ms).
412#[cfg(unix)]
413pub fn graceful_kill_process_group_default(pid: u32) -> GracefulTerminationResult {
414    graceful_kill_process_group(
415        pid,
416        KillSignal::Term,
417        std::time::Duration::from_millis(DEFAULT_GRACEFUL_TIMEOUT_MS),
418    )
419}
420
/// Non-Unix counterpart of the default graceful termination helper: requests
/// [`KillSignal::Term`] with a grace period of
/// [`DEFAULT_GRACEFUL_TIMEOUT_MS`] milliseconds.
#[cfg(not(unix))]
pub fn graceful_kill_process_group_default(pid: u32) -> GracefulTerminationResult {
    let grace_period = std::time::Duration::from_millis(DEFAULT_GRACEFUL_TIMEOUT_MS);
    graceful_kill_process_group(pid, KillSignal::Term, grace_period)
}
430
#[cfg(test)]
mod tests {
    use super::*;

    /// `set_parent_death_signal` must be callable without panicking.
    #[test]
    fn test_set_parent_death_signal_no_panic() {
        #[cfg(target_os = "linux")]
        {
            // Pass the test process's *actual* parent PID. Passing our own PID
            // would make the parent re-check inside the function fail, and the
            // function would then send SIGTERM to the test process itself.
            let parent_pid = unistd::getppid().as_raw();
            // We are not inside `pre_exec`, so only the absence of a panic is
            // checked, not the result.
            let _ = set_parent_death_signal(parent_pid);
        }
        #[cfg(not(target_os = "linux"))]
        {
            assert!(set_parent_death_signal(0).is_ok());
        }
    }

    /// Signalling a process group that does not exist must not panic.
    #[test]
    fn test_kill_nonexistent_process_group() {
        #[cfg(unix)]
        {
            // A group ID this high should not exist; only the absence of a
            // panic is checked here.
            let result = kill_process_group(2_000_000_000);
            let _ = result;
        }
        #[cfg(not(unix))]
        {
            // The non-Unix implementation is a no-op.
            let result = kill_process_group(999_999);
            assert!(result.is_ok());
        }
    }

    /// `KillSignal` variants are distinct and the default is `Kill`.
    #[test]
    fn test_kill_signal_values() {
        assert_ne!(KillSignal::Int, KillSignal::Term);
        assert_ne!(KillSignal::Term, KillSignal::Kill);
        assert_ne!(KillSignal::Int, KillSignal::Kill);

        assert_eq!(KillSignal::default(), KillSignal::Kill);
    }

    /// Every `GracefulTerminationResult` variant can be `Debug`-formatted.
    #[test]
    fn test_graceful_termination_result_debug() {
        let results = [
            GracefulTerminationResult::GracefulExit,
            GracefulTerminationResult::ForcefulKill,
            GracefulTerminationResult::AlreadyExited,
            GracefulTerminationResult::Error,
        ];
        for result in &results {
            let _ = format!("{result:?}");
        }
    }

    /// Gracefully killing a PID that does not exist reports `AlreadyExited`.
    #[test]
    fn test_graceful_kill_nonexistent_process() {
        let result = graceful_kill_process_group_default(2_000_000_000);
        #[cfg(unix)]
        {
            assert_eq!(result, GracefulTerminationResult::AlreadyExited);
        }
        #[cfg(not(unix))]
        {
            // Behavior varies by platform; only the absence of a panic matters.
            let _ = result;
        }
    }

    /// The test process itself must be reported as running.
    #[cfg(unix)]
    #[test]
    fn test_is_process_running_self() {
        let pid = std::process::id();
        assert!(is_process_running(pid));
    }

    /// A PID that should not exist must be reported as not running.
    #[cfg(unix)]
    #[test]
    fn test_is_process_running_nonexistent() {
        assert!(!is_process_running(2_000_000_000));
    }
}