Skip to main content

vtcode_bash_runner/
process_group.rs

1//! Process-group helpers for reliable child process cleanup.
2//!
3//! This module centralizes OS-specific pieces that ensure a spawned
4//! command can be cleaned up reliably:
5//! - `set_process_group` is called in `pre_exec` so the child starts its own
6//!   process group.
7//! - `detach_from_tty` starts a new session so non-interactive children do not
8//!   inherit the controlling TTY.
9//! - `kill_process_group_by_pid` targets the whole group (children/grandchildren)
10//!   instead of a single PID.
11//! - `kill_process_group` targets a known process group ID directly.
12//! - `set_parent_death_signal` (Linux only) arranges for the child to receive a
13//!   `SIGTERM` when the parent exits, and re-checks the parent PID to avoid
14//!   races during fork/exec.
15//! - `graceful_kill_process_group` sends SIGTERM, waits for a grace period, then
16//!   SIGKILL if still running.
17//!
18//! On non-Unix platforms these helpers are no-ops or adapted equivalents.
19//!
20//! Inspired by codex-rs/utils/pty process group management patterns.
21
22use std::io;
23
24#[cfg(unix)]
25use nix::errno::Errno;
26#[cfg(target_os = "linux")]
27use nix::sys::prctl;
28#[cfg(unix)]
29use nix::sys::signal::{self, Signal};
30#[cfg(unix)]
31use nix::unistd::{self, Pid};
32#[cfg(unix)]
33use tokio::process::Child;
34
/// Grace period, in milliseconds, used when the caller does not supply one.
///
/// [`graceful_kill_process_group_default`] converts this value into the
/// `Duration` it waits between the initial signal and the forced kill.
pub const DEFAULT_GRACEFUL_TIMEOUT_MS: u64 = 500;
37
/// Which signal the process-group kill helpers should deliver.
///
/// The `Default` implementation yields [`KillSignal::Kill`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum KillSignal {
    /// `SIGINT`: interrupt, the same signal Ctrl+C produces.
    Int,
    /// `SIGTERM`: gives the target a chance to shut down gracefully.
    Term,
    /// `SIGKILL`: terminates the target immediately.
    #[default]
    Kill,
}
49
50#[cfg(unix)]
51impl KillSignal {
52    fn as_nix_signal(self) -> Signal {
53        match self {
54            KillSignal::Int => Signal::SIGINT,
55            KillSignal::Term => Signal::SIGTERM,
56            KillSignal::Kill => Signal::SIGKILL,
57        }
58    }
59}
60
61#[cfg(unix)]
62#[cold]
63fn nix_err_to_io(err: Errno) -> io::Error {
64    io::Error::from_raw_os_error(err as i32)
65}
66
67/// Ensure the child receives SIGTERM when the original parent dies.
68///
69/// This should run in `pre_exec` and uses `parent_pid` captured before spawn to
70/// avoid a race where the parent exits between fork and exec.
71#[cfg(target_os = "linux")]
72pub fn set_parent_death_signal(parent_pid: libc::pid_t) -> io::Result<()> {
73    prctl::set_pdeathsig(Some(Signal::SIGTERM)).map_err(nix_err_to_io)?;
74
75    // Re-check parent PID to avoid race condition where parent exits between fork and exec.
76    if unistd::getppid() != Pid::from_raw(parent_pid) {
77        signal::kill(unistd::getpid(), Signal::SIGTERM).map_err(nix_err_to_io)?;
78    }
79
80    Ok(())
81}
82
/// Non-Linux stand-in for the parent-death signal setup.
///
/// This variant ignores `_parent_pid`, performs no work, and always returns
/// `Ok(())`.
#[cfg(not(target_os = "linux"))]
pub fn set_parent_death_signal(_parent_pid: i32) -> io::Result<()> {
    Ok(())
}
88
89/// Detach from the controlling TTY by starting a new session.
90///
91/// This is useful for spawning background processes that should not receive
92/// signals from the controlling terminal.
93#[cfg(unix)]
94pub fn detach_from_tty() -> io::Result<()> {
95    match unistd::setsid() {
96        Ok(_) => Ok(()),
97        // EPERM means we're already a session leader, fall back to setpgid.
98        Err(Errno::EPERM) => set_process_group(),
99        Err(err) => Err(nix_err_to_io(err)),
100    }
101}
102
/// Non-Unix stand-in for TTY detachment.
///
/// Performs no work and always returns `Ok(())`.
#[cfg(not(unix))]
pub fn detach_from_tty() -> io::Result<()> {
    Ok(())
}
108
109/// Put the calling process into its own process group.
110///
111/// Intended for use in `pre_exec` so the child becomes the group leader.
112#[cfg(unix)]
113pub fn set_process_group() -> io::Result<()> {
114    unistd::setpgid(Pid::from_raw(0), Pid::from_raw(0)).map_err(nix_err_to_io)
115}
116
/// Non-Unix stand-in for process-group creation.
///
/// Performs no work and always returns `Ok(())`.
#[cfg(not(unix))]
pub fn set_process_group() -> io::Result<()> {
    Ok(())
}
122
123/// Kill the process group for the given PID (best-effort).
124///
125/// This resolves the PGID for `pid` and sends SIGKILL to the whole group.
126#[cfg(unix)]
127pub fn kill_process_group_by_pid(pid: u32) -> io::Result<()> {
128    kill_process_group_by_pid_with_signal(pid, KillSignal::Kill)
129}
130
131/// Kill the process group for the given PID with a specific signal.
132#[cfg(unix)]
133pub fn kill_process_group_by_pid_with_signal(pid: u32, signal: KillSignal) -> io::Result<()> {
134    use std::io::ErrorKind;
135
136    let target_pid = Pid::from_raw(pid as libc::pid_t);
137    let pgid = unistd::getpgid(Some(target_pid));
138    let mut pgid_err = None;
139
140    match pgid {
141        Ok(group) => {
142            if let Err(err) = signal::killpg(group, signal.as_nix_signal()) {
143                let io_err = nix_err_to_io(err);
144                if io_err.kind() != ErrorKind::NotFound {
145                    pgid_err = Some(io_err);
146                }
147            }
148        }
149        Err(err) => pgid_err = Some(nix_err_to_io(err)),
150    }
151
152    // Always attempt to kill the direct child process handle as a fallback.
153    // This ensures termination even if the cached PGID was stale or
154    // the process group kill had issues.
155    if let Err(err) = signal::kill(target_pid, signal.as_nix_signal()) {
156        let io_err = nix_err_to_io(err);
157        if io_err.kind() == ErrorKind::NotFound {
158            // If direct kill says not found, we're done regardless of pgid result.
159            return Ok(());
160        }
161        // If we have a pgid error and a direct kill error, prefer the pgid one.
162        if let Some(pgid_error) = pgid_err {
163            return Err(pgid_error);
164        }
165        return Err(io_err);
166    }
167
168    Ok(())
169}
170
/// Non-Unix stand-in for killing a process group by member PID.
///
/// Ignores `_pid`, performs no work, and always returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_process_group_by_pid(_pid: u32) -> io::Result<()> {
    Ok(())
}
176
/// Non-Unix stand-in for signalling a process group by member PID.
///
/// Ignores both arguments, performs no work, and always returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_process_group_by_pid_with_signal(_pid: u32, _signal: KillSignal) -> io::Result<()> {
    Ok(())
}
182
183/// Kill a specific process group ID (best-effort).
184#[cfg(unix)]
185pub fn kill_process_group(process_group_id: u32) -> io::Result<()> {
186    kill_process_group_with_signal(process_group_id, KillSignal::Kill)
187}
188
189/// Kill a specific process group ID with a specific signal.
190#[cfg(unix)]
191pub fn kill_process_group_with_signal(process_group_id: u32, signal: KillSignal) -> io::Result<()> {
192    use std::io::ErrorKind;
193
194    let pgid = Pid::from_raw(process_group_id as libc::pid_t);
195    if let Err(err) = signal::killpg(pgid, signal.as_nix_signal()) {
196        let io_err = nix_err_to_io(err);
197        if io_err.kind() != ErrorKind::NotFound {
198            return Err(io_err);
199        }
200    }
201
202    Ok(())
203}
204
/// Non-Unix stand-in for killing a process group by id.
///
/// Ignores `_process_group_id`, performs no work, and always returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_process_group(_process_group_id: u32) -> io::Result<()> {
    Ok(())
}
210
/// Non-Unix stand-in for signalling a process group by id.
///
/// Ignores both arguments, performs no work, and always returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_process_group_with_signal(
    _process_group_id: u32,
    _signal: KillSignal,
) -> io::Result<()> {
    Ok(())
}
219
220/// Kill the process group for a tokio child (best-effort).
221#[cfg(unix)]
222pub fn kill_child_process_group(child: &mut Child) -> io::Result<()> {
223    kill_child_process_group_with_signal(child, KillSignal::Kill)
224}
225
226/// Kill the process group for a tokio child with a specific signal.
227#[cfg(unix)]
228pub fn kill_child_process_group_with_signal(
229    child: &mut Child,
230    signal: KillSignal,
231) -> io::Result<()> {
232    if let Some(pid) = child.id() {
233        return kill_process_group_by_pid_with_signal(pid, signal);
234    }
235
236    Ok(())
237}
238
/// Non-Unix stand-in for killing a tokio child's process group.
///
/// Ignores `_child`, performs no work, and always returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_child_process_group(_child: &mut tokio::process::Child) -> io::Result<()> {
    Ok(())
}
244
/// Non-Unix stand-in for signalling a tokio child's process group.
///
/// Ignores both arguments, performs no work, and always returns `Ok(())`.
#[cfg(not(unix))]
pub fn kill_child_process_group_with_signal(
    _child: &mut tokio::process::Child,
    _signal: KillSignal,
) -> io::Result<()> {
    Ok(())
}
253
/// Forcefully terminate a process by PID on Windows.
///
/// Runs `taskkill /PID <pid> /T /F` and waits for it to finish.
///
/// # Errors
/// Returns the spawn error if `taskkill` could not be run, or an error with
/// the message `taskkill failed` if it exited unsuccessfully.
#[cfg(windows)]
pub fn kill_process(pid: u32) -> io::Result<()> {
    let pid_arg = pid.to_string();
    let mut taskkill = std::process::Command::new("taskkill");
    taskkill.args(["/PID", pid_arg.as_str(), "/T", "/F"]);

    if taskkill.status()?.success() {
        return Ok(());
    }
    Err(io::Error::other("taskkill failed"))
}
266
/// Non-Windows stand-in for the `taskkill`-based process kill.
///
/// Ignores `_pid`, performs no work, and always returns `Ok(())`.
#[cfg(not(windows))]
pub fn kill_process(_pid: u32) -> io::Result<()> {
    Ok(())
}
272
/// Outcome reported by the graceful termination helpers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GracefulTerminationResult {
    /// The process went away after the initial (non-forceful) request.
    GracefulExit,
    /// The process did not exit in time and was forcefully killed.
    ForcefulKill,
    /// The process was not running when termination was attempted.
    AlreadyExited,
    /// The process could not be checked or terminated.
    Error,
}
285
286/// Check if a process (by PID) is still running.
287#[cfg(unix)]
288fn is_process_running(pid: u32) -> bool {
289    let target_pid = Pid::from_raw(pid as libc::pid_t);
290    match signal::kill(target_pid, None::<Signal>) {
291        Ok(()) => true,
292        // EPERM = exists but no permission (still running)
293        Err(Errno::EPERM) => true,
294        Err(_) => false,
295    }
296}
297
/// Non-Unix stand-in for the liveness probe.
///
/// No probe is performed here; the process is always assumed to be running and
/// later termination steps are left to fail gracefully if it is not.
#[cfg(not(unix))]
fn is_process_running(_pid: u32) -> bool {
    true
}
303
304/// Gracefully terminate a process group by PID.
305///
306/// This function implements a staged termination strategy:
307/// 1. Send the initial signal (default: SIGTERM, or SIGINT for interactive processes)
308/// 2. Wait up to `grace_period` for the process to exit
309/// 3. If still running, send SIGKILL
310///
311/// Returns information about how the termination completed.
312///
313/// # Arguments
314/// * `pid` - Process ID (will be used to resolve the process group)
315/// * `initial_signal` - Signal to try first (SIGINT, SIGTERM)
316/// * `grace_period` - How long to wait before SIGKILL
317#[cfg(unix)]
318pub fn graceful_kill_process_group(
319    pid: u32,
320    initial_signal: KillSignal,
321    grace_period: std::time::Duration,
322) -> GracefulTerminationResult {
323    // Check if already exited
324    if !is_process_running(pid) {
325        return GracefulTerminationResult::AlreadyExited;
326    }
327
328    // Resolve PGID
329    let target_pid = Pid::from_raw(pid as libc::pid_t);
330    let Ok(pgid) = unistd::getpgid(Some(target_pid)) else {
331        // Can't get PGID - process may have already exited.
332        return GracefulTerminationResult::AlreadyExited;
333    };
334
335    // Send initial signal (SIGTERM or SIGINT)
336    let signal = match initial_signal {
337        KillSignal::Kill => Signal::SIGTERM, // Don't send SIGKILL as initial.
338        other => other.as_nix_signal(),
339    };
340
341    if let Err(err) = signal::killpg(pgid, signal) {
342        if err != Errno::ESRCH {
343            return GracefulTerminationResult::Error;
344        }
345        return GracefulTerminationResult::AlreadyExited;
346    }
347
348    // Wait for graceful exit
349    let deadline = std::time::Instant::now() + grace_period;
350    let poll_interval = std::time::Duration::from_millis(10);
351
352    while std::time::Instant::now() < deadline {
353        if !is_process_running(pid) {
354            return GracefulTerminationResult::GracefulExit;
355        }
356        std::thread::sleep(poll_interval);
357    }
358
359    // Still running - force kill.
360    // Use the robust termination behavior from codex-rs/utils/pty PR 12688
361    // by attempting both a pgid kill and a direct pid kill.
362    let _ = signal::killpg(pgid, Signal::SIGKILL);
363    if let Err(err) = signal::kill(target_pid, Signal::SIGKILL) {
364        if err == Errno::ESRCH {
365            // Exited between check and kill.
366            return GracefulTerminationResult::GracefulExit;
367        }
368        return GracefulTerminationResult::Error;
369    }
370
371    GracefulTerminationResult::ForcefulKill
372}
373
/// Graceful termination on non-Unix platforms (best effort).
///
/// On Windows:
/// 1. Run `taskkill /PID <pid> /T` (no `/F`) to request termination.
/// 2. If that request succeeds, wait `grace_period`, then retry with `/F` via
///    [`kill_process`]. A successful forced kill means the process was still
///    alive (`ForcefulKill`); a failed one is taken to mean it had already
///    exited on its own (`GracefulExit`).
/// 3. If the request is rejected, force-kill immediately: `ForcefulKill` on
///    success, `AlreadyExited` otherwise.
///
/// `initial_signal` is ignored on Windows. On platforms that are neither Unix
/// nor Windows nothing is attempted and `Error` is returned.
///
/// # Returns
/// `Error` if `taskkill` could not be launched at all (or on unsupported
/// platforms); otherwise one of the outcomes described above.
#[cfg(not(unix))]
pub fn graceful_kill_process_group(
    pid: u32,
    initial_signal: KillSignal,
    grace_period: std::time::Duration,
) -> GracefulTerminationResult {
    #[cfg(windows)]
    {
        let _ = initial_signal;
        let pid_arg = pid.to_string();
        let polite_request = std::process::Command::new("taskkill")
            .args(["/PID", &pid_arg, "/T"])
            .status();

        match polite_request {
            Ok(status) if status.success() => {
                std::thread::sleep(grace_period);
                // Escalate after the grace period so a process that ignored
                // the polite request does not survive.
                match kill_process(pid) {
                    Ok(()) => GracefulTerminationResult::ForcefulKill,
                    Err(_) => GracefulTerminationResult::GracefulExit,
                }
            }
            Ok(_) => match kill_process(pid) {
                Ok(()) => GracefulTerminationResult::ForcefulKill,
                Err(_) => GracefulTerminationResult::AlreadyExited,
            },
            Err(_) => GracefulTerminationResult::Error,
        }
    }
    #[cfg(not(windows))]
    {
        let _ = (pid, initial_signal, grace_period);
        GracefulTerminationResult::Error
    }
}
409
410/// Gracefully terminate a process group with default settings.
411///
412/// Uses SIGTERM and the default grace period (500ms).
413pub fn graceful_kill_process_group_default(pid: u32) -> GracefulTerminationResult {
414    graceful_kill_process_group(
415        pid,
416        KillSignal::Term,
417        std::time::Duration::from_millis(DEFAULT_GRACEFUL_TIMEOUT_MS),
418    )
419}
420
421/// Async-safe wrapper for graceful process-group termination.
422///
423/// This offloads the synchronous graceful-kill loop to Tokio's blocking pool so
424/// async runtime threads are not occupied by polling sleeps.
425pub async fn graceful_kill_process_group_default_async(pid: u32) -> GracefulTerminationResult {
426    tokio::task::spawn_blocking(move || graceful_kill_process_group_default(pid))
427        .await
428        .unwrap_or(GracefulTerminationResult::Error)
429}
430
#[cfg(test)]
mod tests {
    use super::*;

    /// Registering the parent-death signal must not panic or kill the test.
    #[test]
    fn test_set_parent_death_signal_no_panic() {
        #[cfg(target_os = "linux")]
        {
            // Pass the real parent PID. Passing our own PID would make the
            // parent re-check inside `set_parent_death_signal` fail, and the
            // function would then send SIGTERM to the test process itself.
            let parent_pid = unistd::getppid().as_raw();
            assert!(set_parent_death_signal(parent_pid).is_ok());
        }
        #[cfg(not(target_os = "linux"))]
        {
            assert!(set_parent_death_signal(0).is_ok());
        }
    }

    /// Killing a process group that does not exist must not panic.
    #[test]
    fn test_kill_nonexistent_process_group() {
        #[cfg(unix)]
        {
            // Use a very high group id that should not exist. The outcome is
            // intentionally not asserted; only the absence of a panic is
            // verified here.
            let result = kill_process_group(2_000_000_000);
            let _ = result;
        }
        #[cfg(not(unix))]
        {
            let result = kill_process_group(999_999);
            assert!(result.is_ok());
        }
    }

    /// The signal variants are distinct and the default is `Kill`.
    #[test]
    fn test_kill_signal_values() {
        assert_ne!(KillSignal::Int, KillSignal::Term);
        assert_ne!(KillSignal::Term, KillSignal::Kill);
        assert_ne!(KillSignal::Int, KillSignal::Kill);

        assert_eq!(KillSignal::default(), KillSignal::Kill);
    }

    /// Every termination result can be formatted with `Debug`.
    #[test]
    fn test_graceful_termination_result_debug() {
        let results = [
            GracefulTerminationResult::GracefulExit,
            GracefulTerminationResult::ForcefulKill,
            GracefulTerminationResult::AlreadyExited,
            GracefulTerminationResult::Error,
        ];
        for result in &results {
            let _ = format!("{result:?}");
        }
    }

    /// Gracefully killing a non-existent PID reports `AlreadyExited` on Unix.
    #[test]
    fn test_graceful_kill_nonexistent_process() {
        let result = graceful_kill_process_group_default(2_000_000_000);
        #[cfg(unix)]
        {
            assert_eq!(result, GracefulTerminationResult::AlreadyExited);
        }
        #[cfg(not(unix))]
        {
            // On non-Unix the outcome varies by platform.
            let _ = result;
        }
    }

    /// The async wrapper reports the same outcome as the synchronous helper.
    #[tokio::test]
    async fn test_graceful_kill_nonexistent_process_async() {
        let result = graceful_kill_process_group_default_async(2_000_000_000).await;
        #[cfg(unix)]
        {
            assert_eq!(result, GracefulTerminationResult::AlreadyExited);
        }
        #[cfg(not(unix))]
        {
            let _ = result;
        }
    }

    /// Our own process must be reported as running.
    #[cfg(unix)]
    #[test]
    fn test_is_process_running_self() {
        let pid = std::process::id();
        assert!(is_process_running(pid));
    }

    /// A very high PID must not be reported as running.
    #[cfg(unix)]
    #[test]
    fn test_is_process_running_nonexistent() {
        assert!(!is_process_running(2_000_000_000));
    }
}