Skip to main content

ralph_workflow/pipeline/idle_timeout/
kill.rs

1//! Subprocess termination helpers for idle-timeout enforcement.
2
3use crate::executor::{AgentChild, ProcessExecutor};
4use std::sync::{Arc, Mutex};
5use std::time::Duration;
6
/// Result of attempting to kill a process.
///
/// Produced by `kill_process`. Callers can tell apart a confirmed exit
/// (`TerminatedByTerm` / `TerminatedByKill`), an unconfirmed one
/// (`SignalsSentAwaitingExit`), and a failure to deliver any signal (`Failed`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KillResult {
    /// Process was successfully killed with SIGTERM.
    ///
    /// On Unix this is also returned, without any exit confirmation, when SIGTERM
    /// was delivered but no child handle was supplied to poll.
    TerminatedByTerm,
    /// Process required SIGKILL/taskkill escalation.
    ///
    /// On Windows this is also returned, without any exit confirmation, when
    /// `taskkill` succeeded but no child handle was supplied to poll.
    TerminatedByKill,
    /// Kill signals were sent successfully, but the process was not confirmed exited yet.
    ///
    /// The monitor should continue polling for exit. It may return `TimedOut`
    /// after a bounded enforcement window so the pipeline can regain control,
    /// but it must not silently stop enforcing termination; a background reaper
    /// should continue best-effort SIGKILL/taskkill attempts until exit is observed.
    ///
    /// `escalated` records whether the forceful step (SIGKILL / `taskkill /F`) had
    /// already been issued; both `kill_process` implementations in this file only
    /// ever construct this variant with `escalated: true`.
    SignalsSentAwaitingExit { escalated: bool },
    /// Kill attempt failed (process may have already exited).
    ///
    /// Returned when the signalling command could not be run or exited
    /// unsuccessfully for both the process-group and the single-PID target.
    Failed,
}
24
/// Timing parameters for the terminate-then-force-kill escalation.
///
/// All fields are private: construct a value with `KillConfig::new` and read it
/// back through the accessor of the same name as each field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct KillConfig {
    /// How long the Unix `kill_process` polls the child for exit after SIGTERM
    /// before escalating to SIGKILL.
    sigterm_grace: Duration,
    /// Sleep between successive `try_wait` polls of the child.
    poll_interval: Duration,
    /// How long `kill_process` polls for exit after SIGKILL / `taskkill` before
    /// returning `KillResult::SignalsSentAwaitingExit`.
    sigkill_confirm_timeout: Duration,
    /// Not read anywhere in this file.
    /// NOTE(review): presumably the upper bound the monitor waits after SIGKILL
    /// before reporting a timeout - confirm against the caller.
    post_sigkill_hard_cap: Duration,
    /// Not read anywhere in this file.
    /// NOTE(review): presumably how often a reaper re-sends SIGKILL while the
    /// process is still alive - confirm against the caller.
    sigkill_resend_interval: Duration,
}
33
impl KillConfig {
    /// Builds a configuration from its five durations.
    ///
    /// Arguments are positional and all share the type `Duration`, so the order
    /// matters: SIGTERM grace, poll interval, SIGKILL confirm timeout,
    /// post-SIGKILL hard cap, SIGKILL resend interval. No validation is performed;
    /// the values are stored as given.
    #[must_use]
    pub const fn new(
        sigterm_grace: Duration,
        poll_interval: Duration,
        sigkill_confirm_timeout: Duration,
        post_sigkill_hard_cap: Duration,
        sigkill_resend_interval: Duration,
    ) -> Self {
        Self {
            sigterm_grace,
            poll_interval,
            sigkill_confirm_timeout,
            post_sigkill_hard_cap,
            sigkill_resend_interval,
        }
    }

    /// Returns the stored SIGTERM grace period.
    #[must_use]
    pub const fn sigterm_grace(&self) -> Duration {
        self.sigterm_grace
    }

    /// Returns the stored interval between child exit polls.
    #[must_use]
    pub const fn poll_interval(&self) -> Duration {
        self.poll_interval
    }

    /// Returns the stored post-SIGKILL confirmation timeout.
    #[must_use]
    pub const fn sigkill_confirm_timeout(&self) -> Duration {
        self.sigkill_confirm_timeout
    }

    /// Returns the stored post-SIGKILL hard cap.
    #[must_use]
    pub const fn post_sigkill_hard_cap(&self) -> Duration {
        self.post_sigkill_hard_cap
    }

    /// Returns the stored SIGKILL resend interval.
    #[must_use]
    pub const fn sigkill_resend_interval(&self) -> Duration {
        self.sigkill_resend_interval
    }
}
77
/// Default kill configuration.
///
/// - SIGTERM grace: 5s
/// - Poll interval: 100ms
/// - SIGKILL confirm timeout: 500ms
/// - Post-SIGKILL hard cap: 5s
/// - SIGKILL resend interval: 1s
///
/// The arguments below are positional; each line is labelled with the
/// `KillConfig::new` parameter it fills.
pub const DEFAULT_KILL_CONFIG: KillConfig = KillConfig::new(
    Duration::from_secs(5),     // sigterm_grace
    Duration::from_millis(100), // poll_interval
    Duration::from_millis(500), // sigkill_confirm_timeout
    Duration::from_secs(5),     // post_sigkill_hard_cap
    Duration::from_secs(1),     // sigkill_resend_interval
);
92
93#[cfg(unix)]
94pub fn force_kill_best_effort(pid: u32, executor: &dyn ProcessExecutor) -> bool {
95    let pid_str = pid.to_string();
96    let process_group_id = format!("-{pid_str}");
97
98    // Prefer killing the whole process group so descendant processes that inherited
99    // stdout/stderr FDs don't keep pipes open after the parent is gone.
100    let group_ok = executor
101        .execute("kill", &["-KILL", "--", &process_group_id], &[], None)
102        .map(|o| o.status.success())
103        .unwrap_or(false);
104
105    if group_ok {
106        return true;
107    }
108
109    executor
110        .execute("kill", &["-KILL", &pid_str], &[], None)
111        .map(|o| o.status.success())
112        .unwrap_or(false)
113}
114
/// Best-effort forced termination of `pid` and its child tree on Windows.
///
/// Runs `taskkill /F /T /PID <pid>` through `executor`. Returns `true` only when
/// the command ran and reported success; a failure to run it at all is reported
/// as `false`, the same as an unsuccessful exit status.
#[cfg(windows)]
pub(crate) fn force_kill_best_effort(pid: u32, executor: &dyn ProcessExecutor) -> bool {
    let pid_arg = pid.to_string();
    let outcome = executor.execute("taskkill", &["/F", "/T", "/PID", &pid_arg], &[], None);

    match outcome {
        Ok(output) => output.status.success(),
        Err(_) => false,
    }
}
127
128/// Kill a process by PID using platform-specific commands via executor.
129///
130/// First attempts SIGTERM, waits for a grace period while verifying liveness,
131/// then escalates to SIGKILL if the process hasn't terminated.
132#[cfg(unix)]
133pub fn kill_process(
134    pid: u32,
135    executor: &dyn ProcessExecutor,
136    child: Option<&Arc<Mutex<Box<dyn AgentChild>>>>,
137    config: KillConfig,
138) -> KillResult {
139    let pid_str = pid.to_string();
140    let process_group_id = format!("-{pid_str}");
141
142    // Send SIGTERM to the process group first (see module docs).
143    let term_ok = executor
144        .execute("kill", &["-TERM", "--", &process_group_id], &[], None)
145        .map(|o| o.status.success())
146        .unwrap_or(false)
147        || executor
148            .execute("kill", &["-TERM", &pid_str], &[], None)
149            .map(|o| o.status.success())
150            .unwrap_or(false);
151
152    if !term_ok {
153        return KillResult::Failed;
154    }
155
156    if let Some(child_arc) = child {
157        let grace_deadline = std::time::Instant::now() + config.sigterm_grace;
158        while std::time::Instant::now() < grace_deadline {
159            let status = {
160                let mut locked_child = child_arc
161                    .lock()
162                    .expect("child process mutex poisoned - indicates panic in another thread");
163                locked_child.try_wait()
164            };
165
166            match status {
167                Ok(Some(_)) => return KillResult::TerminatedByTerm,
168                Ok(None) | Err(_) => std::thread::sleep(config.poll_interval),
169            }
170        }
171
172        let kill_ok = executor
173            .execute("kill", &["-KILL", "--", &process_group_id], &[], None)
174            .map(|o| o.status.success())
175            .unwrap_or(false)
176            || executor
177                .execute("kill", &["-KILL", &pid_str], &[], None)
178                .map(|o| o.status.success())
179                .unwrap_or(false);
180        if !kill_ok {
181            return KillResult::Failed;
182        }
183
184        let confirm_deadline = std::time::Instant::now() + config.sigkill_confirm_timeout;
185        while std::time::Instant::now() < confirm_deadline {
186            let status = {
187                let mut locked_child = child_arc
188                    .lock()
189                    .expect("child process mutex poisoned - indicates panic in another thread");
190                locked_child.try_wait()
191            };
192
193            match status {
194                Ok(Some(_)) => return KillResult::TerminatedByKill,
195                Ok(None) | Err(_) => std::thread::sleep(config.poll_interval),
196            }
197        }
198
199        return KillResult::SignalsSentAwaitingExit { escalated: true };
200    }
201
202    KillResult::TerminatedByTerm
203}
204
/// Windows kill implementation.
///
/// `taskkill /F` is already forceful; treat this as an escalated kill.
///
/// Runs `taskkill /F /T /PID <pid>` through `executor`. When a `child` handle is
/// supplied, polls it for up to `config.sigkill_confirm_timeout`. The child is
/// always checked at least once, and once more after the final sleep, so an exit
/// during the last poll interval (or with a zero-length timeout) is still
/// observed. Sleeps are clamped to the time left, so a `poll_interval` longer
/// than the timeout cannot overshoot it. `try_wait` errors are treated like
/// "still running".
///
/// Returns:
/// - `KillResult::Failed` if `taskkill` could not be run or exited unsuccessfully.
/// - `KillResult::TerminatedByKill` if the child was observed to exit, or if no
///   `child` handle was supplied (exit is then unconfirmed).
/// - `KillResult::SignalsSentAwaitingExit { escalated: true }` if exit was not
///   observed within the confirm timeout.
///
/// # Panics
///
/// Panics if the child mutex is poisoned.
#[cfg(windows)]
pub(crate) fn kill_process(
    pid: u32,
    executor: &dyn ProcessExecutor,
    child: Option<&Arc<Mutex<Box<dyn AgentChild>>>>,
    config: KillConfig,
) -> KillResult {
    let result = executor.execute(
        "taskkill",
        &["/F", "/T", "/PID", &pid.to_string()],
        &[],
        None,
    );
    let kill_ok = result.map(|o| o.status.success()).unwrap_or(false);
    if !kill_ok {
        return KillResult::Failed;
    }

    // Without a handle there is nothing to poll; report the forced kill as done.
    let child_arc = match child {
        Some(handle) => handle,
        None => return KillResult::TerminatedByKill,
    };

    let confirm_deadline = std::time::Instant::now() + config.sigkill_confirm_timeout;
    loop {
        // Hold the lock only for the `try_wait` call, never across the sleep.
        let status = {
            let mut locked_child = child_arc
                .lock()
                .expect("child process mutex poisoned - indicates panic in another thread");
            locked_child.try_wait()
        };
        if matches!(status, Ok(Some(_))) {
            return KillResult::TerminatedByKill;
        }

        let remaining = confirm_deadline.saturating_duration_since(std::time::Instant::now());
        if remaining.is_zero() {
            return KillResult::SignalsSentAwaitingExit { escalated: true };
        }
        std::thread::sleep(config.poll_interval.min(remaining));
    }
}