Skip to main content

ralph_workflow/pipeline/idle_timeout/
kill.rs

1//! Subprocess termination helpers for idle-timeout enforcement.
2
3use crate::executor::{AgentChild, ProcessExecutor};
4use std::sync::{Arc, Mutex};
5use std::time::Duration;
6
/// Outcome reported by a termination attempt.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum KillResult {
    /// The process was observed to exit after SIGTERM alone.
    TerminatedByTerm,
    /// The process was observed to exit only after SIGKILL/taskkill escalation.
    TerminatedByKill,
    /// The kill signals were delivered, but the process has not been confirmed
    /// as exited yet.
    ///
    /// The monitor should keep polling for exit. It may return `TimedOut` once
    /// a bounded enforcement window has elapsed so the pipeline can regain
    /// control, but termination must not silently stop being enforced: a
    /// background reaper should keep making best-effort SIGKILL/taskkill
    /// attempts until the exit is observed.
    SignalsSentAwaitingExit {
        /// Presumably whether the forceful SIGKILL/taskkill step was already
        /// sent; the producers visible in this file always set it to `true`.
        escalated: bool,
    },
    /// The kill attempt itself failed (the process may already have exited).
    Failed,
}
24
/// Timing parameters that govern how a subprocess is terminated.
///
/// Every value is fixed at construction time and exposed through a read-only
/// accessor. Both the constructor and the accessors are `const fn`, so a
/// `const` configuration can be built and inspected in constant contexts.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct KillConfig {
    /// How long to poll for exit after SIGTERM before escalating to SIGKILL.
    sigterm_grace: Duration,
    /// Sleep between two consecutive exit polls.
    poll_interval: Duration,
    /// How long to poll for exit after the forceful kill has been sent.
    sigkill_confirm_timeout: Duration,
    /// Not read in this file; presumably the upper bound on post-SIGKILL
    /// enforcement used by the monitor — confirm against the caller.
    post_sigkill_hard_cap: Duration,
    /// Not read in this file; presumably the cadence at which SIGKILL is
    /// re-sent while waiting for exit — confirm against the caller.
    sigkill_resend_interval: Duration,
}

impl KillConfig {
    /// Build a configuration from its five timing values.
    ///
    /// The arguments are positional and all of type [`Duration`]; take care to
    /// pass them in the declared order.
    #[must_use]
    pub const fn new(
        sigterm_grace: Duration,
        poll_interval: Duration,
        sigkill_confirm_timeout: Duration,
        post_sigkill_hard_cap: Duration,
        sigkill_resend_interval: Duration,
    ) -> Self {
        Self {
            sigterm_grace,
            poll_interval,
            sigkill_confirm_timeout,
            post_sigkill_hard_cap,
            sigkill_resend_interval,
        }
    }

    /// Grace period granted after SIGTERM before escalation.
    #[must_use]
    pub const fn sigterm_grace(&self) -> Duration {
        self.sigterm_grace
    }

    /// Interval between exit polls.
    #[must_use]
    pub const fn poll_interval(&self) -> Duration {
        self.poll_interval
    }

    /// Window in which exit is confirmed after the forceful kill.
    #[must_use]
    pub const fn sigkill_confirm_timeout(&self) -> Duration {
        self.sigkill_confirm_timeout
    }

    /// Configured post-SIGKILL hard cap.
    #[must_use]
    pub const fn post_sigkill_hard_cap(&self) -> Duration {
        self.post_sigkill_hard_cap
    }

    /// Configured SIGKILL resend interval.
    #[must_use]
    pub const fn sigkill_resend_interval(&self) -> Duration {
        self.sigkill_resend_interval
    }
}
71
72/// Default kill configuration.
73///
74/// - SIGTERM grace: 5s
75/// - Poll interval: 100ms
76/// - SIGKILL confirm timeout: 500ms
77/// - Post-SIGKILL hard cap: 5s
78/// - SIGKILL resend interval: 1s
79pub const DEFAULT_KILL_CONFIG: KillConfig = KillConfig::new(
80    Duration::from_secs(5),
81    Duration::from_millis(100),
82    Duration::from_millis(500),
83    Duration::from_secs(5),
84    Duration::from_secs(1),
85);
86
87#[cfg(unix)]
88pub(crate) fn force_kill_best_effort(pid: u32, executor: &dyn ProcessExecutor) -> bool {
89    let pid_str = pid.to_string();
90    let pgid_str = format!("-{pid_str}");
91
92    // Prefer killing the whole process group so descendant processes that inherited
93    // stdout/stderr FDs don't keep pipes open after the parent is gone.
94    let group_ok = executor
95        .execute("kill", &["-KILL", "--", &pgid_str], &[], None)
96        .map(|o| o.status.success())
97        .unwrap_or(false);
98
99    if group_ok {
100        return true;
101    }
102
103    executor
104        .execute("kill", &["-KILL", &pid_str], &[], None)
105        .map(|o| o.status.success())
106        .unwrap_or(false)
107}
108
/// Issue a single `taskkill /F /T /PID <pid>` without confirming exit.
///
/// Returns `true` when the command ran and reported success, `false` when it
/// reported failure or could not be executed at all.
#[cfg(windows)]
pub(crate) fn force_kill_best_effort(pid: u32, executor: &dyn ProcessExecutor) -> bool {
    let pid_arg = pid.to_string();
    let taskkill_args = ["/F", "/T", "/PID", pid_arg.as_str()];

    let outcome = executor.execute("taskkill", &taskkill_args, &[], None);
    outcome
        .map(|output| output.status.success())
        .unwrap_or(false)
}
121
122/// Kill a process by PID using platform-specific commands via executor.
123///
124/// First attempts SIGTERM, waits for a grace period while verifying liveness,
125/// then escalates to SIGKILL if the process hasn't terminated.
126#[cfg(unix)]
127pub(crate) fn kill_process(
128    pid: u32,
129    executor: &dyn ProcessExecutor,
130    child: Option<&Arc<Mutex<Box<dyn AgentChild>>>>,
131    config: KillConfig,
132) -> KillResult {
133    let pid_str = pid.to_string();
134    let pgid_str = format!("-{pid_str}");
135
136    // Send SIGTERM to the process group first (see module docs).
137    let term_ok = executor
138        .execute("kill", &["-TERM", "--", &pgid_str], &[], None)
139        .map(|o| o.status.success())
140        .unwrap_or(false)
141        || executor
142            .execute("kill", &["-TERM", &pid_str], &[], None)
143            .map(|o| o.status.success())
144            .unwrap_or(false);
145
146    if !term_ok {
147        return KillResult::Failed;
148    }
149
150    if let Some(child_arc) = child {
151        let grace_deadline = std::time::Instant::now() + config.sigterm_grace;
152        while std::time::Instant::now() < grace_deadline {
153            let status = {
154                let mut locked_child = child_arc.lock().unwrap();
155                locked_child.try_wait()
156            };
157
158            match status {
159                Ok(Some(_)) => return KillResult::TerminatedByTerm,
160                Ok(None) => std::thread::sleep(config.poll_interval),
161                Err(_) => std::thread::sleep(config.poll_interval),
162            }
163        }
164
165        let kill_ok = executor
166            .execute("kill", &["-KILL", "--", &pgid_str], &[], None)
167            .map(|o| o.status.success())
168            .unwrap_or(false)
169            || executor
170                .execute("kill", &["-KILL", &pid_str], &[], None)
171                .map(|o| o.status.success())
172                .unwrap_or(false);
173        if !kill_ok {
174            return KillResult::Failed;
175        }
176
177        let confirm_deadline = std::time::Instant::now() + config.sigkill_confirm_timeout;
178        while std::time::Instant::now() < confirm_deadline {
179            let status = {
180                let mut locked_child = child_arc.lock().unwrap();
181                locked_child.try_wait()
182            };
183
184            match status {
185                Ok(Some(_)) => return KillResult::TerminatedByKill,
186                Ok(None) => std::thread::sleep(config.poll_interval),
187                Err(_) => std::thread::sleep(config.poll_interval),
188            }
189        }
190
191        return KillResult::SignalsSentAwaitingExit { escalated: true };
192    }
193
194    KillResult::TerminatedByTerm
195}
196
/// Windows kill implementation.
///
/// Runs `taskkill /F /T /PID <pid>` through `executor`. `taskkill /F` is
/// already forceful, so a confirmed exit is reported as an escalated kill.
///
/// Results:
/// - [`KillResult::Failed`] when `taskkill` could not be run or reported
///   failure.
/// - [`KillResult::TerminatedByKill`] when no `child` handle is supplied (exit
///   is not verified), or when `try_wait` observes the exit within
///   `sigkill_confirm_timeout`.
/// - [`KillResult::SignalsSentAwaitingExit`] with `escalated: true` when the
///   confirmation window elapses without an observed exit.
///
/// A poisoned `child` mutex is recovered instead of panicking, so a panic in
/// an unrelated lock holder cannot abort the confirmation loop. Errors from
/// `try_wait` are treated as "not exited yet".
#[cfg(windows)]
pub(crate) fn kill_process(
    pid: u32,
    executor: &dyn ProcessExecutor,
    child: Option<&Arc<Mutex<Box<dyn AgentChild>>>>,
    config: KillConfig,
) -> KillResult {
    let pid_arg = pid.to_string();
    let kill_ok = executor
        .execute("taskkill", &["/F", "/T", "/PID", &pid_arg], &[], None)
        .map(|o| o.status.success())
        .unwrap_or(false);
    if !kill_ok {
        return KillResult::Failed;
    }

    let child_handle = match child {
        Some(handle) => handle,
        // No handle to poll: report the taskkill success as the outcome.
        None => return KillResult::TerminatedByKill,
    };

    let confirm_deadline = std::time::Instant::now() + config.sigkill_confirm_timeout;
    while std::time::Instant::now() < confirm_deadline {
        let status = {
            // Recover a poisoned lock; the guard is dropped before sleeping.
            let mut locked_child = child_handle
                .lock()
                .unwrap_or_else(std::sync::PoisonError::into_inner);
            locked_child.try_wait()
        };

        match status {
            Ok(Some(_)) => return KillResult::TerminatedByKill,
            Ok(None) | Err(_) => std::thread::sleep(config.poll_interval),
        }
    }

    KillResult::SignalsSentAwaitingExit { escalated: true }
}