Skip to main content

agent_exec/
kill.rs

1//! Implementation of the `kill` sub-command.
2//!
3//! Signals supported: TERM, INT, KILL (case-insensitive).
4//!
5//! Signal mapping on Windows:
6//!   TERM → TerminateJobObject (graceful intent; Windows has no SIGTERM, so
7//!           tree termination is the closest equivalent)
8//!   INT  → TerminateJobObject (same; Windows has no SIGINT for arbitrary PIDs)
9//!   KILL → TerminateJobObject (forced; semantically the same on Windows)
10//!   *    → TerminateJobObject (unknown signals treated as KILL per design.md)
11//!
12//! On Windows the supervisor records a `windows_job_name` in `state.json`.
13//! When present, `kill` opens that named Job Object directly and terminates
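//! it, which stops the entire process tree.  If absent (e.g. the supervisor
//! could not assign the process to a job), `kill` first tries to place the
//! process in a temporary Job Object and terminate that; only if the
//! assignment fails does it fall back to snapshot-based tree enumeration.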
17
18use anyhow::Result;
19use tracing::info;
20
21use crate::jobstore::{InvalidJobState, JobDir, resolve_root};
22use crate::schema::{JobState, JobStateJob, JobStateResult, JobStatus, KillData, Response};
23
/// Caller-supplied arguments for the `kill` sub-command.
#[derive(Debug)]
pub struct KillOpts<'a> {
    /// Identifier of the job to signal; used to open the job directory.
    pub job_id: &'a str,
    /// Optional job-store root override, forwarded to `resolve_root`.
    pub root: Option<&'a str>,
    /// Signal name: TERM | INT | KILL (default: TERM).
    pub signal: &'a str,
}

impl<'a> Default for KillOpts<'a> {
    /// Empty job id, no root override, and the `TERM` signal.
    fn default() -> Self {
        Self {
            job_id: "",
            root: None,
            signal: "TERM",
        }
    }
}
42
43/// Execute `kill`: send signal and emit JSON.
44pub fn execute(opts: KillOpts) -> Result<()> {
45    let root = resolve_root(opts.root);
46    let job_dir = JobDir::open(&root, opts.job_id)?;
47
48    let state = job_dir.read_state()?;
49    let signal_upper = opts.signal.to_uppercase();
50
51    // Reject kill on created jobs: there is no process to signal.
52    if *state.status() == JobStatus::Created {
53        return Err(anyhow::Error::new(InvalidJobState(format!(
54            "job {} is in 'created' state and has not been started; cannot send signal",
55            opts.job_id
56        ))));
57    }
58
59    if *state.status() != JobStatus::Running {
60        // Already stopped — no-op but still emit JSON.
61        let response = Response::new(
62            "kill",
63            KillData {
64                job_id: job_dir.job_id.clone(),
65                signal: signal_upper,
66            },
67        );
68        response.print();
69        return Ok(());
70    }
71
72    if let Some(pid) = state.pid {
73        // On Windows, pass the job name from state.json so kill can use the
74        // named Job Object created by the supervisor for reliable tree termination.
75        #[cfg(windows)]
76        send_signal(pid, &signal_upper, state.windows_job_name.as_deref())?;
77        #[cfg(not(windows))]
78        send_signal(pid, &signal_upper)?;
79
80        info!(job_id = %job_dir.job_id, pid, signal = %signal_upper, "signal sent");
81
82        // Mark state as killed.
83        let now = crate::run::now_rfc3339_pub();
84        let new_state = JobState {
85            job: JobStateJob {
86                id: job_dir.job_id.clone(),
87                status: JobStatus::Killed,
88                started_at: state.started_at().map(|s| s.to_string()),
89            },
90            result: JobStateResult {
91                exit_code: None,
92                signal: Some(signal_upper.clone()),
93                duration_ms: None,
94            },
95            pid: Some(pid),
96            finished_at: Some(now.clone()),
97            updated_at: now,
98            windows_job_name: None,
99        };
100        job_dir.write_state(&new_state)?;
101    }
102
103    let response = Response::new(
104        "kill",
105        KillData {
106            job_id: job_dir.job_id.clone(),
107            signal: signal_upper,
108        },
109    );
110    response.print();
111    Ok(())
112}
113
114#[cfg(unix)]
115fn send_signal(pid: u32, signal: &str) -> Result<()> {
116    let signum: libc::c_int = match signal {
117        "TERM" => libc::SIGTERM,
118        "INT" => libc::SIGINT,
119        "KILL" => libc::SIGKILL,
120        _ => libc::SIGKILL, // Unknown → KILL (per design.md)
121    };
122    // Send signal to the process group (negative PID) so the shell wrapper
123    // and all its descendants receive it.  Fall back to single-process kill
124    // if the process-group kill fails (e.g. process is not a group leader).
125    // SAFETY: kill(2) is safe to call with any pid and valid signal number.
126    let pgid = -(pid as libc::pid_t);
127    let ret = unsafe { libc::kill(pgid, signum) };
128    if ret != 0 {
129        let err = std::io::Error::last_os_error();
130        if err.raw_os_error() == Some(libc::ESRCH) {
131            // No such process group — try single-process kill as fallback.
132            let ret2 = unsafe { libc::kill(pid as libc::pid_t, signum) };
133            if ret2 != 0 {
134                let err2 = std::io::Error::last_os_error();
135                if err2.raw_os_error() != Some(libc::ESRCH) {
136                    return Err(err2.into());
137                }
138            }
139        } else {
140            return Err(err.into());
141        }
142    }
143    Ok(())
144}
145
/// Windows signal dispatch.
///
/// Signal mapping (per design.md):
/// - TERM/INT/KILL all map to Job Object termination (process tree termination).
/// - Unknown signals are treated as KILL (same as design.md specifies).
///
/// The `signal` name therefore only influences the debug log; the action
/// taken is the same for every value.
///
/// Strategy:
/// 1. If `job_name` is Some, open the named Job Object and call
///    TerminateJobObject on it.
/// 2. Otherwise delegate to `send_signal_no_job`, starting at `pid`.
///
/// # Errors
///
/// Returns an error if the named Job Object cannot be opened or terminated,
/// or if the no-job fallback fails.
#[cfg(windows)]
fn send_signal(pid: u32, signal: &str, job_name: Option<&str>) -> Result<()> {
    use tracing::debug;
    use windows::Win32::Foundation::CloseHandle;

    // Log the signal mapping for observability.
    match signal {
        "TERM" | "INT" | "KILL" => {
            debug!(
                signal,
                "signal mapped to TerminateJobObject (process-tree kill)"
            );
        }
        other => {
            debug!(
                signal = other,
                "unknown signal mapped to KILL (process-tree kill)"
            );
        }
    }

    // Path 2: no named Job Object — try ad-hoc assignment then terminate.
    let name = match job_name {
        Some(name) => name,
        None => return send_signal_no_job(pid),
    };

    // Path 1: named Job Object created by the supervisor is available.
    use windows::Win32::System::JobObjects::{
        JOB_OBJECT_ALL_ACCESS, OpenJobObjectW, TerminateJobObject,
    };
    use windows::core::HSTRING;

    let hname = HSTRING::from(name);
    // SAFETY: `hname` outlives the OpenJobObjectW call, and `job` is only
    // used while open and is closed exactly once before leaving the block.
    unsafe {
        let job = OpenJobObjectW(JOB_OBJECT_ALL_ACCESS, false, &hname)
            .map_err(|e| anyhow::anyhow!("OpenJobObjectW({name}) failed: {e}"))?;
        let result = TerminateJobObject(job, 1)
            .map_err(|e| anyhow::anyhow!("TerminateJobObject({name}) failed: {e}"));
        // Close the handle whether or not termination succeeded.
        let _ = CloseHandle(job);
        result
    }
}
195
/// Fallback Windows kill path when no named Job Object is available.
///
/// Opens the process, creates a temporary anonymous Job Object, assigns the
/// process to it and terminates the job (the target process and any children
/// it has already spawned).  If assignment fails (e.g. the process is already
/// a member of another job object), falls back to snapshot-based recursive
/// tree termination via `terminate_process_tree`.
///
/// Both handles are closed on every exit path, including the early returns.
///
/// # Errors
///
/// Returns an error if the process cannot be opened, the Job Object cannot
/// be created, `TerminateJobObject` fails, or the snapshot fallback fails.
/// Per spec.md:55-63, success must not be reported unless tree termination
/// was actually carried out.
#[cfg(windows)]
fn send_signal_no_job(pid: u32) -> Result<()> {
    use windows::Win32::Foundation::{CloseHandle, HANDLE};
    use windows::Win32::System::JobObjects::{
        AssignProcessToJobObject, CreateJobObjectW, TerminateJobObject,
    };
    use windows::Win32::System::Threading::{OpenProcess, PROCESS_SET_QUOTA, PROCESS_TERMINATE};

    // `Some(result)` when the job-object route was taken, `None` when the
    // process could not be assigned and the snapshot fallback is required.
    //
    // SAFETY: every handle used below was just returned by a successful
    // OpenProcess / CreateJobObjectW call and is closed exactly once.
    let job_route: Option<Result<()>> = unsafe {
        // Open the target process.
        let proc_handle: HANDLE = OpenProcess(PROCESS_TERMINATE | PROCESS_SET_QUOTA, false, pid)?;

        // Create the anonymous Job Object; release the process handle if
        // that fails so the early return does not leak it.
        let job: HANDLE = match CreateJobObjectW(None, None) {
            Ok(job) => job,
            Err(e) => {
                let _ = CloseHandle(proc_handle);
                return Err(e.into());
            }
        };

        // Assignment may fail when the process already belongs to another
        // job object (common when the supervisor itself runs inside a job,
        // e.g. CI environments).
        let route = if AssignProcessToJobObject(job, proc_handle).is_ok() {
            // Terminate all processes in the job (process tree).  Failure
            // must be surfaced because the caller cannot verify tree
            // termination otherwise.
            Some(
                TerminateJobObject(job, 1)
                    .map_err(|e| anyhow::anyhow!("TerminateJobObject failed: {}", e)),
            )
        } else {
            None
        };

        let _ = CloseHandle(proc_handle);
        let _ = CloseHandle(job);
        route
    };

    match job_route {
        Some(result) => result,
        // Fall back to recursive tree termination via snapshot enumeration
        // so that child processes are also killed; its error is propagated.
        None => terminate_process_tree(pid),
    }
}
247
/// Recursively terminate a process and all its descendants using
/// CreateToolhelp32Snapshot. This is the fallback path when Job Object
/// assignment fails (e.g., nested job objects on older Windows or CI).
///
/// The subtree is discovered from the parent pids recorded in one process
/// snapshot, then terminated in reverse discovery order so that descendants
/// are killed before the root.
///
/// NOTE(review): parent/child links come from a point-in-time snapshot;
/// processes spawned after it is taken are not seen — confirm this is
/// acceptable for the spec.md:55-63 requirement.
///
/// # Errors
///
/// Returns `Ok(())` only when every process in the collected subtree was
/// terminated or had already exited. Returns an error if the snapshot
/// cannot be created or enumerated, if a process cannot be opened for a
/// reason other than "no longer exists", or if `TerminateProcess` fails,
/// because in those cases tree-wide termination cannot be guaranteed
/// (spec.md:55-63 MUST).
#[cfg(windows)]
fn terminate_process_tree(root_pid: u32) -> Result<()> {
    use std::collections::{HashMap, HashSet};
    use windows::Win32::Foundation::{CloseHandle, ERROR_INVALID_PARAMETER};
    use windows::Win32::System::Diagnostics::ToolHelp::{
        CreateToolhelp32Snapshot, PROCESSENTRY32, Process32First, Process32Next, TH32CS_SNAPPROCESS,
    };
    use windows::Win32::System::Threading::{OpenProcess, PROCESS_TERMINATE, TerminateProcess};

    // parent pid → child pids, in snapshot order.
    let mut children_of: HashMap<u32, Vec<u32>> = HashMap::new();

    // SAFETY: `snapshot` is a valid handle for the whole block and is closed
    // exactly once; `entry.dwSize` is initialised before every call, as the
    // ToolHelp API requires.
    unsafe {
        // If we cannot take a snapshot we cannot enumerate child processes,
        // so we must return an error rather than silently skip them.
        let snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0)
            .map_err(|e| anyhow::anyhow!("CreateToolhelp32Snapshot failed: {}", e))?;

        let mut entry = PROCESSENTRY32 {
            dwSize: std::mem::size_of::<PROCESSENTRY32>() as u32,
            ..Default::default()
        };

        // A failed first read means the process list is unknown; killing
        // only the root would silently leave descendants running.
        if let Err(e) = Process32First(snapshot, &mut entry) {
            let _ = CloseHandle(snapshot);
            return Err(anyhow::anyhow!("Process32First failed: {}", e));
        }

        loop {
            children_of
                .entry(entry.th32ParentProcessID)
                .or_default()
                .push(entry.th32ProcessID);
            entry = PROCESSENTRY32 {
                dwSize: std::mem::size_of::<PROCESSENTRY32>() as u32,
                ..Default::default()
            };
            // Any failure here is treated as the end of the list.
            if Process32Next(snapshot, &mut entry).is_err() {
                break;
            }
        }
        let _ = CloseHandle(snapshot);
    }

    // Collect all pids in the subtree rooted at root_pid (BFS).  `seen`
    // guards against duplicates and parent/child cycles in the snapshot.
    let mut to_kill: Vec<u32> = vec![root_pid];
    let mut seen: HashSet<u32> = HashSet::new();
    seen.insert(root_pid);
    let mut next = 0;
    while next < to_kill.len() {
        let parent = to_kill[next];
        next += 1;
        if let Some(children) = children_of.get(&parent) {
            for &child_pid in children {
                if seen.insert(child_pid) {
                    to_kill.push(child_pid);
                }
            }
        }
    }

    // Terminate all collected processes (children first, then root).
    // Per spec.md:55-63, tree-wide termination is a MUST.  Every process
    // in the subtree must be confirmed terminated; failure to terminate
    // any process (root or child) returns an error unless the process no
    // longer exists (already terminated, which is a success condition).
    for &target_pid in to_kill.iter().rev() {
        // SAFETY: the handle comes from a successful OpenProcess call and is
        // closed exactly once, right after the TerminateProcess call.
        let opened = unsafe { OpenProcess(PROCESS_TERMINATE, false, target_pid) };
        match opened {
            Ok(h) => {
                // SAFETY: see above — `h` is valid until CloseHandle.
                let result = unsafe {
                    let result = TerminateProcess(h, 1);
                    let _ = CloseHandle(h);
                    result
                };
                result.map_err(|e| {
                    anyhow::anyhow!("TerminateProcess for pid {} failed: {}", target_pid, e)
                })?;
            }
            Err(e) => {
                // ERROR_INVALID_PARAMETER (87) means the process no longer
                // exists — it has already exited, which is a success
                // condition (the process is gone).  Any other error means
                // we could not open the process handle and therefore cannot
                // confirm or perform termination, which violates the MUST.
                if e.code() != ERROR_INVALID_PARAMETER.to_hresult() {
                    return Err(anyhow::anyhow!(
                        "OpenProcess for pid {} failed (process may still be running): {}",
                        target_pid,
                        e
                    ));
                }
                // Process already gone — treat as success.
            }
        }
    }
    Ok(())
}
340
/// Stub for targets that are neither Unix nor Windows: always returns an
/// error, since no signal delivery is implemented there.
#[cfg(not(any(unix, windows)))]
fn send_signal(_pid: u32, _signal: &str) -> Result<()> {
    Err(anyhow::anyhow!("kill not supported on this platform"))
}