agent_exec/kill.rs
1//! Implementation of the `kill` sub-command.
2//!
3//! Signals supported: TERM, INT, KILL (case-insensitive).
4//!
5//! Signal mapping on Windows:
6//! TERM → TerminateJobObject (graceful intent; Windows has no SIGTERM, so
7//! tree termination is the closest equivalent)
8//! INT → TerminateJobObject (same; Windows has no SIGINT for arbitrary PIDs)
9//! KILL → TerminateJobObject (forced; semantically the same on Windows)
10//! * → TerminateJobObject (unknown signals treated as KILL per design.md)
11//!
12//! On Windows the supervisor records a `windows_job_name` in `state.json`.
13//! When present, `kill` opens that named Job Object directly and terminates
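//! it, which stops the entire process tree. If absent (e.g. the supervisor
//! could not assign the process to a job), `kill` first tries to place the
//! process in a temporary Job Object and terminate that; only if the
//! assignment fails does it fall back to snapshot-based tree enumeration.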
17
18use anyhow::Result;
19use tracing::info;
20
21use crate::jobstore::{InvalidJobState, JobDir, resolve_root};
22use crate::schema::{JobState, JobStateJob, JobStateResult, JobStatus, KillData, Response};
23
/// Caller-supplied arguments for the `kill` sub-command.
#[derive(Debug)]
pub struct KillOpts<'a> {
    /// Identifier of the job to signal; handed to `JobDir::open`.
    pub job_id: &'a str,
    /// Optional job-store root override; handed to `resolve_root`.
    pub root: Option<&'a str>,
    /// Signal name: TERM | INT | KILL (default: TERM).
    pub signal: &'a str,
}

impl Default for KillOpts<'_> {
    /// Empty job id, no root override, and the `TERM` signal.
    fn default() -> Self {
        Self {
            job_id: "",
            root: None,
            signal: "TERM",
        }
    }
}
42
43/// Execute `kill`: send signal and emit JSON.
44pub fn execute(opts: KillOpts) -> Result<()> {
45 let root = resolve_root(opts.root);
46 let job_dir = JobDir::open(&root, opts.job_id)?;
47
48 let state = job_dir.read_state()?;
49 let signal_upper = opts.signal.to_uppercase();
50
51 // Reject kill on created jobs: there is no process to signal.
52 if *state.status() == JobStatus::Created {
53 return Err(anyhow::Error::new(InvalidJobState(format!(
54 "job {} is in 'created' state and has not been started; cannot send signal",
55 opts.job_id
56 ))));
57 }
58
59 if *state.status() != JobStatus::Running {
60 // Already stopped — no-op but still emit JSON.
61 let response = Response::new(
62 "kill",
63 KillData {
64 job_id: job_dir.job_id.clone(),
65 signal: signal_upper,
66 },
67 );
68 response.print();
69 return Ok(());
70 }
71
72 if let Some(pid) = state.pid {
73 // On Windows, pass the job name from state.json so kill can use the
74 // named Job Object created by the supervisor for reliable tree termination.
75 #[cfg(windows)]
76 send_signal(pid, &signal_upper, state.windows_job_name.as_deref())?;
77 #[cfg(not(windows))]
78 send_signal(pid, &signal_upper)?;
79
80 info!(job_id = %job_dir.job_id, pid, signal = %signal_upper, "signal sent");
81
82 // Mark state as killed.
83 let now = crate::run::now_rfc3339_pub();
84 let new_state = JobState {
85 job: JobStateJob {
86 id: job_dir.job_id.clone(),
87 status: JobStatus::Killed,
88 started_at: state.started_at().map(|s| s.to_string()),
89 },
90 result: JobStateResult {
91 exit_code: None,
92 signal: Some(signal_upper.clone()),
93 duration_ms: None,
94 },
95 pid: Some(pid),
96 finished_at: Some(now.clone()),
97 updated_at: now,
98 windows_job_name: None,
99 };
100 job_dir.write_state(&new_state)?;
101 }
102
103 let response = Response::new(
104 "kill",
105 KillData {
106 job_id: job_dir.job_id.clone(),
107 signal: signal_upper,
108 },
109 );
110 response.print();
111 Ok(())
112}
113
114#[cfg(unix)]
115fn send_signal(pid: u32, signal: &str) -> Result<()> {
116 let signum: libc::c_int = match signal {
117 "TERM" => libc::SIGTERM,
118 "INT" => libc::SIGINT,
119 "KILL" => libc::SIGKILL,
120 _ => libc::SIGKILL, // Unknown → KILL (per design.md)
121 };
122 // Send signal to the process group (negative PID) so the shell wrapper
123 // and all its descendants receive it. Fall back to single-process kill
124 // if the process-group kill fails (e.g. process is not a group leader).
125 // SAFETY: kill(2) is safe to call with any pid and valid signal number.
126 let pgid = -(pid as libc::pid_t);
127 let ret = unsafe { libc::kill(pgid, signum) };
128 if ret != 0 {
129 let err = std::io::Error::last_os_error();
130 if err.raw_os_error() == Some(libc::ESRCH) {
131 // No such process group — try single-process kill as fallback.
132 let ret2 = unsafe { libc::kill(pid as libc::pid_t, signum) };
133 if ret2 != 0 {
134 let err2 = std::io::Error::last_os_error();
135 if err2.raw_os_error() != Some(libc::ESRCH) {
136 return Err(err2.into());
137 }
138 }
139 } else {
140 return Err(err.into());
141 }
142 }
143 Ok(())
144}
145
/// Windows signal dispatch.
///
/// Signal mapping (per design.md):
/// - TERM, INT and KILL all end in process-tree termination.
/// - Any other signal name is treated as KILL; a debug event records it.
///
/// Strategy:
/// 1. With a `job_name`, open that named Job Object and terminate it with
///    exit code 1. A failure to open or terminate is returned as an error.
/// 2. Without a `job_name`, delegate to `send_signal_no_job`, which tries a
///    temporary Job Object before falling back to snapshot enumeration.
#[cfg(windows)]
fn send_signal(pid: u32, signal: &str, job_name: Option<&str>) -> Result<()> {
    use tracing::debug;
    use windows::Win32::Foundation::CloseHandle;
    use windows::Win32::System::JobObjects::{
        JOB_OBJECT_ALL_ACCESS, OpenJobObjectW, TerminateJobObject,
    };
    use windows::core::HSTRING;

    // Every signal takes the same termination path; only an unrecognised
    // name is worth a trace line.
    if !matches!(signal, "TERM" | "INT" | "KILL") {
        debug!(
            signal = signal,
            "unknown signal mapped to KILL (process-tree kill)"
        );
    }

    // Path 2: no named Job Object recorded by the supervisor.
    let Some(name) = job_name else {
        return send_signal_no_job(pid);
    };

    // Path 1: terminate the supervisor's named Job Object.
    let wide_name = HSTRING::from(name);
    // SAFETY: `wide_name` outlives the call that borrows it, and the job
    // handle is used only between a successful open and its single close.
    unsafe {
        let job = OpenJobObjectW(JOB_OBJECT_ALL_ACCESS, false, &wide_name)
            .map_err(|e| anyhow::anyhow!("OpenJobObjectW({name}) failed: {e}"))?;
        let outcome = TerminateJobObject(job, 1)
            .map_err(|e| anyhow::anyhow!("TerminateJobObject({name}) failed: {e}"));
        // Close the handle whether or not termination succeeded.
        let _ = CloseHandle(job);
        outcome
    }
}
195
/// Fallback Windows kill path when no named Job Object is available.
///
/// Opens the target process, creates an anonymous Job Object, assigns the
/// process to it and terminates the job with exit code 1. If the assignment
/// fails (e.g. the process already belongs to another job), both handles are
/// closed and the work is handed to `terminate_process_tree`, whose result is
/// returned.
///
/// Both handles are closed on every path after they have been obtained,
/// including the case where creating the Job Object fails.
///
/// # Errors
/// Returns an error if the process cannot be opened, the Job Object cannot be
/// created, `TerminateJobObject` fails, or the snapshot fallback fails.
#[cfg(windows)]
fn send_signal_no_job(pid: u32) -> Result<()> {
    use windows::Win32::Foundation::{CloseHandle, HANDLE};
    use windows::Win32::System::JobObjects::{
        AssignProcessToJobObject, CreateJobObjectW, TerminateJobObject,
    };
    use windows::Win32::System::Threading::{OpenProcess, PROCESS_SET_QUOTA, PROCESS_TERMINATE};

    // SAFETY: each handle is used only between a successful open/create and
    // its single CloseHandle call below.
    unsafe {
        // Open the target process.
        let proc_handle: HANDLE = OpenProcess(PROCESS_TERMINATE | PROCESS_SET_QUOTA, false, pid)?;

        // Create an anonymous Job Object. On failure the process handle must
        // still be released before the error is returned.
        let job: HANDLE = match CreateJobObjectW(None, None) {
            Ok(job) => job,
            Err(e) => {
                let _ = CloseHandle(proc_handle);
                return Err(e.into());
            }
        };

        // Assignment may fail when the process is already a member of another
        // job object (common when the supervisor itself runs inside a job,
        // e.g. CI environments).
        let assigned = AssignProcessToJobObject(job, proc_handle).is_ok();

        // Per spec.md:55-63, a TerminateJobObject failure must be surfaced
        // because the caller cannot verify tree termination otherwise.
        let terminated = if assigned {
            TerminateJobObject(job, 1)
                .map_err(|e| anyhow::anyhow!("TerminateJobObject failed: {}", e))
        } else {
            Ok(())
        };

        // Single cleanup point for both handles.
        let _ = CloseHandle(proc_handle);
        let _ = CloseHandle(job);

        if !assigned {
            // Fall back to snapshot-based tree termination so that child
            // processes are also killed; its error (if any) is propagated.
            return terminate_process_tree(pid);
        }
        terminated
    }
}
247
/// Terminate a process and all its descendants using
/// CreateToolhelp32Snapshot. This is the fallback path when Job Object
/// assignment fails (e.g., nested job objects on older Windows or CI).
///
/// Steps:
/// 1. Snapshot every running process as `(pid, parent_pid)`.
/// 2. Walk the snapshot breadth-first from `root_pid` to collect the subtree.
/// 3. Terminate the collected processes in reverse discovery order (children
///    before the root), each with exit code 1.
///
/// Every process in the subtree is attempted even if an earlier one fails, so
/// a single stubborn process does not leave the rest of the tree running.
/// A process that can no longer be opened because it has already exited
/// (`ERROR_INVALID_PARAMETER`) counts as success.
///
/// If the first snapshot entry cannot be read, no descendants are discovered
/// and only `root_pid` itself is targeted.
///
/// # Errors
/// Returns an error if the snapshot cannot be created, or if any process in
/// the subtree could not be opened or terminated (spec.md:55-63 MUST). When
/// several processes fail, their messages are joined with `"; "`.
#[cfg(windows)]
fn terminate_process_tree(root_pid: u32) -> Result<()> {
    use std::collections::HashSet;
    use windows::Win32::Foundation::{CloseHandle, ERROR_INVALID_PARAMETER};
    use windows::Win32::System::Diagnostics::ToolHelp::{
        CreateToolhelp32Snapshot, PROCESSENTRY32, Process32First, Process32Next, TH32CS_SNAPPROCESS,
    };
    use windows::Win32::System::Threading::{OpenProcess, PROCESS_TERMINATE, TerminateProcess};

    // Build a list of (pid, parent_pid) for all running processes.
    let mut entries: Vec<(u32, u32)> = Vec::new();
    // SAFETY: `entry` is a properly sized PROCESSENTRY32 with `dwSize` set,
    // and the snapshot handle is used only until its single CloseHandle.
    unsafe {
        // Without a snapshot the children cannot be enumerated, so this is an
        // error rather than a silent skip.
        let snapshot = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0)
            .map_err(|e| anyhow::anyhow!("CreateToolhelp32Snapshot failed: {}", e))?;

        let mut entry = PROCESSENTRY32 {
            dwSize: std::mem::size_of::<PROCESSENTRY32>() as u32,
            ..Default::default()
        };
        let mut has_entry = Process32First(snapshot, &mut entry).is_ok();
        while has_entry {
            entries.push((entry.th32ProcessID, entry.th32ParentProcessID));
            has_entry = Process32Next(snapshot, &mut entry).is_ok();
        }
        let _ = CloseHandle(snapshot);
    }

    // Collect all pids in the subtree rooted at root_pid (BFS). `seen` keeps
    // the membership test O(1) while `to_kill` preserves discovery order.
    let mut to_kill: Vec<u32> = vec![root_pid];
    let mut seen: HashSet<u32> = HashSet::from([root_pid]);
    let mut next = 0;
    while next < to_kill.len() {
        let parent = to_kill[next];
        for &(child_pid, parent_pid) in &entries {
            if parent_pid == parent && seen.insert(child_pid) {
                to_kill.push(child_pid);
            }
        }
        next += 1;
    }

    // Terminate all collected processes (children first, then root). Failures
    // are recorded and the loop continues, so every process gets an attempt.
    let mut failures: Vec<String> = Vec::new();
    // SAFETY: each process handle is used only between a successful
    // OpenProcess and its single CloseHandle.
    unsafe {
        for &target_pid in to_kill.iter().rev() {
            match OpenProcess(PROCESS_TERMINATE, false, target_pid) {
                Ok(h) => {
                    let result = TerminateProcess(h, 1);
                    let _ = CloseHandle(h);
                    if let Err(e) = result {
                        failures.push(format!(
                            "TerminateProcess for pid {} failed: {}",
                            target_pid, e
                        ));
                    }
                }
                Err(e) => {
                    // ERROR_INVALID_PARAMETER (87) means the process no longer
                    // exists, which is a success condition. Any other error
                    // means termination could not be performed or confirmed.
                    if e.code() != ERROR_INVALID_PARAMETER.to_hresult() {
                        failures.push(format!(
                            "OpenProcess for pid {} failed (process may still be running): {}",
                            target_pid, e
                        ));
                    }
                }
            }
        }
    }

    if failures.is_empty() {
        Ok(())
    } else {
        Err(anyhow::anyhow!("{}", failures.join("; ")))
    }
}
340
/// Stub for platforms that are neither Unix nor Windows: always fails,
/// because no signalling mechanism is implemented for them.
#[cfg(not(any(unix, windows)))]
fn send_signal(_pid: u32, _signal: &str) -> Result<()> {
    Err(anyhow::anyhow!("kill not supported on this platform"))
}