Skip to main content

runtimo_core/capabilities/
kill.rs

1//! Kill capability — terminate runaway processes by PID with full audit trail.
2//!
3//! Kills a process by PID with full telemetry capture and WAL logging.
4//! Includes safety checks to prevent killing critical system processes.
5//!
6//! # PID Reuse Protection (FINDING #1)
7//!
8//! After sending a signal, the capability verifies the killed process is the
9//! same one by comparing start times from `/proc/{pid}/stat` field 22. This
10//! prevents PID reuse races where a new process inherits the killed PID.
11//!
12//! # Protected Processes
13//!
14//! The following PIDs are protected and cannot be killed:
15//! - `1` (init), `2` (kthreadd)
16//! - Current process, parent process, session leader, process group leader
17//! - All systemd-managed services (detected via cgroup)
18//!
19//! # Example
20//!
21//! ```rust,ignore
22//! use runtimo_core::capabilities::Kill;
23//! use runtimo_core::capability::{Capability, Context};
24//! use serde_json::json;
25//!
26//! let cap = Kill;
27//! let result = cap.execute(
28//!     &json!({"pid": 12345}),
29//!     &Context { dry_run: false, job_id: "test".into(), ..Default::default() }
30//! ).unwrap();
31//!
32//! assert_eq!(result.status, "ok");
33//! ```
34
35use crate::capability::{CapabilityError, Context, Output, TypedCapability};
36use crate::processes::ProcessSnapshot;
37use serde::{Deserialize, Serialize};
38use serde_json::Value;
39use std::time::Duration;
40
41#[cfg(test)]
42use std::process::Command;
43
/// Reads the process start time (field 22, `starttime`) from `/proc/{pid}/stat`.
///
/// Returns the start time in clock ticks since boot, or `None` when the stat
/// file cannot be read or does not contain a parsable field 22. Used to
/// detect PID reuse: if a process is killed and a new process reuses the PID,
/// the start time will differ (FINDING #1).
fn get_process_start_time(pid: u32) -> Option<u64> {
    let content = std::fs::read_to_string(format!("/proc/{pid}/stat")).ok()?;
    // Field 2 (the command name) is wrapped in parentheses and may itself
    // contain spaces or ')' characters, so split at the LAST ')'. Using
    // `rsplit_once` instead of slicing by byte offset means truncated content
    // yields `None` rather than an out-of-bounds panic.
    let (_, after_comm) = content.rsplit_once(')')?;
    // `after_comm` starts at field 3 (state), so field 22 sits at index 19.
    after_comm.split_whitespace().nth(19)?.parse().ok()
}
57fn get_process_start_time_retry(pid: u32) -> Option<u64> {
58    #[allow(clippy::arithmetic_side_effects)] // bit shift in retry backoff: 1 << attempt
59    for attempt in 0..3 {
60        if attempt > 0 {
61            std::thread::sleep(std::time::Duration::from_millis(10 * (1 << attempt)));
62        }
63        if let Some(start_time) = get_process_start_time(pid) {
64            return Some(start_time);
65        }
66    }
67    None
68}
69
/// Loads the raw contents of `/proc/{pid}/cgroup`.
///
/// Returns `None` when the file cannot be read. The returned text is what
/// [`is_systemd_service`] inspects to detect systemd-managed services.
fn get_process_cgroup(pid: u32) -> Option<String> {
    let cgroup_file = std::path::Path::new("/proc")
        .join(pid.to_string())
        .join("cgroup");
    std::fs::read_to_string(cgroup_file).ok()
}
76
/// Checks whether cgroup membership text indicates a systemd-managed service.
///
/// `cgroup` is the content of `/proc/{pid}/cgroup`: one
/// `hierarchy-ID:controller-list:cgroup-path` entry per line. Only the
/// cgroup-path part of each entry is inspected. On cgroup v1 / hybrid hosts
/// every process has a `name=systemd` hierarchy line, so matching against the
/// whole line would classify every process on the machine as a systemd
/// service and make it unkillable.
///
/// A line without the two `:` separators (for example a bare cgroup path) is
/// matched as-is.
fn is_systemd_service(cgroup: &str) -> bool {
    cgroup.lines().any(|line| {
        // Third colon-separated field is the path; it may itself contain ':'.
        let path = line.splitn(3, ':').nth(2).unwrap_or(line);
        path.contains("/system.slice/") || path.contains("/init.scope") || path.contains("systemd")
    })
}
83
84/// Protected PIDs that cannot be killed (safety guard).
85/// Includes init, kthreadd, current process, parent, session leader,
86/// process group leader, and systemd critical services (FINDING #2).
87fn protected_pids() -> Vec<u32> {
88    let mut pids = vec![1, 2];
89    let self_pid = std::process::id();
90    pids.push(self_pid);
91
92    // Add parent process
93    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
94        if let Some(ppid_str) = status
95            .lines()
96            .find(|l| l.starts_with("PPid:"))
97            .and_then(|l| l.split_whitespace().nth(1))
98        {
99            if let Ok(ppid) = ppid_str.parse::<u32>() {
100                pids.push(ppid);
101            }
102        }
103    }
104
105    // Add session leader (FINDING #2)
106    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
107        if let Some(sid_str) = status
108            .lines()
109            .find(|l| l.starts_with("Sid:"))
110            .and_then(|l| l.split_whitespace().nth(1))
111        {
112            if let Ok(sid) = sid_str.parse::<u32>() {
113                if sid != 0 {
114                    pids.push(sid);
115                }
116            }
117        }
118    }
119
120    // Add process group leader (FINDING #2)
121    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
122        if let Some(pgid_str) = status
123            .lines()
124            .find(|l| l.starts_with("NSpgid:"))
125            .and_then(|l| l.split_whitespace().nth(1))
126        {
127            if let Ok(pgid) = pgid_str.parse::<u32>() {
128                if pgid != 0 {
129                    pids.push(pgid);
130                }
131            }
132        }
133    }
134
135    // Scan all running processes for systemd-critical services (FINDING #2)
136    if let Ok(entries) = std::fs::read_dir("/proc") {
137        for entry in entries.flatten() {
138            if let Ok(name) = entry.file_name().into_string() {
139                if let Ok(pid) = name.parse::<u32>() {
140                    if let Some(cgroup) = get_process_cgroup(pid) {
141                        if is_systemd_service(&cgroup) {
142                            pids.push(pid);
143                        }
144                    }
145                }
146            }
147        }
148    }
149
150    pids.sort_unstable();
151    pids.dedup();
152    pids
153}
154
155/// Input parameters for [`Kill::execute`].
156#[derive(Debug, Clone, Serialize, Deserialize)]
157#[allow(clippy::exhaustive_structs)] // args struct — fields are the contract
158pub struct KillArgs {
159    /// Process ID to kill.
160    pub pid: u32,
161    /// Signal to send (default: 15 = SIGTERM). Must be valid POSIX: 1-31 or 64.
162    pub signal: Option<i32>,
163}
164
/// Capability that signals a process by PID and reports the outcome.
///
/// Requests targeting a protected PID (init, kthreadd, the current process
/// and its relatives, systemd services) are rejected before any signal is
/// sent. Stateless: all inputs arrive through [`KillArgs`].
#[allow(clippy::exhaustive_structs)] // unit struct — nothing to extend
pub struct Kill;
171
172impl TypedCapability for Kill {
173    type Args = KillArgs;
174
175    fn name(&self) -> &'static str {
176        "Kill"
177    }
178
179    fn description(&self) -> &'static str {
180        "terminate process by PID with PID reuse protection. protected: init (1), kthreadd (2), self, parent, session/group leaders, systemd services. signals: 1-31, 64 (SIGRTMIN)."
181    }
182
183    /// Returns the JSON Schema for Kill arguments.
184    ///
185    /// Schema requires `"pid"` integer; `"signal"` is optional and restricted
186    /// to valid POSIX signal values (1-31, 64) — FINDING #3.
187    fn schema(&self) -> Value {
188        serde_json::json!({
189            "type": "object",
190            "properties": {
191                "pid": { "type": "integer", "minimum": 1 },
192                "signal": {
193                    "type": "integer",
194                    "anyOf": [
195                        { "minimum": 1, "maximum": 31 },
196                        { "enum": [64] }
197                    ]
198                }
199            },
200            "required": ["pid"]
201        })
202    }
203
204    fn execute(
205        &self,
206        args: KillArgs,
207        ctx: &Context,
208    ) -> std::result::Result<Output, CapabilityError> {
209        // FINDING #3: Restrict signal to valid POSIX values (1-31, 64)
210        if let Some(signal) = args.signal {
211            if !(1..=31).contains(&signal) && signal != 64 {
212                return Err(CapabilityError::InvalidArgs(format!(
213                    "Invalid signal {}: must be 1-31 or 64 (POSIX signals)",
214                    signal
215                )));
216            }
217        }
218
219        // Safety check: protected PIDs (init, kthreadd, self, parent)
220        let protected = protected_pids();
221        if protected.contains(&args.pid) {
222            return Err(CapabilityError::PermissionDenied(format!(
223                "PID {} is a protected system process (protected: {:?})",
224                args.pid, protected
225            )));
226        }
227
228        // Respect dry_run — skip kill entirely
229        if ctx.dry_run {
230            // FINDING #20: Limit dry-run output to "would kill PID X", hide command/user info
231            let mut out = Output::ok(format!("DRY RUN: would kill PID {}", args.pid));
232            out.data = Some(serde_json::json!({
233                "pid": args.pid,
234                "killed": false,
235                "dry_run": true,
236                "signal": args.signal.unwrap_or(15),
237            }));
238            return Ok(out);
239        }
240
241        // Capture process snapshot before kill
242        let process_before = ProcessSnapshot::capture();
243        let process_exists = process_before.processes.iter().any(|p| p.pid == args.pid);
244
245        if !process_exists {
246            let mut out = Output::error(
247                format!("Process {} not found", args.pid),
248                "Process not found".into(),
249            );
250            out.data = Some(serde_json::json!({
251                "pid": args.pid,
252                "killed": false,
253                "reason": "Process not found"
254            }));
255            return Ok(out);
256        }
257
258        // Get process info before killing
259        let process_info: Option<(String, String)> = process_before
260            .processes
261            .iter()
262            .find(|p| p.pid == args.pid)
263            .map(|p| (p.command.clone(), p.user.clone()));
264
265        // Record start time to detect PID reuse (FINDING #1)
266        let start_time_before = get_process_start_time_retry(args.pid);
267
268        // Double-check: re-read start time to narrow TOCTOU window.
269        // If the PID was recycled between these reads, abort the kill.
270        let start_time_before_confirm = get_process_start_time_retry(args.pid);
271        if start_time_before != start_time_before_confirm {
272            let mut out = Output::error(
273                format!(
274                    "PID {} was reused by a different process (start time changed before kill)",
275                    args.pid
276                ),
277                "PID reused between safety checks".into(),
278            );
279            out.data = Some(serde_json::json!({
280                "pid": args.pid,
281                "killed": false,
282                "reason": "PID reused between safety checks",
283                "pid_reused": true,
284            }));
285            return Ok(out);
286        }
287
288        // Determine signal — default to SIGTERM (15) for graceful shutdown
289        let signal = args.signal.unwrap_or(15);
290
291        // Execute kill via libc for reliability (avoids shell/PATH issues)
292        // SAFETY: pid is validated as a valid target; signal is validated to 1-64 range;
293        // pid_t is i32 — pid is u32, cast is safe for all valid PIDs
294        #[allow(clippy::cast_possible_wrap)]
295        let kill_result = unsafe { libc::kill(args.pid as libc::pid_t, signal) };
296        let success = kill_result == 0;
297        let stderr_str = if success {
298            String::new()
299        } else {
300            std::io::Error::last_os_error().to_string()
301        };
302
303        // Delay to let process terminate and be removed from process table
304        std::thread::sleep(Duration::from_millis(500));
305
306        // Clear cache to ensure fresh snapshot (cached data would show pre-kill state)
307        ProcessSnapshot::clear_cache();
308
309        // Capture process snapshot after kill
310        let process_after = ProcessSnapshot::capture();
311
312        // Check if process still exists (zombies count as dead — they've been terminated)
313        let process_still_exists = process_after
314            .processes
315            .iter()
316            .any(|p| p.pid == args.pid && !p.stat.starts_with('Z'));
317        // Verify PID was not reused — check start time matches (FINDING #1)
318        let pid_reused = match (start_time_before, get_process_start_time_retry(args.pid)) {
319            (Some(before_time), Some(after_time)) => before_time != after_time,
320            (None, _) => false,
321            (Some(_), None) => true,
322        };
323
324        let killed_success = success && !process_still_exists && !pid_reused;
325
326        let message = if killed_success {
327            format!("Killed process {} (signal {})", args.pid, signal)
328        } else if pid_reused {
329            format!(
330                "PID {} was reused by a different process (start time changed)",
331                args.pid
332            )
333        } else if !success {
334            format!("Failed to kill process {}: {}", args.pid, stderr_str)
335        } else {
336            format!("Process {} still exists after signal {}", args.pid, signal)
337        };
338
339        let mut out = if killed_success {
340            Output::ok(message)
341        } else {
342            Output::error(
343                message,
344                if success {
345                    String::new()
346                } else {
347                    stderr_str.clone()
348                },
349            )
350        };
351        out.data = Some(serde_json::json!({
352            "pid": args.pid,
353            "killed": killed_success,
354            "signal": signal,
355            "command": process_info.as_ref().map(|(cmd, _)| cmd),
356            "user": process_info.as_ref().map(|(_, user)| user),
357            "stderr": if success { String::new() } else { stderr_str },
358            "pid_reused": pid_reused,
359            "process_before": {
360                "count": process_before.summary.total_processes,
361                "zombies": process_before.summary.zombie_count
362            },
363            "process_after": {
364                "count": process_after.summary.total_processes,
365                "zombies": process_after.summary.zombie_count
366            }
367        }));
368        Ok(out)
369    }
370}
371
#[cfg(test)]
#[allow(clippy::unnecessary_map_or)]
mod tests {
    use super::*;
    use crate::capability::Capability;
    use std::thread;
    use std::time::Duration;

    /// PID that can never belong to a live process: Linux caps PIDs at
    /// `PID_MAX_LIMIT` (2^22), so non-dry-run tests cannot hit a real process.
    const ABSENT_PID: u32 = 4_194_305;

    /// Builds a [`Context`] for the current directory with the given dry-run flag.
    fn test_ctx(dry_run: bool) -> Context {
        Context {
            dry_run,
            job_id: "test".into(),
            working_dir: std::env::current_dir().unwrap(),
        }
    }

    /// `sleep 60` child that is killed and reaped on drop, so a failing
    /// assertion never leaves a stray process behind.
    struct Sleeper(std::process::Child);

    impl Sleeper {
        fn spawn() -> Self {
            Self(Command::new("sleep").arg("60").spawn().unwrap())
        }

        fn pid(&self) -> u32 {
            self.0.id()
        }
    }

    impl Drop for Sleeper {
        fn drop(&mut self) {
            let _ = self.0.kill();
            let _ = self.0.wait();
        }
    }

    #[test]
    fn test_kill_schema() {
        let cap = Kill;
        // The untyped adapter must be able to produce a schema.
        let _untyped = Capability::schema(&cap);

        let schema = TypedCapability::schema(&cap);
        assert_eq!(schema["type"], "object");
        assert_eq!(schema["required"], serde_json::json!(["pid"]));
        assert_eq!(schema["properties"]["pid"]["type"], "integer");
        assert_eq!(schema["properties"]["pid"]["minimum"], 1);
        assert_eq!(schema["properties"]["signal"]["type"], "integer");
    }

    #[test]
    fn test_kill_protected_pid() {
        // PID 1 is protected
        let result = Capability::execute(
            &Kill,
            &serde_json::json!({ "pid": 1 }),
            &test_ctx(false),
        );

        // Should fail because PID 1 is protected
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("protected system process"));
    }

    #[test]
    fn test_kill_self_protected() {
        let self_pid = std::process::id();
        let result = Capability::execute(
            &Kill,
            &serde_json::json!({ "pid": self_pid }),
            &test_ctx(false),
        );

        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("protected"));
    }

    #[test]
    fn test_kill_nonexistent() {
        let result = Capability::execute(
            &Kill,
            &serde_json::json!({ "pid": ABSENT_PID }),
            &test_ctx(false),
        )
        .unwrap();

        assert_eq!(result.status, "error");
        assert!(result.data.as_ref().unwrap()["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_dry_run() {
        // The protection check runs before the dry-run branch, so the target
        // must be a non-protected PID.
        let result = Capability::execute(
            &Kill,
            &serde_json::json!({ "pid": ABSENT_PID }),
            &test_ctx(true),
        )
        .unwrap();

        assert_eq!(result.status, "ok");
        assert!(result.data.as_ref().unwrap()["dry_run"].as_bool() == Some(true));
        assert!(result.data.as_ref().unwrap()["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_actual_process() {
        // Start a long-running process (sleep)
        let mut child = Sleeper::spawn();
        let pid = child.pid();

        // Give it time to start
        thread::sleep(Duration::from_millis(100));

        // Verify process exists before kill
        let pre_check = Command::new("kill").arg("-0").arg(pid.to_string()).output();
        assert!(
            pre_check.unwrap().status.success(),
            "Process should exist before kill"
        );

        // In CI containers (systemd cgroups), nearly all PIDs can be
        // considered protected — if so, skip the kill assertion gracefully
        // instead of panicking. The guard reaps the child on return.
        let protected = protected_pids();
        if protected.contains(&pid) {
            eprintln!(
                "SKIP: spawned child PID {pid} is in protected_pids set \
                 ({protected:?}); kill blocked by safety guard. \
                 This is expected in CI containers."
            );
            return;
        }

        // Clear cache so kill sees fresh process list
        ProcessSnapshot::clear_cache();

        // Kill it via the capability using SIGKILL for reliability
        let result = Capability::execute(
            &Kill,
            &serde_json::json!({ "pid": pid, "signal": 9 }),
            &test_ctx(false),
        )
        .unwrap();

        // Kill should succeed — process becomes zombie until reaped
        assert!(
            result.data.as_ref().unwrap()["killed"].as_bool() == Some(true),
            "Kill failed: {:?}",
            result.data
        );
        assert!(
            result.data.as_ref().unwrap()["signal"].as_i64() == Some(9),
            "Should use SIGKILL"
        );

        // Reap the zombie so it disappears from process table
        let _ = child.0.wait();

        // Verify process is fully gone after reaping
        let post_check = Command::new("kill").arg("-0").arg(pid.to_string()).output();
        let still_alive = post_check.map_or(false, |o| o.status.success());
        assert!(
            !still_alive,
            "Process {} should be dead after kill and reap",
            pid
        );
    }

    #[test]
    fn test_get_process_start_time() {
        // Start a process and verify we can read its start time
        let child = Sleeper::spawn();
        let pid = child.pid();

        let start_time = get_process_start_time(pid);
        assert!(
            start_time.is_some(),
            "Should be able to read start time for running process"
        );

        // Verify start time is consistent (no PID reuse)
        let start_time2 = get_process_start_time(pid);
        assert_eq!(start_time, start_time2, "Start time should be stable");
    }

    #[test]
    fn test_get_process_start_time_nonexistent() {
        let result = get_process_start_time(ABSENT_PID);
        assert!(result.is_none(), "Non-existent PID should return None");
    }

    #[test]
    fn test_signal_validation_rejects_negative() {
        // FINDING #3: negative signals should be rejected
        let result = Capability::execute(
            &Kill,
            &serde_json::json!({ "pid": ABSENT_PID, "signal": -1 }),
            &test_ctx(false),
        );
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_zero() {
        // FINDING #3: signal 0 should be rejected
        let result = Capability::execute(
            &Kill,
            &serde_json::json!({ "pid": ABSENT_PID, "signal": 0 }),
            &test_ctx(false),
        );
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_out_of_range() {
        // FINDING #3: signal > 31 (except 64) should be rejected
        let result = Capability::execute(
            &Kill,
            &serde_json::json!({ "pid": ABSENT_PID, "signal": 32 }),
            &test_ctx(false),
        );
        assert!(result.is_err());
    }

    #[test]
    fn test_signal_validation_accepts_valid_signals() {
        for sig in [1, 9, 15, 31, 64] {
            let result = Capability::execute(
                &Kill,
                &serde_json::json!({ "pid": ABSENT_PID, "signal": sig }),
                &test_ctx(false),
            );
            // Should not fail with InvalidArgs for valid signals
            // May fail with other errors (nonexistent PID) — that's OK
            if let Err(e) = &result {
                assert!(
                    !e.to_string().contains("Invalid signal"),
                    "Signal {} should be valid, got: {}",
                    sig,
                    e
                );
            }
        }
    }

    #[test]
    fn test_dry_run_hides_process_info() {
        // FINDING #20: dry-run should NOT expose command or user info
        let result = Capability::execute(
            &Kill,
            &serde_json::json!({ "pid": ABSENT_PID }),
            &test_ctx(true),
        )
        .unwrap();

        assert_eq!(result.status, "ok");
        let data = result.data.as_ref().unwrap();
        assert!(data["dry_run"].as_bool() == Some(true));
        assert!(
            data.get("command").is_none(),
            "dry-run must not expose command"
        );
        assert!(data.get("user").is_none(), "dry-run must not expose user");
        assert!(
            data.get("process_exists").is_none(),
            "dry-run must not expose process_exists"
        );
    }

    #[test]
    fn test_protected_pids_includes_self_and_parent() {
        let protected = protected_pids();
        let self_pid = std::process::id();
        assert!(protected.contains(&1), "PID 1 should be protected");
        assert!(protected.contains(&2), "PID 2 should be protected");
        assert!(
            protected.contains(&self_pid),
            "self PID should be protected"
        );
    }

    #[test]
    fn test_get_process_start_time_retry() {
        // Retry logic with an existing process
        let child = Sleeper::spawn();
        let result = get_process_start_time_retry(child.pid());
        assert!(
            result.is_some(),
            "Should read start time for running process"
        );
        drop(child);

        // Non-existent PID should return None after retries
        let result = get_process_start_time_retry(ABSENT_PID);
        assert!(result.is_none(), "Non-existent PID should return None");
    }
}