Skip to main content

runtimo_core/capabilities/
kill.rs

1//! Kill capability — terminate runaway processes by PID with full audit trail.
2//!
3//! Kills a process by PID with full telemetry capture and WAL logging.
4//! Includes safety checks to prevent killing critical system processes.
5//!
6//! # PID Reuse Protection (FINDING #1)
7//!
8//! After sending a signal, the capability verifies the killed process is the
9//! same one by comparing start times from `/proc/{pid}/stat` field 22. This
10//! prevents PID reuse races where a new process inherits the killed PID.
11//!
12//! # Example
13//!
14//! ```rust,ignore
15//! use runtimo_core::capabilities::Kill;
16//! use runtimo_core::capability::{Capability, Context};
17//! use serde_json::json;
18//!
19//! let cap = Kill;
20//! let result = cap.execute(
21//!     &json!({"pid": 12345}),
22//!     &Context { dry_run: false, job_id: "test".into(), ..Default::default() }
23//! ).unwrap();
24//!
25//! assert!(result.success);
26//! ```
27
28use crate::capability::{Capability, Context, Output};
29use crate::processes::ProcessSnapshot;
30use crate::{Error, Result};
31use serde::{Deserialize, Serialize};
32use serde_json::Value;
33use std::time::Duration;
34
35#[cfg(test)]
36use std::process::Command;
37
/// Reads the start time of process `pid` from `/proc/{pid}/stat`.
///
/// The value is field 22 of the stat line, in clock ticks since boot. It is
/// used to detect PID reuse: if a process is killed and a new process reuses
/// the PID, the start time will differ (FINDING #1).
///
/// Returns `None` when the file cannot be read (for example because the
/// process does not exist) or when the line is truncated or malformed.
fn get_process_start_time(pid: u32) -> Option<u64> {
    let stat = std::fs::read_to_string(format!("/proc/{pid}/stat")).ok()?;
    parse_stat_start_time(&stat)
}

/// Extracts field 22 (start time) from the text of a `/proc/{pid}/stat` line.
///
/// Never panics: any input that lacks the closing parenthesis of the command
/// name, has fewer than 22 fields, or holds a non-numeric start time yields
/// `None`.
fn parse_stat_start_time(stat: &str) -> Option<u64> {
    // Field 2 (the command name) is wrapped in parentheses and may itself
    // contain spaces or ')', so fields are only counted after the LAST ')'.
    let (_, after_comm) = stat.rsplit_once(')')?;
    // `after_comm` starts at field 3, so field 22 sits at index 19.
    after_comm.split_whitespace().nth(19)?.parse().ok()
}
51fn get_process_start_time_retry(pid: u32) -> Option<u64> {
52    #[allow(clippy::arithmetic_side_effects)] // bit shift in retry backoff: 1 << attempt
53    for attempt in 0..3 {
54        if attempt > 0 {
55            std::thread::sleep(std::time::Duration::from_millis(10 * (1 << attempt)));
56        }
57        if let Some(start_time) = get_process_start_time(pid) {
58            return Some(start_time);
59        }
60    }
61    None
62}
63
/// Loads the raw text of `/proc/{pid}/cgroup` for the given process.
///
/// The text is what `is_systemd_service` inspects to detect systemd-managed
/// services. Returns `None` when the file cannot be read.
fn get_process_cgroup(pid: u32) -> Option<String> {
    let cgroup_file = std::path::Path::new("/proc")
        .join(pid.to_string())
        .join("cgroup");
    std::fs::read_to_string(cgroup_file).ok()
}
70
/// Checks if a cgroup path indicates a systemd-managed service.
///
/// `cgroup` is the text of `/proc/{pid}/cgroup`, one
/// `hierarchy-ID:controller-list:cgroup-path` entry per line. Only the
/// cgroup-path part of each line is inspected. Matching the whole line would
/// also hit the cgroup-v1 controller name `name=systemd`, which appears for
/// every process on v1/hybrid hosts and would mark all PIDs as protected.
///
/// A line without the `id:controllers:` prefix is treated as a bare path, so
/// callers that pass just a path keep working.
///
/// Returns `true` when any path contains `/system.slice/`, `/init.scope` or
/// `systemd`.
fn is_systemd_service(cgroup: &str) -> bool {
    cgroup.lines().any(|line| {
        // The path is everything after the second ':'; it may contain ':' itself.
        let path = line.splitn(3, ':').nth(2).unwrap_or(line);
        path.contains("/system.slice/")
            || path.contains("/init.scope")
            || path.contains("systemd")
    })
}
77
78/// Protected PIDs that cannot be killed (safety guard).
79/// Includes init, kthreadd, current process, parent, session leader,
80/// process group leader, and systemd critical services (FINDING #2).
81fn protected_pids() -> Vec<u32> {
82    let mut pids = vec![1, 2];
83    let self_pid = std::process::id();
84    pids.push(self_pid);
85
86    // Add parent process
87    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
88        if let Some(ppid_str) = status
89            .lines()
90            .find(|l| l.starts_with("PPid:"))
91            .and_then(|l| l.split_whitespace().nth(1))
92        {
93            if let Ok(ppid) = ppid_str.parse::<u32>() {
94                pids.push(ppid);
95            }
96        }
97    }
98
99    // Add session leader (FINDING #2)
100    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
101        if let Some(sid_str) = status
102            .lines()
103            .find(|l| l.starts_with("Sid:"))
104            .and_then(|l| l.split_whitespace().nth(1))
105        {
106            if let Ok(sid) = sid_str.parse::<u32>() {
107                if sid != 0 {
108                    pids.push(sid);
109                }
110            }
111        }
112    }
113
114    // Add process group leader (FINDING #2)
115    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
116        if let Some(pgid_str) = status
117            .lines()
118            .find(|l| l.starts_with("NSpgid:"))
119            .and_then(|l| l.split_whitespace().nth(1))
120        {
121            if let Ok(pgid) = pgid_str.parse::<u32>() {
122                if pgid != 0 {
123                    pids.push(pgid);
124                }
125            }
126        }
127    }
128
129    // Scan all running processes for systemd-critical services (FINDING #2)
130    if let Ok(entries) = std::fs::read_dir("/proc") {
131        for entry in entries.flatten() {
132            if let Ok(name) = entry.file_name().into_string() {
133                if let Ok(pid) = name.parse::<u32>() {
134                    if let Some(cgroup) = get_process_cgroup(pid) {
135                        if is_systemd_service(&cgroup) {
136                            pids.push(pid);
137                        }
138                    }
139                }
140            }
141        }
142    }
143
144    pids.sort_unstable();
145    pids.dedup();
146    pids
147}
148
149/// Input parameters for [`Kill::execute`].
150#[derive(Debug, Clone, Serialize, Deserialize)]
151pub struct KillArgs {
152    /// Process ID to kill.
153    pub pid: u32,
154    /// Signal to send (default: 15 = SIGTERM). Must be valid POSIX: 1-31 or 64.
155    pub signal: Option<i32>,
156}
157
/// Capability that sends a signal to a process identified by PID.
///
/// Requests that target a protected PID (init, kthreadd, the current process
/// and the other entries produced by `protected_pids`) are rejected before
/// any signal is sent.
///
/// NOTE(review): kill operations are described as being recorded in the WAL
/// for forensic review; that logging is not visible in this file — confirm
/// where it happens.
#[allow(clippy::exhaustive_structs)]
pub struct Kill;
164
165impl Capability for Kill {
166    fn name(&self) -> &'static str {
167        "Kill"
168    }
169
170    fn description(&self) -> &'static str {
171        "kill PID. Protected: init,kthreadd,self. Custom sig ok."
172    }
173
174    /// Returns the JSON Schema for Kill arguments.
175    ///
176    /// Schema requires `"pid"` integer; `"signal"` is optional and restricted
177    /// to valid POSIX signal values (1-31, 64) — FINDING #3.
178    fn schema(&self) -> Value {
179        serde_json::json!({
180            "type": "object",
181            "properties": {
182                "pid": { "type": "integer", "minimum": 1 },
183                "signal": {
184                    "type": "integer",
185                    "anyOf": [
186                        { "minimum": 1, "maximum": 31 },
187                        { "enum": [64] }
188                    ]
189                }
190            },
191            "required": ["pid"]
192        })
193    }
194
195    fn validate(&self, args: &Value) -> Result<()> {
196        let args: KillArgs = serde_json::from_value(args.clone())
197            .map_err(|e| Error::SchemaValidationFailed(e.to_string()))?;
198
199        // FINDING #3: Restrict signal to valid POSIX values (1-31, 64)
200        if let Some(signal) = args.signal {
201            if !(1..=31).contains(&signal) && signal != 64 {
202                return Err(Error::SchemaValidationFailed(format!(
203                    "Invalid signal {}: must be 1-31 or 64 (POSIX signals)",
204                    signal
205                )));
206            }
207        }
208
209        Ok(())
210    }
211
212    fn execute(&self, args: &Value, ctx: &Context) -> Result<Output> {
213        let args: KillArgs = serde_json::from_value(args.clone())
214            .map_err(|e| Error::ExecutionFailed(e.to_string()))?;
215
216        // Safety check: protected PIDs (init, kthreadd, self, parent)
217        let protected = protected_pids();
218        if protected.contains(&args.pid) {
219            return Err(Error::ExecutionFailed(format!(
220                "PID {} is a protected system process (protected: {:?})",
221                args.pid, protected
222            )));
223        }
224
225        // Respect dry_run — skip kill entirely
226        if ctx.dry_run {
227            // FINDING #20: Limit dry-run output to "would kill PID X", hide command/user info
228            return Ok(Output {
229                success: true,
230                data: serde_json::json!({
231                    "pid": args.pid,
232                    "killed": false,
233                    "dry_run": true,
234                    "signal": args.signal.unwrap_or(15),
235                }),
236                message: Some(format!("DRY RUN: would kill PID {}", args.pid)),
237            });
238        }
239
240        // Capture process snapshot before kill
241        let process_before = ProcessSnapshot::capture();
242        let process_exists = process_before.processes.iter().any(|p| p.pid == args.pid);
243
244        if !process_exists {
245            return Ok(Output {
246                success: false,
247                data: serde_json::json!({
248                    "pid": args.pid,
249                    "killed": false,
250                    "reason": "Process not found"
251                }),
252                message: Some(format!("Process {} not found", args.pid)),
253            });
254        }
255
256        // Get process info before killing
257        let process_info: Option<(String, String)> = process_before
258            .processes
259            .iter()
260            .find(|p| p.pid == args.pid)
261            .map(|p| (p.command.clone(), p.user.clone()));
262
263        // Record start time to detect PID reuse (FINDING #1)
264        let start_time_before = get_process_start_time_retry(args.pid);
265
266        // Double-check: re-read start time to narrow TOCTOU window.
267        // If the PID was recycled between these reads, abort the kill.
268        let start_time_before_confirm = get_process_start_time_retry(args.pid);
269        if start_time_before != start_time_before_confirm {
270            return Ok(Output {
271                success: false,
272                data: serde_json::json!({
273                    "pid": args.pid,
274                    "killed": false,
275                    "reason": "PID reused between safety checks",
276                    "pid_reused": true,
277                }),
278                message: Some(format!(
279                    "PID {} was reused by a different process (start time changed before kill)",
280                    args.pid
281                )),
282            });
283        }
284
285        // Determine signal — default to SIGTERM (15) for graceful shutdown
286        let signal = args.signal.unwrap_or(15);
287
288        // Execute kill via libc for reliability (avoids shell/PATH issues)
289        // SAFETY: pid is validated as a valid target; signal is validated to 1-64 range;
290        // pid_t is i32 — pid is u32, cast is safe for all valid PIDs
291        #[allow(clippy::cast_possible_wrap)]
292        let kill_result = unsafe { libc::kill(args.pid as libc::pid_t, signal) };
293        let success = kill_result == 0;
294        let stderr_str = if success {
295            String::new()
296        } else {
297            std::io::Error::last_os_error().to_string()
298        };
299
300        // Delay to let process terminate and be removed from process table
301        std::thread::sleep(Duration::from_millis(500));
302
303        // Clear cache to ensure fresh snapshot (cached data would show pre-kill state)
304        ProcessSnapshot::clear_cache();
305
306        // Capture process snapshot after kill
307        let process_after = ProcessSnapshot::capture();
308
309        // Check if process still exists (zombies count as dead — they've been terminated)
310        let process_still_exists = process_after
311            .processes
312            .iter()
313            .any(|p| p.pid == args.pid && !p.stat.starts_with('Z'));
314        // Verify PID was not reused — check start time matches (FINDING #1)
315        let pid_reused = match (start_time_before, get_process_start_time_retry(args.pid)) {
316            (Some(before_time), Some(after_time)) => before_time != after_time,
317            (None, _) => false,
318            (Some(_), None) => true,
319        };
320
321        let killed_success = success && !process_still_exists && !pid_reused;
322
323        let message = if killed_success {
324            format!("Killed process {} (signal {})", args.pid, signal)
325        } else if pid_reused {
326            format!(
327                "PID {} was reused by a different process (start time changed)",
328                args.pid
329            )
330        } else if !success {
331            format!("Failed to kill process {}: {}", args.pid, stderr_str)
332        } else {
333            format!("Process {} still exists after signal {}", args.pid, signal)
334        };
335
336        Ok(Output {
337            success: killed_success,
338            data: serde_json::json!({
339                "pid": args.pid,
340                "killed": killed_success,
341                "signal": signal,
342                "command": process_info.as_ref().map(|(cmd, _)| cmd),
343                "user": process_info.as_ref().map(|(_, user)| user),
344                "stderr": if success { String::new() } else { stderr_str },
345                "pid_reused": pid_reused,
346                "process_before": {
347                    "count": process_before.summary.total_processes,
348                    "zombies": process_before.summary.zombie_count
349                },
350                "process_after": {
351                    "count": process_after.summary.total_processes,
352                    "zombies": process_after.summary.zombie_count
353                }
354            }),
355            message: Some(message),
356        })
357    }
358}
359
#[cfg(test)]
// `map_or(false, ..)` is kept in the reap check below, so the lint stays allowed.
#[allow(clippy::unnecessary_map_or)]
mod tests {
    use super::*;
    use crate::capability::Capability;
    use std::thread;
    use std::time::Duration;

    /// Builds the [`Context`] shared by the tests; only `dry_run` varies.
    fn test_ctx(dry_run: bool) -> Context {
        Context {
            dry_run,
            job_id: "test".into(),
            working_dir: std::env::current_dir().unwrap(),
        }
    }

    #[test]
    fn test_kill_schema() {
        let schema = Kill.schema();

        assert_eq!(schema["type"].as_str(), Some("object"));
        assert_eq!(schema["required"], serde_json::json!(["pid"]));

        let pid = &schema["properties"]["pid"];
        assert_eq!(pid["type"].as_str(), Some("integer"));
        assert_eq!(pid["minimum"].as_i64(), Some(1));

        let signal = &schema["properties"]["signal"];
        assert_eq!(signal["type"].as_str(), Some("integer"));
        assert!(
            signal["anyOf"].is_array(),
            "signal must be restricted through anyOf"
        );
    }

    #[test]
    fn test_validate_requires_pid() {
        let cap = Kill;
        assert!(cap.validate(&serde_json::json!({})).is_err());
        // `signal` is optional.
        assert!(cap.validate(&serde_json::json!({ "pid": 999998 })).is_ok());
    }

    #[test]
    fn test_kill_protected_pid() {
        let cap = Kill;
        // PID 1 is protected
        let result = cap.execute(&serde_json::json!({ "pid": 1 }), &test_ctx(false));

        // Should fail because PID 1 is protected
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("protected system process"));
    }

    #[test]
    fn test_kill_self_protected() {
        let cap = Kill;
        let self_pid = std::process::id();
        let result = cap.execute(&serde_json::json!({ "pid": self_pid }), &test_ctx(false));

        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("protected"));
    }

    #[test]
    fn test_kill_nonexistent() {
        let cap = Kill;
        // Use a PID that's very unlikely to exist
        let result = cap
            .execute(&serde_json::json!({ "pid": 999999 }), &test_ctx(false))
            .unwrap();

        assert!(!result.success);
        assert!(result.data["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_dry_run() {
        let cap = Kill;
        // The protection check runs before the dry-run short-circuit, so the
        // target must be a PID that is not protected.
        let result = cap
            .execute(&serde_json::json!({ "pid": 999998 }), &test_ctx(true))
            .unwrap();

        assert!(result.success);
        assert!(result.data["dry_run"].as_bool() == Some(true));
        assert!(result.data["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_actual_process() {
        // Start a long-running process (sleep)
        let mut child = Command::new("sleep").arg("60").spawn().unwrap();
        let pid = child.id();

        // Give it time to start
        thread::sleep(Duration::from_millis(100));

        // Verify process exists before kill
        let pre_check = Command::new("kill").arg("-0").arg(pid.to_string()).output();
        assert!(
            pre_check.unwrap().status.success(),
            "Process should exist before kill"
        );

        // Check if the spawned child is in the protected PID list.
        // In CI containers (systemd cgroups), nearly all PIDs can be
        // considered protected — if so, skip the kill assertion
        // gracefully instead of panicking.
        let protected = protected_pids();
        if protected.contains(&pid) {
            let _ = child.kill();
            let _ = child.wait();
            eprintln!(
                "SKIP: spawned child PID {pid} is in protected_pids set \
                 ({protected:?}); kill blocked by safety guard. \
                 This is expected in CI containers."
            );
            return;
        }

        // Clear cache so kill sees fresh process list
        ProcessSnapshot::clear_cache();

        // Kill it via the capability using SIGKILL for reliability
        let cap = Kill;
        let result = cap
            .execute(
                &serde_json::json!({ "pid": pid, "signal": 9 }),
                &test_ctx(false),
            )
            .unwrap();

        // Kill should succeed — process becomes zombie until reaped
        assert!(
            result.data["killed"].as_bool() == Some(true),
            "Kill failed: {:?}",
            result.data
        );
        assert!(
            result.data["signal"].as_i64() == Some(9),
            "Should use SIGKILL"
        );

        // Reap the zombie so it disappears from process table
        let _ = child.wait();

        // Verify process is fully gone after reaping
        let post_check = Command::new("kill").arg("-0").arg(pid.to_string()).output();
        let still_alive = post_check.map_or(false, |o| o.status.success());
        assert!(
            !still_alive,
            "Process {} should be dead after kill and reap",
            pid
        );
    }

    #[test]
    fn test_get_process_start_time() {
        // Start a process and verify we can read its start time
        let mut child = Command::new("sleep").arg("60").spawn().unwrap();
        let pid = child.id();

        let start_time = get_process_start_time(pid);
        assert!(
            start_time.is_some(),
            "Should be able to read start time for running process"
        );

        // Verify start time is consistent (no PID reuse)
        let start_time2 = get_process_start_time(pid);
        assert_eq!(start_time, start_time2, "Start time should be stable");

        child.kill().ok();
        let _ = child.wait();
    }

    #[test]
    fn test_get_process_start_time_nonexistent() {
        let result = get_process_start_time(999999);
        assert!(result.is_none(), "Non-existent PID should return None");
    }

    #[test]
    fn test_signal_validation_rejects_negative() {
        // FINDING #3: negative signals should be rejected
        let cap = Kill;
        let result = cap.validate(&serde_json::json!({ "pid": 999998, "signal": -1 }));
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_zero() {
        // FINDING #3: signal 0 should be rejected
        let cap = Kill;
        let result = cap.validate(&serde_json::json!({ "pid": 999998, "signal": 0 }));
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_out_of_range() {
        // FINDING #3: signal > 31 (except 64) should be rejected
        let cap = Kill;
        let result = cap.validate(&serde_json::json!({ "pid": 999998, "signal": 32 }));
        assert!(result.is_err());
    }

    #[test]
    fn test_signal_validation_accepts_valid_signals() {
        let cap = Kill;
        for sig in [1, 9, 15, 31, 64] {
            let result = cap.validate(&serde_json::json!({ "pid": 999998, "signal": sig }));
            assert!(result.is_ok(), "Signal {} should be valid", sig);
        }
    }

    #[test]
    fn test_dry_run_hides_process_info() {
        // FINDING #20: dry-run should NOT expose command or user info
        let cap = Kill;
        let result = cap
            .execute(&serde_json::json!({ "pid": 999998 }), &test_ctx(true))
            .unwrap();

        assert!(result.success);
        assert!(result.data["dry_run"].as_bool() == Some(true));
        assert!(
            result.data.get("command").is_none(),
            "dry-run must not expose command"
        );
        assert!(
            result.data.get("user").is_none(),
            "dry-run must not expose user"
        );
        assert!(
            result.data.get("process_exists").is_none(),
            "dry-run must not expose process_exists"
        );
    }

    #[test]
    fn test_protected_pids_includes_self_and_parent() {
        let protected = protected_pids();
        let self_pid = std::process::id();
        assert!(protected.contains(&1), "PID 1 should be protected");
        assert!(protected.contains(&2), "PID 2 should be protected");
        assert!(
            protected.contains(&self_pid),
            "self PID should be protected"
        );
    }

    #[test]
    fn test_get_process_start_time_retry() {
        // Test retry logic with existing process
        let mut child = Command::new("sleep").arg("60").spawn().unwrap();
        let pid = child.id();

        let result = get_process_start_time_retry(pid);
        assert!(
            result.is_some(),
            "Should read start time for running process"
        );

        child.kill().ok();
        let _ = child.wait();

        // Non-existent PID should return None after retries
        let result = get_process_start_time_retry(999999);
        assert!(result.is_none(), "Non-existent PID should return None");
    }
}