Skip to main content

runtimo_core/capabilities/
kill.rs

//! Kill capability — terminate runaway processes by PID with full audit trail.
//!
//! Kills a process by PID with full telemetry capture and WAL logging.
//! Includes safety checks to prevent killing critical system processes.
//!
//! # PID Reuse Protection (FINDING #1)
//!
//! The capability records the target's start time (field 22 of
//! `/proc/{pid}/stat`) before sending the signal and compares it with the
//! value read afterwards. A different start time means the PID now belongs to
//! a new process, so the result is flagged as a PID reuse instead of being
//! reported as a successful kill.
//!
//! # Example
//!
//! ```rust,ignore
//! use runtimo_core::capabilities::Kill;
//! use runtimo_core::capability::{Capability, Context};
//! use serde_json::json;
//!
//! let cap = Kill;
//! let result = cap.execute(
//!     &json!({"pid": 12345}),
//!     &Context { dry_run: false, job_id: "test".into(), ..Default::default() }
//! ).unwrap();
//!
//! assert!(result.success);
//! ```
27
28use crate::capability::{Capability, Context, Output};
29use crate::processes::ProcessSnapshot;
30use crate::{Error, Result};
31use serde::{Deserialize, Serialize};
32use serde_json::Value;
33use std::time::Duration;
34
35#[cfg(test)]
36use std::process::Command;
37
/// Reads the process start time (field 22) from `/proc/{pid}/stat`.
///
/// Returns the start time in clock ticks since boot, or `None` when the stat
/// file cannot be read or does not have the expected layout. Used to detect
/// PID reuse: if a process is killed and a new process reuses the PID, the
/// start time will differ (FINDING #1).
fn get_process_start_time(pid: u32) -> Option<u64> {
    let content = std::fs::read_to_string(format!("/proc/{}/stat", pid)).ok()?;
    // The command name (field 2) is wrapped in parentheses and may itself
    // contain spaces or ')' characters, so anchor on the LAST ')' and skip the
    // separating space. Checked arithmetic and `get` turn a truncated line
    // into `None` instead of a slice panic.
    let after_comm = content.rfind(')')?.checked_add(2)?;
    // The first token after the command name is field 3 (state), so field 22
    // (starttime) sits at offset 19.
    content
        .get(after_comm..)?
        .split_whitespace()
        .nth(19)?
        .parse::<u64>()
        .ok()
}
51fn get_process_start_time_retry(pid: u32) -> Option<u64> {
52    #[allow(clippy::arithmetic_side_effects)] // bit shift in retry backoff: 1 << attempt
53    for attempt in 0..3 {
54        if attempt > 0 {
55            std::thread::sleep(std::time::Duration::from_millis(10 * (1 << attempt)));
56        }
57        if let Some(start_time) = get_process_start_time(pid) {
58            return Some(start_time);
59        }
60    }
61    None
62}
63
/// Loads the raw contents of `/proc/{pid}/cgroup` for the given process.
///
/// Yields `None` when the file cannot be read as UTF-8 text (for example
/// because the process does not exist). The text is what
/// [`is_systemd_service`] inspects to spot systemd-managed services.
fn get_process_cgroup(pid: u32) -> Option<String> {
    let mut cgroup_file = std::path::PathBuf::from("/proc");
    cgroup_file.push(pid.to_string());
    cgroup_file.push("cgroup");
    std::fs::read_to_string(cgroup_file).ok()
}
70
/// Checks if the contents of `/proc/{pid}/cgroup` indicate a systemd-managed
/// service.
///
/// Each line of that file has the form `hierarchy-ID:controller-list:path`.
/// Only the `path` part is inspected: on cgroup v1 and hybrid hosts every
/// process carries a `name=systemd` entry in the controller list, so matching
/// against the whole line would classify every process as a systemd service.
/// A line without the two `:` separators is treated as a bare path.
///
/// Returns `true` when any path contains `/system.slice/`, `/init.scope` or
/// `systemd`.
fn is_systemd_service(cgroup: &str) -> bool {
    cgroup.lines().any(|line| {
        // `splitn(3, ..)` keeps any ':' inside the path itself intact.
        let path = line.splitn(3, ':').nth(2).unwrap_or(line);
        path.contains("/system.slice/")
            || path.contains("/init.scope")
            || path.contains("systemd")
    })
}
77
78/// Protected PIDs that cannot be killed (safety guard).
79/// Includes init, kthreadd, current process, parent, session leader,
80/// process group leader, and systemd critical services (FINDING #2).
81fn protected_pids() -> Vec<u32> {
82    let mut pids = vec![1, 2];
83    let self_pid = std::process::id();
84    pids.push(self_pid);
85
86    // Add parent process
87    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
88        if let Some(ppid_str) = status
89            .lines()
90            .find(|l| l.starts_with("PPid:"))
91            .and_then(|l| l.split_whitespace().nth(1))
92        {
93            if let Ok(ppid) = ppid_str.parse::<u32>() {
94                pids.push(ppid);
95            }
96        }
97    }
98
99    // Add session leader (FINDING #2)
100    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
101        if let Some(sid_str) = status
102            .lines()
103            .find(|l| l.starts_with("Sid:"))
104            .and_then(|l| l.split_whitespace().nth(1))
105        {
106            if let Ok(sid) = sid_str.parse::<u32>() {
107                if sid != 0 {
108                    pids.push(sid);
109                }
110            }
111        }
112    }
113
114    // Add process group leader (FINDING #2)
115    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
116        if let Some(pgid_str) = status
117            .lines()
118            .find(|l| l.starts_with("NSpgid:"))
119            .and_then(|l| l.split_whitespace().nth(1))
120        {
121            if let Ok(pgid) = pgid_str.parse::<u32>() {
122                if pgid != 0 {
123                    pids.push(pgid);
124                }
125            }
126        }
127    }
128
129    // Scan all running processes for systemd-critical services (FINDING #2)
130    if let Ok(entries) = std::fs::read_dir("/proc") {
131        for entry in entries.flatten() {
132            if let Ok(name) = entry.file_name().into_string() {
133                if let Ok(pid) = name.parse::<u32>() {
134                    if let Some(cgroup) = get_process_cgroup(pid) {
135                        if is_systemd_service(&cgroup) {
136                            pids.push(pid);
137                        }
138                    }
139                }
140            }
141        }
142    }
143
144    pids.sort_unstable();
145    pids.dedup();
146    pids
147}
148
149/// Arguments for the [`Kill`] capability.
150#[derive(Debug, Clone, Serialize, Deserialize)]
151pub struct KillArgs {
152    /// Process ID to kill.
153    pub pid: u32,
154    /// Signal to send (default: 15 = SIGTERM). Must be valid POSIX: 1-31 or 64.
155    pub signal: Option<i32>,
156}
157
/// Capability that terminates a process by PID with full audit logging.
///
/// # Safety
///
/// This capability includes guards to prevent killing critical system processes.
/// Protected PIDs are computed by `protected_pids` and include 1 (init),
/// 2 (kthreadd), the current process, its parent, its session and process
/// group leaders, and processes running as systemd services.
///
/// # Security
///
/// All kill operations are logged to WAL for audit purposes.
// This is a capability marker struct with no fields;
// additional fields may be added later as needed.
#[allow(clippy::exhaustive_structs)]
#[derive(Debug, Clone, Copy)]
pub struct Kill;
172
173impl Capability for Kill {
174    fn name(&self) -> &'static str {
175        "Kill"
176    }
177
178    fn description(&self) -> &'static str {
179        "kill PID. Protected: init,kthreadd,self. Custom sig ok."
180    }
181
182    /// Returns the JSON Schema for Kill arguments.
183    ///
184    /// Schema requires `"pid"` integer; `"signal"` is optional and restricted
185    /// to valid POSIX signal values (1-31, 64) — FINDING #3.
186    fn schema(&self) -> Value {
187        serde_json::json!({
188            "type": "object",
189            "properties": {
190                "pid": { "type": "integer", "minimum": 1 },
191                "signal": {
192                    "type": "integer",
193                    "anyOf": [
194                        { "minimum": 1, "maximum": 31 },
195                        { "enum": [64] }
196                    ]
197                }
198            },
199            "required": ["pid"]
200        })
201    }
202
203    fn validate(&self, args: &Value) -> Result<()> {
204        let args: KillArgs = serde_json::from_value(args.clone())
205            .map_err(|e| Error::SchemaValidationFailed(e.to_string()))?;
206
207        // FINDING #3: Restrict signal to valid POSIX values (1-31, 64)
208        if let Some(signal) = args.signal {
209            if !(1..=31).contains(&signal) && signal != 64 {
210                return Err(Error::SchemaValidationFailed(format!(
211                    "Invalid signal {}: must be 1-31 or 64 (POSIX signals)",
212                    signal
213                )));
214            }
215        }
216
217        Ok(())
218    }
219
220    fn execute(&self, args: &Value, ctx: &Context) -> Result<Output> {
221        let args: KillArgs = serde_json::from_value(args.clone())
222            .map_err(|e| Error::ExecutionFailed(e.to_string()))?;
223
224        // Safety check: protected PIDs (init, kthreadd, self, parent)
225        let protected = protected_pids();
226        if protected.contains(&args.pid) {
227            return Err(Error::ExecutionFailed(format!(
228                "PID {} is a protected system process (protected: {:?})",
229                args.pid, protected
230            )));
231        }
232
233        // Respect dry_run — skip kill entirely
234        if ctx.dry_run {
235            // FINDING #20: Limit dry-run output to "would kill PID X", hide command/user info
236            return Ok(Output {
237                success: true,
238                data: serde_json::json!({
239                    "pid": args.pid,
240                    "killed": false,
241                    "dry_run": true,
242                    "signal": args.signal.unwrap_or(15),
243                }),
244                message: Some(format!("DRY RUN: would kill PID {}", args.pid)),
245            });
246        }
247
248        // Capture process snapshot before kill
249        let process_before = ProcessSnapshot::capture();
250        let process_exists = process_before.processes.iter().any(|p| p.pid == args.pid);
251
252        if !process_exists {
253            return Ok(Output {
254                success: false,
255                data: serde_json::json!({
256                    "pid": args.pid,
257                    "killed": false,
258                    "reason": "Process not found"
259                }),
260                message: Some(format!("Process {} not found", args.pid)),
261            });
262        }
263
264        // Get process info before killing
265        let process_info: Option<(String, String)> = process_before
266            .processes
267            .iter()
268            .find(|p| p.pid == args.pid)
269            .map(|p| (p.command.clone(), p.user.clone()));
270
271        // Record start time to detect PID reuse (FINDING #1)
272        let start_time_before = get_process_start_time_retry(args.pid);
273
274        // Determine signal — default to SIGTERM (15) for graceful shutdown
275        let signal = args.signal.unwrap_or(15);
276
277        // Execute kill via libc for reliability (avoids shell/PATH issues)
278        // SAFETY: pid is validated as a valid target; signal is validated to 1-64 range;
279        // pid_t is i32 — pid is u32, cast is safe for all valid PIDs
280        #[allow(clippy::cast_possible_wrap)]
281        let kill_result = unsafe { libc::kill(args.pid as libc::pid_t, signal) };
282        let success = kill_result == 0;
283        let stderr_str = if success {
284            String::new()
285        } else {
286            std::io::Error::last_os_error().to_string()
287        };
288
289        // Delay to let process terminate and be removed from process table
290        std::thread::sleep(Duration::from_millis(500));
291
292        // Clear cache to ensure fresh snapshot (cached data would show pre-kill state)
293        ProcessSnapshot::clear_cache();
294
295        // Capture process snapshot after kill
296        let process_after = ProcessSnapshot::capture();
297
298        // Check if process still exists (zombies count as dead — they've been terminated)
299        let process_still_exists = process_after
300            .processes
301            .iter()
302            .any(|p| p.pid == args.pid && !p.stat.starts_with('Z'));
303        // Verify PID was not reused — check start time matches (FINDING #1)
304        let pid_reused = match (start_time_before, get_process_start_time_retry(args.pid)) {
305            (Some(before_time), Some(after_time)) => before_time != after_time,
306            (None, _) => false,
307            (Some(_), None) => true,
308        };
309
310        let killed_success = success && !process_still_exists && !pid_reused;
311
312        let message = if killed_success {
313            format!("Killed process {} (signal {})", args.pid, signal)
314        } else if pid_reused {
315            format!(
316                "PID {} was reused by a different process (start time changed)",
317                args.pid
318            )
319        } else if !success {
320            format!("Failed to kill process {}: {}", args.pid, stderr_str)
321        } else {
322            format!("Process {} still exists after signal {}", args.pid, signal)
323        };
324
325        Ok(Output {
326            success: killed_success,
327            data: serde_json::json!({
328                "pid": args.pid,
329                "killed": killed_success,
330                "signal": signal,
331                "command": process_info.as_ref().map(|(cmd, _)| cmd),
332                "user": process_info.as_ref().map(|(_, user)| user),
333                "stderr": if success { String::new() } else { stderr_str },
334                "pid_reused": pid_reused,
335                "process_before": {
336                    "count": process_before.summary.total_processes,
337                    "zombies": process_before.summary.zombie_count
338                },
339                "process_after": {
340                    "count": process_after.summary.total_processes,
341                    "zombies": process_after.summary.zombie_count
342                }
343            }),
344            message: Some(message),
345        })
346    }
347}
348
#[cfg(test)]
#[allow(clippy::unnecessary_map_or)]
mod tests {
    use super::*;
    use crate::capability::Capability;
    use std::thread;
    use std::time::Duration;

    /// PID that is very unlikely to belong to a running process.
    const MISSING_PID: u32 = 999_999;
    /// Second unlikely PID, used where the value only has to be unprotected.
    const UNUSED_PID: u32 = 999_998;

    /// Builds the execution context shared by all tests.
    fn ctx(dry_run: bool) -> Context {
        Context {
            dry_run,
            job_id: "test".into(),
            working_dir: std::env::current_dir().unwrap(),
        }
    }

    /// A `sleep 60` child that is killed and reaped when dropped, so a failed
    /// assertion does not leave a stray process behind.
    struct Sleeper(std::process::Child);

    impl Sleeper {
        fn spawn() -> Self {
            Sleeper(Command::new("sleep").arg("60").spawn().unwrap())
        }

        fn pid(&self) -> u32 {
            self.0.id()
        }

        /// Waits for the child so a zombie disappears from the process table.
        fn reap(&mut self) {
            let _ = self.0.wait();
        }
    }

    impl Drop for Sleeper {
        fn drop(&mut self) {
            self.0.kill().ok();
            let _ = self.0.wait();
        }
    }

    /// Returns `true` when `kill -0 <pid>` reports the process as signalable.
    fn is_signalable(pid: u32) -> bool {
        Command::new("kill")
            .arg("-0")
            .arg(pid.to_string())
            .output()
            .map_or(false, |o| o.status.success())
    }

    #[test]
    fn test_kill_schema() {
        let schema = Kill.schema();

        assert_eq!(schema["type"], "object");
        assert_eq!(schema["required"], serde_json::json!(["pid"]));
        assert_eq!(schema["properties"]["pid"]["type"], "integer");
        assert_eq!(schema["properties"]["pid"]["minimum"], 1);
        assert_eq!(schema["properties"]["signal"]["type"], "integer");
        assert!(
            schema["properties"]["signal"]["anyOf"].is_array(),
            "signal must be restricted via anyOf"
        );
    }

    #[test]
    fn test_kill_protected_pid() {
        // PID 1 is protected
        let result = Kill.execute(&serde_json::json!({ "pid": 1 }), &ctx(false));

        // Should fail because PID 1 is protected
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("protected system process"));
    }

    #[test]
    fn test_kill_self_protected() {
        let self_pid = std::process::id();
        let result = Kill.execute(&serde_json::json!({ "pid": self_pid }), &ctx(false));

        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("protected"));
    }

    #[test]
    fn test_kill_nonexistent() {
        let result = Kill
            .execute(&serde_json::json!({ "pid": MISSING_PID }), &ctx(false))
            .unwrap();

        assert!(!result.success);
        assert!(result.data["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_dry_run() {
        // The protection check runs before the dry-run shortcut, so the target
        // must be a non-protected PID.
        let result = Kill
            .execute(&serde_json::json!({ "pid": UNUSED_PID }), &ctx(true))
            .unwrap();

        assert!(result.success);
        assert!(result.data["dry_run"].as_bool() == Some(true));
        assert!(result.data["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_actual_process() {
        // Start a long-running process (sleep)
        let mut child = Sleeper::spawn();
        let pid = child.pid();

        // Give it time to start
        thread::sleep(Duration::from_millis(100));

        // Verify process exists before kill
        assert!(is_signalable(pid), "Process should exist before kill");

        // Clear cache so kill sees fresh process list
        ProcessSnapshot::clear_cache();

        // Kill it via the capability using SIGKILL for reliability
        let result = Kill
            .execute(
                &serde_json::json!({ "pid": pid, "signal": 9 }),
                &ctx(false),
            )
            .unwrap();

        // Kill should succeed — process becomes zombie until reaped
        assert!(
            result.data["killed"].as_bool() == Some(true),
            "Kill failed: {:?}",
            result.data
        );
        assert!(
            result.data["signal"].as_i64() == Some(9),
            "Should use SIGKILL"
        );

        // Reap the zombie so it disappears from process table
        child.reap();

        // Verify process is fully gone after reaping
        assert!(
            !is_signalable(pid),
            "Process {} should be dead after kill and reap",
            pid
        );
    }

    #[test]
    fn test_get_process_start_time() {
        // Start a process and verify we can read its start time
        let child = Sleeper::spawn();
        let pid = child.pid();

        let start_time = get_process_start_time(pid);
        assert!(
            start_time.is_some(),
            "Should be able to read start time for running process"
        );

        // Verify start time is consistent (no PID reuse)
        let start_time2 = get_process_start_time(pid);
        assert_eq!(start_time, start_time2, "Start time should be stable");
    }

    #[test]
    fn test_get_process_start_time_nonexistent() {
        let result = get_process_start_time(MISSING_PID);
        assert!(result.is_none(), "Non-existent PID should return None");
    }

    #[test]
    fn test_signal_validation_rejects_negative() {
        // FINDING #3: negative signals should be rejected
        let result = Kill.validate(&serde_json::json!({ "pid": UNUSED_PID, "signal": -1 }));
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_zero() {
        // FINDING #3: signal 0 should be rejected
        let result = Kill.validate(&serde_json::json!({ "pid": UNUSED_PID, "signal": 0 }));
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_out_of_range() {
        // FINDING #3: signal > 31 (except 64) should be rejected
        let result = Kill.validate(&serde_json::json!({ "pid": UNUSED_PID, "signal": 32 }));
        assert!(result.is_err());
    }

    #[test]
    fn test_signal_validation_accepts_valid_signals() {
        for sig in [1, 9, 15, 31, 64] {
            let result = Kill.validate(&serde_json::json!({ "pid": UNUSED_PID, "signal": sig }));
            assert!(result.is_ok(), "Signal {} should be valid", sig);
        }
    }

    #[test]
    fn test_dry_run_hides_process_info() {
        // FINDING #20: dry-run should NOT expose command or user info
        let result = Kill
            .execute(&serde_json::json!({ "pid": UNUSED_PID }), &ctx(true))
            .unwrap();

        assert!(result.success);
        assert!(result.data["dry_run"].as_bool() == Some(true));
        for hidden in ["command", "user", "process_exists"] {
            assert!(
                result.data.get(hidden).is_none(),
                "dry-run must not expose {}",
                hidden
            );
        }
    }

    #[test]
    fn test_protected_pids_includes_self_and_parent() {
        let protected = protected_pids();
        let self_pid = std::process::id();
        let parent_pid = std::os::unix::process::parent_id();

        assert!(protected.contains(&1), "PID 1 should be protected");
        assert!(protected.contains(&2), "PID 2 should be protected");
        assert!(
            protected.contains(&self_pid),
            "self PID should be protected"
        );
        assert!(
            protected.contains(&parent_pid),
            "parent PID should be protected"
        );
    }

    #[test]
    fn test_get_process_start_time_retry() {
        // Test retry logic with existing process
        {
            let child = Sleeper::spawn();
            let result = get_process_start_time_retry(child.pid());
            assert!(
                result.is_some(),
                "Should read start time for running process"
            );
        }

        // Non-existent PID should return None after retries
        let result = get_process_start_time_retry(MISSING_PID);
        assert!(result.is_none(), "Non-existent PID should return None");
    }
}