Skip to main content

runtimo_core/capabilities/
kill.rs

1//! Kill capability — terminate runaway processes by PID with full audit trail.
2//!
3//! Kills a process by PID with full telemetry capture and WAL logging.
4//! Includes safety checks to prevent killing critical system processes.
5//!
6//! # PID Reuse Protection (FINDING #1)
7//!
8//! After sending a signal, the capability verifies the killed process is the
9//! same one by comparing start times from `/proc/{pid}/stat` field 22. This
10//! prevents PID reuse races where a new process inherits the killed PID.
11//!
12//! # Example
13//!
14//! ```rust,ignore
15//! use runtimo_core::capabilities::Kill;
16//! use runtimo_core::capability::{Capability, Context};
17//! use serde_json::json;
18//!
19//! let cap = Kill;
20//! let result = cap.execute(
21//!     &json!({"pid": 12345}),
22//!     &Context { dry_run: false, job_id: "test".into(), ..Default::default() }
23//! ).unwrap();
24//!
25//! assert!(result.success);
26//! ```
27
28use crate::capability::{Capability, Context, Output};
29use crate::processes::ProcessSnapshot;
30use crate::{Error, Result};
31use serde::{Deserialize, Serialize};
32use serde_json::Value;
33use std::time::Duration;
34
35#[cfg(test)]
36use std::process::Command;
37
/// Extracts the start time (field 22) from the text of a `/proc/{pid}/stat`
/// record.
///
/// The command name (field 2) is wrapped in parentheses and may itself contain
/// spaces or `)`, so parsing resumes after the *last* `)` in the record. The
/// first token after it is field 3, which puts field 22 at offset 19.
///
/// Returns `None` when there is no `)`, fewer than 20 tokens follow it, or the
/// token is not an unsigned integer. Never panics on truncated input.
fn parse_stat_start_time(stat: &str) -> Option<u64> {
    let (_, after_comm) = stat.rsplit_once(')')?;
    after_comm.split_whitespace().nth(19)?.parse::<u64>().ok()
}

/// Reads the process start time (field 22) from `/proc/{pid}/stat`.
///
/// Returns start time in clock ticks since boot. Used to detect PID reuse:
/// if a process is killed and a new process reuses the PID, the start time
/// will differ (FINDING #1).
///
/// Returns `None` when the stat file cannot be read (for example because no
/// such process exists) or its contents cannot be parsed.
fn get_process_start_time(pid: u32) -> Option<u64> {
    let stat_path = format!("/proc/{}/stat", pid);
    let content = std::fs::read_to_string(stat_path).ok()?;
    parse_stat_start_time(&content)
}
51fn get_process_start_time_retry(pid: u32) -> Option<u64> {
52    #[allow(clippy::arithmetic_side_effects)] // bit shift in retry backoff: 1 << attempt
53    for attempt in 0..3 {
54        if attempt > 0 {
55            std::thread::sleep(std::time::Duration::from_millis(10 * (1 << attempt)));
56        }
57        if let Some(start_time) = get_process_start_time(pid) {
58            return Some(start_time);
59        }
60    }
61    None
62}
63
/// Loads the raw contents of `/proc/{pid}/cgroup`.
///
/// Returns the file text unmodified, or `None` when the file cannot be read
/// (for example because no process with that PID exists). Callers use the text
/// to detect systemd-managed services.
fn get_process_cgroup(pid: u32) -> Option<String> {
    let cgroup_file = format!("/proc/{}/cgroup", pid);
    match std::fs::read_to_string(cgroup_file) {
        Ok(contents) => Some(contents),
        Err(_) => None,
    }
}
70
/// Checks if a cgroup description indicates a systemd-managed service.
///
/// `cgroup` is the text of `/proc/{pid}/cgroup`: one
/// `hierarchy-id:controller-list:path` entry per line. Only the *path* part of
/// each entry is inspected. Matching against the whole line would also hit the
/// cgroup-v1 hierarchy name `name=systemd`, which appears in the controller
/// list of every process on v1/hybrid hosts and would therefore classify every
/// process as a systemd service.
///
/// A line that does not have the three colon-separated parts is treated as a
/// bare cgroup path. Returns `true` when any path contains `/system.slice/`,
/// `/init.scope`, or `systemd`.
fn is_systemd_service(cgroup: &str) -> bool {
    cgroup.lines().any(|entry| {
        // The path is everything after the second ':'; fall back to the whole
        // entry when the caller passed a bare path.
        let path = entry.splitn(3, ':').nth(2).unwrap_or(entry);
        path.contains("/system.slice/")
            || path.contains("/init.scope")
            || path.contains("systemd")
    })
}
77
78/// Protected PIDs that cannot be killed (safety guard).
79/// Includes init, kthreadd, current process, parent, session leader,
80/// process group leader, and systemd critical services (FINDING #2).
81fn protected_pids() -> Vec<u32> {
82    let mut pids = vec![1, 2];
83    let self_pid = std::process::id();
84    pids.push(self_pid);
85
86    // Add parent process
87    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
88        if let Some(ppid_str) = status
89            .lines()
90            .find(|l| l.starts_with("PPid:"))
91            .and_then(|l| l.split_whitespace().nth(1))
92        {
93            if let Ok(ppid) = ppid_str.parse::<u32>() {
94                pids.push(ppid);
95            }
96        }
97    }
98
99    // Add session leader (FINDING #2)
100    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
101        if let Some(sid_str) = status
102            .lines()
103            .find(|l| l.starts_with("Sid:"))
104            .and_then(|l| l.split_whitespace().nth(1))
105        {
106            if let Ok(sid) = sid_str.parse::<u32>() {
107                if sid != 0 {
108                    pids.push(sid);
109                }
110            }
111        }
112    }
113
114    // Add process group leader (FINDING #2)
115    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
116        if let Some(pgid_str) = status
117            .lines()
118            .find(|l| l.starts_with("NSpgid:"))
119            .and_then(|l| l.split_whitespace().nth(1))
120        {
121            if let Ok(pgid) = pgid_str.parse::<u32>() {
122                if pgid != 0 {
123                    pids.push(pgid);
124                }
125            }
126        }
127    }
128
129    // Scan all running processes for systemd-critical services (FINDING #2)
130    if let Ok(entries) = std::fs::read_dir("/proc") {
131        for entry in entries.flatten() {
132            if let Ok(name) = entry.file_name().into_string() {
133                if let Ok(pid) = name.parse::<u32>() {
134                    if let Some(cgroup) = get_process_cgroup(pid) {
135                        if is_systemd_service(&cgroup) {
136                            pids.push(pid);
137                        }
138                    }
139                }
140            }
141        }
142    }
143
144    pids.sort_unstable();
145    pids.dedup();
146    pids
147}
148
149/// Arguments for the [`Kill`] capability.
150#[derive(Debug, Clone, Serialize, Deserialize)]
151pub struct KillArgs {
152    /// Process ID to kill.
153    pub pid: u32,
154    /// Signal to send (default: 15 = SIGTERM). Must be valid POSIX: 1-31 or 64.
155    pub signal: Option<i32>,
156}
157
/// Capability that sends a signal to a process identified by PID.
///
/// # Safety
///
/// Requests aimed at a protected PID are refused before any signal is sent.
/// The protected set always contains PID 1 (init), PID 2 (kthreadd) and the
/// current process; see `protected_pids` for the remaining entries.
///
/// # Security
///
/// All kill operations are logged to WAL for audit purposes.
// Marker type: it carries no state today. Fields may be added later, hence the
// explicit opt-out from the exhaustive-struct lint.
#[allow(clippy::exhaustive_structs)]
pub struct Kill;
172
173impl Capability for Kill {
174    fn name(&self) -> &'static str {
175        "Kill"
176    }
177
178    fn description(&self) -> &'static str {
179        "kill PID. Protected: init,kthreadd,self. Custom sig ok."
180    }
181
182    /// Returns the JSON Schema for Kill arguments.
183    ///
184    /// Schema requires `"pid"` integer; `"signal"` is optional and restricted
185    /// to valid POSIX signal values (1-31, 64) — FINDING #3.
186    fn schema(&self) -> Value {
187        serde_json::json!({
188            "type": "object",
189            "properties": {
190                "pid": { "type": "integer", "minimum": 1 },
191                "signal": {
192                    "type": "integer",
193                    "anyOf": [
194                        { "minimum": 1, "maximum": 31 },
195                        { "enum": [64] }
196                    ]
197                }
198            },
199            "required": ["pid"]
200        })
201    }
202
203    fn validate(&self, args: &Value) -> Result<()> {
204        let args: KillArgs = serde_json::from_value(args.clone())
205            .map_err(|e| Error::SchemaValidationFailed(e.to_string()))?;
206
207        // FINDING #3: Restrict signal to valid POSIX values (1-31, 64)
208        if let Some(signal) = args.signal {
209            if !(1..=31).contains(&signal) && signal != 64 {
210                return Err(Error::SchemaValidationFailed(format!(
211                    "Invalid signal {}: must be 1-31 or 64 (POSIX signals)",
212                    signal
213                )));
214            }
215        }
216
217        Ok(())
218    }
219
220    fn execute(&self, args: &Value, ctx: &Context) -> Result<Output> {
221        let args: KillArgs = serde_json::from_value(args.clone())
222            .map_err(|e| Error::ExecutionFailed(e.to_string()))?;
223
224        // Safety check: protected PIDs (init, kthreadd, self, parent)
225        let protected = protected_pids();
226        if protected.contains(&args.pid) {
227            return Err(Error::ExecutionFailed(format!(
228                "PID {} is a protected system process (protected: {:?})",
229                args.pid, protected
230            )));
231        }
232
233        // Respect dry_run — skip kill entirely
234        if ctx.dry_run {
235            // FINDING #20: Limit dry-run output to "would kill PID X", hide command/user info
236            return Ok(Output {
237                success: true,
238                data: serde_json::json!({
239                    "pid": args.pid,
240                    "killed": false,
241                    "dry_run": true,
242                    "signal": args.signal.unwrap_or(15),
243                }),
244                message: Some(format!("DRY RUN: would kill PID {}", args.pid)),
245            });
246        }
247
248        // Capture process snapshot before kill
249        let process_before = ProcessSnapshot::capture();
250        let process_exists = process_before.processes.iter().any(|p| p.pid == args.pid);
251
252        if !process_exists {
253            return Ok(Output {
254                success: false,
255                data: serde_json::json!({
256                    "pid": args.pid,
257                    "killed": false,
258                    "reason": "Process not found"
259                }),
260                message: Some(format!("Process {} not found", args.pid)),
261            });
262        }
263
264        // Get process info before killing
265        let process_info: Option<(String, String)> = process_before
266            .processes
267            .iter()
268            .find(|p| p.pid == args.pid)
269            .map(|p| (p.command.clone(), p.user.clone()));
270
271        // Record start time to detect PID reuse (FINDING #1)
272        let start_time_before = get_process_start_time_retry(args.pid);
273
274        // Double-check: re-read start time to narrow TOCTOU window.
275        // If the PID was recycled between these reads, abort the kill.
276        let start_time_before_confirm = get_process_start_time_retry(args.pid);
277        if start_time_before != start_time_before_confirm {
278            return Ok(Output {
279                success: false,
280                data: serde_json::json!({
281                    "pid": args.pid,
282                    "killed": false,
283                    "reason": "PID reused between safety checks",
284                    "pid_reused": true,
285                }),
286                message: Some(format!(
287                    "PID {} was reused by a different process (start time changed before kill)",
288                    args.pid
289                )),
290            });
291        }
292
293        // Determine signal — default to SIGTERM (15) for graceful shutdown
294        let signal = args.signal.unwrap_or(15);
295
296        // Execute kill via libc for reliability (avoids shell/PATH issues)
297        // SAFETY: pid is validated as a valid target; signal is validated to 1-64 range;
298        // pid_t is i32 — pid is u32, cast is safe for all valid PIDs
299        #[allow(clippy::cast_possible_wrap)]
300        let kill_result = unsafe { libc::kill(args.pid as libc::pid_t, signal) };
301        let success = kill_result == 0;
302        let stderr_str = if success {
303            String::new()
304        } else {
305            std::io::Error::last_os_error().to_string()
306        };
307
308        // Delay to let process terminate and be removed from process table
309        std::thread::sleep(Duration::from_millis(500));
310
311        // Clear cache to ensure fresh snapshot (cached data would show pre-kill state)
312        ProcessSnapshot::clear_cache();
313
314        // Capture process snapshot after kill
315        let process_after = ProcessSnapshot::capture();
316
317        // Check if process still exists (zombies count as dead — they've been terminated)
318        let process_still_exists = process_after
319            .processes
320            .iter()
321            .any(|p| p.pid == args.pid && !p.stat.starts_with('Z'));
322        // Verify PID was not reused — check start time matches (FINDING #1)
323        let pid_reused = match (start_time_before, get_process_start_time_retry(args.pid)) {
324            (Some(before_time), Some(after_time)) => before_time != after_time,
325            (None, _) => false,
326            (Some(_), None) => true,
327        };
328
329        let killed_success = success && !process_still_exists && !pid_reused;
330
331        let message = if killed_success {
332            format!("Killed process {} (signal {})", args.pid, signal)
333        } else if pid_reused {
334            format!(
335                "PID {} was reused by a different process (start time changed)",
336                args.pid
337            )
338        } else if !success {
339            format!("Failed to kill process {}: {}", args.pid, stderr_str)
340        } else {
341            format!("Process {} still exists after signal {}", args.pid, signal)
342        };
343
344        Ok(Output {
345            success: killed_success,
346            data: serde_json::json!({
347                "pid": args.pid,
348                "killed": killed_success,
349                "signal": signal,
350                "command": process_info.as_ref().map(|(cmd, _)| cmd),
351                "user": process_info.as_ref().map(|(_, user)| user),
352                "stderr": if success { String::new() } else { stderr_str },
353                "pid_reused": pid_reused,
354                "process_before": {
355                    "count": process_before.summary.total_processes,
356                    "zombies": process_before.summary.zombie_count
357                },
358                "process_after": {
359                    "count": process_after.summary.total_processes,
360                    "zombies": process_after.summary.zombie_count
361                }
362            }),
363            message: Some(message),
364        })
365    }
366}
367
#[cfg(test)]
mod tests {
    use super::*;
    use crate::capability::Capability;
    use std::thread;
    use std::time::Duration;

    /// A PID that cannot belong to a live process: it is above the Linux
    /// `PID_MAX_LIMIT` (2^22) while still fitting in `pid_t`.
    const ABSENT_PID: u32 = 2_000_000_000;

    /// Builds the execution context shared by the tests below.
    fn test_ctx(dry_run: bool) -> Context {
        Context {
            dry_run,
            job_id: "test".into(),
            working_dir: std::env::current_dir().unwrap(),
        }
    }

    #[test]
    fn test_kill_schema() {
        let schema = Kill.schema();

        assert_eq!(schema["type"].as_str(), Some("object"));
        assert_eq!(schema["required"], serde_json::json!(["pid"]));
        assert_eq!(
            schema["properties"]["pid"]["type"].as_str(),
            Some("integer")
        );
        assert_eq!(schema["properties"]["pid"]["minimum"].as_u64(), Some(1));
        assert_eq!(
            schema["properties"]["signal"]["type"].as_str(),
            Some("integer")
        );
    }

    #[test]
    fn test_kill_protected_pid() {
        // PID 1 is protected
        let result = Kill.execute(&serde_json::json!({ "pid": 1 }), &test_ctx(false));

        // Should fail because PID 1 is protected
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("protected system process"));
    }

    #[test]
    fn test_kill_self_protected() {
        let self_pid = std::process::id();
        let result = Kill.execute(&serde_json::json!({ "pid": self_pid }), &test_ctx(false));

        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("protected"));
    }

    #[test]
    fn test_kill_nonexistent() {
        let result = Kill
            .execute(&serde_json::json!({ "pid": ABSENT_PID }), &test_ctx(false))
            .unwrap();

        assert!(!result.success);
        assert!(result.data["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_dry_run() {
        // The protection check runs before the dry-run short-circuit, so the
        // target must be a PID that is not protected.
        let result = Kill
            .execute(&serde_json::json!({ "pid": ABSENT_PID }), &test_ctx(true))
            .unwrap();

        assert!(result.success);
        assert!(result.data["dry_run"].as_bool() == Some(true));
        assert!(result.data["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_actual_process() {
        // Start a long-running process (sleep)
        let mut child = Command::new("sleep").arg("60").spawn().unwrap();
        let pid = child.id();

        // Give it time to start
        thread::sleep(Duration::from_millis(100));

        // Verify process exists before kill
        let pre_check = Command::new("kill").arg("-0").arg(pid.to_string()).output();
        assert!(
            pre_check.unwrap().status.success(),
            "Process should exist before kill"
        );

        // Clear cache so kill sees fresh process list
        ProcessSnapshot::clear_cache();

        // Kill it via the capability using SIGKILL for reliability
        let result = Kill
            .execute(
                &serde_json::json!({ "pid": pid, "signal": 9 }),
                &test_ctx(false),
            )
            .unwrap();

        // Kill should succeed — process becomes zombie until reaped
        assert!(
            result.data["killed"].as_bool() == Some(true),
            "Kill failed: {:?}",
            result.data
        );
        assert!(
            result.data["signal"].as_i64() == Some(9),
            "Should use SIGKILL"
        );

        // Reap the zombie so it disappears from process table
        let _ = child.wait();

        // Verify process is fully gone after reaping
        let post_check = Command::new("kill").arg("-0").arg(pid.to_string()).output();
        let still_alive = matches!(post_check, Ok(ref out) if out.status.success());
        assert!(
            !still_alive,
            "Process {} should be dead after kill and reap",
            pid
        );
    }

    #[test]
    fn test_get_process_start_time() {
        // Start a process and verify we can read its start time
        let mut child = Command::new("sleep").arg("60").spawn().unwrap();
        let pid = child.id();

        let start_time = get_process_start_time(pid);
        let start_time2 = get_process_start_time(pid);

        // Clean up before asserting so a failure does not leave the child running.
        child.kill().ok();
        let _ = child.wait();

        assert!(
            start_time.is_some(),
            "Should be able to read start time for running process"
        );
        assert_eq!(start_time, start_time2, "Start time should be stable");
    }

    #[test]
    fn test_get_process_start_time_nonexistent() {
        let result = get_process_start_time(ABSENT_PID);
        assert!(result.is_none(), "Non-existent PID should return None");
    }

    #[test]
    fn test_signal_validation_rejects_negative() {
        // FINDING #3: negative signals should be rejected
        let result = Kill.validate(&serde_json::json!({ "pid": ABSENT_PID, "signal": -1 }));
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_zero() {
        // FINDING #3: signal 0 should be rejected
        let result = Kill.validate(&serde_json::json!({ "pid": ABSENT_PID, "signal": 0 }));
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_out_of_range() {
        // FINDING #3: signal > 31 (except 64) should be rejected
        let result = Kill.validate(&serde_json::json!({ "pid": ABSENT_PID, "signal": 32 }));
        assert!(result.is_err());
    }

    #[test]
    fn test_signal_validation_accepts_valid_signals() {
        for sig in [1, 9, 15, 31, 64] {
            let result = Kill.validate(&serde_json::json!({ "pid": ABSENT_PID, "signal": sig }));
            assert!(result.is_ok(), "Signal {} should be valid", sig);
        }
    }

    #[test]
    fn test_dry_run_hides_process_info() {
        // FINDING #20: dry-run should NOT expose command or user info
        let result = Kill
            .execute(&serde_json::json!({ "pid": ABSENT_PID }), &test_ctx(true))
            .unwrap();

        assert!(result.success);
        assert!(result.data["dry_run"].as_bool() == Some(true));
        for hidden in ["command", "user", "process_exists"] {
            assert!(
                result.data.get(hidden).is_none(),
                "dry-run must not expose {}",
                hidden
            );
        }
    }

    #[test]
    fn test_protected_pids_includes_self_and_parent() {
        let protected = protected_pids();
        let self_pid = std::process::id();
        let parent_pid = std::os::unix::process::parent_id();

        assert!(protected.contains(&1), "PID 1 should be protected");
        assert!(protected.contains(&2), "PID 2 should be protected");
        assert!(
            protected.contains(&self_pid),
            "self PID should be protected"
        );
        assert!(
            protected.contains(&parent_pid),
            "parent PID should be protected"
        );
    }

    #[test]
    fn test_get_process_start_time_retry() {
        // Test retry logic with existing process
        let mut child = Command::new("sleep").arg("60").spawn().unwrap();
        let pid = child.id();

        let result = get_process_start_time_retry(pid);

        child.kill().ok();
        let _ = child.wait();

        assert!(
            result.is_some(),
            "Should read start time for running process"
        );

        // Non-existent PID should return None after retries
        let result = get_process_start_time_retry(ABSENT_PID);
        assert!(result.is_none(), "Non-existent PID should return None");
    }
}