Skip to main content

runtimo_core/capabilities/
kill.rs

1//! Kill capability — terminate runaway processes by PID with full audit trail.
2//!
3//! Kills a process by PID with full telemetry capture and WAL logging.
4//! Includes safety checks to prevent killing critical system processes.
5//!
6//! # PID Reuse Protection (FINDING #1)
7//!
8//! After sending a signal, the capability verifies the killed process is the
9//! same one by comparing start times from `/proc/{pid}/stat` field 22. This
10//! prevents PID reuse races where a new process inherits the killed PID.
11//!
12//! # Example
13//!
14//! ```rust,ignore
15//! use runtimo_core::capabilities::Kill;
16//! use runtimo_core::capability::{Capability, Context};
17//! use serde_json::json;
18//!
19//! let cap = Kill;
20//! let result = cap.execute(
21//!     &json!({"pid": 12345}),
22//!     &Context { dry_run: false, job_id: "test".into(), ..Default::default() }
23//! ).unwrap();
24//!
25//! assert!(result.success);
26//! ```
27
28use crate::capability::{Capability, Context, Output};
29use crate::processes::ProcessSnapshot;
30use crate::{Error, Result};
31use serde::{Deserialize, Serialize};
32use serde_json::Value;
33use std::time::Duration;
34
35#[cfg(test)]
36use std::process::Command;
37
/// Reads the process start time (field 22) from `/proc/{pid}/stat`.
///
/// Returns the start time in clock ticks since boot, or `None` when the stat
/// file cannot be read or does not have the expected layout. Used to detect
/// PID reuse: if a process is killed and a new process reuses the PID, the
/// start time will differ (FINDING #1).
fn get_process_start_time(pid: u32) -> Option<u64> {
    let content = std::fs::read_to_string(format!("/proc/{}/stat", pid)).ok()?;
    // Field 2 (comm) is parenthesised and may itself contain spaces or ')',
    // so field counting has to start after the *last* ')'.
    let last_paren = content.rfind(')')?;
    // Checked slicing: a truncated line that ends right at ')' yields `None`
    // instead of panicking on an out-of-range index.
    let after_comm = content.get(last_paren + 1..)?;
    // The first token after comm is field 3 (state), so field 22 (starttime)
    // is the token at zero-based index 19.
    after_comm.split_whitespace().nth(19)?.parse::<u64>().ok()
}
50fn get_process_start_time_retry(pid: u32) -> Option<u64> {
51    for attempt in 0..3 {
52        if attempt > 0 {
53            std::thread::sleep(std::time::Duration::from_millis(10 * (1 << attempt)));
54        }
55        if let Some(start_time) = get_process_start_time(pid) {
56            return Some(start_time);
57        }
58    }
59    None
60}
61
/// Loads the raw contents of `/proc/{pid}/cgroup` for the given process.
///
/// Yields `None` when the file cannot be read. The returned text is what
/// callers inspect to decide whether the process is a systemd-managed service.
fn get_process_cgroup(pid: u32) -> Option<String> {
    let cgroup_file = std::path::Path::new("/proc")
        .join(pid.to_string())
        .join("cgroup");
    std::fs::read_to_string(cgroup_file).ok()
}
68
/// Checks whether cgroup information indicates a systemd-managed service.
///
/// `cgroup` is normally the full contents of `/proc/{pid}/cgroup`, where each
/// line has the form `hierarchy-ID:controller-list:cgroup-path`. Only the
/// path component of each line is inspected. On systemd hosts using the
/// cgroup v1 or hybrid layout every process has a `name=systemd` hierarchy
/// line, so matching against the whole line would classify every process as
/// a protected service.
///
/// A line without the two `:` separators is treated as a bare cgroup path.
fn is_systemd_service(cgroup: &str) -> bool {
    cgroup.lines().any(|line| {
        // `splitn(3, ..)` keeps any further ':' inside the path itself.
        let path = line.splitn(3, ':').nth(2).unwrap_or(line);
        path.contains("/system.slice/")
            || path.contains("/init.scope")
            || path.contains("systemd")
    })
}
75
76/// Protected PIDs that cannot be killed (safety guard).
77/// Includes init, kthreadd, current process, parent, session leader,
78/// process group leader, and systemd critical services (FINDING #2).
79fn protected_pids() -> Vec<u32> {
80    let mut pids = vec![1, 2];
81    let self_pid = std::process::id();
82    pids.push(self_pid);
83
84    // Add parent process
85    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
86        if let Some(ppid_str) = status
87            .lines()
88            .find(|l| l.starts_with("PPid:"))
89            .and_then(|l| l.split_whitespace().nth(1))
90        {
91            if let Ok(ppid) = ppid_str.parse::<u32>() {
92                pids.push(ppid);
93            }
94        }
95    }
96
97    // Add session leader (FINDING #2)
98    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
99        if let Some(sid_str) = status
100            .lines()
101            .find(|l| l.starts_with("Sid:"))
102            .and_then(|l| l.split_whitespace().nth(1))
103        {
104            if let Ok(sid) = sid_str.parse::<u32>() {
105                if sid != 0 {
106                    pids.push(sid);
107                }
108            }
109        }
110    }
111
112    // Add process group leader (FINDING #2)
113    if let Ok(status) = std::fs::read_to_string(format!("/proc/{}/status", self_pid)) {
114        if let Some(pgid_str) = status
115            .lines()
116            .find(|l| l.starts_with("NSpgid:"))
117            .and_then(|l| l.split_whitespace().nth(1))
118        {
119            if let Ok(pgid) = pgid_str.parse::<u32>() {
120                if pgid != 0 {
121                    pids.push(pgid);
122                }
123            }
124        }
125    }
126
127    // Scan all running processes for systemd-critical services (FINDING #2)
128    if let Ok(entries) = std::fs::read_dir("/proc") {
129        for entry in entries.flatten() {
130            if let Ok(name) = entry.file_name().into_string() {
131                if let Ok(pid) = name.parse::<u32>() {
132                    if let Some(cgroup) = get_process_cgroup(pid) {
133                        if is_systemd_service(&cgroup) {
134                            pids.push(pid);
135                        }
136                    }
137                }
138            }
139        }
140    }
141
142    pids.sort();
143    pids.dedup();
144    pids
145}
146
/// Arguments for the [`Kill`] capability.
///
/// Deserialized from the JSON argument object passed to `validate` and
/// `execute`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KillArgs {
    /// Process ID to kill.
    pub pid: u32,
    /// Signal to send (default: 15 = SIGTERM). Must be valid POSIX: 1-31 or 64.
    /// The range is enforced by `validate`, not by deserialization itself.
    pub signal: Option<i32>,
}
155
/// Capability that terminates a process by PID with full audit logging.
///
/// # Safety
///
/// This capability includes guards to prevent killing critical system processes.
/// Protected PIDs include: 1 (init), 2 (kthreadd), the current process, its
/// parent, its session and process-group leaders, and processes whose cgroup
/// marks them as systemd-managed services.
///
/// # Security
///
/// All kill operations are logged to WAL for audit purposes.
/// NOTE(review): no WAL write happens in this module itself — presumably the
/// capability runner records it; confirm against the caller.
pub struct Kill;
167
168impl Capability for Kill {
169    fn name(&self) -> &'static str {
170        "Kill"
171    }
172
173    fn description(&self) -> &'static str {
174        "Terminate a process by PID. Protects critical system processes (init, kthreadd, self). Supports custom signals."
175    }
176
177    /// Returns the JSON Schema for Kill arguments.
178    ///
179    /// Schema requires `"pid"` integer; `"signal"` is optional and restricted
180    /// to valid POSIX signal values (1-31, 64) — FINDING #3.
181    fn schema(&self) -> Value {
182        serde_json::json!({
183            "type": "object",
184            "properties": {
185                "pid": { "type": "integer", "minimum": 1 },
186                "signal": {
187                    "type": "integer",
188                    "anyOf": [
189                        { "minimum": 1, "maximum": 31 },
190                        { "enum": [64] }
191                    ]
192                }
193            },
194            "required": ["pid"]
195        })
196    }
197
198    fn validate(&self, args: &Value) -> Result<()> {
199        let args: KillArgs = serde_json::from_value(args.clone())
200            .map_err(|e| Error::SchemaValidationFailed(e.to_string()))?;
201
202        // FINDING #3: Restrict signal to valid POSIX values (1-31, 64)
203        if let Some(signal) = args.signal {
204            if !(1..=31).contains(&signal) && signal != 64 {
205                return Err(Error::SchemaValidationFailed(format!(
206                    "Invalid signal {}: must be 1-31 or 64 (POSIX signals)",
207                    signal
208                )));
209            }
210        }
211
212        Ok(())
213    }
214
215    fn execute(&self, args: &Value, ctx: &Context) -> Result<Output> {
216        let args: KillArgs = serde_json::from_value(args.clone())
217            .map_err(|e| Error::ExecutionFailed(e.to_string()))?;
218
219        // Safety check: protected PIDs (init, kthreadd, self, parent)
220        let protected = protected_pids();
221        if protected.contains(&args.pid) {
222            return Err(Error::ExecutionFailed(format!(
223                "PID {} is a protected system process (protected: {:?})",
224                args.pid, protected
225            )));
226        }
227
228        // Respect dry_run — skip kill entirely
229        if ctx.dry_run {
230            // FINDING #20: Limit dry-run output to "would kill PID X", hide command/user info
231            return Ok(Output {
232                success: true,
233                data: serde_json::json!({
234                    "pid": args.pid,
235                    "killed": false,
236                    "dry_run": true,
237                    "signal": args.signal.unwrap_or(15),
238                }),
239                message: Some(format!("DRY RUN: would kill PID {}", args.pid)),
240            });
241        }
242
243        // Capture process snapshot before kill
244        let process_before = ProcessSnapshot::capture();
245        let process_exists = process_before.processes.iter().any(|p| p.pid == args.pid);
246
247        if !process_exists {
248            return Ok(Output {
249                success: false,
250                data: serde_json::json!({
251                    "pid": args.pid,
252                    "killed": false,
253                    "reason": "Process not found"
254                }),
255                message: Some(format!("Process {} not found", args.pid)),
256            });
257        }
258
259        // Get process info before killing
260        let process_info: Option<(String, String)> = process_before
261            .processes
262            .iter()
263            .find(|p| p.pid == args.pid)
264            .map(|p| (p.command.clone(), p.user.clone()));
265
266        // Record start time to detect PID reuse (FINDING #1)
267let start_time_before = get_process_start_time_retry(args.pid);
268
269        // Determine signal — default to SIGTERM (15) for graceful shutdown
270        let signal = args.signal.unwrap_or(15);
271
272        // Execute kill via libc for reliability (avoids shell/PATH issues)
273        let kill_result = unsafe { libc::kill(args.pid as libc::pid_t, signal) };
274        let success = kill_result == 0;
275        let stderr_str = if !success {
276            std::io::Error::last_os_error().to_string()
277        } else {
278            String::new()
279        };
280
281        // Delay to let process terminate and be removed from process table
282        std::thread::sleep(Duration::from_millis(500));
283
284        // Clear cache to ensure fresh snapshot (cached data would show pre-kill state)
285        ProcessSnapshot::clear_cache();
286
287        // Capture process snapshot after kill
288        let process_after = ProcessSnapshot::capture();
289
290        // Check if process still exists (zombies count as dead — they've been terminated)
291        let process_still_exists = process_after
292            .processes
293            .iter()
294            .any(|p| p.pid == args.pid && !p.stat.starts_with('Z'));
295// Verify PID was not reused — check start time matches (FINDING #1)
296let pid_reused = match (start_time_before, get_process_start_time_retry(args.pid)) {
297    (Some(before_time), Some(after_time)) => before_time != after_time,
298    (None, _) => false,
299    (Some(_), None) => true,
300};
301
302        let killed_success = success && !process_still_exists && !pid_reused;
303
304        let message = if killed_success {
305            format!("Killed process {} (signal {})", args.pid, signal)
306        } else if pid_reused {
307            format!(
308                "PID {} was reused by a different process (start time changed)",
309                args.pid
310            )
311        } else if !success {
312            format!("Failed to kill process {}: {}", args.pid, stderr_str)
313        } else {
314            format!("Process {} still exists after signal {}", args.pid, signal)
315        };
316
317        Ok(Output {
318            success: killed_success,
319            data: serde_json::json!({
320                "pid": args.pid,
321                "killed": killed_success,
322                "signal": signal,
323                "command": process_info.as_ref().map(|(cmd, _)| cmd),
324                "user": process_info.as_ref().map(|(_, user)| user),
325                "stderr": if !success { stderr_str.clone() } else { String::new() },
326                "pid_reused": pid_reused,
327                "process_before": {
328                    "count": process_before.summary.total_processes,
329                    "zombies": process_before.summary.zombie_count
330                },
331                "process_after": {
332                    "count": process_after.summary.total_processes,
333                    "zombies": process_after.summary.zombie_count
334                }
335            }),
336            message: Some(message),
337        })
338    }
339}
340
#[cfg(test)]
mod tests {
    use super::*;
    use crate::capability::Capability;
    use std::thread;
    use std::time::Duration;

    /// A PID above the Linux kernel's upper bound for `pid_max` (2^22), so no
    /// live process can own it regardless of how the host is configured.
    const ABSENT_PID: u32 = 4_194_305;

    /// Builds the execution context shared by all tests.
    fn test_context(dry_run: bool) -> Context {
        Context {
            dry_run,
            job_id: "test".into(),
            working_dir: std::env::current_dir().unwrap(),
        }
    }

    /// Spawns a long-running child process to act as a kill target.
    fn spawn_sleeper() -> std::process::Child {
        Command::new("sleep").arg("60").spawn().unwrap()
    }

    #[test]
    fn test_kill_schema() {
        let schema = Kill.schema();

        assert_eq!(schema["type"], "object");
        assert_eq!(schema["properties"]["pid"]["type"], "integer");
        assert_eq!(schema["properties"]["pid"]["minimum"], 1);
        assert_eq!(schema["properties"]["signal"]["type"], "integer");
        assert_eq!(schema["required"], serde_json::json!(["pid"]));
    }

    #[test]
    fn test_kill_protected_pid() {
        let cap = Kill;
        // PID 1 is protected
        let result = cap.execute(&serde_json::json!({ "pid": 1 }), &test_context(false));

        // Should fail because PID 1 is protected
        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("protected system process"));
    }

    #[test]
    fn test_kill_self_protected() {
        let cap = Kill;
        let self_pid = std::process::id();
        let result = cap.execute(
            &serde_json::json!({ "pid": self_pid }),
            &test_context(false),
        );

        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("protected"));
    }

    #[test]
    fn test_kill_nonexistent() {
        let cap = Kill;
        let result = cap
            .execute(
                &serde_json::json!({ "pid": ABSENT_PID }),
                &test_context(false),
            )
            .unwrap();

        assert!(!result.success);
        assert!(result.data["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_dry_run() {
        let cap = Kill;
        // The protection check runs before the dry-run short-circuit, so the
        // target must be a non-protected PID.
        let result = cap
            .execute(
                &serde_json::json!({ "pid": ABSENT_PID }),
                &test_context(true),
            )
            .unwrap();

        assert!(result.success);
        assert!(result.data["dry_run"].as_bool() == Some(true));
        assert!(result.data["killed"].as_bool() == Some(false));
    }

    #[test]
    fn test_kill_actual_process() {
        // Start a long-running process (sleep)
        let mut child = spawn_sleeper();
        let pid = child.id();

        // Give it time to start
        thread::sleep(Duration::from_millis(100));

        // Verify process exists before kill
        let pre_check = Command::new("kill").arg("-0").arg(pid.to_string()).output();
        assert!(
            pre_check.unwrap().status.success(),
            "Process should exist before kill"
        );

        // Clear cache so kill sees fresh process list
        ProcessSnapshot::clear_cache();

        // Kill it via the capability using SIGKILL for reliability
        let cap = Kill;
        let result = cap
            .execute(
                &serde_json::json!({ "pid": pid, "signal": 9 }),
                &test_context(false),
            )
            .unwrap();

        // Kill should succeed — process becomes zombie until reaped
        assert!(
            result.data["killed"].as_bool() == Some(true),
            "Kill failed: {:?}",
            result.data
        );
        assert!(
            result.data["signal"].as_i64() == Some(9),
            "Should use SIGKILL"
        );

        // Reap the zombie so it disappears from process table
        let _ = child.wait();

        // Verify process is fully gone after reaping
        let post_check = Command::new("kill").arg("-0").arg(pid.to_string()).output();
        let still_alive = post_check.map(|o| o.status.success()).unwrap_or(false);
        assert!(
            !still_alive,
            "Process {} should be dead after kill and reap",
            pid
        );
    }

    #[test]
    fn test_get_process_start_time() {
        // Start a process and verify we can read its start time
        let mut child = spawn_sleeper();
        let pid = child.id();

        let start_time = get_process_start_time(pid);
        assert!(
            start_time.is_some(),
            "Should be able to read start time for running process"
        );

        // Verify start time is consistent (no PID reuse)
        let start_time2 = get_process_start_time(pid);
        assert_eq!(start_time, start_time2, "Start time should be stable");

        child.kill().ok();
        let _ = child.wait();
    }

    #[test]
    fn test_get_process_start_time_nonexistent() {
        let result = get_process_start_time(ABSENT_PID);
        assert!(result.is_none(), "Non-existent PID should return None");
    }

    #[test]
    fn test_get_process_start_time_retry() {
        // Test retry logic with existing process
        let mut child = spawn_sleeper();
        let pid = child.id();

        let result = get_process_start_time_retry(pid);
        assert!(result.is_some(), "Should read start time for running process");

        child.kill().ok();
        let _ = child.wait();

        // Non-existent PID should return None after retries
        let result = get_process_start_time_retry(ABSENT_PID);
        assert!(result.is_none(), "Non-existent PID should return None");
    }

    #[test]
    fn test_signal_validation_rejects_negative() {
        // FINDING #3: negative signals should be rejected
        let cap = Kill;
        let result = cap.validate(&serde_json::json!({ "pid": ABSENT_PID, "signal": -1 }));
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_zero() {
        // FINDING #3: signal 0 should be rejected
        let cap = Kill;
        let result = cap.validate(&serde_json::json!({ "pid": ABSENT_PID, "signal": 0 }));
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid signal"));
    }

    #[test]
    fn test_signal_validation_rejects_out_of_range() {
        // FINDING #3: signal > 31 (except 64) should be rejected
        let cap = Kill;
        let result = cap.validate(&serde_json::json!({ "pid": ABSENT_PID, "signal": 32 }));
        assert!(result.is_err());
    }

    #[test]
    fn test_signal_validation_accepts_valid_signals() {
        let cap = Kill;
        for sig in [1, 9, 15, 31, 64] {
            let result = cap.validate(&serde_json::json!({ "pid": ABSENT_PID, "signal": sig }));
            assert!(result.is_ok(), "Signal {} should be valid", sig);
        }
    }

    #[test]
    fn test_dry_run_hides_process_info() {
        // FINDING #20: dry-run should NOT expose command or user info
        let cap = Kill;
        let result = cap
            .execute(
                &serde_json::json!({ "pid": ABSENT_PID }),
                &test_context(true),
            )
            .unwrap();

        assert!(result.success);
        assert!(result.data["dry_run"].as_bool() == Some(true));
        assert!(
            result.data.get("command").is_none(),
            "dry-run must not expose command"
        );
        assert!(
            result.data.get("user").is_none(),
            "dry-run must not expose user"
        );
        assert!(
            result.data.get("process_exists").is_none(),
            "dry-run must not expose process_exists"
        );
    }

    #[test]
    fn test_protected_pids_includes_self_and_parent() {
        let protected = protected_pids();
        let self_pid = std::process::id();
        assert!(protected.contains(&1), "PID 1 should be protected");
        assert!(protected.contains(&2), "PID 2 should be protected");
        assert!(protected.contains(&self_pid), "self PID should be protected");
    }
}