Skip to main content

sqlite_graphrag/
reaper.rs

//! G28: Reaper for orphan external processes.
//!
//! When the CLI crashes or is killed (SIGKILL, OOM, machine reset), child
//! processes spawned by `claude -p` or `codex exec` may be left running.
//! Without cleanup they accumulate as orphans that keep consuming CPU and
//! RAM and keep their MCP-spawned subprocess trees alive (the 2026-06-03
//! incident: 1,877 processes total, load average 276 on a 10-CPU host).
//!
//! [`scan_and_kill_orphans`] walks the process table at startup and
//! terminates any `claude` or `codex` invocation whose `PPID` is `1`
//! (reparented to `init`/`launchd` after the parent died) and that is
//! older than the `ORPHAN_MIN_AGE_SECS` constant. The scan is conservative:
//! it only kills processes that (a) match a known LLM CLI name, AND (b) are
//! orphaned, AND (c) are older than the threshold. A CLI that is just
//! starting up is left alone.
16
17use std::time::Duration;
18
/// Minimum age, in seconds, an orphaned process must have reached before the
/// reaper is allowed to terminate it.
const ORPHAN_MIN_AGE_SECS: u64 = 60;

/// Short program names (as reported by `/proc/<pid>/comm`) that the reaper
/// considers LLM CLI invocations.
const ORPHAN_SCAN_TARGETS: &[&str] = &["claude", "codex"];
21
/// Outcome of a single orphan scan.
///
/// `Default` yields the all-zero report that a scan starts from.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ReaperReport {
    /// Number of orphan processes detected.
    pub found: usize,
    /// Number of orphan processes successfully terminated.
    pub killed: usize,
    /// Number that we could not terminate (permission, ESRCH, etc).
    pub failed: usize,
    /// Elapsed wall time of the scan, in milliseconds.
    pub elapsed_ms: u64,
}
33
34/// Walks the process table and kills orphan LLM invocations.
35///
36/// The scan is best-effort and never panics: on any unexpected error it
37/// logs the failure and returns a report with `killed = 0`.
38pub fn scan_and_kill_orphans() -> ReaperReport {
39    let start = std::time::Instant::now();
40    let mut report = ReaperReport {
41        found: 0,
42        killed: 0,
43        failed: 0,
44        elapsed_ms: 0,
45    };
46
47    #[cfg(unix)]
48    {
49        if let Err(e) = scan_unix(&mut report) {
50            tracing::warn!(target: "reaper", error = %e, "orphan scan failed");
51        }
52    }
53
54    #[cfg(not(unix))]
55    {
56        tracing::debug!(target: "reaper", "orphan scan is a no-op on non-Unix platforms");
57    }
58
59    report.elapsed_ms = start.elapsed().as_millis() as u64;
60    if report.killed > 0 {
61        tracing::warn!(
62            target: "reaper",
63            found = report.found,
64            killed = report.killed,
65            failed = report.failed,
66            "reaped orphan LLM subprocesses"
67        );
68    } else {
69        tracing::info!(target: "reaper", found = report.found, "no orphan LLM subprocesses detected");
70    }
71    report
72}
73
74#[cfg(unix)]
75fn scan_unix(report: &mut ReaperReport) -> std::io::Result<()> {
76    use std::fs;
77    use std::path::Path;
78
79    let proc = Path::new("/proc");
80    let entries = fs::read_dir(proc)?;
81    for entry in entries.flatten() {
82        let name = entry.file_name();
83        let Some(name_str) = name.to_str() else {
84            continue;
85        };
86        if !name_str.chars().all(|c| c.is_ascii_digit()) {
87            continue;
88        }
89        let pid: i32 = match name_str.parse() {
90            Ok(p) => p,
91            Err(_) => continue,
92        };
93        if pid == std::process::id() as i32 {
94            continue;
95        }
96
97        let stat_path = entry.path().join("stat");
98        let stat = match fs::read_to_string(&stat_path) {
99            Ok(s) => s,
100            Err(_) => continue,
101        };
102
103        // /proc/[pid]/stat has the form: `pid (comm) state ppid ...`
104        // The comm field can contain spaces and parens; the last `)`
105        // separates the comm from the rest.
106        let Some(close_paren) = stat.rfind(')') else {
107            continue;
108        };
109        let after = &stat[close_paren + 1..];
110        let mut parts = after.split_whitespace();
111        // parts[0] = state (e.g. "R"), parts[1] = ppid, parts[2] = pgrp, ...
112        let state = parts.next().unwrap_or("");
113        let ppid: i32 = parts.next().and_then(|p| p.parse().ok()).unwrap_or(-1);
114
115        // Only target processes orphaned to init (PPID 1 on Linux/Unix
116        // when the parent is gone) or whose parent is also dead.
117        if ppid != 1 {
118            continue;
119        }
120
121        // Skip zombies (state Z) — they need no kill.
122        if state.starts_with('Z') {
123            continue;
124        }
125
126        // Resolve the comm field. proc/[pid]/comm is the short program
127        // name (no path); we use it instead of parsing the bracketed
128        // comm from stat to avoid encoding edge cases.
129        let comm_path = entry.path().join("comm");
130        let comm = match fs::read_to_string(&comm_path) {
131            Ok(s) => s.trim().to_string(),
132            Err(_) => continue,
133        };
134
135        if !ORPHAN_SCAN_TARGETS.iter().any(|t| comm == *t) {
136            continue;
137        }
138
139        // Age check: skip processes that just spawned (under 60s old) so
140        // we never race with a concurrent CLI invocation.
141        let age_ok = check_process_age(pid, ORPHAN_MIN_AGE_SECS);
142        if !age_ok {
143            continue;
144        }
145
146        report.found += 1;
147        match terminate_pid(pid) {
148            Ok(()) => {
149                report.killed += 1;
150                tracing::info!(target: "reaper", pid, comm = %comm, "killed orphan LLM subprocess");
151            }
152            Err(e) => {
153                report.failed += 1;
154                tracing::warn!(target: "reaper", pid, comm = %comm, error = %e, "failed to kill orphan");
155            }
156        }
157    }
158    Ok(())
159}
160
/// Returns `true` when process `pid` has been running for at least
/// `min_age_secs` seconds.
///
/// The age is derived from the kernel's own bookkeeping: field 22
/// (`starttime`, in clock ticks since boot) of `/proc/<pid>/stat`, compared
/// against the system uptime from `/proc/uptime`. File timestamps under
/// `/proc` are NOT used: they reflect when the kernel instantiated the
/// procfs inode (typically the first lookup), not when the process started.
///
/// Any failure (process gone, procfs unavailable, unparsable data, a start
/// time that appears to lie in the future) yields `false`, so the caller
/// errs on the side of not killing.
#[cfg(unix)]
fn check_process_age(pid: i32, min_age_secs: u64) -> bool {
    use std::fs;

    let proc_root = std::path::Path::new("/proc");
    let Ok(stat) = fs::read_to_string(proc_root.join(pid.to_string()).join("stat")) else {
        return false;
    };
    let Ok(uptime) = fs::read_to_string(proc_root.join("uptime")) else {
        return false;
    };
    let Some(age) = process_age(&stat, &uptime, clock_ticks_per_sec()) else {
        return false;
    };
    age >= Duration::from_secs(min_age_secs)
}

/// Computes a process's age from the text of `/proc/<pid>/stat` and
/// `/proc/uptime`, given the number of clock ticks per second.
///
/// Returns `None` when either input is malformed, `ticks_per_sec` is zero,
/// or the computed age is negative or not representable.
#[cfg(unix)]
fn process_age(stat: &str, uptime: &str, ticks_per_sec: u64) -> Option<Duration> {
    // After the last `)` the first token is field 3 (state), so field 22
    // (starttime) sits at zero-based index 19.
    const STARTTIME_INDEX: usize = 19;

    if ticks_per_sec == 0 {
        return None;
    }
    // The comm field may contain spaces and parentheses; split on the LAST `)`.
    let (_, after_comm) = stat.rsplit_once(')')?;
    let start_ticks: u64 = after_comm
        .split_whitespace()
        .nth(STARTTIME_INDEX)?
        .parse()
        .ok()?;
    // `/proc/uptime` is "<uptime seconds> <idle seconds>".
    let uptime_secs: f64 = uptime.split_whitespace().next()?.parse().ok()?;
    let started_secs = start_ticks as f64 / ticks_per_sec as f64;
    // Rejects negative, NaN and overflowing values instead of panicking.
    Duration::try_from_secs_f64(uptime_secs - started_secs).ok()
}

/// Returns the clock tick rate used for `starttime` in `/proc/<pid>/stat`.
///
/// The value is read from the `AT_CLKTCK` entry of this process's auxiliary
/// vector (`/proc/self/auxv`). If that is unavailable the conventional Linux
/// value of 100 is used.
#[cfg(unix)]
fn clock_ticks_per_sec() -> u64 {
    const AT_CLKTCK: usize = 17;
    const FALLBACK_USER_HZ: u64 = 100;
    const WORD: usize = std::mem::size_of::<usize>();

    let Ok(auxv) = std::fs::read("/proc/self/auxv") else {
        return FALLBACK_USER_HZ;
    };
    let word_at = |bytes: &[u8]| {
        let mut word = [0u8; WORD];
        word.copy_from_slice(bytes);
        usize::from_ne_bytes(word)
    };
    // The auxiliary vector is a flat array of native-endian (key, value)
    // machine-word pairs.
    auxv.chunks_exact(2 * WORD)
        .find_map(|entry| {
            let (key, value) = entry.split_at(WORD);
            let ticks = word_at(value);
            // usize -> u64 is lossless on every supported target.
            (word_at(key) == AT_CLKTCK && ticks > 0).then_some(ticks as u64)
        })
        .unwrap_or(FALLBACK_USER_HZ)
}
180
181#[cfg(unix)]
182fn terminate_pid(pid: i32) -> std::io::Result<()> {
183    // SIGTERM first; if the process ignores it for >2s, the caller can
184    // escalate to SIGKILL. For the reaper we send TERM and return; a
185    // follow-up sweep can send KILL if needed.
186    let rc = unsafe { libc::kill(pid, libc::SIGTERM) };
187    if rc == 0 {
188        Ok(())
189    } else {
190        Err(std::io::Error::last_os_error())
191    }
192}
193
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built report carries no counts and no elapsed time.
    #[test]
    fn reaper_report_starts_zeroed() {
        let r = ReaperReport {
            found: 0,
            killed: 0,
            failed: 0,
            elapsed_ms: 0,
        };
        assert_eq!(r.found, 0);
        assert_eq!(r.killed, 0);
        assert_eq!(r.failed, 0);
        assert_eq!(r.elapsed_ms, 0);
    }

    #[test]
    fn orphan_min_age_is_one_minute() {
        // G28: the threshold of 60s is the safety margin that prevents
        // a CLI invocation from killing a concurrent peer that just
        // started 5s ago.
        assert_eq!(ORPHAN_MIN_AGE_SECS, 60);
    }

    #[test]
    fn orphan_targets_include_claude_and_codex() {
        assert!(ORPHAN_SCAN_TARGETS.contains(&"claude"));
        assert!(ORPHAN_SCAN_TARGETS.contains(&"codex"));
    }

    #[test]
    fn scan_completes_without_panic_on_linux() {
        // NOTE(review): this runs the real scan, so a matching orphan on the
        // test host WILL receive SIGTERM. On a clean CI host the report is
        // expected to have found=0.
        let r = scan_and_kill_orphans();
        assert!(r.elapsed_ms < 30_000, "scan must finish in <30s");
        // Every detected orphan is accounted for as either killed or failed.
        assert_eq!(
            r.killed + r.failed,
            r.found,
            "each found orphan must be counted as killed or failed"
        );
    }
}