wire/daemon_supervisor.rs
1//! `wire daemon --all-sessions` — multi-session supervisor.
2//!
3//! ## Why
4//!
5//! honey-pine's 2026-06-01 dogfood (#162) surfaced a launchd-vs-session
6//! isolation gap: the `sh.slancha.wire.daemon` launchd unit invokes
7//! `wire daemon --interval 5` with **no cwd context**. With WIRE_HOME
8//! unset, the daemon resolves to the *default* session WIRE_HOME and
9//! silently skips every other initialized session. Operators with
10//! multiple per-project sessions (slancha-mesh, wire, etc.) saw their
11//! shell `wire status` report `running:false` even with the launchd
12//! daemon perfectly alive — same daemon, different state tree.
13//!
14//! Her working remedy was `launchctl bootout` + `nohup wire daemon`
15//! from the project cwd. That works for one session but doesn't scale
16//! to N. The architectural fix is a supervisor that owns the
17//! multi-session orchestration: one supervisor process per launchd
18//! unit, N child `wire daemon --session <name>` processes — each with
19//! its own pinned `WIRE_HOME` and its own pidfile under that session's
20//! state dir. `wire status` from any cwd then sees its session's child
21//! pid and reports truthfully.
22//!
23//! ## Model
24//!
25//! - **Fork-exec, not threads.** Each session's daemon needs its own
26//! `WIRE_HOME`. We set it via the child process env so the daemon
27//! code path stays unchanged. Threads would mean global mutable
28//! `WIRE_HOME` and cross-session races.
29//! - **Idempotent spawn.** Before spawning a child for session S,
30//! check `daemon_singleton_holder()` on that session's home. If a
31//! live daemon already exists (operator ran `wire daemon` directly
32//! in S's cwd, or supervisor restarted and the old child is still
33//! alive), leave it alone.
34//! - **Reap via polling, not SIGCHLD.** macOS launchd-supervised
35//! processes already get SIGCHLD overhead; `try_wait` polling on a
36//! short interval is simpler and bug-free across platforms.
37//! - **Backoff on rapid failure.** A child that exits within 10s of
38//! spawn doubles its respawn delay (1s → 60s cap). Prevents a broken
39//! session (corrupt key, missing relay) from fork-bombing.
40//! - **Don't exit on zero sessions.** Sleep and re-poll the registry —
41//! new sessions get picked up without supervisor restart.
42//! - **Adopt orphaned children on supervisor restart.** When launchd
43//! relaunches the supervisor, the previous supervisor's children
44//! keep running (correct: they're still syncing). New supervisor
45//! sees their pidfiles, skips re-spawning, and lets them keep going
46//! until their next natural exit (then it spawns a fresh child).
47//!
48//! ## Invariants
49//!
50//! - One supervisor per launchd unit per machine. Singleton guard on
51//! `sessions_root()/supervisor.pid` (separate from per-session
52//! daemon pidfiles).
53//! - Child env contains exactly one wire-relevant variable:
54//! `WIRE_HOME=<session-home>`. Any other inherited WIRE_* vars are
55//! stripped so the operator's shell config doesn't leak in.
56//! - Per-session daemon code is *unchanged* — supervisor is a pure
57//! orchestrator.
58
59use std::collections::HashMap;
60use std::path::{Path, PathBuf};
61use std::process::{Child, Command};
62use std::time::{Duration, Instant, SystemTime};
63
64use anyhow::{Context, Result};
65use serde_json::json;
66
/// Seconds the supervisor loop sleeps between passes. Every pass reaps
/// exited children, re-reads the session registry, and spawns whatever
/// is missing — so a session created via `wire session new` can wait up
/// to this long before its daemon appears. 10s is the compromise: short
/// enough that operators don't notice the delay, long enough that the
/// per-pass registry work stays negligible.
const REGISTRY_POLL_SECS: u64 = 10;

/// Respawn delay applied after a child exits. Each *rapid* failure
/// (see `RAPID_FAIL_WINDOW`) doubles the previous delay; a child that
/// lived longer resets it back to this value.
const INITIAL_BACKOFF: Duration = Duration::from_secs(1);
/// Upper bound the doubling respawn delay is clamped to.
const MAX_BACKOFF: Duration = Duration::from_secs(60);
/// A child that exits sooner than this after being spawned counts as a
/// rapid failure and escalates the session's backoff.
const RAPID_FAIL_WINDOW: Duration = Duration::from_secs(10);
80
/// Default idle cutoff for registry-unbound sessions. `list_sessions()`
/// enumerates *every* session home ever minted on the machine — and
/// because each Claude tab / `wire session new` mints a fresh persona
/// home, a long-lived box accumulates hundreds (honey-pine's had 147).
/// Spawning one daemon per home turns `--all-sessions` into a fork
/// storm. A session is kept regardless of age if it has a registry cwd
/// binding (operator deliberately bound it); an *unbound* session is
/// only kept if it has been active within this window. Override via
/// `WIRE_ALL_SESSIONS_MAX_IDLE_DAYS` (0 disables the filter → legacy
/// spawn-for-all behavior).
const DEFAULT_MAX_IDLE_DAYS: u64 = 7;

/// Parse the idle cutoff from its raw (environment) representation.
///
/// * `None` (variable unset) or an unparseable value → the default of
///   `DEFAULT_MAX_IDLE_DAYS` days.
/// * `"0"` → `None`, meaning "no filter, spawn for every session".
/// * any other non-negative integer (surrounding whitespace ignored) →
///   that many days.
///
/// The day → second conversion saturates, so an absurdly large but
/// still parseable value yields an effectively infinite cutoff instead
/// of overflowing (a panic in debug builds, a silently wrapped — and
/// therefore tiny — cutoff in release builds).
///
/// Pure, so it's unit-testable without mutating process env.
fn parse_max_idle(raw: Option<&str>) -> Option<Duration> {
    const SECS_PER_DAY: u64 = 86_400;

    let days = raw
        .and_then(|v| v.trim().parse::<u64>().ok())
        .unwrap_or(DEFAULT_MAX_IDLE_DAYS);
    (days != 0).then(|| Duration::from_secs(days.saturating_mul(SECS_PER_DAY)))
}
106
107/// Read the idle cutoff from the environment. `None` means "no idle
108/// filter" (spawn a daemon for every session — pre-fix behavior),
109/// selected by setting `WIRE_ALL_SESSIONS_MAX_IDLE_DAYS=0`.
110fn max_idle_from_env() -> Option<Duration> {
111 parse_max_idle(
112 std::env::var("WIRE_ALL_SESSIONS_MAX_IDLE_DAYS")
113 .ok()
114 .as_deref(),
115 )
116}
117
/// Most recent modification time across a session home's activity
/// files — the supervisor's "last actually *synced*" signal.
///
/// The files are looked up under `<home>/state/wire/` (the same
/// subtree `existing_daemon_for_session` reads the pidfile from), not
/// in the home root. `last_sync.json` is rewritten on every successful
/// daemon relay cycle; the cursors move on inbox/reactor activity.
/// Files that are missing, or whose mtime cannot be read, are skipped;
/// if none of them yields a timestamp the home has never synced (a
/// husk) and the result is `None`.
///
/// `daemon.pid` is deliberately not consulted: it's written on *spawn*,
/// so counting it would make eligibility self-perpetuating — the
/// supervisor spawns a daemon, the pidfile refreshes, and the session
/// would never age out even if it never actually syncs anything.
fn fs_last_active(home: &Path) -> Option<SystemTime> {
    const ACTIVITY_FILES: [&str; 3] = ["last_sync.json", "notify.cursor", "reactor.cursor"];

    let activity_dir = home.join("state").join("wire");
    let mut newest: Option<SystemTime> = None;
    for file in ACTIVITY_FILES {
        let Ok(meta) = std::fs::metadata(activity_dir.join(file)) else {
            continue;
        };
        let Ok(mtime) = meta.modified() else {
            continue;
        };
        if newest.map_or(true, |seen| mtime >= seen) {
            newest = Some(mtime);
        }
    }
    newest
}
138
139/// Filter `list_sessions()` down to the sessions the supervisor should
140/// own a daemon for. A session is eligible iff it has a registry cwd
141/// binding OR it was active within `max_idle`. `max_idle == None`
142/// disables the filter (every session eligible). Pure: the activity
143/// probe is injected so this is unit-testable without touching disk.
144fn supervisor_eligible<F>(
145 sessions: Vec<crate::session::SessionInfo>,
146 max_idle: Option<Duration>,
147 now: SystemTime,
148 last_active: F,
149) -> Vec<crate::session::SessionInfo>
150where
151 F: Fn(&Path) -> Option<SystemTime>,
152{
153 let Some(max_idle) = max_idle else {
154 return sessions;
155 };
156 sessions
157 .into_iter()
158 .filter(|s| {
159 if s.cwd.is_some() {
160 return true;
161 }
162 match last_active(&s.home_dir) {
163 // `duration_since` errors when the file mtime is in the
164 // future (clock skew) — treat that as "active now".
165 Some(t) => now.duration_since(t).map(|d| d <= max_idle).unwrap_or(true),
166 None => false,
167 }
168 })
169 .collect()
170}
171
/// Bookkeeping for one child daemon the supervisor itself spawned
/// (daemons started by someone else are never tracked here).
struct ChildState {
    /// Handle used to poll for exit (`try_wait`) and to kill the child.
    child: Child,
    /// When the child was spawned; its lifetime at exit decides whether
    /// the exit counts as a rapid failure.
    spawned_at: Instant,
}
177
178/// Entrypoint for `wire daemon --all-sessions`. Loops forever; only
179/// returns Err on a setup error (e.g. cannot resolve sessions_root).
180pub fn run_supervisor(interval_secs: u64, as_json: bool) -> Result<()> {
181 // Supervisor singleton — one per machine. Separate pidfile from the
182 // per-session daemon pidfile so the two layers can't collide.
183 let pid_path = supervisor_pid_path()?;
184 if let Some(existing) = read_alive_supervisor_pid(&pid_path)? {
185 let msg = json!({
186 "status": "skipped",
187 "reason": "supervisor already running",
188 "holder_pid": existing,
189 });
190 if as_json {
191 println!("{msg}");
192 } else {
193 eprintln!(
194 "wire daemon --all-sessions: another supervisor is already running (pid {existing}); not starting a second one."
195 );
196 }
197 return Ok(());
198 }
199 write_supervisor_pid(&pid_path)?;
200 let _cleanup = SupervisorPidGuard {
201 path: pid_path.clone(),
202 };
203
204 if !as_json {
205 eprintln!(
206 "wire daemon --all-sessions: supervisor up. interval={interval_secs}s, registry-poll={REGISTRY_POLL_SECS}s. SIGINT to stop."
207 );
208 } else {
209 println!(
210 "{}",
211 json!({
212 "status": "supervisor_started",
213 "interval_secs": interval_secs,
214 "registry_poll_secs": REGISTRY_POLL_SECS,
215 })
216 );
217 }
218
219 // Idle cutoff for registry-unbound sessions — read once at startup
220 // (env doesn't change under a running supervisor).
221 let max_idle = max_idle_from_env();
222 eprintln!(
223 "supervisor: idle cutoff for unbound sessions = {}",
224 match max_idle {
225 Some(d) => format!("{} days", d.as_secs() / 86_400),
226 None => "disabled (spawn-for-all)".to_string(),
227 }
228 );
229
230 let mut children: HashMap<String, ChildState> = HashMap::new();
231 // Per-session backoff that survives a child's reap → respawn → reap
232 // cycle. Distinguishes "session crashes hard repeatedly" from
233 // "child exited cleanly and we're spawning a fresh one".
234 let mut session_last_exit: HashMap<String, Instant> = HashMap::new();
235 let mut session_backoff: HashMap<String, Duration> = HashMap::new();
236
237 loop {
238 // 1. Reap any exited children. Detect rapid failures + update
239 // per-session backoff so the next spawn waits.
240 let mut exited: Vec<String> = Vec::new();
241 for (name, state) in children.iter_mut() {
242 if let Ok(Some(status)) = state.child.try_wait() {
243 let lived = state.spawned_at.elapsed();
244 let rapid = lived < RAPID_FAIL_WINDOW;
245 eprintln!(
246 "supervisor: child '{name}' exited (status={status:?}, lived={}s, rapid={rapid})",
247 lived.as_secs()
248 );
249 let next_backoff = if rapid {
250 let prev = session_backoff
251 .get(name)
252 .copied()
253 .unwrap_or(INITIAL_BACKOFF);
254 (prev * 2).min(MAX_BACKOFF)
255 } else {
256 INITIAL_BACKOFF
257 };
258 session_backoff.insert(name.clone(), next_backoff);
259 session_last_exit.insert(name.clone(), Instant::now());
260 exited.push(name.clone());
261 }
262 }
263 for n in exited {
264 children.remove(&n);
265 }
266
267 // 2. Read registry, identify wanted sessions. Filter out
268 // registry-unbound sessions that have been idle past the
269 // cutoff so the supervisor doesn't fan out a daemon per
270 // every ephemeral persona home (the 147-home fork storm).
271 let all_sessions = crate::session::list_sessions().unwrap_or_default();
272 let total_sessions = all_sessions.len();
273 let wanted: Vec<crate::session::SessionInfo> =
274 supervisor_eligible(all_sessions, max_idle, SystemTime::now(), fs_last_active);
275 if wanted.len() != total_sessions {
276 eprintln!(
277 "supervisor: {} of {} sessions eligible (skipped {} registry-unbound + idle > cutoff)",
278 wanted.len(),
279 total_sessions,
280 total_sessions - wanted.len()
281 );
282 }
283
284 // 3. Kill children whose session has been removed from the
285 // registry since last poll. (Operator ran `wire session
286 // forget` or similar.)
287 let wanted_names: std::collections::HashSet<String> =
288 wanted.iter().map(|s| s.name.clone()).collect();
289 let to_kill: Vec<String> = children
290 .keys()
291 .filter(|n| !wanted_names.contains(n.as_str()))
292 .cloned()
293 .collect();
294 for name in to_kill {
295 if let Some(mut state) = children.remove(&name) {
296 eprintln!("supervisor: session '{name}' gone from registry; terminating its child");
297 let _ = state.child.kill();
298 let _ = state.child.wait();
299 }
300 }
301
302 // 4. Spawn missing children, respecting backoff + existing
303 // pidfiles (operator-spawned daemons coexist).
304 for info in wanted {
305 if info.did.is_none() {
306 continue;
307 }
308 if children.contains_key(&info.name) {
309 continue;
310 }
311 // Backoff gate: if this session is in a rapid-fail loop,
312 // wait the remaining backoff before respawning.
313 if let Some(last_exit) = session_last_exit.get(&info.name) {
314 let wait = session_backoff
315 .get(&info.name)
316 .copied()
317 .unwrap_or(INITIAL_BACKOFF);
318 if last_exit.elapsed() < wait {
319 continue;
320 }
321 }
322 // Singleton check: an operator-spawned `wire daemon` may
323 // already own this session. Leave it alone — re-checking
324 // next poll is cheap.
325 if existing_daemon_for_session(&info.home_dir)? {
326 continue;
327 }
328 match spawn_child_for_session(&info.name, &info.home_dir, interval_secs) {
329 Ok(child) => {
330 eprintln!(
331 "supervisor: spawned child for session '{}' (pid {})",
332 info.name,
333 child.id()
334 );
335 children.insert(
336 info.name.clone(),
337 ChildState {
338 child,
339 spawned_at: Instant::now(),
340 },
341 );
342 }
343 Err(e) => {
344 eprintln!(
345 "supervisor: spawn failed for session '{}': {e:#}",
346 info.name
347 );
348 // Treat spawn failure as a rapid failure so the
349 // backoff curve kicks in.
350 let prev = session_backoff
351 .get(&info.name)
352 .copied()
353 .unwrap_or(INITIAL_BACKOFF);
354 session_backoff.insert(info.name.clone(), (prev * 2).min(MAX_BACKOFF));
355 session_last_exit.insert(info.name.clone(), Instant::now());
356 }
357 }
358 }
359
360 std::thread::sleep(Duration::from_secs(REGISTRY_POLL_SECS));
361 }
362}
363
364/// Spawn `wire daemon --interval <i>` as a child with `WIRE_HOME`
365/// pinned via env. Strips inherited WIRE_* env so the operator's
366/// shell config (test overrides like `WIRE_DAEMON_NO_SINGLETON=1`)
367/// can't leak in.
368///
369/// v0.14.2 #170 hotfix: the original implementation also passed
370/// `--session <character-name>` as a belt-and-suspenders check.
371/// That broke 127 of 133 sessions on a real multi-session box —
372/// `cmd_daemon`'s `--session` handler calls
373/// `session::session_dir(name)` which resolves
374/// `sessions_root/<name>`, correct for v0.6 top-level layout but
375/// WRONG for v0.13's `by-key/<hash>` layout where the character
376/// name is *derived* from the card DID, not the directory name.
377/// Children bailed → supervisor fork-bombed (10s poll × 60s
378/// backoff × 127 failing sessions). WIRE_HOME env alone is the
379/// correct contract: every daemon code path flows through
380/// `state_dir()` / `config_dir()` which honor it. No second
381/// source of truth.
382fn spawn_child_for_session(
383 name: &str,
384 home_dir: &std::path::Path,
385 interval_secs: u64,
386) -> Result<Child> {
387 let exe = std::env::current_exe().context("resolving current exe for child fork")?;
388 let mut cmd = Command::new(&exe);
389 cmd.args(["daemon", "--interval", &interval_secs.to_string()]);
390 // Strip WIRE_* env so operator shell-vars don't leak into the
391 // child. Then pin WIRE_HOME exactly.
392 let leaks: Vec<String> = std::env::vars()
393 .filter(|(k, _)| k.starts_with("WIRE_"))
394 .map(|(k, _)| k)
395 .collect();
396 for k in leaks {
397 cmd.env_remove(&k);
398 }
399 cmd.env("WIRE_HOME", home_dir);
400 // Children inherit stdout/stderr → land in the launchd log file
401 // (StandardOutPath in the plist). Operators see "supervisor:
402 // spawned ..." lines interleaved with each session's daemon log.
403 cmd.spawn().with_context(|| {
404 format!(
405 "fork-exec `wire daemon` for session '{name}' (binary {} WIRE_HOME={})",
406 exe.display(),
407 home_dir.display()
408 )
409 })
410}
411
412/// True iff this session's `daemon.pid` names a live process. Used by
413/// the supervisor to coexist with operator-spawned `wire daemon`
414/// invocations: if the operator already started one in a tmux pane,
415/// we skip the spawn and let theirs own the cursor.
416fn existing_daemon_for_session(home_dir: &std::path::Path) -> Result<bool> {
417 let pid_path = home_dir.join("state").join("wire").join("daemon.pid");
418 if !pid_path.exists() {
419 return Ok(false);
420 }
421 let body = match std::fs::read_to_string(&pid_path) {
422 Ok(b) => b,
423 Err(_) => return Ok(false),
424 };
425 // Pidfile is either JSON `{"pid": <n>, ...}` (v0.5.11+) or a bare
426 // integer (legacy). Try JSON+pid-field first; if that yields
427 // None (parse failed OR JSON had no pid field, e.g. a bare
428 // integer body parses as JSON number with no `.pid`), fall
429 // through to the bare-int path.
430 let pid = serde_json::from_str::<serde_json::Value>(&body)
431 .ok()
432 .and_then(|v| v.get("pid").and_then(serde_json::Value::as_u64))
433 .or_else(|| body.trim().parse::<u64>().ok());
434 Ok(pid
435 .map(|p| crate::ensure_up::pid_is_alive(p as u32))
436 .unwrap_or(false))
437}
438
439/// Read-only snapshot of the supervisor's current topology — supervisor
440/// liveness + per-session daemon liveness + orphan pids the supervisor
441/// is not currently managing. Used by `wire supervisor` (the CLI
442/// counterpart to single-session `wire status`) so operators can ask
443/// "what is the multi-session supervisor doing?" in one command
444/// instead of cross-referencing `pgrep` against per-session pidfiles
445/// by hand.
446#[derive(Debug, Clone, serde::Serialize)]
447pub struct SupervisorState {
448 /// Pid the `supervisor.pid` file names; None if file missing.
449 pub supervisor_pid: Option<u32>,
450 /// True iff that pid is currently a live process.
451 pub supervisor_alive: bool,
452 /// Per-session liveness across every initialized session, in
453 /// `list_sessions()` order.
454 pub sessions: Vec<SupervisedSession>,
455 /// `wire daemon` pids found via cmdline-scan that are NOT mapped
456 /// to any session's pidfile AND are not the supervisor itself.
457 /// Could be legacy operator-spawned daemons, leftover children
458 /// from a crashed prior supervisor, or daemons serving the
459 /// default WIRE_HOME (no `--all-sessions`). Operators see them
460 /// here so they can decide whether to kill.
461 pub unmanaged_pids: Vec<u32>,
462 /// v0.14.2: session names whose live daemon's recorded
463 /// `pidfile.version` is older than this CLI's own
464 /// `CARGO_PKG_VERSION`. The supervisor's existing-pidfile check
465 /// skips alive daemons regardless of their binary version, so
466 /// stale-binary daemons persist until they exit. Surfaced for
467 /// operator visibility — they can `pkill -TERM <pid>` or use a
468 /// future `wire upgrade --refresh-stale-children` to force the
469 /// supervisor to respawn them on the current binary.
470 pub stale_binary_sessions: Vec<String>,
471}
472
473/// One session as seen by the supervisor.
474#[derive(Debug, Clone, serde::Serialize)]
475pub struct SupervisedSession {
476 /// Session name (`info.name` from `session::list_sessions`).
477 pub name: String,
478 /// `home_dir` filesystem path.
479 pub home_dir: String,
480 /// Pid the session's `daemon.pid` records; None if file missing.
481 pub daemon_pid: Option<u32>,
482 /// True iff that pid is currently a live process.
483 pub daemon_alive: bool,
484 /// Seconds since the session's daemon last completed a sync
485 /// cycle (read from `last_sync.json`); None if never recorded.
486 pub last_sync_age_seconds: Option<u64>,
487 /// Version string the running daemon recorded when it wrote its
488 /// pidfile (`PidRecord::Json.version`). None when the pidfile is
489 /// missing or corrupt. Surfaced so operators can spot version drift across
490 /// the supervisor fleet — the supervisor's pre-spawn
491 /// existing-pidfile check skips alive daemons regardless of
492 /// their binary version, so a daemon spawned on v0.13.x and
493 /// still running after the supervisor was bounced to v0.14.x
494 /// keeps the old binary in memory until it exits.
495 #[serde(skip_serializing_if = "Option::is_none")]
496 pub daemon_version: Option<String>,
497}
498
499/// Build a `SupervisorState` snapshot. Pure read; no fork / no
500/// pidfile mutation. Best-effort on every component (filesystem
501/// errors yield None / empty rather than failing the whole call).
502pub fn read_supervisor_state() -> Result<SupervisorState> {
503 let pid_path = supervisor_pid_path()?;
504 let supervisor_pid = read_supervisor_pid(&pid_path);
505 let supervisor_alive = supervisor_pid
506 .map(crate::ensure_up::pid_is_alive)
507 .unwrap_or(false);
508
509 // Per-session liveness — walk list_sessions, read each home's
510 // pidfile + last_sync.
511 let sessions: Vec<SupervisedSession> = crate::session::list_sessions()
512 .unwrap_or_default()
513 .into_iter()
514 .map(|info| {
515 let daemon_pid = crate::session::session_daemon_pid(&info.home_dir);
516 let daemon_alive = daemon_pid
517 .map(crate::ensure_up::pid_is_alive)
518 .unwrap_or(false);
519 // last_sync.json lives under <home>/state/wire/last_sync.json.
520 let last_sync_age_seconds = read_session_last_sync_age(&info.home_dir);
521 // v0.14.2: read the daemon-recorded version from the JSON
522 // pidfile. Legacy bare-integer pidfiles return None
523 // (can't surface a version we don't have).
524 let daemon_version = read_session_pidfile_version(&info.home_dir);
525 SupervisedSession {
526 name: info.name,
527 home_dir: info.home_dir.to_string_lossy().into_owned(),
528 daemon_pid,
529 daemon_alive,
530 last_sync_age_seconds,
531 daemon_version,
532 }
533 })
534 .collect();
535
536 // Unmanaged pids: every `wire daemon` cmdline scan hit that isn't
537 // (a) the supervisor itself, (b) any session's pidfile pid.
538 let all_daemon_pids: std::collections::HashSet<u32> =
539 crate::platform::find_processes_by_cmdline("wire daemon")
540 .into_iter()
541 .collect();
542 let known_session_pids: std::collections::HashSet<u32> = sessions
543 .iter()
544 .filter_map(|s| if s.daemon_alive { s.daemon_pid } else { None })
545 .collect();
546 let mut unmanaged_pids: Vec<u32> = all_daemon_pids
547 .into_iter()
548 .filter(|p| Some(*p) != supervisor_pid && !known_session_pids.contains(p))
549 .collect();
550 unmanaged_pids.sort_unstable();
551
552 // v0.14.2: derive the stale-binary set. Compare each live
553 // daemon's recorded version against the running CLI's version.
554 // "Stale" iff alive + has a recorded version + that version is
555 // strictly less than ours by dotted-integer compare (so 0.10.0 >
556 // 0.9.0). Unparseable strings are conservatively "not stale" — a
557 // pre-release suffix like 0.14.2-rc.1 stays unflagged rather than
558 // false-positive against 0.14.2.
559 let our_version = env!("CARGO_PKG_VERSION");
560 let stale_binary_sessions: Vec<String> = sessions
561 .iter()
562 .filter(|s| {
563 s.daemon_alive
564 && s.daemon_version
565 .as_deref()
566 .map(|v| version_lt(v, our_version))
567 .unwrap_or(false)
568 })
569 .map(|s| s.name.clone())
570 .collect();
571
572 Ok(SupervisorState {
573 supervisor_pid,
574 supervisor_alive,
575 sessions,
576 unmanaged_pids,
577 stale_binary_sessions,
578 })
579}
580
/// Is dotted-integer version `a` strictly older than `b`?
///
/// Each string is split on `.` and every segment parsed as `u32`. The
/// shorter list is right-padded with zeros (so `0.14` < `0.14.1` is
/// `true`) and the two lists are compared segment by segment, leftmost
/// first. If any segment of either input fails to parse, the answer is
/// `false` — we'd rather under-flag a pre-release suffix like
/// `0.14.2-rc.1` than false-positive against a stable peer of the same
/// major.minor.patch.
fn version_lt(a: &str, b: &str) -> bool {
    /// Split into numeric segments; `None` if any segment isn't a `u32`.
    fn segments(version: &str) -> Option<Vec<u32>> {
        let mut parts = Vec::new();
        for piece in version.split('.') {
            parts.push(piece.parse::<u32>().ok()?);
        }
        Some(parts)
    }

    let (mut lhs, mut rhs) = match (segments(a), segments(b)) {
        (Some(lhs), Some(rhs)) => (lhs, rhs),
        _ => return false,
    };
    // Pad to equal width; equal-length vectors compare element-wise.
    let width = lhs.len().max(rhs.len());
    lhs.resize(width, 0);
    rhs.resize(width, 0);
    lhs < rhs
}
604
605/// Read the daemon-recorded version string from a session's
606/// `<home>/state/wire/daemon.pid` JSON pidfile. Returns None for
607/// legacy bare-integer pidfiles (no version field) and for absent /
608/// unreadable files.
609fn read_session_pidfile_version(home_dir: &std::path::Path) -> Option<String> {
610 let pidfile = home_dir.join("state").join("wire").join("daemon.pid");
611 let body = std::fs::read_to_string(&pidfile).ok()?;
612 let v: serde_json::Value = serde_json::from_str(&body).ok()?;
613 v.get("version")
614 .and_then(serde_json::Value::as_str)
615 .map(str::to_string)
616}
617
/// Parse the pid stored in `supervisor.pid`, with no liveness check
/// (the snapshot builder performs that itself).
///
/// Any failure — file absent, unreadable, or not a `u32` once trimmed —
/// collapses to `None` instead of an error.
fn read_supervisor_pid(path: &std::path::Path) -> Option<u32> {
    // Reading a non-existent file simply fails, so no separate
    // existence probe is needed.
    let contents = std::fs::read_to_string(path).ok()?;
    contents.trim().parse().ok()
}
628
629/// Read `<home>/state/wire/last_sync.json`'s timestamp and return
630/// "seconds since now". None on absent / unreadable / unparseable.
631fn read_session_last_sync_age(home_dir: &std::path::Path) -> Option<u64> {
632 let path = home_dir.join("state").join("wire").join("last_sync.json");
633 let body = std::fs::read_to_string(&path).ok()?;
634 let v: serde_json::Value = serde_json::from_str(&body).ok()?;
635 let ts = v.get("ts").and_then(serde_json::Value::as_str)?;
636 let parsed =
637 time::OffsetDateTime::parse(ts, &time::format_description::well_known::Rfc3339).ok()?;
638 let age = (time::OffsetDateTime::now_utc() - parsed).whole_seconds();
639 if age < 0 {
640 // Clock skew: timestamp is in the future. Treat as fresh.
641 Some(0)
642 } else {
643 Some(age as u64)
644 }
645}
646
647fn supervisor_pid_path() -> Result<PathBuf> {
648 let root = crate::session::sessions_root()
649 .context("resolving sessions_root for supervisor pidfile")?;
650 std::fs::create_dir_all(&root).with_context(|| format!("creating {root:?}"))?;
651 Ok(root.join("supervisor.pid"))
652}
653
654fn read_alive_supervisor_pid(path: &std::path::Path) -> Result<Option<u32>> {
655 if !path.exists() {
656 return Ok(None);
657 }
658 let body = std::fs::read_to_string(path).ok();
659 let pid = body.as_deref().and_then(|s| s.trim().parse::<u32>().ok());
660 match pid {
661 Some(p) if crate::ensure_up::pid_is_alive(p) => Ok(Some(p)),
662 _ => Ok(None),
663 }
664}
665
666fn write_supervisor_pid(path: &std::path::Path) -> Result<()> {
667 let pid = std::process::id();
668 std::fs::write(path, pid.to_string())
669 .with_context(|| format!("writing supervisor pidfile {path:?}"))?;
670 Ok(())
671}
672
/// Scope guard that deletes the supervisor pidfile when dropped — but
/// only if the file still records this process's pid.
struct SupervisorPidGuard {
    /// Pidfile to clean up.
    path: PathBuf,
}

impl Drop for SupervisorPidGuard {
    fn drop(&mut self) {
        // Only remove if it still names us — same pattern as
        // DaemonPidGuard in ensure_up.rs. An unreadable or unparseable
        // file is left untouched.
        let recorded = std::fs::read_to_string(&self.path)
            .ok()
            .and_then(|body| body.trim().parse::<u32>().ok());
        if recorded == Some(std::process::id()) {
            let _ = std::fs::remove_file(&self.path);
        }
    }
}
689
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    #[test]
    fn version_lt_dotted_integer_compare() {
        // Lexical string-compare footgun cases — these must come out right.
        assert!(version_lt("0.9.0", "0.10.0"));
        assert!(version_lt("0.13.5", "0.14.1"));
        assert!(version_lt("0.14.0", "0.14.1"));
        // Equal / greater → not stale.
        assert!(!version_lt("0.14.1", "0.14.1"));
        assert!(!version_lt("0.14.2", "0.14.1"));
        // Shorter version pads with zero.
        assert!(version_lt("0.14", "0.14.1"));
        assert!(!version_lt("0.14.1", "0.14"));
        // Unparseable (pre-release suffix, garbage) is conservatively NOT-stale
        // — under-flagging beats false-positive on `0.14.2-rc.1` vs `0.14.2`.
        assert!(!version_lt("0.14.2-rc.1", "0.14.2"));
        assert!(!version_lt("garbage", "0.14.1"));
        assert!(!version_lt("0.14.1", "garbage"));
    }

    #[test]
    fn read_alive_supervisor_pid_returns_none_when_missing() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        assert_eq!(read_alive_supervisor_pid(&p).unwrap(), None);
    }

    #[test]
    fn read_alive_supervisor_pid_returns_none_for_dead_pid() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        // pid 999999 is almost certainly not running.
        std::fs::write(&p, "999999").unwrap();
        assert_eq!(read_alive_supervisor_pid(&p).unwrap(), None);
    }

    #[test]
    fn read_alive_supervisor_pid_returns_pid_for_self() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        let our_pid = std::process::id();
        std::fs::write(&p, our_pid.to_string()).unwrap();
        assert_eq!(read_alive_supervisor_pid(&p).unwrap(), Some(our_pid));
    }

    #[test]
    fn pid_guard_only_removes_when_pid_still_matches() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        // Write a foreign pid into the file, then drop a guard for
        // our pid. The guard should leave the foreign pidfile alone.
        // The foreign pid is derived from ours so it can never collide
        // with it — a hard-coded literal would make this test fail
        // whenever the test process happens to get that exact pid.
        let foreign_pid = std::process::id().wrapping_add(1);
        std::fs::write(&p, foreign_pid.to_string()).unwrap();
        {
            let _g = SupervisorPidGuard { path: p.clone() };
        }
        assert!(p.exists(), "guard removed a pidfile that didn't name us");
    }

    #[test]
    fn pid_guard_removes_when_pid_matches() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        let our_pid = std::process::id();
        std::fs::write(&p, our_pid.to_string()).unwrap();
        {
            let _g = SupervisorPidGuard { path: p.clone() };
        }
        assert!(!p.exists(), "guard left our own pidfile behind");
    }

    #[test]
    fn existing_daemon_for_session_returns_false_when_pidfile_missing() {
        let tmp = tempdir().unwrap();
        // home_dir has no state/wire/daemon.pid
        assert!(!existing_daemon_for_session(tmp.path()).unwrap());
    }

    #[test]
    fn existing_daemon_for_session_returns_false_for_dead_pid() {
        let tmp = tempdir().unwrap();
        let state = tmp.path().join("state").join("wire");
        std::fs::create_dir_all(&state).unwrap();
        // pid 999999 is almost certainly not running.
        std::fs::write(state.join("daemon.pid"), "999999").unwrap();
        assert!(!existing_daemon_for_session(tmp.path()).unwrap());
    }

    #[test]
    fn existing_daemon_for_session_returns_true_for_self_pid() {
        let tmp = tempdir().unwrap();
        let state = tmp.path().join("state").join("wire");
        std::fs::create_dir_all(&state).unwrap();
        std::fs::write(state.join("daemon.pid"), std::process::id().to_string()).unwrap();
        assert!(existing_daemon_for_session(tmp.path()).unwrap());
    }

    // ---- supervisor eligibility filter (the 147-home fork-storm fix) ----

    /// Minimal `SessionInfo` fixture: only `name`, `cwd` and `home_dir`
    /// matter to the eligibility filter.
    fn mk_session(name: &str, cwd: Option<&str>) -> crate::session::SessionInfo {
        crate::session::SessionInfo {
            name: name.to_string(),
            cwd: cwd.map(String::from),
            home_dir: PathBuf::from(format!("/sessions/{name}")),
            did: None,
            handle: None,
            daemon_running: false,
            character: None,
        }
    }

    #[test]
    fn parse_max_idle_default_when_unset() {
        assert_eq!(
            parse_max_idle(None),
            Some(Duration::from_secs(DEFAULT_MAX_IDLE_DAYS * 86_400))
        );
    }

    #[test]
    fn parse_max_idle_zero_disables_filter() {
        assert_eq!(parse_max_idle(Some("0")), None);
    }

    #[test]
    fn parse_max_idle_explicit_days() {
        assert_eq!(
            parse_max_idle(Some("3")),
            Some(Duration::from_secs(3 * 86_400))
        );
        assert_eq!(
            parse_max_idle(Some(" 14 ")),
            Some(Duration::from_secs(14 * 86_400))
        );
    }

    #[test]
    fn parse_max_idle_garbage_falls_back_to_default() {
        assert_eq!(
            parse_max_idle(Some("not-a-number")),
            Some(Duration::from_secs(DEFAULT_MAX_IDLE_DAYS * 86_400))
        );
    }

    #[test]
    fn eligible_keeps_cwd_bound_even_when_ancient() {
        // A registry-bound session is kept no matter how idle — the
        // operator deliberately attached it to a project dir. (This is
        // the real-world case: the cwd-bound `wire`/`slancha-*` sessions
        // were the *oldest* on the box, yet must survive.)
        let now = SystemTime::now();
        let ancient = now - Duration::from_secs(365 * 86_400);
        let sessions = vec![mk_session("wire", Some("/Users/p/Source/wire"))];
        let out = supervisor_eligible(sessions, Some(Duration::from_secs(7 * 86_400)), now, |_| {
            Some(ancient)
        });
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].name, "wire");
    }

    #[test]
    fn eligible_keeps_unbound_recent_drops_unbound_idle() {
        // The live-but-unbound persona sessions (each Claude tab) are
        // recent → kept. The abandoned ones are idle → dropped.
        let now = SystemTime::now();
        let recent = now - Duration::from_secs(2 * 86_400);
        let stale = now - Duration::from_secs(30 * 86_400);
        let sessions = vec![
            mk_session("rosy-rook", None),    // live tab
            mk_session("agate-nimbus", None), // abandoned
        ];
        let out = supervisor_eligible(
            sessions,
            Some(Duration::from_secs(7 * 86_400)),
            now,
            |home| {
                if home.ends_with("rosy-rook") {
                    Some(recent)
                } else {
                    Some(stale)
                }
            },
        );
        let names: Vec<_> = out.iter().map(|s| s.name.as_str()).collect();
        assert_eq!(names, vec!["rosy-rook"]);
    }

    #[test]
    fn eligible_drops_unbound_with_no_activity_signal() {
        // A never-synced husk (no activity files at all) and no cwd →
        // dropped: nothing says it's a session anyone is using.
        let now = SystemTime::now();
        let sessions = vec![mk_session("husk", None)];
        let out = supervisor_eligible(sessions, Some(Duration::from_secs(7 * 86_400)), now, |_| {
            None
        });
        assert!(out.is_empty());
    }

    #[test]
    fn eligible_none_cutoff_keeps_everything() {
        // Override = 0 (max_idle None) restores legacy spawn-for-all.
        let now = SystemTime::now();
        let ancient = now - Duration::from_secs(999 * 86_400);
        let sessions = vec![mk_session("husk", None), mk_session("agate-nimbus", None)];
        let out = supervisor_eligible(sessions, None, now, |_| Some(ancient));
        assert_eq!(out.len(), 2);
    }
}
900}