Skip to main content

wire/
daemon_supervisor.rs

1//! `wire daemon --all-sessions` — multi-session supervisor.
2//!
3//! ## Why
4//!
5//! honey-pine's 2026-06-01 dogfood (#162) surfaced a launchd-vs-session
6//! isolation gap: the `sh.slancha.wire.daemon` launchd unit invokes
7//! `wire daemon --interval 5` with **no cwd context**. With WIRE_HOME
8//! unset, the daemon resolves to the *default* session WIRE_HOME and
9//! silently skips every other initialized session. Operators with
10//! multiple per-project sessions (slancha-mesh, wire, etc.) saw their
11//! shell `wire status` report `running:false` even with the launchd
12//! daemon perfectly alive — same daemon, different state tree.
13//!
14//! Her working remedy was `launchctl bootout` + `nohup wire daemon`
15//! from the project cwd. That works for one session but doesn't scale
16//! to N. The architectural fix is a supervisor that owns the
17//! multi-session orchestration: one supervisor process per launchd
18//!   unit, N child `wire daemon` processes (one per session) — each with
19//! its own pinned `WIRE_HOME` and its own pidfile under that session's
20//! state dir. `wire status` from any cwd then sees its session's child
21//! pid and reports truthfully.
22//!
23//! ## Model
24//!
25//! - **Fork-exec, not threads.** Each session's daemon needs its own
26//!   `WIRE_HOME`. We set it via the child process env so the daemon
27//!   code path stays unchanged. Threads would mean global mutable
28//!   `WIRE_HOME` and cross-session races.
29//! - **Idempotent spawn.** Before spawning a child for session S,
30//!   check `daemon_singleton_holder()` on that session's home. If a
31//!   live daemon already exists (operator ran `wire daemon` directly
32//!   in S's cwd, or supervisor restarted and the old child is still
33//!   alive), leave it alone.
34//! - **Reap via polling, not SIGCHLD.** macOS launchd-supervised
35//!   processes already get SIGCHLD overhead; `try_wait` polling on a
36//!   short interval is simpler and bug-free across platforms.
37//! - **Backoff on rapid failure.** A child that exits within 10s of
38//!   spawn doubles its respawn delay (1s → 60s cap). Prevents a broken
39//!   session (corrupt key, missing relay) from fork-bombing.
40//! - **Don't exit on zero sessions.** Sleep and re-poll the registry —
41//!   new sessions get picked up without supervisor restart.
42//! - **Adopt orphaned children on supervisor restart.** When launchd
43//!   relaunches the supervisor, the previous supervisor's children
44//!   keep running (correct: they're still syncing). New supervisor
45//!   sees their pidfiles, skips re-spawning, and lets them keep going
46//!   until their next natural exit (then it spawns a fresh child).
47//!
48//! ## Invariants
49//!
50//! - One supervisor per launchd unit per machine. Singleton guard on
51//!   `sessions_root()/supervisor.pid` (separate from per-session
52//!   daemon pidfiles).
53//! - Child env contains exactly one wire-relevant variable:
54//!   `WIRE_HOME=<session-home>`. Any other inherited WIRE_* vars are
55//!   stripped so the operator's shell config doesn't leak in.
56//! - Per-session daemon code is *unchanged* — supervisor is a pure
57//!   orchestrator.
58
59use std::collections::HashMap;
60use std::path::{Path, PathBuf};
61use std::process::{Child, Command};
62use std::time::{Duration, Instant, SystemTime};
63
64use anyhow::{Context, Result};
65use serde_json::json;
66
/// How often the supervisor re-reads the session registry. Tradeoff: a
/// new session created at `wire session new` waits up to this many
/// seconds before its daemon comes up. 10s strikes a balance — fast
/// enough that operators don't notice, slow enough that registry
/// fork-execs don't dominate.
///
/// This is also the only sleep in the supervisor's main loop, so it
/// bounds how quickly an exited child is reaped, and a respawn backoff
/// shorter than this is only honoured at the next poll.
const REGISTRY_POLL_SECS: u64 = 10;
73
/// Initial respawn delay after a child exits unexpectedly. Doubles on
/// each rapid failure (exit within `RAPID_FAIL_WINDOW`) up to
/// `MAX_BACKOFF`. An exit that is *not* rapid resets the session's
/// delay back to this value.
const INITIAL_BACKOFF: Duration = Duration::from_secs(1);
/// Upper bound on a session's respawn delay; doubling stops here.
const MAX_BACKOFF: Duration = Duration::from_secs(60);
/// A child that exits sooner than this after being spawned counts as a
/// rapid failure and grows the session's backoff.
const RAPID_FAIL_WINDOW: Duration = Duration::from_secs(10);
80
/// Default idle cutoff for registry-unbound sessions. `list_sessions()`
/// enumerates *every* session home ever minted on the machine — and
/// because each Claude tab / `wire session new` mints a fresh persona
/// home, a long-lived box accumulates hundreds (honey-pine's had 147).
/// Spawning one daemon per home turns `--all-sessions` into a fork
/// storm. A session is kept regardless of age if it has a registry cwd
/// binding (operator deliberately bound it); an *unbound* session is
/// only kept if it has been active within this window. Override via
/// `WIRE_ALL_SESSIONS_MAX_IDLE_DAYS` (0 disables the filter → legacy
/// spawn-for-all behavior).
const DEFAULT_MAX_IDLE_DAYS: u64 = 7;

/// Parse the idle cutoff from its raw (environment) representation.
///
/// * `None` (variable unset) → the default of `DEFAULT_MAX_IDLE_DAYS`.
/// * `"0"` → `None`, i.e. no filter: spawn for every session.
/// * any other non-negative integer (surrounding whitespace ignored) →
///   that many days.
/// * anything unparseable → the default.
///
/// The day count is converted to seconds with a saturating multiply, so
/// an absurdly large value clamps to the maximum representable cutoff
/// instead of overflowing (which would panic in debug builds and wrap
/// to a tiny cutoff in release builds).
///
/// Pure, so it's unit-testable without mutating process env.
fn parse_max_idle(raw: Option<&str>) -> Option<Duration> {
    const SECS_PER_DAY: u64 = 86_400;
    let days = raw
        .and_then(|v| v.trim().parse::<u64>().ok())
        .unwrap_or(DEFAULT_MAX_IDLE_DAYS);
    (days != 0).then(|| Duration::from_secs(days.saturating_mul(SECS_PER_DAY)))
}
106
107/// Read the idle cutoff from the environment. `None` means "no idle
108/// filter" (spawn a daemon for every session — pre-fix behavior),
109/// selected by setting `WIRE_ALL_SESSIONS_MAX_IDLE_DAYS=0`.
110fn max_idle_from_env() -> Option<Duration> {
111    parse_max_idle(
112        std::env::var("WIRE_ALL_SESSIONS_MAX_IDLE_DAYS")
113            .ok()
114            .as_deref(),
115    )
116}
117
/// Most recent modification time across a session home's activity
/// files, used by the supervisor as its "last actually *synced*"
/// signal. The files are looked up under `<home>/state/wire/` (the same
/// subtree `existing_daemon_for_session` reads), not the home root.
/// Per the daemon's contract, `last_sync.json` is rewritten on each
/// successful relay cycle and the cursors advance on inbox/reactor
/// activity. A file that is missing, or whose mtime cannot be read, is
/// skipped; if none yields an mtime the result is `None` (a husk that
/// has never synced).
///
/// `daemon.pid` is intentionally not consulted: it is written on
/// *spawn*, so counting it would make eligibility self-perpetuating —
/// the supervisor spawns a daemon, the pidfile refreshes, and the
/// session would never age out even if it never syncs anything.
fn fs_last_active(home: &Path) -> Option<SystemTime> {
    const ACTIVITY_FILES: [&str; 3] = ["last_sync.json", "notify.cursor", "reactor.cursor"];

    let state_dir = home.join("state").join("wire");
    let mut newest: Option<SystemTime> = None;
    for file in ACTIVITY_FILES {
        let Ok(meta) = std::fs::metadata(state_dir.join(file)) else {
            continue;
        };
        let Ok(mtime) = meta.modified() else {
            continue;
        };
        match newest {
            Some(seen) if seen >= mtime => {}
            _ => newest = Some(mtime),
        }
    }
    newest
}
138
139/// Filter `list_sessions()` down to the sessions the supervisor should
140/// own a daemon for. A session is eligible iff it has a registry cwd
141/// binding OR it was active within `max_idle`. `max_idle == None`
142/// disables the filter (every session eligible). Pure: the activity
143/// probe is injected so this is unit-testable without touching disk.
144fn supervisor_eligible<F>(
145    sessions: Vec<crate::session::SessionInfo>,
146    max_idle: Option<Duration>,
147    now: SystemTime,
148    last_active: F,
149) -> Vec<crate::session::SessionInfo>
150where
151    F: Fn(&Path) -> Option<SystemTime>,
152{
153    let Some(max_idle) = max_idle else {
154        return sessions;
155    };
156    sessions
157        .into_iter()
158        .filter(|s| {
159            if s.cwd.is_some() {
160                return true;
161            }
162            match last_active(&s.home_dir) {
163                // `duration_since` errors when the file mtime is in the
164                // future (clock skew) — treat that as "active now".
165                Some(t) => now.duration_since(t).map(|d| d <= max_idle).unwrap_or(true),
166                None => false,
167            }
168        })
169        .collect()
170}
171
/// State the supervisor tracks per session it has spawned a child for.
/// Held in `run_supervisor`'s `children` map, keyed by session name.
struct ChildState {
    /// Handle to the spawned `wire daemon` process; polled with
    /// `try_wait` each loop and killed when its session is no longer
    /// wanted.
    child: Child,
    /// When the child was spawned. Its elapsed time at exit is compared
    /// against `RAPID_FAIL_WINDOW` to decide whether to grow the backoff.
    spawned_at: Instant,
}
177
178/// Entrypoint for `wire daemon --all-sessions`. Loops forever; only
179/// returns Err on a setup error (e.g. cannot resolve sessions_root).
180pub fn run_supervisor(interval_secs: u64, as_json: bool) -> Result<()> {
181    // Supervisor singleton — one per machine. Separate pidfile from the
182    // per-session daemon pidfile so the two layers can't collide.
183    let pid_path = supervisor_pid_path()?;
184    if let Some(existing) = read_alive_supervisor_pid(&pid_path)? {
185        let msg = json!({
186            "status": "skipped",
187            "reason": "supervisor already running",
188            "holder_pid": existing,
189        });
190        if as_json {
191            println!("{msg}");
192        } else {
193            eprintln!(
194                "wire daemon --all-sessions: another supervisor is already running (pid {existing}); not starting a second one."
195            );
196        }
197        return Ok(());
198    }
199    write_supervisor_pid(&pid_path)?;
200    let _cleanup = SupervisorPidGuard {
201        path: pid_path.clone(),
202    };
203
204    if !as_json {
205        eprintln!(
206            "wire daemon --all-sessions: supervisor up. interval={interval_secs}s, registry-poll={REGISTRY_POLL_SECS}s. SIGINT to stop."
207        );
208    } else {
209        println!(
210            "{}",
211            json!({
212                "status": "supervisor_started",
213                "interval_secs": interval_secs,
214                "registry_poll_secs": REGISTRY_POLL_SECS,
215            })
216        );
217    }
218
219    // Idle cutoff for registry-unbound sessions — read once at startup
220    // (env doesn't change under a running supervisor).
221    let max_idle = max_idle_from_env();
222    eprintln!(
223        "supervisor: idle cutoff for unbound sessions = {}",
224        match max_idle {
225            Some(d) => format!("{} days", d.as_secs() / 86_400),
226            None => "disabled (spawn-for-all)".to_string(),
227        }
228    );
229
230    let mut children: HashMap<String, ChildState> = HashMap::new();
231    // Per-session backoff that survives a child's reap → respawn → reap
232    // cycle. Distinguishes "session crashes hard repeatedly" from
233    // "child exited cleanly and we're spawning a fresh one".
234    let mut session_last_exit: HashMap<String, Instant> = HashMap::new();
235    let mut session_backoff: HashMap<String, Duration> = HashMap::new();
236
237    loop {
238        // 1. Reap any exited children. Detect rapid failures + update
239        //    per-session backoff so the next spawn waits.
240        let mut exited: Vec<String> = Vec::new();
241        for (name, state) in children.iter_mut() {
242            if let Ok(Some(status)) = state.child.try_wait() {
243                let lived = state.spawned_at.elapsed();
244                let rapid = lived < RAPID_FAIL_WINDOW;
245                eprintln!(
246                    "supervisor: child '{name}' exited (status={status:?}, lived={}s, rapid={rapid})",
247                    lived.as_secs()
248                );
249                let next_backoff = if rapid {
250                    let prev = session_backoff
251                        .get(name)
252                        .copied()
253                        .unwrap_or(INITIAL_BACKOFF);
254                    (prev * 2).min(MAX_BACKOFF)
255                } else {
256                    INITIAL_BACKOFF
257                };
258                session_backoff.insert(name.clone(), next_backoff);
259                session_last_exit.insert(name.clone(), Instant::now());
260                exited.push(name.clone());
261            }
262        }
263        for n in exited {
264            children.remove(&n);
265        }
266
267        // 2. Read registry, identify wanted sessions. Filter out
268        //    registry-unbound sessions that have been idle past the
269        //    cutoff so the supervisor doesn't fan out a daemon per
270        //    every ephemeral persona home (the 147-home fork storm).
271        let all_sessions = crate::session::list_sessions().unwrap_or_default();
272        let total_sessions = all_sessions.len();
273        let wanted: Vec<crate::session::SessionInfo> =
274            supervisor_eligible(all_sessions, max_idle, SystemTime::now(), fs_last_active);
275        if wanted.len() != total_sessions {
276            eprintln!(
277                "supervisor: {} of {} sessions eligible (skipped {} registry-unbound + idle > cutoff)",
278                wanted.len(),
279                total_sessions,
280                total_sessions - wanted.len()
281            );
282        }
283
284        // 3. Kill children whose session has been removed from the
285        //    registry since last poll. (Operator ran `wire session
286        //    forget` or similar.)
287        let wanted_names: std::collections::HashSet<String> =
288            wanted.iter().map(|s| s.name.clone()).collect();
289        let to_kill: Vec<String> = children
290            .keys()
291            .filter(|n| !wanted_names.contains(n.as_str()))
292            .cloned()
293            .collect();
294        for name in to_kill {
295            if let Some(mut state) = children.remove(&name) {
296                eprintln!("supervisor: session '{name}' gone from registry; terminating its child");
297                let _ = state.child.kill();
298                let _ = state.child.wait();
299            }
300        }
301
302        // 4. Spawn missing children, respecting backoff + existing
303        //    pidfiles (operator-spawned daemons coexist).
304        for info in wanted {
305            if info.did.is_none() {
306                continue;
307            }
308            if children.contains_key(&info.name) {
309                continue;
310            }
311            // Backoff gate: if this session is in a rapid-fail loop,
312            // wait the remaining backoff before respawning.
313            if let Some(last_exit) = session_last_exit.get(&info.name) {
314                let wait = session_backoff
315                    .get(&info.name)
316                    .copied()
317                    .unwrap_or(INITIAL_BACKOFF);
318                if last_exit.elapsed() < wait {
319                    continue;
320                }
321            }
322            // Singleton check: an operator-spawned `wire daemon` may
323            // already own this session. Leave it alone — re-checking
324            // next poll is cheap.
325            if existing_daemon_for_session(&info.home_dir)? {
326                continue;
327            }
328            match spawn_child_for_session(&info.name, &info.home_dir, interval_secs) {
329                Ok(child) => {
330                    eprintln!(
331                        "supervisor: spawned child for session '{}' (pid {})",
332                        info.name,
333                        child.id()
334                    );
335                    children.insert(
336                        info.name.clone(),
337                        ChildState {
338                            child,
339                            spawned_at: Instant::now(),
340                        },
341                    );
342                }
343                Err(e) => {
344                    eprintln!(
345                        "supervisor: spawn failed for session '{}': {e:#}",
346                        info.name
347                    );
348                    // Treat spawn failure as a rapid failure so the
349                    // backoff curve kicks in.
350                    let prev = session_backoff
351                        .get(&info.name)
352                        .copied()
353                        .unwrap_or(INITIAL_BACKOFF);
354                    session_backoff.insert(info.name.clone(), (prev * 2).min(MAX_BACKOFF));
355                    session_last_exit.insert(info.name.clone(), Instant::now());
356                }
357            }
358        }
359
360        std::thread::sleep(Duration::from_secs(REGISTRY_POLL_SECS));
361    }
362}
363
364/// Spawn `wire daemon --interval <i>` as a child with `WIRE_HOME`
365/// pinned via env. Strips inherited WIRE_* env so the operator's
366/// shell config (test overrides like `WIRE_DAEMON_NO_SINGLETON=1`)
367/// can't leak in.
368///
369/// v0.14.2 #170 hotfix: the original implementation also passed
370/// `--session <character-name>` as a belt-and-suspenders check.
371/// That broke 127 of 133 sessions on a real multi-session box —
372/// `cmd_daemon`'s `--session` handler calls
373/// `session::session_dir(name)` which resolves
374/// `sessions_root/<name>`, correct for v0.6 top-level layout but
375/// WRONG for v0.13's `by-key/<hash>` layout where the character
376/// name is *derived* from the card DID, not the directory name.
377/// Children bailed → supervisor fork-bombed (10s poll × 60s
378/// backoff × 127 failing sessions). WIRE_HOME env alone is the
379/// correct contract: every daemon code path flows through
380/// `state_dir()` / `config_dir()` which honor it. No second
381/// source of truth.
382fn spawn_child_for_session(
383    name: &str,
384    home_dir: &std::path::Path,
385    interval_secs: u64,
386) -> Result<Child> {
387    let exe = std::env::current_exe().context("resolving current exe for child fork")?;
388    let mut cmd = Command::new(&exe);
389    cmd.args(["daemon", "--interval", &interval_secs.to_string()]);
390    // Strip WIRE_* env so operator shell-vars don't leak into the
391    // child. Then pin WIRE_HOME exactly.
392    let leaks: Vec<String> = std::env::vars()
393        .filter(|(k, _)| k.starts_with("WIRE_"))
394        .map(|(k, _)| k)
395        .collect();
396    for k in leaks {
397        cmd.env_remove(&k);
398    }
399    cmd.env("WIRE_HOME", home_dir);
400    // Children inherit stdout/stderr → land in the launchd log file
401    // (StandardOutPath in the plist). Operators see "supervisor:
402    // spawned ..." lines interleaved with each session's daemon log.
403    cmd.spawn().with_context(|| {
404        format!(
405            "fork-exec `wire daemon` for session '{name}' (binary {} WIRE_HOME={})",
406            exe.display(),
407            home_dir.display()
408        )
409    })
410}
411
412/// True iff this session's `daemon.pid` names a live process. Used by
413/// the supervisor to coexist with operator-spawned `wire daemon`
414/// invocations: if the operator already started one in a tmux pane,
415/// we skip the spawn and let theirs own the cursor.
416fn existing_daemon_for_session(home_dir: &std::path::Path) -> Result<bool> {
417    let pid_path = home_dir.join("state").join("wire").join("daemon.pid");
418    if !pid_path.exists() {
419        return Ok(false);
420    }
421    let body = match std::fs::read_to_string(&pid_path) {
422        Ok(b) => b,
423        Err(_) => return Ok(false),
424    };
425    // Pidfile is either JSON `{"pid": <n>, ...}` (v0.5.11+) or a bare
426    // integer (legacy). Try JSON+pid-field first; if that yields
427    // None (parse failed OR JSON had no pid field, e.g. a bare
428    // integer body parses as JSON number with no `.pid`), fall
429    // through to the bare-int path.
430    let pid = serde_json::from_str::<serde_json::Value>(&body)
431        .ok()
432        .and_then(|v| v.get("pid").and_then(serde_json::Value::as_u64))
433        .or_else(|| body.trim().parse::<u64>().ok());
434    Ok(pid
435        .map(|p| crate::ensure_up::pid_is_alive(p as u32))
436        .unwrap_or(false))
437}
438
/// Read-only snapshot of the supervisor's current topology — supervisor
/// liveness + per-session daemon liveness + orphan pids the supervisor
/// is not currently managing. Used by `wire supervisor` (the CLI
/// counterpart to single-session `wire status`) so operators can ask
/// "what is the multi-session supervisor doing?" in one command
/// instead of cross-referencing `pgrep` against per-session pidfiles
/// by hand. Built by `read_supervisor_state`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SupervisorState {
    /// Pid the `supervisor.pid` file names; None if the file is
    /// missing, unreadable, or does not parse as an integer.
    pub supervisor_pid: Option<u32>,
    /// True iff that pid is currently a live process.
    pub supervisor_alive: bool,
    /// Per-session liveness across every initialized session, in
    /// `list_sessions()` order.
    pub sessions: Vec<SupervisedSession>,
    /// `wire daemon` pids found via cmdline-scan that are NOT mapped
    /// to any session's pidfile AND are not the supervisor itself.
    /// Could be legacy operator-spawned daemons, leftover children
    /// from a crashed prior supervisor, or daemons serving the
    /// default WIRE_HOME (no `--all-sessions`). Operators see them
    /// here so they can decide whether to kill. Sorted ascending.
    pub unmanaged_pids: Vec<u32>,
    /// v0.14.2: session names whose live daemon's recorded
    /// `pidfile.version` is older than this CLI's own
    /// `CARGO_PKG_VERSION`. The supervisor's existing-pidfile check
    /// skips alive daemons regardless of their binary version, so
    /// stale-binary daemons persist until they exit. Surfaced for
    /// operator visibility — they can `pkill -TERM <pid>` or use a
    /// future `wire upgrade --refresh-stale-children` to force the
    /// supervisor to respawn them on the current binary.
    pub stale_binary_sessions: Vec<String>,
}
472
/// One session as seen by the supervisor.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SupervisedSession {
    /// Session name (`info.name` from `session::list_sessions`).
    pub name: String,
    /// `home_dir` filesystem path (lossily converted to UTF-8).
    pub home_dir: String,
    /// Pid the session's `daemon.pid` records; None if file missing.
    pub daemon_pid: Option<u32>,
    /// True iff that pid is currently a live process.
    pub daemon_alive: bool,
    /// Seconds since the session's daemon last completed a sync
    /// cycle (read from the `ts` field of `last_sync.json`); None if
    /// never recorded or the file cannot be read or parsed. A
    /// timestamp in the future (clock skew) is reported as 0.
    pub last_sync_age_seconds: Option<u64>,
    /// Version string the running daemon recorded when it wrote its
    /// pidfile (`PidRecord::Json.version`). None for legacy-int
    /// pidfiles. Surfaced so operators can spot version drift across
    /// the supervisor fleet — the supervisor's pre-spawn
    /// existing-pidfile check skips alive daemons regardless of
    /// their binary version, so a daemon spawned on v0.13.x and
    /// still running after the supervisor was bounced to v0.14.x
    /// keeps the old binary in memory until it exits.
    /// Omitted from the serialized output when None.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub daemon_version: Option<String>,
}
498
499/// Build a `SupervisorState` snapshot. Pure read; no fork / no
500/// pidfile mutation. Best-effort on every component (filesystem
501/// errors yield None / empty rather than failing the whole call).
502pub fn read_supervisor_state() -> Result<SupervisorState> {
503    let pid_path = supervisor_pid_path()?;
504    let supervisor_pid = read_supervisor_pid(&pid_path);
505    let supervisor_alive = supervisor_pid
506        .map(crate::ensure_up::pid_is_alive)
507        .unwrap_or(false);
508
509    // Per-session liveness — walk list_sessions, read each home's
510    // pidfile + last_sync.
511    let sessions: Vec<SupervisedSession> = crate::session::list_sessions()
512        .unwrap_or_default()
513        .into_iter()
514        .map(|info| {
515            let daemon_pid = crate::session::session_daemon_pid(&info.home_dir);
516            let daemon_alive = daemon_pid
517                .map(crate::ensure_up::pid_is_alive)
518                .unwrap_or(false);
519            // last_sync.json lives under <home>/state/wire/last_sync.json.
520            let last_sync_age_seconds = read_session_last_sync_age(&info.home_dir);
521            // v0.14.2: read the daemon-recorded version from the JSON
522            // pidfile. Legacy bare-integer pidfiles return None
523            // (can't surface a version we don't have).
524            let daemon_version = read_session_pidfile_version(&info.home_dir);
525            SupervisedSession {
526                name: info.name,
527                home_dir: info.home_dir.to_string_lossy().into_owned(),
528                daemon_pid,
529                daemon_alive,
530                last_sync_age_seconds,
531                daemon_version,
532            }
533        })
534        .collect();
535
536    // Unmanaged pids: every `wire daemon` cmdline scan hit that isn't
537    // (a) the supervisor itself, (b) any session's pidfile pid.
538    let all_daemon_pids: std::collections::HashSet<u32> =
539        crate::platform::find_processes_by_cmdline("wire daemon")
540            .into_iter()
541            .collect();
542    let known_session_pids: std::collections::HashSet<u32> = sessions
543        .iter()
544        .filter_map(|s| if s.daemon_alive { s.daemon_pid } else { None })
545        .collect();
546    let mut unmanaged_pids: Vec<u32> = all_daemon_pids
547        .into_iter()
548        .filter(|p| Some(*p) != supervisor_pid && !known_session_pids.contains(p))
549        .collect();
550    unmanaged_pids.sort_unstable();
551
552    // v0.14.2: derive the stale-binary set. Compare each live
553    // daemon's recorded version against the running CLI's version.
554    // "Stale" iff alive + has a recorded version + that version is
555    // strictly less than ours by dotted-integer compare (so 0.10.0 >
556    // 0.9.0). Unparseable strings are conservatively "not stale" — a
557    // pre-release suffix like 0.14.2-rc.1 stays unflagged rather than
558    // false-positive against 0.14.2.
559    let our_version = env!("CARGO_PKG_VERSION");
560    let stale_binary_sessions: Vec<String> = sessions
561        .iter()
562        .filter(|s| {
563            s.daemon_alive
564                && s.daemon_version
565                    .as_deref()
566                    .map(|v| version_lt(v, our_version))
567                    .unwrap_or(false)
568        })
569        .map(|s| s.name.clone())
570        .collect();
571
572    Ok(SupervisorState {
573        supervisor_pid,
574        supervisor_alive,
575        sessions,
576        unmanaged_pids,
577        stale_binary_sessions,
578    })
579}
580
/// Is dotted-integer version `a` strictly lower than `b`?
///
/// Each string is split on `.` and every segment must parse as `u32`.
/// The shorter version is padded with trailing zeros, so `0.14` is
/// lower than `0.14.1` and equal to `0.14.0`. If any segment of either
/// side fails to parse, the answer is `false`: we would rather
/// under-flag a pre-release suffix like `0.14.2-rc.1` than report a
/// false positive against a stable peer with the same
/// major.minor.patch.
fn version_lt(a: &str, b: &str) -> bool {
    fn segments(version: &str) -> Option<Vec<u32>> {
        let mut out = Vec::new();
        for part in version.split('.') {
            out.push(part.parse::<u32>().ok()?);
        }
        Some(out)
    }

    let (mut lhs, mut rhs) = match (segments(a), segments(b)) {
        (Some(l), Some(r)) => (l, r),
        _ => return false,
    };
    // Pad to a common width; equal-length vectors then compare
    // element by element, left to right.
    let width = lhs.len().max(rhs.len());
    lhs.resize(width, 0);
    rhs.resize(width, 0);
    lhs < rhs
}
604
605/// Read the daemon-recorded version string from a session's
606/// `<home>/state/wire/daemon.pid` JSON pidfile. Returns None for
607/// legacy bare-integer pidfiles (no version field) and for absent /
608/// unreadable files.
609fn read_session_pidfile_version(home_dir: &std::path::Path) -> Option<String> {
610    let pidfile = home_dir.join("state").join("wire").join("daemon.pid");
611    let body = std::fs::read_to_string(&pidfile).ok()?;
612    let v: serde_json::Value = serde_json::from_str(&body).ok()?;
613    v.get("version")
614        .and_then(serde_json::Value::as_str)
615        .map(str::to_string)
616}
617
/// Parse the pid stored in `supervisor.pid`, with no liveness check —
/// the snapshot builder performs that itself. Kept separate so that a
/// missing file is an ordinary `None` rather than an error. An
/// unreadable file or a body that is not a `u32` (after trimming
/// whitespace) is `None` as well.
fn read_supervisor_pid(path: &std::path::Path) -> Option<u32> {
    path.exists()
        .then(|| std::fs::read_to_string(path))
        .and_then(Result::ok)
        .and_then(|text| text.trim().parse::<u32>().ok())
}
628
629/// Read `<home>/state/wire/last_sync.json`'s timestamp and return
630/// "seconds since now". None on absent / unreadable / unparseable.
631fn read_session_last_sync_age(home_dir: &std::path::Path) -> Option<u64> {
632    let path = home_dir.join("state").join("wire").join("last_sync.json");
633    let body = std::fs::read_to_string(&path).ok()?;
634    let v: serde_json::Value = serde_json::from_str(&body).ok()?;
635    let ts = v.get("ts").and_then(serde_json::Value::as_str)?;
636    let parsed =
637        time::OffsetDateTime::parse(ts, &time::format_description::well_known::Rfc3339).ok()?;
638    let age = (time::OffsetDateTime::now_utc() - parsed).whole_seconds();
639    if age < 0 {
640        // Clock skew: timestamp is in the future. Treat as fresh.
641        Some(0)
642    } else {
643        Some(age as u64)
644    }
645}
646
647fn supervisor_pid_path() -> Result<PathBuf> {
648    let root = crate::session::sessions_root()
649        .context("resolving sessions_root for supervisor pidfile")?;
650    std::fs::create_dir_all(&root).with_context(|| format!("creating {root:?}"))?;
651    Ok(root.join("supervisor.pid"))
652}
653
654fn read_alive_supervisor_pid(path: &std::path::Path) -> Result<Option<u32>> {
655    if !path.exists() {
656        return Ok(None);
657    }
658    let body = std::fs::read_to_string(path).ok();
659    let pid = body.as_deref().and_then(|s| s.trim().parse::<u32>().ok());
660    match pid {
661        Some(p) if crate::ensure_up::pid_is_alive(p) => Ok(Some(p)),
662        _ => Ok(None),
663    }
664}
665
666fn write_supervisor_pid(path: &std::path::Path) -> Result<()> {
667    let pid = std::process::id();
668    std::fs::write(path, pid.to_string())
669        .with_context(|| format!("writing supervisor pidfile {path:?}"))?;
670    Ok(())
671}
672
/// Scope guard that deletes the supervisor pidfile when dropped.
struct SupervisorPidGuard {
    /// Pidfile to clean up.
    path: PathBuf,
}

impl Drop for SupervisorPidGuard {
    /// Delete the pidfile, but only while it still records our own pid
    /// — same pattern as DaemonPidGuard in ensure_up.rs. A file that
    /// is missing, unreadable, unparseable, or names another process is
    /// left untouched, and a failed removal is ignored.
    fn drop(&mut self) {
        let recorded = std::fs::read_to_string(&self.path)
            .ok()
            .and_then(|text| text.trim().parse::<u32>().ok());
        if recorded == Some(std::process::id()) {
            let _ = std::fs::remove_file(&self.path);
        }
    }
}
689
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// A pid that should never name a live process, and therefore never
    /// names this test process either.
    ///
    /// The previous fixtures hard-coded `999999` ("dead") and `12345`
    /// ("foreign"). Both are legal, allocatable pids on Linux hosts whose
    /// `kernel.pid_max` is raised (the ceiling is 2^22), so the tests could
    /// fail spuriously. `i32::MAX` sits far above that ceiling while still
    /// being a positive value when viewed as a signed pid.
    /// NOTE(review): assumes the liveness probes treat the pid as a plain
    /// process id on the supported platforms — confirm against
    /// `ensure_up::pid_is_alive`.
    // Lossless cast: `i32::MAX` is non-negative and fits in `u32`.
    const IMPOSSIBLE_PID: u32 = i32::MAX as u32;

    #[test]
    fn version_lt_dotted_integer_compare() {
        // Lexical string-compare footgun cases — these must come out right.
        assert!(version_lt("0.9.0", "0.10.0"));
        assert!(version_lt("0.13.5", "0.14.1"));
        assert!(version_lt("0.14.0", "0.14.1"));
        // Equal / greater → not stale.
        assert!(!version_lt("0.14.1", "0.14.1"));
        assert!(!version_lt("0.14.2", "0.14.1"));
        // Shorter version pads with zero.
        assert!(version_lt("0.14", "0.14.1"));
        assert!(!version_lt("0.14.1", "0.14"));
        // Unparseable (pre-release suffix, garbage) is conservatively NOT-stale
        // — under-flagging beats false-positive on `0.14.2-rc.1` vs `0.14.2`.
        assert!(!version_lt("0.14.2-rc.1", "0.14.2"));
        assert!(!version_lt("garbage", "0.14.1"));
        assert!(!version_lt("0.14.1", "garbage"));
    }

    #[test]
    fn read_alive_supervisor_pid_returns_none_when_missing() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        assert_eq!(read_alive_supervisor_pid(&p).unwrap(), None);
    }

    #[test]
    fn read_alive_supervisor_pid_returns_none_for_dead_pid() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        // Use a pid that cannot be allocated rather than one that is
        // merely unlikely to be running.
        std::fs::write(&p, IMPOSSIBLE_PID.to_string()).unwrap();
        assert_eq!(read_alive_supervisor_pid(&p).unwrap(), None);
    }

    #[test]
    fn read_alive_supervisor_pid_returns_pid_for_self() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        let our_pid = std::process::id();
        std::fs::write(&p, our_pid.to_string()).unwrap();
        assert_eq!(read_alive_supervisor_pid(&p).unwrap(), Some(our_pid));
    }

    #[test]
    fn pid_guard_only_removes_when_pid_still_matches() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        // Write a foreign pid into the file, then drop a guard for
        // our pid. The guard should leave the foreign pidfile alone.
        // The foreign value must be guaranteed to differ from our own
        // pid, otherwise the guard would (correctly) delete the file.
        std::fs::write(&p, IMPOSSIBLE_PID.to_string()).unwrap();
        {
            let _g = SupervisorPidGuard { path: p.clone() };
        }
        assert!(p.exists(), "guard removed a pidfile that didn't name us");
    }

    #[test]
    fn pid_guard_removes_when_pid_matches() {
        let tmp = tempdir().unwrap();
        let p = tmp.path().join("supervisor.pid");
        let our_pid = std::process::id();
        std::fs::write(&p, our_pid.to_string()).unwrap();
        {
            let _g = SupervisorPidGuard { path: p.clone() };
        }
        assert!(!p.exists(), "guard left our own pidfile behind");
    }

    #[test]
    fn existing_daemon_for_session_returns_false_when_pidfile_missing() {
        let tmp = tempdir().unwrap();
        // home_dir has no state/wire/daemon.pid
        assert!(!existing_daemon_for_session(tmp.path()).unwrap());
    }

    #[test]
    fn existing_daemon_for_session_returns_false_for_dead_pid() {
        let tmp = tempdir().unwrap();
        let state = tmp.path().join("state").join("wire");
        std::fs::create_dir_all(&state).unwrap();
        // Unallocatable pid, so the "dead" precondition always holds.
        std::fs::write(state.join("daemon.pid"), IMPOSSIBLE_PID.to_string()).unwrap();
        assert!(!existing_daemon_for_session(tmp.path()).unwrap());
    }

    #[test]
    fn existing_daemon_for_session_returns_true_for_self_pid() {
        let tmp = tempdir().unwrap();
        let state = tmp.path().join("state").join("wire");
        std::fs::create_dir_all(&state).unwrap();
        std::fs::write(state.join("daemon.pid"), std::process::id().to_string()).unwrap();
        assert!(existing_daemon_for_session(tmp.path()).unwrap());
    }

    // ---- supervisor eligibility filter (the 147-home fork-storm fix) ----

    /// Builds a minimal `SessionInfo` fixture: only `name` and `cwd` vary,
    /// `home_dir` is derived as `/sessions/<name>`, everything else is
    /// empty / false.
    fn mk_session(name: &str, cwd: Option<&str>) -> crate::session::SessionInfo {
        crate::session::SessionInfo {
            name: name.to_string(),
            cwd: cwd.map(String::from),
            home_dir: PathBuf::from(format!("/sessions/{name}")),
            did: None,
            handle: None,
            daemon_running: false,
            character: None,
        }
    }

    #[test]
    fn parse_max_idle_default_when_unset() {
        assert_eq!(
            parse_max_idle(None),
            Some(Duration::from_secs(DEFAULT_MAX_IDLE_DAYS * 86_400))
        );
    }

    #[test]
    fn parse_max_idle_zero_disables_filter() {
        assert_eq!(parse_max_idle(Some("0")), None);
    }

    #[test]
    fn parse_max_idle_explicit_days() {
        assert_eq!(
            parse_max_idle(Some("3")),
            Some(Duration::from_secs(3 * 86_400))
        );
        // Surrounding whitespace is tolerated.
        assert_eq!(
            parse_max_idle(Some("  14 ")),
            Some(Duration::from_secs(14 * 86_400))
        );
    }

    #[test]
    fn parse_max_idle_garbage_falls_back_to_default() {
        assert_eq!(
            parse_max_idle(Some("not-a-number")),
            Some(Duration::from_secs(DEFAULT_MAX_IDLE_DAYS * 86_400))
        );
    }

    #[test]
    fn eligible_keeps_cwd_bound_even_when_ancient() {
        // A registry-bound session is kept no matter how idle — the
        // operator deliberately attached it to a project dir. (This is
        // the real-world case: the cwd-bound `wire`/`slancha-*` sessions
        // were the *oldest* on the box, yet must survive.)
        let now = SystemTime::now();
        let ancient = now - Duration::from_secs(365 * 86_400);
        let sessions = vec![mk_session("wire", Some("/Users/p/Source/wire"))];
        let out = supervisor_eligible(sessions, Some(Duration::from_secs(7 * 86_400)), now, |_| {
            Some(ancient)
        });
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].name, "wire");
    }

    #[test]
    fn eligible_keeps_unbound_recent_drops_unbound_idle() {
        // The live-but-unbound persona sessions (each Claude tab) are
        // recent → kept. The abandoned ones are idle → dropped.
        let now = SystemTime::now();
        let recent = now - Duration::from_secs(2 * 86_400);
        let stale = now - Duration::from_secs(30 * 86_400);
        let sessions = vec![
            mk_session("rosy-rook", None),    // live tab
            mk_session("agate-nimbus", None), // abandoned
        ];
        let out = supervisor_eligible(
            sessions,
            Some(Duration::from_secs(7 * 86_400)),
            now,
            |home| {
                if home.ends_with("rosy-rook") {
                    Some(recent)
                } else {
                    Some(stale)
                }
            },
        );
        let names: Vec<_> = out.iter().map(|s| s.name.as_str()).collect();
        assert_eq!(names, vec!["rosy-rook"]);
    }

    #[test]
    fn eligible_drops_unbound_with_no_activity_signal() {
        // A never-synced husk (no activity files at all) and no cwd →
        // dropped: nothing says it's a session anyone is using.
        let now = SystemTime::now();
        let sessions = vec![mk_session("husk", None)];
        let out = supervisor_eligible(sessions, Some(Duration::from_secs(7 * 86_400)), now, |_| {
            None
        });
        assert!(out.is_empty());
    }

    #[test]
    fn eligible_none_cutoff_keeps_everything() {
        // Override = 0 (max_idle None) restores legacy spawn-for-all.
        let now = SystemTime::now();
        let ancient = now - Duration::from_secs(999 * 86_400);
        let sessions = vec![mk_session("husk", None), mk_session("agate-nimbus", None)];
        let out = supervisor_eligible(sessions, None, now, |_| Some(ancient));
        assert_eq!(out.len(), 2);
    }
}