Skip to main content

trusty_memory/commands/
single_instance.rs

//! Single-instance guard for the trusty-memory daemon.
//!
//! Why: macOS launchd `KeepAlive { SuccessfulExit: false }` (i.e. `OnSuccess`)
//! respawns the daemon whenever it exits with a non-zero code. When a second
//! daemon instance fails to bind (EADDRINUSE — the first instance already owns
//! port 7070 and/or the UDS socket), it exits non-zero, which launchd interprets
//! as a crash and spawns yet another copy. The resulting zombie herd (69 observed
//! in the wild) exhausts file descriptors on top of the existing fd-limit bug.
//!
//! The fix: before attempting to bind, probe the discovery files. If a healthy
//! daemon is already responding to `/health`, exit **0** (success). Launchd
//! treats exit-0 as "clean shutdown" and does NOT respawn (SuccessfulExit:false
//! = restart only on non-zero). This collapses the zombie herd immediately on
//! the next invocation without touching launchd config.
//!
//! What: exposes [`single_instance_check`] (async, for real daemon startups),
//! [`StartupAction`] (pure enum describing the outcome), and
//! [`startup_action_from_probe_result`] (pure function, for unit testing the
//! decision logic).
//!
//! Test: the `startup_action_from_probe_result_*` unit tests cover both branches
//! of the decision function, and the `single_instance_check_*` tests cover the
//! no-path, missing-file, and stale-file cases. The live-daemon path is only
//! exercised by the manual integration check described on
//! [`single_instance_check`].
21
22use std::path::Path;
23
/// Outcome of the single-instance guard: tells daemon startup how to continue.
///
/// Why: keeping the decision as plain data, apart from the probe I/O, lets the
/// logic be unit-tested with injected probe results instead of real TCP
/// listeners.
/// What: three variants — [`StartupAction::Proceed`] (go ahead and bind),
/// [`StartupAction::ExitAlreadyRunning`] (exit 0), and
/// [`StartupAction::Fail`] (report a startup error).
/// Test: `startup_action_from_probe_result_*` tests in this module.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum StartupAction {
    /// No healthy instance was detected: bind the TCP port and start serving.
    Proceed,
    /// A healthy instance is already answering — the caller should exit 0 so
    /// that launchd does not respawn this process.
    ExitAlreadyRunning,
    /// A probe attempt failed with an unexpected error (anything other than
    /// ECONNREFUSED / "no such file"). The payload is a message for the
    /// operator; the caller is expected to surface it as a genuine startup
    /// failure, which launchd will (correctly) respawn.
    ///
    /// NOTE(review): nothing in this module constructs this variant — the
    /// probe path here maps every failure to `Proceed`. Confirm whether other
    /// callers produce it.
    Fail(String),
}
43
44/// Decide what to do based on the result of an HTTP health probe.
45///
46/// Why: the single-instance check reduces to "did the health probe succeed?".
47/// Encoding the decision as a pure function (rather than embedding it in the
48/// async probe body) makes the logic unit-testable without actual network I/O.
49/// What: `probe_ok = true` → [`StartupAction::ExitAlreadyRunning`];
50/// `probe_ok = false` → [`StartupAction::Proceed`].
51/// Test: `startup_action_from_probe_result_when_alive`,
52///       `startup_action_from_probe_result_when_dead`.
53pub fn startup_action_from_probe_result(probe_ok: bool) -> StartupAction {
54    if probe_ok {
55        StartupAction::ExitAlreadyRunning
56    } else {
57        StartupAction::Proceed
58    }
59}
60
61/// Perform the single-instance check at daemon startup.
62///
63/// Why: launchd's `KeepAlive { SuccessfulExit: false }` respawns any non-zero
64/// exit, so a second daemon instance that fails to bind causes an endless
65/// respawn storm. Exiting 0 (when another healthy instance is detected) short-
66/// circuits this because `SuccessfulExit: false` means "restart only on
67/// non-zero exits" — exit 0 is treated as a voluntary clean shutdown.
68/// What: reads the `http_addr` discovery file; if it contains a reachable
69/// address whose `/health` responds with HTTP 200, returns
70/// [`StartupAction::ExitAlreadyRunning`]. Otherwise returns
71/// [`StartupAction::Proceed`] so the caller continues with normal bind.
72/// Errors reading the addr file or the network call are silently treated as
73/// "not running" (returns `Proceed`) so a missing or stale file never blocks
74/// a cold start.
75/// Test: integration — run `trusty-memory serve --foreground` twice in the
76/// same session and observe the second exits 0 without trying to bind; the
77/// unit tests in this module cover the decision logic.
78pub async fn single_instance_check(addr_file: Option<&Path>) -> StartupAction {
79    let Some(path) = addr_file else {
80        // No addr file path available (no $HOME) — proceed with bind.
81        return StartupAction::Proceed;
82    };
83    let probe_ok = trusty_common::check_already_running(path, "/health")
84        .await
85        .is_some();
86    startup_action_from_probe_result(probe_ok)
87}
88
#[cfg(test)]
mod tests {
    use super::*;
    use std::net::TcpListener;

    /// Why: when the health probe succeeds (daemon is alive), the startup
    /// action must be `ExitAlreadyRunning` so the caller can exit 0 and stop
    /// the launchd respawn storm.
    /// What: asserts the mapping for `probe_ok = true`.
    /// Test: itself (pure function, no I/O).
    #[test]
    fn startup_action_from_probe_result_when_alive() {
        assert_eq!(
            startup_action_from_probe_result(true),
            StartupAction::ExitAlreadyRunning,
            "alive probe → ExitAlreadyRunning"
        );
    }

    /// Why: when the health probe fails (addr file missing, stale, or daemon
    /// not responding), the startup action must be `Proceed` so the daemon
    /// continues with its normal bind sequence.
    /// What: asserts the mapping for `probe_ok = false`.
    /// Test: itself (pure function, no I/O).
    #[test]
    fn startup_action_from_probe_result_when_dead() {
        assert_eq!(
            startup_action_from_probe_result(false),
            StartupAction::Proceed,
            "dead/absent probe → Proceed"
        );
    }

    /// Why: when there is no addr file path (no $HOME / TRUSTY_DATA_DIR_OVERRIDE),
    /// the guard must not block a cold start — it must proceed.
    /// What: calls `single_instance_check(None)` in a tokio context and asserts
    /// the result is `Proceed`.
    /// Test: itself (no real I/O — None short-circuits immediately).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_no_path() {
        let action = single_instance_check(None).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "no addr path → Proceed (cold start must not be blocked)"
        );
    }

    /// Why: a missing addr file means no daemon is running — the guard
    /// must allow the cold start to proceed.
    /// What: passes a path to a nonexistent file and asserts `Proceed`.
    /// Test: itself (touches only the filesystem; with no address to read,
    /// no network probe is expected).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_addr_file_missing() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let missing = tmp.path().join("http_addr");
        let action = single_instance_check(Some(&missing)).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "missing addr file → Proceed"
        );
    }

    /// Why: a stale addr file (address written but no daemon listening) must
    /// be treated as "not running" — the guard must allow the cold start.
    /// What: writes a dead loopback address to a tempfile and asserts
    /// `Proceed` (the probe helper is expected to return `None` for an
    /// unreachable address, so `startup_action_from_probe_result(false)` =
    /// Proceed).
    /// Test: itself (real fs + loopback TCP attempt, no daemon spawned).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_addr_file_stale() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let addr_file = tmp.path().join("http_addr");
        // Ask the OS for a free loopback port and release it straight away.
        // A hard-coded port could be owned by an unrelated process on the
        // test machine, which would make this test depend on the environment.
        let dead_addr = {
            let listener = TcpListener::bind("127.0.0.1:0").expect("bind ephemeral port");
            listener.local_addr().expect("local_addr")
            // `listener` is dropped here, so nothing is listening on `dead_addr`.
        };
        std::fs::write(&addr_file, format!("{dead_addr}\n")).expect("write");
        let action = single_instance_check(Some(&addr_file)).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "stale addr file (no listener) → Proceed"
        );
    }
}