Skip to main content

trusty_memory/commands/
single_instance.rs

1//! Single-instance guard for the trusty-memory daemon.
2//!
3//! Why: macOS launchd `KeepAlive { SuccessfulExit: false }` (i.e. `OnSuccess`)
4//! respawns the daemon whenever it exits with a non-zero code. When a second
5//! daemon instance fails to bind (EADDRINUSE — the first instance already owns
6//! port 7070 and/or the UDS socket), it exits non-zero, which launchd interprets
7//! as a crash and spawns yet another copy. The resulting zombie herd (69 observed
8//! in the wild) exhausts file descriptors on top of the existing fd-limit bug.
9//!
10//! The fix: before attempting to bind, probe the discovery files. If a healthy
11//! daemon is already responding to `/health`, exit **0** (success). Launchd
12//! treats exit-0 as "clean shutdown" and does NOT respawn (SuccessfulExit:false
13//! = restart only on non-zero). This collapses the zombie herd immediately on
14//! the next invocation without touching launchd config.
15//!
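//! What: exposes [`single_instance_check`] and [`single_instance_check_retried`]
//! (async, for real daemon startups), plus [`StartupAction`] (pure enum) and
//! [`startup_action_from_probe_result`] (pure function) so the decision logic
//! can be unit-tested without network I/O.
//!
//! Test: the unit tests in this module cover the probe-result mapping and the
//! no-path, missing-file, and stale-file cases. The live-daemon
//! (`ExitAlreadyRunning`) path through real I/O is left to the manual
//! integration check described on [`single_instance_check`].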
21
22use std::path::Path;
23
/// Outcome of the single-instance check, telling daemon startup what to do next.
///
/// Why: keeping the decision as plain data, separate from the I/O that
/// produces it, lets the logic be unit-tested with injected probe results
/// instead of real TCP listeners.
/// What: three variants — continue, exit cleanly, or report a failure.
/// Test: `startup_action_from_probe_result_*` tests in this module.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum StartupAction {
    /// No other instance was detected: go on to bind the TCP port and serve.
    Proceed,
    /// A healthy instance is already running. The caller should exit with
    /// status 0 so that launchd does not respawn this process.
    ExitAlreadyRunning,
    /// The probe hit an unexpected error (something other than
    /// ECONNREFUSED / "no such file"). The message is meant to be surfaced as
    /// a startup failure so the operator sees a real error in the launchd
    /// log; launchd respawning is correct here because the failure is genuine.
    ///
    /// Note: none of the functions visible in this file construct this
    /// variant; it is available for callers that detect such errors.
    Fail(String),
}
43
44/// Decide what to do based on the result of an HTTP health probe.
45///
46/// Why: the single-instance check reduces to "did the health probe succeed?".
47/// Encoding the decision as a pure function (rather than embedding it in the
48/// async probe body) makes the logic unit-testable without actual network I/O.
49/// What: `probe_ok = true` → [`StartupAction::ExitAlreadyRunning`];
50/// `probe_ok = false` → [`StartupAction::Proceed`].
51/// Test: `startup_action_from_probe_result_when_alive`,
52///       `startup_action_from_probe_result_when_dead`.
53pub fn startup_action_from_probe_result(probe_ok: bool) -> StartupAction {
54    if probe_ok {
55        StartupAction::ExitAlreadyRunning
56    } else {
57        StartupAction::Proceed
58    }
59}
60
61/// Perform the single-instance check at daemon startup.
62///
63/// Why: launchd's `KeepAlive { SuccessfulExit: false }` respawns any non-zero
64/// exit, so a second daemon instance that fails to bind causes an endless
65/// respawn storm. Exiting 0 (when another healthy instance is detected) short-
66/// circuits this because `SuccessfulExit: false` means "restart only on
67/// non-zero exits" — exit 0 is treated as a voluntary clean shutdown.
68/// What: reads the `http_addr` discovery file; if it contains a reachable
69/// address whose `/health` responds with HTTP 200, returns
70/// [`StartupAction::ExitAlreadyRunning`]. Otherwise returns
71/// [`StartupAction::Proceed`] so the caller continues with normal bind.
72/// Errors reading the addr file or the network call are silently treated as
73/// "not running" (returns `Proceed`) so a missing or stale file never blocks
74/// a cold start.
75/// Test: integration — run `trusty-memory serve --foreground` twice in the
76/// same session and observe the second exits 0 without trying to bind; the
77/// unit tests in this module cover the decision logic.
78pub async fn single_instance_check(addr_file: Option<&Path>) -> StartupAction {
79    let Some(path) = addr_file else {
80        // No addr file path available (no $HOME) — proceed with bind.
81        return StartupAction::Proceed;
82    };
83    let probe_ok = trusty_common::check_already_running(path, "/health")
84        .await
85        .is_some();
86    startup_action_from_probe_result(probe_ok)
87}
88
89/// Single-instance check with up to `max_retries` additional probes.
90///
91/// Why (issue #1152, Tier 3): a single probe can miss a daemon that is
92/// mid-boot — it wrote the addr file but hasn't yet answered `/health`.
93/// Retrying with a short sleep lets a slow-boot daemon be detected and
94/// this caller exit 0 (stopping the launchd respawn storm) rather than
95/// proceeding to open redb, which would trigger `DatabaseAlreadyOpen`.
96/// What: calls `single_instance_check` repeatedly up to `1 + max_retries`
97/// times, sleeping `delay_ms` between each call, stopping on the first
98/// non-`Proceed` result. Returns the final `StartupAction`.
99/// Test: covered by the unit tests for `startup_action_from_probe_result`;
100/// the retry path is exercised by the integration guard in `main.rs`.
101pub async fn single_instance_check_retried(
102    addr_file: Option<&std::path::Path>,
103    max_retries: u8,
104    delay_ms: u64,
105) -> StartupAction {
106    let mut action = single_instance_check(addr_file).await;
107    let mut retries = max_retries;
108    while action == StartupAction::Proceed && retries > 0 {
109        retries -= 1;
110        tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
111        action = single_instance_check(addr_file).await;
112    }
113    action
114}
115
#[cfg(test)]
mod tests {
    use super::*;

    /// Why: when the health probe returns `Some(url)` (daemon is alive),
    /// the startup action must be `ExitAlreadyRunning` so the caller can
    /// exit 0 and stop the launchd respawn storm.
    /// What: asserts the mapping for `probe_ok = true`.
    /// Test: itself (pure function, no I/O).
    #[test]
    fn startup_action_from_probe_result_when_alive() {
        assert_eq!(
            startup_action_from_probe_result(true),
            StartupAction::ExitAlreadyRunning,
            "alive probe → ExitAlreadyRunning"
        );
    }

    /// Why: when the health probe returns `None` (addr file missing, stale,
    /// or daemon not responding), the startup action must be `Proceed` so the
    /// daemon continues with its normal bind sequence.
    /// What: asserts the mapping for `probe_ok = false`.
    /// Test: itself (pure function, no I/O).
    #[test]
    fn startup_action_from_probe_result_when_dead() {
        assert_eq!(
            startup_action_from_probe_result(false),
            StartupAction::Proceed,
            "dead/absent probe → Proceed"
        );
    }

    /// Why: when there is no addr file path (no $HOME / TRUSTY_DATA_DIR_OVERRIDE),
    /// the guard must not block a cold start — it must proceed.
    /// What: calls `single_instance_check(None)` in a tokio context and asserts
    /// the result is `Proceed`.
    /// Test: itself (no real I/O — None short-circuits immediately).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_no_path() {
        let action = single_instance_check(None).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "no addr path → Proceed (cold start must not be blocked)"
        );
    }

    /// Why: the retrying wrapper must give the same answer as the single
    /// check when there is no addr file path — a cold start is never blocked.
    /// What: calls `single_instance_check_retried(None, ..)` with a tiny
    /// delay and asserts the result is `Proceed`.
    /// Test: itself (no fs or network I/O; the delay is 1 ms so the test stays
    /// fast whether or not the wrapper sleeps between attempts).
    #[tokio::test]
    async fn single_instance_check_retried_proceeds_when_no_path() {
        let action = single_instance_check_retried(None, 2, 1).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "no addr path → Proceed even with retries"
        );
    }

    /// Why: a missing addr file means no daemon is running — the guard
    /// must allow the cold start to proceed.
    /// What: passes a path to a nonexistent file and asserts `Proceed`.
    /// Test: itself (real fs stat, no network).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_addr_file_missing() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let missing = tmp.path().join("http_addr");
        let action = single_instance_check(Some(&missing)).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "missing addr file → Proceed"
        );
    }

    /// Why: a stale addr file (address written but no daemon listening) must
    /// be treated as "not running" — the guard must allow the cold start.
    /// What: writes a dead address to a tempfile and asserts `Proceed`
    /// (the `check_already_running` helper cleans the stale file and returns
    /// `None`, so `startup_action_from_probe_result(false)` = Proceed).
    /// Test: itself (real fs + loopback TCP attempt, no daemon spawned).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_addr_file_stale() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let addr_file = tmp.path().join("http_addr");
        // Ask the OS for a free loopback port and release it again, instead
        // of hard-coding a port number that might be in use on the machine
        // running the tests (which would make this test flaky).
        let dead_addr = {
            let listener =
                std::net::TcpListener::bind("127.0.0.1:0").expect("bind ephemeral port");
            listener.local_addr().expect("local_addr")
            // `listener` is dropped here, so nothing is listening on the port.
        };
        std::fs::write(&addr_file, format!("{dead_addr}\n")).expect("write");
        let action = single_instance_check(Some(&addr_file)).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "stale addr file (no listener) → Proceed"
        );
    }
}