trusty_memory/commands/single_instance.rs
1//! Single-instance guard for the trusty-memory daemon.
2//!
3//! Why: macOS launchd `KeepAlive { SuccessfulExit: false }` (i.e. `OnSuccess`)
4//! respawns the daemon whenever it exits with a non-zero code. When a second
5//! daemon instance fails to bind (EADDRINUSE — the first instance already owns
6//! port 7070 and/or the UDS socket), it exits non-zero, which launchd interprets
7//! as a crash and spawns yet another copy. The resulting zombie herd (69 observed
8//! in the wild) exhausts file descriptors on top of the existing fd-limit bug.
9//!
10//! The fix: before attempting to bind, probe the discovery files. If a healthy
11//! daemon is already responding to `/health`, exit **0** (success). Launchd
12//! treats exit-0 as "clean shutdown" and does NOT respawn (SuccessfulExit:false
13//! = restart only on non-zero). This collapses the zombie herd immediately on
14//! the next invocation without touching launchd config.
15//!
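//! What: exposes [`StartupAction`] and [`startup_action_from_probe_result`]
//! (pure decision logic, unit-testable without I/O) together with the async
//! entry points [`single_instance_check`] and
//! [`single_instance_check_retried`] used by real daemon startups.
//!
//! Test: the `startup_action_from_probe_result_*` unit tests cover both
//! outcomes of the decision function; the `single_instance_check_*` tests
//! cover the no-path, missing-addr-file and stale-addr-file cases.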
21
22use std::path::Path;
23
/// Outcome of the single-instance check that daemon startup must act on.
///
/// Why: keeping the decision as plain data, apart from the probing I/O, lets
/// the logic be unit-tested with injected probe results instead of real TCP
/// listeners.
/// What: three variants — continue booting, exit cleanly because a healthy
/// peer already exists, or abort with an error message.
/// Test: `startup_action_from_probe_result_*` tests in this module.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum StartupAction {
    /// No healthy instance was detected: bind the TCP port and start serving.
    Proceed,
    /// A healthy instance is already answering — the caller should exit 0 so
    /// that launchd does not respawn this process.
    ExitAlreadyRunning,
    /// The probe hit an unexpected error (anything other than ECONNREFUSED or
    /// "no such file"). It is surfaced as a startup failure so the operator
    /// finds a real error in the launchd log; a launchd respawn is the correct
    /// reaction because the failure is genuine.
    ///
    /// The `String` payload is presumably a human-readable description of the
    /// error. NOTE(review): none of the functions visible in this module
    /// construct this variant — confirm where (or whether) it is produced.
    Fail(String),
}
43
44/// Decide what to do based on the result of an HTTP health probe.
45///
46/// Why: the single-instance check reduces to "did the health probe succeed?".
47/// Encoding the decision as a pure function (rather than embedding it in the
48/// async probe body) makes the logic unit-testable without actual network I/O.
49/// What: `probe_ok = true` → [`StartupAction::ExitAlreadyRunning`];
50/// `probe_ok = false` → [`StartupAction::Proceed`].
51/// Test: `startup_action_from_probe_result_when_alive`,
52/// `startup_action_from_probe_result_when_dead`.
53pub fn startup_action_from_probe_result(probe_ok: bool) -> StartupAction {
54 if probe_ok {
55 StartupAction::ExitAlreadyRunning
56 } else {
57 StartupAction::Proceed
58 }
59}
60
61/// Perform the single-instance check at daemon startup.
62///
63/// Why: launchd's `KeepAlive { SuccessfulExit: false }` respawns any non-zero
64/// exit, so a second daemon instance that fails to bind causes an endless
65/// respawn storm. Exiting 0 (when another healthy instance is detected) short-
66/// circuits this because `SuccessfulExit: false` means "restart only on
67/// non-zero exits" — exit 0 is treated as a voluntary clean shutdown.
68/// What: reads the `http_addr` discovery file; if it contains a reachable
69/// address whose `/health` responds with HTTP 200, returns
70/// [`StartupAction::ExitAlreadyRunning`]. Otherwise returns
71/// [`StartupAction::Proceed`] so the caller continues with normal bind.
72/// Errors reading the addr file or the network call are silently treated as
73/// "not running" (returns `Proceed`) so a missing or stale file never blocks
74/// a cold start.
75/// Test: integration — run `trusty-memory serve --foreground` twice in the
76/// same session and observe the second exits 0 without trying to bind; the
77/// unit tests in this module cover the decision logic.
78pub async fn single_instance_check(addr_file: Option<&Path>) -> StartupAction {
79 let Some(path) = addr_file else {
80 // No addr file path available (no $HOME) — proceed with bind.
81 return StartupAction::Proceed;
82 };
83 let probe_ok = trusty_common::check_already_running(path, "/health")
84 .await
85 .is_some();
86 startup_action_from_probe_result(probe_ok)
87}
88
89/// Single-instance check with up to `max_retries` additional probes.
90///
91/// Why (issue #1152, Tier 3): a single probe can miss a daemon that is
92/// mid-boot — it wrote the addr file but hasn't yet answered `/health`.
93/// Retrying with a short sleep lets a slow-boot daemon be detected and
94/// this caller exit 0 (stopping the launchd respawn storm) rather than
95/// proceeding to open redb, which would trigger `DatabaseAlreadyOpen`.
96/// What: calls `single_instance_check` repeatedly up to `1 + max_retries`
97/// times, sleeping `delay_ms` between each call, stopping on the first
98/// non-`Proceed` result. Returns the final `StartupAction`.
99/// Test: covered by the unit tests for `startup_action_from_probe_result`;
100/// the retry path is exercised by the integration guard in `main.rs`.
101pub async fn single_instance_check_retried(
102 addr_file: Option<&std::path::Path>,
103 max_retries: u8,
104 delay_ms: u64,
105) -> StartupAction {
106 let mut action = single_instance_check(addr_file).await;
107 let mut retries = max_retries;
108 while action == StartupAction::Proceed && retries > 0 {
109 retries -= 1;
110 tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await;
111 action = single_instance_check(addr_file).await;
112 }
113 action
114}
115
#[cfg(test)]
mod tests {
    use super::*;

    /// Why: the stale-addr-file test needs a loopback address with no
    /// listener behind it; a hard-coded port can collide with an unrelated
    /// service on the test machine.
    /// What: binds an ephemeral port (port 0 lets the OS pick a free one),
    /// records the address, and releases the port when the listener is
    /// dropped at the end of this function.
    /// Test: used by `single_instance_check_proceeds_when_addr_file_stale`.
    fn unused_loopback_addr() -> std::net::SocketAddr {
        let listener =
            std::net::TcpListener::bind("127.0.0.1:0").expect("bind ephemeral loopback port");
        listener.local_addr().expect("local_addr")
    }

    /// Why: when the health probe succeeded (daemon is alive), the startup
    /// action must be `ExitAlreadyRunning` so the caller can exit 0 and stop
    /// the launchd respawn storm.
    /// What: asserts the mapping for `probe_ok = true`.
    /// Test: itself (pure function, no I/O).
    #[test]
    fn startup_action_from_probe_result_when_alive() {
        assert_eq!(
            startup_action_from_probe_result(true),
            StartupAction::ExitAlreadyRunning,
            "alive probe → ExitAlreadyRunning"
        );
    }

    /// Why: when the health probe did not succeed (addr file missing, stale,
    /// or daemon not responding), the startup action must be `Proceed` so the
    /// daemon continues with its normal bind sequence.
    /// What: asserts the mapping for `probe_ok = false`.
    /// Test: itself (pure function, no I/O).
    #[test]
    fn startup_action_from_probe_result_when_dead() {
        assert_eq!(
            startup_action_from_probe_result(false),
            StartupAction::Proceed,
            "dead/absent probe → Proceed"
        );
    }

    /// Why: when there is no addr file path (no $HOME / TRUSTY_DATA_DIR_OVERRIDE),
    /// the guard must not block a cold start — it must proceed.
    /// What: calls `single_instance_check(None)` in a tokio context and asserts
    /// the result is `Proceed`.
    /// Test: itself (no real I/O — None short-circuits immediately).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_no_path() {
        let action = single_instance_check(None).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "no addr path → Proceed (cold start must not be blocked)"
        );
    }

    /// Why: a missing addr file means no daemon is running — the guard
    /// must allow the cold start to proceed.
    /// What: passes a path to a nonexistent file and asserts `Proceed`.
    /// Test: itself (real fs stat, no network).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_addr_file_missing() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let missing = tmp.path().join("http_addr");
        let action = single_instance_check(Some(&missing)).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "missing addr file → Proceed"
        );
    }

    /// Why: a stale addr file (address written but no daemon listening) must
    /// be treated as "not running" — the guard must allow the cold start.
    /// What: writes the address of a just-released ephemeral port to a
    /// tempfile and asserts `Proceed` (the `check_already_running` helper is
    /// expected to report `None` for an address nobody listens on, so
    /// `startup_action_from_probe_result(false)` = Proceed).
    /// Test: itself (real fs + loopback TCP attempt, no daemon spawned).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_addr_file_stale() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let addr_file = tmp.path().join("http_addr");
        // Use a port the OS just handed out and released instead of a fixed
        // number that another process might own.
        let dead_addr = unused_loopback_addr();
        std::fs::write(&addr_file, format!("{dead_addr}\n")).expect("write");
        let action = single_instance_check(Some(&addr_file)).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "stale addr file (no listener) → Proceed"
        );
    }

    /// Why: the retrying wrapper must never block a cold start when there is
    /// no addr file path to probe.
    /// What: calls `single_instance_check_retried(None, 3, 0)` and asserts
    /// `Proceed`.
    /// Test: itself (no I/O; zero delay keeps the test fast).
    #[tokio::test]
    async fn single_instance_check_retried_proceeds_when_no_path() {
        let action = single_instance_check_retried(None, 3, 0).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "no addr path → Proceed even with retries configured"
        );
    }

    /// Why: exhausting all retries without ever finding a live daemon must
    /// end in `Proceed`, not hang or fail.
    /// What: probes a nonexistent addr file with two retries and a 1 ms
    /// delay, then asserts `Proceed`.
    /// Test: itself (real fs stat, no network).
    #[tokio::test]
    async fn single_instance_check_retried_proceeds_when_addr_file_missing() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let missing = tmp.path().join("http_addr");
        let action = single_instance_check_retried(Some(&missing), 2, 1).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "missing addr file after all retries → Proceed"
        );
    }
}