trusty_memory/commands/single_instance.rs
1//! Single-instance guard for the trusty-memory daemon.
2//!
3//! Why: macOS launchd `KeepAlive { SuccessfulExit: false }` (i.e. `OnSuccess`)
4//! respawns the daemon whenever it exits with a non-zero code. When a second
5//! daemon instance fails to bind (EADDRINUSE — the first instance already owns
6//! port 7070 and/or the UDS socket), it exits non-zero, which launchd interprets
7//! as a crash and spawns yet another copy. The resulting zombie herd (69 observed
8//! in the wild) exhausts file descriptors on top of the existing fd-limit bug.
9//!
10//! The fix: before attempting to bind, probe the discovery files. If a healthy
11//! daemon is already responding to `/health`, exit **0** (success). Launchd
12//! treats exit-0 as "clean shutdown" and does NOT respawn (SuccessfulExit:false
13//! = restart only on non-zero). This collapses the zombie herd immediately on
14//! the next invocation without touching launchd config.
15//!
//! What: exposes [`single_instance_check`] (async, for real daemon startups),
//! [`StartupAction`] (the decision enum), and
//! [`startup_action_from_probe_result`] (pure mapping, for unit testing the
//! decision logic).
18//!
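//! Test: the `startup_action_from_probe_result_*` unit tests cover both probe
//! outcomes, and the `single_instance_check_*` tests cover the no-path,
//! missing-file and stale-file cases. The live-daemon case is not exercised
//! against a real listener by these unit tests.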
21
22use std::path::Path;
23
/// Outcome of the single-instance check: tells daemon startup what to do next.
///
/// Why: keeping the decision as plain data, apart from the probing I/O, lets
/// the mapping be unit-tested with injected probe results instead of real TCP
/// listeners.
/// What: a three-variant enum — carry on, exit because a peer is already
/// serving, or report a failure.
/// Test: the `startup_action_from_probe_result_*` tests in this module.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum StartupAction {
    /// No other instance was detected: go on to bind the TCP port and serve.
    Proceed,
    /// A healthy instance is already up. The caller should exit 0 so that
    /// launchd does not respawn this process.
    ExitAlreadyRunning,
    /// The probe hit an unexpected error (something other than ECONNREFUSED or
    /// "no such file"). Surfaced as a startup failure so the operator sees a
    /// real error in the launchd log; a launchd respawn is appropriate here
    /// because the failure is genuine.
    ///
    /// NOTE(review): nothing in the visible part of this module constructs
    /// this variant — confirm whether callers elsewhere do.
    Fail(String),
}
43
44/// Decide what to do based on the result of an HTTP health probe.
45///
46/// Why: the single-instance check reduces to "did the health probe succeed?".
47/// Encoding the decision as a pure function (rather than embedding it in the
48/// async probe body) makes the logic unit-testable without actual network I/O.
49/// What: `probe_ok = true` → [`StartupAction::ExitAlreadyRunning`];
50/// `probe_ok = false` → [`StartupAction::Proceed`].
51/// Test: `startup_action_from_probe_result_when_alive`,
52/// `startup_action_from_probe_result_when_dead`.
53pub fn startup_action_from_probe_result(probe_ok: bool) -> StartupAction {
54 if probe_ok {
55 StartupAction::ExitAlreadyRunning
56 } else {
57 StartupAction::Proceed
58 }
59}
60
61/// Perform the single-instance check at daemon startup.
62///
63/// Why: launchd's `KeepAlive { SuccessfulExit: false }` respawns any non-zero
64/// exit, so a second daemon instance that fails to bind causes an endless
65/// respawn storm. Exiting 0 (when another healthy instance is detected) short-
66/// circuits this because `SuccessfulExit: false` means "restart only on
67/// non-zero exits" — exit 0 is treated as a voluntary clean shutdown.
68/// What: reads the `http_addr` discovery file; if it contains a reachable
69/// address whose `/health` responds with HTTP 200, returns
70/// [`StartupAction::ExitAlreadyRunning`]. Otherwise returns
71/// [`StartupAction::Proceed`] so the caller continues with normal bind.
72/// Errors reading the addr file or the network call are silently treated as
73/// "not running" (returns `Proceed`) so a missing or stale file never blocks
74/// a cold start.
75/// Test: integration — run `trusty-memory serve --foreground` twice in the
76/// same session and observe the second exits 0 without trying to bind; the
77/// unit tests in this module cover the decision logic.
78pub async fn single_instance_check(addr_file: Option<&Path>) -> StartupAction {
79 let Some(path) = addr_file else {
80 // No addr file path available (no $HOME) — proceed with bind.
81 return StartupAction::Proceed;
82 };
83 let probe_ok = trusty_common::check_already_running(path, "/health")
84 .await
85 .is_some();
86 startup_action_from_probe_result(probe_ok)
87}
88
#[cfg(test)]
mod tests {
    use super::*;
    use std::net::{SocketAddr, TcpListener};

    /// Why: a hard-coded port can collide with an unrelated service on the
    /// developer or CI host, which would make the stale-file test flaky.
    /// What: binds `127.0.0.1:0` so the OS picks a currently free port,
    /// records the resulting address, and drops the listener before returning
    /// so that nothing is listening on that address afterwards.
    /// Test: exercised by `single_instance_check_proceeds_when_addr_file_stale`.
    fn released_loopback_addr() -> SocketAddr {
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind ephemeral port");
        // The listener is dropped (socket closed) when this function returns.
        listener.local_addr().expect("local_addr")
    }

    /// Why: when the health probe reports a live daemon, the startup action
    /// must be `ExitAlreadyRunning` so the caller can exit 0 and stop the
    /// launchd respawn storm.
    /// What: asserts the mapping for `probe_ok = true`.
    /// Test: itself (pure function, no I/O).
    #[test]
    fn startup_action_from_probe_result_when_alive() {
        assert_eq!(
            startup_action_from_probe_result(true),
            StartupAction::ExitAlreadyRunning,
            "alive probe → ExitAlreadyRunning"
        );
    }

    /// Why: when the health probe finds nothing (addr file missing, stale,
    /// or daemon not responding), the startup action must be `Proceed` so the
    /// daemon continues with its normal bind sequence.
    /// What: asserts the mapping for `probe_ok = false`.
    /// Test: itself (pure function, no I/O).
    #[test]
    fn startup_action_from_probe_result_when_dead() {
        assert_eq!(
            startup_action_from_probe_result(false),
            StartupAction::Proceed,
            "dead/absent probe → Proceed"
        );
    }

    /// Why: when there is no addr file path (no $HOME / TRUSTY_DATA_DIR_OVERRIDE),
    /// the guard must not block a cold start — it must proceed.
    /// What: calls `single_instance_check(None)` in a tokio context and asserts
    /// the result is `Proceed`.
    /// Test: itself (no real I/O — `None` short-circuits immediately).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_no_path() {
        let action = single_instance_check(None).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "no addr path → Proceed (cold start must not be blocked)"
        );
    }

    /// Why: a missing addr file means no daemon is running — the guard
    /// must allow the cold start to proceed.
    /// What: passes a path to a nonexistent file and asserts `Proceed`.
    /// Test: itself (filesystem only; no address is available to probe).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_addr_file_missing() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let missing = tmp.path().join("http_addr");
        let action = single_instance_check(Some(&missing)).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "missing addr file → Proceed"
        );
    }

    /// Why: a stale addr file (address written but no daemon listening) must
    /// be treated as "not running" — the guard must allow the cold start.
    /// What: writes the address of a just-released ephemeral loopback port to
    /// a temp file and asserts `Proceed`. `check_already_running` is expected
    /// to return `None` for an address nobody listens on; that behavior is
    /// defined in `trusty_common`, not in this module.
    /// Test: itself (real fs + loopback TCP attempt, no daemon spawned).
    #[tokio::test]
    async fn single_instance_check_proceeds_when_addr_file_stale() {
        let tmp = tempfile::tempdir().expect("tempdir");
        let addr_file = tmp.path().join("http_addr");
        // Use an OS-assigned port that was just released rather than a fixed
        // port number, so an unrelated local service cannot answer the probe.
        let dead_addr = released_loopback_addr();
        std::fs::write(&addr_file, format!("{dead_addr}\n")).expect("write");
        let action = single_instance_check(Some(&addr_file)).await;
        assert_eq!(
            action,
            StartupAction::Proceed,
            "stale addr file (no listener) → Proceed"
        );
    }
}
171}