Skip to main content

trusty_memory/
foreground.rs

1//! Supervised `serve --foreground` entry point (issue #787).
2//!
3//! Why: extracted from `lib.rs` to keep that file under the 500-line-cap
4//! ratchet budget. All three issue-#787 fixes (lock file ownership, http_addr
5//! guarantee, bind-abort-on-collision) live here so they are easy to find and
6//! test together.
7//! What: exports `bind_foreground_port` (Fix C) and `run_http_foreground`
8//! (Fix A + Fix B + Fix C combined entry point) for use by `main.rs`.
9//! Test: `bind_foreground_port_refuses_collision` (unit, real TCP bind);
10//! `daemon_lock` module tests cover the lock-file logic.
11
12use anyhow::Result;
13use std::net::SocketAddr;
14
15use crate::DEFAULT_HTTP_PORT;
16#[cfg(feature = "axum-server")]
17use crate::{commands::daemon_lock, run_http_on, AppState};
18
19/// Bind a `TcpListener` to `127.0.0.1:DEFAULT_HTTP_PORT` for the
20/// launchd/supervisor `serve --foreground` path — abort on collision.
21///
22/// Why (issue #787, Fix C): the generic `bind_dynamic_port` silently walks
23/// 7070→7071→…→7079 when port 7070 is taken, then falls back to an
24/// OS-assigned port. Under `serve --foreground` (the launchd path) this
25/// produces a hidden second daemon on a different port rather than failing
26/// loudly. The correct behavior for a supervised `serve --foreground` is to
27/// abort with a clear error when port 7070 is already bound — the existing
28/// daemon is already serving and the new instance should not start at all.
29/// The `single_instance_check` in `main.rs` catches the live-daemon case
30/// before reaching this function; the only cases that slip through are
31/// non-trusty-memory processes bound to 7070 (a genuine conflict) and race
32/// windows (two simultaneous launchd respawns) — both are better served by
33/// a loud error than a silent port-walk.
34/// What: attempts to bind exactly `127.0.0.1:DEFAULT_HTTP_PORT`. Returns
35/// `Err` with a clear human-readable message when the port is already in
36/// use (EADDRINUSE), so callers can surface it to launchd's log and the
37/// user's terminal instead of silently spawning on a different port.
38/// Test: `bind_foreground_port_refuses_collision` occupies port 7070 then
39/// asserts `bind_foreground_port` returns `Err` containing "already in use".
40pub async fn bind_foreground_port() -> Result<tokio::net::TcpListener> {
41    let addr: SocketAddr = SocketAddr::from(([127, 0, 0, 1], DEFAULT_HTTP_PORT));
42    tokio::net::TcpListener::bind(addr).await.map_err(|e| {
43        if e.kind() == std::io::ErrorKind::AddrInUse {
44            anyhow::anyhow!(
45                "port {} is already in use — another trusty-memory daemon is \
46                     likely running. Check with `trusty-memory port` or \
47                     `lsof -nP -iTCP:{} -sTCP:LISTEN`. If the existing daemon is \
48                     healthy, this instance will exit 0 (expected under launchd \
49                     KeepAlive). If the existing daemon is stale, run \
50                     `trusty-memory stop` first.",
51                DEFAULT_HTTP_PORT,
52                DEFAULT_HTTP_PORT
53            )
54        } else {
55            anyhow::anyhow!("bind {}:{}: {e}", addr.ip(), DEFAULT_HTTP_PORT)
56        }
57    })
58}
59
60/// The canonical `serve --foreground` entry point used by launchd and systemd
61/// supervisors (issue #787).
62///
63/// Why (issue #787): previously `serve --foreground` shared the same
64/// `run_http_dynamic` path used by ad-hoc CLI invocations. That path
65/// silently port-walked (7070→7071→…→7079→OS-assigned) on bind collision,
66/// producing hidden second instances that never appeared in the `http_addr`
67/// discovery file at the expected port. This function replaces that path
68/// for the supervised case with three explicit guarantees:
69///
70/// 1. **Lock file ownership** (Fix A): writes `daemon.lock` containing the
71///    current PID before binding. The RAII guard removes the file on any
72///    exit (graceful shutdown, panic, launchd SIGTERM). `start` and the
73///    single-instance guard read this file as a second detection layer when
74///    `http_addr` is absent or stale.
75///
76/// 2. **http_addr written on bind** (Fix B): `run_http_on` writes both the
77///    OS-standard `http_addr` file and the legacy dotfile path
78///    (`~/.trusty-memory/http_addr`) immediately after binding, before
79///    accepting the first request. Both files are removed on clean shutdown.
80///    This ensures `trusty-memory port` and the MCP bridge always find the
81///    running daemon.
82///
83/// 3. **Abort on port collision** (Fix C): uses `bind_foreground_port`
84///    (binds exactly port 7070, returns `Err` on `EADDRINUSE`) instead of
85///    the port-walking `bind_dynamic_port`. If 7070 is already taken the
86///    function returns `Err` with a clear message; the caller (`main.rs`)
87///    exits non-zero, launchd logs the error, applies `ThrottleInterval`,
88///    and the single-instance guard prevents a respawn storm.
89///
90/// What: acquires the daemon lock, binds `127.0.0.1:7070` (aborts on
91/// collision), then runs `run_http_on` which writes the addr file and
92/// serves until graceful shutdown. The lock guard is dropped after
93/// `run_http_on` returns, removing `daemon.lock` best-effort.
94///
95/// Test: `bind_foreground_port_refuses_collision` (unit), plus the
96/// integration path `trusty-memory service start` followed by a second
97/// `trusty-memory serve --foreground` which should exit immediately with
98/// the "already in use" error.
99#[cfg(feature = "axum-server")]
100pub async fn run_http_foreground(state: AppState) -> Result<()> {
101    // Fix A (issue #787): acquire and own the daemon PID lock file for the
102    // duration of this daemon instance. A stale lock (dead PID) is reclaimed
103    // automatically; a live lock causes an `Err` that propagates to `main`,
104    // which exits non-zero so launchd logs the message and applies its
105    // ThrottleInterval instead of hot-looping. The RAII guard removes the
106    // file on any exit path (clean, panic, SIGTERM→graceful-shutdown).
107    let lock_path = daemon_lock::lock_file_path();
108    let _lock_guard: Option<daemon_lock::DaemonLock> = match lock_path {
109        Some(ref p) => match daemon_lock::acquire_lock(p) {
110            Ok(g) => {
111                tracing::info!(
112                    "daemon lock acquired at {} (pid {})",
113                    p.display(),
114                    std::process::id()
115                );
116                Some(g)
117            }
118            Err(e) => {
119                // Lock held by a live process: this is a real duplicate-start.
120                // Exit non-zero so launchd records the error in its log and
121                // respects ThrottleInterval. The single-instance guard in
122                // main.rs handles the healthy-daemon case (exit 0); this
123                // branch handles the edge case where the addr probe passed
124                // but the lock is still contested.
125                return Err(e.context(
126                    "serve --foreground refused to start: daemon lock held by live process",
127                ));
128            }
129        },
130        None => {
131            // No data dir resolvable (container, no $HOME). Log and continue
132            // without a lock — discovery files also won't work in this env.
133            tracing::warn!(
134                "could not resolve data dir for daemon lock — \
135                 starting without PID lock (containerised / no $HOME)"
136            );
137            None
138        }
139    };
140
141    // Fix C (issue #787): bind exactly port 7070, abort on collision.
142    // `bind_foreground_port` returns `Err(EADDRINUSE)` when 7070 is taken.
143    // Under launchd, the single-instance guard already exits 0 before we
144    // reach this point when a healthy daemon is on 7070; `Err` here means a
145    // non-trusty-memory process owns the port, which is a genuine conflict.
146    let listener = bind_foreground_port().await?;
147
148    // Fix B (issue #787): `run_http_on` writes `http_addr` (and the dotfile
149    // path) immediately after binding, before accepting the first request, so
150    // `trusty-memory port` and the MCP bridge can locate this daemon
151    // immediately. Both files are removed on graceful shutdown.
152    run_http_on(state, listener).await
153}
154
155#[cfg(test)]
156mod tests {
157    use super::*;
158
159    /// Why (issue #787, Fix C): `bind_foreground_port` must return `Err`
160    /// when port 7070 is already in use rather than silently walking to
161    /// the next free port. This is the key guard against a launchd
162    /// respawn colliding with the live daemon and producing an orphan on
163    /// port 7071+. The single-instance guard catches the healthy-daemon
164    /// case (exits 0) before this function is reached; this test covers
165    /// the "port taken by a non-trusty-memory process" scenario.
166    /// What: occupies `127.0.0.1:DEFAULT_HTTP_PORT`, then asserts that
167    /// `bind_foreground_port` returns `Err` containing "already in use".
168    /// Test: itself (real TCP bind, no daemon spawned).
169    #[tokio::test]
170    async fn bind_foreground_port_refuses_collision() {
171        use std::net::SocketAddr;
172        // Occupy the default port with a plain TcpListener (not trusty-memory).
173        let addr: SocketAddr = SocketAddr::from(([127, 0, 0, 1], DEFAULT_HTTP_PORT));
174        let Ok(_holder) = tokio::net::TcpListener::bind(addr).await else {
175            // Port already taken by something else on the CI host — skip the
176            // test rather than fail spuriously.
177            return;
178        };
179        // Now `bind_foreground_port` must refuse.
180        let result = bind_foreground_port().await;
181        assert!(
182            result.is_err(),
183            "bind_foreground_port must return Err when port {DEFAULT_HTTP_PORT} \
184             is already bound; got Ok"
185        );
186        let msg = format!("{}", result.unwrap_err());
187        assert!(
188            msg.contains("already in use"),
189            "error message must mention 'already in use'; got: {msg}"
190        );
191    }
192}