Skip to main content

trusty_memory/commands/
daemon_lock.rs

1//! PID lock file for the `trusty-memory serve --foreground` daemon (issue #787).
2//!
3//! Why: the `start` subcommand used to detect a running daemon ONLY by probing
4//! the `http_addr` discovery file. When a launchd-managed `serve --foreground`
5//! instance (a) crashed without cleaning up `http_addr`, or (b) was deployed
6//! from an older binary that did not write `http_addr`, the `start` command
7//! concluded "no daemon running" and forked a new one. The new fork walked to
8//! the next free port (7071, 7072, …) and became a silent orphan. This module
9//! provides a PID lock file — written by `serve --foreground` before binding,
10//! and cleared on graceful shutdown — that gives `start` a second, independent
11//! signal to detect a live daemon. A stale lock (PID not alive) is reclaimed
12//! transparently so a crash does not permanently block startup.
13//!
//! What: exposes [`DaemonLock`] (RAII guard that removes the file on drop),
//! [`acquire_lock`] (write + reclaim stale), [`read_lock_pid`] (inspect
//! without acquiring), [`pid_alive`] (liveness probe for a recorded PID),
//! and the [`lock_file_path`] helper that resolves the canonical location.
//! Only the `serve --foreground` path acquires the lock — the `start` fork,
//! CLI subcommands, and the MCP bridge must never call [`acquire_lock`].
19//!
20//! Test: unit tests in this module cover stale-lock reclaim, live-lock
21//! refusal, and the write/remove cycle, all against temp directories so
22//! the real `~/.local/share/trusty-memory` is never touched.
23
24use anyhow::{bail, Result};
25use std::path::{Path, PathBuf};
26
/// Filename of the daemon PID lock file, written under the trusty-memory
/// data directory alongside `http_addr`.
///
/// Why: co-locating it with `http_addr` keeps the two discovery files in
/// the same directory so the `doctor` and `start` commands resolve both
/// with a single `resolve_data_dir` call.
/// What: the literal filename only (no directory component);
/// [`lock_file_path`] joins it onto the resolved data-dir path.
/// Test: `lock_file_path_uses_data_dir` asserts the constructed path.
pub const LOCK_FILENAME: &str = "daemon.lock";
36
/// RAII guard that holds the daemon PID lock file.
///
/// Why: tie the lock file's lifetime to the guard's lifetime so the file is
/// removed on clean shutdown and on an unwinding panic, without requiring
/// every exit path to call an explicit cleanup function. Exit paths that
/// never run destructors (`SIGKILL`, `std::process::exit`, an aborting
/// panic) leave the file behind; [`acquire_lock`] reclaims such a file as
/// stale on the next start. The guard is deliberately not `Clone`, so a
/// single owner is responsible for the removal.
/// What: wraps the path of the written lock file. `Drop` removes it
/// best-effort (I/O errors are silently swallowed). The type is
/// `#[must_use]` because a guard that is dropped immediately — e.g.
/// `acquire_lock(p)?;` written as a bare statement — deletes the lock file
/// right away and silently defeats the single-instance check.
/// Test: `daemon_lock_drops_removes_file`.
#[must_use = "the lock file is removed as soon as the guard is dropped; \
              bind it to a variable that lives as long as the daemon"]
#[derive(Debug)]
pub struct DaemonLock {
    /// Location of the lock file this guard removes on drop.
    path: PathBuf,
}

impl DaemonLock {
    /// Construct directly from a path (test helper + internal use only).
    ///
    /// Why: tests need to build a `DaemonLock` pointing at a tempfile
    /// they control without going through the full OS data-dir resolution.
    /// What: wraps `path` without touching the filesystem; the file at
    /// `path` is assumed to already exist (a missing file is tolerated on
    /// drop).
    /// Test: exercised through `acquire_lock` in
    /// `daemon_lock_drops_removes_file`.
    pub(crate) fn from_path(path: PathBuf) -> Self {
        Self { path }
    }
}

impl Drop for DaemonLock {
    /// Remove the lock file, ignoring any failure.
    fn drop(&mut self) {
        // Best-effort: if the remove fails (e.g. already deleted by a
        // concurrent `trusty-memory stop`) we ignore the error. The next
        // `serve --foreground` invocation will reclaim the stale file.
        let _ = std::fs::remove_file(&self.path);
    }
}
73
74/// Resolve the canonical lock-file path for the trusty-memory daemon.
75///
76/// Why: centralising the path keeps `acquire_lock`, `read_lock_pid`, and
77/// any future diagnostic check in agreement. Returns `None` when the data
78/// directory cannot be resolved (no `$HOME`, no `TRUSTY_DATA_DIR_OVERRIDE`)
79/// so callers degrade gracefully rather than panicking.
80/// What: returns `{resolve_data_dir("trusty-memory")}/daemon.lock`, or
81/// `None` on resolution failure.
82/// Test: `lock_file_path_uses_data_dir` asserts the constructed path ends
83/// with `daemon.lock` and lives under a known data dir override.
84pub fn lock_file_path() -> Option<PathBuf> {
85    trusty_common::resolve_data_dir("trusty-memory")
86        .ok()
87        .map(|d| d.join(LOCK_FILENAME))
88}
89
90/// Check whether a PID is alive on this Unix host.
91///
92/// Why: `/bin/kill -0 <pid>` had two Linux bugs: `kill(0, 0)` signals the
93/// caller's process GROUP (false positive), and `u32::MAX as i32` = -1 gives
94/// broadcast semantics (also false positive). `libc::kill` with explicit pid
95/// guards fixes both. On non-Unix platforms always returns `false`.
96///
97/// What: returns `false` immediately for `pid == 0` or `pid > i32::MAX`
98/// (special semantics). For valid pids calls `libc::kill(pid, 0)`: 0 → alive,
99/// `ESRCH` → dead, `EPERM` → exists-but-no-permission → alive, other → alive.
100///
101/// Test: `pid_alive_returns_false_for_pid_zero`,
102/// `pid_alive_returns_false_for_overflow_pid`,
103/// `pid_alive_returns_true_for_current_pid`,
104/// `acquire_lock_reclaims_stale_pid`.
105pub fn pid_alive(pid: u32) -> bool {
106    // pid 0 → process-group semantics; pid > i32::MAX → negative pid_t
107    // (broadcast semantics).  Both are non-specific; guard before syscall.
108    if pid == 0 || pid > i32::MAX as u32 {
109        return false;
110    }
111
112    #[cfg(unix)]
113    {
114        // SAFETY: kill(2) is async-signal-safe; signal 0 is liveness-only.
115        let rc = unsafe { libc::kill(pid as libc::pid_t, 0) };
116        if rc == 0 {
117            return true; // process exists
118        }
119        let err = std::io::Error::last_os_error().raw_os_error().unwrap_or(0);
120        // ESRCH → no such process; EPERM → exists but no permission → alive.
121        err != libc::ESRCH
122    }
123    #[cfg(not(unix))]
124    {
125        false
126    }
127}
128
129/// Read the PID stored in the lock file at `path`.
130///
131/// Why: `acquire_lock` and diagnostic commands need to read the lock file
132/// without acquiring it. Separating the read from the acquire lets callers
133/// inspect the file without side-effecting it.
134/// What: reads the file, trims whitespace, and parses the first line as a
135/// `u32`. Returns `None` when the file does not exist, is empty, or does
136/// not contain a valid PID. Returns `Err` for I/O errors other than
137/// `NotFound`.
138/// Test: `read_lock_pid_returns_none_for_missing_file`,
139/// `read_lock_pid_returns_pid_for_valid_file`.
140pub fn read_lock_pid(path: &Path) -> Result<Option<u32>> {
141    let raw = match std::fs::read_to_string(path) {
142        Ok(s) => s,
143        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
144        Err(e) => {
145            return Err(anyhow::Error::new(e).context(format!("read lock file {}", path.display())));
146        }
147    };
148    let trimmed = raw.trim();
149    if trimmed.is_empty() {
150        return Ok(None);
151    }
152    match trimmed.parse::<u32>() {
153        Ok(pid) => Ok(Some(pid)),
154        Err(_) => Ok(None), // malformed → treat as absent
155    }
156}
157
/// Write `{pid}\n` to `path` atomically (write to `{path}.tmp` + rename).
///
/// Why: atomic write prevents a concurrent reader from observing a
/// partial file (e.g. a truncated PID) during the write window.
/// What: creates parent directories if missing; writes the PID and a
/// trailing newline to `{path}.tmp` (the full file name with `.tmp`
/// appended, whatever its extension); syncs it; renames it onto `path`.
/// Returns `Err` on any I/O failure, and in that case removes the
/// temporary file best-effort so a failed attempt leaves no debris next to
/// the lock.
/// Test: called by `acquire_lock` and covered by
/// `acquire_lock_writes_own_pid`.
fn write_lock_file(path: &Path, pid: u32) -> std::io::Result<()> {
    /// Write the PID to `tmp`, flush it to disk, then rename it onto `dest`.
    fn stage_and_publish(tmp: &Path, dest: &Path, pid: u32) -> std::io::Result<()> {
        use std::io::Write;
        {
            let mut file = std::fs::File::create(tmp)?;
            writeln!(file, "{pid}")?;
            file.sync_all()?;
            // `file` is closed here, before the rename.
        }
        std::fs::rename(tmp, dest)
    }

    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)?;
    }

    // Append ".tmp" to the whole name instead of swapping the extension, so
    // the staging file is `{path}.tmp` for every lock file name.
    let mut tmp_name = path.as_os_str().to_owned();
    tmp_name.push(".tmp");
    let tmp = PathBuf::from(tmp_name);

    let outcome = stage_and_publish(&tmp, path, pid);
    if outcome.is_err() {
        // Best-effort cleanup; the original error is what the caller needs.
        let _ = std::fs::remove_file(&tmp);
    }
    outcome
}
181
182/// Attempt to acquire the daemon PID lock file at `path`.
183///
184/// Why: without a lock file, a stale `http_addr` (e.g. from a daemon that
185/// crashed before cleaning up its discovery files, or from an older binary
186/// that never wrote `http_addr`) causes `trusty-memory start` to conclude
187/// "no daemon running" and fork a new process. The new fork then collides
188/// with the live daemon on port 7070 and silently port-walks to 7071+.
189/// The lock file gives `start` — and the single-instance guard in `main.rs`
190/// — a second signal to detect a live daemon.
191///
192/// Stale-lock handling: if the file exists but the recorded PID is not
193/// alive (dead process, reboot, SIGKILL), we reclaim it by overwriting.
194/// If the file exists AND the PID is alive, we return `Err` with a clear
195/// message so the caller can abort rather than spawning a duplicate.
196///
197/// What: reads the existing lock file (if any); if the recorded PID is
198/// alive, returns `Err("daemon already running: PID {n}")`. Otherwise
199/// (no file, empty file, dead PID) writes the current process PID and
200/// returns a [`DaemonLock`] RAII guard that removes the file on drop.
201///
202/// Test: `acquire_lock_writes_own_pid`, `acquire_lock_reclaims_stale_pid`,
203/// `acquire_lock_refuses_live_pid`.
204pub fn acquire_lock(path: &Path) -> Result<DaemonLock> {
205    let me = std::process::id();
206
207    // Read any existing lock without panicking — a missing file is fine.
208    if let Some(existing_pid) = read_lock_pid(path)? {
209        if existing_pid != me && pid_alive(existing_pid) {
210            bail!(
211                "trusty-memory daemon is already running as PID {existing_pid} \
212                 (lock file: {}). \
213                 If you believe this is a stale lock, remove it manually: \
214                 rm {:?}",
215                path.display(),
216                path
217            );
218        }
219        // Stale lock (dead PID or same PID): fall through to reclaim.
220        tracing::info!(
221            stale_pid = existing_pid,
222            "reclaiming stale daemon lock file at {}",
223            path.display()
224        );
225    }
226
227    write_lock_file(path, me)
228        .map_err(|e| anyhow::anyhow!("write daemon lock {}: {e}", path.display()))?;
229
230    tracing::info!(pid = me, "wrote daemon lock at {}", path.display());
231    Ok(DaemonLock::from_path(path.to_path_buf()))
232}
233
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // ── lock_file_path ─────────────────────────────────────────────────────

    /// Why: the lock file must live alongside `http_addr` in the standard
    /// data dir so the `doctor` and `start` commands resolve both with the
    /// same `resolve_data_dir` call.
    /// What: overrides the data dir via `TRUSTY_DATA_DIR_OVERRIDE`, calls
    /// `lock_file_path()`, and asserts the path ends with `daemon.lock` and
    /// lives under the override.
    /// Test: itself.
    #[test]
    fn lock_file_path_uses_data_dir() {
        let tmp = tempdir().expect("tempdir");
        // SAFETY / NOTE(review): mutating the process environment is only
        // sound while no other thread reads or writes it. The default test
        // harness runs tests in parallel, so this relies on no other test
        // touching the environment at the same time — confirm, or serialise
        // env-mutating tests behind a shared lock.
        unsafe {
            std::env::set_var("TRUSTY_DATA_DIR_OVERRIDE", tmp.path());
        }
        let path = lock_file_path();
        unsafe {
            std::env::remove_var("TRUSTY_DATA_DIR_OVERRIDE");
        }
        let p = path.expect("lock_file_path must return Some under TRUSTY_DATA_DIR_OVERRIDE");
        assert_eq!(p.file_name().and_then(|n| n.to_str()), Some(LOCK_FILENAME));
        assert!(
            p.starts_with(tmp.path()),
            "lock file must live under the data dir override; got: {p:?}"
        );
    }

    // ── read_lock_pid ──────────────────────────────────────────────────────

    /// Why: a missing lock file means no daemon is registered; callers must
    /// treat this as "no daemon" (not an error).
    /// What: calls `read_lock_pid` on a nonexistent path; asserts `Ok(None)`.
    /// Test: itself (real fs, no daemon).
    #[test]
    fn read_lock_pid_returns_none_for_missing_file() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        let result = read_lock_pid(&path).expect("must not error for missing file");
        assert_eq!(result, None);
    }

    /// Why: a valid lock file must round-trip the PID so `acquire_lock` can
    /// determine whether the recorded process is still alive.
    /// What: writes a PID to a tempfile; asserts `read_lock_pid` returns it.
    /// Test: itself.
    #[test]
    fn read_lock_pid_returns_pid_for_valid_file() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, "12345\n").expect("write");
        let result = read_lock_pid(&path).expect("must not error for valid file");
        assert_eq!(result, Some(12345));
    }

    /// Why: an empty lock file (e.g. from a crashed partial write) must be
    /// treated as absent rather than crashing the daemon.
    /// What: writes an empty file; asserts `Ok(None)`.
    /// Test: itself.
    #[test]
    fn read_lock_pid_returns_none_for_empty_file() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, "").expect("write");
        let result = read_lock_pid(&path).expect("must not error for empty file");
        assert_eq!(result, None);
    }

    /// Why: a corrupt lock file must be treated as absent, exactly like an
    /// empty one, so that it can be overwritten on the next start.
    /// What: writes non-numeric content; asserts `Ok(None)`.
    /// Test: itself.
    #[test]
    fn read_lock_pid_returns_none_for_malformed_file() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, "not-a-pid\n").expect("write");
        let result = read_lock_pid(&path).expect("must not error for malformed file");
        assert_eq!(result, None);
    }

    // ── pid_alive ──────────────────────────────────────────────────────────

    /// Why: using PID 1 (init/launchd) as a guaranteed-alive process is
    /// platform-specific and fragile; instead we test the trivially-true
    /// case: the current process must be alive.
    /// What: asserts `pid_alive(std::process::id())` returns `true` on Unix.
    /// Test: itself.
    #[cfg(unix)]
    #[test]
    fn pid_alive_returns_true_for_current_pid() {
        assert!(
            pid_alive(std::process::id()),
            "current process must be alive"
        );
    }

    /// Why: `pid == 0` has process-group semantics for `kill`; the guard in
    /// `pid_alive` must short-circuit before any syscall.
    /// What: asserts `pid_alive(0)` is `false`.
    /// Test: itself.
    #[cfg(unix)]
    #[test]
    fn pid_alive_returns_false_for_pid_zero() {
        assert!(
            !pid_alive(0),
            "pid 0 has process-group semantics, not single-process"
        );
    }

    /// Why: `pid > i32::MAX` overflows `pid_t` and becomes negative, giving
    /// `kill` group/broadcast semantics (false positive).
    /// What: asserts both `u32::MAX` and `i32::MAX as u32 + 1` return `false`.
    /// Test: itself.
    #[cfg(unix)]
    #[test]
    fn pid_alive_returns_false_for_overflow_pid() {
        assert!(
            !pid_alive(u32::MAX),
            "u32::MAX overflows i32 → broadcast semantics"
        );
        assert!(
            !pid_alive(i32::MAX as u32 + 1),
            "first value that overflows i32"
        );
    }

    // ── acquire_lock ───────────────────────────────────────────────────────

    /// Why: the primary use case of `acquire_lock` is writing the daemon's
    /// own PID so future `start` / `serve` invocations detect the live
    /// daemon.
    /// What: calls `acquire_lock` against a temp path; reads the written
    /// file; asserts it contains the current PID.
    /// Test: itself (real fs, no daemon).
    #[test]
    fn acquire_lock_writes_own_pid() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        let _guard = acquire_lock(&path).expect("acquire_lock must succeed on empty path");
        let written = read_lock_pid(&path)
            .expect("read after write must not error")
            .expect("lock file must contain a PID after acquire");
        assert_eq!(
            written,
            std::process::id(),
            "lock file must contain the current process PID"
        );
    }

    /// Why: a lock that already names the current process is not a
    /// conflict; `acquire_lock` skips the liveness refusal when the recorded
    /// PID equals its own.
    /// What: pre-writes the current PID, calls `acquire_lock`, asserts
    /// success and that the file still records the current PID.
    /// Test: itself.
    #[test]
    fn acquire_lock_reacquires_own_pid() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, format!("{}\n", std::process::id())).expect("write own pid");
        let _guard = acquire_lock(&path).expect("own PID must not block acquisition");
        let written = read_lock_pid(&path)
            .expect("read after re-acquire must not error")
            .expect("lock file must contain a PID after re-acquire");
        assert_eq!(written, std::process::id());
    }

    /// Why: stale locks must be reclaimed after a crash so the daemon can
    /// restart. A reaped child provides a PID that is dead without relying
    /// on out-of-range sentinel values.
    /// What: spawns+reaps `true`, writes its PID as a stale lock, calls
    /// `acquire_lock`, asserts success and that the PID was overwritten.
    /// Test: itself (real fs, spawns `true`).
    #[cfg(unix)]
    #[test]
    fn acquire_lock_reclaims_stale_pid() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");

        // Spawn a real child that exits immediately, then reap it so its PID
        // is dead by the time we call pid_alive.
        let mut child = std::process::Command::new("true")
            .spawn()
            .expect("spawn 'true' must succeed on any Unix CI machine");
        let dead_pid = child.id();
        child.wait().expect("wait must succeed");

        // The PID must now be dead.
        assert!(
            !pid_alive(dead_pid),
            "pid_alive({dead_pid}) must be false after the child was reaped"
        );

        std::fs::write(&path, format!("{dead_pid}\n")).expect("write stale pid");
        let _guard = acquire_lock(&path).expect("acquire_lock must reclaim stale PID");
        let written = read_lock_pid(&path)
            .expect("read after reclaim must not error")
            .expect("lock file must contain a PID after reclaim");
        assert_eq!(
            written,
            std::process::id(),
            "lock file must be overwritten with current PID after stale reclaim"
        );
    }

    /// Why: non-Unix platforms always return false from `pid_alive` so any
    /// lock is treated as stale.
    /// What: writes a PID, asserts reclaim succeeds.
    /// Test: itself (non-Unix only).
    #[cfg(not(unix))]
    #[test]
    fn acquire_lock_reclaims_stale_pid() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, "99999\n").expect("write stale pid");
        let _guard = acquire_lock(&path).expect("acquire_lock must reclaim stale PID on non-Unix");
        let written = read_lock_pid(&path)
            .expect("read after reclaim must not error")
            .expect("lock file must contain a PID after reclaim");
        assert_eq!(written, std::process::id());
    }

    /// Why: if another live process holds the lock (e.g. the launchd-managed
    /// daemon is already running), a new `serve --foreground` invocation must
    /// fail loudly rather than starting a duplicate on a different port.
    /// What: spawns a long-sleeping child as a live process with a PID that
    /// differs from ours, records that PID in the lock file, and asserts
    /// that `acquire_lock` returns an "already running" error and leaves
    /// the file untouched. The child is killed and reaped before any
    /// assertion runs, so a failing assertion cannot leak it.
    /// Test: itself (real fs, spawns `sleep`; Unix only because `pid_alive`
    /// is always `false` elsewhere, which makes refusal unreachable).
    #[cfg(unix)]
    #[test]
    fn acquire_lock_refuses_live_pid() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");

        let mut holder = std::process::Command::new("sleep")
            .arg("30")
            .spawn()
            .expect("spawn 'sleep' must succeed on any Unix CI machine");
        let live_pid = holder.id();
        std::fs::write(&path, format!("{live_pid}\n")).expect("write live pid");

        let result = acquire_lock(&path);
        let recorded = read_lock_pid(&path);

        // Tear the child down before asserting.
        let _ = holder.kill();
        let _ = holder.wait();

        let err = result.expect_err("acquire_lock must refuse while the lock holder is alive");
        let msg = err.to_string();
        assert!(
            msg.contains("already running"),
            "error must mention 'already running'; got: {msg}"
        );
        assert_eq!(
            recorded.expect("read after refusal must not error"),
            Some(live_pid),
            "a refused acquire must not overwrite the holder's PID"
        );
    }

    // ── DaemonLock drop ────────────────────────────────────────────────────

    /// Why: removing the file on drop is the guard's whole contract; if it
    /// did not, every clean shutdown would leave a stale lock for the next
    /// startup to reclaim.
    /// What: acquires the lock, asserts the file exists, drops the guard,
    /// asserts the file is gone.
    /// Test: itself.
    #[test]
    fn daemon_lock_drops_removes_file() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        let guard = acquire_lock(&path).expect("acquire_lock must succeed on empty path");
        assert!(path.exists(), "lock file must exist after acquire");
        drop(guard);
        assert!(
            !path.exists(),
            "lock file must be removed when DaemonLock is dropped"
        );
    }
}