Skip to main content

trusty_memory/commands/
daemon_lock.rs

1//! PID lock file for the `trusty-memory serve --foreground` daemon (issue #787).
2//!
3//! Why: the `start` subcommand used to detect a running daemon ONLY by probing
4//! the `http_addr` discovery file. When a launchd-managed `serve --foreground`
5//! instance crashed without cleaning up `http_addr`, or was deployed from an
6//! older binary that did not write `http_addr`, `start` concluded "no daemon
7//! running" and forked a new one that silently port-walked to 7071+.
8//! This module provides a PID lock file — written by `serve --foreground`
9//! before binding, cleared on graceful shutdown — giving `start` a second,
10//! independent signal to detect a live daemon.  A stale lock (PID not alive)
11//! is reclaimed transparently so a crash does not permanently block startup.
12//!
13//! What: exposes [`DaemonLock`] (RAII guard), [`acquire_lock`] (O_EXCL-create
14//! then fallback to stale-reclaim), [`read_lock_pid`] (inspect without
15//! acquiring), and [`lock_file_path`] / [`lock_file_path_for_dir`] helpers.
16//! Only used by the `serve --foreground` path — `start`, CLI subcommands, and
17//! the MCP bridge must never call [`acquire_lock`].
18//!
19//! Test: unit tests below cover stale-lock reclaim, live-lock refusal, and
20//! the write/remove cycle, all against temp directories so the real
21//! `~/.local/share/trusty-memory` is never touched.
22
23use anyhow::{bail, Result};
24use std::path::{Path, PathBuf};
25
/// Name of the daemon PID lock file inside the trusty-memory data directory.
///
/// Why: the lock file sits next to the `http_addr` discovery file, so a
/// single data-directory lookup is enough to locate both.
/// What: the bare filename only; callers join it onto a directory path.
/// Test: `lock_file_path_uses_data_dir` checks the joined path ends with it.
pub const LOCK_FILENAME: &str = "daemon.lock";
35
/// RAII guard that holds the daemon PID lock file.
///
/// Why: tie the lock file's lifetime to the daemon process so the file is
/// removed on both clean shutdown and panic without requiring every exit
/// path to call an explicit cleanup function.
/// What: wraps the lock-file path. `Drop` removes the file best-effort, but
/// only while the file still records this process's PID, so a guard that
/// lost the lock to another process never deletes that process's lock.
/// I/O errors are swallowed — a leftover file is reclaimed as stale on the
/// next startup.
/// Test: `daemon_lock_drops_removes_file`.
#[derive(Debug)]
#[must_use = "the daemon lock is released as soon as the guard is dropped"]
pub struct DaemonLock {
    path: PathBuf,
}

impl DaemonLock {
    /// Construct directly from a path (test helper + internal use only).
    ///
    /// Why: lets `acquire_lock` and tests build a guard for an arbitrary
    /// path without OS data-dir resolution.
    /// What: wraps `path`; the file at `path` is assumed to already exist
    /// and to contain the current process's PID.
    /// Test: exercised through `acquire_lock` in
    /// `daemon_lock_drops_removes_file`.
    pub(crate) fn from_path(path: PathBuf) -> Self {
        Self { path }
    }
}

impl Drop for DaemonLock {
    fn drop(&mut self) {
        // Only unlink a file that is still ours. If the lock was reclaimed or
        // recreated by another process in the meantime, the file now holds
        // that process's PID and deleting it would leave a live daemon
        // without a lock.
        let recorded = std::fs::read_to_string(&self.path)
            .ok()
            .and_then(|raw| raw.trim().parse::<u32>().ok());
        if recorded == Some(std::process::id()) {
            // Best-effort: if remove fails (e.g. concurrent `trusty-memory stop`)
            // we ignore — the next invocation reclaims the stale file.
            let _ = std::fs::remove_file(&self.path);
        }
    }
}
68
69/// Resolve the canonical lock-file path for the trusty-memory daemon.
70///
71/// Why: centralising the path keeps `acquire_lock`, `read_lock_pid`, and
72/// diagnostic checks in agreement.  Returns `None` when the data directory
73/// cannot be resolved so callers degrade gracefully rather than panicking.
74/// What: returns `{resolve_data_dir("trusty-memory")}/daemon.lock`, or
75/// `None` on resolution failure.
76/// Test: `lock_file_path_uses_data_dir` asserts the constructed path.
77pub fn lock_file_path() -> Option<PathBuf> {
78    trusty_common::resolve_data_dir("trusty-memory")
79        .ok()
80        .map(|d| d.join(LOCK_FILENAME))
81}
82
83/// Build the lock-file path under an explicitly supplied directory.
84///
85/// Why: test code needs to point at a tempdir without mutating the process
86/// environment.  `std::env::set_var` inside a parallel test harness is UB
87/// (data race on the env block); this function bypasses the env lookup so
88/// tests never touch global process state.
89/// What: returns `dir.join(LOCK_FILENAME)`.
90/// Test: `lock_file_path_uses_data_dir` calls this instead of
91/// `lock_file_path()` so the test never mutates the environment.
92pub fn lock_file_path_for_dir(dir: &Path) -> PathBuf {
93    dir.join(LOCK_FILENAME)
94}
95
96/// Check whether a PID is alive on this Unix host.
97///
98/// Why: `libc::kill(pid, 0)` avoids forking `/bin/kill` and guards against
99/// two Linux edge cases: `pid == 0` has process-group semantics (false
100/// positive) and `pid > i32::MAX` wraps to negative `pid_t` giving broadcast
101/// semantics (also false positive).
102/// What: returns `false` for `pid == 0` or `pid > i32::MAX`.  For valid pids
103/// calls `libc::kill(pid, 0)`: 0 → alive, `ESRCH` → dead, `EPERM` → alive.
104/// On non-Unix platforms always returns `false`.
105/// Test: `pid_alive_returns_false_for_pid_zero`,
106/// `pid_alive_returns_false_for_overflow_pid`,
107/// `pid_alive_returns_true_for_current_pid`.
108pub fn pid_alive(pid: u32) -> bool {
109    if pid == 0 || pid > i32::MAX as u32 {
110        return false;
111    }
112    #[cfg(unix)]
113    {
114        // SAFETY: kill(2) is async-signal-safe; signal 0 is liveness-only.
115        let rc = unsafe { libc::kill(pid as libc::pid_t, 0) };
116        if rc == 0 {
117            return true;
118        }
119        let err = std::io::Error::last_os_error().raw_os_error().unwrap_or(0);
120        err != libc::ESRCH // EPERM → exists but no permission → alive
121    }
122    #[cfg(not(unix))]
123    {
124        false
125    }
126}
127
128/// Read the PID stored in the lock file at `path`.
129///
130/// Why: `acquire_lock` and diagnostic commands need to read the lock file
131/// without acquiring it.
132/// What: reads the file, trims whitespace, and parses as `u32`. Returns
133/// `None` when the file does not exist, is empty, or contains a non-PID.
134/// Returns `Err` for I/O errors other than `NotFound`.
135/// Test: `read_lock_pid_returns_none_for_missing_file`,
136/// `read_lock_pid_returns_pid_for_valid_file`.
137pub fn read_lock_pid(path: &Path) -> Result<Option<u32>> {
138    let raw = match std::fs::read_to_string(path) {
139        Ok(s) => s,
140        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
141        Err(e) => {
142            return Err(anyhow::Error::new(e).context(format!("read lock file {}", path.display())));
143        }
144    };
145    let trimmed = raw.trim();
146    if trimmed.is_empty() {
147        return Ok(None);
148    }
149    Ok(trimmed.parse::<u32>().ok()) // malformed → None
150}
151
/// Write `{pid}\n` to `path` atomically (write to a temp file + rename).
///
/// Why: atomic write prevents a concurrent reader from observing a partial
/// file (e.g. a truncated PID) during the write window.  The temp file name
/// embeds `pid` so two processes reclaiming the same lock at once never
/// truncate or rename each other's half-written temp file.
/// What: creates parent dirs; writes PID + newline to `{path}.{pid}.tmp`;
/// fsyncs; renames to `path`.  On any failure the temp file is removed
/// (best-effort) and the I/O error is returned.
/// Test: called by `acquire_lock` on the reclaim path; covered by
/// `acquire_lock_reclaims_stale_pid`.
fn write_lock_file(path: &Path, pid: u32) -> std::io::Result<()> {
    /// Write and fsync the temp file, close it, then rename it over `dest`.
    fn write_then_rename(tmp: &Path, dest: &Path, pid: u32) -> std::io::Result<()> {
        use std::io::Write;
        {
            let mut f = std::fs::File::create(tmp)?;
            writeln!(f, "{pid}")?;
            f.sync_all()?;
        } // handle closed before the rename
        std::fs::rename(tmp, dest)
    }

    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)?;
    }

    // Append to the full path instead of replacing its extension, so the
    // temp file is always a sibling named after the lock file itself.
    let mut tmp_name = path.as_os_str().to_os_string();
    tmp_name.push(format!(".{pid}.tmp"));
    let tmp = PathBuf::from(tmp_name);

    let outcome = write_then_rename(&tmp, path, pid);
    if outcome.is_err() {
        // Do not leave a stray temp file behind; the original error wins.
        let _ = std::fs::remove_file(&tmp);
    }
    outcome
}
173
/// Create the lock file with `O_CREAT | O_EXCL` semantics and record `pid`.
///
/// Why: an exclusive create lets exactly one concurrent caller win, so
/// there is no gap between "file is absent" and "file is ours".  This is
/// the first phase of [`acquire_lock`].
/// What: ensures the parent directory exists, opens `path` with
/// `create_new`, writes `{pid}\n`, and fsyncs.  `Ok(true)` means this call
/// created the file; `Ok(false)` means it already existed (the caller moves
/// on to stale-reclaim); every other I/O failure is returned as `Err`.
/// Test: covered by `acquire_lock_writes_own_pid` (empty-path happy path).
fn try_create_lock_exclusive(path: &Path, pid: u32) -> std::io::Result<bool> {
    use std::io::{ErrorKind, Write};

    if let Some(dir) = path.parent() {
        std::fs::create_dir_all(dir)?;
    }

    let mut options = std::fs::OpenOptions::new();
    options.write(true).create_new(true); // O_CREAT | O_EXCL

    let mut file = match options.open(path) {
        Ok(file) => file,
        // Somebody else already holds (or left behind) the file.
        Err(err) if err.kind() == ErrorKind::AlreadyExists => return Ok(false),
        Err(err) => return Err(err),
    };

    writeln!(file, "{pid}")?;
    file.sync_all()?;
    Ok(true)
}
203
204/// Attempt to acquire the daemon PID lock file at `path`.
205///
206/// Why: without a lock, a stale `http_addr` lets `start` fork a new daemon
207/// that silently port-walks to 7071+.  The lock gives `start` and the
208/// single-instance guard in `main.rs` a second detection layer.
209///
210/// Two-phase acquisition (closes TOCTOU advisory in #797):
211/// 1. **O_EXCL create** — atomic; only one caller wins; no TOCTOU window.
212/// 2. **Stale-reclaim fallback** — if the file existed, check the recorded
213///    PID; if dead, overwrite; if alive, return `Err("already running")`.
214///    A narrow TOCTOU remains on the reclaim path but is bounded: at worst
215///    two concurrent starters both see a dead PID and one overwrites the
216///    other; defence-in-depth (port-abort) catches the loser.
217///
218/// What: returns a [`DaemonLock`] RAII guard on success; `Err` if a live
219/// daemon already holds the lock.
220/// Test: `acquire_lock_writes_own_pid`, `acquire_lock_reclaims_stale_pid`,
221/// `acquire_lock_refuses_live_pid`.
222pub fn acquire_lock(path: &Path) -> Result<DaemonLock> {
223    let me = std::process::id();
224
225    // Phase 1: O_EXCL — race-free when the file is absent.
226    match try_create_lock_exclusive(path, me) {
227        Ok(true) => {
228            tracing::info!(
229                pid = me,
230                "wrote daemon lock at {} (exclusive create)",
231                path.display()
232            );
233            return Ok(DaemonLock::from_path(path.to_path_buf()));
234        }
235        Ok(false) => {} // File existed; fall through to Phase 2.
236        Err(e) => {
237            return Err(anyhow::anyhow!(
238                "create daemon lock {}: {e}",
239                path.display()
240            ));
241        }
242    }
243
244    // Phase 2: file exists — read the recorded PID and decide.
245    if let Some(existing_pid) = read_lock_pid(path)? {
246        if existing_pid != me && pid_alive(existing_pid) {
247            bail!(
248                "trusty-memory daemon is already running as PID {existing_pid} \
249                 (lock file: {}). \
250                 If you believe this is a stale lock, remove it manually: \
251                 rm {:?}",
252                path.display(),
253                path
254            );
255        }
256        tracing::info!(
257            stale_pid = existing_pid,
258            "reclaiming stale daemon lock file at {}",
259            path.display()
260        );
261    }
262    write_lock_file(path, me)
263        .map_err(|e| anyhow::anyhow!("write daemon lock {}: {e}", path.display()))?;
264    tracing::info!(pid = me, "wrote daemon lock at {}", path.display());
265    Ok(DaemonLock::from_path(path.to_path_buf()))
266}
267
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // ── lock_file_path ─────────────────────────────────────────────────────

    /// Why: the lock file must live in the standard data dir so `doctor` and
    /// `start` resolve it with one `resolve_data_dir` call.
    /// What: constructs the path via `lock_file_path_for_dir` (no env
    /// mutation) and asserts it ends with `daemon.lock` under the supplied
    /// tempdir.
    /// Test: itself (pure path construction, no I/O).
    #[test]
    fn lock_file_path_uses_data_dir() {
        let tmp = tempdir().expect("tempdir");
        let path = lock_file_path_for_dir(tmp.path());
        assert_eq!(
            path.file_name().and_then(|n| n.to_str()),
            Some(LOCK_FILENAME)
        );
        assert!(
            path.starts_with(tmp.path()),
            "lock file must live under the data dir; got: {path:?}"
        );
    }

    // ── read_lock_pid ──────────────────────────────────────────────────────

    /// Why: a missing lock file means no daemon registered; callers treat
    /// this as "no daemon" not an error.
    /// What: calls `read_lock_pid` on a nonexistent path; asserts `Ok(None)`.
    /// Test: itself.
    #[test]
    fn read_lock_pid_returns_none_for_missing_file() {
        let tmp = tempdir().expect("tempdir");
        let result = read_lock_pid(&tmp.path().join("daemon.lock"))
            .expect("must not error for missing file");
        assert_eq!(result, None);
    }

    /// Why: a valid lock file must round-trip the PID so `acquire_lock` can
    /// check liveness of the recorded process.
    /// What: writes a PID; asserts `read_lock_pid` returns it.
    /// Test: itself.
    #[test]
    fn read_lock_pid_returns_pid_for_valid_file() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, "12345\n").expect("write");
        assert_eq!(
            read_lock_pid(&path).expect("must not error for valid file"),
            Some(12345)
        );
    }

    /// Why: an empty lock file (e.g. partial write) must be treated as
    /// absent rather than crashing the daemon.
    /// What: writes an empty file; asserts `Ok(None)`.
    /// Test: itself.
    #[test]
    fn read_lock_pid_returns_none_for_empty_file() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, "").expect("write");
        assert_eq!(
            read_lock_pid(&path).expect("must not error for empty file"),
            None
        );
    }

    /// Why: a lock file holding text that is not a PID must read as "no
    /// PID" so the reclaim path can overwrite it.
    /// What: writes non-numeric and negative content; asserts `Ok(None)`.
    /// Test: itself.
    #[test]
    fn read_lock_pid_returns_none_for_malformed_file() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        for content in ["not-a-pid\n", "-5\n", "12 34\n"] {
            std::fs::write(&path, content).expect("write");
            assert_eq!(
                read_lock_pid(&path).expect("must not error for malformed file"),
                None,
                "content {content:?} must not parse as a PID"
            );
        }
    }

    // ── pid_alive ──────────────────────────────────────────────────────────

    /// Why: the current process must always be alive; this is the safe,
    /// reliable alternative to hard-coding PID 1.
    /// What: asserts `pid_alive(std::process::id())` is `true` on Unix.
    /// Test: itself.
    #[cfg(unix)]
    #[test]
    fn pid_alive_returns_true_for_current_pid() {
        assert!(
            pid_alive(std::process::id()),
            "current process must be alive"
        );
    }

    /// Why: `pid == 0` has process-group semantics; the guard must
    /// short-circuit before any syscall.
    /// What: asserts `pid_alive(0)` is `false`.
    /// Test: itself.
    #[cfg(unix)]
    #[test]
    fn pid_alive_returns_false_for_pid_zero() {
        assert!(!pid_alive(0), "pid 0 has process-group semantics");
    }

    /// Why: `pid > i32::MAX` wraps to negative `pid_t` giving broadcast
    /// semantics (`kill(-1, 0)`) — must guard before syscall.
    /// What: asserts both `u32::MAX` and `i32::MAX as u32 + 1` return `false`.
    /// Test: itself.
    #[cfg(unix)]
    #[test]
    fn pid_alive_returns_false_for_overflow_pid() {
        assert!(!pid_alive(u32::MAX), "u32::MAX overflows i32");
        assert!(!pid_alive(i32::MAX as u32 + 1), "first i32-overflow value");
    }

    // ── acquire_lock ───────────────────────────────────────────────────────

    /// Why: the primary use case of `acquire_lock` is writing the daemon's own
    /// PID so future invocations detect the live daemon.  On an empty path the
    /// O_EXCL exclusive-create branch (Phase 1) must succeed.
    /// What: calls `acquire_lock` against a fresh temp path; reads the file;
    /// asserts it contains the current PID.
    /// Test: itself (real fs, no daemon).
    #[test]
    fn acquire_lock_writes_own_pid() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        assert!(!path.exists(), "pre-condition: lock file must not exist");
        let _guard = acquire_lock(&path).expect("acquire must succeed on empty path");
        let written = read_lock_pid(&path)
            .expect("read after write must not error")
            .expect("lock file must contain a PID after acquire");
        assert_eq!(written, std::process::id());
    }

    /// Why: stale locks must be reclaimed after a crash so the daemon can
    /// restart.  Uses a real spawned+reaped child (not `u32::MAX`) to avoid
    /// the broadcast-semantics false-positive on Linux.
    /// What: spawns+reaps `true`, writes its dead PID as the stale lock, calls
    /// `acquire_lock`, asserts success and PID overwrite.
    /// Test: itself (real fs, spawns `true`).
    #[cfg(unix)]
    #[test]
    fn acquire_lock_reclaims_stale_pid() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        let mut child = std::process::Command::new("true")
            .spawn()
            .expect("spawn 'true' must succeed");
        let dead_pid = child.id();
        child.wait().expect("wait must succeed");
        assert!(
            !pid_alive(dead_pid),
            "pid_alive({dead_pid}) must be false after child was reaped"
        );
        std::fs::write(&path, format!("{dead_pid}\n")).expect("write stale pid");
        let _guard = acquire_lock(&path).expect("acquire must reclaim stale PID");
        let written = read_lock_pid(&path)
            .expect("read after reclaim must not error")
            .expect("lock file must contain a PID after reclaim");
        assert_eq!(written, std::process::id());
    }

    /// Why: non-Unix platforms always return false from `pid_alive` so any
    /// lock is treated as stale.
    /// What: writes a PID; asserts reclaim succeeds.
    /// Test: itself (non-Unix only).
    #[cfg(not(unix))]
    #[test]
    fn acquire_lock_reclaims_stale_pid() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, "99999\n").expect("write stale pid");
        let _guard = acquire_lock(&path).expect("acquire must reclaim stale PID on non-Unix");
        assert_eq!(
            read_lock_pid(&path).expect("read").expect("pid"),
            std::process::id()
        );
    }

    /// Why: a lock file with unparsable content (e.g. left by a crash
    /// mid-write) must not block startup; it carries no PID to defer to.
    /// What: writes garbage text as the lock; asserts `acquire_lock`
    /// succeeds and the file then holds the current PID.
    /// Test: itself (real fs, no daemon).
    #[test]
    fn acquire_lock_reclaims_malformed_lock() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, "garbage\n").expect("write malformed lock");
        let _guard = acquire_lock(&path).expect("acquire must reclaim malformed lock");
        assert_eq!(
            read_lock_pid(&path)
                .expect("read after reclaim must not error")
                .expect("lock file must contain a PID after reclaim"),
            std::process::id()
        );
    }

    /// Why: if a live process holds the lock a new `serve --foreground` must
    /// fail loudly rather than starting a duplicate on a different port.
    /// What: writes PID 1 (init/launchd) as the held lock; asserts
    /// `acquire_lock` returns `Err` containing "already running".
    /// Test: itself (Unix only; returns early when PID 1 is not visible or
    /// when the test process itself is PID 1, since a lock recording our own
    /// PID is reclaimed rather than refused).
    #[cfg(unix)]
    #[test]
    fn acquire_lock_refuses_live_pid() {
        if std::process::id() == 1 || !pid_alive(1) {
            return;
        }
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        std::fs::write(&path, "1\n").expect("write pid 1");
        let err = acquire_lock(&path).expect_err("must refuse live lock holder PID 1");
        assert!(
            err.to_string().contains("already running"),
            "error must mention 'already running'"
        );
    }

    // ── DaemonLock drop ────────────────────────────────────────────────────

    /// Why: `DaemonLock::drop` is the primary safety guarantee — it removes
    /// the file so a clean shutdown leaves no stale lock.
    /// What: acquires the lock; asserts file exists; drops guard; asserts gone.
    /// Test: itself.
    #[test]
    fn daemon_lock_drops_removes_file() {
        let tmp = tempdir().expect("tempdir");
        let path = tmp.path().join("daemon.lock");
        let guard = acquire_lock(&path).expect("acquire must succeed on empty path");
        assert!(path.exists(), "lock file must exist after acquire");
        drop(guard);
        assert!(!path.exists(), "lock file must be removed on drop");
    }
}