Skip to main content

atomcode_core/setup/
lock.rs

1//! Project-level advisory file lock for setup. Dual-rail: `fs2::FileExt::try_lock_exclusive`
2//! + PID/start_time sentinel JSON. Sentinel handles sandbox/NFS/container edge cases
3//! where flock alone is unreliable.
4//!
5//! Acquire order:
6//! 1. Read `.atomcode/.setup.lock.sentinel` if present. If recorded PID is alive **and**
7//!    its start_time matches, return [`LockError::Held`] (unless `force = true`).
8//!    Stale sentinel is removed.
9//! 2. `try_lock_exclusive` on `.atomcode/.setup.lock` — second rail.
10//! 3. Write a fresh sentinel JSON with current PID, start_time, host, version.
11//!
12//! Drop releases both rails (unlock fs2, rm sentinel) but keeps the `.setup.lock`
13//! file so future opens reuse the inode.
14
15use fs2::FileExt;
16use serde::{Deserialize, Serialize};
17use std::fs::{File, OpenOptions};
18use std::io::Write;
19use std::path::{Path, PathBuf};
20use thiserror::Error;
21
22const LOCK_FILE: &str = ".setup.lock";
23const SENTINEL_FILE: &str = ".setup.lock.sentinel";
24
25#[derive(Debug, Error)]
26pub enum LockError {
27    #[error(
28        "Setup is already running (PID {pid} @ {host}, started {start_time}). Use --force to override."
29    )]
30    Held {
31        pid: u32,
32        start_time: String,
33        host: String,
34    },
35    #[error("Lock acquisition io error: {0}")]
36    Io(#[from] std::io::Error),
37}
38
39#[derive(Debug)]
40pub struct SetupLock {
41    fd: File,
42    /// Path to the project's `.atomcode/.setup.lock` file. Kept around for
43    /// diagnostics and future callers (e.g. error messages, force-cleanup CLI).
44    #[allow(dead_code)]
45    pub(super) lock_path: PathBuf,
46    pub(super) sentinel_path: PathBuf,
47}
48
49#[derive(Debug, Serialize, Deserialize)]
50struct Sentinel {
51    pid: u32,
52    start_time_nanos: u128,
53    host: String,
54    atomcode_version: String,
55}
56
/// Directory under `project_root` that holds the lock file and the sentinel.
fn lock_dir(project_root: &Path) -> PathBuf {
    let mut dir = project_root.to_path_buf();
    dir.push(".atomcode");
    dir
}
60
/// PID of the running process, as reported by the standard library.
fn current_pid() -> u32 {
    use std::process;
    process::id()
}
64
65fn hostname() -> String {
66    sysinfo::System::host_name().unwrap_or_else(|| "unknown".to_string())
67}
68
69/// Returns the current process's start_time as nanoseconds since UNIX epoch.
70/// sysinfo's `start_time()` returns seconds (u64); multiply to nanos for finer-grained
71/// future-proofing (Linux clocktick granularity is jiffy-ish, but the JSON field
72/// stays uniform regardless of OS).
73fn current_start_time_nanos() -> u128 {
74    use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
75    let pid = Pid::from_u32(current_pid());
76    let mut sys = System::new();
77    sys.refresh_processes_specifics(ProcessesToUpdate::Some(&[pid]), false, ProcessRefreshKind::new());
78    sys.process(pid)
79        .map(|p| (p.start_time() as u128) * 1_000_000_000)
80        .unwrap_or(0)
81}
82
83fn read_sentinel(path: &Path) -> Option<Sentinel> {
84    let raw = std::fs::read_to_string(path).ok()?;
85    serde_json::from_str(&raw).ok()
86}
87
88/// True iff a process with `pid` is currently running **and** its observed
89/// start_time (nanos) equals `start_time_nanos`. PID reuse after the previous
90/// setup crashed will not falsely report alive because start_time differs.
91fn process_alive_at(pid: u32, start_time_nanos: u128) -> bool {
92    use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
93    let target = Pid::from_u32(pid);
94    let mut sys = System::new();
95    sys.refresh_processes_specifics(ProcessesToUpdate::Some(&[target]), false, ProcessRefreshKind::new());
96    match sys.process(target) {
97        Some(p) => (p.start_time() as u128) * 1_000_000_000 == start_time_nanos,
98        None => false,
99    }
100}
101
102impl SetupLock {
103    pub fn acquire(project_root: &Path, force: bool) -> Result<Self, LockError> {
104        let dir = lock_dir(project_root);
105        std::fs::create_dir_all(&dir)?;
106        let lock_path = dir.join(LOCK_FILE);
107        let sentinel_path = dir.join(SENTINEL_FILE);
108
109        // Stage 1: inspect sentinel (primary rail).
110        //
111        // - If recorded owner is **alive** and !force: report Held with full identity.
112        // - If recorded owner is **alive** and force: do NOT delete sentinel yet; let
113        //   fs2 be the authority. If fs2 also held, force genuinely cannot take over
114        //   (peer still running). If fs2 is releasable, the alive-check raced and the
115        //   peer just exited — proceed with takeover (re-read in Stage 2 surfaces who).
116        // - If recorded owner is **stale** (dead PID or start_time mismatch): delete
117        //   sentinel so fs2 isn't confused by a leftover file.
118        let sentinel_owner: Option<Sentinel> = read_sentinel(&sentinel_path);
119        let owner_alive = sentinel_owner
120            .as_ref()
121            .is_some_and(|s| process_alive_at(s.pid, s.start_time_nanos));
122
123        if let Some(meta) = sentinel_owner.as_ref() {
124            if owner_alive && !force {
125                return Err(LockError::Held {
126                    pid: meta.pid,
127                    start_time: format!("{} ns", meta.start_time_nanos),
128                    host: meta.host.clone(),
129                });
130            }
131            if !owner_alive {
132                // Stale — clean up so fs2 won't see a leftover file from prior crash.
133                let _ = std::fs::remove_file(&sentinel_path);
134            }
135            // owner_alive && force: keep sentinel for now; fs2 is the authority.
136        }
137
138        // Stage 2: fs2 try_lock_exclusive (secondary rail).
139        let fd = OpenOptions::new()
140            .create(true)
141            .read(true)
142            .write(true)
143            .truncate(false)
144            .open(&lock_path)?;
145
146        if fd.try_lock_exclusive().is_err() {
147            // fs2 failed. Re-read sentinel to surface the *real* holder identity.
148            // Covers two race/edge cases:
149            //   (a) TOCTOU between our stale-sentinel removal and our fs2 attempt:
150            //       a sibling wrote a fresh sentinel + grabbed fs2 in the gap.
151            //   (b) force=true but the live sibling still holds fs2 — force cannot
152            //       take over a running peer; report the real PID so the user knows
153            //       whom to kill.
154            let live_owner = read_sentinel(&sentinel_path);
155            return Err(match live_owner {
156                Some(meta) => LockError::Held {
157                    pid: meta.pid,
158                    start_time: format!("{} ns", meta.start_time_nanos),
159                    host: meta.host,
160                },
161                None => LockError::Held {
162                    pid: 0,
163                    start_time: "concurrent (sentinel missing/corrupt)".to_string(),
164                    host: hostname(),
165                },
166            });
167        }
168
169        // Stage 3: we hold both rails. If force was used against a previously-alive
170        // owner, the peer must have released fs2 between Stage 1 and Stage 2 — warn
171        // so the operator knows takeover actually fired (the previous version warned
172        // unconditionally before fs2 succeeded, which was misleading on failure).
173        if force && owner_alive {
174            if let Some(meta) = sentinel_owner.as_ref() {
175                tracing::warn!(
176                    pid = meta.pid,
177                    host = %meta.host,
178                    "forced setup lock takeover after sibling released fs2 lock"
179                );
180            }
181            // Clean up the prior owner's sentinel before we write our own.
182            let _ = std::fs::remove_file(&sentinel_path);
183        }
184
185        // Stage 4: write our sentinel.
186        let sentinel = Sentinel {
187            pid: current_pid(),
188            start_time_nanos: current_start_time_nanos(),
189            host: hostname(),
190            atomcode_version: env!("CARGO_PKG_VERSION").to_string(),
191        };
192        let json = serde_json::to_string(&sentinel).expect("Sentinel serialize never fails");
193        let mut f = File::create(&sentinel_path)?;
194        f.write_all(json.as_bytes())?;
195        f.sync_all()?;
196
197        Ok(Self { fd, lock_path, sentinel_path })
198    }
199}
200
201impl Drop for SetupLock {
202    fn drop(&mut self) {
203        // Best-effort: errors during drop are intentionally swallowed. If unlock
204        // fails the OS will release on process exit; sentinel removal failure
205        // just leaves a stale file the next acquire will overwrite.
206        let _ = fs2::FileExt::unlock(&self.fd);
207        let _ = std::fs::remove_file(&self.sentinel_path);
208        // Keep `.setup.lock` file itself so future opens reuse the inode.
209    }
210}
211
#[cfg(test)]
mod tests {
    use super::*;

    /// A first acquire in an empty project creates both the lock file and the sentinel.
    #[test]
    fn acquire_creates_lock_in_fresh_project() {
        let dir = tempfile::tempdir().unwrap();
        let lock = SetupLock::acquire(dir.path(), false).unwrap();
        assert!(lock.lock_path.exists());
        assert!(lock.sentinel_path.exists());
    }

    /// While one guard is alive, a second non-forced acquire reports `Held`.
    #[test]
    fn second_acquire_fails_when_first_held() {
        let dir = tempfile::tempdir().unwrap();
        let _lock1 = SetupLock::acquire(dir.path(), false).unwrap();
        let err = SetupLock::acquire(dir.path(), false).unwrap_err();
        assert!(matches!(err, LockError::Held { .. }));
    }

    /// Dropping the guard frees the lock for the next caller.
    #[test]
    fn drop_releases_lock_so_next_acquire_succeeds() {
        let dir = tempfile::tempdir().unwrap();
        {
            let _lock1 = SetupLock::acquire(dir.path(), false).unwrap();
        }
        // drop happened; next acquire should succeed
        let _lock2 = SetupLock::acquire(dir.path(), false).unwrap();
    }

    /// Drop removes the sentinel but deliberately leaves the lock file in place.
    #[test]
    fn drop_removes_sentinel_but_keeps_lock_file() {
        let dir = tempfile::tempdir().unwrap();
        let lock = SetupLock::acquire(dir.path(), false).unwrap();
        let lock_path = lock.lock_path.clone();
        let sentinel_path = lock.sentinel_path.clone();
        drop(lock);
        assert!(lock_path.exists(), "lock file should survive drop");
        assert!(!sentinel_path.exists(), "sentinel should be removed on drop");
    }

    /// A sentinel whose start_time does not match the recorded PID is stale:
    /// acquire must succeed without `force` and replace it with our identity.
    #[test]
    fn stale_sentinel_is_replaced_by_new_owner() {
        let dir = tempfile::tempdir().unwrap();
        let atom_dir = lock_dir(dir.path());
        std::fs::create_dir_all(&atom_dir).unwrap();
        let sentinel_path = atom_dir.join(SENTINEL_FILE);

        // Our own PID with an impossible start_time models PID reuse after a crash.
        let stale = Sentinel {
            pid: current_pid(),
            start_time_nanos: 1,
            host: hostname(),
            atomcode_version: "0.0.0".to_string(),
        };
        std::fs::write(&sentinel_path, serde_json::to_string(&stale).unwrap()).unwrap();

        let _lock = SetupLock::acquire(dir.path(), false).unwrap();
        let owner = read_sentinel(&sentinel_path).expect("fresh sentinel should be written");
        assert_eq!(owner.pid, std::process::id());
        assert_ne!(owner.start_time_nanos, 1, "stale start_time should be overwritten");
    }

    #[test]
    fn force_with_alive_holder_still_fails_if_fs2_held() {
        // First lock takes both sentinel + fs2.
        let dir = tempfile::tempdir().unwrap();
        let _lock1 = SetupLock::acquire(dir.path(), false).unwrap();

        // Force=true cannot succeed when fs2 is genuinely held by the live sibling.
        // Holder PID should be reported as our own pid (since we wrote the sentinel ourselves).
        let err = SetupLock::acquire(dir.path(), true).unwrap_err();
        match err {
            LockError::Held { pid, .. } => {
                assert_eq!(pid, std::process::id(), "Held should surface real holder pid, not 0");
            }
            other => panic!("expected Held, got {other:?}"),
        }
    }

    #[test]
    fn fs2_race_loses_holder_identity_gracefully() {
        // A true fs2 race is hard to stage in a unit test. This checks the
        // related guarantee on the sentinel path: with a live sentinel present
        // and no force, the error carries the sentinel's identity, not pid=0.
        let dir = tempfile::tempdir().unwrap();
        let _lock1 = SetupLock::acquire(dir.path(), false).unwrap();
        let err = SetupLock::acquire(dir.path(), false).unwrap_err();
        match err {
            LockError::Held { pid, .. } => {
                assert_eq!(pid, std::process::id());
            }
            other => panic!("expected Held with real pid, got {other:?}"),
        }
    }
}