Skip to main content

solo_storage/
lockfile.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! `solo.lock`: O_EXCL-style mutex that prevents two daemons (or two `solo
4//! init` invocations) from racing on the same data dir.
5//!
6//! Per ADR-0003 §P8-I: stale-lock recovery + PID-alive checks. Behaviour:
7//!
8//!   - `Lockfile::acquire(path)` creates the file with `O_EXCL` (cross-platform
9//!     via `OpenOptions::create_new`) and writes the current PID.
10//!   - If the file already exists, we read the PID it contains and ask the OS
11//!     whether that PID is currently alive (via `sysinfo`). Two outcomes:
12//!       - **Alive**: refuse with `Error::Conflict` — another Solo process
13//!         genuinely owns the data dir.
14//!       - **Dead** (or unparseable PID): the previous run crashed without
15//!         removing the file. Remove it and retry once. We never loop —
16//!         repeated failures fall through to the conflict error.
17//!   - On drop, the file is deleted. (Best-effort; if delete fails we log
18//!     and continue.)
19//!
20//! Why we want this even in commit 1.1: `solo init` creates a fresh DB with
21//! the user's chosen passphrase. If two `solo init` invocations race, they
22//! could each generate a different salt and overwrite each other's config —
23//! leaving the user with a passphrase that no longer matches the on-disk DB.
24
25use solo_core::{Error, Result};
26use std::fs::{File, OpenOptions};
27use std::io::Write;
28use std::path::{Path, PathBuf};
29use sysinfo::{Pid, ProcessRefreshKind, RefreshKind, System};
30
31/// Cross-platform PID-alive check. Internally, `sysinfo` queries `/proc` on
32/// Linux, `kqueue` on BSD/macOS, and `OpenProcess` on Windows. We refresh a
33/// fresh `System` each call — there's no global state to keep stale.
34fn is_pid_alive(pid: u32) -> bool {
35    // Minimal refresh — we only need the process list, not CPU/memory/etc.
36    let sys = System::new_with_specifics(
37        RefreshKind::new().with_processes(ProcessRefreshKind::new()),
38    );
39    sys.process(Pid::from_u32(pid)).is_some()
40}
41
/// RAII handle to the data-dir lockfile.
///
/// While a value of this type is alive the lockfile exists on disk; dropping
/// it removes the file again (see the `Drop` impl). Binding the guard to a
/// named variable for as long as the lock is needed is therefore essential —
/// hence the `#[must_use]`.
#[derive(Debug)]
#[must_use = "the lock is released as soon as the `Lockfile` is dropped; bind it to a variable"]
pub struct Lockfile {
    /// Location of the lockfile on disk; used for removal on drop and for
    /// diagnostics.
    path: PathBuf,
    /// Held to keep the OS handle open for the lifetime of the guard. Dropping
    /// closes the handle; we explicitly remove the file in our own Drop impl.
    _handle: File,
}
50
51impl Lockfile {
52    /// Acquire the lock by creating `path` with O_EXCL. Writes the current
53    /// PID to the file. If the file already exists, attempt stale-lock
54    /// recovery: read the persisted PID, ask the OS if it's alive, remove
55    /// and retry once if it isn't.
56    pub fn acquire(path: &Path) -> Result<Self> {
57        match Self::try_create(path) {
58            Ok(lf) => Ok(lf),
59            Err(Error::Conflict(_)) => {
60                // Existing lockfile — investigate.
61                Self::try_recover_stale(path)?;
62                // One retry. If this fails too, surface the conflict.
63                Self::try_create(path)
64            }
65            Err(e) => Err(e),
66        }
67    }
68
69    /// Best-effort: if the existing lockfile's PID is dead, remove it.
70    /// Returns Ok if recovered, Err(Conflict) if the lock is genuinely held.
71    fn try_recover_stale(path: &Path) -> Result<()> {
72        let body = match std::fs::read_to_string(path) {
73            Ok(s) => s,
74            Err(_) => {
75                // Can't read — stay conservative, treat as held.
76                return Err(Self::held_error(path, None));
77            }
78        };
79        let pid = body.trim().parse::<u32>().ok();
80        let alive = match pid {
81            Some(p) => is_pid_alive(p),
82            // Unparseable PID body (corruption, partial write); treat as
83            // stale and recover.
84            None => false,
85        };
86        if alive {
87            return Err(Self::held_error(path, pid));
88        }
89        // Stale: the previous run died without cleaning up.
90        tracing::warn!(
91            ?pid,
92            path = %path.display(),
93            "stale lockfile detected (pid not alive); removing"
94        );
95        std::fs::remove_file(path)
96            .map_err(|e| Error::storage(format!("remove stale lockfile {}: {e}", path.display())))?;
97        Ok(())
98    }
99
100    fn try_create(path: &Path) -> Result<Self> {
101        let mut handle = OpenOptions::new()
102            .write(true)
103            .create_new(true)
104            .open(path)
105            .map_err(|e| match e.kind() {
106                std::io::ErrorKind::AlreadyExists => Self::held_error(path, None),
107                _ => Error::storage(format!("open lockfile {}: {e}", path.display())),
108            })?;
109        let pid = std::process::id();
110        write!(handle, "{pid}")
111            .map_err(|e| Error::storage(format!("write pid to lockfile: {e}")))?;
112        handle
113            .sync_all()
114            .map_err(|e| Error::storage(format!("fsync lockfile: {e}")))?;
115        Ok(Self {
116            path: path.to_path_buf(),
117            _handle: handle,
118        })
119    }
120
121    fn held_error(path: &Path, pid: Option<u32>) -> Error {
122        let pid_msg = match pid {
123            Some(p) => format!(" (held by pid {p})"),
124            None => String::new(),
125        };
126        Error::conflict(format!(
127            "lockfile {} already exists{pid_msg} — another Solo process is \
128             running. If you're sure no other instance is alive, remove the \
129             file manually.",
130            path.display()
131        ))
132    }
133
134    /// Path to the lockfile (for diagnostics).
135    pub fn path(&self) -> &Path {
136        &self.path
137    }
138}
139
140impl Drop for Lockfile {
141    fn drop(&mut self) {
142        // Best-effort: if delete fails (e.g., the file was already removed),
143        // log and continue. We deliberately don't panic in Drop.
144        if let Err(e) = std::fs::remove_file(&self.path) {
145            tracing::warn!(
146                error = %e,
147                path = %self.path.display(),
148                "failed to remove lockfile on drop"
149            );
150        }
151    }
152}
153
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates a fresh temp dir and returns it together with the path of a
    /// `solo.lock` inside it. The `TempDir` must stay bound for the whole
    /// test, otherwise the directory is deleted while the lock is in use.
    fn scratch_lock_path() -> (TempDir, std::path::PathBuf) {
        let dir = TempDir::new().unwrap();
        let lock_path = dir.path().join("solo.lock");
        (dir, lock_path)
    }

    #[test]
    fn acquire_creates_file_with_pid() {
        let (_dir, path) = scratch_lock_path();
        let _guard = Lockfile::acquire(&path).unwrap();
        assert!(path.exists());
        // Parsed without trimming on purpose: the body must be the bare PID.
        let recorded: u32 = std::fs::read_to_string(&path)
            .unwrap()
            .parse()
            .expect("pid should be a number");
        assert_eq!(recorded, std::process::id());
    }

    #[test]
    fn second_acquire_fails_with_conflict() {
        let (_dir, path) = scratch_lock_path();
        let _guard = Lockfile::acquire(&path).unwrap();
        // The first guard is still alive, so a second acquire must lose.
        let err = Lockfile::acquire(&path).unwrap_err();
        assert!(matches!(err, Error::Conflict(_)), "got: {err:?}");
    }

    #[test]
    fn drop_removes_file() {
        let (_dir, path) = scratch_lock_path();
        let guard = Lockfile::acquire(&path).unwrap();
        assert!(path.exists());
        drop(guard);
        assert!(!path.exists(), "lockfile should be removed on drop");
    }

    #[test]
    fn re_acquire_after_drop_succeeds() {
        let (_dir, path) = scratch_lock_path();
        // Take the lock and release it straight away.
        drop(Lockfile::acquire(&path).unwrap());
        let _second = Lockfile::acquire(&path).unwrap();
    }

    #[test]
    fn stale_lockfile_with_dead_pid_is_recovered() {
        let (_dir, path) = scratch_lock_path();
        // Plant a lockfile owned by a PID that should not exist. u32::MAX is
        // used on the assumption that real process IDs stay far below it on
        // the supported platforms.
        std::fs::write(&path, u32::MAX.to_string()).unwrap();
        // Acquire must discard the stale file and write a fresh one that
        // carries our own PID.
        let guard = Lockfile::acquire(&path).unwrap();
        assert!(path.exists());
        let contents = std::fs::read_to_string(&path).unwrap();
        let recorded: u32 = contents.trim().parse().unwrap();
        assert_eq!(recorded, std::process::id());
        drop(guard);
    }

    #[test]
    fn stale_lockfile_with_unparseable_body_is_recovered() {
        let (_dir, path) = scratch_lock_path();
        std::fs::write(&path, b"<garbage from a partial write>").unwrap();
        // Reaching the end without a panic is the assertion: recovery worked.
        let _guard = Lockfile::acquire(&path).unwrap();
    }

    #[test]
    fn live_pid_is_not_recovered() {
        let (_dir, path) = scratch_lock_path();
        // Our own PID is alive by definition.
        std::fs::write(&path, std::process::id().to_string()).unwrap();
        let err = Lockfile::acquire(&path).unwrap_err();
        assert!(matches!(err, Error::Conflict(_)), "got: {err:?}");
        // A live lock must never be removed.
        assert!(path.exists());
    }
}