socket_patch_core/patch/apply_lock.rs
1//! Advisory file lock used to serialize mutating operations against a
2//! single `.socket/` directory.
3//!
4//! Apply, rollback, repair, and remove can each rewrite manifest state
5//! and on-disk package files. Two of them running at once against the
6//! same project — common when a dev runs `socket-patch apply` while CI
7//! triggers a deploy hook, or when `apply` and a `repair` are stacked
8//! by a wrapper script — race on every file write. The lock turns
9//! that race into a clean refusal: the second invocation reports
10//! `lock_held` and exits non-zero, leaving the first to finish.
11//!
12//! The lock file lives at `<.socket>/apply.lock`. It is created on
13//! demand (the parent `.socket/` directory must exist first; callers
14//! get a clear error otherwise) and is **never deleted** — the file
15//! handle drop releases the OS-level advisory lock, but the inode
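//! sticks around for next time. That keeps acquisition idempotent
//! across runs and avoids the race that deleting the file would open
//! up: one process unlinks `apply.lock` while another has already
//! opened the old inode, so two callers end up creating and locking
//! different files at the same time.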
19//!
20//! Locking is advisory (`flock(2)` on Unix, `LockFileEx` on Windows
21//! via the `fs2` crate). Non-cooperating writers (a user shelling
22//! `rm -rf .socket/`) are not stopped — but every socket-patch
23//! mutating command honors the lock, which is what matters in
24//! practice.
25
26use std::path::{Path, PathBuf};
27use std::time::{Duration, Instant};
28
29use fs2::FileExt;
30use thiserror::Error;
31
32/// Errors surfaced when acquiring the apply lock.
33#[derive(Debug, Error)]
34pub enum LockError {
35 /// Another `socket-patch` process holds the lock and `timeout`
36 /// (possibly zero) elapsed without the lock becoming available.
37 #[error("another socket-patch process is operating in this directory")]
38 Held,
39
40 /// We could not create or open the lock file (typically a missing
41 /// `.socket/` directory or a permissions problem).
42 #[error("failed to open lock file at {path:?}: {source}")]
43 Io {
44 path: PathBuf,
45 #[source]
46 source: std::io::Error,
47 },
48}
49
/// RAII guard for the apply lock.
///
/// Returned by [`acquire`]; the lock is held for as long as the guard is
/// alive. Drop releases the OS-level advisory lock. There is no explicit
/// `unlock()` API on purpose — Rust's drop guarantees are simpler to
/// reason about than a `?`-fallible unlock path.
///
/// The only field is private, so code outside this module cannot build
/// a guard by hand. `#[must_use]` warns when a caller discards the
/// guard, which would release the lock immediately.
#[derive(Debug)]
#[must_use = "the lock is released when this guard is dropped"]
pub struct LockGuard {
    // The std::fs::File holds the OS handle whose drop releases the
    // lock; we keep it alive for the guard's lifetime. Field is unused
    // by name but its Drop side effect is the entire point. The leading
    // underscore keeps the compiler from flagging it as dead code.
    _file: std::fs::File,
}
63
64/// Try to acquire the apply lock at `<socket_dir>/apply.lock`.
65///
66/// `timeout = Duration::ZERO` makes this a non-blocking try-once. Any
67/// positive `timeout` re-tries with a 100 ms backoff until the lock
68/// becomes available or the budget elapses.
69///
70/// The lock file is created on demand. Its parent (`socket_dir`) must
71/// already exist — apply and friends create `.socket/` separately
72/// during `setup`, and we don't want lock acquisition to silently
73/// create directories on a misconfigured path.
74pub fn acquire(socket_dir: &Path, timeout: Duration) -> Result<LockGuard, LockError> {
75 let path = socket_dir.join("apply.lock");
76
77 // Open (or create) the lock file. `create(true)` is idempotent if
78 // it already exists; we never write to the file, only flock it.
79 let file = std::fs::OpenOptions::new()
80 .read(true)
81 .write(true)
82 .create(true)
83 .truncate(false)
84 .open(&path)
85 .map_err(|source| LockError::Io {
86 path: path.clone(),
87 source,
88 })?;
89
90 let deadline = Instant::now() + timeout;
91 loop {
92 match file.try_lock_exclusive() {
93 Ok(()) => return Ok(LockGuard { _file: file }),
94 // Only a genuine "someone else holds it" signal counts as
95 // contention and feeds the retry/`Held` path. Any other
96 // failure (ENOLCK, EBADF, a filesystem that doesn't support
97 // advisory locks, EACCES on a pre-existing read-only lock
98 // file, …) is a real I/O fault: surface it immediately as
99 // `Io` rather than busy-sleeping for the whole budget and
100 // then mislabelling it as `Held`. See `is_lock_contended`.
101 Err(ref e) if is_lock_contended(e) => {
102 let now = Instant::now();
103 if now >= deadline {
104 return Err(LockError::Held);
105 }
106 // Never sleep past the deadline: a sub-100 ms budget
107 // must not be rounded up to a full 100 ms wait. The
108 // remaining slice is always > 0 here (now < deadline).
109 let remaining = deadline - now;
110 std::thread::sleep(remaining.min(Duration::from_millis(100)));
111 }
112 Err(source) => {
113 return Err(LockError::Io {
114 path: path.clone(),
115 source,
116 });
117 }
118 }
119 }
120}
121
122/// Distinguish "the lock is held by someone else" from a real I/O
123/// failure of `try_lock_exclusive`.
124///
125/// `fs2` reports contention via a fixed OS-error sentinel
126/// (`EWOULDBLOCK` on Unix, `ERROR_LOCK_VIOLATION` on Windows), exposed
127/// as [`fs2::lock_contended_error`]. We compare raw OS codes — an exact
128/// match, and portable, because both that sentinel and any genuine
129/// `flock(2)`/`LockFileEx` failure are constructed from an OS error
130/// code. A non-OS error (`raw_os_error() == None`) can never be
131/// contention, so it correctly falls through to `Io`.
132fn is_lock_contended(err: &std::io::Error) -> bool {
133 err.raw_os_error() == fs2::lock_contended_error().raw_os_error()
134}
135
136#[cfg(test)]
137mod tests {
138 use super::*;
139
140 /// Lock file is created on demand and the first acquisition succeeds.
141 #[test]
142 fn first_acquire_succeeds() {
143 let dir = tempfile::tempdir().unwrap();
144 let guard = acquire(dir.path(), Duration::ZERO).unwrap();
145 // Lock file must exist on disk.
146 assert!(dir.path().join("apply.lock").is_file());
147 drop(guard);
148 }
149
150 /// Second concurrent acquire returns `LockError::Held` when the
151 /// first guard is still alive.
152 #[test]
153 fn second_concurrent_acquire_is_held() {
154 let dir = tempfile::tempdir().unwrap();
155 let _first = acquire(dir.path(), Duration::ZERO).unwrap();
156 let err = acquire(dir.path(), Duration::ZERO).unwrap_err();
157 assert!(matches!(err, LockError::Held));
158 }
159
160 /// After the first guard drops, a fresh acquire succeeds.
161 #[test]
162 fn drop_releases_lock() {
163 let dir = tempfile::tempdir().unwrap();
164 {
165 let _g = acquire(dir.path(), Duration::ZERO).unwrap();
166 } // guard dropped here
167 let again = acquire(dir.path(), Duration::ZERO);
168 assert!(again.is_ok());
169 }
170
171 /// Missing socket directory surfaces as `LockError::Io` with the
172 /// original `NotFound` underneath.
173 #[test]
174 fn missing_socket_dir_surfaces_io() {
175 let dir = tempfile::tempdir().unwrap();
176 let missing = dir.path().join("does-not-exist");
177 let err = acquire(&missing, Duration::ZERO).unwrap_err();
178 match err {
179 LockError::Io { source, .. } => {
180 assert_eq!(source.kind(), std::io::ErrorKind::NotFound);
181 }
182 _ => panic!("expected Io error, got {:?}", err),
183 }
184 }
185
186 /// Non-zero timeout waits then errors `Held` when the lock never
187 /// frees up.
188 #[test]
189 fn timeout_held() {
190 let dir = tempfile::tempdir().unwrap();
191 let _first = acquire(dir.path(), Duration::ZERO).unwrap();
192 let start = Instant::now();
193 let err = acquire(dir.path(), Duration::from_millis(250)).unwrap_err();
194 let elapsed = start.elapsed();
195 assert!(matches!(err, LockError::Held));
196 // We waited at least the budget (with some slack for the
197 // sleep granularity). Bound the upper end loosely so a slow
198 // CI host doesn't make this flaky.
199 assert!(
200 elapsed >= Duration::from_millis(200),
201 "expected at least 200ms wait, got {:?}",
202 elapsed
203 );
204 }
205
206 /// Regression: `fs2`'s own contended-lock sentinel must be
207 /// classified as contention (the `Held` path). If `fs2` ever
208 /// changed the sentinel out from under us, this catches it before
209 /// the misclassification reaches users.
210 #[test]
211 fn contended_sentinel_is_classified_as_contention() {
212 assert!(is_lock_contended(&fs2::lock_contended_error()));
213 }
214
215 /// Regression: genuine I/O failures of `try_lock_exclusive` must
216 /// NOT masquerade as contention. Previously every error funnelled
217 /// into the retry/`Held` path, so a real fault (e.g. ENOLCK on a
218 /// full kernel lock table, or a filesystem without advisory locks)
219 /// was reported as "another process is operating here" — and, with
220 /// a positive timeout, only after busy-sleeping the entire budget.
221 #[test]
222 fn genuine_io_errors_are_not_contention() {
223 use std::io::{Error, ErrorKind};
224
225 // Kind-only errors carry no OS code, so they can never equal
226 // the contended sentinel.
227 assert!(!is_lock_contended(&Error::from(ErrorKind::NotFound)));
228 assert!(!is_lock_contended(&Error::from(
229 ErrorKind::PermissionDenied
230 )));
231
232 // A concrete-but-different OS error (EINTR == 4 on Unix) must
233 // not look like contention either. Skip the exact code match on
234 // the off chance a platform reuses 4 for the contended sentinel.
235 let eintr = Error::from_raw_os_error(4);
236 if eintr.raw_os_error() != fs2::lock_contended_error().raw_os_error() {
237 assert!(!is_lock_contended(&eintr));
238 }
239 }
240
241 /// A non-blocking (`ZERO`) acquire on a contended lock returns
242 /// `Held` essentially immediately — it must not pay the 100 ms
243 /// backoff sleep before giving up.
244 #[test]
245 fn zero_timeout_does_not_sleep_before_held() {
246 let dir = tempfile::tempdir().unwrap();
247 let _first = acquire(dir.path(), Duration::ZERO).unwrap();
248 let start = Instant::now();
249 let err = acquire(dir.path(), Duration::ZERO).unwrap_err();
250 let elapsed = start.elapsed();
251 assert!(matches!(err, LockError::Held));
252 assert!(
253 elapsed < Duration::from_millis(100),
254 "non-blocking acquire should not sleep, took {:?}",
255 elapsed
256 );
257 }
258
259 /// The retry loop must not overshoot the deadline by a full sleep
260 /// quantum. A 150 ms budget should resolve well under the old
261 /// fixed-100 ms-sleep worst case (~200 ms) — the final sleep is
262 /// clamped to the remaining slice.
263 #[test]
264 fn wait_respects_deadline_without_full_quantum_overshoot() {
265 let dir = tempfile::tempdir().unwrap();
266 let _first = acquire(dir.path(), Duration::ZERO).unwrap();
267 let start = Instant::now();
268 let err = acquire(dir.path(), Duration::from_millis(150)).unwrap_err();
269 let elapsed = start.elapsed();
270 assert!(matches!(err, LockError::Held));
271 assert!(
272 elapsed >= Duration::from_millis(150),
273 "should wait at least the budget, got {:?}",
274 elapsed
275 );
276 // Loose upper bound: clamped sleeps mean we don't blow well past
277 // the budget. Generous slack keeps slow CI hosts non-flaky while
278 // still failing the old uncapped behaviour's pathological cases.
279 assert!(
280 elapsed < Duration::from_millis(450),
281 "clamped sleep should keep us near the budget, got {:?}",
282 elapsed
283 );
284 }
285}