dbmd_core/fsx.rs
1//! `fsx` — the one atomic, durable file write for db.md's primary data.
2//!
3//! Every store-state file that holds **primary** data — content records
4//! ([`crate::parser::write_file`]), `log.md` and its archives ([`crate::log`]),
5//! and in-place link rewrites — is written through [`write_atomic`] or
6//! [`write_atomic_new`]:
7//!
8//! 1. write the bytes to a uniquely-named sibling temp file in the *same*
9//! directory (`create_new`, so a predictable temp name can never be
10//! clobbered — closing the temp-clobber race);
11//! 2. `fsync` the temp file;
12//! 3. either `rename` it over the destination ([`write_atomic`]) or hard-link it
13//! into place with create-new semantics ([`write_atomic_new`]);
14//! 4. `fsync` the parent directory so the committed directory entry survives a
15//! crash.
16//!
17//! These are the only primitives for durable writes — never `std::fs::write`,
18//! which is neither atomic nor crash-durable. Use [`write_atomic`] when replacing
19//! an existing file is intended; use [`write_atomic_new`] when the destination
20//! must not already exist.
21//!
22//! **Not for the index.** `index.md` / `index.jsonl` are *derived, rebuildable*
23//! artifacts on the O(changed) write-through path; they use their own
24//! atomic-but-not-`fsync`'d writer ([`crate::index`]'s `AtomicTemp`) on purpose
25//! — a crash-lost index write is recovered by `dbmd index rebuild`, so paying an
26//! `fsync` per catalog update on the hot loop would be cost without benefit.
27
28use std::fs::{self, File, OpenOptions};
29use std::io::Write;
30use std::path::{Path, PathBuf};
31use std::sync::atomic::{AtomicU64, Ordering};
32use std::time::{SystemTime, UNIX_EPOCH};
33
34/// Atomically and durably replace `path` with `bytes` (see the module docs for
35/// the write/fsync/rename/fsync sequence). The parent directory is created if
36/// missing. On *any* early return between temp-file creation and a successful
37/// rename — a `write_all`/`sync_all` failure (ENOSPC, EIO, quota) as well as a
38/// rename failure — the temp file is cleaned up rather than leaked, via the
39/// [`TempGuard`] `Drop` impl (mirroring `index.rs`'s `AtomicTemp`).
40pub fn write_atomic(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
41 let dir = path.parent().unwrap_or_else(|| Path::new("."));
42 fs::create_dir_all(dir)?;
43
44 let file_name = path
45 .file_name()
46 .and_then(|s| s.to_str())
47 .unwrap_or("dbmd-tmp");
48 let (mut f, mut guard) = create_temp_file(dir, file_name)?;
49
50 // Scope the handle so it is flushed/closed before the rename. A failure here
51 // returns via `?`; `guard` then drops and removes the orphaned temp file.
52 {
53 f.write_all(bytes)?;
54 f.sync_all()?;
55 }
56
57 // Preserve the destination's existing permission bits. The temp file was
58 // created with the default mode (0666 & umask → 0644), and a bare
59 // `rename(temp, dest)` would install *that* mode as the destination's new
60 // mode — silently widening a deliberately-restricted file (e.g. `chmod 600`
61 // on a record holding private data) to world-readable 0644 on every rewrite.
62 // Copy the live destination mode onto the temp before the rename so an
63 // in-place update keeps the file's permissions. Best-effort: if the
64 // destination does not exist yet (a fresh create) or its metadata can't be
65 // read, the default mode stands. A `set_permissions` failure is non-fatal —
66 // the rewrite still commits with the default mode rather than aborting.
67 copy_existing_permissions(path, &guard.path);
68
69 // The rename either errors (guard drops, cleaning up the temp) or succeeds
70 // (we disarm the guard so it does not remove the now-renamed destination).
71 fs::rename(&guard.path, path)?;
72 guard.disarm();
73 sync_parent_dir(dir);
74 Ok(())
75}
76
/// Give `temp` the same permission bits as `dest`, provided `dest` currently
/// exists and its metadata can be read.
///
/// This lets a replace-by-rename keep the destination's original mode instead
/// of resetting it to whatever the temp file was created with. The whole
/// operation is best-effort: when `dest` is missing (a first create), when its
/// metadata is unreadable, or when applying the permissions to `temp` fails,
/// the function returns quietly and `temp` keeps its current mode.
fn copy_existing_permissions(dest: &Path, temp: &Path) {
    let existing = fs::metadata(dest).ok().map(|meta| meta.permissions());
    match existing {
        Some(perms) => {
            // Non-fatal by design: the caller still commits with the default mode.
            let _ = fs::set_permissions(temp, perms);
        }
        None => {}
    }
}
86
87/// Atomically and durably create `path` with `bytes`, failing with
88/// [`std::io::ErrorKind::AlreadyExists`] if the destination already exists.
89///
90/// This follows the same temp-file + file-fsync + parent-fsync sequence as
91/// [`write_atomic`], but installs the temp file with `hard_link(temp, path)`
92/// instead of `rename(temp, path)`. Hard-link creation is resolved atomically by
93/// the OS and refuses an existing destination, so concurrent creators for the
94/// same path produce exactly one winner and `AlreadyExists` for the rest. The
95/// temporary link is removed after the destination link is established.
96pub fn write_atomic_new(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
97 let dir = path.parent().unwrap_or_else(|| Path::new("."));
98 fs::create_dir_all(dir)?;
99
100 let file_name = path
101 .file_name()
102 .and_then(|s| s.to_str())
103 .unwrap_or("dbmd-tmp");
104 let (mut f, mut guard) = create_temp_file(dir, file_name)?;
105
106 {
107 f.write_all(bytes)?;
108 f.sync_all()?;
109 }
110 drop(f);
111
112 fs::hard_link(&guard.path, path)?;
113 if fs::remove_file(&guard.path).is_ok() {
114 guard.disarm();
115 }
116 sync_parent_dir(dir);
117 Ok(())
118}
119
/// Cleanup guard for the hidden temp file behind [`write_atomic`] and
/// [`write_atomic_new`].
///
/// Dropping the guard while it is still armed deletes `path`. Callers invoke
/// [`TempGuard::disarm`] only once the temp name no longer needs removing —
/// after a successful rename, or after the temp link has been removed in
/// [`write_atomic_new`] — so the committed destination is never deleted.
struct TempGuard {
    /// Location of the temp file this guard is responsible for.
    path: PathBuf,
    /// `true` while dropping the guard should still delete `path`.
    armed: bool,
}

impl TempGuard {
    /// Release the guard's responsibility for `path`: after this call, dropping
    /// the guard leaves the filesystem untouched.
    fn disarm(&mut self) {
        self.armed = false;
    }
}

impl Drop for TempGuard {
    fn drop(&mut self) {
        if !self.armed {
            return;
        }
        // Best-effort: an error path bailed out before the temp was committed,
        // and there is nothing useful to do if the removal itself fails.
        let _ = fs::remove_file(&self.path);
    }
}
145
146/// Create a uniquely-named temp file in `dir` with `create_new` (never clobbers
147/// a predictable name), retrying on the vanishingly-rare collision. The name is
148/// hidden (`.`-prefixed) and tagged with pid + nanos + a process-wide counter so
149/// concurrent writers in the same directory never pick the same path. Returns the
150/// open handle plus an armed [`TempGuard`] so any early return cleans up the temp.
151fn create_temp_file(dir: &Path, file_name: &str) -> std::io::Result<(File, TempGuard)> {
152 static TMP_SEQ: AtomicU64 = AtomicU64::new(0);
153 let pid = std::process::id();
154 let nanos = SystemTime::now()
155 .duration_since(UNIX_EPOCH)
156 .map(|d| d.as_nanos())
157 .unwrap_or(0);
158
159 for _ in 0..128 {
160 let seq = TMP_SEQ.fetch_add(1, Ordering::Relaxed);
161 let tmp = dir.join(format!(".{file_name}.tmp.{pid}.{nanos}.{seq}"));
162 match OpenOptions::new().write(true).create_new(true).open(&tmp) {
163 Ok(file) => {
164 return Ok((
165 file,
166 TempGuard {
167 path: tmp,
168 armed: true,
169 },
170 ))
171 }
172 Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => continue,
173 Err(e) => return Err(e),
174 }
175 }
176
177 Err(std::io::Error::new(
178 std::io::ErrorKind::AlreadyExists,
179 "could not allocate a unique dbmd temp file",
180 ))
181}
182
/// Best-effort `fsync` of the directory `dir`, so that a directory-entry change
/// just made inside it (a completed `rename`, `hard_link` or `remove_file`) is
/// durable across a crash.
///
/// An empty `dir` is treated as the current directory. `Path::parent` returns
/// `Some("")` — not `None` — for a bare relative file name such as `log.md`,
/// and opening `""` always fails, which would otherwise skip the sync silently
/// for exactly those paths.
///
/// Non-fatal: a failure to open or sync the directory is ignored, because some
/// filesystems disallow directory `fsync`.
fn sync_parent_dir(dir: &Path) {
    let dir = if dir.as_os_str().is_empty() {
        Path::new(".")
    } else {
        dir
    };
    if let Ok(d) = File::open(dir) {
        let _ = d.sync_all();
    }
}
190
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Fail the test if `dir` contains any entry whose name includes `.tmp.`,
    /// i.e. a temp file left behind by one of the writers.
    fn assert_no_temp_files(dir: &Path) {
        let leftovers: Vec<_> = std::fs::read_dir(dir)
            .unwrap()
            .filter_map(Result::ok)
            .filter(|entry| entry.file_name().to_string_lossy().contains(".tmp."))
            .collect();
        assert!(leftovers.is_empty(), "no temp files may be left behind");
    }

    /// Read the low nine permission bits of `path`.
    #[cfg(unix)]
    fn mode_of(path: &Path) -> u32 {
        use std::os::unix::fs::PermissionsExt;
        std::fs::metadata(path).unwrap().permissions().mode() & 0o777
    }

    #[test]
    fn write_atomic_creates_then_replaces_durably() {
        let root = TempDir::new().unwrap();
        // The `sub` directory does not exist yet; `write_atomic` must create it.
        let dest = root.path().join("sub").join("file.txt");

        write_atomic(&dest, b"first").unwrap();
        assert_eq!(std::fs::read(&dest).unwrap(), b"first");

        // A second write swaps the content in place and leaves no temp behind.
        write_atomic(&dest, b"second").unwrap();
        assert_eq!(std::fs::read(&dest).unwrap(), b"second");

        assert_no_temp_files(dest.parent().unwrap());
    }

    #[test]
    fn write_atomic_is_byte_exact_including_empty() {
        let root = TempDir::new().unwrap();
        let dest = root.path().join("empty.txt");

        write_atomic(&dest, b"").unwrap();

        assert_eq!(std::fs::read(&dest).unwrap(), b"");
    }

    #[test]
    fn write_atomic_new_creates_but_refuses_existing() {
        let root = TempDir::new().unwrap();
        let dest = root.path().join("sub").join("file.txt");

        write_atomic_new(&dest, b"first").unwrap();
        assert_eq!(std::fs::read(&dest).unwrap(), b"first");

        let second_attempt = write_atomic_new(&dest, b"second");
        assert_eq!(
            second_attempt.unwrap_err().kind(),
            std::io::ErrorKind::AlreadyExists
        );
        assert_eq!(
            std::fs::read(&dest).unwrap(),
            b"first",
            "create-new failure must leave the existing destination untouched"
        );

        assert_no_temp_files(dest.parent().unwrap());
    }

    #[test]
    fn write_atomic_new_allows_only_one_concurrent_creator() {
        use std::sync::{Arc, Barrier};

        for round in 0..40 {
            let root = TempDir::new().unwrap();
            let dest = root.path().join("file.txt");
            // All eight creators are released together to maximise contention.
            let start = Arc::new(Barrier::new(8));

            let mut workers = Vec::with_capacity(8);
            for i in 0..8 {
                let dest = dest.clone();
                let start = Arc::clone(&start);
                workers.push(std::thread::spawn(move || {
                    let payload = format!("payload-{i}");
                    start.wait();
                    let outcome =
                        write_atomic_new(&dest, payload.as_bytes()).map_err(|e| e.kind());
                    (payload, outcome)
                }));
            }

            let results: Vec<_> = workers
                .into_iter()
                .map(|worker| worker.join().unwrap())
                .collect();

            let winners: Vec<&String> = results
                .iter()
                .filter(|(_, outcome)| outcome.is_ok())
                .map(|(payload, _)| payload)
                .collect();
            let already_exists = results
                .iter()
                .filter(|(_, outcome)| {
                    matches!(outcome, Err(std::io::ErrorKind::AlreadyExists))
                })
                .count();

            assert_eq!(
                winners.len(),
                1,
                "round {round}: exactly one creator may win, got {results:?}"
            );
            assert_eq!(
                already_exists, 7,
                "round {round}: every losing creator must get AlreadyExists, got {results:?}"
            );

            let written = std::fs::read_to_string(&dest).unwrap();
            assert_eq!(
                written, *winners[0],
                "round {round}: destination must contain the winner's payload"
            );
            assert_no_temp_files(root.path());
        }
    }

    /// Regression for finding #22: bailing out between temp-file creation and a
    /// successful rename (e.g. `write_all`/`sync_all` failing under ENOSPC/EIO)
    /// must not orphan the hidden temp file in the data directory.
    ///
    /// Before the fix `create_temp_file` returned a bare `PathBuf` with no
    /// `Drop` cleanup, so the `?` on a write/sync failure left the temp on
    /// disk. The test reproduces that path by dropping the handle and the
    /// still-armed guard without ever renaming.
    #[test]
    fn regression_armed_guard_removes_temp_on_early_drop() {
        let root = TempDir::new().unwrap();
        let (handle, guard) = create_temp_file(root.path(), "file.txt").unwrap();
        let temp_path = guard.path.clone();
        assert!(
            temp_path.exists(),
            "temp file should exist after create_temp_file"
        );

        // Stand-in for an early return: both values go away with no rename.
        drop(handle);
        drop(guard);

        assert!(
            !temp_path.exists(),
            "armed guard must remove the orphaned temp file on early drop"
        );
        // And nothing else temp-like is left in the directory either.
        assert_no_temp_files(root.path());
    }

    /// A disarmed guard (the state after a successful rename) must not delete
    /// the path it was tracking, or it would remove the committed destination.
    #[test]
    fn regression_disarmed_guard_leaves_file_intact() {
        let root = TempDir::new().unwrap();
        let (handle, mut guard) = create_temp_file(root.path(), "kept.txt").unwrap();
        drop(handle);
        let kept = guard.path.clone();

        guard.disarm();
        drop(guard);

        assert!(
            kept.exists(),
            "disarmed guard must leave the renamed destination untouched"
        );
    }

    /// Regression: rewriting an existing file through `write_atomic` must keep
    /// its permission bits. Before the fix the temp file's default mode (0644)
    /// replaced a deliberately-restricted destination (0600) on every rewrite,
    /// quietly widening access to user data. A first create still gets the
    /// default mode, since there is no destination mode to copy.
    #[cfg(unix)]
    #[test]
    fn write_atomic_preserves_existing_destination_permissions() {
        use std::os::unix::fs::PermissionsExt;

        let root = TempDir::new().unwrap();
        let dest = root.path().join("private.md");

        // Create the file, then lock it down to 0600.
        write_atomic(&dest, b"secret v1").unwrap();
        std::fs::set_permissions(&dest, std::fs::Permissions::from_mode(0o600)).unwrap();
        let before = mode_of(&dest);
        assert_eq!(before, 0o600, "fixture must start at 0600");

        // Rewriting in place must keep 0600 instead of falling back to 0644.
        write_atomic(&dest, b"secret v2").unwrap();
        let after = mode_of(&dest);
        assert_eq!(
            after, 0o600,
            "write_atomic must preserve the destination's 0600 mode, got {after:o}"
        );
        assert_eq!(std::fs::read(&dest).unwrap(), b"secret v2");
    }
}
389}