Skip to main content

dbmd_core/
fsx.rs

1//! `fsx` — the one atomic, durable file write for db.md's primary data.
2//!
3//! Every store-state file that holds **primary** data — content records
4//! ([`crate::parser::write_file`]), `log.md` and its archives ([`crate::log`]),
5//! and in-place link rewrites — is written through [`write_atomic`] or
6//! [`write_atomic_new`]:
7//!
8//! 1. write the bytes to a uniquely-named sibling temp file in the *same*
9//!    directory (`create_new`, so a predictable temp name can never be
10//!    clobbered — closing the temp-clobber race);
11//! 2. `fsync` the temp file;
12//! 3. either `rename` it over the destination ([`write_atomic`]) or hard-link it
13//!    into place with create-new semantics ([`write_atomic_new`]);
14//! 4. `fsync` the parent directory so the committed directory entry survives a
15//!    crash.
16//!
17//! These are the only primitives for durable writes — never `std::fs::write`,
18//! which is neither atomic nor crash-durable. Use [`write_atomic`] when replacing
19//! an existing file is intended; use [`write_atomic_new`] when the destination
20//! must not already exist.
21//!
22//! **Not for the index.** `index.md` / `index.jsonl` are *derived, rebuildable*
23//! artifacts on the O(changed) write-through path; they use their own
24//! atomic-but-not-`fsync`'d writer ([`crate::index`]'s `AtomicTemp`) on purpose
25//! — a crash-lost index write is recovered by `dbmd index rebuild`, so paying an
26//! `fsync` per catalog update on the hot loop would be cost without benefit.
27
28use std::fs::{self, File, OpenOptions};
29use std::io::Write;
30use std::path::{Path, PathBuf};
31use std::sync::atomic::{AtomicU64, Ordering};
32use std::time::{SystemTime, UNIX_EPOCH};
33
34/// Atomically and durably replace `path` with `bytes` (see the module docs for
35/// the write/fsync/rename/fsync sequence). The parent directory is created if
36/// missing. On *any* early return between temp-file creation and a successful
37/// rename — a `write_all`/`sync_all` failure (ENOSPC, EIO, quota) as well as a
38/// rename failure — the temp file is cleaned up rather than leaked, via the
39/// [`TempGuard`] `Drop` impl (mirroring `index.rs`'s `AtomicTemp`).
40pub fn write_atomic(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
41    let dir = path.parent().unwrap_or_else(|| Path::new("."));
42    fs::create_dir_all(dir)?;
43
44    let file_name = path
45        .file_name()
46        .and_then(|s| s.to_str())
47        .unwrap_or("dbmd-tmp");
48    let (mut f, mut guard) = create_temp_file(dir, file_name)?;
49
50    // Scope the handle so it is flushed/closed before the rename. A failure here
51    // returns via `?`; `guard` then drops and removes the orphaned temp file.
52    {
53        f.write_all(bytes)?;
54        f.sync_all()?;
55    }
56
57    // The rename either errors (guard drops, cleaning up the temp) or succeeds
58    // (we disarm the guard so it does not remove the now-renamed destination).
59    fs::rename(&guard.path, path)?;
60    guard.disarm();
61    sync_parent_dir(dir);
62    Ok(())
63}
64
65/// Atomically and durably create `path` with `bytes`, failing with
66/// [`std::io::ErrorKind::AlreadyExists`] if the destination already exists.
67///
68/// This follows the same temp-file + file-fsync + parent-fsync sequence as
69/// [`write_atomic`], but installs the temp file with `hard_link(temp, path)`
70/// instead of `rename(temp, path)`. Hard-link creation is resolved atomically by
71/// the OS and refuses an existing destination, so concurrent creators for the
72/// same path produce exactly one winner and `AlreadyExists` for the rest. The
73/// temporary link is removed after the destination link is established.
74pub fn write_atomic_new(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
75    let dir = path.parent().unwrap_or_else(|| Path::new("."));
76    fs::create_dir_all(dir)?;
77
78    let file_name = path
79        .file_name()
80        .and_then(|s| s.to_str())
81        .unwrap_or("dbmd-tmp");
82    let (mut f, mut guard) = create_temp_file(dir, file_name)?;
83
84    {
85        f.write_all(bytes)?;
86        f.sync_all()?;
87    }
88    drop(f);
89
90    fs::hard_link(&guard.path, path)?;
91    if fs::remove_file(&guard.path).is_ok() {
92        guard.disarm();
93    }
94    sync_parent_dir(dir);
95    Ok(())
96}
97
/// Deletes a hidden temp file when dropped, unless it has been disarmed.
///
/// [`create_temp_file`] hands the guard back armed, so every early return in
/// [`write_atomic`] / [`write_atomic_new`] removes the temp file. The writers
/// call [`TempGuard::disarm`] only once the temp name no longer needs cleanup
/// (after a successful rename, or after the temp link was removed), so the
/// final destination is never touched.
struct TempGuard {
    /// Location of the temp file this guard is responsible for.
    path: PathBuf,
    /// `true` while dropping the guard should still delete `path`.
    armed: bool,
}

impl TempGuard {
    /// Switch off the drop-time cleanup: `path` is no longer a stray temp file
    /// and must be left alone.
    fn disarm(&mut self) {
        self.armed = false;
    }
}

impl Drop for TempGuard {
    fn drop(&mut self) {
        if !self.armed {
            return;
        }
        // Best-effort: this runs on error paths, so a failed removal has no
        // caller left to report to.
        let _ = fs::remove_file(&self.path);
    }
}
123
124/// Create a uniquely-named temp file in `dir` with `create_new` (never clobbers
125/// a predictable name), retrying on the vanishingly-rare collision. The name is
126/// hidden (`.`-prefixed) and tagged with pid + nanos + a process-wide counter so
127/// concurrent writers in the same directory never pick the same path. Returns the
128/// open handle plus an armed [`TempGuard`] so any early return cleans up the temp.
129fn create_temp_file(dir: &Path, file_name: &str) -> std::io::Result<(File, TempGuard)> {
130    static TMP_SEQ: AtomicU64 = AtomicU64::new(0);
131    let pid = std::process::id();
132    let nanos = SystemTime::now()
133        .duration_since(UNIX_EPOCH)
134        .map(|d| d.as_nanos())
135        .unwrap_or(0);
136
137    for _ in 0..128 {
138        let seq = TMP_SEQ.fetch_add(1, Ordering::Relaxed);
139        let tmp = dir.join(format!(".{file_name}.tmp.{pid}.{nanos}.{seq}"));
140        match OpenOptions::new().write(true).create_new(true).open(&tmp) {
141            Ok(file) => {
142                return Ok((
143                    file,
144                    TempGuard {
145                        path: tmp,
146                        armed: true,
147                    },
148                ))
149            }
150            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => continue,
151            Err(e) => return Err(e),
152        }
153    }
154
155    Err(std::io::Error::new(
156        std::io::ErrorKind::AlreadyExists,
157        "could not allocate a unique dbmd temp file",
158    ))
159}
160
/// Best-effort `fsync` of the directory so a completed `rename` (or hard link)
/// is durable across a crash.
///
/// An empty `dir` is treated as `.`: `Path::parent` yields the empty path for a
/// bare relative file name, and an empty path cannot be opened, which would
/// otherwise turn this durability step into a silent no-op.
///
/// Non-fatal: failures to open or sync the directory are ignored, because some
/// filesystems disallow directory `fsync`.
fn sync_parent_dir(dir: &Path) {
    let target = if dir.as_os_str().is_empty() {
        Path::new(".")
    } else {
        dir
    };
    if let Ok(handle) = File::open(target) {
        let _ = handle.sync_all();
    }
}
168
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Fail the calling test if `dir` still holds any `.tmp.`-tagged entry,
    /// naming the leftovers so a failure is diagnosable from the message alone.
    fn assert_no_temp_files(dir: &Path) {
        let leftovers: Vec<_> = std::fs::read_dir(dir)
            .unwrap()
            .filter_map(|entry| entry.ok())
            .map(|entry| entry.file_name())
            .filter(|name| name.to_string_lossy().contains(".tmp."))
            .collect();
        assert!(
            leftovers.is_empty(),
            "no temp files may be left behind, found {leftovers:?}"
        );
    }

    #[test]
    fn write_atomic_creates_then_replaces_durably() {
        let tmp = TempDir::new().unwrap();
        let target = tmp.path().join("sub").join("file.txt"); // parent missing

        write_atomic(&target, b"first").unwrap();
        assert_eq!(std::fs::read(&target).unwrap(), b"first");

        // Replace in place — content swaps, no temp files left behind.
        write_atomic(&target, b"second").unwrap();
        assert_eq!(std::fs::read(&target).unwrap(), b"second");

        assert_no_temp_files(target.parent().unwrap());
    }

    #[test]
    fn write_atomic_is_byte_exact_including_empty() {
        let tmp = TempDir::new().unwrap();
        let target = tmp.path().join("empty.txt");
        write_atomic(&target, b"").unwrap();
        assert_eq!(std::fs::read(&target).unwrap(), b"");
    }

    #[test]
    fn write_atomic_new_creates_but_refuses_existing() {
        let tmp = TempDir::new().unwrap();
        let target = tmp.path().join("sub").join("file.txt");

        write_atomic_new(&target, b"first").unwrap();
        assert_eq!(std::fs::read(&target).unwrap(), b"first");

        let err = write_atomic_new(&target, b"second").unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::AlreadyExists);
        assert_eq!(
            std::fs::read(&target).unwrap(),
            b"first",
            "create-new failure must leave the existing destination untouched"
        );

        assert_no_temp_files(target.parent().unwrap());
    }

    #[test]
    fn write_atomic_new_allows_only_one_concurrent_creator() {
        use std::sync::{Arc, Barrier};

        for round in 0..40 {
            let tmp = TempDir::new().unwrap();
            let target = tmp.path().join("file.txt");
            let barrier = Arc::new(Barrier::new(8));

            let handles: Vec<_> = (0..8)
                .map(|i| {
                    let target = target.clone();
                    let barrier = Arc::clone(&barrier);
                    std::thread::spawn(move || {
                        let payload = format!("payload-{i}");
                        // Release all creators at once to maximise contention.
                        barrier.wait();
                        let result =
                            write_atomic_new(&target, payload.as_bytes()).map_err(|e| e.kind());
                        (payload, result)
                    })
                })
                .collect();

            let results: Vec<_> = handles.into_iter().map(|h| h.join().unwrap()).collect();
            let winners: Vec<_> = results
                .iter()
                .filter_map(|(payload, result)| result.is_ok().then_some(payload))
                .collect();
            let already_exists = results
                .iter()
                .filter(|(_, result)| matches!(result, Err(std::io::ErrorKind::AlreadyExists)))
                .count();

            assert_eq!(
                winners.len(),
                1,
                "round {round}: exactly one creator may win, got {results:?}"
            );
            assert_eq!(
                already_exists, 7,
                "round {round}: every losing creator must get AlreadyExists, got {results:?}"
            );

            let written = std::fs::read_to_string(&target).unwrap();
            assert_eq!(
                written, *winners[0],
                "round {round}: destination must contain the winner's payload"
            );
            assert_no_temp_files(tmp.path());
        }
    }

    /// Regression for finding #22: an early return between temp-file creation and
    /// a successful rename (e.g. `write_all`/`sync_all` failing under ENOSPC/EIO)
    /// must NOT leave the hidden temp file orphaned in the data directory.
    ///
    /// Pre-fix, `create_temp_file` handed back a bare `PathBuf` with no `Drop`
    /// cleanup, so dropping it without a rename — exactly what `?` does on a
    /// write/sync failure — left the temp on disk. This reconstructs that path by
    /// dropping the guard without renaming and asserting the temp is gone.
    #[test]
    fn regression_armed_guard_removes_temp_on_early_drop() {
        let dir = TempDir::new().unwrap();
        let (file, guard) = create_temp_file(dir.path(), "file.txt").unwrap();
        let tmp_path = guard.path.clone();
        assert!(
            tmp_path.exists(),
            "temp file should exist after create_temp_file"
        );

        // Simulate a write/sync failure bailing out before the rename: the file
        // handle and the (still-armed) guard go out of scope without a rename.
        drop(file);
        drop(guard);

        assert!(
            !tmp_path.exists(),
            "armed guard must remove the orphaned temp file on early drop"
        );
        // No stray `.tmp.` files left in the directory.
        assert_no_temp_files(dir.path());
    }

    /// Once disarmed (after a successful rename) the guard must NOT delete the
    /// path it was tracking — otherwise it would clobber the renamed destination.
    /// No rename is performed here: the still-present temp file stands in for the
    /// destination, which is all the guard's `Drop` can observe.
    #[test]
    fn regression_disarmed_guard_leaves_file_intact() {
        let dir = TempDir::new().unwrap();
        let (file, mut guard) = create_temp_file(dir.path(), "kept.txt").unwrap();
        drop(file);
        let kept = guard.path.clone();

        guard.disarm();
        drop(guard);

        assert!(
            kept.exists(),
            "disarmed guard must leave the renamed destination untouched"
        );
    }
}