Skip to main content

obj_core/platform/
mod.rs

1//! Platform layer (L0).
2//!
3//! This module owns the file-system primitives the pager and WAL
4//! build on: opening a database file, positioned reads and writes at
5//! fixed page boundaries, length queries, truncation, removal, and the
6//! durability primitive [`FileHandle::sync_data`].
7//!
8//! # `unsafe` policy
9//!
10//! Power-of-ten Rule 8 confines `unsafe` to this submodule (and to
11//! `libobj`). All positioned-I/O and durability calls go through the
12//! `rustix` crate, which provides audited safe wrappers. The
13//! cross-process locking submodule [`lock`] reaches for `libc::fcntl`
14//! / `LockFileEx` directly because `rustix` does not expose POSIX
15//! OFD-lock variants; every `unsafe` block in that submodule
16//! carries a `// SAFETY:` comment per Rule 8. This `mod.rs` itself
17//! contains no `unsafe` blocks and is `#![deny(unsafe_code)]`; the
18//! lint is scoped to the file rather than the module tree so the
19//! `lock` submodule can re-introduce its (audited) `unsafe`
20//! blocks.
21
22#![deny(unsafe_code)]
23
24#[cfg(any(test, feature = "fault-injection"))]
25pub mod fault;
26
27pub mod lock;
28
29pub use crate::platform::lock::{ReaderLock, WriterLock};
30
31use std::fs::{File, OpenOptions};
32use std::io;
33use std::path::Path;
34
35use rustix::fs::FileExt as _;
36
37use crate::error::{Error, Result};
38
39/// File-backend abstraction the pager and WAL build on.
40///
41/// `FileBackend` is the common subset of [`FileHandle`] operations
42/// that fault-injection harnesses and the production type both expose
43/// (Rule 9). Production code never holds `dyn FileBackend`; both
44/// [`crate::pager::Pager`] and [`crate::wal::Wal`] are generic over
45/// `F: FileBackend` so the dispatch stays monomorphised.
46///
47/// New methods added to this trait MUST mirror an existing
48/// [`FileHandle`] method exactly. Adding a method that does not exist
49/// on the production type would let the harness perform syscalls
50/// production code cannot — a forbidden divergence (the harness
51/// must be a strict superset of legal behaviour, never a separate
52/// kingdom).
53pub trait FileBackend: Sized {
54    /// Length of the file in bytes. See [`FileHandle::len`].
55    ///
56    /// # Errors
57    ///
58    /// Returns [`Error::Io`] on syscall failure.
59    fn len(&self) -> Result<u64>;
60
61    /// `true` iff the file has zero length.
62    ///
63    /// # Errors
64    ///
65    /// Returns [`Error::Io`] on syscall failure.
66    fn is_empty(&self) -> Result<bool> {
67        Ok(self.len()? == 0)
68    }
69
70    /// Positioned read. See [`FileHandle::read_exact_at`].
71    ///
72    /// # Errors
73    ///
74    /// Returns [`Error::Io`] on syscall failure or harness-injected
75    /// short read.
76    fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()>;
77
78    /// Positioned write. See [`FileHandle::write_all_at`].
79    ///
80    /// # Errors
81    ///
82    /// Returns [`Error::Io`] on syscall failure.
83    fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<()>;
84
85    /// Truncate or extend the file. See [`FileHandle::set_len`].
86    ///
87    /// # Errors
88    ///
89    /// Returns [`Error::Io`] on syscall failure.
90    fn set_len(&self, new_len: u64) -> Result<()>;
91
92    /// See [`FileHandle::sync_data`].
93    ///
94    /// # Errors
95    ///
96    /// Returns [`Error::Io`] on syscall failure.
97    fn sync_data(&self, mode: SyncMode) -> Result<()>;
98
99    /// See [`FileHandle::sync_all`].
100    ///
101    /// # Errors
102    ///
103    /// Returns [`Error::Io`] on syscall failure.
104    fn sync_all(&self) -> Result<()>;
105}
106
/// Selects how hard [`FileHandle::sync_data`] pushes data towards
/// persistent storage.
///
/// This is the user-visible durability knob: it picks the
/// cross-platform fsync primitive `obj` issues after a WAL commit. The
/// per-variant contract is documented in `docs/format.md` § `SyncMode`.
///
/// [`SyncMode::Full`] is the default, so a `commit` that returns
/// `Ok(())` is durable across a system-wide power loss. `Normal` trades
/// some of that guarantee for throughput; `Off` issues no syscall at
/// all and is only safe for tests and benchmarks.
///
/// Power-of-ten Rule 5: one three-state enum is far cheaper to audit
/// than three `bool` knobs, and every `match` over it is exhaustive.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum SyncMode {
    /// Strongest durability; survives system-wide power loss.
    ///
    /// Implemented as `fcntl(F_FULLFSYNC)` on macOS (which forces the
    /// drive cache to flush), `FlushFileBuffers` on Windows, and
    /// `fdatasync` on Linux / BSDs. Plain `fsync` is **not** enough on
    /// macOS because it does not flush the drive cache; `F_FULLFSYNC`
    /// does. This is the standard wisdom for safety-critical macOS
    /// storage.
    #[default]
    Full,

    /// Durable across a process crash or kernel panic, but data may be
    /// lost on sudden power loss if the drive's write cache has not
    /// been flushed by the time the OS acknowledges the call.
    ///
    /// The intended mapping is `fsync` on Unix and `FlushFileBuffers`
    /// on Windows. Windows offers nothing weaker than
    /// `FlushFileBuffers`, so `Normal` and `Full` are equivalent there.
    Normal,

    /// No durability call is made. The OS may write the data out
    /// eventually, but `obj` never asks it to. Use only for tests and
    /// benchmarks where data loss is acceptable.
    Off,
}
147
/// An open database file supporting positioned reads and writes at
/// page boundaries.
///
/// The surface is deliberately small: only what the pager (L1) and the
/// WAL (L2) require. Higher layers must never bypass this type and
/// call `std::fs` themselves — funnelling every syscall through
/// `FileHandle` is what keeps Rule 8 enforceable.
#[derive(Debug)]
pub struct FileHandle {
    /// The underlying OS file; every method on this type operates on it.
    file: File,
}
159
160impl FileHandle {
161    /// Open `path` for read-write access, creating it if it does not
162    /// exist. The new file is empty; the caller is responsible for
163    /// writing the file header.
164    ///
165    /// # Errors
166    ///
167    /// Returns [`Error::Io`] if the file cannot be opened or created
168    /// (permission denied, missing parent directory, etc.).
169    pub fn open_or_create<P: AsRef<Path>>(path: P) -> Result<Self> {
170        let file = OpenOptions::new()
171            .read(true)
172            .write(true)
173            .create(true)
174            .truncate(false)
175            .open(path)?;
176        Ok(Self { file })
177    }
178
179    /// Open `path` for read-write access, failing if the file
180    /// already exists (`O_CREAT | O_EXCL` on POSIX, `CREATE_NEW` on
181    /// Windows). Used by M11 #92 hot-backup to guarantee the
182    /// destination is never overwritten.
183    ///
184    /// # Errors
185    ///
186    /// Returns [`Error::Io`] if the file already exists, the parent
187    /// directory does not exist, or any other syscall failure
188    /// occurs.
189    pub fn create_new<P: AsRef<Path>>(path: P) -> Result<Self> {
190        let file = OpenOptions::new()
191            .read(true)
192            .write(true)
193            .create_new(true)
194            .open(path)?;
195        Ok(Self { file })
196    }
197
198    /// Length of the file in bytes.
199    ///
200    /// # Errors
201    ///
202    /// Returns [`Error::Io`] if the metadata syscall fails.
203    pub fn len(&self) -> Result<u64> {
204        let meta = self.file.metadata()?;
205        Ok(meta.len())
206    }
207
208    /// `true` if the file is zero-length (i.e. just created).
209    ///
210    /// # Errors
211    ///
212    /// Returns [`Error::Io`] if the metadata syscall fails.
213    pub fn is_empty(&self) -> Result<bool> {
214        Ok(self.len()? == 0)
215    }
216
217    /// Positioned read. Fills `buf` from byte offset `offset`.
218    ///
219    /// # Errors
220    ///
221    /// Returns [`Error::Io`] on syscall failure or on short read
222    /// (e.g. file shorter than `offset + buf.len()`).
223    pub fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()> {
224        // `FileExt::read_exact_at` is `rustix`'s audited wrapper around
225        // `pread`. It does not require `unsafe`.
226        self.file.read_exact_at(buf, offset).map_err(Error::from)
227    }
228
229    /// Positioned write. Writes `buf` to byte offset `offset`.
230    ///
231    /// # Errors
232    ///
233    /// Returns [`Error::Io`] on syscall failure or on short write.
234    pub fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<()> {
235        self.file.write_all_at(buf, offset).map_err(Error::from)
236    }
237
238    /// Truncate or extend the file to `new_len` bytes.
239    ///
240    /// Used by the pager when the freelist is exhausted and a fresh
241    /// page must be appended.
242    ///
243    /// # Errors
244    ///
245    /// Returns [`Error::Io`] on syscall failure.
246    pub fn set_len(&self, new_len: u64) -> Result<()> {
247        self.file.set_len(new_len).map_err(Error::from)
248    }
249
250    /// Force file contents and metadata to disk. Used at close.
251    ///
252    /// Power-of-ten Rule 7: the underlying call returns
253    /// `io::Result<()>` and is propagated explicitly.
254    ///
255    /// # Errors
256    ///
257    /// Returns [`Error::Io`] on syscall failure.
258    pub fn sync_all(&self) -> Result<()> {
259        self.file.sync_all().map_err(Error::from)
260    }
261
262    /// Force file data (and on `Full`, the drive cache) to persistent
263    /// storage according to `mode`. See [`SyncMode`] for the exact
264    /// per-variant durability promise.
265    ///
266    /// On `SyncMode::Off` this call is a no-op.
267    ///
268    /// # Errors
269    ///
270    /// Returns [`Error::Io`] on syscall failure.
271    pub fn sync_data(&self, mode: SyncMode) -> Result<()> {
272        match mode {
273            SyncMode::Off => Ok(()),
274            SyncMode::Normal => sync_data_normal(&self.file),
275            SyncMode::Full => sync_data_full(&self.file),
276        }
277    }
278}
279
280impl FileBackend for FileHandle {
281    fn len(&self) -> Result<u64> {
282        FileHandle::len(self)
283    }
284    fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()> {
285        FileHandle::read_exact_at(self, buf, offset)
286    }
287    fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<()> {
288        FileHandle::write_all_at(self, buf, offset)
289    }
290    fn set_len(&self, new_len: u64) -> Result<()> {
291        FileHandle::set_len(self, new_len)
292    }
293    fn sync_data(&self, mode: SyncMode) -> Result<()> {
294        FileHandle::sync_data(self, mode)
295    }
296    fn sync_all(&self) -> Result<()> {
297        FileHandle::sync_all(self)
298    }
299}
300
301// --------------------------------------------------------------------
302// Per-platform sync primitives. Kept as small free functions so the
303// platform switch is one match-arm per variant rather than threading
304// `cfg` attributes through `FileHandle::sync_data`.
305// --------------------------------------------------------------------
306
307/// `Normal` durability — `fsync` on Unix, `FlushFileBuffers` on
308/// Windows. Survives process / kernel crash but may lose data on a
309/// sudden power loss if the drive cache has not been flushed.
310fn sync_data_normal(file: &File) -> Result<()> {
311    // `std::fs::File::sync_all` invokes `fsync(2)` on Unix and
312    // `FlushFileBuffers` on Windows. Both flush the OS page cache;
313    // neither, on macOS, flushes the drive cache (that is `Full`'s
314    // job via `F_FULLFSYNC`). Using `sync_all` here keeps the
315    // platform switch in one place and avoids reimplementing the
316    // `fsync` syscall ourselves.
317    file.sync_all().map_err(Error::from)
318}
319
/// `Full` durability on Apple targets: ask the drive itself to flush
/// its cache, which the platform treats as distinct from flushing the
/// OS cache. [`SyncMode::Full`] lists the mapping for every OS.
///
/// # Errors
///
/// Returns [`Error::Io`] when the `fcntl` call fails.
#[cfg(target_vendor = "apple")]
fn sync_data_full(file: &File) -> Result<()> {
    // Plain `fsync` on macOS does NOT flush the drive cache. The
    // documented mechanism is `fcntl(F_FULLFSYNC)`, for which `rustix`
    // provides a safe wrapper. See
    // <https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fcntl.2.html>.
    rustix::fs::fcntl_fullfsync(file)
        .map_err(io::Error::from)
        .map_err(Error::Io)
}
331
332/// `Full` durability on non-Apple Unix targets: `fdatasync(2)` is
333/// sufficient (the on-disk data is flushed, metadata changes that do
334/// not affect the data — like mtime — are not).
335#[cfg(all(unix, not(target_vendor = "apple")))]
336fn sync_data_full(file: &File) -> Result<()> {
337    rustix::fs::fdatasync(file).map_err(|e| Error::Io(io::Error::from(e)))
338}
339
/// `Full` durability on Windows.
///
/// `FlushFileBuffers` is the strongest primitive available, and
/// `std::fs::File::sync_all` is what invokes it.
///
/// # Errors
///
/// Returns [`Error::Io`] when the flush fails.
#[cfg(windows)]
fn sync_data_full(file: &File) -> Result<()> {
    file.sync_all()?;
    Ok(())
}
346
347/// Delete the file at `path` if it exists.
348///
349/// Used by `Pager::close()` to remove the WAL sidecar after a clean
350/// shutdown. Missing-file is intentionally **not** an error; the
351/// post-condition is "no file at `path`", and that is satisfied either
352/// by deletion or by absence.
353///
354/// # Errors
355///
356/// Returns [`Error::Io`] on any failure other than `NotFound`.
357pub fn remove_file_if_exists<P: AsRef<Path>>(path: P) -> Result<()> {
358    match std::fs::remove_file(path) {
359        Ok(()) => Ok(()),
360        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
361        Err(e) => Err(Error::Io(e)),
362    }
363}
364
365impl From<io::ErrorKind> for Error {
366    fn from(kind: io::ErrorKind) -> Self {
367        Error::Io(io::Error::from(kind))
368    }
369}
370
#[cfg(test)]
mod tests {
    use super::{FileHandle, SyncMode};
    use tempfile::TempDir;

    /// Shared scenario: create a scratch file in a fresh temp dir, size
    /// it to 4096 bytes, overwrite it with `0xAB` bytes, then require
    /// `sync_data(mode)` to succeed.
    fn write_and_sync(mode: SyncMode) {
        let scratch = TempDir::new().expect("tempdir");
        let handle = FileHandle::open_or_create(scratch.path().join("sync.bin")).expect("open");
        let filler = [0xABu8; 4096];
        handle.set_len(4096).expect("set_len");
        handle.write_all_at(&filler, 0).expect("write");
        handle.sync_data(mode).expect("sync_data must succeed");
    }

    /// `Full` must complete without error on the host platform.
    #[test]
    fn sync_data_full_returns_ok() {
        write_and_sync(SyncMode::Full);
    }

    /// `Normal` must complete without error on the host platform.
    #[test]
    fn sync_data_normal_returns_ok() {
        write_and_sync(SyncMode::Normal);
    }

    /// `Off` must return `Ok(())` as well.
    #[test]
    fn sync_data_off_is_noop() {
        write_and_sync(SyncMode::Off);
    }

    /// The derived `Default` must stay pinned to the strongest mode.
    #[test]
    fn default_is_full() {
        assert_eq!(SyncMode::Full, SyncMode::default());
    }
}