obj_core/platform/mod.rs
1//! Platform layer (L0).
2//!
3//! This module owns the file-system primitives the pager and WAL
4//! build on: opening a database file, positioned reads and writes at
5//! fixed page boundaries, length queries, truncation, removal, and the
6//! durability primitive [`FileHandle::sync_data`].
7//!
8//! # `unsafe` policy
9//!
10//! Power-of-ten Rule 8 confines `unsafe` to this submodule (and to
11//! `libobj`). All positioned-I/O and durability calls go through the
12//! `rustix` crate, which provides audited safe wrappers. The
13//! cross-process locking submodule [`lock`] reaches for `libc::fcntl`
14//! / `LockFileEx` directly because `rustix` does not expose POSIX
15//! OFD-lock variants; every `unsafe` block in that submodule
16//! carries a `// SAFETY:` comment per Rule 8. This `mod.rs` itself
//! contains no `unsafe` blocks and is `#![deny(unsafe_code)]`. Lint
//! levels are inherited by submodules, so the level is `deny` rather
//! than `forbid`: that leaves the `lock` submodule able to opt back
//! in for its (audited) `unsafe` blocks.
21
22#![deny(unsafe_code)]
23
24#[cfg(any(test, feature = "fault-injection"))]
25pub mod fault;
26
27pub mod lock;
28
29pub use crate::platform::lock::{ReaderLock, WriterLock};
30
31use std::fs::{File, OpenOptions};
32use std::io;
33use std::path::Path;
34
35use rustix::fs::FileExt as _;
36
37use crate::error::{Error, Result};
38
39/// File-backend abstraction the pager and WAL build on.
40///
41/// `FileBackend` is the common subset of [`FileHandle`] operations
42/// that fault-injection harnesses and the production type both expose
43/// (Rule 9). Production code never holds `dyn FileBackend`; both
44/// [`crate::pager::Pager`] and [`crate::wal::Wal`] are generic over
45/// `F: FileBackend` so the dispatch stays monomorphised.
46///
47/// New methods added to this trait MUST mirror an existing
48/// [`FileHandle`] method exactly. Adding a method that does not exist
49/// on the production type would let the harness perform syscalls
50/// production code cannot — a forbidden divergence (the harness
51/// must be a strict superset of legal behaviour, never a separate
52/// kingdom).
53pub trait FileBackend: Sized {
54 /// Length of the file in bytes. See [`FileHandle::len`].
55 ///
56 /// # Errors
57 ///
58 /// Returns [`Error::Io`] on syscall failure.
59 fn len(&self) -> Result<u64>;
60
61 /// `true` iff the file has zero length.
62 ///
63 /// # Errors
64 ///
65 /// Returns [`Error::Io`] on syscall failure.
66 fn is_empty(&self) -> Result<bool> {
67 Ok(self.len()? == 0)
68 }
69
70 /// Positioned read. See [`FileHandle::read_exact_at`].
71 ///
72 /// # Errors
73 ///
74 /// Returns [`Error::Io`] on syscall failure or harness-injected
75 /// short read.
76 fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()>;
77
78 /// Positioned write. See [`FileHandle::write_all_at`].
79 ///
80 /// # Errors
81 ///
82 /// Returns [`Error::Io`] on syscall failure.
83 fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<()>;
84
85 /// Truncate or extend the file. See [`FileHandle::set_len`].
86 ///
87 /// # Errors
88 ///
89 /// Returns [`Error::Io`] on syscall failure.
90 fn set_len(&self, new_len: u64) -> Result<()>;
91
92 /// See [`FileHandle::sync_data`].
93 ///
94 /// # Errors
95 ///
96 /// Returns [`Error::Io`] on syscall failure.
97 fn sync_data(&self, mode: SyncMode) -> Result<()>;
98
99 /// See [`FileHandle::sync_all`].
100 ///
101 /// # Errors
102 ///
103 /// Returns [`Error::Io`] on syscall failure.
104 fn sync_all(&self) -> Result<()>;
105}
106
/// How hard [`FileHandle::sync_data`] pushes data towards stable
/// storage.
///
/// This is the user-visible knob choosing which cross-platform fsync
/// primitive `obj` issues after a WAL commit. The per-variant
/// contract is specified in `docs/format.md` § `SyncMode`.
///
/// [`SyncMode::Full`] is the default: once `commit` returns `Ok(())`
/// the data survives a system-wide power loss. [`SyncMode::Normal`]
/// trades some of that guarantee for throughput, and
/// [`SyncMode::Off`] issues no syscall at all and is only suitable
/// for tests and benchmarks.
///
/// Power-of-ten Rule 5: one three-state enum is far cheaper to audit
/// than three `bool` knobs, and every `match` over it is exhaustive.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum SyncMode {
    /// Strongest durability: survives system-wide power loss.
    ///
    /// Platform mapping:
    ///
    /// * macOS — `fcntl(F_FULLFSYNC)`, which forces the drive cache
    ///   to flush. Plain `fsync` is **not** sufficient on macOS
    ///   because it leaves the drive cache unflushed; this is the
    ///   standard wisdom for safety-critical macOS storage.
    /// * Windows — `FlushFileBuffers`.
    /// * Linux / BSDs — `fdatasync`.
    #[default]
    Full,

    /// Durable across a process crash or kernel panic, but data may
    /// be lost on sudden power loss if the drive's write cache has
    /// not been flushed by the time the OS acknowledges the call.
    ///
    /// Maps to `fsync` on Unix and `FlushFileBuffers` on Windows.
    /// Windows offers nothing weaker than `FlushFileBuffers`, so
    /// `Normal` and `Full` are equivalent there.
    Normal,

    /// No durability call is made. The OS may write the data out
    /// eventually, but `obj` does not request it. Only for tests and
    /// benchmarks where losing data is acceptable.
    Off,
}
147
/// An open database file supporting positioned reads and writes at
/// page boundaries.
///
/// The API is deliberately small: it offers only what the pager (L1)
/// and the WAL (L2) require. Layers above this one must not bypass
/// it and call into `std::fs` themselves — funnelling every syscall
/// through `FileHandle` is what keeps Rule 8 enforceable.
#[derive(Debug)]
pub struct FileHandle {
    /// The underlying OS file all operations are issued against.
    file: File,
}
159
160impl FileHandle {
161 /// Open `path` for read-write access, creating it if it does not
162 /// exist. The new file is empty; the caller is responsible for
163 /// writing the file header.
164 ///
165 /// # Errors
166 ///
167 /// Returns [`Error::Io`] if the file cannot be opened or created
168 /// (permission denied, missing parent directory, etc.).
169 pub fn open_or_create<P: AsRef<Path>>(path: P) -> Result<Self> {
170 let file = OpenOptions::new()
171 .read(true)
172 .write(true)
173 .create(true)
174 .truncate(false)
175 .open(path)?;
176 Ok(Self { file })
177 }
178
179 /// Open `path` for read-write access, failing if the file
180 /// already exists (`O_CREAT | O_EXCL` on POSIX, `CREATE_NEW` on
181 /// Windows). Used by M11 #92 hot-backup to guarantee the
182 /// destination is never overwritten.
183 ///
184 /// # Errors
185 ///
186 /// Returns [`Error::Io`] if the file already exists, the parent
187 /// directory does not exist, or any other syscall failure
188 /// occurs.
189 pub fn create_new<P: AsRef<Path>>(path: P) -> Result<Self> {
190 let file = OpenOptions::new()
191 .read(true)
192 .write(true)
193 .create_new(true)
194 .open(path)?;
195 Ok(Self { file })
196 }
197
198 /// Length of the file in bytes.
199 ///
200 /// # Errors
201 ///
202 /// Returns [`Error::Io`] if the metadata syscall fails.
203 pub fn len(&self) -> Result<u64> {
204 let meta = self.file.metadata()?;
205 Ok(meta.len())
206 }
207
208 /// `true` if the file is zero-length (i.e. just created).
209 ///
210 /// # Errors
211 ///
212 /// Returns [`Error::Io`] if the metadata syscall fails.
213 pub fn is_empty(&self) -> Result<bool> {
214 Ok(self.len()? == 0)
215 }
216
217 /// Positioned read. Fills `buf` from byte offset `offset`.
218 ///
219 /// # Errors
220 ///
221 /// Returns [`Error::Io`] on syscall failure or on short read
222 /// (e.g. file shorter than `offset + buf.len()`).
223 pub fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()> {
224 // `FileExt::read_exact_at` is `rustix`'s audited wrapper around
225 // `pread`. It does not require `unsafe`.
226 self.file.read_exact_at(buf, offset).map_err(Error::from)
227 }
228
229 /// Positioned write. Writes `buf` to byte offset `offset`.
230 ///
231 /// # Errors
232 ///
233 /// Returns [`Error::Io`] on syscall failure or on short write.
234 pub fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<()> {
235 self.file.write_all_at(buf, offset).map_err(Error::from)
236 }
237
238 /// Truncate or extend the file to `new_len` bytes.
239 ///
240 /// Used by the pager when the freelist is exhausted and a fresh
241 /// page must be appended.
242 ///
243 /// # Errors
244 ///
245 /// Returns [`Error::Io`] on syscall failure.
246 pub fn set_len(&self, new_len: u64) -> Result<()> {
247 self.file.set_len(new_len).map_err(Error::from)
248 }
249
250 /// Force file contents and metadata to disk. Used at close.
251 ///
252 /// Power-of-ten Rule 7: the underlying call returns
253 /// `io::Result<()>` and is propagated explicitly.
254 ///
255 /// # Errors
256 ///
257 /// Returns [`Error::Io`] on syscall failure.
258 pub fn sync_all(&self) -> Result<()> {
259 self.file.sync_all().map_err(Error::from)
260 }
261
262 /// Force file data (and on `Full`, the drive cache) to persistent
263 /// storage according to `mode`. See [`SyncMode`] for the exact
264 /// per-variant durability promise.
265 ///
266 /// On `SyncMode::Off` this call is a no-op.
267 ///
268 /// # Errors
269 ///
270 /// Returns [`Error::Io`] on syscall failure.
271 pub fn sync_data(&self, mode: SyncMode) -> Result<()> {
272 match mode {
273 SyncMode::Off => Ok(()),
274 SyncMode::Normal => sync_data_normal(&self.file),
275 SyncMode::Full => sync_data_full(&self.file),
276 }
277 }
278}
279
280impl FileBackend for FileHandle {
281 fn len(&self) -> Result<u64> {
282 FileHandle::len(self)
283 }
284 fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<()> {
285 FileHandle::read_exact_at(self, buf, offset)
286 }
287 fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<()> {
288 FileHandle::write_all_at(self, buf, offset)
289 }
290 fn set_len(&self, new_len: u64) -> Result<()> {
291 FileHandle::set_len(self, new_len)
292 }
293 fn sync_data(&self, mode: SyncMode) -> Result<()> {
294 FileHandle::sync_data(self, mode)
295 }
296 fn sync_all(&self) -> Result<()> {
297 FileHandle::sync_all(self)
298 }
299}
300
301// --------------------------------------------------------------------
302// Per-platform sync primitives. Kept as small free functions so the
303// platform switch is one match-arm per variant rather than threading
304// `cfg` attributes through `FileHandle::sync_data`.
305// --------------------------------------------------------------------
306
307/// `Normal` durability — `fsync` on Unix, `FlushFileBuffers` on
308/// Windows. Survives process / kernel crash but may lose data on a
309/// sudden power loss if the drive cache has not been flushed.
310fn sync_data_normal(file: &File) -> Result<()> {
311 // `std::fs::File::sync_all` invokes `fsync(2)` on Unix and
312 // `FlushFileBuffers` on Windows. Both flush the OS page cache;
313 // neither, on macOS, flushes the drive cache (that is `Full`'s
314 // job via `F_FULLFSYNC`). Using `sync_all` here keeps the
315 // platform switch in one place and avoids reimplementing the
316 // `fsync` syscall ourselves.
317 file.sync_all().map_err(Error::from)
318}
319
/// `Full` durability on Apple targets: flush the drive cache as well
/// as the OS cache. See [`SyncMode::Full`] for the per-OS mapping.
///
/// # Errors
///
/// Returns [`Error::Io`] on syscall failure.
#[cfg(target_vendor = "apple")]
fn sync_data_full(file: &File) -> Result<()> {
    // A plain `fsync` on macOS leaves the drive cache unflushed; the
    // documented way to flush it is `fcntl(F_FULLFSYNC)`, for which
    // `rustix` provides a safe wrapper. See
    // <https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man2/fcntl.2.html>.
    rustix::fs::fcntl_fullfsync(file)
        .map_err(io::Error::from)
        .map_err(Error::Io)
}
331
332/// `Full` durability on non-Apple Unix targets: `fdatasync(2)` is
333/// sufficient (the on-disk data is flushed, metadata changes that do
334/// not affect the data — like mtime — are not).
335#[cfg(all(unix, not(target_vendor = "apple")))]
336fn sync_data_full(file: &File) -> Result<()> {
337 rustix::fs::fdatasync(file).map_err(|e| Error::Io(io::Error::from(e)))
338}
339
/// `Full` durability on Windows.
///
/// `FlushFileBuffers` is the strongest primitive available, and
/// `std::fs::File::sync_all` is what invokes it.
///
/// # Errors
///
/// Returns [`Error::Io`] on syscall failure.
#[cfg(windows)]
fn sync_data_full(file: &File) -> Result<()> {
    file.sync_all()?;
    Ok(())
}
346
347/// Delete the file at `path` if it exists.
348///
349/// Used by `Pager::close()` to remove the WAL sidecar after a clean
350/// shutdown. Missing-file is intentionally **not** an error; the
351/// post-condition is "no file at `path`", and that is satisfied either
352/// by deletion or by absence.
353///
354/// # Errors
355///
356/// Returns [`Error::Io`] on any failure other than `NotFound`.
357pub fn remove_file_if_exists<P: AsRef<Path>>(path: P) -> Result<()> {
358 match std::fs::remove_file(path) {
359 Ok(()) => Ok(()),
360 Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
361 Err(e) => Err(Error::Io(e)),
362 }
363}
364
365impl From<io::ErrorKind> for Error {
366 fn from(kind: io::ErrorKind) -> Self {
367 Error::Io(io::Error::from(kind))
368 }
369}
370
#[cfg(test)]
mod tests {
    use super::{FileHandle, SyncMode};
    use tempfile::TempDir;

    /// Create a scratch file, size it to 4096 bytes, overwrite those
    /// bytes with `0xAB`, then require `sync_data(mode)` to succeed.
    fn write_and_sync(mode: SyncMode) {
        let scratch = TempDir::new().expect("tempdir");
        let handle = FileHandle::open_or_create(scratch.path().join("sync.bin")).expect("open");
        handle.set_len(4096).expect("set_len");

        let page = [0xABu8; 4096];
        handle.write_all_at(&page, 0).expect("write");

        handle.sync_data(mode).expect("sync_data must succeed");
    }

    #[test]
    fn sync_data_full_returns_ok() {
        write_and_sync(SyncMode::Full);
    }

    #[test]
    fn sync_data_normal_returns_ok() {
        write_and_sync(SyncMode::Normal);
    }

    #[test]
    fn sync_data_off_is_noop() {
        write_and_sync(SyncMode::Off);
    }

    #[test]
    fn default_is_full() {
        let default_mode = SyncMode::default();
        assert_eq!(default_mode, SyncMode::Full);
    }
}